From ef2efb7ccfc24c3451c521e37cba7c8390ab1c40 Mon Sep 17 00:00:00 2001 From: James Newling Date: Sun, 18 Aug 2024 15:07:04 -0700 Subject: [PATCH] squash commit --- .../target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp | 57 +++++++++--------- .../target/AMD-AIE/aie/test/aiert_insts.mlir | 19 ++---- .../target/AMD-AIE/aie/test/dma_to_npu.mlir | 23 ------- .../AMD-AIE/aie/test/push_to_queue.mlir | 13 ++-- .../AMD-AIE/iree-amd-aie/Target/AIETarget.cpp | 60 ++++++++----------- .../AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp | 44 ++++++++++---- .../iree-amd-aie/Transforms/Passes.cpp | 10 ++-- tests/samples/conv_pipeline_e2e.mlir | 10 ---- .../samples/matmul_peeled_objectfifo_e2e.mlir | 17 +++--- tests/samples/pack_peel_pipeline_matmul.mlir | 10 ---- ...pack_peel_pipeline_matmul_elementwise.mlir | 34 +++-------- tests/samples/pad_pack_pipeline_e2e.mlir | 21 ------- 12 files changed, 116 insertions(+), 202 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp index f15990681..d63cac489 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp @@ -10,7 +10,6 @@ #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/Support/Format.h" #include "mlir/IR/AsmState.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" @@ -512,33 +511,37 @@ struct AMDAIEDmaToNpuPass : mlir::OperationPass { instructions[2] = count; instructions[3] = instructions.size() * sizeof(uint32_t); - ArrayRef instsArrRef(instructions.data(), instructions.size()); - device->setAttr( - "npu_instructions", - DenseUI32ResourceElementsAttr::get( - RankedTensorType::get( - instsArrRef.size(), - IntegerType::get(&getContext(), 32, IntegerType::Unsigned)), - "npu_instructions", - HeapAsmResourceBlob::allocateAndCopyInferAlign(instsArrRef))); - // The LX instructions for the entry point function are already generated by - // the pass hence we can safely delete the function as it is of no use to - // us. A reason to do this is that otherwise it is unceseccarily lowered to - // llvm where it can have a chance to crash in case the argument list is not - // lowerable for reasons such as memref's with dynamic offsets. - auto symName = dyn_cast_or_null(device->getAttr("sym_name")); + + // Note on use of 'DenseIntElementsAttr' below: + // We cannot use DenseUI32ResourceElementsAttr because it doesn't + // serialize-deserialize, and we (currently) serialize the module before + // running aie2xclbin. + { + ArrayRef instsArrRef(instructions.data(), instructions.size()); + ShapedType type = + RankedTensorType::get({static_cast(instsArrRef.size())}, + IntegerType::get(&getContext(), 32)); + auto instsAttr = DenseIntElementsAttr::get(type, instsArrRef); + device->setAttr("npu_instructions", instsAttr); + } + SmallVector seqOps; - device->walk([&](RuntimeSequenceOp seqOp) { - // if the deviceOp has a symbol name attached to it we look for the - // sequence op that partically matches that symbol, if not we collect all - // sequenceOps. - if (!symName || - symName.str().find(seqOp.getSymName()->str()) != std::string::npos) - seqOps.push_back(seqOp); - }); - // If exactly one entry point function is found we can delete it. For any - // other result we do not make any change. - if (seqOps.size() == 1) seqOps[0].erase(); + device->walk([&](RuntimeSequenceOp seqOp) { seqOps.push_back(seqOp); }); + + if (seqOps.size() > 1) { + device->emitOpError("has ") + << seqOps.size() + << " aiex.runtime_sequence ops. Expected no more than 1."; + signalPassFailure(); + } + + if (seqOps.size() == 1) { + auto seqOp = seqOps[0]; + StringRef name = seqOp.getSymName().value(); + device->setAttr("runtime_sequence_name", + StringAttr::get(&getContext(), name)); + seqOp.erase(); + } } }; diff --git a/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir b/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir index cb82fcd22..7fa0c7b3b 100644 --- a/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir +++ b/compiler/plugins/target/AMD-AIE/aie/test/aiert_insts.mlir @@ -1,19 +1,12 @@ // RUN: iree-opt --amdaie-dma-to-npu %s | FileCheck %s -// CHECK-LABEL: aie.device(npu1_4col) { -// CHECK: memref.global "public" @of_toMem : memref<32xi32> -// CHECK: memref.global "public" @of_fromMem : memref<32xi32> -// CHECK: aie.shim_dma_allocation @of_fromMem(MM2S, 0, 0) -// CHECK: aie.shim_dma_allocation @of_toMem(S2MM, 0, 0) -// CHECK: } {npu_instructions = dense_resource : tensor<64xui32>} -// CHECK: {-# -// CHECK: dialect_resources: { -// CHECK: builtin: { -// CHECK: npu_instructions: "0x0400000000010306050100000600000000010000010000000000000020D0010030000000200000000000000000000000000000000000008000000000000000000000000281000000300000000000000000000000000000000000000024D001000000000002000000000000000000000000000000000000000000000004D20100000000000100008018000000010000000000000000D001003000000020000000800000000000000000008000070020800F000000000000000000000281000000300000000000000000000000000000000000000004D001000000000000000000000000008000000000000000000000000000000014D20100000000000000000018000000" -// CHECK: } -// CHECK: } -// CHECK: #-} +// CHECK-LABEL: aie.device(npu1_4col) { +// CHECK: memref.global "public" @of_toMem : memref<32xi32> +// CHECK: memref.global "public" @of_fromMem : memref<32xi32> +// CHECK: aie.shim_dma_allocation @of_fromMem(MM2S, 0, 0) +// CHECK: aie.shim_dma_allocation @of_toMem(S2MM, 0, 0) +// CHECK: } {npu_instructions = dense<[100860160, 261, 6, 256, 1, 0, 118816, 48, 32, 0, 0, 0, -2147483648, 0, 0, 33554432, 129, 48, 0, 0, 0, 0, 118820, 0, 2, 0, 0, 0, 0, 0, 119300, 0, -2147483647, 24, 1, 0, 118784, 48, 32, 128, 0, 8388608, -2145386489, 15, 0, 33554432, 129, 48, 0, 0, 0, 0, 118788, 0, 0, 0, 128, 0, 0, 0, 119316, 0, 0, 24]> : tensor<64xi32>, runtime_sequence_name = "sequence"} module { diff --git a/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir b/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir index f4cfd5647..3a78c854c 100644 --- a/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir +++ b/compiler/plugins/target/AMD-AIE/aie/test/dma_to_npu.mlir @@ -77,26 +77,3 @@ module { } {sym_name = "explicit_sym_name_0"} } -// ----- - -// CHECK-LABEL: aie.device(npu1_4col) { -// CHECK: memref.global "public" @toMem : memref<16xi32> -// CHECK: func.func @pretend_microkernel -// CHECK: aiex.runtime_sequence @explicit_sym_name -// CHECK: aie.shim_dma_allocation @toMem(MM2S, 1, 1) - -module { - aie.device(npu1_4col) { - memref.global "public" @toMem : memref<16xi32> - func.func @pretend_microkernel(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - return - } - - aiex.runtime_sequence @explicit_sym_name(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> - aiex.npu.dma_wait {symbol = @toMem} - } - aie.shim_dma_allocation @toMem (MM2S, 1, 1) - } {sym_name = "wrong_sym_name"} -} - diff --git a/compiler/plugins/target/AMD-AIE/aie/test/push_to_queue.mlir b/compiler/plugins/target/AMD-AIE/aie/test/push_to_queue.mlir index 72a5ade0e..7b726e581 100644 --- a/compiler/plugins/target/AMD-AIE/aie/test/push_to_queue.mlir +++ b/compiler/plugins/target/AMD-AIE/aie/test/push_to_queue.mlir @@ -3,16 +3,11 @@ // CHECK: module { // CHECK: aie.device(npu1_4col) { -// CHECK: } {npu_instructions = dense_resource : tensor<16xui32>} -// CHECK: } - -// CHECK: {-# -// CHECK: dialect_resources: { -// CHECK: builtin: { -// CHECK: npu_instructions: "0x040000000001030605010000020000004000000000000000000000000CD20100000000000300008018000000000000000000000014D20104000000000200030018000000" +// CHECK: func.func @sequence() { +// CHECK: return // CHECK: } -// CHECK: } -// CHECK: #-} +// CHECK: } {npu_instructions = dense<[100860160, 261, 2, 64, 0, 0, 119308, 0, -2147483645, 24, 0, 0, 67228180, 0, 196610, 24]> : tensor<16xi32>} +// CHECK: } module { aie.device(npu1_4col) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp index 8f764732a..e62db71a7 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp @@ -16,7 +16,6 @@ #include "aievec/XLLVMDialect.h" #include "air/Dialect/AIR/AIRDialect.h" #include "air/Dialect/AIRRt/AIRRtDialect.h" -#include "iree-amd-aie/IR/AMDAIEAttrs.h" #include "iree-amd-aie/IR/AMDAIEDialect.h" #include "iree-amd-aie/Transforms/Passes.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" @@ -28,8 +27,12 @@ #include "llvm/Support/Path.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/ToolOutputFile.h" +#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" +#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" -#include "mlir/Conversion/Passes.h" +#include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h" +#include "mlir/Conversion/MathToLLVM/MathToLLVM.h" +#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" #include "mlir/Dialect/DLTI/DLTI.h" #include "mlir/Dialect/EmitC/IR/EmitC.h" #include "mlir/Dialect/Func/Extensions/AllExtensions.h" @@ -75,42 +78,27 @@ static llvm::cl::opt clEnableAMDAIEUkernels( "unprefixed microkernels to enable, e.g. `matmul`."), llvm::cl::init("none")); -// Utility to find aie.device Op corresponding to the export Op. -// For example, we have -// hal.executable.variant { -// hal.executable.export symbol1 -// hal.executable.export symbol2 -// module { -// aie.device { -// ... -// aiex.runtime_sequence symbol1 -// } -// aie.device { -// ... -// aiex.runtime_sequence symbol2 -// } -// } -// } -// Hence we need to find the aiex.runtime_sequence that coresponds to the export -// op symbol and return its parent aie.device Op. This is what we will pass to -// the `aie2xclbin` tool for artifact generation per entry point. -static xilinx::AIE::DeviceOp getDeviceOpFromEntryPoint(ModuleOp moduleOp, - StringRef exportOpName) { +static xilinx::AIE::DeviceOp getDeviceOpWithName(ModuleOp moduleOp, + StringRef targetName) { xilinx::AIE::DeviceOp deviceOp; - moduleOp.walk([&](xilinx::AIEX::RuntimeSequenceOp sequenceOp) { - if (sequenceOp.getSymName() == exportOpName) { - deviceOp = - dyn_cast_or_null(sequenceOp->getParentOp()); - return WalkResult::interrupt(); - } - return WalkResult::advance(); + uint32_t nDeviceOpsVisited = 0; + moduleOp.walk([&](xilinx::AIE::DeviceOp d) { + ++nDeviceOpsVisited; + // This attribute should've been set in the dma-to-npu pass. + auto maybeName = d->getAttrOfType("runtime_sequence_name"); + if (!maybeName) return WalkResult::advance(); + auto name = maybeName.getValue(); + if (name != targetName) return WalkResult::advance(); + deviceOp = d; + return WalkResult::interrupt(); }); - if (!deviceOp) { - moduleOp.emitError() - << "failed to find aie.device containing func.func with symbol " - << exportOpName; - } + + if (!deviceOp) + moduleOp.emitError() << "visited " << nDeviceOpsVisited + << " aie.device ops, and failed to find one with name " + << targetName; + return deviceOp; } @@ -291,7 +279,7 @@ LogicalResult AIETargetBackend::serializeExecutable( } StringRef exportOpName = exportOp.getSymName(); - deviceOps.push_back(getDeviceOpFromEntryPoint(moduleOp, exportOpName)); + deviceOps.push_back(getDeviceOpWithName(moduleOp, exportOpName)); // The xclbin kernel name, appended with instance name suffix (`:MLIRAIEV1`, // 10 chars) is required by the xclbinutil to have a length smaller or equal diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp index a9bf83a60..9cbec2f71 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp @@ -1101,6 +1101,35 @@ static LogicalResult generateUnifiedObject( return success(); } +FailureOr> getNpuInstructions(ModuleOp moduleOp) { + auto ctx = moduleOp.getContext(); + SmallVector npuInstructions; + auto deviceOps = moduleOp.getOps(); + auto nDeviceOps = std::distance(deviceOps.begin(), deviceOps.end()); + if (nDeviceOps != 1) + return emitError(UnknownLoc::get(ctx), + "Expected exactly one AIE::DeviceOp in the module"); + + auto deviceOp = *deviceOps.begin(); + auto maybeNpuInstructions = deviceOp->getAttr("npu_instructions"); + if (!maybeNpuInstructions) + return emitError(UnknownLoc::get(ctx), + "Expected npu_instructions attribute on aie.device. "); + + auto npuInstructionsAttr = + dyn_cast(maybeNpuInstructions); + + if (!npuInstructionsAttr) { + return emitError(UnknownLoc::get(ctx), + "failed to cast to DenseUI32ResourceElementsAttr"); + } + + for (auto c : npuInstructionsAttr) { + npuInstructions.push_back(c.getSExtValue()); + } + return npuInstructions; +} + LogicalResult aie2xclbin( MLIRContext *ctx, ModuleOp moduleOp, const std::string &outputNPU, const std::string &outputXCLBin, bool printIRBeforeAll, @@ -1115,19 +1144,12 @@ LogicalResult aie2xclbin( PassManager pm(ctx, mlir::ModuleOp::getOperationName()); applyConfigToPassManager(pm, printIRBeforeAll, printIRAfterAll, printIRModuleScope, timing); - // generateNPUInstructions - pm.addNestedPass( - mlir::iree_compiler::AMDAIE::createAMDAIEDmaToNpuPass()); if (failed(pm.run(moduleOp))) return moduleOp.emitOpError(": NPU Instruction pipeline failed"); - std::optional> npuInstructions = - cast( - (*moduleOp.getOps().begin()) - ->getAttr("npu_instructions")) - .tryGetAsArrayRef(); - if (!npuInstructions) - return moduleOp.emitOpError(": No NPU instructions in device op"); + auto maybeNpuInstructions = getNpuInstructions(moduleOp); + if (failed(maybeNpuInstructions)) return failure(); + auto npuInstructions = maybeNpuInstructions.value(); std::string errorMessage; auto output = openOutputFile(outputNPU, &errorMessage); @@ -1136,7 +1158,7 @@ LogicalResult aie2xclbin( << errorMessage; return failure(); } - for (auto w : *npuInstructions) output->os() << llvm::format("%08X\n", w); + for (auto w : npuInstructions) output->os() << llvm::format("%08X\n", w); output->keep(); Path unifiedObj = Path(tempDir) / "input.o"; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index b0c8f799c..3f7935844 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -13,7 +13,6 @@ #include "iree-amd-aie/IR/AMDAIEAttrs.h" #include "iree-dialects/Dialect/LinalgTransform/Passes.h" #include "iree/compiler/Codegen/Common/Passes.h" -#include "iree/compiler/Utils/PassUtils.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" @@ -618,8 +617,6 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) { passManager.addPass(createAMDAIELowerToAIEPass()); passManager.addPass(createCanonicalizerPass()); - passManager.addPass(createConvertLinalgToLoopsPass()); - // Now lower using the AIE passes from MLIR-AIE. addMLIRAIELoweringPasses(passManager); } @@ -785,10 +782,13 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) { } void addMLIRAIELoweringPasses(OpPassManager &passManager) { - passManager.addPass(createLowerAffinePass()); OpPassManager &devicePM = passManager.nest(); - devicePM.addPass(createAMDAIEAssignLockIDsPass()); devicePM.addPass(createAMDAIEObjectFifoStatefulTransformPass()); + devicePM.addPass(createAMDAIEDmaToNpuPass()); + passManager.addPass(createCanonicalizerPass()); + passManager.addPass(createConvertLinalgToLoopsPass()); + passManager.addPass(createLowerAffinePass()); + devicePM.addPass(createAMDAIEAssignLockIDsPass()); devicePM.addPass(createAMDAIEAssignBufferDescriptorIDsPass()); devicePM.addPass(createAMDAIEAssignBufferAddressesBasicPass()); devicePM.addPass(createAMDAIEPathfinderPass()); diff --git a/tests/samples/conv_pipeline_e2e.mlir b/tests/samples/conv_pipeline_e2e.mlir index 71b1442b8..7c6957017 100644 --- a/tests/samples/conv_pipeline_e2e.mlir +++ b/tests/samples/conv_pipeline_e2e.mlir @@ -13,11 +13,6 @@ func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xi32>, %arg1: tensor<3x3x32 // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @conv_2d_nhwc_hwcf_dispatch_0_conv_2d_nhwc_hwcf_2x12x12x64x3x3x32_i32(%arg0: memref<2x14x14x32xi32>, %arg1: memref<3x3x32x64xi32>, %arg2: memref<2x12x12x64xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync // ----- @@ -34,8 +29,3 @@ func.func @conv_2d_nhwc_hwcf_q(%arg0: tensor<2x14x14x32xi8>, %arg1: tensor<3x3x3 // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @conv_2d_nhwc_hwcf_q_dispatch_0_conv_2d_nhwc_hwcf_q_2x12x12x64x3x3x32_i8xi8xi32xi32xi32(%arg0: memref<3136xi32>, %arg1: memref<4608xi32>, %arg2: memref<2x12x12x64xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync diff --git a/tests/samples/matmul_peeled_objectfifo_e2e.mlir b/tests/samples/matmul_peeled_objectfifo_e2e.mlir index 386214f58..484494045 100644 --- a/tests/samples/matmul_peeled_objectfifo_e2e.mlir +++ b/tests/samples/matmul_peeled_objectfifo_e2e.mlir @@ -1,6 +1,8 @@ // RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources --iree-amdaie-target-device=npu1_4col %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-lower-to-aie-pipeline=objectFifo --iree-amdaie-tile-pipeline=pack-peel --split-input-file | FileCheck %s // CHECK-LABEL: hal.executable.export public @matmul_i32_dispatch_0_matmul_128x128x256_i32 + +// CHECK: aie.device(npu1_4col) { // CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) // CHECK-DAG: %[[TILE_0_3:.+]] = aie.tile(0, 3) // CHECK-DAG: %[[TILE_1_2:.+]] = aie.tile(1, 2) @@ -11,21 +13,16 @@ // CHECK-DAG: aie.core(%[[TILE_1_2]]) // CHECK-DAG: aie.core(%[[TILE_0_3]]) // CHECK-DAG: aie.core(%[[TILE_1_3]]) -// CHECK-DAG: aiex.runtime_sequence @matmul_i32_dispatch_0_matmul_128x128x256_i32(%[[ARG0:.+]]: memref<128x256xi32>, %[[ARG1:.+]]: memref<256x128xi32>, %[[ARG2:.+]]: memref<128x128xi32>) -// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG2]][0, 0, 0, 0][2, 2, 64, 64][8192, 64, 128, 1]) {id = 0 : i64, issue_token = true, metadata = @[[OBJ10:.+]]} -// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 8, 64, 32][0, 32, 256, 1]) {id = 1 : i64, issue_token = true, metadata = @[[OBJ0:.+]]} -// CHECK-DAG: aiex.npu.dma_wait {symbol = @[[OBJ0]]} -// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][8, 2, 32, 32][4096, 32, 128, 1]) {id = 2 : i64, issue_token = true, metadata = @[[OBJ1:.+]]} -// CHECK-DAG: aiex.npu.dma_wait {symbol = @[[OBJ1]]} -// CHECK-DAG: aiex.npu.dma_wait {symbol = @[[OBJ10]]} -// CHECK-DAG: aie.shim_dma_allocation @[[OBJ0]](MM2S, 0, 0) -// CHECK-DAG: aie.shim_dma_allocation @[[OBJ1]](MM2S, 1, 0) +// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 0) +// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 1, 0) // CHECK-DAG: aie.memtile_dma(%[[TILE_0_1]]) // CHECK-DAG: aie.mem(%[[TILE_0_2]]) // CHECK-DAG: aie.mem(%[[TILE_0_3]]) // CHECK-DAG: aie.mem(%[[TILE_1_2]]) // CHECK-DAG: aie.mem(%[[TILE_1_3]]) -// CHECK-DAG: aie.shim_dma_allocation @[[OBJ10]](S2MM, 0, 0) +// CHECK-DAG: aie.shim_dma_allocation {{.*}}(S2MM, 0, 0) +// CHECK: {npu_instructions = +// CHECK-SAME: runtime_sequence_name = "matmul_i32_dispatch_0_matmul_128x128x256_i32" func.func @matmul_i32(%lhs: tensor<128x256xi32>, %rhs: tensor<256x128xi32>) -> tensor<128x128xi32> { %cst = arith.constant 0 : i32 diff --git a/tests/samples/pack_peel_pipeline_matmul.mlir b/tests/samples/pack_peel_pipeline_matmul.mlir index 344c34e5d..a626a2132 100644 --- a/tests/samples/pack_peel_pipeline_matmul.mlir +++ b/tests/samples/pack_peel_pipeline_matmul.mlir @@ -15,11 +15,6 @@ func.func @matmul_i8_i32(%lhs: tensor<32x16xi8>, %rhs: tensor<16x32xi8>) -> tens // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @matmul_i8_i32_dispatch_0_matmul_32x32x16_i8xi8xi32(%arg0: memref<128xi32>, %arg1: memref<128xi32>, %arg2: memref<32x32xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync // ----- @@ -38,8 +33,3 @@ func.func @matmul_bf16(%lhs: tensor<16x32xbf16>, %rhs: tensor<32x16xbf16>) -> te // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation // CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @matmul_bf16_dispatch_0_matmul_16x16x32_bf16(%arg0: memref<256xi32>, %arg1: memref<256xi32>, %arg2: memref<128xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync diff --git a/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir b/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir index 9c2cbf935..c99b3b269 100644 --- a/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir +++ b/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir @@ -20,15 +20,8 @@ func.func @matmul_elementwise_i32(%lhs: tensor<1024x512xi32>, %rhs: tensor<512x1 } // CHECK-LABEL: hal.executable.export public @matmul_elementwise_i32_dispatch_0_matmul_1024x1024x512_i32 -// CHECK: aie.device(npu1_4col) -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @matmul_elementwise_i32_dispatch_0_matmul_1024x1024x512_i32(%arg0: memref<1024x512xi32>, %arg1: memref<512x1024xi32>, %arg2: memref<1024x1024xi32>, %arg3: memref<1024x1024xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync +// CHECK: aie.device(npu1_4col) +// CHECK-COUNT-3: aie.shim_dma_allocation // ----- @@ -52,15 +45,8 @@ func.func @matmul_elementwise_bf16_f32(%arg0: tensor<1024x512xbf16>, %arg1: tens } // CHECK-LABEL: hal.executable.export public @matmul_elementwise_bf16_f32_dispatch_0_matmul_1024x1024x512_bf16xbf16xf32 -// CHECK: aie.device(npu1_4col) -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @matmul_elementwise_bf16_f32_dispatch_0_matmul_1024x1024x512_bf16xbf16xf32(%arg0: memref<262144xi32>, %arg1: memref<262144xi32>, %arg2: memref<1024xf32>, %arg3: memref<1024x1024xf32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.sync +// CHECK: aie.device(npu1_4col) +// CHECK-COUNT-3: aie.shim_dma_allocation // ----- func.func @matmul_elementwise_bf16(%arg0: tensor<512x512xbf16>, %arg1: tensor<512x16384xbf16>, %arg2: tensor<512xf32>) -> tensor<512x16384xbf16> { @@ -78,12 +64,6 @@ func.func @matmul_elementwise_bf16(%arg0: tensor<512x512xbf16>, %arg1: tensor<51 return %11 : tensor<512x16384xbf16> } -// CHECK-LABEL: hal.executable.export public @matmul_elementwise_bf16_dispatch_0_matmul_512x16384x512_bf16xbf16xf32 -// CHECK: aie.device(npu1_4col) -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aie.shim_dma_allocation -// CHECK: aiex.runtime_sequence @matmul_elementwise_bf16_dispatch_0_matmul_512x16384x512_bf16xbf16xf32(%arg0: memref<131072xi32>, %arg1: memref<4194304xi32>, %arg2: memref<512xf32>, %arg3: memref<4194304xi32>) -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd -// CHECK: aiex.npu.dma_memcpy_nd +// CHECK-LABEL: hal.executable.export public @matmul_elementwise_bf16_dispatch_0_matmul_512x16384x512_bf16xbf16xf32 +// CHECK: aie.device(npu1_4col) +// CHECK-COUNT-3: aie.shim_dma_allocation diff --git a/tests/samples/pad_pack_pipeline_e2e.mlir b/tests/samples/pad_pack_pipeline_e2e.mlir index 18d9d8708..14bdcb04c 100644 --- a/tests/samples/pad_pack_pipeline_e2e.mlir +++ b/tests/samples/pad_pack_pipeline_e2e.mlir @@ -7,11 +7,6 @@ // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation -// CPP: aiex.runtime_sequence @matmul_small_dispatch_0_matmul_8x32x16_i32(%arg0: memref<8x16xi32>, %arg1: memref<16x32xi32>, %arg2: memref<8x32xi32>) -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.sync func.func @matmul_small(%lhs : tensor<8x16xi32>, %rhs : tensor<16x32xi32>) -> tensor<8x32xi32> { %empty = tensor.empty() : tensor<8x32xi32> @@ -29,12 +24,6 @@ func.func @matmul_small(%lhs : tensor<8x16xi32>, // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation -// CPP: aiex.runtime_sequence @matmul_large_dispatch_0_matmul_2048x2048x2048_i32(%arg0: memref<2048x2048xi32>, %arg1: memref<2048x2048xi32>, %arg2: memref<2048x2048xi32>) -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.sync - func.func @matmul_large(%lhs: tensor<2048x2048xi32>, %rhs: tensor<2048x2048xi32>) -> tensor<2048x2048xi32> { %empty = tensor.empty() : tensor<2048x2048xi32> %cst = arith.constant 0 : i32 @@ -54,11 +43,6 @@ func.func @matmul_large(%lhs: tensor<2048x2048xi32>, %rhs: tensor<2048x2048xi32> // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation -// CPP: aiex.runtime_sequence @generic_matmul_transpose_static_dispatch_0_matmul_like_8x32x16_i32(%arg0: memref<8x16xi32>, %arg1: memref<32x16xi32>, %arg2: memref<8x32xi32>) -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.sync func.func @generic_matmul_transpose_static(%lhs : tensor<8x16xi32>, %rhs : tensor<32x16xi32>) -> tensor<8x32xi32> { %cst = arith.constant 0 : i32 @@ -82,11 +66,6 @@ func.func @generic_matmul_transpose_static(%lhs : tensor<8x16xi32>, // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation // CPP: aie.shim_dma_allocation -// CPP: aiex.runtime_sequence @matmul_transpose_b_static_dispatch_0_matmul_transpose_b_8x32x16_i32(%arg0: memref<8x16xi32>, %arg1: memref<32x16xi32>, %arg2: memref<8x32xi32>) -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.dma_memcpy_nd -// CPP: aiex.npu.sync func.func @matmul_transpose_b_static(%lhs : tensor<8x16xi32>, %rhs : tensor<32x16xi32>) -> tensor<8x32xi32> { %cst = arith.constant 0 : i32