diff --git a/include/aie/Dialect/AIE/IR/AIEOps.td b/include/aie/Dialect/AIE/IR/AIEOps.td index 46b9bcda9a..dfd757a420 100644 --- a/include/aie/Dialect/AIE/IR/AIEOps.td +++ b/include/aie/Dialect/AIE/IR/AIEOps.td @@ -1668,7 +1668,9 @@ def AIE_ObjectFifoCreateOp: AIE_Op<"objectfifo", [HasParent<"DeviceOp">, Symbol] DefaultValuedAttr:$plio, // via_shared_mem==0 means use producer tile's memory module // via_shared_mem==1 means use consumer tile's memory module - OptionalAttr:$via_shared_mem + OptionalAttr:$via_shared_mem, + // memtile_repeat==0 means "do it once" and don't repeat + OptionalAttr:$memtile_repeat ); let assemblyFormat = [{ @@ -1763,16 +1765,10 @@ def AIE_ObjectFifoLinkOp: AIE_Op<"objectfifo.link", [HasParent<"DeviceOp">]> { let extraClassDeclaration = [{ std::vector getInputObjectFifos(); std::vector getOutputObjectFifos(); - - bool isJoin() { - return getFifoIns().size() > 1; - } - - bool isDistribute() { - return getFifoOuts().size() > 1; - } - + bool isJoin() { return getFifoIns().size() > 1; } + bool isDistribute() { return getFifoOuts().size() > 1; } std::optional getOptionalSharedTile(); + std::optional getRepeatCount(); std::vector getJoinTranferLengths(); std::vector getDistributeTranferLengths(); }]; diff --git a/lib/Dialect/AIE/IR/AIEDialect.cpp b/lib/Dialect/AIE/IR/AIEDialect.cpp index 87c9c2bef9..1443b18764 100644 --- a/lib/Dialect/AIE/IR/AIEDialect.cpp +++ b/lib/Dialect/AIE/IR/AIEDialect.cpp @@ -493,6 +493,12 @@ LogicalResult ObjectFifoCreateOp::verify() { "`via_shared_mem` can only be used in 1-to-1 object FIFOs"); } + if (getMemtileRepeat().has_value()) { + if (!getProducerTileOp().isMemTile()) + return emitError("`memtile_repeat` can only be used with a mem tile " + "producer"); + } + return success(); } @@ -625,6 +631,26 @@ LogicalResult ObjectFifoLinkOp::verify() { return emitOpError("currently does not support objectFifos with " "dimensionsFromStreamPerConsumer."); } + + for (auto fifoOut : getOutputObjectFifos()) { + for (auto dims : 
fifoOut.getDimensionsFromStreamPerConsumer()) { + if (!dims.empty()) + return emitOpError("currently does not support objectFifos with " + "dimensionsFromStreamPerConsumer."); + } + } + + std::vector repeat_counts; + for (auto fifoOut : getOutputObjectFifos()) { + if (fifoOut.getMemtileRepeat().has_value()) + repeat_counts.push_back(fifoOut.getMemtileRepeat().value()); + else + repeat_counts.push_back(0); + } + for (auto repeat : repeat_counts) + if (repeat_counts[0] != repeat) + return emitError("repeat counts of output object FIFOs must be equal"); + } else { if (!getSrcOffsets().empty() && !getDstOffsets().empty()) return emitOpError("all offsets should be empty if there is no " @@ -732,6 +758,13 @@ std::vector ObjectFifoLinkOp::getDistributeTranferLengths() { return lengths; } +std::optional ObjectFifoLinkOp::getRepeatCount() { + for (auto fifoOut : getOutputObjectFifos()) + if (fifoOut.getMemtileRepeat().has_value()) + return {fifoOut.getMemtileRepeat().value()}; + return {}; +} + //===----------------------------------------------------------------------===// // ObjectFifoRegisterExternalBuffersOp //===----------------------------------------------------------------------===// diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index 11168a4c8b..94c3d4221e 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -427,6 +427,8 @@ struct AIEObjectFifoStatefulTransformPass of_elem_index++; } if (linked) { + if (linkOp->getRepeatCount().has_value()) + numElem *= linkOp->getRepeatCount().value() + 1; if (linkOp->isDistribute()) numElem *= linkOp->getFifoOuts().size(); else if (linkOp->isJoin()) @@ -692,6 +694,19 @@ struct AIEObjectFifoStatefulTransformPass int acqNum = 1; int relNum = 1; + // check for repeat count + int repeatCount = 0; + if (!dims.getValue().empty()) { + auto highestStride 
= dims.getValue().begin()->getStride() - 1; + if (highestStride == 0) { + repeatCount = dims.getValue().begin()->getSize(); + dims = AIE::BDDimLayoutArrayAttr::get(op->getContext(), + dims.getValue().drop_front(1)); + } + } + if (op.getMemtileRepeat().has_value()) + repeatCount = op.getMemtileRepeat().value(); + // search for the buffers/locks (based on if this objFifo has a link) // identify size difference between input and output memrefs ObjectFifoCreateOp target = op; @@ -704,12 +719,20 @@ struct AIEObjectFifoStatefulTransformPass auto srcOffsets = linkOp->getSrcOffsets(); auto dstOffsets = linkOp->getDstOffsets(); + if (target == op) { + if (linkOp->getRepeatCount().has_value()) { + // +1 for original data movement + acqNum *= linkOp->getRepeatCount().value() + 1; + relNum *= linkOp->getRepeatCount().value() + 1; + } + } + if (linkOp->isJoin()) { // compute offset and length isJoin = true; if (target == op) { - acqNum = linkOp->getFifoIns().size(); - relNum = linkOp->getFifoIns().size(); + acqNum *= linkOp->getFifoIns().size(); + relNum *= linkOp->getFifoIns().size(); } else { int i = 0; for (auto fifoIn : linkOp->getInputObjectFifos()) { @@ -718,15 +741,14 @@ struct AIEObjectFifoStatefulTransformPass i++; } extraOffset = *getConstantIntValue(srcOffsets[i]); - if (dims.getValue().empty()) - lenOut = linkOp->getJoinTranferLengths()[i]; + lenOut = linkOp->getJoinTranferLengths()[i]; } } else if (linkOp->isDistribute()) { // compute offset and length isDistribute = true; if (target == op) { - acqNum = linkOp->getFifoOuts().size(); - relNum = linkOp->getFifoOuts().size(); + acqNum *= linkOp->getFifoOuts().size(); + relNum *= linkOp->getFifoOuts().size(); } else { int i = 0; for (auto fifoOut : linkOp->getOutputObjectFifos()) { @@ -735,8 +757,7 @@ struct AIEObjectFifoStatefulTransformPass i++; } extraOffset = *getConstantIntValue(dstOffsets[i]); - if (dims.getValue().empty()) - lenOut = linkOp->getDistributeTranferLengths()[i]; + lenOut = 
linkOp->getDistributeTranferLengths()[i]; } } else { if (target != op) { @@ -787,17 +808,6 @@ struct AIEObjectFifoStatefulTransformPass Block *dmaBlock = builder.createBlock(endBlock); Block *bdBlock = builder.createBlock(endBlock); - // check for repeat count in objfifo dims - int repeatCount = 0; - if (!dims.getValue().empty()) { - auto highestStride = dims.getValue().begin()->getStride(); - if (highestStride == 0) { - repeatCount = dims.getValue().begin()->getSize(); - dims = AIE::BDDimLayoutArrayAttr::get(op->getContext(), - dims.getValue().drop_front(1)); - } - } - // create DMA channel builder.setInsertionPointToStart(dmaBlock); builder.create(builder.getUnknownLoc(), channelDir, diff --git a/programming_examples/basic/memtile_repeat/distribute_repeat/CMakeLists.txt b/programming_examples/basic/memtile_repeat/distribute_repeat/CMakeLists.txt new file mode 100644 index 0000000000..5da03ef2b2 --- /dev/null +++ b/programming_examples/basic/memtile_repeat/distribute_repeat/CMakeLists.txt @@ -0,0 +1,75 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 Advanced Micro Devices, Inc. 
+ +# parameters +# -DBOOST_ROOT: Path to Boost install +# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo +# -DXRT_LIB_DIR: Path to xrt_coreutil.lib +# -DTARGET_NAME: Target name to be built + +# cmake needs this line +cmake_minimum_required(VERSION 3.1) + +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + +find_program(WSL NAMES powershell.exe) + +if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) + set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") + set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") +else() + set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") + set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") +endif() + +set(TARGET_NAME test CACHE STRING "Target to be built") + +SET (ProjectName proj_${TARGET_NAME}) +SET (currentTarget ${TARGET_NAME}) + +if ( WSL ) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR}) +endif () + +project(${ProjectName}) + +# Find packages +find_package(Boost REQUIRED) + +add_executable(${currentTarget} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../runtime_lib/test_lib/test_utils.cpp + test.cpp +) + +target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) + +target_include_directories (${currentTarget} PUBLIC + ${XRT_INC_DIR} + ${Boost_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../runtime_lib/test_lib +) + +target_link_directories(${currentTarget} PUBLIC + ${XRT_LIB_DIR} + ${Boost_LIBRARY_DIRS} +) + +if (NOT WSL) + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + boost_program_options + boost_filesystem + ) +else() + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + ) +endif() diff --git 
a/programming_examples/basic/memtile_repeat/distribute_repeat/Makefile b/programming_examples/basic/memtile_repeat/distribute_repeat/Makefile new file mode 100644 index 0000000000..470206c9a6 --- /dev/null +++ b/programming_examples/basic/memtile_repeat/distribute_repeat/Makefile @@ -0,0 +1,69 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## + +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +include ${srcdir}/../../../makefile-common + +all: build/final.xclbin build/insts.txt + +devicename ?= npu +targetname = distribute_repeat +LENGTH ?= 36 + +build/aie.mlir: ${srcdir}/aie2.py + mkdir -p ${@D} + python3 $< ${LENGTH} ${devicename} ${col} > $@ + +.PHONY: inst/insts.txt +inst/insts.txt: ${srcdir}/aie2.py + rm -rf inst + mkdir -p inst + python3 $< ${LENGTH} > inst/aie.mlir + pushd inst && aiecc.py --aie-only-generate-npu --npu-insts-name=insts.txt aie.mlir && popd + ${powershell} ./build/${targetname}.exe -x build/final.xclbin -i inst/insts.txt -k MLIR_AIE -l ${LENGTH} + +build/final.xclbin: build/aie.mlir + mkdir -p ${@D} + cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) + +${targetname}.exe: ${srcdir}/test.cpp + rm -rf _build + mkdir -p _build + cd _build && ${powershell} cmake ${srcdir} -DTARGET_NAME=${targetname} + cd _build && ${powershell} cmake --build . 
--config Release +ifeq "${powershell}" "powershell.exe" + cp _build/${targetname}.exe $@ +else + cp _build/${targetname} $@ +endif + +run: ${targetname}.exe build/final.xclbin build/insts.txt + ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -l ${LENGTH} + +# Changing variables when we target VCK5000 +vck5000: devicename=xcvc1902 +vck5000: col=6 + +vck5000: build/aie.mlir + aiecc.py --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \ + -I/opt/xaiengine/include \ + -I${srcdir}/../../../../install/runtime_lib/x86_64-hsa/test_lib/include \ + -L/opt/xaiengine/lib \ + -L/lib/x86_64-linux-gnu/ \ + ${srcdir}/test_vck5000.cpp \ + ${srcdir}/../../../../install/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \ + -Wl,-R/opt/xaiengine/lib \ + -Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf + +run_vck5000: + test.elf + +clean: + rm -rf build _build inst ${targetname}.exe diff --git a/programming_examples/basic/memtile_repeat/distribute_repeat/README.md b/programming_examples/basic/memtile_repeat/distribute_repeat/README.md new file mode 100644 index 0000000000..31d03b9d59 --- /dev/null +++ b/programming_examples/basic/memtile_repeat/distribute_repeat/README.md @@ -0,0 +1,21 @@ + + +# Distribute Repeat + +This reference design can be run on a Ryzen™ AI NPU. + +In the [design](./aie2.py) data is brought from external memory via the `ShimTile` to the `MemTile`, where an `object_fifo_link` operation distributes it in two halves to two compute tiles. Both output object FIFOs of that link call `set_memtile_repeat(6)`, so the `MemTile` sends each half seven times in total (the original transfer plus six repeats). Each compute tile increments the values it receives (by 1 and by 2, respectively), and a second `object_fifo_link` joins the results in the `MemTile` before they are sent back to external memory. + +To compile and run the design for NPU: +``` +make +make run +``` \ No newline at end of file diff --git a/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py b/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py new file mode 100644 index 0000000000..1cf5752701 --- /dev/null +++ b/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py @@ -0,0 +1,114 @@ +# memtile_repeat/distribute_repeat/aie2.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.dialects.ext import memref, arith +from aie.extras.context import mlir_mod_ctx + +dev = AIEDevice.npu1_1col +col = 0 +N = 36 + +if len(sys.argv) > 1: + N = int(sys.argv[1]) + +if len(sys.argv) > 2: + if sys.argv[2] == "npu": + dev = AIEDevice.npu1_1col + elif sys.argv[2] == "xcvc1902": + dev = AIEDevice.xcvc1902 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[2])) + +if len(sys.argv) > 3: + col = int(sys.argv[3]) + +repeat_counter = 6 +out_size = N * (repeat_counter + 1) + + +def distribute_repeat(): + with mlir_mod_ctx() as ctx: + + @device(dev) + def device_body(): + memRef_in_ty = T.memref(N, T.i32()) + memRef_out_ty = T.memref(out_size, T.i32()) + memRef_18_ty = T.memref(N // 2, T.i32()) + + # Tile declarations + ShimTile = tile(col, 0) + MemTile = tile(col, 1) + ComputeTile2 = tile(col, 2) + ComputeTile3 = tile(col, 3) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, MemTile, 1, memRef_in_ty) + of_in2 = object_fifo("in2", MemTile, ComputeTile2, 2, memRef_18_ty) + of_in3 = object_fifo("in3", MemTile, ComputeTile3, 2, memRef_18_ty) + of_in2.set_memtile_repeat(repeat_counter) + of_in3.set_memtile_repeat(repeat_counter) + object_fifo_link(of_in, [of_in2, of_in3], [], [0, N // 2]) + + of_out2 = object_fifo("out2", ComputeTile2, MemTile, 2, memRef_18_ty) + of_out3 = object_fifo("out3", ComputeTile3, MemTile, 2, memRef_18_ty) + of_out = object_fifo("out", MemTile, ShimTile, 1, memRef_out_ty) + object_fifo_link([of_out2, of_out3], of_out, [0, out_size // 2], []) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2) + def core_body(): + for _ in for_(sys.maxsize): + elemOut = of_out2.acquire(ObjectFifoPort.Produce, 1) + elemIn = 
of_in2.acquire(ObjectFifoPort.Consume, 1) + for i in for_(N // 2): + v0 = memref.load(elemIn, [i]) + v1 = arith.addi(v0, arith.constant(1, T.i32())) + memref.store(v1, elemOut, [i]) + yield_([]) + of_in2.release(ObjectFifoPort.Consume, 1) + of_out2.release(ObjectFifoPort.Produce, 1) + yield_([]) + + # Compute tile 3 + @core(ComputeTile3) + def core_body(): + for _ in for_(sys.maxsize): + elemOut = of_out3.acquire(ObjectFifoPort.Produce, 1) + elemIn = of_in3.acquire(ObjectFifoPort.Consume, 1) + for i in for_(N // 2): + v0 = memref.load(elemIn, [i]) + v1 = arith.addi(v0, arith.constant(2, T.i32())) + memref.store(v1, elemOut, [i]) + yield_([]) + of_in3.release(ObjectFifoPort.Consume, 1) + of_out3.release(ObjectFifoPort.Produce, 1) + yield_([]) + + # To/from AIE-array data movement + tensor_out_ty = T.memref(out_size, T.i32()) + tensor_in_ty = T.memref(N, T.i32()) + + @runtime_sequence(tensor_in_ty, tensor_in_ty, tensor_out_ty) + def sequence(A, B, C): + npu_dma_memcpy_nd( + metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, out_size] + ) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +distribute_repeat() diff --git a/programming_examples/basic/memtile_repeat/distribute_repeat/run_makefile.lit b/programming_examples/basic/memtile_repeat/distribute_repeat/run_makefile.lit new file mode 100644 index 0000000000..3f4f5c1c05 --- /dev/null +++ b/programming_examples/basic/memtile_repeat/distribute_repeat/run_makefile.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile +// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s +// XFAIL: * diff --git a/programming_examples/basic/memtile_repeat/distribute_repeat/test.cpp b/programming_examples/basic/memtile_repeat/distribute_repeat/test.cpp new file mode 100644 index 0000000000..d026919d0d --- /dev/null +++ b/programming_examples/basic/memtile_repeat/distribute_repeat/test.cpp @@ -0,0 +1,199 @@ +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +namespace po = boost::program_options; + +void check_arg_file_exists(po::variables_map &vm_in, std::string name) { + if (!vm_in.count(name)) { + throw std::runtime_error("Error: no " + name + " file was provided\n"); + } else { + std::ifstream test(vm_in[name].as()); + if (!test) { + throw std::runtime_error("The " + name + " file " + + vm_in[name].as() + + " does not exist.\n"); + } + } +} + +std::vector load_instr_sequence(std::string instr_path) { + std::ifstream instr_file(instr_path); + std::string line; + std::vector instr_v; + while (std::getline(instr_file, line)) { + std::istringstream iss(line); + uint32_t a; + if (!(iss >> std::hex >> a)) { + throw std::runtime_error("Unable to parse instruction file\n"); + } + instr_v.push_back(a); + } + return instr_v; +} + +int main(int argc, const char *argv[]) { + // Program arguments parsing + 
po::options_description desc("Allowed options"); + desc.add_options()("help,h", "produce help message")( + "xclbin,x", po::value()->required(), + "the input xclbin path")( + "kernel,k", po::value()->required(), + "the kernel name in the XCLBIN (for instance PP_PRE_FD)")( + "verbosity,v", po::value()->default_value(0), + "the verbosity of the output")( + "instr,i", po::value()->required(), + "path of file containing userspace instructions to be sent to the LX6")( + "length,l", po::value()->default_value(4096), + "the length of the transfer in int32_t"); + po::variables_map vm; + + try { + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + if (vm.count("help")) { + std::cout << desc << std::endl; + return 1; + } + } catch (const std::exception &ex) { + std::cerr << ex.what() << "\n\n"; + std::cerr << "Usage:\n" << desc << std::endl; + return 1; + } + + check_arg_file_exists(vm, "xclbin"); + check_arg_file_exists(vm, "instr"); + + std::vector instr_v = + load_instr_sequence(vm["instr"].as()); + + int verbosity = vm["verbosity"].as(); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << std::endl; + + int N = vm["length"].as(); + int R = 6; + int O = N * (R + 1); + + // Start the XRT test code + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + if (verbosity >= 1) + std::cout << "Loading xclbin: " << vm["xclbin"].as() + << std::endl; + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + + if (verbosity >= 1) + std::cout << "Kernel opcode: " << vm["kernel"].as() + << std::endl; + std::string Node = vm["kernel"].as(); + + // Get the kernel from the xclbin + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + std::cout << "Name: " << name << std::endl; + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); 
+ + if (verbosity >= 1) + std::cout << "Registering xclbin: " << vm["xclbin"].as() + << "\n"; + + device.register_xclbin(xclbin); + + // get a hardware context + if (verbosity >= 1) + std::cout << "Getting hardware context." << std::endl; + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + if (verbosity >= 1) + std::cout << "Getting handle to kernel:" << kernelName << std::endl; + auto kernel = xrt::kernel(context, kernelName); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + auto bo_inA = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(2)); + auto bo_inB = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(3)); + auto bo_out = xrt::bo(device, O * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(4)); + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects." << std::endl; + + int32_t *bufInA = bo_inA.map(); + std::vector srcVecA; + for (int i = 0; i < N; i++) + srcVecA.push_back(1); + memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t))); + + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + if (verbosity >= 1) + std::cout << "Running Kernel." << std::endl; + auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + run.wait(); + + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + uint32_t *bufOut = bo_out.map(); + + int errors = 0; + int repeat_pattern_limit_1 = O / 2; + + for (uint32_t i = 0; i < O; i++) { + uint32_t ref = i; + if (i < repeat_pattern_limit_1) + ref = 2; + else + ref = 3; + if (*(bufOut + i) != ref) { + std::cout << "error at index[" << i << "]: expected " << ref << " got " + << *(bufOut + i) << std::endl; + errors++; + } + } + + if (!errors) { + std::cout << std::endl << "PASS!" 
<< std::endl << std::endl; + return 0; + } else { + std::cout << std::endl + << errors << " mismatches." << std::endl + << std::endl; + std::cout << std::endl << "fail." << std::endl << std::endl; + return 1; + } +} diff --git a/programming_examples/basic/memtile_repeat/simple_repeat/CMakeLists.txt b/programming_examples/basic/memtile_repeat/simple_repeat/CMakeLists.txt new file mode 100644 index 0000000000..5da03ef2b2 --- /dev/null +++ b/programming_examples/basic/memtile_repeat/simple_repeat/CMakeLists.txt @@ -0,0 +1,75 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 Advanced Micro Devices, Inc. + +# parameters +# -DBOOST_ROOT: Path to Boost install +# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo +# -DXRT_LIB_DIR: Path to xrt_coreutil.lib +# -DTARGET_NAME: Target name to be built + +# cmake needs this line +cmake_minimum_required(VERSION 3.1) + +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + +find_program(WSL NAMES powershell.exe) + +if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) + set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") + set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") +else() + set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") + set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") +endif() + +set(TARGET_NAME test CACHE STRING "Target to be built") + +SET (ProjectName proj_${TARGET_NAME}) +SET (currentTarget ${TARGET_NAME}) + +if ( WSL ) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE 
${CMAKE_BINARY_DIR}) +endif () + +project(${ProjectName}) + +# Find packages +find_package(Boost REQUIRED) + +add_executable(${currentTarget} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../runtime_lib/test_lib/test_utils.cpp + test.cpp +) + +target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) + +target_include_directories (${currentTarget} PUBLIC + ${XRT_INC_DIR} + ${Boost_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../runtime_lib/test_lib +) + +target_link_directories(${currentTarget} PUBLIC + ${XRT_LIB_DIR} + ${Boost_LIBRARY_DIRS} +) + +if (NOT WSL) + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + boost_program_options + boost_filesystem + ) +else() + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + ) +endif() diff --git a/programming_examples/basic/memtile_repeat/simple_repeat/Makefile b/programming_examples/basic/memtile_repeat/simple_repeat/Makefile new file mode 100644 index 0000000000..1461af0f86 --- /dev/null +++ b/programming_examples/basic/memtile_repeat/simple_repeat/Makefile @@ -0,0 +1,69 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## + +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +include ${srcdir}/../../../makefile-common + +all: build/final.xclbin build/insts.txt + +devicename ?= npu +targetname = simple_repeat +LENGTH ?= 4096 + +build/aie.mlir: ${srcdir}/aie2.py + mkdir -p ${@D} + python3 $< ${LENGTH} ${devicename} ${col} > $@ + +.PHONY: inst/insts.txt +inst/insts.txt: ${srcdir}/aie2.py + rm -rf inst + mkdir -p inst + python3 $< ${LENGTH} > inst/aie.mlir + pushd inst && aiecc.py --aie-only-generate-npu --npu-insts-name=insts.txt aie.mlir && popd + ${powershell} ./build/${targetname}.exe -x build/final.xclbin -i inst/insts.txt -k MLIR_AIE -l ${LENGTH} + +build/final.xclbin: build/aie.mlir + mkdir -p ${@D} + cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) + +${targetname}.exe: ${srcdir}/test.cpp + rm -rf _build + mkdir -p _build + cd _build && ${powershell} cmake ${srcdir} -DTARGET_NAME=${targetname} + cd _build && ${powershell} cmake --build . 
--config Release +ifeq "${powershell}" "powershell.exe" + cp _build/${targetname}.exe $@ +else + cp _build/${targetname} $@ +endif + +run: ${targetname}.exe build/final.xclbin build/insts.txt + ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -l ${LENGTH} + +# Changing variables when we target VCK5000 +vck5000: devicename=xcvc1902 +vck5000: col=6 + +vck5000: build/aie.mlir + aiecc.py --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \ + -I/opt/xaiengine/include \ + -I${srcdir}/../../../../install/runtime_lib/x86_64-hsa/test_lib/include \ + -L/opt/xaiengine/lib \ + -L/lib/x86_64-linux-gnu/ \ + ${srcdir}/test_vck5000.cpp \ + ${srcdir}/../../../../install/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \ + -Wl,-R/opt/xaiengine/lib \ + -Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf + +run_vck5000: + test.elf + +clean: + rm -rf build _build inst ${targetname}.exe diff --git a/programming_examples/basic/memtile_repeat/simple_repeat/README.md b/programming_examples/basic/memtile_repeat/simple_repeat/README.md new file mode 100644 index 0000000000..dd7beb07c9 --- /dev/null +++ b/programming_examples/basic/memtile_repeat/simple_repeat/README.md @@ -0,0 +1,29 @@ + + +# Simple Repeat + +This reference design can be run on a Ryzen™ AI NPU. + +In the [design](./aie2.py) data is brought from external memory via the `ShimTile` to the `MemTile` and back by using an implicit copy via the `MemTile`'s Data Movement Accelerator (DMA). Furthermore, the input data is repeated by the `MemTile` three times which results in the output data consisting of four instances of the input data. + +The implicit copy is performed using the `object_fifo_link` operation that specifies how input data arriving via `of_in` should be sent further via `of_out` by specifically leveraging the `MemTile`'s DMA. 
This operation and its functionality are described in more depth in [Section-2b](../../../../programming_guide/section-2/section-2b/03_Link_Distribute_Join/README.md#object-fifo-link) of the programming guide. + +The repeat count is specified as follows: +``` +of_out.set_memtile_repeat(3) +``` +Specifically, the instruction above sets how many times the producer side of the `of_out` object FIFO repeats each transfer in addition to the original one. + +To compile and run the design for NPU: +``` +make +make run +``` \ No newline at end of file diff --git a/programming_examples/basic/memtile_repeat/simple_repeat/aie2.py b/programming_examples/basic/memtile_repeat/simple_repeat/aie2.py new file mode 100644 index 0000000000..21244483df --- /dev/null +++ b/programming_examples/basic/memtile_repeat/simple_repeat/aie2.py @@ -0,0 +1,68 @@ +# memtile_repeat/simple_repeat/aie2.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.dialects.ext import memref, arith +from aie.extras.context import mlir_mod_ctx + +N = 4096 +dev = AIEDevice.npu1_1col +col = 0 + +if len(sys.argv) > 1: + N = int(sys.argv[1]) + +if len(sys.argv) > 2: + if sys.argv[2] == "npu": + dev = AIEDevice.npu1_1col + elif sys.argv[2] == "xcvc1902": + dev = AIEDevice.xcvc1902 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[2])) + +if len(sys.argv) > 3: + col = int(sys.argv[3]) + + +def simple_repeat(): + with mlir_mod_ctx() as ctx: + + @device(dev) + def device_body(): + memRef_ty = T.memref(1024, T.i32()) + + # Tile declarations + ShimTile = tile(col, 0) + MemTile = tile(col, 1) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, MemTile, 1, memRef_ty) + of_out = object_fifo("out", MemTile, ShimTile, 1, memRef_ty) + of_out.set_memtile_repeat(3) + object_fifo_link(of_in, of_out) + + # To/from AIE-array data movement + tensor_ty = T.memref(N, T.i32()) + tensor_in_ty = T.memref(N // 4, T.i32()) + + @runtime_sequence(tensor_in_ty, tensor_ty, tensor_ty) + def sequence(A, B, C): + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd( + metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N // 4] + ) + npu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +simple_repeat() diff --git a/programming_examples/basic/memtile_repeat/simple_repeat/run_makefile.lit b/programming_examples/basic/memtile_repeat/simple_repeat/run_makefile.lit new file mode 100644 index 0000000000..3f4f5c1c05 --- /dev/null +++ b/programming_examples/basic/memtile_repeat/simple_repeat/run_makefile.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile +// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s +// XFAIL: * diff --git a/programming_examples/basic/memtile_repeat/simple_repeat/test.cpp b/programming_examples/basic/memtile_repeat/simple_repeat/test.cpp new file mode 100644 index 0000000000..c788ecdd91 --- /dev/null +++ b/programming_examples/basic/memtile_repeat/simple_repeat/test.cpp @@ -0,0 +1,197 @@ +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +namespace po = boost::program_options; + +void check_arg_file_exists(po::variables_map &vm_in, std::string name) { + if (!vm_in.count(name)) { + throw std::runtime_error("Error: no " + name + " file was provided\n"); + } else { + std::ifstream test(vm_in[name].as()); + if (!test) { + throw std::runtime_error("The " + name + " file " + + vm_in[name].as() + + " does not exist.\n"); + } + } +} + +std::vector load_instr_sequence(std::string instr_path) { + std::ifstream instr_file(instr_path); + std::string line; + std::vector instr_v; + while (std::getline(instr_file, line)) { + std::istringstream iss(line); + uint32_t a; + if (!(iss >> std::hex >> a)) { + throw std::runtime_error("Unable to parse instruction file\n"); + } + instr_v.push_back(a); + } + return instr_v; +} + +int main(int argc, const char *argv[]) { + // Program arguments parsing + 
po::options_description desc("Allowed options"); + desc.add_options()("help,h", "produce help message")( + "xclbin,x", po::value()->required(), + "the input xclbin path")( + "kernel,k", po::value()->required(), + "the kernel name in the XCLBIN (for instance PP_PRE_FD)")( + "verbosity,v", po::value()->default_value(0), + "the verbosity of the output")( + "instr,i", po::value()->required(), + "path of file containing userspace instructions to be sent to the LX6")( + "length,l", po::value()->default_value(4096), + "the length of the transfer in int32_t"); + po::variables_map vm; + + try { + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + if (vm.count("help")) { + std::cout << desc << std::endl; + return 1; + } + } catch (const std::exception &ex) { + std::cerr << ex.what() << "\n\n"; + std::cerr << "Usage:\n" << desc << std::endl; + return 1; + } + + check_arg_file_exists(vm, "xclbin"); + check_arg_file_exists(vm, "instr"); + + std::vector instr_v = + load_instr_sequence(vm["instr"].as()); + + int verbosity = vm["verbosity"].as(); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << std::endl; + + int N = vm["length"].as(); + if ((N % 1024)) { + std::cerr << "Length must be a multiple of 1024." 
<< std::endl; + return 1; + } + int repeat_pattern_size = N / 4; + + // Start the XRT test code + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + if (verbosity >= 1) + std::cout << "Loading xclbin: " << vm["xclbin"].as() + << std::endl; + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + + if (verbosity >= 1) + std::cout << "Kernel opcode: " << vm["kernel"].as() + << std::endl; + std::string Node = vm["kernel"].as(); + + // Get the kernel from the xclbin + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + std::cout << "Name: " << name << std::endl; + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); + + if (verbosity >= 1) + std::cout << "Registering xclbin: " << vm["xclbin"].as() + << "\n"; + + device.register_xclbin(xclbin); + + // get a hardware context + if (verbosity >= 1) + std::cout << "Getting hardware context." << std::endl; + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + if (verbosity >= 1) + std::cout << "Getting handle to kernel:" << kernelName << std::endl; + auto kernel = xrt::kernel(context, kernelName); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + auto bo_inA = xrt::bo(device, repeat_pattern_size * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + auto bo_inB = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(3)); + auto bo_out = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(4)); + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects." 
<< std::endl; + + int32_t *bufInA = bo_inA.map(); + std::vector srcVecA; + for (int i = 0; i < repeat_pattern_size; i++) + srcVecA.push_back(i); + memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t))); + + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + if (verbosity >= 1) + std::cout << "Running Kernel." << std::endl; + auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + run.wait(); + + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + uint32_t *bufOut = bo_out.map(); + + int errors = 0; + + for (uint32_t i = 0; i < N; i++) { + uint32_t ref = i % repeat_pattern_size; + if (*(bufOut + i) != ref) { + std::cout << "error at index[" << i << "]: expected " << ref << " got " + << *(bufOut + i) << std::endl; + errors++; + } + } + + if (!errors) { + std::cout << std::endl << "PASS!" << std::endl << std::endl; + return 0; + } else { + std::cout << std::endl + << errors << " mismatches." << std::endl + << std::endl; + std::cout << std::endl << "fail." << std::endl << std::endl; + return 1; + } +} diff --git a/python/dialects/aie.py b/python/dialects/aie.py index ff8175fd04..a592ce6f6e 100644 --- a/python/dialects/aie.py +++ b/python/dialects/aie.py @@ -325,6 +325,10 @@ def set_via_shared_mem(self, port): int_num = IntegerAttr.get(T.i32(), num) self.attributes["via_shared_mem"] = int_num + def set_memtile_repeat(self, num): + int_num = IntegerAttr.get(T.i32(), num) + self.attributes["memtile_repeat"] = int_num + # Create an aie objectFifo_link between input and output objectFifos. 
class object_fifo_link(ObjectFifoLinkOp): diff --git a/test/objectFifo-stateful-transform/memtile_repeat_test.mlir b/test/objectFifo-stateful-transform/memtile_repeat_test.mlir new file mode 100644 index 0000000000..f1533c47f8 --- /dev/null +++ b/test/objectFifo-stateful-transform/memtile_repeat_test.mlir @@ -0,0 +1,127 @@ +//===- memtile_repeat_test.mlir --------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s + +// CHECK: module @memtileRepeat { +// CHECK: aie.device(npu1) { +// CHECK: memref.global "public" @of2_cons : memref<16xi32> +// CHECK: memref.global "public" @of2 : memref<16xi32> +// CHECK: memref.global "public" @of1_cons : memref<16xi32> +// CHECK: memref.global "public" @of1 : memref<16xi32> +// CHECK: memref.global "public" @of0_cons : memref<32xi32> +// CHECK: memref.global "public" @of0 : memref<32xi32> +// CHECK: %tile_1_0 = aie.tile(1, 0) +// CHECK: %tile_1_1 = aie.tile(1, 1) +// CHECK: %tile_1_2 = aie.tile(1, 2) +// CHECK: %tile_3_3 = aie.tile(3, 3) +// CHECK: %of2_cons_buff_0 = aie.buffer(%tile_3_3) {sym_name = "of2_cons_buff_0"} : memref<16xi32> +// CHECK: %of2_cons_buff_1 = aie.buffer(%tile_3_3) {sym_name = "of2_cons_buff_1"} : memref<16xi32> +// CHECK: %of2_cons_prod_lock = aie.lock(%tile_3_3, 0) {init = 2 : i32, sym_name = "of2_cons_prod_lock"} +// CHECK: %of2_cons_cons_lock = aie.lock(%tile_3_3, 1) {init = 0 : i32, sym_name = "of2_cons_cons_lock"} +// CHECK: %of1_cons_buff_0 = aie.buffer(%tile_1_2) {sym_name = "of1_cons_buff_0"} : memref<16xi32> +// CHECK: %of1_cons_buff_1 = aie.buffer(%tile_1_2) {sym_name = "of1_cons_buff_1"} : 
memref<16xi32> +// CHECK: %of1_cons_prod_lock = aie.lock(%tile_1_2, 0) {init = 2 : i32, sym_name = "of1_cons_prod_lock"} +// CHECK: %of1_cons_cons_lock = aie.lock(%tile_1_2, 1) {init = 0 : i32, sym_name = "of1_cons_cons_lock"} +// CHECK: %of0_cons_buff_0 = aie.buffer(%tile_1_1) {sym_name = "of0_cons_buff_0"} : memref<32xi32> +// CHECK: %of0_cons_buff_1 = aie.buffer(%tile_1_1) {sym_name = "of0_cons_buff_1"} : memref<32xi32> +// CHECK: %of0_cons_prod_lock = aie.lock(%tile_1_1, 0) {init = 12 : i32, sym_name = "of0_cons_prod_lock"} +// CHECK: %of0_cons_cons_lock = aie.lock(%tile_1_1, 1) {init = 0 : i32, sym_name = "of0_cons_cons_lock"} +// CHECK: %of0_prod_lock = aie.lock(%tile_1_0, 0) {init = 1 : i32, sym_name = "of0_prod_lock"} +// CHECK: %of0_cons_lock = aie.lock(%tile_1_0, 1) {init = 0 : i32, sym_name = "of0_cons_lock"} +// CHECK: aie.flow(%tile_1_0, DMA : 0, %tile_1_1, DMA : 0) +// CHECK: aie.flow(%tile_1_1, DMA : 0, %tile_1_2, DMA : 0) +// CHECK: aie.flow(%tile_1_1, DMA : 1, %tile_3_3, DMA : 0) +// CHECK: aie.shim_dma_allocation @of0(MM2S, 0, 1) +// CHECK: %memtile_dma_1_1 = aie.memtile_dma(%tile_1_1) { +// CHECK: %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) +// CHECK: ^bb1: // 2 preds: ^bb0, ^bb2 +// CHECK: aie.use_lock(%of0_cons_prod_lock, AcquireGreaterEqual, 6) +// CHECK: aie.dma_bd(%of0_cons_buff_0 : memref<32xi32>, 0, 32) +// CHECK: aie.use_lock(%of0_cons_cons_lock, Release, 6) +// CHECK: aie.next_bd ^bb2 +// CHECK: ^bb2: // pred: ^bb1 +// CHECK: aie.use_lock(%of0_cons_prod_lock, AcquireGreaterEqual, 6) +// CHECK: aie.dma_bd(%of0_cons_buff_1 : memref<32xi32>, 0, 32) +// CHECK: aie.use_lock(%of0_cons_cons_lock, Release, 6) +// CHECK: aie.next_bd ^bb1 +// CHECK: ^bb3: // pred: ^bb0 +// CHECK: %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6, repeat_count = 2) +// CHECK: ^bb4: // 2 preds: ^bb3, ^bb5 +// CHECK: aie.use_lock(%of0_cons_cons_lock, AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%of0_cons_buff_0 : memref<32xi32>, 0, 16) +// CHECK: aie.use_lock(%of0_cons_prod_lock, 
Release, 1) +// CHECK: aie.next_bd ^bb5 +// CHECK: ^bb5: // pred: ^bb4 +// CHECK: aie.use_lock(%of0_cons_cons_lock, AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%of0_cons_buff_1 : memref<32xi32>, 0, 16) +// CHECK: aie.use_lock(%of0_cons_prod_lock, Release, 1) +// CHECK: aie.next_bd ^bb4 +// CHECK: ^bb6: // pred: ^bb3 +// CHECK: %2 = aie.dma_start(MM2S, 1, ^bb7, ^bb9, repeat_count = 2) +// CHECK: ^bb7: // 2 preds: ^bb6, ^bb8 +// CHECK: aie.use_lock(%of0_cons_cons_lock, AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%of0_cons_buff_0 : memref<32xi32>, 16, 16) +// CHECK: aie.use_lock(%of0_cons_prod_lock, Release, 1) +// CHECK: aie.next_bd ^bb8 +// CHECK: ^bb8: // pred: ^bb7 +// CHECK: aie.use_lock(%of0_cons_cons_lock, AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%of0_cons_buff_1 : memref<32xi32>, 16, 16) +// CHECK: aie.use_lock(%of0_cons_prod_lock, Release, 1) +// CHECK: aie.next_bd ^bb7 +// CHECK: ^bb9: // pred: ^bb6 +// CHECK: aie.end +// CHECK: } +// CHECK: %mem_1_2 = aie.mem(%tile_1_2) { +// CHECK: %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) +// CHECK: ^bb1: // 2 preds: ^bb0, ^bb2 +// CHECK: aie.use_lock(%of1_cons_prod_lock, AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%of1_cons_buff_0 : memref<16xi32>, 0, 16) +// CHECK: aie.use_lock(%of1_cons_cons_lock, Release, 1) +// CHECK: aie.next_bd ^bb2 +// CHECK: ^bb2: // pred: ^bb1 +// CHECK: aie.use_lock(%of1_cons_prod_lock, AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%of1_cons_buff_1 : memref<16xi32>, 0, 16) +// CHECK: aie.use_lock(%of1_cons_cons_lock, Release, 1) +// CHECK: aie.next_bd ^bb1 +// CHECK: ^bb3: // pred: ^bb0 +// CHECK: aie.end +// CHECK: } +// CHECK: %mem_3_3 = aie.mem(%tile_3_3) { +// CHECK: %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) +// CHECK: ^bb1: // 2 preds: ^bb0, ^bb2 +// CHECK: aie.use_lock(%of2_cons_prod_lock, AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%of2_cons_buff_0 : memref<16xi32>, 0, 16) +// CHECK: aie.use_lock(%of2_cons_cons_lock, Release, 1) +// CHECK: aie.next_bd ^bb2 +// CHECK: ^bb2: // 
pred: ^bb1 +// CHECK: aie.use_lock(%of2_cons_prod_lock, AcquireGreaterEqual, 1) +// CHECK: aie.dma_bd(%of2_cons_buff_1 : memref<16xi32>, 0, 16) +// CHECK: aie.use_lock(%of2_cons_cons_lock, Release, 1) +// CHECK: aie.next_bd ^bb1 +// CHECK: ^bb3: // pred: ^bb0 +// CHECK: aie.end +// CHECK: } +// CHECK: } +// CHECK: } + +module @memtileRepeat { + aie.device(npu1) { + %tile10 = aie.tile(1, 0) + %tile11 = aie.tile(1, 1) + %tile12 = aie.tile(1, 2) + %tile33 = aie.tile(3, 3) + + aie.objectfifo @of0 (%tile10, {%tile11}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @of1 (%tile11, {%tile12}, 2 : i32) {memtile_repeat = 2 : i32} : !aie.objectfifo> + aie.objectfifo @of2 (%tile11, {%tile33}, 2 : i32) {memtile_repeat = 2 : i32} : !aie.objectfifo> + aie.objectfifo.link [@of0] -> [@of1, @of2] ([] [0, 16]) + } +} diff --git a/test/python/objFifo.py b/test/python/objFifo.py index 586a3929d3..254f07d98d 100644 --- a/test/python/objFifo.py +++ b/test/python/objFifo.py @@ -12,9 +12,10 @@ Device, Core, end, + buffer, ) from aie.extras.dialects.ext import memref, arith -from aie.ir import InsertionPoint, TypeAttr, Block +from aie.ir import InsertionPoint, TypeAttr, Block, IntegerAttr, IntegerType from util import construct_and_print_module @@ -22,10 +23,12 @@ # CHECK: module { # CHECK: aie.device(xcve2302) { # CHECK: %tile_0_0 = aie.tile(0, 0) +# CHECK: %tile_0_1 = aie.tile(0, 1) # CHECK: %tile_1_2 = aie.tile(1, 2) # CHECK: %tile_1_3 = aie.tile(1, 3) # CHECK: aie.objectfifo @of0(%tile_0_0, {%tile_1_2}, 2 : i32) : !aie.objectfifo> -# CHECK: aie.objectfifo @of1(%tile_1_2, {%tile_1_3}, 2 : i32) {via_shared_mem = 1 : i32} : !aie.objectfifo> +# CHECK: aie.objectfifo @of1(%tile_0_1, {%tile_1_2}, 2 : i32) {memtile_repeat = 4 : i32} : !aie.objectfifo> +# CHECK: aie.objectfifo @of2(%tile_1_2, {%tile_1_3}, 2 : i32) {via_shared_mem = 1 : i32} : !aie.objectfifo> # CHECK: %core_1_2 = aie.core(%tile_1_2) { # CHECK: %0 = aie.objectfifo.acquire @of0(Consume, 1) : !aie.objectfifosubview> # CHECK: %1 = 
aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<256xi32> @@ -37,18 +40,23 @@ # CHECK: } # CHECK: } # CHECK: } + + @construct_and_print_module def objFifo_example(): dev = Device(AIEDevice.xcve2302) dev_block = Block.create_at_start(dev.body_region) with InsertionPoint(dev_block): S = tile(0, 0) + M = tile(0, 1) T_ = tile(1, 2) C_ = tile(1, 3) of0 = object_fifo("of0", S, T_, 2, T.memref(256, T.i32())) - of1 = object_fifo("of1", T_, C_, 2, T.memref(256, T.i32())) - of1.set_via_shared_mem(ObjectFifoPort.Consume) + of1 = object_fifo("of1", M, T_, 2, T.memref(256, T.i32())) + of1.set_memtile_repeat(4) + of2 = object_fifo("of2", T_, C_, 2, T.memref(256, T.i32())) + of2.set_via_shared_mem(ObjectFifoPort.Consume) C = Core(T_) bb = Block.create_at_start(C.body)