Skip to content

Commit

Permalink
Linear transfer without transformation but with repeat (#1882)
Browse files Browse the repository at this point in the history
  • Loading branch information
hunhoffe authored Oct 25, 2024
1 parent 8329b6c commit 088876d
Show file tree
Hide file tree
Showing 6 changed files with 225 additions and 28 deletions.
4 changes: 4 additions & 0 deletions include/aie/Dialect/AIE/IR/AIEOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -857,6 +857,10 @@ def AIE_DMABDOp: AIE_Op<"dma_bd", []> {
// access/store element at/to index (i * 16 /*stride_2*/ + j * 1 /*stride_1*/ + k * 2 /*stride_0*/)
```

Note that an additional dimension of sizes/strides is accepted (5th dimension for memtiles, 4th otherwise);
the additional size value is interpreted as a repeat count whereas the additional stride value is
interpreted as an iteration stride.

#### Important gotcha regarding strides

All strides are expressed in multiples of the element width (just like `len` and `offset`)
Expand Down
20 changes: 8 additions & 12 deletions lib/Dialect/AIEX/IR/AIEXDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,6 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType,
return forOp->emitOpError(msg.str());
}

if (skipTransformationChecks) {
return success();
}

for (int i = 0; i < 3; i++) {
if (inputSizes[i] > 1 && inputStrides[i] < 1) {
// If inputSize[i] == 1, anything is allowable in the stride, since that
Expand All @@ -198,8 +194,8 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType,
<< i << " must be a positive integer.";
}
}
// A value of zero is allowable for the fourth-dimension stride, as such a
// "repeat" can be accomplished by setting size==1 and repeat_count=size.
// A value of zero is allowable for the fourth-dimension stride
// (this indicates an iteration stride of 0 for the repeat)
if (inputSizes[3] > 1 && inputStrides[3] < 0) {
return forOp->emitOpError("Stride 3 must be a non-negative integer.");
}
Expand All @@ -219,7 +215,7 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType,
}
}

if (hardwareSizes[0] > (1 << wrap_bits) - 1)
if (!skipTransformationChecks && hardwareSizes[0] > (1 << wrap_bits) - 1)
return forOp->emitOpError(
"Size 0 exceeds the [0:" + std::to_string((1 << wrap_bits) - 1) +
"] range.");
Expand Down Expand Up @@ -322,9 +318,10 @@ int64_t AIEX::NpuDmaMemcpyNdOp::getOffsetInBytes() {
return offset;
}

// dma_memcpy_nd transfers of the form [1, 1, 1, len][0, 0, 0, 1] do not
// dma_memcpy_nd transfers of the form [*, 1, 1, len][*, 0, 0, 1] do not
// specify any data layout transformation, but simply express a contiguous
// transfer of `len`.
// transfer of `len`. We exclude checks to 4th dimension, because repeat count
// is still possible without a data layout transformation.
bool AIEX::NpuDmaMemcpyNdOp::isLinearTransferWithoutTransformation() {
llvm::SmallVector<int64_t, 4> inputSizes =
llvm::map_to_vector(llvm::reverse(getMixedSizes()), [](OpFoldResult s) {
Expand All @@ -334,9 +331,8 @@ bool AIEX::NpuDmaMemcpyNdOp::isLinearTransferWithoutTransformation() {
llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) {
return getConstantIntValue(s).value();
});
return (inputSizes[1] == 1 && inputSizes[2] == 1 && inputSizes[3] == 1 &&
inputStrides[0] == 1 && inputStrides[1] == 0 &&
inputStrides[2] == 0 && inputStrides[3] == 0);
return (inputSizes[1] == 1 && inputSizes[2] == 1 && inputStrides[0] == 1 &&
inputStrides[1] == 0 && inputStrides[2] == 0);
}

LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
Expand Down
29 changes: 14 additions & 15 deletions lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -413,23 +413,22 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {

// d2_stride
d2_stride = IntegerAttr::get(i32ty, strides[2]);

// iteration_current, iteration_size, iteration_stride, repeat_count
if (inputSizes[3] > 1) {
if (inputStrides[3] > 0) {
iteration_size = IntegerAttr::get(i32ty, sizes[3]);
iteration_stride = IntegerAttr::get(i32ty, strides[3]);
} else {
// We allow users to encode the repeat_count as a dimension 3 stride
// of 0. This must lower to an iteration wrap of 0, so no stride is
// ever added. We then repeat the BD using the repeat_count in
// NpuPushQueueOp.
iteration_size = zero;
iteration_stride = zero;
}
}
// iteration_current, iteration_size, iteration_stride, repeat_count
if (inputSizes[3] > 1) {
if (inputStrides[3] > 0) {
iteration_size = IntegerAttr::get(i32ty, sizes[3]);
iteration_stride = IntegerAttr::get(i32ty, strides[3]);
} else {
// We allow users to encode the repeat_count as a dimension 3 stride
// of 0. This must lower to an iteration wrap of 0, so no stride is
// ever added. We then repeat the BD using the repeat_count in
// NpuPushQueueOp.
iteration_size = zero;
iteration_stride = zero;
}
repeat_count = IntegerAttr::get(i32ty, sizes[3]);
}
repeat_count = IntegerAttr::get(i32ty, sizes[3]);

// next_bd

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,6 @@ The signature of the `aie.runtime_sequence()` operation lists as its arguments a
* For each `tile_row` in the current row block:
* The DMA transfer function `npu_dma_memcpy_nd` loads a segment of matrix A and matrix B data (submatrix a, submatrix b) from the host into the corresponding `inA_fifos` for the respective column, maintaining the appropriate strides and offsets.
* Analogously to the data layout transformations described [further above](#tiling-and-data-layout-transformations) to translate a `m`&times;`k` matrix into blocks of `r`&times;`s`-submatrices, this transfer translates the input `M`&times;`K` and `K`&times;`N` matrices into submatrices of size `m`&times;`k` and `k`&times;`n`.
> Note that data layout transformations in the `npu_dma_memcpy_nd` operation are expressed in units of 4 bytes. This is why you will see all strides and the lowest-dimension length multiplied by a factor of `word_size_in` or `word_size_out` (to get the size in bytes) and then divided by four (to get the size in units of 4 bytes). This discrepancy will be streamlined in future versions.
* The DMA transfer function `npu_dma_memcpy_nd` sends a segment of matrix C data (submatrix c) from the corresponding `outC_fifos` for the respective column, back to the host while maintaining the appropriate strides and offsets.
* After completing DMA transfers for each column, `dma_wait` is used to synchronize their completion.

Expand Down
77 changes: 77 additions & 0 deletions test/npu-xrt/nd_memcpy_linear_repeat/aie2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 AMD Inc.

# REQUIRES: ryzen_ai, valid_xchess_license
#
# RUN: %python %S/aie2.py > ./aie2.mlir
# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
# RUN: %run_on_npu ./test.exe | FileCheck %s
# CHECK: PASS!

import numpy as np
from aie.extras.context import mlir_mod_ctx

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.helpers.dialects.ext.scf import _for as range_

dtype = np.int16
repeat_count = 3
a_len = 2048
c_len = a_len * repeat_count


def design():
    """Build the passthrough design and print its MLIR module to stdout.

    The design places a shim tile at (0, 0) and a compute tile at (0, 2),
    connected by two depth-2 object FIFOs whose objects hold ``a_len``
    elements of ``dtype``. The core copies one ``fifo_a`` object into one
    ``fifo_c`` object, ``repeat_count`` times per outer-loop iteration.

    The runtime sequence feeds ``A`` through a linear transfer whose
    outermost size is ``repeat_count`` (with an outermost stride of 0), and
    drains ``c_len`` contiguous elements into ``C``.
    """
    with mlir_mod_ctx() as ctx:

        @device(AIEDevice.npu1_4col)
        def device_body():
            # Host-visible buffer types: input is a_len long, output c_len.
            a_ty = np.ndarray[(a_len,), np.dtype[dtype]]
            c_ty = np.ndarray[(c_len,), np.dtype[dtype]]

            shim_tile = tile(0, 0)
            compute_tile = tile(0, 2)

            # Both FIFOs carry a_len-sized objects; the output FIFO is
            # simply filled repeat_count times per input buffer.
            fifo_a = object_fifo("fifo_a", shim_tile, compute_tile, 2, a_ty)
            fifo_c = object_fifo("fifo_c", compute_tile, shim_tile, 2, a_ty)

            # Core: element-wise copy of each incoming object.
            @core(compute_tile)
            def core_body():
                for _ in range_(0, 0xFFFFFFFF):
                    for _rep in range_(repeat_count):
                        dst = fifo_c.acquire(ObjectFifoPort.Produce, 1)
                        src = fifo_a.acquire(ObjectFifoPort.Consume, 1)
                        for idx in range_(a_len):
                            dst[idx] = src[idx]
                        fifo_a.release(ObjectFifoPort.Consume, 1)
                        fifo_c.release(ObjectFifoPort.Produce, 1)

            # To/from AIE-array data movement.
            @runtime_sequence(a_ty, a_ty, c_ty)
            def sequence(A, _B, C):
                # Input: contiguous a_len elements, outermost size used as
                # the repeat count.
                in_sizes = [repeat_count, 1, 1, a_len]
                in_strides = [0, 0, 0, 1]
                # Output: one contiguous transfer of c_len elements.
                out_sizes = [1, 1, 1, c_len]
                out_strides = [0, 0, 0, 1]

                npu_dma_memcpy_nd(
                    metadata=fifo_a,
                    bd_id=1,
                    mem=A,
                    sizes=in_sizes,
                    strides=in_strides,
                )
                npu_dma_memcpy_nd(
                    metadata=fifo_c,
                    bd_id=0,
                    mem=C,
                    sizes=out_sizes,
                    strides=out_strides,
                )
                dma_wait(fifo_c)

        print(ctx.module)


design()
122 changes: 122 additions & 0 deletions test/npu-xrt/nd_memcpy_linear_repeat/test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// (c) Copyright 2024 AMD Inc.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

#include "test_utils.h"

#ifndef XCLBIN
#define XCLBIN "final.xclbin"
#endif

#ifndef INSTS_TXT
#define INSTS_TXT "insts.txt"
#endif

#ifndef KERNEL_NAME
#define KERNEL_NAME "MLIR_AIE"
#endif

// Element type used for both the input (A) and output (C) host buffers.
#define DTYPE int16_t
#define A_DATATYPE DTYPE
#define C_DATATYPE DTYPE

// Element counts. main() expects the output to contain the A_LEN input
// elements REPEAT_COUNT times in a row (it compares buf_c[i] against
// buf_a[i % A_LEN]), hence C_LEN = A_LEN * REPEAT_COUNT.
#define A_LEN 2048
#define REPEAT_COUNT 3
#define C_LEN (A_LEN * REPEAT_COUNT)

// Buffer-object sizes passed to xrt::bo. B is allocated with the same size
// as A.
#define A_SIZE (A_LEN * sizeof(A_DATATYPE)) // in bytes
#define B_SIZE A_SIZE // in bytes
#define C_SIZE (C_LEN * sizeof(C_DATATYPE)) // in bytes

int main(int argc, const char *argv[]) {

std::vector<uint32_t> instr_v = test_utils::load_instr_sequence(INSTS_TXT);
assert(instr_v.size() > 0);

// Get a device handle
unsigned int device_index = 0;
xrt::device device = xrt::device(device_index);

// Load the xclbin
xrt::xclbin xclbin = xrt::xclbin(XCLBIN);

// Get the kernel from the xclbin
std::vector<xrt::xclbin::kernel> xkernels = xclbin.get_kernels();
xrt::xclbin::kernel xkernel = *std::find_if(
xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) {
return k.get_name().rfind(KERNEL_NAME, 0) == 0;
});
std::string kernel_name = xkernel.get_name();
assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0);

device.register_xclbin(xclbin);

// get a hardware context
xrt::hw_context context(device, xclbin.get_uuid());

// get a kernel handle
auto kernel = xrt::kernel(context, kernel_name);

auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
auto bo_a =
xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
auto bo_b =
xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
auto bo_c =
xrt::bo(device, C_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));

A_DATATYPE *buf_a = bo_a.map<A_DATATYPE *>();
for (int i = 0; i < A_SIZE / sizeof(buf_a[0]); i++) {
buf_a[i] = 2 * i; // even
}
C_DATATYPE *buf_c = bo_c.map<C_DATATYPE *>();
memset(buf_c, 0, C_SIZE);

// Instruction buffer for DMA configuration
void *bufInstr = bo_instr.map<void *>();
memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));

bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_a.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_b.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_c.sync(XCL_BO_SYNC_BO_TO_DEVICE);

unsigned int opcode = 3;
auto run = kernel(opcode, bo_instr, instr_v.size(), bo_a, bo_b, bo_c);
ert_cmd_state r = run.wait();
if (r != ERT_CMD_STATE_COMPLETED) {
std::cout << "Kernel did not complete. Returned status: " << r << "\n";
return 1;
}

bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE);

int errors = 0;
for (int i = 0; i < C_SIZE / sizeof(buf_c[0]); i++) {
std::cout << std::setw(4) << (long)buf_c[i] << " ";
if (buf_c[i] != buf_a[i % A_LEN]) {
errors += 1;
}
}
std::cout << std::endl;

if (errors == 0) {
std::cout << "PASS!" << std::endl;
} else {
std::cout << "FAIL." << std::endl;
}

return 0;
}

0 comments on commit 088876d

Please sign in to comment.