diff --git a/example/gemm/CMakeLists.txt b/example/gemm/CMakeLists.txt index be0954f..f885dfc 100644 --- a/example/gemm/CMakeLists.txt +++ b/example/gemm/CMakeLists.txt @@ -16,7 +16,7 @@ function(add_aie_gemm_xclbin m k n NUM_COLUMNS B_COL_MAJ C_COL_MAJ TRACE_SIZE AR add_aie_design(${XCLBIN_NAME} XCLBIN_ONLY PYTHON gemm.py - PYTHON_FLAGS --dev ${DEVICE} -M ${DUMMY_M} -K ${DUMMY_K} -N ${DUMMY_N} -m ${m} -k ${k} -n ${n} --n-aie-cols ${NUM_COLUMNS} --b-col-maj ${B_COL_MAJ} --c-col-maj ${C_COL_MAJ} --dtype_in ${INPUT_DATA_TYPE_PYTHON} --dtype_out ${OUTPUT_DATA_TYPE_PYTHON} --trace_size ${TRACE_SIZE} ${EMULATE_STR} --prio-accuracy --output-file-path ${CMAKE_BINARY_DIR}/aie/${XCLBIN_NAME}.mlir + PYTHON_FLAGS --dev ${DEVICE} -M ${DUMMY_M} -K ${DUMMY_K} -N ${DUMMY_N} -m ${m} -k ${k} -n ${n} --n-aie-cols ${NUM_COLUMNS} --b-col-maj ${B_COL_MAJ} --c-col-maj ${C_COL_MAJ} --dtype_in ${INPUT_DATA_TYPE_PYTHON} --dtype_out ${OUTPUT_DATA_TYPE_PYTHON} --trace_size ${TRACE_SIZE} ${EMULATE_STR} ${PRIO_ACC_STR} --output-file-path ${CMAKE_BINARY_DIR}/aie/${XCLBIN_NAME}.mlir AIE_CORE_KERNELS ${ARCHIVE_NAME} EXTRA_AIECC_FLAGS --dynamic-objFifos OUTPUT_XCLBIN GEMM_XCLBIN_${XCLBIN_NAME}) @@ -43,7 +43,7 @@ function(add_aie_gemm_design M K N m k n NUM_COLUMNS B_COL_MAJ C_COL_MAJ TRACE_S add_aie_design(${EXAMPLE} INSTS_ONLY PYTHON gemm.py - PYTHON_FLAGS --dev ${DEVICE} -M ${M} -K ${K} -N ${N} -m ${m} -k ${k} -n ${n} --n-aie-cols ${NUM_COLUMNS} --b-col-maj ${B_COL_MAJ} --c-col-maj ${C_COL_MAJ} --dtype_in ${INPUT_DATA_TYPE_PYTHON} --dtype_out ${OUTPUT_DATA_TYPE_PYTHON} --trace_size ${TRACE_SIZE} ${EMULATE_STR} --prio-accuracy --output-file-path ${CMAKE_BINARY_DIR}/aie/${EXAMPLE}.mlir + PYTHON_FLAGS --dev ${DEVICE} -M ${M} -K ${K} -N ${N} -m ${m} -k ${k} -n ${n} --n-aie-cols ${NUM_COLUMNS} --b-col-maj ${B_COL_MAJ} --c-col-maj ${C_COL_MAJ} --dtype_in ${INPUT_DATA_TYPE_PYTHON} --dtype_out ${OUTPUT_DATA_TYPE_PYTHON} --trace_size ${TRACE_SIZE} ${EMULATE_STR} ${PRIO_ACC_STR} --output-file-path ${CMAKE_BINARY_DIR}/aie/${EXAMPLE}.mlir EXTRA_AIECC_FLAGS --dynamic-objFifos OUTPUT_INSTS GEMM_INSTS) @@ -69,7 +69,8 @@ function(add_aie_gemm_design M K N m k n NUM_COLUMNS B_COL_MAJ C_COL_MAJ TRACE_S "PASS!" 
METRICS "Latency" [=[Latency \(us\): (?P\d+)]=] - "Bandwidth" [=[Effective Bandwidth: (?P[\d\.e\+-]+) GB/s]=]) + "Bandwidth" [=[Effective Bandwidth: (?P[\d\.e\+-]+) GB/s]=] + "GFLOP/s" [=[Throughput:\s*(?P[\d\.e\+-]+) GFLOP/s]=]) endfunction() set(M_LIST "2048") @@ -86,15 +87,22 @@ set(ARCHIVE_NAME "gemm_${m}x${k}x${n}_archive.a") set(AIE_BUILD_DIR ${CMAKE_BINARY_DIR}/aie) set(ARCHIVE_PATH "${AIE_BUILD_DIR}/${ARCHIVE_NAME}") +set(PRIO_ACCURACY True) set(EMULATE_BFLOAT16_MMUL_WITH_BFP16 False) set (B_COL_MAJ 0) set (C_COL_MAJ 0) -set(MM_KERNEL_DEFINES "DIM_M=${m}" "DIM_K=${k}" "DIM_N=${n}" "bf16_f32_ONLY" "ROUND_CONV_EVEN") +set(MM_KERNEL_DEFINES "DIM_M=${m}" "DIM_K=${k}" "DIM_N=${n}" "ROUND_CONV_EVEN") if (EMULATE_BFLOAT16_MMUL_WITH_BFP16) set(EMULATE_STR --emulate-bf16-mmul-with-bfp16) list(APPEND MM_KERNEL_DEFINES "AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16") endif() +if (PRIO_ACCURACY) + set(PRIO_ACC_STR --prio-accuracy) + list(APPEND MM_KERNEL_DEFINES "bf16_f32_ONLY") +else() + list(APPEND MM_KERNEL_DEFINES "bf16_bf16_ONLY") +endif() if (B_COL_MAJ) list(APPEND MM_KERNEL_DEFINES "B_COL_MAJ") endif() diff --git a/example/gemm/gemm.cpp b/example/gemm/gemm.cpp index 7fb8e25..0fc09ea 100644 --- a/example/gemm/gemm.cpp +++ b/example/gemm/gemm.cpp @@ -160,6 +160,11 @@ int main(int argc, const char *argv[]) double bandwidth_GBps = (total_bytes / (1024 * 1024 * 1024)) / (npu_time * 1e-6); std::cout << "Effective Bandwidth: " << bandwidth_GBps << " GB/s" << std::endl; + // Cast to unsigned long long: with M/K/N=2048 this is ~17*10^9 ops, which overflows a 32-bit int + unsigned long long n_ops = static_cast<unsigned long long>(M) * K * N * 2; + float throughput = n_ops / npu_time / 1e3; // GFLOP/s + std::cout << "Throughput: " << throughput << " GFLOP/s" << std::endl; + std::bfloat16_t *bufOut1 = bo_out.map<std::bfloat16_t *>(); // Compare with golden reference diff --git a/example/gemm/gemm.py b/example/gemm/gemm.py index b80fc32..7232960 100644 --- a/example/gemm/gemm.py +++ b/example/gemm/gemm.py @@ -1,20 +1,28 @@ # SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 -import argparse -import numpy as np +from ml_dtypes import bfloat16 from pathlib import Path -from aie.extras.context import mlir_mod_ctx - -from aie.dialects.aie import * -from aie.dialects.aiex import * -from aie.dialects.scf import FlatSymbolRefAttr -import aie.dialects.index as index_dialect -from aie.helpers.dialects.ext.scf import _for as range_ -from aie.helpers.taplib import TensorAccessPattern, TensorAccessSequence - -from aie.iron import str_to_dtype +import numpy as np +import argparse +import sys + +from aie.iron import ( + Kernel, + ObjectFifo, + Program, + GlobalBuffer, + Runtime, + Worker, + WorkerRuntimeBarrier, + LocalBuffer, + str_to_dtype, +) +from aie.iron.placers import SequentialPlacer +from aie.iron.device import NPU1Col1, NPU1Col2, NPU1, NPU2, Tile +from aie.helpers.taplib import TensorAccessSequence, TensorTiler2D +from aie.iron.controlflow import range_ microkernel_mac_dim_map = { @@ -74,34 +82,33 @@ def main(): ) args = argparser.parse_args() - with mlir_mod_ctx() as ctx: - maybe_taps = my_matmul( - args.dev, - args.M, - args.K, - args.N, - args.m, - args.k, - args.n, - args.n_aie_cols, - args.dtype_in, - args.dtype_out, - args.b_col_maj, - args.c_col_maj, - args.scalar, - args.emulate_bf16_mmul_with_bfp16, - args.prio_accuracy, - args.trace_size, - args.generate_taps, - ) + maybe_module = my_matmul( + args.dev, + args.M, + args.K, + args.N, + args.m, + args.k, + args.n, + args.n_aie_cols, + args.dtype_in, + args.dtype_out, + args.b_col_maj, + args.c_col_maj, + args.scalar, + args.emulate_bf16_mmul_with_bfp16, + args.prio_accuracy, + args.trace_size, + args.generate_taps, + ) + if args.generate_taps: + return maybe_module + else: output_file_path = Path(args.output_file_path) with open(output_file_path, "w") as f: - f.write(str(ctx.module)) - - if args.generate_taps: - return maybe_taps + f.write(str(maybe_module)) def ceildiv(a, b): @@ -128,30 +135,49 @@ def my_matmul( generate_taps=False, ): n_aie_rows = 4 - n_aie_cores = n_aie_rows * n_aie_cols - use_larger_internal_buffer = dtype_out_str == "bf16" and prio_accuracy dtype_in = str_to_dtype(dtype_in_str) - # Accumulate in place with f32 buffer that will be converted to bf16 for output transfer to L2 - dtype_out_internal = ( - str_to_dtype("f32") - if use_larger_internal_buffer - else str_to_dtype(dtype_in_str) - ) - dtype_out_transfer = str_to_dtype(dtype_out_str) + dtype_out = str_to_dtype(dtype_out_str) + + # When using more AIE columns than n_aie_rows (4) (applicable to NPU2), + # restrict the number of shim/mem tiles to n_aie_rows, + # since we have only n_aie_rows row tiles for matrix A + # When using n_aie_rows (4) or less AIE columns (both NPU and NPU2), + # the number of shim/mem tiles are equal to n_aie_cols. 
+ # We use the distribute pattern in object FIFO (see linking for A below), + # since we have n_aie_rows (4) row tiles for matrix A + n_shim_mem_A = min(n_aie_cols, n_aie_rows) + + # Integer division when n_aie_cols < 4, otherwise set to 1 + n_A_tiles_per_shim = n_aie_rows // n_aie_cols if n_aie_cols < 4 else 1 + + mem_tile_m_A = m * n_A_tiles_per_shim + mem_tile_m_C = m * n_aie_rows + mem_tile_n = n * n_aie_cols + + if prio_accuracy: + assert ( + dtype_out_str == "bf16" + ), f"prio_accuracy flag is a feature only for bfloat16 output data types" + use_larger_internal_buffer = True + # If prio_accuracy flag is enabled, gemm for bfloat16 will accumulate in place with a f32 buffer, + # which will be converted to bf16 after the reduction loop finishes for output transfer to L2 + dtype_out_internal = str_to_dtype("f32") + assert np.issubdtype(dtype_in, np.integer) == np.issubdtype( + dtype_out_internal, np.integer + ), f"Input dtype ({dtype_in}) and output dtype ({dtype_out_internal}) must either both be integral or both be float" + assert ( + np.dtype(dtype_out_internal).itemsize >= np.dtype(dtype_in).itemsize + ), f"Output dtype ({dtype_out_internal}) must be equal or larger to input dtype ({dtype_in})" + else: + use_larger_internal_buffer = False assert np.issubdtype(dtype_in, np.integer) == np.issubdtype( - dtype_out_internal, np.integer - ), f"Input dtype ({dtype_in}) and output dtype ({dtype_out_internal}) must either both be integral or both be float" - assert np.issubdtype(dtype_in, np.integer) == np.issubdtype( - dtype_out_transfer, np.integer - ), f"Input dtype ({dtype_in}) and output dtype ({dtype_out_transfer}) must either both be integral or both be float" + dtype_out, np.integer + ), f"Input dtype ({dtype_in}) and output dtype ({dtype_out}) must either both be integral or both be float" assert ( - np.dtype(dtype_out_internal).itemsize >= np.dtype(dtype_in).itemsize - ), f"Output dtype ({dtype_out_internal}) must be equal or larger to input dtype ({dtype_in})" - assert ( - np.dtype(dtype_out_transfer).itemsize >= np.dtype(dtype_in).itemsize - ), f"Output dtype ({dtype_out_transfer}) must be equal or larger to input dtype ({dtype_in})" + np.dtype(dtype_out).itemsize >= np.dtype(dtype_in).itemsize + ), f"Output dtype ({dtype_out}) must be equal or larger to input dtype ({dtype_in})" # r, s, t are the dimensions required by the microkernel MAC instructions. mac_dims = microkernel_mac_dim_map[dev][dtype_in_str] @@ -175,8 +201,8 @@ def my_matmul( # rows, s.t. each of the n_rows compute cores in a column receives a # contiguous (m, k)-sized block of A. assert ( - M % (m * n_aie_rows) == 0 - ), """A must be tileable into (m * n_aie_rows, k)-sized blocks""" + M % mem_tile_m_A == 0 + ), """A must be tileable into (m * n_A_tiles_per_shim, k)-sized blocks""" # Both A and B are tiled in the K dimension into size k. assert K % k == 0 @@ -185,9 +211,18 @@ def my_matmul( # Conceptually, we do the same as with A, but instead of broadcasting # across columns we broadcast across rows and distribute across columns. assert ( - N % (n * n_aie_cols) == 0 + N % mem_tile_n == 0 ), """B must be tileable into (k, n * n_aie_cols)-sized blocks""" + # Output matrix C: + # Conceptually, we divide output C into (m * n_rows, n)-sized blocks. These + # blocks are _distributed_ across AIE core columns, and _joined_ across + # rows, s.t. each of the n_rows compute cores in a column send a + # contiguous (m, n)-sized block of C. 
+ assert ( + M % mem_tile_m_C == 0 + ), """C must be tileable into (m * n_aie_rows, n)-sized blocks""" + # r, s, t are the dimensions required by the microkernel MAC instructions. if not use_scalar: assert m % r == 0 @@ -200,32 +235,15 @@ def my_matmul( # a big performance cost. fifo_depth = 2 - n_tiles_per_core = (M // m) * (N // n) // n_aie_cores - - # When using more AIE columns than n_aie_rows (4) (applicable to NPU2), - # restrict the number of shim/mem tiles to n_aie_rows, - # since we have only n_aie_rows row tiles for matrix A - if n_aie_cols > n_aie_rows: - n_shim_mem_A = n_aie_rows - # When using n_aie_rows (4) or less AIE columns (both NPU and NPU2), - # the number of shim/mem tiles are equal to n_aie_cols. - # We use the distribute pattern in object FIFO (see linking for A below), - # since we have n_aie_rows (4) row tiles for matrix A - else: - n_shim_mem_A = n_aie_cols - - # Integer division when n_aie_cols < 4, otherwise set to 1 - n_A_tiles_per_shim = n_aie_rows // n_aie_cols if n_aie_cols < 4 else 1 - if dev == "npu": if n_aie_cols == 1: - dev_ty = AIEDevice.npu1_1col + dev_ty = NPU1Col1() elif n_aie_cols == 2: - dev_ty = AIEDevice.npu1_2col + dev_ty = NPU1Col2() elif n_aie_cols == 4: - dev_ty = AIEDevice.npu1 + dev_ty = NPU1() else: - dev_ty = AIEDevice.npu2 + dev_ty = NPU2() # These will hold TensorAccessPattern objects that represent the runtime # npu_dma_memcpy_nd operations of this design. They are only used if generate_taps is true @@ -233,490 +251,418 @@ def my_matmul( B_taps = [] C_taps = [] - @device(dev_ty) - def device_body(): - A_l2_ty = np.ndarray[(m * k * n_A_tiles_per_shim,), np.dtype[dtype_in]] - B_l2_ty = np.ndarray[(k * n,), np.dtype[dtype_in]] - C_l2_ty = np.ndarray[(m * n * n_aie_rows,), np.dtype[dtype_out_transfer]] - A_l1_ty = np.ndarray[(m, k), np.dtype[dtype_in]] - B_l1_ty = np.ndarray[(k, n), np.dtype[dtype_in]] + # Define tensor types + A_ty = np.ndarray[(M * K,), np.dtype[dtype_in]] + B_ty = np.ndarray[(K * N,), np.dtype[dtype_in]] + C_ty = np.ndarray[(M * N,), np.dtype[dtype_out]] + A_l2_ty = np.ndarray[(mem_tile_m_A * k,), np.dtype[dtype_in]] + B_l2_ty = np.ndarray[(k * n,), np.dtype[dtype_in]] + C_l2_ty = np.ndarray[(mem_tile_m_C * n,), np.dtype[dtype_out]] + A_l1_ty = np.ndarray[(m, k), np.dtype[dtype_in]] + B_l1_ty = np.ndarray[(k, n), np.dtype[dtype_in]] + C_l1_ty = np.ndarray[(m, n), np.dtype[dtype_out]] + + # AIE Core Function declarations + scalar_suffix = "_scalar" if use_scalar else "" + if use_larger_internal_buffer: + # Fix fifo depth for C objfifo to 1 since 1 buffer will be used for accumulation + # and another for transfer to L2 + fifo_depth_out = 1 + # Set the type for accumulation C_l1_ty_internal = np.ndarray[(m, n), np.dtype[dtype_out_internal]] - C_l1_ty_transfer = np.ndarray[(m, n), np.dtype[dtype_out_transfer]] - - # AIE Core Function declarations - scalar_suffix = "_scalar" if use_scalar else "" - zero = external_func( - f"zero{scalar_suffix}_{'f32' if use_larger_internal_buffer else dtype_out_str}", - inputs=[C_l1_ty_internal], + # A kernel to convert from the internal f32 accumulation to bf16 for transfer to L2 is needed + convert_copy_kernel = Kernel( + f"convert_copy_f32_to_bf16", + f"gemm_{m}x{k}x{n}_archive.a", + [C_l1_ty_internal, C_l1_ty, np.int32], ) - matmul = external_func( - f"matmul{scalar_suffix}_{dtype_in_str}_{'f32' if use_larger_internal_buffer else dtype_out_str}", - inputs=[A_l1_ty, B_l1_ty, C_l1_ty_internal], + # Fix the kernels to use f32 outputs + zero_kernel = Kernel( + f"zero{scalar_suffix}_f32", + 
f"gemm_{m}x{k}x{n}_archive.a", + [C_l1_ty_internal], + ) + matmul_func_name = f"matmul{scalar_suffix}_{dtype_in_str}_f32" + matmul_kernel = Kernel( + matmul_func_name, + f"gemm_{m}x{k}x{n}_archive.a", + [A_l1_ty, B_l1_ty, C_l1_ty_internal], + ) + else: + # No need to use separate buffers for accumulation and transfer to L2, so + # we only need the zero and matmul kernels + fifo_depth_out = fifo_depth + zero_kernel = Kernel( + f"zero{scalar_suffix}_{dtype_out_str}", + f"gemm_{m}x{k}x{n}_archive.a", + [C_l1_ty], + ) + matmul_func_name = f"matmul{scalar_suffix}_{dtype_in_str}_{dtype_out_str}" + matmul_kernel = Kernel( + matmul_func_name, + f"gemm_{m}x{k}x{n}_archive.a", + [A_l1_ty, B_l1_ty, C_l1_ty], ) - if use_larger_internal_buffer: - convert_copy = external_func( - f"convert_copy_f32_to_bf16", - inputs=[C_l1_ty_internal, C_l1_ty_transfer, np.int32], - ) - - # Tile declarations as tile[row][col] - tiles = [ - [tile(col, row) for col in range(0, n_aie_cols)] for row in range(0, 6) - ] - shim_tiles = tiles[0] - mem_tiles = tiles[1] - core_tiles = tiles[2:] - # AIE-array data movement with object fifos - A_l3l2_fifos = [None] * n_shim_mem_A - A_l2l1_fifos = [None] * n_aie_rows + # Tile declarations as tile[row][col] + tiles = [[(col, row) for col in range(0, n_aie_cols)] for row in range(0, 6)] + core_tiles = tiles[2:] - B_l3l2_fifos = [None] * n_aie_cols - B_l2l1_fifos = [None] * n_aie_cols + # AIE-array data movement with object fifos + A_l3l2_fifos = [None] * n_shim_mem_A + A_l2l1_fifos = [None] * n_aie_rows - C_l1_fifos = [[None] * n_aie_cols for _ in range(n_aie_rows)] - C_l1l2_fifos = [[None] * n_aie_cols for _ in range(n_aie_rows)] - C_l2l3_fifos = [None] * n_aie_cols + B_l3l2_fifos = [None] * n_aie_cols + B_l2l1_fifos = [None] * n_aie_cols - # Run-time parameters - rtp_locks = [[None] * n_aie_cols for _ in range(4)] - rtp_bufs = [[None] * n_aie_cols for _ in range(4)] - for col in range(n_aie_cols): - for row in range(n_aie_rows): - # RTP index 0: K // k - # RTP index 1: num tiles == M * N // (m * n * n_aie_cores) - rtp_bufs[row][col] = buffer( - core_tiles[row][col], - datatype=T.memref(3, T.i32()), - name=f"rtp_{row}_{col}", - ) - rtp_locks[row][col] = lock( - core_tiles[row][col], sym_name=f"rtp_lock_{row}_{col}", init=0 - ) + C_l1l2_fifos = [[None] * n_aie_cols for _ in range(n_aie_rows)] + C_l2l3_fifos = [None] * n_aie_cols - # Input A - # L3 -> L2 data movement - for i in range(n_shim_mem_A): - A_l3l2_fifos[i] = object_fifo( - f"A_L3L2_{i}", - ( - shim_tiles[2 * i] if n_aie_cols == 8 else shim_tiles[i] + # Runtime parameters + rtps = [ + [ + GlobalBuffer( + np.ndarray[(2,), np.dtype[np.int32]], + name=f"rtp{row}_{col}", + initial_value=np.array([0, 0], dtype=np.int32), + use_write_rtp=True, + ) + for col in range(n_aie_cols) + ] + for row in range(n_aie_rows) + ] + + # Create barriers to synchronize individual workers with the runtime sequence + workerBarriers = [ + [WorkerRuntimeBarrier() for col in range(n_aie_cols)] + for row in range(n_aie_rows) + ] + + # Input A + for i in range(n_shim_mem_A): + A_l3l2_fifos[i] = ObjectFifo(A_l2_ty, name=f"A_L3L2_{i}", depth=fifo_depth) + # If n_shim_mem_A == n_rows, n_A_tiles_per_shim is 1 and + # this simply links a_l3l2_fifos[i] to a_l2l1_fifos[i] directly, + # If n_shim_mem_A < n_rows, each column receives multiple rows of + # tiles; distribute it along rows of AIE cores. 
+ start_row = i * n_A_tiles_per_shim + stop_row = start_row + n_A_tiles_per_shim + of_offsets = [m * k * j for j in range(stop_row - start_row)] + dims_to_stream = [ + [ + (m // r, r * k), + (k // s, s), + (r, k), + (s, 1), + ] + ] * (stop_row - start_row) + a_tmp_fifos = ( + A_l3l2_fifos[i] + .cons() + .split( + of_offsets, + obj_types=[A_l1_ty] * (stop_row - start_row), + names=[f"A_L2L1_{row}" for row in range(start_row, stop_row)], + dims_to_stream=dims_to_stream, + placement=Tile( + 2 * i if n_aie_cols == 8 else i, 1 ), # alternate columns in full 4x8 NPU2 case - mem_tiles[2 * i] if n_aie_cols == 8 else mem_tiles[i], - fifo_depth, - A_l2_ty, ) + ) - # L2 -> L1 data movement - for row in range(n_aie_rows): - A_l2l1_fifos[row] = object_fifo( - f"A_L2L1_{row}", - ( - mem_tiles[2 * row] - if n_aie_cols == 8 - else mem_tiles[row // n_A_tiles_per_shim] - ), - core_tiles[row][0:n_aie_cols], # broadcast along one row - fifo_depth, - A_l1_ty, - ( - [ - (m // r, r * k), - (k // s, s), - (r, k), - (s, 1), - ] - if not use_scalar - else [] - ), + for j in range(stop_row - start_row): + A_l2l1_fifos[j + start_row] = a_tmp_fifos[j] + + # Input B + for col in range(n_aie_cols): + B_l3l2_fifos[col] = ObjectFifo(B_l2_ty, name=f"B_L3L2_{col}", depth=fifo_depth) + if b_col_maj: + dims_to_stream = [(n // t, t * k), (k // s, s), (t, k), (s, 1)] + else: + dims_to_stream = [(k // s, s * n), (n // t, t), (s, n), (t, 1)] + B_l2l1_fifos[col] = ( + B_l3l2_fifos[col] + .cons() + .forward( + obj_type=B_l1_ty, + name=f"B_L2L1_{col}", + dims_to_stream=dims_to_stream, + placement=Tile(col, 1), ) + ) - # A_l3_l2 and A_l2_l1 object FIFO linking - for i in range(n_shim_mem_A): - # If n_shim_mem_A == n_rows, n_A_tiles_per_shim is 1 and - # this simply links a_l3l2_fifos[i] to a_l2l1_fifos[i] directly, - # If n_shim_mem_A < n_rows, each column receives multiple rows of - # tiles; distribute it along rows of AIE cores. 
- start_row = i * n_A_tiles_per_shim - stop_row = start_row + n_A_tiles_per_shim - if stop_row - start_row > 1: - of_offsets = [m * k * j for j in range(stop_row - start_row)] - else: - of_offsets = [] - object_fifo_link( - A_l3l2_fifos[i], - [A_l2l1_fifos[j] for j in range(start_row, stop_row)], - [], + # Output C + C_l2l3_fifos[col] = ObjectFifo( + C_l2_ty, + name=f"C_L2L3_{col}", + depth=fifo_depth, + dims_to_stream=[(m // r, r * n), (r, t), (n // t, r * t), (t, 1)], + ) + of_offsets = [m * n * i for i in range(n_aie_rows)] + + # join along one column + c_tmp_fifos = ( + C_l2l3_fifos[col] + .prod() + .join( of_offsets, + obj_types=[C_l1_ty] * n_aie_rows, + names=[f"C_L1L2_{col}_{row}" for row in range(n_aie_rows)], + depths=[fifo_depth_out] * n_aie_rows, + placement=Tile(col, 1), ) + ) + for j in range(n_aie_rows): + C_l1l2_fifos[j][col] = c_tmp_fifos[j] - # Input B - for col in range(n_aie_cols): - # L3 -> L2 data movement - B_l3l2_fifos[col] = object_fifo( - f"B_L3L2_{col}", - shim_tiles[col], - mem_tiles[col], - fifo_depth, - B_l2_ty, - ) - # L2 -> L1 data movement - B_l2l1_fifos[col] = object_fifo( - f"B_L2L1_{col}", - mem_tiles[col], - [ - core_tiles[j][col] for j in range(n_aie_rows) - ], # broadcast along one column - fifo_depth, - B_l1_ty, - ( - ( - [ - (k // s, s * n), - (n // t, t), - (s, n), - (t, 1), - ] - if not b_col_maj - else [ - (n // t, t * k), - (k // s, s), - (t, k), - (s, 1), - ] - ) - if not use_scalar - else [] - ), + # Tasks for each worker to perform + def core_fn(in_a, in_b, out_c, zero, matmul, convert_copy, my_rtp, barrier): + if use_larger_internal_buffer: + elem_out_internal = LocalBuffer( + type=C_l1_ty_internal, ) - # B_l3_l2 and B_l2_l1 object FIFO linking - object_fifo_link(B_l3l2_fifos[col], B_l2l1_fifos[col]) + barrier.wait_for_value(1) + rtp_K_div_k = my_rtp[0] + rtp_n_tiles_per_core = my_rtp[1] + loop = range(1) # Workaround for issue #1547 + if rtp_n_tiles_per_core > 1: + loop = range_(rtp_n_tiles_per_core) + for _ in loop: + if not use_larger_internal_buffer: + elem_out_internal = out_c.acquire(1) + zero(elem_out_internal) + + for _ in range_(rtp_K_div_k): + elem_in_a = in_a.acquire(1) + elem_in_b = in_b.acquire(1) + matmul(elem_in_a, elem_in_b, elem_out_internal) + in_a.release(1) + in_b.release(1) + + if use_larger_internal_buffer: + elem_out_transfer = out_c.acquire(1) + convert_copy(elem_out_internal, elem_out_transfer, m * n) + out_c.release(1) + else: + out_c.release(1) - # Output C + # Set up compute tiles + workers = [] + for row in range(n_aie_rows): for col in range(n_aie_cols): - for row in range(n_aie_rows): - if use_larger_internal_buffer: - C_l1_fifos[row][col] = object_fifo( - f"C_L1_{col}_{row}", - core_tiles[row][col], - core_tiles[row][col], - 1, - C_l1_ty_internal, - ) - C_l1l2_fifos[row][col] = object_fifo( - f"C_L1L2_{col}_{row}", - core_tiles[row][col], - mem_tiles[col], - 1, - C_l1_ty_transfer, - ) - else: - C_l1l2_fifos[row][col] = object_fifo( - f"C_L1L2_{col}_{row}", - core_tiles[row][col], - mem_tiles[col], - 2, - C_l1_ty_transfer, - ) - C_l2l3_fifos[col] = object_fifo( - f"C_L2L3_{col}", - mem_tiles[col], - shim_tiles[col], - fifo_depth, - C_l2_ty, - ( - ( - [ - (m // r, r * n), - (r, t), - (n // t, r * t), - (t, 1), - ] - if not c_col_maj - else [(n // t, t * m), (t, r), (m // r, r * t), (r, 1)] - ) - if not use_scalar - else [] - ), + tile_col, tile_row = core_tiles[row][col] + workers.append( + Worker( + core_fn, + [ + A_l2l1_fifos[row].cons(), + B_l2l1_fifos[col].cons(), + C_l1l2_fifos[row][col].prod(), + 
zero_kernel, + matmul_kernel, + convert_copy_kernel if use_larger_internal_buffer else None, + rtps[row][col], + workerBarriers[row][col], + ], + placement=Tile(tile_col, tile_row), + stack_size=0xD00, + ) ) - if n_aie_rows > 1: - of_offsets = [m * n * i for i in range(n_aie_rows)] - else: - of_offsets = [] - object_fifo_link( - [C_l1l2_fifos[j][col] for j in range(n_aie_rows)], - C_l2l3_fifos[col], - of_offsets, - [], - ) # join along one column - # Set up compute tiles + # We are limited in the number of BDs. After synchronizing, we can reuse BDs. + # We only transfer 6 rows of tiles at once before starting a new transfer block. + # tb = transfer block; block of transfers before sync call + tb_max_n_rows = 4 + tb_n_rows = tb_max_n_rows // 2 + + # Calculate RTP values for the reduction loop and total C tiles + K_div_k = K // k + n_c_col_tiles_per_core = N // mem_tile_n + n_c_row_tiles_per_core = M // mem_tile_m_C + + # Define tensor access patterns (tiling) for A, B, and C + A_tiles = TensorTiler2D.group_tiler( + (M, K), # Size of A matrix + (mem_tile_m_A, k), # Size of A (smallest) tile + (1, K_div_k), # Size of "group" of tiles + # Repeat data so can distribute across whole column + pattern_repeat=n_c_col_tiles_per_core, + prune_step=False, + ) + if b_col_maj: + B_tiles = TensorTiler2D.step_tiler( + (N, K), # Size of B matrix + (n, k), # Size of B tile + # Number of tiles per transfer in each dimension (whole col, partial row) + tile_group_repeats=(n_c_col_tiles_per_core, K_div_k), + # Contiguous tile group in col, but send every n_aie_cols-th tile in the row + tile_group_steps=(n_aie_cols, 1), + prune_step=False, + ) + else: + B_tiles = TensorTiler2D.step_tiler( + (K, N), # Size of B matrix + (k, n), # Size of B tile + # Number of tiles per transfer in each dimension (whole col, partial row) + tile_group_repeats=(K_div_k, n_c_col_tiles_per_core), + # Contiguous tile group in col, but send every n_aie_cols-th tile in the row + tile_group_steps=(1, n_aie_cols), + tile_group_col_major=True, # Send all tiles in column before moving on to next column + prune_step=False, + ) + C_tiles = TensorTiler2D.step_tiler( + (M, N), # Size of C matrix + (mem_tile_m_C, n), # Size of C tile + # Number of tiles per transfer in each dimension (partial col, partial row) + tile_group_repeats=(tb_n_rows, n_c_col_tiles_per_core), + # Collect every n_aie_cols row at a time (mirroring how we sent in B data) + tile_group_steps=(1, n_aie_cols), + prune_step=False, + ) + c_index = 0 + + # Runtime operations to move data to/from the AIE-array + rt = Runtime() + with rt.sequence(A_ty, B_ty, C_ty) as (A, B, C): + rt.start(*workers) + + # Set runtime parameters + def set_rtps(*args): + for row, rtps_row in enumerate(args): + for col, rtp_row_col in enumerate(rtps_row): + rtp_row_col[0] = K_div_k + rtp_row_col[1] = n_c_row_tiles_per_core * n_c_col_tiles_per_core + + rt.inline_ops(set_rtps, rtps) + + # Set the barriers to 1 to allow the worker to read the + # runtime parameters and start the computation for row in range(n_aie_rows): for col in range(n_aie_cols): - - # The stack size choice is a workaround explained here: - # https://github.com/Xilinx/mlir-aie/pull/2391#issuecomment-2967432485 - # In summary, the Peano compiler uses a stack size greater than the default one used by this kernel - # (default is 0x400, chess' stack size is smaller). This is only necessary for bf16 through bfp16 emulation on npu2. - # Exceding the stack size leads to wrong results from the kernel, but no error is triggered. 
- # Stack usage can be checked as explained here: - # https://github.com/Xilinx/llvm-aie/issues/487#issuecomment-2969438585 - @core( - core_tiles[row][col], - f"gemm_{m}x{k}x{n}_archive.a", - stack_size=0xD00, + rt.set_barrier(workerBarriers[row][col], 1) + + # Task groups will be used to determine when to sync/await/free DMA runtime ops + tg = rt.task_group() + for tb in range(ceildiv(n_c_row_tiles_per_core, tb_max_n_rows)): + for pingpong in [0, 1]: + if c_index >= len(C_tiles): + # May not have pong iteration in some cases + break + + row_base = tb * tb_max_n_rows + pingpong * tb_max_n_rows // 2 + current_tb_n_rows = min( + [tb_max_n_rows // 2, n_c_row_tiles_per_core - row_base] ) - def core_body(): - use_lock(rtp_locks[row][col], action=LockAction.Acquire, value=1) - rtp_K_div_k_i32 = rtp_bufs[row][col][0] - rtp_K_div_k = index_dialect.castu(T.index(), rtp_K_div_k_i32) - rtp_n_tiles_per_core_i32 = rtp_bufs[row][col][1] - rtp_n_tiles_per_core = index_dialect.castu( - T.index(), rtp_n_tiles_per_core_i32 - ) - for _ in range_(0xFFFFFFFF): - for _ in range_(rtp_n_tiles_per_core): - if use_larger_internal_buffer: - elem_out_internal = C_l1_fifos[row][col].acquire( - ObjectFifoPort.Produce, 1 - ) - else: - elem_out_internal = C_l1l2_fifos[row][col].acquire( - ObjectFifoPort.Produce, 1 - ) - zero(elem_out_internal) - - for _ in range_(rtp_K_div_k): - elem_in_a = A_l2l1_fifos[row].acquire( - ObjectFifoPort.Consume, 1 - ) - elem_in_b = B_l2l1_fifos[col].acquire( - ObjectFifoPort.Consume, 1 - ) - matmul(elem_in_a, elem_in_b, elem_out_internal) - A_l2l1_fifos[row].release(ObjectFifoPort.Consume, 1) - B_l2l1_fifos[col].release(ObjectFifoPort.Consume, 1) - if use_larger_internal_buffer: - C_l1_fifos[row][col].release(ObjectFifoPort.Produce, 1) - elem_out_internal = C_l1_fifos[row][col].acquire( - ObjectFifoPort.Consume, 1 - ) - elem_out_transfer = C_l1l2_fifos[row][col].acquire( - ObjectFifoPort.Produce, 1 - ) - convert_copy( - elem_out_internal, elem_out_transfer, m * n - ) - C_l1_fifos[row][col].release(ObjectFifoPort.Consume, 1) - C_l1l2_fifos[row][col].release( - ObjectFifoPort.Produce, 1 - ) - else: - C_l1l2_fifos[row][col].release( - ObjectFifoPort.Produce, 1 - ) - use_lock(rtp_locks[row][col], action=LockAction.Release, value=0) - - # To/from AIE-array data movement - @runtime_sequence( - np.ndarray[(M * K,), np.dtype[dtype_in]], - np.ndarray[(K * N,), np.dtype[dtype_in]], - np.ndarray[(M * N,), np.dtype[dtype_out_transfer]], - ) - def sequence(A, B, C): - # Set runtime parameters - for col in range(n_aie_cols): - for row in range(n_aie_rows): - sym_ref = FlatSymbolRefAttr.get(rtp_bufs[row][col].get_name()) - set_lock_value(rtp_locks[row][col], value=0) - npu_rtp_write(sym_ref, 0, K // k) - npu_rtp_write(sym_ref, 1, n_tiles_per_core) - set_lock_value(rtp_locks[row][col], value=1) - - # We are limited in the number of BDs. After synchronizing, we can reuse BDs. - # We only transfer 4 rows of tiles at once before starting a new transfer block. 
- # tb = transfer block; block of transfers before sync call - tb_max_n_rows = 4 if not c_col_maj else 2 - for tb in range(ceildiv(M // m // n_aie_rows, tb_max_n_rows)): - for pingpong in [0, 1]: - M // m // n_aie_rows // tb_max_n_rows - row_base = tb * tb_max_n_rows + pingpong * tb_max_n_rows // 2 - bd_id_base = 8 * pingpong - tb_n_rows = min( - [tb_max_n_rows // 2, M // m // n_aie_rows - row_base] + + for col in range(n_aie_cols): + + # This line does not change MLIR output at all - it's just for recording data movement + C_taps.append(C_tiles[c_index]) + + # C Output Transfer: + # The smallest transfer unit is a (m*n_aie_rows)-x-(n)-sized sub-tile of the matrix. + # Transfer one such tile for every (n_aie_cols)-th column, evenly spaced, + # then repeat that (tb_n_rows) times for the next contiguous blocks of rows. + # Each shim will start at a different column offset, transferring interleaved + # columns. For example, shim 0 may transfer the blocks marked 0 below, and shim 1 + # may transfer the blocks marked 1. + # + # N + # ---------------- + # |0011 0011 | + # |0011 0011 | + # |0011 0011 | + # M |0011 0011 | + # | | + # | | + # | | + # | | + # ---------------- + rt.drain( + C_l2l3_fifos[col].cons(), + C, + tap=C_tiles[c_index], + wait=True, + task_group=tg, + placement=Tile(col, 0), ) - if tb_n_rows <= 0: - # for small input sizes, we may not even need a "pong" iteration - break - for col in range(n_aie_cols): - - # C Output Transfer: - # The smallest transfer unit is a (m*n_aie_rows)-x-(n)-sized sub-tile of the matrix. - # Transfer one such tile for every (n_aie_cols)-th column, evenly spaced, - # then repeat that (tb_n_rows) times for the next contiguous blocks of rows. - # Each shim will start at a different column offset, transferring interleaved - # columns. For example, shim 0 may transfer the blocks marked 0 below, and shim 1 - # may transfer the blocks marked 1. + c_index += 1 + + for tile_row in range(current_tb_n_rows): + + # A input transfer: # - # N + # The smallest transfer unit is a (m*n_A_tiles_per_shim)-sized sub-tile of the input matrix. + # Transfer one such tile for every column, contiguously. + # Repeat this transfer with identical tiles a total of (N//n//n_aie_cols) times. + # Each shim transfers the tiles for separate rows. For example, shim 0 may transfer the + # tiles marked 0 below, and shim 1 may transfer the tiles marked 1. 
+ # K # ---------------- - # |0011 0011 | - # |0011 0011 | - # |0011 0011 | - # M |0011 0011 | + # |0000000000000000| (repeated N//n//n_aie_cols times) + # |0000000000000000| + # |1111111111111111| + # M |1111111111111111| # | | # | | # | | # | | # ---------------- - if not c_col_maj: - C_row_offset = row_base * m * n_aie_rows * N - C_col_offset = col * n - C_offset = C_col_offset + C_row_offset - C_sizes = [ - tb_n_rows, - N // n // n_aie_cols, - m * n_aie_rows, - n, - ] - C_strides = [m * n_aie_rows * N, n * n_aie_cols, N, 1] - else: - C_row_offset = row_base * m * n_aie_rows - C_col_offset = col * n * M - C_offset = C_col_offset + C_row_offset - C_sizes = [N // n // n_aie_cols, n_aie_rows, n, m] - C_strides = [M * n * n_aie_cols, m, M, 1] - npu_dma_memcpy_nd( - metadata=C_l2l3_fifos[col], - bd_id=bd_id_base, - mem=C, - offsets=[0, 0, 0, C_offset], - sizes=C_sizes, - strides=C_strides, - ) + tile_offset = ( + (row_base + tile_row) * n_shim_mem_A + col + ) % len(A_tiles) + + # always equal to n_aie_rows since we have n_aie_rows row tiles for matrix A + if col < n_aie_rows: + rt.fill( + A_l3l2_fifos[col].prod(), + A, + tap=A_tiles[tile_offset], + task_group=tg, + placement=Tile( + 2 * col if n_aie_cols == 8 else col, 0 + ), # alternate columns in full 4x8 NPU2 case + ) # Use the calculated sizes/strides/offsets to record the data movement # caused by the above call to npu_dma_memcpy_nd. # This line does not change MLIR output at all. - if generate_taps: - C_taps.append( - TensorAccessPattern( - (M, N), - offset=C_offset, - sizes=C_sizes, - strides=C_strides, - ) - ) - for tile_row in range(tb_n_rows): - - # A input transfer: - # - # The smallest transfer unit is a (m*n_A_tiles_per_shim)-sized sub-tile of the input matrix. - # Transfer one such tile for every column, contiguously. - # Repeat this transfer with identical tiles a total of (N//n//n_aie_cols) times. - # Each shim transfers the tiles for separate rows. For example, shim 0 may transfer the - # tiles marked 0 below, and shim 1 may transfer the tiles marked 1. - # K - # ---------------- - # |0000000000000000| (repeated N//n//n_aie_cols times) - # |0000000000000000| - # |1111111111111111| - # M |1111111111111111| - # | | - # | | - # | | - # | | - # ---------------- - A_block_offset = ( - (row_base + tile_row) * n_aie_rows * m * K - ) # base address for this transfer block for all BDs - A_row_offset = ( - col * n_A_tiles_per_shim * m * K - ) # base address for the shim in this column - A_offset = A_block_offset + A_row_offset - A_sizes = [ - N // n // n_aie_cols, - K // k, - m * n_A_tiles_per_shim, - k, - ] - A_strides = [0, k, K, 1] - - # always equal to n_aie_rows since we have n_aie_rows row tiles for matrix A - if col < n_aie_rows: - npu_dma_memcpy_nd( - metadata=A_l3l2_fifos[col], - bd_id=bd_id_base + 2 * tile_row + 1, - mem=A, - offsets=[0, 0, 0, A_offset], - sizes=A_sizes, - strides=A_strides, - ) - # # Use the calculated sizes/strides/offsets to record the data movement - # # caused by the above call to npu_dma_memcpy_nd. - # # This line does not change MLIR output at all. - if generate_taps: - A_taps.append( - TensorAccessPattern( - (M, K), - offset=A_offset, - sizes=A_sizes, - strides=A_strides, - ) - ) - - # B input transfer: - # Transfer the first a (n)-wide block of columns of B, - # Then transfer the (n_aie_columns)-th such block, and so on. - # Each shim will start at a different column offset. - # For example, shim 0 may transfer the tiles marked 0 below, - # and shim 1 may transfer the tiles marked 1. 
- # - # N - # ---------------- - # |0011 0011 | - # |0011 0011 | - # |0011 0011 | - # K |0011 0011 | - # |0011 0011 | - # |0011 0011 | - # |0011 0011 | - # |0011 0011 | - # ---------------- - B_col_offset = col * n if not b_col_maj else col * n * K - if not b_col_maj: - B_sizes = [N // n // n_aie_cols, K // k, k, n] - B_strides = [n * n_aie_cols, k * N, N, 1] - else: - B_sizes = [N // n // n_aie_cols, K // k, n, k] - B_strides = [n * n_aie_cols * K, k, K, 1] - - npu_dma_memcpy_nd( - metadata=B_l3l2_fifos[col], - bd_id=bd_id_base + 2 * tile_row + 2, - mem=B, - offsets=[0, 0, 0, B_col_offset], - sizes=B_sizes, - strides=B_strides, - ) - # # Use the calculated sizes/strides/offsets to record the data movement - # # caused by the above call to npu_dma_memcpy_nd. - # # This line does not change MLIR output at all. - if generate_taps: - B_taps.append( - TensorAccessPattern( - (K, N), - offset=B_col_offset, - sizes=B_sizes, - strides=B_strides, - ) - ) - if tb > 0 or (tb == 0 and pingpong > 0): - dma_wait(*C_l2l3_fifos) - dma_wait(*C_l2l3_fifos) + # B input transfer: + # Transfer the first a (n)-wide block of columns of B, + # Then transfer the (n_aie_columns)-th such block, and so on. + # Each shim will start at a different column offset. + # For example, shim 0 may transfer the tiles marked 0 below, + # and shim 1 may transfer the tiles marked 1. + # + # N + # ---------------- + # |0011 0011 | + # |0011 0011 | + # |0011 0011 | + # K |0011 0011 | + # |0011 0011 | + # |0011 0011 | + # |0011 0011 | + # |0011 0011 | + # ---------------- + rt.fill( + B_l3l2_fifos[col].prod(), + B, + tap=B_tiles[col], + task_group=tg, + placement=Tile(col, 0), + ) + + # These lines do not change MLIR output at all - they are just for recording data movement + A_taps.append(A_tiles[tile_offset]) + B_taps.append(B_tiles[col]) + if tb > 0 or (tb == 0 and pingpong > 0): + rt.finish_task_group(tg) + tg = rt.task_group() + rt.finish_task_group(tg) if generate_taps: - # If generate_taps is true, return a representation of tensor tiles + # If generate taps is true, return a representation of tensor access patterns # representing all the npu_dma_memcpy_nd runtime sequence operations per input/ouput tensor. return ( TensorAccessSequence.from_taps(A_taps), @@ -724,6 +670,13 @@ def sequence(A, B, C): TensorAccessSequence.from_taps(C_taps), ) + # Create the program from the device type and runtime + my_program = Program(dev_ty, rt) + + # Place components (assign them resources on the device) and generate an MLIR module + module = my_program.resolve_program(SequentialPlacer()) + return module + if __name__ == "__main__": main()
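
Reviewer note (not part of the patch): the PRIO_ACCURACY switch selects between two accumulation strategies on each core. With the flag set, the (m, n) output tile is accumulated in f32 across the K // k reduction steps and converted to bf16 only once before the transfer to L2 (the `convert_copy_f32_to_bf16` kernel together with the `bf16_f32_ONLY` define); without it, the running sum stays in bf16 (`bf16_bf16_ONLY`) and is rounded on every partial sum. The sketch below is a rough host-side NumPy/ml_dtypes model of that difference only — the tile sizes are arbitrary and it does not reproduce the exact rounding behavior of the AIE kernels.

```python
# Host-side sketch of the --prio-accuracy trade-off (illustrative only).
import numpy as np
from ml_dtypes import bfloat16

M, K, N, k = 64, 256, 64, 32  # arbitrary sizes, not the design's tile sizes
rng = np.random.default_rng(0)
A = rng.standard_normal((M, K)).astype(bfloat16)
B = rng.standard_normal((K, N)).astype(bfloat16)

# --prio-accuracy path: keep an f32 accumulator across the K // k reduction
# steps, convert to bf16 once at the end (the convert_copy_f32_to_bf16 step).
acc_f32 = np.zeros((M, N), dtype=np.float32)
for kk in range(0, K, k):
    acc_f32 += A[:, kk:kk + k].astype(np.float32) @ B[kk:kk + k, :].astype(np.float32)
C_prio = acc_f32.astype(bfloat16)

# Default path: the running sum itself is rounded back to bf16 after every step.
acc_bf16 = np.zeros((M, N), dtype=bfloat16)
for kk in range(0, K, k):
    partial = A[:, kk:kk + k].astype(np.float32) @ B[kk:kk + k, :].astype(np.float32)
    acc_bf16 = (acc_bf16.astype(np.float32) + partial).astype(bfloat16)

# Compare both against a full-precision f32 reference.
C_ref = A.astype(np.float32) @ B.astype(np.float32)
for name, C in [("f32 accumulation (--prio-accuracy)", C_prio),
                ("bf16 accumulation", acc_bf16)]:
    err = float(np.abs(C.astype(np.float32) - C_ref).max())
    print(f"{name}: max abs error vs f32 reference = {err:.4f}")
```

On the reporting side, the new Throughput metric in gemm.cpp uses the same bookkeeping: n_ops = 2 * M * K * N (about 1.7e10 for M = K = N = 2048, hence the 64-bit cast), and with npu_time measured in microseconds, GFLOP/s = n_ops / npu_time / 1e3, which is what the added CMake METRICS regex scrapes from the test output.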