diff --git a/example/gemm/CMakeLists.txt b/example/gemm/CMakeLists.txt index be0954f..f885dfc 100644 --- a/example/gemm/CMakeLists.txt +++ b/example/gemm/CMakeLists.txt @@ -16,7 +16,7 @@ function(add_aie_gemm_xclbin m k n NUM_COLUMNS B_COL_MAJ C_COL_MAJ TRACE_SIZE AR add_aie_design(${XCLBIN_NAME} XCLBIN_ONLY PYTHON gemm.py - PYTHON_FLAGS --dev ${DEVICE} -M ${DUMMY_M} -K ${DUMMY_K} -N ${DUMMY_N} -m ${m} -k ${k} -n ${n} --n-aie-cols ${NUM_COLUMNS} --b-col-maj ${B_COL_MAJ} --c-col-maj ${C_COL_MAJ} --dtype_in ${INPUT_DATA_TYPE_PYTHON} --dtype_out ${OUTPUT_DATA_TYPE_PYTHON} --trace_size ${TRACE_SIZE} ${EMULATE_STR} --prio-accuracy --output-file-path ${CMAKE_BINARY_DIR}/aie/${XCLBIN_NAME}.mlir + PYTHON_FLAGS --dev ${DEVICE} -M ${DUMMY_M} -K ${DUMMY_K} -N ${DUMMY_N} -m ${m} -k ${k} -n ${n} --n-aie-cols ${NUM_COLUMNS} --b-col-maj ${B_COL_MAJ} --c-col-maj ${C_COL_MAJ} --dtype_in ${INPUT_DATA_TYPE_PYTHON} --dtype_out ${OUTPUT_DATA_TYPE_PYTHON} --trace_size ${TRACE_SIZE} ${EMULATE_STR} ${PRIO_ACC_STR} --output-file-path ${CMAKE_BINARY_DIR}/aie/${XCLBIN_NAME}.mlir AIE_CORE_KERNELS ${ARCHIVE_NAME} EXTRA_AIECC_FLAGS --dynamic-objFifos OUTPUT_XCLBIN GEMM_XCLBIN_${XCLBIN_NAME}) @@ -43,7 +43,7 @@ function(add_aie_gemm_design M K N m k n NUM_COLUMNS B_COL_MAJ C_COL_MAJ TRACE_S add_aie_design(${EXAMPLE} INSTS_ONLY PYTHON gemm.py - PYTHON_FLAGS --dev ${DEVICE} -M ${M} -K ${K} -N ${N} -m ${m} -k ${k} -n ${n} --n-aie-cols ${NUM_COLUMNS} --b-col-maj ${B_COL_MAJ} --c-col-maj ${C_COL_MAJ} --dtype_in ${INPUT_DATA_TYPE_PYTHON} --dtype_out ${OUTPUT_DATA_TYPE_PYTHON} --trace_size ${TRACE_SIZE} ${EMULATE_STR} --prio-accuracy --output-file-path ${CMAKE_BINARY_DIR}/aie/${EXAMPLE}.mlir + PYTHON_FLAGS --dev ${DEVICE} -M ${M} -K ${K} -N ${N} -m ${m} -k ${k} -n ${n} --n-aie-cols ${NUM_COLUMNS} --b-col-maj ${B_COL_MAJ} --c-col-maj ${C_COL_MAJ} --dtype_in ${INPUT_DATA_TYPE_PYTHON} --dtype_out ${OUTPUT_DATA_TYPE_PYTHON} --trace_size ${TRACE_SIZE} ${EMULATE_STR} ${PRIO_ACC_STR} --output-file-path ${CMAKE_BINARY_DIR}/aie/${EXAMPLE}.mlir EXTRA_AIECC_FLAGS --dynamic-objFifos OUTPUT_INSTS GEMM_INSTS) @@ -69,7 +69,8 @@ function(add_aie_gemm_design M K N m k n NUM_COLUMNS B_COL_MAJ C_COL_MAJ TRACE_S "PASS!" 
METRICS "Latency" [=[Latency \(us\): (?P\d+)]=] - "Bandwidth" [=[Effective Bandwidth: (?P[\d\.e\+-]+) GB/s]=]) + "Bandwidth" [=[Effective Bandwidth: (?P[\d\.e\+-]+) GB/s]=] + "GFLOP/s" [=[Throughput:\s*(?P[\d\.e\+-]+) GFLOP/s]=]) endfunction() set(M_LIST "2048") @@ -86,15 +87,22 @@ set(ARCHIVE_NAME "gemm_${m}x${k}x${n}_archive.a") set(AIE_BUILD_DIR ${CMAKE_BINARY_DIR}/aie) set(ARCHIVE_PATH "${AIE_BUILD_DIR}/${ARCHIVE_NAME}") +set(PRIO_ACCURACY True) set(EMULATE_BFLOAT16_MMUL_WITH_BFP16 False) set (B_COL_MAJ 0) set (C_COL_MAJ 0) -set(MM_KERNEL_DEFINES "DIM_M=${m}" "DIM_K=${k}" "DIM_N=${n}" "bf16_f32_ONLY" "ROUND_CONV_EVEN") +set(MM_KERNEL_DEFINES "DIM_M=${m}" "DIM_K=${k}" "DIM_N=${n}" "ROUND_CONV_EVEN") if (EMULATE_BFLOAT16_MMUL_WITH_BFP16) set(EMULATE_STR --emulate-bf16-mmul-with-bfp16) list(APPEND MM_KERNEL_DEFINES "AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16") endif() +if (PRIO_ACCURACY) + set(PRIO_ACC_STR --prio-accuracy) + list(APPEND MM_KERNEL_DEFINES "bf16_f32_ONLY") +else() + list(APPEND MM_KERNEL_DEFINES "bf16_bf16_ONLY") +endif() if (B_COL_MAJ) list(APPEND MM_KERNEL_DEFINES "B_COL_MAJ") endif() diff --git a/example/gemm/gemm.cpp b/example/gemm/gemm.cpp index 7fb8e25..0fc09ea 100644 --- a/example/gemm/gemm.cpp +++ b/example/gemm/gemm.cpp @@ -160,6 +160,11 @@ int main(int argc, const char *argv[]) double bandwidth_GBps = (total_bytes / (1024 * 1024 * 1024)) / (npu_time * 1e-6); std::cout << "Effective Bandwidth: " << bandwidth_GBps << " GB/s" << std::endl; + // Cast to unsigned long long: with M/K/N=2048 this is ~17*10^9 ops, which overflows a 32-bit int + unsigned long long n_ops = static_cast<unsigned long long>(M) * K * N * 2; + float throughput = n_ops / npu_time / 1e3; // GFLOP/s + std::cout << "Throughput: " << throughput << " GFLOP/s" << std::endl; + std::bfloat16_t *bufOut1 = bo_out.map<std::bfloat16_t *>(); // Compare with golden reference diff --git a/example/gemm/gemm.py b/example/gemm/gemm.py index b80fc32..7232960 100644 --- a/example/gemm/gemm.py +++ b/example/gemm/gemm.py @@ -1,20 +1,28 @@ # SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 -import argparse -import numpy as np +from ml_dtypes import bfloat16 from pathlib import Path -from aie.extras.context import mlir_mod_ctx - -from aie.dialects.aie import * -from aie.dialects.aiex import * -from aie.dialects.scf import FlatSymbolRefAttr -import aie.dialects.index as index_dialect -from aie.helpers.dialects.ext.scf import _for as range_ -from aie.helpers.taplib import TensorAccessPattern, TensorAccessSequence - -from aie.iron import str_to_dtype +import numpy as np +import argparse +import sys + +from aie.iron import ( + Kernel, + ObjectFifo, + Program, + GlobalBuffer, + Runtime, + Worker, + WorkerRuntimeBarrier, + LocalBuffer, + str_to_dtype, +) +from aie.iron.placers import SequentialPlacer +from aie.iron.device import NPU1Col1, NPU1Col2, NPU1, NPU2, Tile +from aie.helpers.taplib import TensorAccessSequence, TensorTiler2D +from aie.iron.controlflow import range_ microkernel_mac_dim_map = { @@ -74,34 +82,33 @@ def main(): ) args = argparser.parse_args() - with mlir_mod_ctx() as ctx: - maybe_taps = my_matmul( - args.dev, - args.M, - args.K, - args.N, - args.m, - args.k, - args.n, - args.n_aie_cols, - args.dtype_in, - args.dtype_out, - args.b_col_maj, - args.c_col_maj, - args.scalar, - args.emulate_bf16_mmul_with_bfp16, - args.prio_accuracy, - args.trace_size, - args.generate_taps, - ) + maybe_module = my_matmul( + args.dev, + args.M, + args.K, + args.N, + args.m, + args.k, + args.n, + args.n_aie_cols, + args.dtype_in, + args.dtype_out, + args.b_col_maj, + args.c_col_maj, + args.scalar, + args.emulate_bf16_mmul_with_bfp16, + args.prio_accuracy, + args.trace_size, + args.generate_taps, + ) + if args.generate_taps: + return maybe_module + else: output_file_path = Path(args.output_file_path) with open(output_file_path, "w") as f: - f.write(str(ctx.module)) - - if args.generate_taps: - return maybe_taps + f.write(str(maybe_module)) def ceildiv(a, b): @@ -128,30 +135,49 @@ def my_matmul( generate_taps=False, ): n_aie_rows = 4 - n_aie_cores = n_aie_rows * n_aie_cols - use_larger_internal_buffer = dtype_out_str == "bf16" and prio_accuracy dtype_in = str_to_dtype(dtype_in_str) - # Accumulate in place with f32 buffer that will be converted to bf16 for output transfer to L2 - dtype_out_internal = ( - str_to_dtype("f32") - if use_larger_internal_buffer - else str_to_dtype(dtype_in_str) - ) - dtype_out_transfer = str_to_dtype(dtype_out_str) + dtype_out = str_to_dtype(dtype_out_str) + + # When using more AIE columns than n_aie_rows (4) (applicable to NPU2), + # restrict the number of shim/mem tiles to n_aie_rows, + # since we have only n_aie_rows row tiles for matrix A + # When using n_aie_rows (4) or less AIE columns (both NPU and NPU2), + # the number of shim/mem tiles are equal to n_aie_cols. 
+ # We use the distribute pattern in object FIFO (see linking for A below), + # since we have n_aie_rows (4) row tiles for matrix A + n_shim_mem_A = min(n_aie_cols, n_aie_rows) + + # Integer division when n_aie_cols < 4, otherwise set to 1 + n_A_tiles_per_shim = n_aie_rows // n_aie_cols if n_aie_cols < 4 else 1 + + mem_tile_m_A = m * n_A_tiles_per_shim + mem_tile_m_C = m * n_aie_rows + mem_tile_n = n * n_aie_cols + + if prio_accuracy: + assert ( + dtype_out_str == "bf16" + ), f"prio_accuracy flag is a feature only for bfloat16 output data types" + use_larger_internal_buffer = True + # If prio_accuracy flag is enabled, gemm for bfloat16 will accumulate in place with a f32 buffer, + # which will be converted to bf16 after the reduction loop finishes for output transfer to L2 + dtype_out_internal = str_to_dtype("f32") + assert np.issubdtype(dtype_in, np.integer) == np.issubdtype( + dtype_out_internal, np.integer + ), f"Input dtype ({dtype_in}) and output dtype ({dtype_out_internal}) must either both be integral or both be float" + assert ( + np.dtype(dtype_out_internal).itemsize >= np.dtype(dtype_in).itemsize + ), f"Output dtype ({dtype_out_internal}) must be equal or larger to input dtype ({dtype_in})" + else: + use_larger_internal_buffer = False assert np.issubdtype(dtype_in, np.integer) == np.issubdtype( - dtype_out_internal, np.integer - ), f"Input dtype ({dtype_in}) and output dtype ({dtype_out_internal}) must either both be integral or both be float" - assert np.issubdtype(dtype_in, np.integer) == np.issubdtype( - dtype_out_transfer, np.integer - ), f"Input dtype ({dtype_in}) and output dtype ({dtype_out_transfer}) must either both be integral or both be float" + dtype_out, np.integer + ), f"Input dtype ({dtype_in}) and output dtype ({dtype_out}) must either both be integral or both be float" assert ( - np.dtype(dtype_out_internal).itemsize >= np.dtype(dtype_in).itemsize - ), f"Output dtype ({dtype_out_internal}) must be equal or larger to input dtype ({dtype_in})" - assert ( - np.dtype(dtype_out_transfer).itemsize >= np.dtype(dtype_in).itemsize - ), f"Output dtype ({dtype_out_transfer}) must be equal or larger to input dtype ({dtype_in})" + np.dtype(dtype_out).itemsize >= np.dtype(dtype_in).itemsize + ), f"Output dtype ({dtype_out}) must be equal or larger to input dtype ({dtype_in})" # r, s, t are the dimensions required by the microkernel MAC instructions. mac_dims = microkernel_mac_dim_map[dev][dtype_in_str] @@ -175,8 +201,8 @@ def my_matmul( # rows, s.t. each of the n_rows compute cores in a column receives a # contiguous (m, k)-sized block of A. assert ( - M % (m * n_aie_rows) == 0 - ), """A must be tileable into (m * n_aie_rows, k)-sized blocks""" + M % mem_tile_m_A == 0 + ), """A must be tileable into (m * n_A_tiles_per_shim, k)-sized blocks""" # Both A and B are tiled in the K dimension into size k. assert K % k == 0 @@ -185,9 +211,18 @@ def my_matmul( # Conceptually, we do the same as with A, but instead of broadcasting # across columns we broadcast across rows and distribute across columns. assert ( - N % (n * n_aie_cols) == 0 + N % mem_tile_n == 0 ), """B must be tileable into (k, n * n_aie_cols)-sized blocks""" + # Output matrix C: + # Conceptually, we divide output C into (m * n_rows, n)-sized blocks. These + # blocks are _distributed_ across AIE core columns, and _joined_ across + # rows, s.t. each of the n_rows compute cores in a column send a + # contiguous (m, n)-sized block of C. 
+ assert ( + M % mem_tile_m_C == 0 + ), """C must be tileable into (m * n_aie_rows, n)-sized blocks""" + # r, s, t are the dimensions required by the microkernel MAC instructions. if not use_scalar: assert m % r == 0 @@ -200,32 +235,15 @@ def my_matmul( # a big performance cost. fifo_depth = 2 - n_tiles_per_core = (M // m) * (N // n) // n_aie_cores - - # When using more AIE columns than n_aie_rows (4) (applicable to NPU2), - # restrict the number of shim/mem tiles to n_aie_rows, - # since we have only n_aie_rows row tiles for matrix A - if n_aie_cols > n_aie_rows: - n_shim_mem_A = n_aie_rows - # When using n_aie_rows (4) or less AIE columns (both NPU and NPU2), - # the number of shim/mem tiles are equal to n_aie_cols. - # We use the distribute pattern in object FIFO (see linking for A below), - # since we have n_aie_rows (4) row tiles for matrix A - else: - n_shim_mem_A = n_aie_cols - - # Integer division when n_aie_cols < 4, otherwise set to 1 - n_A_tiles_per_shim = n_aie_rows // n_aie_cols if n_aie_cols < 4 else 1 - if dev == "npu": if n_aie_cols == 1: - dev_ty = AIEDevice.npu1_1col + dev_ty = NPU1Col1() elif n_aie_cols == 2: - dev_ty = AIEDevice.npu1_2col + dev_ty = NPU1Col2() elif n_aie_cols == 4: - dev_ty = AIEDevice.npu1 + dev_ty = NPU1() else: - dev_ty = AIEDevice.npu2 + dev_ty = NPU2() # These will hold TensorAccessPattern objects that represent the runtime # npu_dma_memcpy_nd operations of this design. They are only used if generate_taps is true @@ -233,490 +251,418 @@ def my_matmul( B_taps = [] C_taps = [] - @device(dev_ty) - def device_body(): - A_l2_ty = np.ndarray[(m * k * n_A_tiles_per_shim,), np.dtype[dtype_in]] - B_l2_ty = np.ndarray[(k * n,), np.dtype[dtype_in]] - C_l2_ty = np.ndarray[(m * n * n_aie_rows,), np.dtype[dtype_out_transfer]] - A_l1_ty = np.ndarray[(m, k), np.dtype[dtype_in]] - B_l1_ty = np.ndarray[(k, n), np.dtype[dtype_in]] + # Define tensor types + A_ty = np.ndarray[(M * K,), np.dtype[dtype_in]] + B_ty = np.ndarray[(K * N,), np.dtype[dtype_in]] + C_ty = np.ndarray[(M * N,), np.dtype[dtype_out]] + A_l2_ty = np.ndarray[(mem_tile_m_A * k,), np.dtype[dtype_in]] + B_l2_ty = np.ndarray[(k * n,), np.dtype[dtype_in]] + C_l2_ty = np.ndarray[(mem_tile_m_C * n,), np.dtype[dtype_out]] + A_l1_ty = np.ndarray[(m, k), np.dtype[dtype_in]] + B_l1_ty = np.ndarray[(k, n), np.dtype[dtype_in]] + C_l1_ty = np.ndarray[(m, n), np.dtype[dtype_out]] + + # AIE Core Function declarations + scalar_suffix = "_scalar" if use_scalar else "" + if use_larger_internal_buffer: + # Fix fifo depth for C objfifo to 1 since 1 buffer will be used for accumulation + # and another for transfer to L2 + fifo_depth_out = 1 + # Set the type for accumulation C_l1_ty_internal = np.ndarray[(m, n), np.dtype[dtype_out_internal]] - C_l1_ty_transfer = np.ndarray[(m, n), np.dtype[dtype_out_transfer]] - - # AIE Core Function declarations - scalar_suffix = "_scalar" if use_scalar else "" - zero = external_func( - f"zero{scalar_suffix}_{'f32' if use_larger_internal_buffer else dtype_out_str}", - inputs=[C_l1_ty_internal], + # A kernel to convert from the internal f32 accumulation to bf16 for transfer to L2 is needed + convert_copy_kernel = Kernel( + f"convert_copy_f32_to_bf16", + f"gemm_{m}x{k}x{n}_archive.a", + [C_l1_ty_internal, C_l1_ty, np.int32], ) - matmul = external_func( - f"matmul{scalar_suffix}_{dtype_in_str}_{'f32' if use_larger_internal_buffer else dtype_out_str}", - inputs=[A_l1_ty, B_l1_ty, C_l1_ty_internal], + # Fix the kernels to use f32 outputs + zero_kernel = Kernel( + f"zero{scalar_suffix}_f32", + 
f"gemm_{m}x{k}x{n}_archive.a", + [C_l1_ty_internal], + ) + matmul_func_name = f"matmul{scalar_suffix}_{dtype_in_str}_f32" + matmul_kernel = Kernel( + matmul_func_name, + f"gemm_{m}x{k}x{n}_archive.a", + [A_l1_ty, B_l1_ty, C_l1_ty_internal], + ) + else: + # No need to use separate buffers for accumulation and transfer to L2, so + # we only need the zero and matmul kernels + fifo_depth_out = fifo_depth + zero_kernel = Kernel( + f"zero{scalar_suffix}_{dtype_out_str}", + f"gemm_{m}x{k}x{n}_archive.a", + [C_l1_ty], + ) + matmul_func_name = f"matmul{scalar_suffix}_{dtype_in_str}_{dtype_out_str}" + matmul_kernel = Kernel( + matmul_func_name, + f"gemm_{m}x{k}x{n}_archive.a", + [A_l1_ty, B_l1_ty, C_l1_ty], ) - if use_larger_internal_buffer: - convert_copy = external_func( - f"convert_copy_f32_to_bf16", - inputs=[C_l1_ty_internal, C_l1_ty_transfer, np.int32], - ) - - # Tile declarations as tile[row][col] - tiles = [ - [tile(col, row) for col in range(0, n_aie_cols)] for row in range(0, 6) - ] - shim_tiles = tiles[0] - mem_tiles = tiles[1] - core_tiles = tiles[2:] - # AIE-array data movement with object fifos - A_l3l2_fifos = [None] * n_shim_mem_A - A_l2l1_fifos = [None] * n_aie_rows + # Tile declarations as tile[row][col] + tiles = [[(col, row) for col in range(0, n_aie_cols)] for row in range(0, 6)] + core_tiles = tiles[2:] - B_l3l2_fifos = [None] * n_aie_cols - B_l2l1_fifos = [None] * n_aie_cols + # AIE-array data movement with object fifos + A_l3l2_fifos = [None] * n_shim_mem_A + A_l2l1_fifos = [None] * n_aie_rows - C_l1_fifos = [[None] * n_aie_cols for _ in range(n_aie_rows)] - C_l1l2_fifos = [[None] * n_aie_cols for _ in range(n_aie_rows)] - C_l2l3_fifos = [None] * n_aie_cols + B_l3l2_fifos = [None] * n_aie_cols + B_l2l1_fifos = [None] * n_aie_cols - # Run-time parameters - rtp_locks = [[None] * n_aie_cols for _ in range(4)] - rtp_bufs = [[None] * n_aie_cols for _ in range(4)] - for col in range(n_aie_cols): - for row in range(n_aie_rows): - # RTP index 0: K // k - # RTP index 1: num tiles == M * N // (m * n * n_aie_cores) - rtp_bufs[row][col] = buffer( - core_tiles[row][col], - datatype=T.memref(3, T.i32()), - name=f"rtp_{row}_{col}", - ) - rtp_locks[row][col] = lock( - core_tiles[row][col], sym_name=f"rtp_lock_{row}_{col}", init=0 - ) + C_l1l2_fifos = [[None] * n_aie_cols for _ in range(n_aie_rows)] + C_l2l3_fifos = [None] * n_aie_cols - # Input A - # L3 -> L2 data movement - for i in range(n_shim_mem_A): - A_l3l2_fifos[i] = object_fifo( - f"A_L3L2_{i}", - ( - shim_tiles[2 * i] if n_aie_cols == 8 else shim_tiles[i] + # Runtime parameters + rtps = [ + [ + GlobalBuffer( + np.ndarray[(2,), np.dtype[np.int32]], + name=f"rtp{row}_{col}", + initial_value=np.array([0, 0], dtype=np.int32), + use_write_rtp=True, + ) + for col in range(n_aie_cols) + ] + for row in range(n_aie_rows) + ] + + # Create barriers to synchronize individual workers with the runtime sequence + workerBarriers = [ + [WorkerRuntimeBarrier() for col in range(n_aie_cols)] + for row in range(n_aie_rows) + ] + + # Input A + for i in range(n_shim_mem_A): + A_l3l2_fifos[i] = ObjectFifo(A_l2_ty, name=f"A_L3L2_{i}", depth=fifo_depth) + # If n_shim_mem_A == n_rows, n_A_tiles_per_shim is 1 and + # this simply links a_l3l2_fifos[i] to a_l2l1_fifos[i] directly, + # If n_shim_mem_A < n_rows, each column receives multiple rows of + # tiles; distribute it along rows of AIE cores. 
+ start_row = i * n_A_tiles_per_shim + stop_row = start_row + n_A_tiles_per_shim + of_offsets = [m * k * j for j in range(stop_row - start_row)] + dims_to_stream = [ + [ + (m // r, r * k), + (k // s, s), + (r, k), + (s, 1), + ] + ] * (stop_row - start_row) + a_tmp_fifos = ( + A_l3l2_fifos[i] + .cons() + .split( + of_offsets, + obj_types=[A_l1_ty] * (stop_row - start_row), + names=[f"A_L2L1_{row}" for row in range(start_row, stop_row)], + dims_to_stream=dims_to_stream, + placement=Tile( + 2 * i if n_aie_cols == 8 else i, 1 ), # alternate columns in full 4x8 NPU2 case - mem_tiles[2 * i] if n_aie_cols == 8 else mem_tiles[i], - fifo_depth, - A_l2_ty, ) + ) - # L2 -> L1 data movement - for row in range(n_aie_rows): - A_l2l1_fifos[row] = object_fifo( - f"A_L2L1_{row}", - ( - mem_tiles[2 * row] - if n_aie_cols == 8 - else mem_tiles[row // n_A_tiles_per_shim] - ), - core_tiles[row][0:n_aie_cols], # broadcast along one row - fifo_depth, - A_l1_ty, - ( - [ - (m // r, r * k), - (k // s, s), - (r, k), - (s, 1), - ] - if not use_scalar - else [] - ), + for j in range(stop_row - start_row): + A_l2l1_fifos[j + start_row] = a_tmp_fifos[j] + + # Input B + for col in range(n_aie_cols): + B_l3l2_fifos[col] = ObjectFifo(B_l2_ty, name=f"B_L3L2_{col}", depth=fifo_depth) + if b_col_maj: + dims_to_stream = [(n // t, t * k), (k // s, s), (t, k), (s, 1)] + else: + dims_to_stream = [(k // s, s * n), (n // t, t), (s, n), (t, 1)] + B_l2l1_fifos[col] = ( + B_l3l2_fifos[col] + .cons() + .forward( + obj_type=B_l1_ty, + name=f"B_L2L1_{col}", + dims_to_stream=dims_to_stream, + placement=Tile(col, 1), ) + ) - # A_l3_l2 and A_l2_l1 object FIFO linking - for i in range(n_shim_mem_A): - # If n_shim_mem_A == n_rows, n_A_tiles_per_shim is 1 and - # this simply links a_l3l2_fifos[i] to a_l2l1_fifos[i] directly, - # If n_shim_mem_A < n_rows, each column receives multiple rows of - # tiles; distribute it along rows of AIE cores. 
- start_row = i * n_A_tiles_per_shim - stop_row = start_row + n_A_tiles_per_shim - if stop_row - start_row > 1: - of_offsets = [m * k * j for j in range(stop_row - start_row)] - else: - of_offsets = [] - object_fifo_link( - A_l3l2_fifos[i], - [A_l2l1_fifos[j] for j in range(start_row, stop_row)], - [], + # Output C + C_l2l3_fifos[col] = ObjectFifo( + C_l2_ty, + name=f"C_L2L3_{col}", + depth=fifo_depth, + dims_to_stream=[(m // r, r * n), (r, t), (n // t, r * t), (t, 1)], + ) + of_offsets = [m * n * i for i in range(n_aie_rows)] + + # join along one column + c_tmp_fifos = ( + C_l2l3_fifos[col] + .prod() + .join( of_offsets, + obj_types=[C_l1_ty] * n_aie_rows, + names=[f"C_L1L2_{col}_{row}" for row in range(n_aie_rows)], + depths=[fifo_depth_out] * n_aie_rows, + placement=Tile(col, 1), ) + ) + for j in range(n_aie_rows): + C_l1l2_fifos[j][col] = c_tmp_fifos[j] - # Input B - for col in range(n_aie_cols): - # L3 -> L2 data movement - B_l3l2_fifos[col] = object_fifo( - f"B_L3L2_{col}", - shim_tiles[col], - mem_tiles[col], - fifo_depth, - B_l2_ty, - ) - # L2 -> L1 data movement - B_l2l1_fifos[col] = object_fifo( - f"B_L2L1_{col}", - mem_tiles[col], - [ - core_tiles[j][col] for j in range(n_aie_rows) - ], # broadcast along one column - fifo_depth, - B_l1_ty, - ( - ( - [ - (k // s, s * n), - (n // t, t), - (s, n), - (t, 1), - ] - if not b_col_maj - else [ - (n // t, t * k), - (k // s, s), - (t, k), - (s, 1), - ] - ) - if not use_scalar - else [] - ), + # Tasks for each worker to perform + def core_fn(in_a, in_b, out_c, zero, matmul, convert_copy, my_rtp, barrier): + if use_larger_internal_buffer: + elem_out_internal = LocalBuffer( + type=C_l1_ty_internal, ) - # B_l3_l2 and B_l2_l1 object FIFO linking - object_fifo_link(B_l3l2_fifos[col], B_l2l1_fifos[col]) + barrier.wait_for_value(1) + rtp_K_div_k = my_rtp[0] + rtp_n_tiles_per_core = my_rtp[1] + loop = range(1) # Workaround for issue #1547 + if rtp_n_tiles_per_core > 1: + loop = range_(rtp_n_tiles_per_core) + for _ in loop: + if not use_larger_internal_buffer: + elem_out_internal = out_c.acquire(1) + zero(elem_out_internal) + + for _ in range_(rtp_K_div_k): + elem_in_a = in_a.acquire(1) + elem_in_b = in_b.acquire(1) + matmul(elem_in_a, elem_in_b, elem_out_internal) + in_a.release(1) + in_b.release(1) + + if use_larger_internal_buffer: + elem_out_transfer = out_c.acquire(1) + convert_copy(elem_out_internal, elem_out_transfer, m * n) + out_c.release(1) + else: + out_c.release(1) - # Output C + # Set up compute tiles + workers = [] + for row in range(n_aie_rows): for col in range(n_aie_cols): - for row in range(n_aie_rows): - if use_larger_internal_buffer: - C_l1_fifos[row][col] = object_fifo( - f"C_L1_{col}_{row}", - core_tiles[row][col], - core_tiles[row][col], - 1, - C_l1_ty_internal, - ) - C_l1l2_fifos[row][col] = object_fifo( - f"C_L1L2_{col}_{row}", - core_tiles[row][col], - mem_tiles[col], - 1, - C_l1_ty_transfer, - ) - else: - C_l1l2_fifos[row][col] = object_fifo( - f"C_L1L2_{col}_{row}", - core_tiles[row][col], - mem_tiles[col], - 2, - C_l1_ty_transfer, - ) - C_l2l3_fifos[col] = object_fifo( - f"C_L2L3_{col}", - mem_tiles[col], - shim_tiles[col], - fifo_depth, - C_l2_ty, - ( - ( - [ - (m // r, r * n), - (r, t), - (n // t, r * t), - (t, 1), - ] - if not c_col_maj - else [(n // t, t * m), (t, r), (m // r, r * t), (r, 1)] - ) - if not use_scalar - else [] - ), + tile_col, tile_row = core_tiles[row][col] + workers.append( + Worker( + core_fn, + [ + A_l2l1_fifos[row].cons(), + B_l2l1_fifos[col].cons(), + C_l1l2_fifos[row][col].prod(), + 
zero_kernel, + matmul_kernel, + convert_copy_kernel if use_larger_internal_buffer else None, + rtps[row][col], + workerBarriers[row][col], + ], + placement=Tile(tile_col, tile_row), + stack_size=0xD00, + ) ) - if n_aie_rows > 1: - of_offsets = [m * n * i for i in range(n_aie_rows)] - else: - of_offsets = [] - object_fifo_link( - [C_l1l2_fifos[j][col] for j in range(n_aie_rows)], - C_l2l3_fifos[col], - of_offsets, - [], - ) # join along one column - # Set up compute tiles + # We are limited in the number of BDs. After synchronizing, we can reuse BDs. + # We only transfer 6 rows of tiles at once before starting a new transfer block. + # tb = transfer block; block of transfers before sync call + tb_max_n_rows = 4 + tb_n_rows = tb_max_n_rows // 2 + + # Calculate RTP values for the reduction loop and total C tiles + K_div_k = K // k + n_c_col_tiles_per_core = N // mem_tile_n + n_c_row_tiles_per_core = M // mem_tile_m_C + + # Define tensor access patterns (tiling) for A, B, and C + A_tiles = TensorTiler2D.group_tiler( + (M, K), # Size of A matrix + (mem_tile_m_A, k), # Size of A (smallest) tile + (1, K_div_k), # Size of "group" of tiles + # Repeat data so can distribute across whole column + pattern_repeat=n_c_col_tiles_per_core, + prune_step=False, + ) + if b_col_maj: + B_tiles = TensorTiler2D.step_tiler( + (N, K), # Size of B matrix + (n, k), # Size of B tile + # Number of tiles per transfer in each dimension (whole col, partial row) + tile_group_repeats=(n_c_col_tiles_per_core, K_div_k), + # Contiguous tile group in col, but send every n_aie_cols-th tile in the row + tile_group_steps=(n_aie_cols, 1), + prune_step=False, + ) + else: + B_tiles = TensorTiler2D.step_tiler( + (K, N), # Size of B matrix + (k, n), # Size of B tile + # Number of tiles per transfer in each dimension (whole col, partial row) + tile_group_repeats=(K_div_k, n_c_col_tiles_per_core), + # Contiguous tile group in col, but send every n_aie_cols-th tile in the row + tile_group_steps=(1, n_aie_cols), + tile_group_col_major=True, # Send all tiles in column before moving on to next column + prune_step=False, + ) + C_tiles = TensorTiler2D.step_tiler( + (M, N), # Size of C matrix + (mem_tile_m_C, n), # Size of C tile + # Number of tiles per transfer in each dimension (partial col, partial row) + tile_group_repeats=(tb_n_rows, n_c_col_tiles_per_core), + # Collect every n_aie_cols row at a time (mirroring how we sent in B data) + tile_group_steps=(1, n_aie_cols), + prune_step=False, + ) + c_index = 0 + + # Runtime operations to move data to/from the AIE-array + rt = Runtime() + with rt.sequence(A_ty, B_ty, C_ty) as (A, B, C): + rt.start(*workers) + + # Set runtime parameters + def set_rtps(*args): + for row, rtps_row in enumerate(args): + for col, rtp_row_col in enumerate(rtps_row): + rtp_row_col[0] = K_div_k + rtp_row_col[1] = n_c_row_tiles_per_core * n_c_col_tiles_per_core + + rt.inline_ops(set_rtps, rtps) + + # Set the barriers to 1 to allow the worker to read the + # runtime parameters and start the computation for row in range(n_aie_rows): for col in range(n_aie_cols): - - # The stack size choice is a workaround explained here: - # https://github.com/Xilinx/mlir-aie/pull/2391#issuecomment-2967432485 - # In summary, the Peano compiler uses a stack size greater than the default one used by this kernel - # (default is 0x400, chess' stack size is smaller). This is only necessary for bf16 through bfp16 emulation on npu2. - # Exceding the stack size leads to wrong results from the kernel, but no error is triggered. 
- # Stack usage can be checked as explained here: - # https://github.com/Xilinx/llvm-aie/issues/487#issuecomment-2969438585 - @core( - core_tiles[row][col], - f"gemm_{m}x{k}x{n}_archive.a", - stack_size=0xD00, + rt.set_barrier(workerBarriers[row][col], 1) + + # Task groups will be used to determine when to sync/await/free DMA runtime ops + tg = rt.task_group() + for tb in range(ceildiv(n_c_row_tiles_per_core, tb_max_n_rows)): + for pingpong in [0, 1]: + if c_index >= len(C_tiles): + # May not have pong iteration in some cases + break + + row_base = tb * tb_max_n_rows + pingpong * tb_max_n_rows // 2 + current_tb_n_rows = min( + [tb_max_n_rows // 2, n_c_row_tiles_per_core - row_base] ) - def core_body(): - use_lock(rtp_locks[row][col], action=LockAction.Acquire, value=1) - rtp_K_div_k_i32 = rtp_bufs[row][col][0] - rtp_K_div_k = index_dialect.castu(T.index(), rtp_K_div_k_i32) - rtp_n_tiles_per_core_i32 = rtp_bufs[row][col][1] - rtp_n_tiles_per_core = index_dialect.castu( - T.index(), rtp_n_tiles_per_core_i32 - ) - for _ in range_(0xFFFFFFFF): - for _ in range_(rtp_n_tiles_per_core): - if use_larger_internal_buffer: - elem_out_internal = C_l1_fifos[row][col].acquire( - ObjectFifoPort.Produce, 1 - ) - else: - elem_out_internal = C_l1l2_fifos[row][col].acquire( - ObjectFifoPort.Produce, 1 - ) - zero(elem_out_internal) - - for _ in range_(rtp_K_div_k): - elem_in_a = A_l2l1_fifos[row].acquire( - ObjectFifoPort.Consume, 1 - ) - elem_in_b = B_l2l1_fifos[col].acquire( - ObjectFifoPort.Consume, 1 - ) - matmul(elem_in_a, elem_in_b, elem_out_internal) - A_l2l1_fifos[row].release(ObjectFifoPort.Consume, 1) - B_l2l1_fifos[col].release(ObjectFifoPort.Consume, 1) - if use_larger_internal_buffer: - C_l1_fifos[row][col].release(ObjectFifoPort.Produce, 1) - elem_out_internal = C_l1_fifos[row][col].acquire( - ObjectFifoPort.Consume, 1 - ) - elem_out_transfer = C_l1l2_fifos[row][col].acquire( - ObjectFifoPort.Produce, 1 - ) - convert_copy( - elem_out_internal, elem_out_transfer, m * n - ) - C_l1_fifos[row][col].release(ObjectFifoPort.Consume, 1) - C_l1l2_fifos[row][col].release( - ObjectFifoPort.Produce, 1 - ) - else: - C_l1l2_fifos[row][col].release( - ObjectFifoPort.Produce, 1 - ) - use_lock(rtp_locks[row][col], action=LockAction.Release, value=0) - - # To/from AIE-array data movement - @runtime_sequence( - np.ndarray[(M * K,), np.dtype[dtype_in]], - np.ndarray[(K * N,), np.dtype[dtype_in]], - np.ndarray[(M * N,), np.dtype[dtype_out_transfer]], - ) - def sequence(A, B, C): - # Set runtime parameters - for col in range(n_aie_cols): - for row in range(n_aie_rows): - sym_ref = FlatSymbolRefAttr.get(rtp_bufs[row][col].get_name()) - set_lock_value(rtp_locks[row][col], value=0) - npu_rtp_write(sym_ref, 0, K // k) - npu_rtp_write(sym_ref, 1, n_tiles_per_core) - set_lock_value(rtp_locks[row][col], value=1) - - # We are limited in the number of BDs. After synchronizing, we can reuse BDs. - # We only transfer 4 rows of tiles at once before starting a new transfer block. 
- # tb = transfer block; block of transfers before sync call - tb_max_n_rows = 4 if not c_col_maj else 2 - for tb in range(ceildiv(M // m // n_aie_rows, tb_max_n_rows)): - for pingpong in [0, 1]: - M // m // n_aie_rows // tb_max_n_rows - row_base = tb * tb_max_n_rows + pingpong * tb_max_n_rows // 2 - bd_id_base = 8 * pingpong - tb_n_rows = min( - [tb_max_n_rows // 2, M // m // n_aie_rows - row_base] + + for col in range(n_aie_cols): + + # This line does not change MLIR output at all - it's just for recording data movement + C_taps.append(C_tiles[c_index]) + + # C Output Transfer: + # The smallest transfer unit is a (m*n_aie_rows)-x-(n)-sized sub-tile of the matrix. + # Transfer one such tile for every (n_aie_cols)-th column, evenly spaced, + # then repeat that (tb_n_rows) times for the next contiguous blocks of rows. + # Each shim will start at a different column offset, transferring interleaved + # columns. For example, shim 0 may transfer the blocks marked 0 below, and shim 1 + # may transfer the blocks marked 1. + # + # N + # ---------------- + # |0011 0011 | + # |0011 0011 | + # |0011 0011 | + # M |0011 0011 | + # | | + # | | + # | | + # | | + # ---------------- + rt.drain( + C_l2l3_fifos[col].cons(), + C, + tap=C_tiles[c_index], + wait=True, + task_group=tg, + placement=Tile(col, 0), ) - if tb_n_rows <= 0: - # for small input sizes, we may not even need a "pong" iteration - break - for col in range(n_aie_cols): - - # C Output Transfer: - # The smallest transfer unit is a (m*n_aie_rows)-x-(n)-sized sub-tile of the matrix. - # Transfer one such tile for every (n_aie_cols)-th column, evenly spaced, - # then repeat that (tb_n_rows) times for the next contiguous blocks of rows. - # Each shim will start at a different column offset, transferring interleaved - # columns. For example, shim 0 may transfer the blocks marked 0 below, and shim 1 - # may transfer the blocks marked 1. + c_index += 1 + + for tile_row in range(current_tb_n_rows): + + # A input transfer: # - # N + # The smallest transfer unit is a (m*n_A_tiles_per_shim)-sized sub-tile of the input matrix. + # Transfer one such tile for every column, contiguously. + # Repeat this transfer with identical tiles a total of (N//n//n_aie_cols) times. + # Each shim transfers the tiles for separate rows. For example, shim 0 may transfer the + # tiles marked 0 below, and shim 1 may transfer the tiles marked 1. 
+ # K # ---------------- - # |0011 0011 | - # |0011 0011 | - # |0011 0011 | - # M |0011 0011 | + # |0000000000000000| (repeated N//n//n_aie_cols times) + # |0000000000000000| + # |1111111111111111| + # M |1111111111111111| # | | # | | # | | # | | # ---------------- - if not c_col_maj: - C_row_offset = row_base * m * n_aie_rows * N - C_col_offset = col * n - C_offset = C_col_offset + C_row_offset - C_sizes = [ - tb_n_rows, - N // n // n_aie_cols, - m * n_aie_rows, - n, - ] - C_strides = [m * n_aie_rows * N, n * n_aie_cols, N, 1] - else: - C_row_offset = row_base * m * n_aie_rows - C_col_offset = col * n * M - C_offset = C_col_offset + C_row_offset - C_sizes = [N // n // n_aie_cols, n_aie_rows, n, m] - C_strides = [M * n * n_aie_cols, m, M, 1] - npu_dma_memcpy_nd( - metadata=C_l2l3_fifos[col], - bd_id=bd_id_base, - mem=C, - offsets=[0, 0, 0, C_offset], - sizes=C_sizes, - strides=C_strides, - ) + tile_offset = ( + (row_base + tile_row) * n_shim_mem_A + col + ) % len(A_tiles) + + # always equal to n_aie_rows since we have n_aie_rows row tiles for matrix A + if col < n_aie_rows: + rt.fill( + A_l3l2_fifos[col].prod(), + A, + tap=A_tiles[tile_offset], + task_group=tg, + placement=Tile( + 2 * col if n_aie_cols == 8 else col, 0 + ), # alternate columns in full 4x8 NPU2 case + ) # Use the calculated sizes/strides/offsets to record the data movement # caused by the above call to npu_dma_memcpy_nd. # This line does not change MLIR output at all. - if generate_taps: - C_taps.append( - TensorAccessPattern( - (M, N), - offset=C_offset, - sizes=C_sizes, - strides=C_strides, - ) - ) - for tile_row in range(tb_n_rows): - - # A input transfer: - # - # The smallest transfer unit is a (m*n_A_tiles_per_shim)-sized sub-tile of the input matrix. - # Transfer one such tile for every column, contiguously. - # Repeat this transfer with identical tiles a total of (N//n//n_aie_cols) times. - # Each shim transfers the tiles for separate rows. For example, shim 0 may transfer the - # tiles marked 0 below, and shim 1 may transfer the tiles marked 1. - # K - # ---------------- - # |0000000000000000| (repeated N//n//n_aie_cols times) - # |0000000000000000| - # |1111111111111111| - # M |1111111111111111| - # | | - # | | - # | | - # | | - # ---------------- - A_block_offset = ( - (row_base + tile_row) * n_aie_rows * m * K - ) # base address for this transfer block for all BDs - A_row_offset = ( - col * n_A_tiles_per_shim * m * K - ) # base address for the shim in this column - A_offset = A_block_offset + A_row_offset - A_sizes = [ - N // n // n_aie_cols, - K // k, - m * n_A_tiles_per_shim, - k, - ] - A_strides = [0, k, K, 1] - - # always equal to n_aie_rows since we have n_aie_rows row tiles for matrix A - if col < n_aie_rows: - npu_dma_memcpy_nd( - metadata=A_l3l2_fifos[col], - bd_id=bd_id_base + 2 * tile_row + 1, - mem=A, - offsets=[0, 0, 0, A_offset], - sizes=A_sizes, - strides=A_strides, - ) - # # Use the calculated sizes/strides/offsets to record the data movement - # # caused by the above call to npu_dma_memcpy_nd. - # # This line does not change MLIR output at all. - if generate_taps: - A_taps.append( - TensorAccessPattern( - (M, K), - offset=A_offset, - sizes=A_sizes, - strides=A_strides, - ) - ) - - # B input transfer: - # Transfer the first a (n)-wide block of columns of B, - # Then transfer the (n_aie_columns)-th such block, and so on. - # Each shim will start at a different column offset. - # For example, shim 0 may transfer the tiles marked 0 below, - # and shim 1 may transfer the tiles marked 1. 
- # - # N - # ---------------- - # |0011 0011 | - # |0011 0011 | - # |0011 0011 | - # K |0011 0011 | - # |0011 0011 | - # |0011 0011 | - # |0011 0011 | - # |0011 0011 | - # ---------------- - B_col_offset = col * n if not b_col_maj else col * n * K - if not b_col_maj: - B_sizes = [N // n // n_aie_cols, K // k, k, n] - B_strides = [n * n_aie_cols, k * N, N, 1] - else: - B_sizes = [N // n // n_aie_cols, K // k, n, k] - B_strides = [n * n_aie_cols * K, k, K, 1] - - npu_dma_memcpy_nd( - metadata=B_l3l2_fifos[col], - bd_id=bd_id_base + 2 * tile_row + 2, - mem=B, - offsets=[0, 0, 0, B_col_offset], - sizes=B_sizes, - strides=B_strides, - ) - # # Use the calculated sizes/strides/offsets to record the data movement - # # caused by the above call to npu_dma_memcpy_nd. - # # This line does not change MLIR output at all. - if generate_taps: - B_taps.append( - TensorAccessPattern( - (K, N), - offset=B_col_offset, - sizes=B_sizes, - strides=B_strides, - ) - ) - if tb > 0 or (tb == 0 and pingpong > 0): - dma_wait(*C_l2l3_fifos) - dma_wait(*C_l2l3_fifos) + # B input transfer: + # Transfer the first a (n)-wide block of columns of B, + # Then transfer the (n_aie_columns)-th such block, and so on. + # Each shim will start at a different column offset. + # For example, shim 0 may transfer the tiles marked 0 below, + # and shim 1 may transfer the tiles marked 1. + # + # N + # ---------------- + # |0011 0011 | + # |0011 0011 | + # |0011 0011 | + # K |0011 0011 | + # |0011 0011 | + # |0011 0011 | + # |0011 0011 | + # |0011 0011 | + # ---------------- + rt.fill( + B_l3l2_fifos[col].prod(), + B, + tap=B_tiles[col], + task_group=tg, + placement=Tile(col, 0), + ) + + # These lines do not change MLIR output at all - they are just for recording data movement + A_taps.append(A_tiles[tile_offset]) + B_taps.append(B_tiles[col]) + if tb > 0 or (tb == 0 and pingpong > 0): + rt.finish_task_group(tg) + tg = rt.task_group() + rt.finish_task_group(tg) if generate_taps: - # If generate_taps is true, return a representation of tensor tiles + # If generate taps is true, return a representation of tensor access patterns # representing all the npu_dma_memcpy_nd runtime sequence operations per input/ouput tensor. return ( TensorAccessSequence.from_taps(A_taps), @@ -724,6 +670,13 @@ def sequence(A, B, C): TensorAccessSequence.from_taps(C_taps), ) + # Create the program from the device type and runtime + my_program = Program(dev_ty, rt) + + # Place components (assign them resources on the device) and generate an MLIR module + module = my_program.resolve_program(SequentialPlacer()) + return module + if __name__ == "__main__": main()
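
Reviewer note (not part of the patch): the PRIO_ACCURACY switch selects between two accumulation strategies on each core. With the flag set, the (m, n) output tile is accumulated in f32 across the K // k reduction steps and converted to bf16 only once before the transfer to L2 (the `convert_copy_f32_to_bf16` kernel together with the `bf16_f32_ONLY` define); without it, the running sum stays in bf16 (`bf16_bf16_ONLY`) and is rounded on every partial sum. The sketch below is a rough host-side NumPy/ml_dtypes model of that difference only — the tile sizes are arbitrary and it does not reproduce the exact rounding behavior of the AIE kernels.

```python
# Host-side sketch of the --prio-accuracy trade-off (illustrative only).
import numpy as np
from ml_dtypes import bfloat16

M, K, N, k = 64, 256, 64, 32  # arbitrary sizes, not the design's tile sizes
rng = np.random.default_rng(0)
A = rng.standard_normal((M, K)).astype(bfloat16)
B = rng.standard_normal((K, N)).astype(bfloat16)

# --prio-accuracy path: keep an f32 accumulator across the K // k reduction
# steps, convert to bf16 once at the end (the convert_copy_f32_to_bf16 step).
acc_f32 = np.zeros((M, N), dtype=np.float32)
for kk in range(0, K, k):
    acc_f32 += A[:, kk:kk + k].astype(np.float32) @ B[kk:kk + k, :].astype(np.float32)
C_prio = acc_f32.astype(bfloat16)

# Default path: the running sum itself is rounded back to bf16 after every step.
acc_bf16 = np.zeros((M, N), dtype=bfloat16)
for kk in range(0, K, k):
    partial = A[:, kk:kk + k].astype(np.float32) @ B[kk:kk + k, :].astype(np.float32)
    acc_bf16 = (acc_bf16.astype(np.float32) + partial).astype(bfloat16)

# Compare both against a full-precision f32 reference.
C_ref = A.astype(np.float32) @ B.astype(np.float32)
for name, C in [("f32 accumulation (--prio-accuracy)", C_prio),
                ("bf16 accumulation", acc_bf16)]:
    err = float(np.abs(C.astype(np.float32) - C_ref).max())
    print(f"{name}: max abs error vs f32 reference = {err:.4f}")
```

On the reporting side, the new Throughput metric in gemm.cpp uses the same bookkeeping: n_ops = 2 * M * K * N (about 1.7e10 for M = K = N = 2048, hence the 64-bit cast), and with npu_time measured in microseconds, GFLOP/s = n_ops / npu_time / 1e3, which is what the added CMake METRICS regex scrapes from the test output.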