nod-ai · Abhishek-Varma · Nov 5, 2024 · Oct 22, 2024 · Oct 14, 2024 · Oct 14, 2024
diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
@@ -795,6 +795,31 @@ def run(self, config):
             output_type=get_output_type(test_name),
         )
 
+        # 128x128x256 shape test of Matmul + Truncf
+        generate_matmul_test(test_name, template_name, 128, 128, 256, "bf16", "f32")
+        identity_mat = np.eye(128, dtype=np.float32)
+        lhs_ones = np.ones(128 * 256, dtype=np.float32).reshape([128, 256])
+        rhs_ones = np.ones(256 * 128, dtype=np.float32).reshape([256, 128])
+        out_ones = np.ones(128 * 128, dtype=np.float32).reshape([128, 128])
+        lhs = lhs_ones * 2
+        rhs = rhs_ones * 3
+        input_args = generate_inputs(test_name, output_dir, 1, {1: lhs, 2: rhs})
+        aie_vs_baseline(
+            config,
+            test_name,
+            input_args,
+            out_ones * 1536,  # expected output
+            use_ukernel=False,
+            tile_pipeline="pack-peel",
+            lower_to_aie_pipeline="objectFifo",
+            function_name=None,
+            seed=1,
+            rtol=0,
+            atol=0,
+            n_repeats=1,
+            output_type=get_output_type(test_name),
+        )
+
 
 class SmokeSet(TestSet):
     def __init__(self):

@@ -30,6 +30,7 @@
 #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
 #include "mlir/Dialect/Vector/Utils/VectorUtils.h"
+#include "mlir/IR/Matchers.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
@@ -303,6 +304,94 @@ class FlattenContiguousRowMajorTransferWritePattern
 
 }  // namespace copied_from_mlir
 
+/// Returns success iff every value in `indices` is defined by a constant
+/// integer equal to 0; returns failure as soon as one index is non-constant
+/// or non-zero. An empty range trivially succeeds.
+static LogicalResult isAllZeroOffsetAccess(mlir::OperandRange indices) {
+  auto isConstantZero = [](Value index) {
+    IntegerAttr constant;
+    return matchPattern(index, m_Constant(&constant)) &&
+           constant.getInt() == 0;
+  };
+  return success(llvm::all_of(indices, isConstantZero));
+}
+
+/// Returns the mixed (static/dynamic) offsets of `subViewOp` as a vector of
+/// SSA values. Dynamic offsets are forwarded unchanged; each static offset is
+/// materialized as an `arith.constant` index op created at `loc`, immediately
+/// before `subViewOp`. The rewriter's insertion point is restored on return.
+static SmallVector<Value> opFoldResultsToValues(PatternRewriter &rewriter,
+                                                Location loc,
+                                                memref::SubViewOp subViewOp) {
+  OpBuilder::InsertionGuard insertionGuard(rewriter);
+  rewriter.setInsertionPoint(subViewOp);
+
+  SmallVector<OpFoldResult> mixedOffsets = subViewOp.getMixedOffsets();
+  SmallVector<Value> offsetValues;
+  offsetValues.reserve(mixedOffsets.size());
+  for (OpFoldResult offset : mixedOffsets) {
+    // Dynamic offset: already an SSA value.
+    if (!isa<Attribute>(offset)) {
+      offsetValues.push_back(cast<Value>(offset));
+      continue;
+    }
+    // Static offset: materialize it as an index constant.
+    int64_t staticOffset =
+        cast<IntegerAttr>(cast<Attribute>(offset)).getInt();
+    Value constantOffset =
+        rewriter.create<arith::ConstantIndexOp>(loc, staticOffset);
+    offsetValues.push_back(constantOffset);
+  }
+  return offsetValues;
+}
+
+/// Returns success if `xferOp`, a vector transfer op whose memref operand is
+/// the result of `subViewOp`, can be re-targeted at the source of the subview
+/// by using the subview offsets as transfer indices. The patterns below
+/// rebuild the transfer op from (vector, source, indices, in_bounds) only, so
+/// the following must hold for the rewrite to preserve semantics:
+///   - the transfer accesses the subview at all-zero indices;
+///   - the subview has unit strides, otherwise consecutive elements of the
+///     subview are not consecutive elements of the source;
+///   - the transfer has no mask, as the rebuilt op would not carry it;
+///   - the transfer is fully in-bounds, as the bounds of the source differ
+///     from the bounds of the subview;
+///   - the transfer uses a minor identity permutation map;
+///   - if the subview is rank-reducing, every non-unit vector dimension is
+///     still associated with the same source dimension once the transfer is
+///     expressed on the (higher rank) source.
+template <typename XferOp>
+static LogicalResult canFoldSubviewIntoTransfer(XferOp xferOp,
+                                                memref::SubViewOp subViewOp) {
+  if (failed(isAllZeroOffsetAccess(xferOp.getIndices()))) return failure();
+  if (!subViewOp.hasUnitStride()) return failure();
+  if (xferOp.getMask() || xferOp.hasOutOfBoundsDim()) return failure();
+  if (!xferOp.getPermutationMap().isMinorIdentity()) return failure();
+
+  llvm::SmallBitVector droppedDims = subViewOp.getDroppedDims();
+  if (droppedDims.none()) return success();
+
+  // Source dimensions that survive the rank reduction, in order.
+  int64_t sourceRank = subViewOp.getSourceType().getRank();
+  SmallVector<int64_t> keptDims;
+  for (int64_t dim = 0; dim < sourceRank; ++dim) {
+    if (!droppedDims.test(dim)) keptDims.push_back(dim);
+  }
+
+  ArrayRef<int64_t> vectorShape = xferOp.getVectorType().getShape();
+  int64_t vectorRank = vectorShape.size();
+  int64_t subviewRank = keptDims.size();
+  if (vectorRank > subviewRank) return failure();
+  for (int64_t i = 0; i < vectorRank; ++i) {
+    // A unit vector dimension only touches the element at the given index,
+    // so it does not matter which source dimension it is attached to.
+    if (vectorShape[i] == 1) continue;
+    int64_t dimViaSubview = keptDims[subviewRank - vectorRank + i];
+    int64_t dimViaSource = sourceRank - vectorRank + i;
+    if (dimViaSubview != dimViaSource) return failure();
+  }
+  return success();
+}
+
+/// A rewriter function to canonicalize the following :-
+/// INPUT:
+///       %b = memref.subview %a [offset0, offset1, ...]
+///       %c = vector.transfer_read %b[0, 0, ...]
+/// OUTPUT:
+///       %c = vector.transfer_read %a[offset0, offset1, ...]
+///
+/// The rewrite only applies when `canFoldSubviewIntoTransfer` succeeds, i.e.
+/// when reading from the source at the subview offsets is equivalent to
+/// reading from the subview at zero.
+///
+/// This is needed to enable other set of staged canonicalizations in this pass.
+struct CanonicalizeTrivialReadAccessSubviewOpPattern
+    : public OpRewritePattern<vector::TransferReadOp> {
+  using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::TransferReadOp readOp,
+                                PatternRewriter &rewriter) const override {
+    auto subViewOp = dyn_cast_if_present<memref::SubViewOp>(
+        readOp.getSource().getDefiningOp());
+    if (!subViewOp) return failure();
+    // All checks happen before any IR is created, so that a failed match
+    // leaves the IR untouched.
+    if (failed(canFoldSubviewIntoTransfer(readOp, subViewOp))) return failure();
+    SmallVector<Value> newIndices =
+        opFoldResultsToValues(rewriter, readOp.getLoc(), subViewOp);
+    rewriter.replaceOpWithNewOp<vector::TransferReadOp>(
+        readOp, readOp.getType(), subViewOp.getSource(), newIndices,
+        readOp.getPadding(), readOp.getInBoundsValues());
+    return success();
+  }
+};
+
+/// A rewriter function to canonicalize the following :-
+/// INPUT:
+///       %b = memref.subview %a [offset0, offset1, ...]
+///       vector.transfer_write %val, %b[0, 0, ...]
+/// OUTPUT:
+///       vector.transfer_write %val, %a[offset0, offset1, ...]
+///
+/// The rewrite only applies when `canFoldSubviewIntoTransfer` succeeds, i.e.
+/// when writing to the source at the subview offsets is equivalent to
+/// writing to the subview at zero.
+///
+/// This is needed to enable other set of staged canonicalizations in this pass.
+struct CanonicalizeTrivialWriteAccessSubviewOpPattern
+    : public OpRewritePattern<vector::TransferWriteOp> {
+  using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp,
+                                PatternRewriter &rewriter) const override {
+    auto subViewOp = dyn_cast_if_present<memref::SubViewOp>(
+        writeOp.getSource().getDefiningOp());
+    if (!subViewOp) return failure();
+    // All checks happen before any IR is created, so that a failed match
+    // leaves the IR untouched.
+    if (failed(canFoldSubviewIntoTransfer(writeOp, subViewOp)))
+      return failure();
+    SmallVector<Value> newIndices =
+        opFoldResultsToValues(rewriter, writeOp.getLoc(), subViewOp);
+    rewriter.replaceOpWithNewOp<vector::TransferWriteOp>(
+        writeOp, writeOp.getVector(), subViewOp.getSource(), newIndices,
+        writeOp.getInBoundsValues());
+    return success();
+  }
+};
+
 static bool isGemmBTransposedContractionOp(vector::ContractionOp op) {
   if (op.getKind() != vector::CombiningKind::ADD) return false;
 
@@ -892,6 +981,12 @@ struct CanonicalizeVectorForAIEVecPass
     auto op = getOperation();
     MLIRContext *context = &getContext();
 
+    {
+      RewritePatternSet patterns(context);
+      patterns.add<CanonicalizeTrivialReadAccessSubviewOpPattern,
+                   CanonicalizeTrivialWriteAccessSubviewOpPattern>(context);
+      (void)applyPatternsAndFoldGreedily(op, std::move(patterns));
+    }
     {
       // These must run before 'populateVectorBroadcastLoweringPatterns'
       // so that broadcasts can be matched before conversion to insert.

@@ -166,3 +166,64 @@ func.func @arith_truncf(%inp: vector<2x3xf32>) -> vector<2x3xbf16> {
     %0 = arith.truncf %inp : vector<2x3xf32> to vector<2x3xbf16>
     return %0 : vector<2x3xbf16>
 }
+
+// -----
+
+// Verifies that a `vector.transfer_read` at all-zero indices of a
+// `memref.subview` (one dynamic offset, one static non-zero offset) ends up
+// reading from the source of the subview: no subview is left in the output,
+// the source is collapsed to 1-D, and the subview offsets are folded into the
+// linearized read index (%in * 256 + 3 * 32).
+// CHECK:       #map = affine_map<()[s0] -> (s0 * 256 + 96)>
+// CHECK-LABEL: @trivial_read_access
+// CHECK-SAME:  (%[[ARG0:.*]]: memref<4x8x4x8xbf16, strided<[256, 32, 8, 1]>>,
+// CHECK-SAME:   %[[ARG1:.*]]: index)
+// CHECK-NOT:     memref.subview
+// CHECK:         %[[COLLAPSE_SHAPE:.*]] = memref.collapse_shape %[[ARG0]]
+// CHECK-SAME:        into memref<1024xbf16, strided<[1]>>
+// CHECK:         %[[APPLY_INDEX:.*]] = affine.apply #map()[%[[ARG1]]]
+// CHECK:         %[[READ:.*]] = vector.transfer_read %[[COLLAPSE_SHAPE]][%[[APPLY_INDEX]]]
+// CHECK:         %[[SHAPE_CAST:.*]] = vector.shape_cast %[[READ]]
+// CHECK-SAME:        vector<32xbf16> to vector<1x1x4x8xbf16>
+// CHECK:         return %[[SHAPE_CAST]]
+func.func @trivial_read_access(%arg0: memref<4x8x4x8xbf16, strided<[256, 32, 8, 1]>>, %in: index) -> vector<1x1x4x8xbf16> {
+    %c0 = arith.constant 0 : index
+    %cst = arith.constant 0.000000e+00 : bf16
+    %subview = memref.subview %arg0[%in, 3, 0, 0] [1, 1, 4, 8] [1, 1, 1, 1] : memref<4x8x4x8xbf16, strided<[256, 32, 8, 1]>> to memref<1x1x4x8xbf16, strided<[256, 32, 8, 1], offset: ?>>
+    %read = vector.transfer_read %subview[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x4x8xbf16, strided<[256, 32, 8, 1], offset: ?>>, vector<1x1x4x8xbf16>
+    return %read : vector<1x1x4x8xbf16>
+}
+
+// -----
+
+// Verifies the subview-read rewrite for a rank-reducing subview with static
+// offsets only: the 4-D source is read through a 3-D subview (static offset
+// 2 * 64 + 3 * 8 = 152), and the output must read the collapsed 1-D source
+// directly, with no subview left.
+// CHECK-LABEL: @trivial_read_access_rank_reduced
+// CHECK-SAME:  (%[[ARG0:.*]]: memref<4x8x1x8xbf16, strided<[64, 8, 8, 1]>>)
+// CHECK-NOT:     memref.subview
+// CHECK:         %[[COLLAPSE_SHAPE:.*]] = memref.collapse_shape %[[ARG0]]
+// CHECK-SAME:        into memref<256xbf16, strided<[1]>>
+// CHECK:         %[[READ:.*]] = vector.transfer_read %[[COLLAPSE_SHAPE]]
+// CHECK:         %[[SHAPE_CAST:.*]] = vector.shape_cast %[[READ]]
+// CHECK-SAME:        vector<8xbf16> to vector<1x1x8xbf16>
+// CHECK:         return %[[SHAPE_CAST]]
+func.func @trivial_read_access_rank_reduced(%arg0: memref<4x8x1x8xbf16, strided<[64, 8, 8, 1]>>) -> vector<1x1x8xbf16> {
+    %c0 = arith.constant 0 : index
+    %cst = arith.constant 0.000000e+00 : bf16
+    %subview = memref.subview %arg0[2, 3, 0, 0] [1, 1, 1, 8] [1, 1, 1, 1] : memref<4x8x1x8xbf16, strided<[64, 8, 8, 1]>> to memref<1x1x8xbf16, strided<[8, 8, 1], offset: 152>>
+    %read = vector.transfer_read %subview[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8xbf16, strided<[8, 8, 1], offset: 152>>, vector<1x1x8xbf16>
+    return %read : vector<1x1x8xbf16>
+}
+
+// -----
+
+// Verifies that a `vector.transfer_write` at all-zero indices of a
+// `memref.subview` with static offsets (2 * 128 + 3 * 16 = 304) ends up
+// writing to the source of the subview: no subview is left in the output, the
+// source is collapsed to 1-D and the vector is flattened before the write.
+// CHECK-LABEL: @trivial_write_access
+// CHECK-SAME:  (%[[ARG0:.*]]: memref<8x8x4x4xf32, strided<[128, 16, 4, 1]>>,
+// CHECK-SAME:   %[[ARG1:.*]]: vector<1x1x4x4xf32>)
+// CHECK-NOT:       memref.subview
+// CHECK:           %[[COLLAPSE_SHAPE:.*]] = memref.collapse_shape %[[ARG0]]
+// CHECK-SAME:          : memref<8x8x4x4xf32, strided<[128, 16, 4, 1]>> into memref<1024xf32, strided<[1]>>
+// CHECK:           %[[SHAPE_CAST:.*]] = vector.shape_cast %[[ARG1]]
+// CHECK-SAME:          : vector<1x1x4x4xf32> to vector<16xf32>
+// CHECK:           vector.transfer_write %[[SHAPE_CAST]], %[[COLLAPSE_SHAPE]]
+// CHECK:           return
+func.func @trivial_write_access(%arg0: memref<8x8x4x4xf32, strided<[128, 16, 4, 1]>>, %arg1: vector<1x1x4x4xf32>) {
+    %c0 = arith.constant 0 : index
+    %subview = memref.subview %arg0[2, 3, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<8x8x4x4xf32, strided<[128, 16, 4, 1]>> to memref<1x1x4x4xf32, strided<[128, 16, 4, 1], offset: 304>>
+    vector.transfer_write %arg1, %subview[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x4x4xf32>, memref<1x1x4x4xf32, strided<[128, 16, 4, 1], offset: 304>>
+    return
+}
@@ -14,6 +14,7 @@ iree_lit_test_suite(
     "matmul_pack_peel_objectfifo.mlir"
     "matmul_pack_peel_objectfifo_e2e.mlir"
     "matmul_pad_pack_air_e2e.mlir"
+    "matmul_elementwise_pack_peel_objectfifo_e2e.mlir"
     "xdna_oplib_plugin.mlir"
   TOOLS
     ${IREE_LLD_TARGET}

@@ -0,0 +1,34 @@
+// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources --iree-amdaie-target-device=npu1_4col %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-lower-to-aie-pipeline=objectFifo --iree-amdaie-tile-pipeline=pack-peel --split-input-file | FileCheck %s
+
+// End-to-end lowering test for a 128x128x256 matmul with bf16 operands and
+// f32 accumulation, followed by an `arith.truncf` of the result to bf16.
+// The function is compiled for the `npu1_4col` device with the pack-peel
+// tiling pipeline and the objectFifo lowering pipeline; the output must
+// contain an `aie.device` with cores on tiles (0,2), (0,3), (1,2) and (1,3),
+// the shim/memtile/core DMA ops listed below, and an NPU instruction
+// sequence named after the dispatch.
+// CHECK-LABEL: hal.executable.export public @matmul_truncf_bf16_dispatch_0_matmul_128x128x256_bf16
+// CHECK:       aie.device(npu1_4col) {
+// CHECK-DAG:   %[[TILE_0_2:.+]] = aie.tile(0, 2)
+// CHECK-DAG:   %[[TILE_0_3:.+]] = aie.tile(0, 3)
+// CHECK-DAG:   %[[TILE_1_2:.+]] = aie.tile(1, 2)
+// CHECK-DAG:   %[[TILE_1_3:.+]] = aie.tile(1, 3)
+// CHECK-DAG:   %[[TILE_0_0:.+]] = aie.tile(0, 0)
+// CHECK-DAG:   %[[TILE_0_1:.+]] = aie.tile(0, 1)
+// CHECK-DAG:   aie.core(%[[TILE_0_2]])
+// CHECK-DAG:   aie.core(%[[TILE_1_2]])
+// CHECK-DAG:   aie.core(%[[TILE_0_3]])
+// CHECK-DAG:   aie.core(%[[TILE_1_3]])
+// CHECK-DAG:   aie.shim_dma_allocation {{.*}}(MM2S, 0, 0)
+// CHECK-DAG:   aie.shim_dma_allocation {{.*}}(MM2S, 1, 0)
+// CHECK-DAG:   aie.memtile_dma(%[[TILE_0_1]])
+// CHECK-DAG:   aie.mem(%[[TILE_0_2]])
+// CHECK-DAG:   aie.mem(%[[TILE_0_3]])
+// CHECK-DAG:   aie.mem(%[[TILE_1_2]])
+// CHECK-DAG:   aie.mem(%[[TILE_1_3]])
+// CHECK-DAG:   aie.shim_dma_allocation {{.*}}(S2MM, 0, 0)
+// CHECK:       {npu_instructions =
+// CHECK-SAME:   runtime_sequence_name = "matmul_truncf_bf16_dispatch_0_matmul_128x128x256_bf16xbf16xf32"
+func.func @matmul_truncf_bf16(%lhs: tensor<128x256xbf16>, %rhs: tensor<256x128xbf16>) -> tensor<128x128xbf16>
+{
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = tensor.empty() : tensor<128x128xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<128x128xf32>) -> tensor<128x128xf32>
+  %res = linalg.matmul ins(%lhs, %rhs: tensor<128x256xbf16>, tensor<256x128xbf16>)
+                    outs(%1: tensor<128x128xf32>) -> tensor<128x128xf32>
+  %cast = arith.truncf %res : tensor<128x128xf32> to tensor<128x128xbf16>
+  return %cast : tensor<128x128xbf16>
+}