
Commit b43ea70

Merge d37eaf1 into e38cc7f (2 parents: e38cc7f + d37eaf1)
2 files changed (+185, -61 lines)

compiler/src/iree/compiler/GlobalOptimization/RaiseSpecialOps.cpp (+63, -61)
@@ -7,8 +7,6 @@
 #include "iree-dialects/Transforms/TransformMatchers.h"
 #include "iree/compiler/Dialect/Flow/Transforms/RegionOpUtils.h"
 #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h"
-#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtInterfaces.h"
-#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
 #include "iree/compiler/GlobalOptimization/PassDetail.h"
 #include "iree/compiler/GlobalOptimization/Passes.h"
 #include "llvm/ADT/STLExtras.h"
@@ -701,67 +699,71 @@ class NamedImplicitCastOpConversion : public OpInterfaceRewritePattern<OpTy> {
     // Returns true if the operand was updated to inform the pattern rewriter
     // of a change.
     Type outElementType = getElementTypeOrSelf(namedOp->getResultTypes()[0]);
-    bool didChangeOperand = false;
-    {
-      OpBuilder::InsertionGuard guard(rewriter);
-      Block *block = &namedOp->getRegion(0).front();
-      rewriter.setInsertionPointToStart(block);
-      auto replaceOperandWithTypeCast = [&](OpOperand &operand) {
-        // If the op already has implicit casting semantics for this operand,
-        // do not fuse.
-        if (getElementTypeOrSelf(operand.get().getType()) != outElementType) {
-          return false;
-        }
-        auto producer = operand.get().getDefiningOp<linalg::GenericOp>();
-        if (!producer) {
-          return false;
-        }
-        if (!linalg::isElementwise(producer) ||
-            producer.getNumDpsInputs() != 1 || producer.getNumDpsInits() != 1) {
-          return false;
-        }
+    OpBuilder::InsertionGuard guard(rewriter);
+    Block *block = &namedOp->getRegion(0).front();
+    rewriter.setInsertionPointToStart(block);
+    bool didChangeOperand =
+        replaceOperandWithTypeCast(namedOp->getOpOperand(0), outElementType,
+                                   namedOp, block, rewriter) ||
+        replaceOperandWithTypeCast(namedOp->getOpOperand(1), outElementType,
+                                   namedOp, block, rewriter);
+    return success(didChangeOperand);
+  }
 
-        if (!llvm::hasSingleElement(
-                producer.getBlock()->without_terminator())) {
-          return false;
-        }
-        // We only handle arith.extf here for two reasons:
-        // 1) This pattern is being applied to convolution/contraction
-        //    interfaces. Extension semantics for integers depend on the named
-        //    op and requires a slightly different pattern.
-        // 2) Truncating operations like `arith.truncf` should not be fused
-        //    with consumers; it would be preferred to fuse those with
-        //    producers (and the consumer fusion is arguably the less canonical
-        //    form).
-        if (!llvm::isa<arith::ExtFOp>(
-                *producer.getBlock()->without_terminator().begin())) {
-          return false;
-        }
-        Type producerElementType = getElementTypeOrSelf(
-            producer.getDpsInputOperand(0)->get().getType());
-        int64_t operandNumber = operand.getOperandNumber();
-        // Set the operand to the linalg op to the smaller one.
-        namedOp->setOperand(operandNumber, producer->getOperand(0));
-
-        // Insert a new block argument into the body of the named op with the
-        // correct type.
-        Value blockArg = block->insertArgument(
-            operandNumber, producerElementType, namedOp.getLoc());
-        // Create the extf.
-        auto ext = rewriter.create<arith::ExtFOp>(namedOp.getLoc(),
-                                                  outElementType, blockArg);
-        // Replace uses of the old argument with the extended value.
-        rewriter.replaceAllUsesWith(block->getArgument(operandNumber + 1),
-                                    ext.getResult());
-        // Erase the old argument.
-        block->eraseArgument(operandNumber + 1);
-        return true;
-      };
-
-      didChangeOperand = replaceOperandWithTypeCast(namedOp->getOpOperand(0));
-      didChangeOperand |= replaceOperandWithTypeCast(namedOp->getOpOperand(1));
+private:
+  static bool replaceOperandWithTypeCast(OpOperand &operand,
+                                         Type outElementType, OpTy namedOp,
+                                         Block *block, RewriterBase &rewriter) {
+    // If the op already has implicit casting semantics for this operand,
+    // do not fuse.
+    if (getElementTypeOrSelf(operand.get().getType()) != outElementType) {
+      return false;
     }
-    return success(didChangeOperand);
+    auto producer = operand.get().getDefiningOp<linalg::GenericOp>();
+    if (!producer) {
+      return false;
+    }
+    if (!linalg::isElementwise(producer) || producer.getNumDpsInputs() != 1 ||
+        producer.getNumDpsInits() != 1) {
+      return false;
+    }
+
+    if (!llvm::hasSingleElement(producer.getBlock()->without_terminator())) {
+      return false;
+    }
+
+    // Note: only extf and extsi are supported
+    //
+    // convolution/contraction ops internally use extsi to cast to the correct
+    // bitwidth.
+    //
+    // Truncating operations like `arith.truncf` should not be fused with
+    // consumers; it would be preferred to fuse those with producers (and the
+    // consumer fusion is arguably the less canonical form).
+    Operation &castOp = *producer.getBlock()->without_terminator().begin();
+    if (!llvm::isa<arith::ExtFOp, arith::ExtSIOp>(castOp)) {
+      return false;
+    }
+    Type producerElementType =
+        getElementTypeOrSelf(producer.getDpsInputOperand(0)->get().getType());
+    int64_t operandNumber = operand.getOperandNumber();
+    // Set the operand to the linalg op to the smaller one.
+    namedOp->setOperand(operandNumber, producer->getOperand(0));
+
+    // Insert a new block argument into the body of the named op with the
+    // correct type.
+    Value blockArg = block->insertArgument(operandNumber, producerElementType,
+                                           namedOp.getLoc());
+    // Create the extf/extsi.
+    IRMapping mapping;
+    mapping.map(castOp.getOperand(0), blockArg);
+    Value ext = rewriter.clone(castOp, mapping)->getResult(0);
+
+    // Replace uses of the old argument with the extended value.
+    rewriter.replaceAllUsesWith(block->getArgument(operandNumber + 1), ext);
+    // Erase the old argument.
+    block->eraseArgument(operandNumber + 1);
+    return true;
   }
 };
 
compiler/src/iree/compiler/GlobalOptimization/test/raise_special_ops.mlir (+122)
@@ -538,3 +538,125 @@ util.func public @conv_nchw_extf_both(%arg0 : tensor<1x5x10x10xf16>,
 // CHECK-SAME: %[[ARG1:.+]]: tensor<5x5x3x3xf16>
 // CHECK: %[[RESULT:.+]] = linalg.conv_2d_nchw_fchw {{.*}} ins(%[[ARG0]], %[[ARG1]]
 // CHECK: util.return %[[RESULT]]
+
+// -----
+
+util.func public @matmul_extsi(%arg0 : tensor<10x20xi32>,
+                               %arg1 : tensor<20x40xi16>) -> tensor<10x40xi32> {
+  %0 = tensor.empty() : tensor<20x40xi32>
+  %1 = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
+      iterator_types = ["parallel", "parallel"]}
+      ins(%arg1 : tensor<20x40xi16>) outs(%0 : tensor<20x40xi32>) {
+    ^bb0(%b0 : i16, %b1 : i32):
+      %e = arith.extsi %b0 : i16 to i32
+      linalg.yield %e : i32
+  } -> tensor<20x40xi32>
+  %2 = tensor.empty() : tensor<10x40xi32>
+  %3 = arith.constant 0 : i32
+  %4 = linalg.fill ins(%3 : i32) outs(%2 : tensor<10x40xi32>) -> tensor<10x40xi32>
+  %5 = linalg.matmul ins(%arg0, %1 : tensor<10x20xi32>, tensor<20x40xi32>)
+                     outs(%4 : tensor<10x40xi32>) -> tensor<10x40xi32>
+  util.return %5 : tensor<10x40xi32>
+}
+// CHECK-LABEL: util.func public @matmul_extsi
+// CHECK-SAME: %[[ARG0:.+]]: tensor<10x20xi32>
+// CHECK-SAME: %[[ARG1:.+]]: tensor<20x40xi16>
+// CHECK: %[[RESULT:.+]] = linalg.matmul ins(%[[ARG0]], %[[ARG1]]
+// CHECK: util.return %[[RESULT]]
+// -----
+
+util.func public @matmul_extsi_a(%arg0 : tensor<10x20xi16>,
+                                 %arg1 : tensor<20x40xi32>) -> tensor<10x40xi32> {
+  %0 = tensor.empty() : tensor<10x20xi32>
+  %1 = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
+      iterator_types = ["parallel", "parallel"]}
+      ins(%arg0 : tensor<10x20xi16>) outs(%0 : tensor<10x20xi32>) {
+    ^bb0(%b0 : i16, %b1 : i32):
+      %e = arith.extsi %b0 : i16 to i32
+      linalg.yield %e : i32
+  } -> tensor<10x20xi32>
+  %2 = tensor.empty() : tensor<10x40xi32>
+  %3 = arith.constant 0 : i32
+  %4 = linalg.fill ins(%3 : i32) outs(%2 : tensor<10x40xi32>) -> tensor<10x40xi32>
+  %5 = linalg.matmul ins(%1, %arg1 : tensor<10x20xi32>, tensor<20x40xi32>)
+                     outs(%4 : tensor<10x40xi32>) -> tensor<10x40xi32>
+  util.return %5 : tensor<10x40xi32>
+}
+// CHECK-LABEL: util.func public @matmul_extsi_a
+// CHECK-SAME: %[[ARG0:.+]]: tensor<10x20xi16>
+// CHECK-SAME: %[[ARG1:.+]]: tensor<20x40xi32>
+// CHECK: %[[RESULT:.+]] = linalg.matmul ins(%[[ARG0]], %[[ARG1]]
+// CHECK: util.return %[[RESULT]]
+
+// -----
+
+util.func public @matmul_extsi_both(%arg0 : tensor<10x20xi16>,
+                                    %arg1 : tensor<20x40xi16>) -> tensor<10x40xi32> {
+  %0 = tensor.empty() : tensor<10x20xi32>
+  %1 = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
+      iterator_types = ["parallel", "parallel"]}
+      ins(%arg0 : tensor<10x20xi16>) outs(%0 : tensor<10x20xi32>) {
+    ^bb0(%b0 : i16, %b1 : i32):
+      %e = arith.extsi %b0 : i16 to i32
+      linalg.yield %e : i32
+  } -> tensor<10x20xi32>
+  %2 = tensor.empty() : tensor<20x40xi32>
+  %3 = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
+      iterator_types = ["parallel", "parallel"]}
+      ins(%arg1 : tensor<20x40xi16>) outs(%2 : tensor<20x40xi32>) {
+    ^bb0(%b2 : i16, %b3 : i32):
+      %e1 = arith.extsi %b2 : i16 to i32
+      linalg.yield %e1 : i32
+  } -> tensor<20x40xi32>
+  %4 = tensor.empty() : tensor<10x40xi32>
+  %5 = arith.constant 0 : i32
+  %6 = linalg.fill ins(%5 : i32) outs(%4 : tensor<10x40xi32>) -> tensor<10x40xi32>
+  %7 = linalg.matmul ins(%1, %3 : tensor<10x20xi32>, tensor<20x40xi32>)
+                     outs(%6 : tensor<10x40xi32>) -> tensor<10x40xi32>
+  util.return %7 : tensor<10x40xi32>
+}
+// CHECK-LABEL: util.func public @matmul_extsi_both
+// CHECK-SAME: %[[ARG0:.+]]: tensor<10x20xi16>
+// CHECK-SAME: %[[ARG1:.+]]: tensor<20x40xi16>
+// CHECK: %[[RESULT:.+]] = linalg.matmul ins(%[[ARG0]], %[[ARG1]]
+// CHECK: util.return %[[RESULT]]
+
+// -----
+
+util.func public @conv_nchw_extsi_both(%arg0 : tensor<1x5x10x10xi16>,
+                                       %arg1 : tensor<5x5x3x3xi16>) -> tensor<1x5x8x8xi32> {
+  %0 = tensor.empty() : tensor<1x5x10x10xi32>
+  %1 = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
+      iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
+      ins(%arg0 : tensor<1x5x10x10xi16>) outs(%0 : tensor<1x5x10x10xi32>) {
+    ^bb0(%b0 : i16, %b1 : i32):
+      %e = arith.extsi %b0 : i16 to i32
+      linalg.yield %e : i32
+  } -> tensor<1x5x10x10xi32>
+  %2 = tensor.empty() : tensor<5x5x3x3xi32>
+  %3 = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
+      iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
+      ins(%arg1 : tensor<5x5x3x3xi16>) outs(%2 : tensor<5x5x3x3xi32>) {
+    ^bb0(%b2 : i16, %b3 : i32):
+      %e1 = arith.extsi %b2 : i16 to i32
+      linalg.yield %e1 : i32
+  } -> tensor<5x5x3x3xi32>
+  %4 = tensor.empty() : tensor<1x5x8x8xi32>
+  %5 = arith.constant 0 : i32
+  %6 = linalg.fill ins(%5 : i32) outs(%4 : tensor<1x5x8x8xi32>) -> tensor<1x5x8x8xi32>
+  %7 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>}
+        ins(%1, %3 : tensor<1x5x10x10xi32>, tensor<5x5x3x3xi32>)
+        outs(%6 : tensor<1x5x8x8xi32>) -> tensor<1x5x8x8xi32>
+  util.return %7 : tensor<1x5x8x8xi32>
+}
+// CHECK-LABEL: util.func public @conv_nchw_extsi_both
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x5x10x10xi16>
+// CHECK-SAME: %[[ARG1:.+]]: tensor<5x5x3x3xi16>
+// CHECK: %[[RESULT:.+]] = linalg.conv_2d_nchw_fchw {{.*}} ins(%[[ARG0]], %[[ARG1]]
+// CHECK: util.return %[[RESULT]]
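
For reference, the CHECK lines above only pin down the raised op's operands. What they imply is that each arith.extsi producer is folded away and the named op consumes the narrower i16 operand directly, relying on the implicit signed-extension semantics of linalg contraction/convolution ops (the same semantics the new comment in RaiseSpecialOps.cpp refers to). A hand-written sketch of the expected raised form of @matmul_extsi, not test output from this commit; SSA value names are illustrative:

// Sketch of the raised IR: the extsi generic is gone and linalg.matmul
// sign-extends %arg1 internally to the i32 accumulator type.
util.func public @matmul_extsi(%arg0 : tensor<10x20xi32>,
                               %arg1 : tensor<20x40xi16>) -> tensor<10x40xi32> {
  %zero = arith.constant 0 : i32
  %empty = tensor.empty() : tensor<10x40xi32>
  %fill = linalg.fill ins(%zero : i32) outs(%empty : tensor<10x40xi32>) -> tensor<10x40xi32>
  %result = linalg.matmul ins(%arg0, %arg1 : tensor<10x20xi32>, tensor<20x40xi16>)
                          outs(%fill : tensor<10x40xi32>) -> tensor<10x40xi32>
  util.return %result : tensor<10x40xi32>
}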
