Commit 35984f0
[Codegen] Add SwapExtractWithCollapsePattern (llvm#21419)
This PR adds a pattern that swaps the `collapse_shape` with the `extract_slice` op to enable more loop fusion opportunities. Note this pattern is adapted from the upstream [`BubbleUpCollapseShapeThroughExtractSlice`](https://github.com/llvm/llvm-project/blob/ffb453989b0e95d85b6cfa543b65fec23b65649d/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp#L322) by allowing some special cases we've seen through the IGEMM path. The special case is handled under the `CASE #1` section, with the corresponding tests mainly focusing on the changes. Note that some strict conditions are added to match the special case for convolutions, which may not be general and robust enough to add to upstream. In addition, we break the [upstream single pattern](https://github.com/llvm/llvm-project/blob/ffb453989b0e95d85b6cfa543b65fec23b65649d/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp#L767) into two separate ones so they can be applied separately in different passes.

---------

Signed-off-by: yzhang93 <zhyuhang88@gmail.com>
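To make the rewrite concrete, here is a rough before/after sketch in MLIR, distilled from the new test added in this commit; `%arg0` and `%offset` stand for the source tensor and a tile offset, and the shapes are the ones used in the test rather than anything the pattern requires in general.

Before the swap, the slice is carved out of the collapsed tensor:

%collapsed = tensor.collapse_shape %arg0 [[0], [1, 2, 3]]
    : tensor<32x3x3x288xf32> into tensor<32x2592xf32>
%slice = tensor.extract_slice %collapsed[0, %offset] [32, 32] [1, 1]
    : tensor<32x2592xf32> to tensor<32x32xf32>

After the swap, the offset is delinearized into the expanded dimensions, the slice is taken directly from the source, and only the small slice is collapsed:

%idx:3 = affine.delinearize_index %offset into (3, 3, 288) : index, index, index
%slice = tensor.extract_slice %arg0[0, %idx#0, %idx#1, %idx#2] [32, 1, 1, 32] [1, 1, 1, 1]
    : tensor<32x3x3x288xf32> to tensor<32x1x1x32xf32>
%collapsed = tensor.collapse_shape %slice [[0], [1, 2, 3]]
    : tensor<32x1x1x32xf32> into tensor<32x32xf32>

With the `collapse_shape` pushed below the `extract_slice`, the tiling loop works on slices of the uncollapsed source, which is presumably what opens up the extra fusion opportunities mentioned above.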

6 files changed: +426 −5 lines

compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp

Lines changed: 17 additions & 0 deletions
@@ -289,6 +289,23 @@ void GPUApplyTilingLevelPass::runOnOperation() {
 
   MLIRContext *context = &getContext();
 
+  // Swap `collapse_shape` with `extract_slice` to enable more loop fusion
+  // opportunity. Currently this is only needed for convolution IGEMM path.
+  // TODO(vivian): Move the pattern to `GPUFuseAndHoistParallelLoopsPass`.
+  if (normalizeLoops) {
+    funcOp->walk(
+        [&](scf::ForOp forOp) { (void)normalizeLoopBounds(rewriter, forOp); });
+    funcOp->walk([&](scf::ForallOp forallOp) {
+      (void)normalizeLoopBounds(rewriter, forallOp);
+    });
+
+    RewritePatternSet patterns(context);
+    populateSwapExtractWithCollapsePattern(patterns);
+    if (failed(applyPatternsGreedily(funcOp, std::move(patterns)))) {
+      return signalPassFailure();
+    }
+  }
+
   // Apply cleanup patterns.
   {
     RewritePatternSet patterns(context);
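For context, `normalizeLoopBounds` (modified below in NormalizeLoopBounds.cpp) rewrites a loop into a zero-based, unit-step form and recovers the original index with an `affine.apply`. A minimal sketch of the `scf.for` case, using illustrative bounds that match the first new test (lower bound 0, upper bound 2592, step 32; the `%c...` constants are assumed to be defined as `arith.constant` ops):

// Before normalization: strided induction variable.
scf.for %i = %c0 to %c2592 step %c32 {
  // ... body uses %i as an offset ...
}

// After normalization: ceildiv(2592 - 0, 32) = 81 iterations of step 1,
// with the original offset rebuilt from the new induction variable.
scf.for %ii = %c0 to %c81 step %c1 {
  %i = affine.apply affine_map<(d0) -> (d0 * 32)>(%ii)
  // ... body uses %i as an offset ...
}

Normalizing first plausibly matters for the swap: it puts the `extract_slice` offset into an affine form of a zero-based induction variable that `affine.delinearize_index` can split across the expanded dimensions.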

compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td

Lines changed: 3 additions & 1 deletion
@@ -326,7 +326,9 @@ def GPUApplyTilingLevelPass :
     )}]>,
     Option<"allowZeroSlices", "allow-zero-slices", "bool",
            /*default=*/"true",
-           "Allow pad fusion to generate zero size slices">
+           "Allow pad fusion to generate zero size slices">,
+    Option<"normalizeLoops", "normalize-loops", "bool", "false",
+           "Enable normalization for scf loops">
   ];
 }
 
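For reference, the new option defaults to false, so existing uses of `iree-codegen-gpu-apply-tiling-level` are unchanged; the test file below opts in with `iree-codegen-gpu-apply-tiling-level{normalize-loops}` in its pass pipeline.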

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir

Lines changed: 65 additions & 0 deletions
@@ -3,6 +3,7 @@
 // RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=thread}, canonicalize, cse))" %s | FileCheck %s --check-prefix=THREAD
 // RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=subgroup}, canonicalize, cse))" %s | FileCheck %s --check-prefix=SUBGROUP
 // RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=partial_reduction}, canonicalize, cse))" %s | FileCheck %s --check-prefix=PARTRED
+// RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{normalize-loops}, canonicalize, cse))" %s | FileCheck %s --check-prefix=NORM-REDUCTION
 
 #config = #iree_gpu.lowering_config<{thread = [2, 16], subgroup = [2, 16]}>
 #map = affine_map<(d0, d1) -> (d0, d1)>
@@ -536,3 +537,67 @@ func.func @partial_reduction(%3: tensor<?x?xf32>) -> tensor<?xf32> {
 // PARTRED: scf.yield
 // PARTRED: linalg.reduce ins(%[[OUT]] : tensor<?x8xf32>)
 // PARTRED-SAME: outs(%[[FULL]] : tensor<?xf32>)
+
+// -----
+
+#config = #iree_gpu.lowering_config<{reduction = [0, 32]}>
+func.func @swap_collapse_shape_with_extract_slice(%arg0: tensor<32x3x3x288xf32>) -> tensor<32x2592xf32> {
+  %collapsed = tensor.collapse_shape %arg0 [[0], [1, 2, 3]] : tensor<32x3x3x288xf32> into tensor<32x2592xf32>
+  %empty = tensor.empty() : tensor<32x2592xf32>
+  %0 = linalg.copy {lowering_config = #config} ins(%collapsed : tensor<32x2592xf32>) outs(%empty : tensor<32x2592xf32>) -> tensor<32x2592xf32>
+  return %0: tensor<32x2592xf32>
+}
+
+// NORM-REDUCTION-LABEL: func.func @swap_collapse_shape_with_extract_slice
+// NORM-REDUCTION-DAG: %[[C1:.+]] = arith.constant 1 : index
+// NORM-REDUCTION-DAG: %[[C81:.+]] = arith.constant 81 : index
+// NORM-REDUCTION-DAG: %[[C0:.+]] = arith.constant 0 : index
+// NORM-REDUCTION: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C81]] step %[[C1]]
+// NORM-REDUCTION: %[[APPLY:.+]] = affine.apply affine_map<(d0) -> (d0 * 32)>(%[[ARG1]])
+// NORM-REDUCTION: %[[IDX:.+]]:3 = affine.delinearize_index %[[APPLY]] into (3, 3, 288) : index, index, index
+// NORM-REDUCTION: %[[SLICE:.+]] = tensor.extract_slice %{{.*}}[0, %[[IDX]]#0, %[[IDX]]#1, %[[IDX]]#2] [32, 1, 1, 32] [1, 1, 1, 1] : tensor<32x3x3x288xf32> to tensor<32x1x1x32xf32>
+// NORM-REDUCTION: %[[COLLAPSE:.+]] = tensor.collapse_shape %[[SLICE]] {{\[}}[0], [1, 2, 3]] : tensor<32x1x1x32xf32> into tensor<32x32xf32>
+// NORM-REDUCTION: linalg.copy {{.*}} ins(%[[COLLAPSE]]
+
+// Without loop normalization, no swap would happen.
+// CHECK: tensor.collapse_shape
+// CHECK: scf.for
+// CHECK: tensor.extract_slice
+// CHECK-NOT: tensor.collapse_shape
+// CHECK: linalg.copy
+
+// -----
+
+#config = #iree_gpu.lowering_config<{reduction = [0, 30]}>
+func.func @no_swap_collapse_shape_with_extract_slice(%arg0: tensor<32x3x3x288xf32>) -> tensor<32x2592xf32> {
+  %collapsed = tensor.collapse_shape %arg0 [[0], [1, 2, 3]] : tensor<32x3x3x288xf32> into tensor<32x2592xf32>
+  %empty = tensor.empty() : tensor<32x2592xf32>
+  %0 = linalg.copy {lowering_config = #config} ins(%collapsed : tensor<32x2592xf32>) outs(%empty : tensor<32x2592xf32>) -> tensor<32x2592xf32>
+  return %0: tensor<32x2592xf32>
+}
+
+// No swap would happen when collapsed size is not divisible by offset multiplier.
+// NORM-REDUCTION-LABEL: func.func @no_swap_collapse_shape_with_extract_slice
+// NORM-REDUCTION: tensor.collapse_shape
+// NORM-REDUCTION: scf.for
+// NORM-REDUCTION: tensor.extract_slice
+// NORM-REDUCTION-NOT: tensor.collapse_shape
+// NORM-REDUCTION: linalg.copy
+
+// -----
+
+#config = #iree_gpu.lowering_config<{reduction = [0, 32]}>
+func.func @no_swap_collapse_shape_with_extract_slice_2(%arg0: tensor<32x2x2x16xf32>) -> tensor<32x64xf32> {
+  %collapsed = tensor.collapse_shape %arg0 [[0], [1, 2, 3]] : tensor<32x2x2x16xf32> into tensor<32x64xf32>
+  %empty = tensor.empty() : tensor<32x64xf32>
+  %0 = linalg.copy {lowering_config = #config} ins(%collapsed : tensor<32x64xf32>) outs(%empty : tensor<32x64xf32>) -> tensor<32x64xf32>
+  return %0: tensor<32x64xf32>
+}
+
+// No swap would happen when the last expanded size is not divisible by collapse size.
+// NORM-REDUCTION-LABEL: func.func @no_swap_collapse_shape_with_extract_slice_2
+// NORM-REDUCTION: tensor.collapse_shape
+// NORM-REDUCTION: scf.for
+// NORM-REDUCTION: tensor.extract_slice
+// NORM-REDUCTION-NOT: tensor.collapse_shape
+// NORM-REDUCTION: linalg.copy
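Taken together, the three tests suggest how the divisibility guards work (an interpretation of the test comments, not the literal checks in the pattern): in the first test the reduction tile 32 divides both the collapsed size (2592 = 32 * 81) and the innermost expanded size (288 = 32 * 9), so each tile of the collapsed dimension maps to one contiguous [32, 1, 1, 32] slice of the source and the swap fires. In the second test the tile 30 does not divide 2592 (2592 = 30 * 86 + 12), and in the third the tile 32 exceeds the innermost expanded size 16, so a tile would straddle expanded rows; in both cases the pattern bails out and the `collapse_shape` stays where it was.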

compiler/src/iree/compiler/Codegen/Common/NormalizeLoopBounds.cpp

Lines changed: 4 additions & 4 deletions
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree/compiler/Codegen/Common/Passes.h"
+#include "iree/compiler/Codegen/Common/Transforms.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
@@ -86,8 +87,7 @@ emitNormalizedLoopBounds(RewriterBase &rewriter, Location loc, Block *body,
 /// into a 0-based loop with step 1
 ///   for %ii = 0 to ceildiv(%ub - %lb, %s) step 1
 /// Insert an `affine.apply` operation to compute the denormalized index value.
-static LogicalResult normalizeLoopBounds(RewriterBase &rewriter,
-                                         scf::ForOp forOp) {
+LogicalResult normalizeLoopBounds(RewriterBase &rewriter, scf::ForOp forOp) {
   OpBuilder::InsertionGuard g(rewriter);
   // Return if already normalized.
   std::optional<int64_t> lbInt = getConstantIntValue(forOp.getLowerBound());
@@ -135,8 +135,8 @@ static LogicalResult normalizeLoopBounds(RewriterBase &rewriter,
 /// into a 0-based loop with step 1 (normalized)
 ///   forall (%i, %j) in (ceildiv(%ub0 - %lb0, %s0), ceildiv(%ub1 - %lb1, %s1))
 /// Insert `affine.apply` operations to compute the denormalized index values.
-static LogicalResult normalizeLoopBounds(RewriterBase &rewriter,
-                                         scf::ForallOp forallOp) {
+LogicalResult normalizeLoopBounds(RewriterBase &rewriter,
+                                  scf::ForallOp forallOp) {
   OpBuilder::InsertionGuard g(rewriter);
   if (forallOp.isNormalized())
     return success();
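One reading of this last change: removing `static` from both `normalizeLoopBounds` overloads makes them callable from other files, and the newly added include of `iree/compiler/Codegen/Common/Transforms.h` suggests their declarations now live there, which is what lets `GPUApplyTilingLevelPass` above invoke them. This is inferred from the diff; the header change itself is among the files not shown in this view.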
