Commit 35984f0
[Codegen] Add SwapExtractWithCollapsePattern (llvm#21419)
This PR adds a pattern that swaps the `collapse_shape` with the `extract_slice` op to enable more loop fusion opportunities. Note this pattern is adapted from the upstream [`BubbleUpCollapseShapeThroughExtractSlice`](https://github.com/llvm/llvm-project/blob/ffb453989b0e95d85b6cfa543b65fec23b65649d/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp#L322) by allowing some special cases we've seen through the IGEMM path. The special case is handled under the `CASE #1` section, with the corresponding tests mainly focusing on the changes. Note that some strict conditions are added to match the special case for convolutions, which may not be general and robust enough to add to upstream. In addition, we break the [upstream single pattern](https://github.com/llvm/llvm-project/blob/ffb453989b0e95d85b6cfa543b65fec23b65649d/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp#L767) into two separate ones so they can be applied separately in different passes.

---------

Signed-off-by: yzhang93 <zhyuhang88@gmail.com>
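To make the rewrite concrete, here is a rough before/after sketch in MLIR, distilled from the new test added in this commit; `%arg0` and `%offset` stand for the source tensor and a tile offset, and the shapes are the ones used in the test rather than anything the pattern requires in general.

Before the swap, the slice is carved out of the collapsed tensor:

%collapsed = tensor.collapse_shape %arg0 [[0], [1, 2, 3]]
    : tensor<32x3x3x288xf32> into tensor<32x2592xf32>
%slice = tensor.extract_slice %collapsed[0, %offset] [32, 32] [1, 1]
    : tensor<32x2592xf32> to tensor<32x32xf32>

After the swap, the offset is delinearized into the expanded dimensions, the slice is taken directly from the source, and only the small slice is collapsed:

%idx:3 = affine.delinearize_index %offset into (3, 3, 288) : index, index, index
%slice = tensor.extract_slice %arg0[0, %idx#0, %idx#1, %idx#2] [32, 1, 1, 32] [1, 1, 1, 1]
    : tensor<32x3x3x288xf32> to tensor<32x1x1x32xf32>
%collapsed = tensor.collapse_shape %slice [[0], [1, 2, 3]]
    : tensor<32x1x1x32xf32> into tensor<32x32xf32>

With the `collapse_shape` pushed below the `extract_slice`, the tiling loop works on slices of the uncollapsed source, which is presumably what opens up the extra fusion opportunities mentioned above.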

6 files changed: +426 −5 lines

compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp

Lines changed: 17 additions & 0 deletions
@@ -289,6 +289,23 @@ void GPUApplyTilingLevelPass::runOnOperation() {
 
   MLIRContext *context = &getContext();
 
+  // Swap `collapse_shape` with `extract_slice` to enable more loop fusion
+  // opportunity. Currently this is only needed for convolution IGEMM path.
+  // TODO(vivian): Move the pattern to `GPUFuseAndHoistParallelLoopsPass`.
+  if (normalizeLoops) {
+    funcOp->walk(
+        [&](scf::ForOp forOp) { (void)normalizeLoopBounds(rewriter, forOp); });
+    funcOp->walk([&](scf::ForallOp forallOp) {
+      (void)normalizeLoopBounds(rewriter, forallOp);
+    });
+
+    RewritePatternSet patterns(context);
+    populateSwapExtractWithCollapsePattern(patterns);
+    if (failed(applyPatternsGreedily(funcOp, std::move(patterns)))) {
+      return signalPassFailure();
+    }
+  }
+
   // Apply cleanup patterns.
   {
     RewritePatternSet patterns(context);
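For context, `normalizeLoopBounds` (modified below in NormalizeLoopBounds.cpp) rewrites a loop into a zero-based, unit-step form and recovers the original index with an `affine.apply`. A minimal sketch of the `scf.for` case, using illustrative bounds that match the first new test (lower bound 0, upper bound 2592, step 32; the `%c...` constants are assumed to be defined as `arith.constant` ops):

// Before normalization: strided induction variable.
scf.for %i = %c0 to %c2592 step %c32 {
  // ... body uses %i as an offset ...
}

// After normalization: ceildiv(2592 - 0, 32) = 81 iterations of step 1,
// with the original offset rebuilt from the new induction variable.
scf.for %ii = %c0 to %c81 step %c1 {
  %i = affine.apply affine_map<(d0) -> (d0 * 32)>(%ii)
  // ... body uses %i as an offset ...
}

Normalizing first plausibly matters for the swap: it puts the `extract_slice` offset into an affine form of a zero-based induction variable that `affine.delinearize_index` can split across the expanded dimensions.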

compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td

Lines changed: 3 additions & 1 deletion
@@ -326,7 +326,9 @@ def GPUApplyTilingLevelPass :
     )}]>,
     Option<"allowZeroSlices", "allow-zero-slices", "bool",
            /*default=*/"true",
-           "Allow pad fusion to generate zero size slices">
+           "Allow pad fusion to generate zero size slices">,
+    Option<"normalizeLoops", "normalize-loops", "bool", "false",
+           "Enable normalization for scf loops">
   ];
 }
 
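For reference, the new option defaults to false, so existing uses of `iree-codegen-gpu-apply-tiling-level` are unchanged; the test file below opts in with `iree-codegen-gpu-apply-tiling-level{normalize-loops}` in its pass pipeline.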

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir

Lines changed: 65 additions & 0 deletions
@@ -3,6 +3,7 @@
 // RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=thread}, canonicalize, cse))" %s | FileCheck %s --check-prefix=THREAD
 // RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=subgroup}, canonicalize, cse))" %s | FileCheck %s --check-prefix=SUBGROUP
 // RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=partial_reduction}, canonicalize, cse))" %s | FileCheck %s --check-prefix=PARTRED
+// RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{normalize-loops}, canonicalize, cse))" %s | FileCheck %s --check-prefix=NORM-REDUCTION
 
 #config = #iree_gpu.lowering_config<{thread = [2, 16], subgroup = [2, 16]}>
 #map = affine_map<(d0, d1) -> (d0, d1)>
@@ -536,3 +537,67 @@ func.func @partial_reduction(%3: tensor<?x?xf32>) -> tensor<?xf32> {
 // PARTRED: scf.yield
 // PARTRED: linalg.reduce ins(%[[OUT]] : tensor<?x8xf32>)
 // PARTRED-SAME: outs(%[[FULL]] : tensor<?xf32>)
+
+// -----
+
+#config = #iree_gpu.lowering_config<{reduction = [0, 32]}>
+func.func @swap_collapse_shape_with_extract_slice(%arg0: tensor<32x3x3x288xf32>) -> tensor<32x2592xf32> {
+  %collapsed = tensor.collapse_shape %arg0 [[0], [1, 2, 3]] : tensor<32x3x3x288xf32> into tensor<32x2592xf32>
+  %empty = tensor.empty() : tensor<32x2592xf32>
+  %0 = linalg.copy {lowering_config = #config} ins(%collapsed : tensor<32x2592xf32>) outs(%empty : tensor<32x2592xf32>) -> tensor<32x2592xf32>
+  return %0: tensor<32x2592xf32>
+}
+
+// NORM-REDUCTION-LABEL: func.func @swap_collapse_shape_with_extract_slice
+// NORM-REDUCTION-DAG: %[[C1:.+]] = arith.constant 1 : index
+// NORM-REDUCTION-DAG: %[[C81:.+]] = arith.constant 81 : index
+// NORM-REDUCTION-DAG: %[[C0:.+]] = arith.constant 0 : index
+// NORM-REDUCTION: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C81]] step %[[C1]]
+// NORM-REDUCTION: %[[APPLY:.+]] = affine.apply affine_map<(d0) -> (d0 * 32)>(%[[ARG1]])
+// NORM-REDUCTION: %[[IDX:.+]]:3 = affine.delinearize_index %[[APPLY]] into (3, 3, 288) : index, index, index
+// NORM-REDUCTION: %[[SLICE:.+]] = tensor.extract_slice %{{.*}}[0, %[[IDX]]#0, %[[IDX]]#1, %[[IDX]]#2] [32, 1, 1, 32] [1, 1, 1, 1] : tensor<32x3x3x288xf32> to tensor<32x1x1x32xf32>
+// NORM-REDUCTION: %[[COLLAPSE:.+]] = tensor.collapse_shape %[[SLICE]] {{\[}}[0], [1, 2, 3]] : tensor<32x1x1x32xf32> into tensor<32x32xf32>
+// NORM-REDUCTION: linalg.copy {{.*}} ins(%[[COLLAPSE]]
+
+// Without loop normalization, no swap would happen.
+// CHECK: tensor.collapse_shape
+// CHECK: scf.for
+// CHECK: tensor.extract_slice
+// CHECK-NOT: tensor.collapse_shape
+// CHECK: linalg.copy
+
+// -----
+
+#config = #iree_gpu.lowering_config<{reduction = [0, 30]}>
+func.func @no_swap_collapse_shape_with_extract_slice(%arg0: tensor<32x3x3x288xf32>) -> tensor<32x2592xf32> {
+  %collapsed = tensor.collapse_shape %arg0 [[0], [1, 2, 3]] : tensor<32x3x3x288xf32> into tensor<32x2592xf32>
+  %empty = tensor.empty() : tensor<32x2592xf32>
+  %0 = linalg.copy {lowering_config = #config} ins(%collapsed : tensor<32x2592xf32>) outs(%empty : tensor<32x2592xf32>) -> tensor<32x2592xf32>
+  return %0: tensor<32x2592xf32>
+}
+
+// No swap would happen when collapsed size is not divisible by offset multiplier.
+// NORM-REDUCTION-LABEL: func.func @no_swap_collapse_shape_with_extract_slice
+// NORM-REDUCTION: tensor.collapse_shape
+// NORM-REDUCTION: scf.for
+// NORM-REDUCTION: tensor.extract_slice
+// NORM-REDUCTION-NOT: tensor.collapse_shape
+// NORM-REDUCTION: linalg.copy
+
+// -----
+
+#config = #iree_gpu.lowering_config<{reduction = [0, 32]}>
+func.func @no_swap_collapse_shape_with_extract_slice_2(%arg0: tensor<32x2x2x16xf32>) -> tensor<32x64xf32> {
+  %collapsed = tensor.collapse_shape %arg0 [[0], [1, 2, 3]] : tensor<32x2x2x16xf32> into tensor<32x64xf32>
+  %empty = tensor.empty() : tensor<32x64xf32>
+  %0 = linalg.copy {lowering_config = #config} ins(%collapsed : tensor<32x64xf32>) outs(%empty : tensor<32x64xf32>) -> tensor<32x64xf32>
+  return %0: tensor<32x64xf32>
+}
+
+// No swap would happen when the last expanded size is not divisible by collapse size.
+// NORM-REDUCTION-LABEL: func.func @no_swap_collapse_shape_with_extract_slice_2
+// NORM-REDUCTION: tensor.collapse_shape
+// NORM-REDUCTION: scf.for
+// NORM-REDUCTION: tensor.extract_slice
+// NORM-REDUCTION-NOT: tensor.collapse_shape
+// NORM-REDUCTION: linalg.copy
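Taken together, the three tests suggest how the divisibility guards work (an interpretation of the test comments, not the literal checks in the pattern): in the first test the reduction tile 32 divides both the collapsed size (2592 = 32 * 81) and the innermost expanded size (288 = 32 * 9), so each tile of the collapsed dimension maps to one contiguous [32, 1, 1, 32] slice of the source and the swap fires. In the second test the tile 30 does not divide 2592 (2592 = 30 * 86 + 12), and in the third the tile 32 exceeds the innermost expanded size 16, so a tile would straddle expanded rows; in both cases the pattern bails out and the `collapse_shape` stays where it was.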

compiler/src/iree/compiler/Codegen/Common/NormalizeLoopBounds.cpp

Lines changed: 4 additions & 4 deletions
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree/compiler/Codegen/Common/Passes.h"
+#include "iree/compiler/Codegen/Common/Transforms.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
@@ -86,8 +87,7 @@ emitNormalizedLoopBounds(RewriterBase &rewriter, Location loc, Block *body,
 /// into a 0-based loop with step 1
 ///   for %ii = 0 to ceildiv(%ub - %lb, %s) step 1
 /// Insert an `affine.apply` operation to compute the denormalized index value.
-static LogicalResult normalizeLoopBounds(RewriterBase &rewriter,
-                                         scf::ForOp forOp) {
+LogicalResult normalizeLoopBounds(RewriterBase &rewriter, scf::ForOp forOp) {
   OpBuilder::InsertionGuard g(rewriter);
   // Return if already normalized.
   std::optional<int64_t> lbInt = getConstantIntValue(forOp.getLowerBound());
@@ -135,8 +135,8 @@ static LogicalResult normalizeLoopBounds(RewriterBase &rewriter,
 /// into a 0-based loop with step 1 (normalized)
 ///   forall (%i, %j) in (ceildiv(%ub0 - %lb0, %s0), ceildiv(%ub1 - %lb1, %s1))
 /// Insert `affine.apply` operations to compute the denormalized index values.
-static LogicalResult normalizeLoopBounds(RewriterBase &rewriter,
-                                         scf::ForallOp forallOp) {
+LogicalResult normalizeLoopBounds(RewriterBase &rewriter,
+                                  scf::ForallOp forallOp) {
   OpBuilder::InsertionGuard g(rewriter);
   if (forallOp.isNormalized())
     return success();
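One reading of this last change: removing `static` from both `normalizeLoopBounds` overloads makes them callable from other files, and the newly added include of `iree/compiler/Codegen/Common/Transforms.h` suggests their declarations now live there, which is what lets `GPUApplyTilingLevelPass` above invoke them. This is inferred from the diff; the header change itself is among the files not shown in this view.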
