diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
index d4444c3f869e5c..24cb754d65e16c 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
@@ -28,19 +28,11 @@ namespace linalg {
 /// 3. No uses of the memref either dominate the transfer_read or are
 ///    dominated by the transfer_write (i.e. no aliasing between the write and
 ///    the read across the loop)
-/// 4. The source operands for vector.transfer_{read|write} do not originate
-///    from Ops implementing ViewLikeOpInterface (to reduce the risk of
-///    aliasing).
 /// To improve hoisting opportunities, call the `moveLoopInvariantCode` helper
 /// function on the candidate loop above which to hoist. Hoisting the transfers
 /// results in scf::ForOp yielding the value that originally transited through
 /// memory.
 ///
-/// TODO: To further improve hoisting opportunities, fold aliasing memref
-/// operations into respective vector.transfer{read|write} operations and
-/// avoid using ops implementing ViewLikeOpInterface as the source for transfer
-/// Ops.
-///
 /// WARNING: This hoisting does not model parallelism and is generally incorrect
 /// when used on distributed loops with memref semantics!
 void hoistRedundantVectorTransfers(func::FuncOp func);
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
index 221bec713b38aa..1c68ca49725eff 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
@@ -55,12 +55,11 @@ void mlir::linalg::hoistRedundantVectorTransfersOnTensor(func::FuncOp func) {
 static bool noAliasingUseInLoop(vector::TransferReadOp transferRead,
                                 LoopLikeOpInterface loop) {
   Value source = transferRead.getSource();
-
-  // Skip view-like Ops and retrive the actual soruce Operation
-  while (auto srcOp =
-             dyn_cast_or_null<ViewLikeOpInterface>(source.getDefiningOp()))
-    source = srcOp.getViewSource();
-
+  // Skip subview and collapse_shape Ops
+  while (auto subView = source.getDefiningOp<memref::SubViewOp>())
+    source = subView.getSource();
+  while (auto collapsed = source.getDefiningOp<memref::CollapseShapeOp>())
+    source = collapsed->getOperand(0);
   llvm::SmallVector<Operation *, 32> users(source.getUsers().begin(),
                                            source.getUsers().end());
   llvm::SmallDenseSet<Operation *, 32> processed;
@@ -69,8 +68,12 @@ static bool noAliasingUseInLoop(vector::TransferReadOp transferRead,
     // If the user has already been processed skip.
     if (!processed.insert(user).second)
      continue;
-    if (auto viewLike = dyn_cast<ViewLikeOpInterface>(user)) {
-      users.append(viewLike->getUsers().begin(), viewLike->getUsers().end());
+    if (auto subView = dyn_cast<memref::SubViewOp>(user)) {
+      users.append(subView->getUsers().begin(), subView->getUsers().end());
+      continue;
+    }
+    if (auto collapsed = dyn_cast<memref::CollapseShapeOp>(user)) {
+      users.append(collapsed->getUsers().begin(), collapsed->getUsers().end());
       continue;
     }
     if (isMemoryEffectFree(user) || isa<vector::TransferReadOp>(user))
@@ -141,9 +144,7 @@ void mlir::linalg::hoistRedundantVectorTransfers(func::FuncOp func) {
       // Approximate aliasing by checking that:
      //   1. indices, vector type and permutation map are the same (i.e., the
      //      transfer_read/transfer_write ops are matching),
-      //   2. source operands for transfer.{read|write} do not originate from
-      //      Ops implementing ViewLikeOpInterface.
-      //   3. no other operations in the loop access the same memref except
+      //   2. no other operations in the loop access the same memref except
      //      for transfer_read/transfer_write accessing statically disjoint
      //      slices.
      if (transferRead.getIndices() != transferWrite.getIndices() ||
@@ -151,14 +152,6 @@ void mlir::linalg::hoistRedundantVectorTransfers(func::FuncOp func) {
          transferRead.getPermutationMap() != transferWrite.getPermutationMap())
        return WalkResult::advance();
 
-      auto *source = transferRead.getSource().getDefiningOp();
-      if (source && isa_and_nonnull<ViewLikeOpInterface>(source))
-        return WalkResult::advance();
-
-      source = transferWrite.getSource().getDefiningOp();
-      if (source && isa_and_nonnull<ViewLikeOpInterface>(source))
-        return WalkResult::advance();
-
      // TODO: may want to memoize this information for performance but it
      // likely gets invalidated often.
      DominanceInfo dom(loop);
diff --git a/mlir/test/Dialect/Linalg/hoisting.mlir b/mlir/test/Dialect/Linalg/hoisting.mlir
index 7d0c3648c344b1..e25914620726b9 100644
--- a/mlir/test/Dialect/Linalg/hoisting.mlir
+++ b/mlir/test/Dialect/Linalg/hoisting.mlir
@@ -765,10 +765,10 @@ transform.sequence failures(propagate) {
 
 // CHECK-LABEL: func.func @no_hoisting_collapse_shape
 // CHECK: scf.for {{.*}} {
-// CHECK: vector.transfer_write {{.*}} : vector<4xi32>, memref<4xi32>
-// CHECK-NEXT: vector.transfer_read {{.*}} : memref<1x4x1xi32>, vector<1x4x1xi32>
-// CHECK-NEXT: vector.transfer_write {{.*}} : vector<1x4x1xi32>, memref<1x4x1xi32, strided<[20, 1, 1], offset: ?>>
-// CHECK-NEXT: }
+// CHECK: vector.transfer_write
+// CHECK: vector.transfer_read
+// CHECK: vector.transfer_write
+// CHECK: }
 
 func.func @no_hoisting_collapse_shape(%in_0: memref<1x20x1xi32>, %1: memref<9x1xi32>, %vec: vector<4xi32>) {
   %c0_i32 = arith.constant 0 : i32
@@ -827,48 +827,3 @@ transform.sequence failures(propagate) {
   transform.structured.hoist_redundant_vector_transfers %0
     : (!transform.any_op) -> !transform.any_op
 }
-
-// -----
-
-// Regression test - hoisting the following `vector.transfer_{read|write}` pair
-// would not be safe:
-//    %lhs = vector.transfer_read %collapsed_1[%c0]
-//    vector.transfer_write %op, %collapsed_1[%c0]
-// That's because the following `vector.transfer_read` reads from the same
-// memory (i.e. `%collapsed_1` and `%collapsed_2` alias):
-//    %acc = vector.transfer_read %collapsed_2[%c0]
-
-// CHECK-LABEL: func.func @no_hoisting_write_to_memref
-// CHECK: scf.for {{.*}} {
-// CHECK: vector.transfer_read {{.*}} : memref<2xi32>, vector<1xi32>
-// CHECK-NEXT: vector.transfer_read {{.*}} : memref<2xi32>, vector<1xi32>
-// CHECK-NEXT: vector.outerproduct {{.*}} : vector<1xi32>, i32
-// CHECK-NEXT: vector.transfer_write {{.*}} : vector<1xi32>, memref<2xi32>
-// CHECK-NEXT: }
-
-func.func @no_hoisting_write_to_memref(%rhs: i32, %arg1: vector<1xi32>) {
-  %c0_i32 = arith.constant 0 : i32
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c4 = arith.constant 4 : index
-  %c20 = arith.constant 20 : index
-  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2xi32>
-  %cast = memref.cast %alloca : memref<1x1x2xi32> to memref<1x1x2xi32>
-  %collapsed_1 = memref.collapse_shape %alloca [[0, 1, 2]] : memref<1x1x2xi32> into memref<2xi32>
-  scf.for %_ = %c0 to %c20 step %c4 {
-    %collapsed_2 = memref.collapse_shape %alloca [[0, 1, 2]] : memref<1x1x2xi32> into memref<2xi32>
-    %lhs = vector.transfer_read %collapsed_1[%c0], %c0_i32 {in_bounds = [true]} : memref<2xi32>, vector<1xi32>
-    %acc = vector.transfer_read %collapsed_2[%c0], %c0_i32 {in_bounds = [true]} : memref<2xi32>, vector<1xi32>
-    %op = vector.outerproduct %lhs, %rhs, %acc {kind = #vector.kind<add>} : vector<1xi32>, i32
-    vector.transfer_write %op, %collapsed_1[%c0] {in_bounds = [true]} : vector<1xi32>, memref<2xi32>
-  }
-  return
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1
-    : (!transform.any_op) -> !transform.any_op
-  transform.structured.hoist_redundant_vector_transfers %0
-    : (!transform.any_op) -> !transform.any_op
-}
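
For context, a minimal sketch of the transformation that `hoistRedundantVectorTransfers` performs when the conditions listed in Hoisting.h hold. The function names, shapes, constants, and the `arith.addf` loop body below are invented for illustration; they are not part of this patch or of hoisting.mlir.

// Before hoisting: the transfer_read/transfer_write pair inside the loop is
// loop-invariant and %mem has no other (potentially aliasing) use in the loop.
func.func @hoist_sketch(%mem: memref<8xf32>, %vec: vector<4xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %pad = arith.constant 0.0 : f32
  scf.for %i = %c0 to %c8 step %c1 {
    %r = vector.transfer_read %mem[%c0], %pad {in_bounds = [true]} : memref<8xf32>, vector<4xf32>
    %a = arith.addf %r, %vec : vector<4xf32>
    vector.transfer_write %a, %mem[%c0] {in_bounds = [true]} : vector<4xf32>, memref<8xf32>
  }
  return
}

// After hoisting: the transfers sit outside the loop and the value transits
// through the scf.for iter_args instead of through memory.
func.func @hoist_sketch_hoisted(%mem: memref<8xf32>, %vec: vector<4xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %pad = arith.constant 0.0 : f32
  %init = vector.transfer_read %mem[%c0], %pad {in_bounds = [true]} : memref<8xf32>, vector<4xf32>
  %res = scf.for %i = %c0 to %c8 step %c1 iter_args(%acc = %init) -> (vector<4xf32>) {
    %a = arith.addf %acc, %vec : vector<4xf32>
    scf.yield %a : vector<4xf32>
  }
  vector.transfer_write %res, %mem[%c0] {in_bounds = [true]} : vector<4xf32>, memref<8xf32>
  return
}

With the restored `noAliasingUseInLoop`, aliasing is only looked through for `memref.subview` and `memref.collapse_shape`; any other in-loop user of the source memref that is not memory-effect-free (or a transfer_read) blocks the hoist.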