[mlir] Add for loop specialization
Summary:
We already had a parallel loop specialization pass that enables
unrolling and subsequent vectorization by rewriting loops whose
upper bound is defined as a min of a constant and a dynamic value.
Such a loop is rewritten into two versions, one with the static bound
(the constant) and one with the min as bound, wrapped in a conditional
that dispatches between the two at runtime.
This commit adds the same rewriting for `for` loops.
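
For illustration, a rough before/after sketch of the rewrite on a hypothetical scf.for loop follows. The affine map, the 1024 constant, and the cmpi/scf.if structure mirror the test case added by this commit; the function name and loop body are illustrative only.

// Before: the upper bound is the min of a constant (1024) and a dynamic value.
#map0 = affine_map<()[s0, s1] -> (1024, s0 - s1)>
func @example(%outer: index, %A: memref<?xf32>) {
  %c0 = constant 0 : index
  %c1 = constant 1 : index
  %d0 = dim %A, %c0 : memref<?xf32>
  %ub = affine.min #map0()[%d0, %outer]    // min(1024, %d0 - %outer)
  scf.for %i = %c0 to %ub step %c1 {
    %v = load %A[%i] : memref<?xf32>
    store %v, %A[%i] : memref<?xf32>
  }
  return
}

// After -for-loop-specialization (schematic):
//   %c1024 = constant 1024 : index
//   %cond = cmpi "eq", %ub, %c1024 : index
//   scf.if %cond {
//     scf.for %i = %c0 to %c1024 step %c1 { ... }  // static bound: unroll/vectorize friendly
//   } else {
//     scf.for %i = %c0 to %ub step %c1 { ... }     // original dynamic bound
//   }

The else branch keeps the original dynamic bound, so the rewrite only adds a runtime dispatch and does not change the loop's behavior.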

Differential Revision: https://reviews.llvm.org/D82189
Stephan Herhut committed Jun 22, 2020
1 parent 46ea465 commit 4bcd08e
Showing 7 changed files with 109 additions and 17 deletions.
4 changes: 4 additions & 0 deletions mlir/include/mlir/Dialect/SCF/Passes.h
@@ -20,6 +20,10 @@ namespace mlir {

class Pass;

/// Creates a pass that specializes scf.for loops for unrolling and
/// vectorization.
std::unique_ptr<Pass> createForLoopSpecializationPass();

/// Creates a loop fusion pass which fuses parallel loops.
std::unique_ptr<Pass> createParallelLoopFusionPass();

14 changes: 10 additions & 4 deletions mlir/include/mlir/Dialect/SCF/Passes.td
@@ -1,4 +1,4 @@
//===-- Passes.td - Loop pass definition file --------------*- tablegen -*-===//
//===-- Passes.td - SCF pass definition file ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -11,18 +11,24 @@

include "mlir/Pass/PassBase.td"

def LoopParallelLoopFusion : Pass<"parallel-loop-fusion"> {
def SCFForLoopSpecialization
    : FunctionPass<"for-loop-specialization"> {
  let summary = "Specialize `for` loops for vectorization";
  let constructor = "mlir::createForLoopSpecializationPass()";
}

def SCFParallelLoopFusion : Pass<"parallel-loop-fusion"> {
  let summary = "Fuse adjacent parallel loops";
  let constructor = "mlir::createParallelLoopFusionPass()";
}

def LoopParallelLoopSpecialization
def SCFParallelLoopSpecialization
    : FunctionPass<"parallel-loop-specialization"> {
  let summary = "Specialize parallel loops for vectorization";
  let constructor = "mlir::createParallelLoopSpecializationPass()";
}

def LoopParallelLoopTiling : FunctionPass<"parallel-loop-tiling"> {
def SCFParallelLoopTiling : FunctionPass<"parallel-loop-tiling"> {
  let summary = "Tile parallel loops";
  let constructor = "mlir::createParallelLoopTilingPass()";
  let options = [
2 changes: 1 addition & 1 deletion mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
@@ -1,6 +1,6 @@
add_mlir_dialect_library(MLIRSCFTransforms
  LoopSpecialization.cpp
  ParallelLoopFusion.cpp
  ParallelLoopSpecialization.cpp
  ParallelLoopTiling.cpp
  Utils.cpp

mlir/lib/Dialect/SCF/Transforms/{ParallelLoopSpecialization.cpp → LoopSpecialization.cpp}
@@ -1,12 +1,13 @@
//===- ParallelLoopSpecialization.cpp - scf.parallel specialization ------===//
//===- LoopSpecialization.cpp - scf.parallel/scf.for specialization ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Specializes parallel loops for easier unrolling and vectorization.
// Specializes parallel loops and for loops for easier unrolling and
// vectorization.
//
//===----------------------------------------------------------------------===//

@@ -19,21 +20,22 @@
#include "mlir/IR/BlockAndValueMapping.h"

using namespace mlir;
using scf::ForOp;
using scf::ParallelOp;

/// Rewrite a loop with bounds defined by an affine.min with a constant into 2
/// loops after checking if the bounds are equal to that constant. This is
/// beneficial if the loop will almost always have the constant bound and that
/// version can be fully unrolled and vectorized.
static void specializeLoopForUnrolling(ParallelOp op) {
/// Rewrite a parallel loop with bounds defined by an affine.min with a constant
/// into 2 loops after checking if the bounds are equal to that constant. This
/// is beneficial if the loop will almost always have the constant bound and
/// that version can be fully unrolled and vectorized.
static void specializeParallelLoopForUnrolling(ParallelOp op) {
  SmallVector<int64_t, 2> constantIndices;
  constantIndices.reserve(op.upperBound().size());
  for (auto bound : op.upperBound()) {
    auto minOp = bound.getDefiningOp<AffineMinOp>();
    if (!minOp)
      return;
    int64_t minConstant = std::numeric_limits<int64_t>::max();
    for (auto expr : minOp.map().getResults()) {
    for (AffineExpr expr : minOp.map().getResults()) {
      if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
        minConstant = std::min(minConstant, constantIndex.getValue());
    }
@@ -58,15 +60,56 @@ static void specializeLoopForUnrolling(ParallelOp op) {
  op.erase();
}

/// Rewrite a for loop with bounds defined by an affine.min with a constant into
/// 2 loops after checking if the bounds are equal to that constant. This is
/// beneficial if the loop will almost always have the constant bound and that
/// version can be fully unrolled and vectorized.
static void specializeForLoopForUnrolling(ForOp op) {
  auto bound = op.upperBound();
  auto minOp = bound.getDefiningOp<AffineMinOp>();
  if (!minOp)
    return;
  int64_t minConstant = std::numeric_limits<int64_t>::max();
  for (AffineExpr expr : minOp.map().getResults()) {
    if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
      minConstant = std::min(minConstant, constantIndex.getValue());
  }
  if (minConstant == std::numeric_limits<int64_t>::max())
    return;

  OpBuilder b(op);
  BlockAndValueMapping map;
  Value constant = b.create<ConstantIndexOp>(op.getLoc(), minConstant);
  Value cond =
      b.create<CmpIOp>(op.getLoc(), CmpIPredicate::eq, bound, constant);
  map.map(bound, constant);
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

namespace {
struct ParallelLoopSpecialization
    : public LoopParallelLoopSpecializationBase<ParallelLoopSpecialization> {
    : public SCFParallelLoopSpecializationBase<ParallelLoopSpecialization> {
  void runOnFunction() override {
    getFunction().walk(
        [](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
  }
};

struct ForLoopSpecialization
    : public SCFForLoopSpecializationBase<ForLoopSpecialization> {
  void runOnFunction() override {
    getFunction().walk([](ParallelOp op) { specializeLoopForUnrolling(op); });
    getFunction().walk([](ForOp op) { specializeForLoopForUnrolling(op); });
  }
};
} // namespace

std::unique_ptr<Pass> mlir::createParallelLoopSpecializationPass() {
  return std::make_unique<ParallelLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
  return std::make_unique<ForLoopSpecialization>();
}
2 changes: 1 addition & 1 deletion mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp
Original file line number Diff line number Diff line change
@@ -160,7 +160,7 @@ void mlir::scf::naivelyFuseParallelOps(Region &region) {

namespace {
struct ParallelLoopFusion
    : public LoopParallelLoopFusionBase<ParallelLoopFusion> {
    : public SCFParallelLoopFusionBase<ParallelLoopFusion> {
  void runOnOperation() override {
    getOperation()->walk([&](Operation *child) {
      for (Region &region : child->getRegions())
2 changes: 1 addition & 1 deletion mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp
@@ -119,7 +119,7 @@ static bool getInnermostNestedLoops(Block *block,

namespace {
struct ParallelLoopTiling
    : public LoopParallelLoopTilingBase<ParallelLoopTiling> {
    : public SCFParallelLoopTilingBase<ParallelLoopTiling> {
  ParallelLoopTiling() = default;
  explicit ParallelLoopTiling(ArrayRef<int64_t> tileSizes) {
    this->tileSizes = tileSizes;
39 changes: 39 additions & 0 deletions mlir/test/Dialect/SCF/for-loop-specialization.mlir
@@ -0,0 +1,39 @@
// RUN: mlir-opt %s -for-loop-specialization -split-input-file | FileCheck %s

#map0 = affine_map<()[s0, s1] -> (1024, s0 - s1)>
#map1 = affine_map<()[s0, s1] -> (64, s0 - s1)>

func @for(%outer: index, %A: memref<?xf32>, %B: memref<?xf32>,
          %C: memref<?xf32>, %result: memref<?xf32>) {
  %c0 = constant 0 : index
  %c1 = constant 1 : index
  %d0 = dim %A, %c0 : memref<?xf32>
  %b0 = affine.min #map0()[%d0, %outer]
  scf.for %i0 = %c0 to %b0 step %c1 {
    %B_elem = load %B[%i0] : memref<?xf32>
    %C_elem = load %C[%i0] : memref<?xf32>
    %sum_elem = addf %B_elem, %C_elem : f32
    store %sum_elem, %result[%i0] : memref<?xf32>
  }
  return
}

// CHECK-LABEL: func @for(
// CHECK-SAME: [[ARG0:%.*]]: index, [[ARG1:%.*]]: memref<?xf32>, [[ARG2:%.*]]: memref<?xf32>, [[ARG3:%.*]]: memref<?xf32>, [[ARG4:%.*]]: memref<?xf32>) {
// CHECK: [[CST_0:%.*]] = constant 0 : index
// CHECK: [[CST_1:%.*]] = constant 1 : index
// CHECK: [[DIM_0:%.*]] = dim [[ARG1]], [[CST_0]] : memref<?xf32>
// CHECK: [[MIN:%.*]] = affine.min #map0(){{\[}}[[DIM_0]], [[ARG0]]]
// CHECK: [[CST_1024:%.*]] = constant 1024 : index
// CHECK: [[PRED:%.*]] = cmpi "eq", [[MIN]], [[CST_1024]] : index
// CHECK: scf.if [[PRED]] {
// CHECK: scf.for [[IDX0:%.*]] = [[CST_0]] to [[CST_1024]] step [[CST_1]] {
// CHECK: store
// CHECK: }
// CHECK: } else {
// CHECK: scf.for [[IDX0:%.*]] = [[CST_0]] to [[MIN]] step [[CST_1]] {
// CHECK: store
// CHECK: }
// CHECK: }
// CHECK: return
// CHECK: }
