[mlir] Add for loop specialization
Summary:
We already had a parallel loop specialization pass that enables
unrolling and subsequent vectorization by rewriting loops whose
upper bound is defined as a min of a constant and a dynamic value.
Such a loop is rewritten into two versions, one with the static bound
(the constant) and one with the min as bound, wrapped in a conditional
that dispatches between the two at runtime.
This commit adds the same rewriting for `for` loops.
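
For illustration, a rough before/after sketch of the rewrite on a hypothetical scf.for loop follows. The affine map, the 1024 constant, and the cmpi/scf.if structure mirror the test case added by this commit; the function name and loop body are illustrative only.

// Before: the upper bound is the min of a constant (1024) and a dynamic value.
#map0 = affine_map<()[s0, s1] -> (1024, s0 - s1)>
func @example(%outer: index, %A: memref<?xf32>) {
  %c0 = constant 0 : index
  %c1 = constant 1 : index
  %d0 = dim %A, %c0 : memref<?xf32>
  %ub = affine.min #map0()[%d0, %outer]    // min(1024, %d0 - %outer)
  scf.for %i = %c0 to %ub step %c1 {
    %v = load %A[%i] : memref<?xf32>
    store %v, %A[%i] : memref<?xf32>
  }
  return
}

// After -for-loop-specialization (schematic):
//   %c1024 = constant 1024 : index
//   %cond = cmpi "eq", %ub, %c1024 : index
//   scf.if %cond {
//     scf.for %i = %c0 to %c1024 step %c1 { ... }  // static bound: unroll/vectorize friendly
//   } else {
//     scf.for %i = %c0 to %ub step %c1 { ... }     // original dynamic bound
//   }

The else branch keeps the original dynamic bound, so the rewrite only adds a runtime dispatch and does not change the loop's behavior.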

Differential Revision: https://reviews.llvm.org/D82189
Stephan Herhut committed Jun 22, 2020
1 parent 46ea465 commit 4bcd08e
Showing 7 changed files with 109 additions and 17 deletions.
4 changes: 4 additions & 0 deletions mlir/include/mlir/Dialect/SCF/Passes.h
@@ -20,6 +20,10 @@ namespace mlir {

class Pass;

/// Creates a pass that specializes scf.for loops for unrolling and
/// vectorization.
std::unique_ptr<Pass> createForLoopSpecializationPass();

/// Creates a loop fusion pass which fuses parallel loops.
std::unique_ptr<Pass> createParallelLoopFusionPass();

14 changes: 10 additions & 4 deletions mlir/include/mlir/Dialect/SCF/Passes.td
@@ -1,4 +1,4 @@
//===-- Passes.td - Loop pass definition file --------------*- tablegen -*-===//
//===-- Passes.td - SCF pass definition file ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -11,18 +11,24 @@

include "mlir/Pass/PassBase.td"

def LoopParallelLoopFusion : Pass<"parallel-loop-fusion"> {
def SCFForLoopSpecialization
    : FunctionPass<"for-loop-specialization"> {
  let summary = "Specialize `for` loops for vectorization";
  let constructor = "mlir::createForLoopSpecializationPass()";
}

def SCFParallelLoopFusion : Pass<"parallel-loop-fusion"> {
  let summary = "Fuse adjacent parallel loops";
  let constructor = "mlir::createParallelLoopFusionPass()";
}

def LoopParallelLoopSpecialization
def SCFParallelLoopSpecialization
    : FunctionPass<"parallel-loop-specialization"> {
  let summary = "Specialize parallel loops for vectorization";
  let constructor = "mlir::createParallelLoopSpecializationPass()";
}

def LoopParallelLoopTiling : FunctionPass<"parallel-loop-tiling"> {
def SCFParallelLoopTiling : FunctionPass<"parallel-loop-tiling"> {
  let summary = "Tile parallel loops";
  let constructor = "mlir::createParallelLoopTilingPass()";
  let options = [
2 changes: 1 addition & 1 deletion mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
@@ -1,6 +1,6 @@
add_mlir_dialect_library(MLIRSCFTransforms
  LoopSpecialization.cpp
  ParallelLoopFusion.cpp
  ParallelLoopSpecialization.cpp
  ParallelLoopTiling.cpp
  Utils.cpp

mlir/lib/Dialect/SCF/Transforms/{ParallelLoopSpecialization.cpp → LoopSpecialization.cpp}
@@ -1,12 +1,13 @@
//===- ParallelLoopSpecialization.cpp - scf.parallel specialization ------===//
//===- LoopSpecialization.cpp - scf.parallel/scf.for specialization ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Specializes parallel loops for easier unrolling and vectorization.
// Specializes parallel loops and for loops for easier unrolling and
// vectorization.
//
//===----------------------------------------------------------------------===//

@@ -19,21 +20,22 @@
#include "mlir/IR/BlockAndValueMapping.h"

using namespace mlir;
using scf::ForOp;
using scf::ParallelOp;

/// Rewrite a loop with bounds defined by an affine.min with a constant into 2
/// loops after checking if the bounds are equal to that constant. This is
/// beneficial if the loop will almost always have the constant bound and that
/// version can be fully unrolled and vectorized.
static void specializeLoopForUnrolling(ParallelOp op) {
/// Rewrite a parallel loop with bounds defined by an affine.min with a constant
/// into 2 loops after checking if the bounds are equal to that constant. This
/// is beneficial if the loop will almost always have the constant bound and
/// that version can be fully unrolled and vectorized.
static void specializeParallelLoopForUnrolling(ParallelOp op) {
  SmallVector<int64_t, 2> constantIndices;
  constantIndices.reserve(op.upperBound().size());
  for (auto bound : op.upperBound()) {
    auto minOp = bound.getDefiningOp<AffineMinOp>();
    if (!minOp)
      return;
    int64_t minConstant = std::numeric_limits<int64_t>::max();
    for (auto expr : minOp.map().getResults()) {
    for (AffineExpr expr : minOp.map().getResults()) {
      if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
        minConstant = std::min(minConstant, constantIndex.getValue());
    }
@@ -58,15 +60,56 @@ static void specializeLoopForUnrolling(ParallelOp op) {
  op.erase();
}

/// Rewrite a for loop with bounds defined by an affine.min with a constant into
/// 2 loops after checking if the bounds are equal to that constant. This is
/// beneficial if the loop will almost always have the constant bound and that
/// version can be fully unrolled and vectorized.
static void specializeForLoopForUnrolling(ForOp op) {
  auto bound = op.upperBound();
  auto minOp = bound.getDefiningOp<AffineMinOp>();
  if (!minOp)
    return;
  int64_t minConstant = std::numeric_limits<int64_t>::max();
  for (AffineExpr expr : minOp.map().getResults()) {
    if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
      minConstant = std::min(minConstant, constantIndex.getValue());
  }
  if (minConstant == std::numeric_limits<int64_t>::max())
    return;

  OpBuilder b(op);
  BlockAndValueMapping map;
  Value constant = b.create<ConstantIndexOp>(op.getLoc(), minConstant);
  Value cond =
      b.create<CmpIOp>(op.getLoc(), CmpIPredicate::eq, bound, constant);
  map.map(bound, constant);
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

namespace {
struct ParallelLoopSpecialization
    : public LoopParallelLoopSpecializationBase<ParallelLoopSpecialization> {
    : public SCFParallelLoopSpecializationBase<ParallelLoopSpecialization> {
  void runOnFunction() override {
    getFunction().walk(
        [](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
  }
};

struct ForLoopSpecialization
    : public SCFForLoopSpecializationBase<ForLoopSpecialization> {
  void runOnFunction() override {
    getFunction().walk([](ParallelOp op) { specializeLoopForUnrolling(op); });
    getFunction().walk([](ForOp op) { specializeForLoopForUnrolling(op); });
  }
};
} // namespace

std::unique_ptr<Pass> mlir::createParallelLoopSpecializationPass() {
  return std::make_unique<ParallelLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
  return std::make_unique<ForLoopSpecialization>();
}
2 changes: 1 addition & 1 deletion mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp
Original file line number Diff line number Diff line change
@@ -160,7 +160,7 @@ void mlir::scf::naivelyFuseParallelOps(Region &region) {

namespace {
struct ParallelLoopFusion
    : public LoopParallelLoopFusionBase<ParallelLoopFusion> {
    : public SCFParallelLoopFusionBase<ParallelLoopFusion> {
  void runOnOperation() override {
    getOperation()->walk([&](Operation *child) {
      for (Region &region : child->getRegions())
2 changes: 1 addition & 1 deletion mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp
@@ -119,7 +119,7 @@ static bool getInnermostNestedLoops(Block *block,

namespace {
struct ParallelLoopTiling
    : public LoopParallelLoopTilingBase<ParallelLoopTiling> {
    : public SCFParallelLoopTilingBase<ParallelLoopTiling> {
  ParallelLoopTiling() = default;
  explicit ParallelLoopTiling(ArrayRef<int64_t> tileSizes) {
    this->tileSizes = tileSizes;
39 changes: 39 additions & 0 deletions mlir/test/Dialect/SCF/for-loop-specialization.mlir
@@ -0,0 +1,39 @@
// RUN: mlir-opt %s -for-loop-specialization -split-input-file | FileCheck %s

#map0 = affine_map<()[s0, s1] -> (1024, s0 - s1)>
#map1 = affine_map<()[s0, s1] -> (64, s0 - s1)>

func @for(%outer: index, %A: memref<?xf32>, %B: memref<?xf32>,
          %C: memref<?xf32>, %result: memref<?xf32>) {
  %c0 = constant 0 : index
  %c1 = constant 1 : index
  %d0 = dim %A, %c0 : memref<?xf32>
  %b0 = affine.min #map0()[%d0, %outer]
  scf.for %i0 = %c0 to %b0 step %c1 {
    %B_elem = load %B[%i0] : memref<?xf32>
    %C_elem = load %C[%i0] : memref<?xf32>
    %sum_elem = addf %B_elem, %C_elem : f32
    store %sum_elem, %result[%i0] : memref<?xf32>
  }
  return
}

// CHECK-LABEL: func @for(
// CHECK-SAME: [[ARG0:%.*]]: index, [[ARG1:%.*]]: memref<?xf32>, [[ARG2:%.*]]: memref<?xf32>, [[ARG3:%.*]]: memref<?xf32>, [[ARG4:%.*]]: memref<?xf32>) {
// CHECK: [[CST_0:%.*]] = constant 0 : index
// CHECK: [[CST_1:%.*]] = constant 1 : index
// CHECK: [[DIM_0:%.*]] = dim [[ARG1]], [[CST_0]] : memref<?xf32>
// CHECK: [[MIN:%.*]] = affine.min #map0(){{\[}}[[DIM_0]], [[ARG0]]]
// CHECK: [[CST_1024:%.*]] = constant 1024 : index
// CHECK: [[PRED:%.*]] = cmpi "eq", [[MIN]], [[CST_1024]] : index
// CHECK: scf.if [[PRED]] {
// CHECK: scf.for [[IDX0:%.*]] = [[CST_0]] to [[CST_1024]] step [[CST_1]] {
// CHECK: store
// CHECK: }
// CHECK: } else {
// CHECK: scf.for [[IDX0:%.*]] = [[CST_0]] to [[MIN]] step [[CST_1]] {
// CHECK: store
// CHECK: }
// CHECK: }
// CHECK: return
// CHECK: }
