diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h
index 18b3b790338d8e..fdf515b06359a6 100644
--- a/mlir/include/mlir/Dialect/Affine/Passes.h
+++ b/mlir/include/mlir/Dialect/Affine/Passes.h
@@ -35,7 +35,7 @@ std::unique_ptr<OperationPass<FuncOp>> createSimplifyAffineStructuresPass();
 /// operations out of affine loops.
 std::unique_ptr<OperationPass<FuncOp>> createAffineLoopInvariantCodeMotionPass();
-
+std::unique_ptr<OperationPass<FuncOp>> createAffineLoopInterchange();
 /// Creates a pass to convert all parallel affine.for's into 1-d affine.parallel
 /// ops.
 std::unique_ptr<OperationPass<FuncOp>> createAffineParallelizePass();
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td
index 810640058155fb..f26c8e22e94fa7 100644
--- a/mlir/include/mlir/Dialect/Affine/Passes.td
+++ b/mlir/include/mlir/Dialect/Affine/Passes.td
@@ -47,7 +47,11 @@ def AffineLoopInvariantCodeMotion
   let summary = "Hoist loop invariant instructions outside of affine loops";
   let constructor = "mlir::createAffineLoopInvariantCodeMotionPass()";
 }
-
+def AffineLoopInterchange
+    : FunctionPass<"affine-loop-interchange"> {
+  let summary = "Interchange affine loops to improve locality and parallelism";
+  let constructor = "mlir::createAffineLoopInterchange()";
+}
 def AffineLoopTiling : FunctionPass<"affine-loop-tile"> {
   let summary = "Tile affine loop nests";
   let constructor = "mlir::createLoopTilingPass()";
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineLoopInterchange.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInterchange.cpp
new file mode 100644
index 00000000000000..bad877d5961050
--- /dev/null
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInterchange.cpp
@@ -0,0 +1,807 @@
+//===- AffineLoopInterchange.cpp - Code to perform loop interchange ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass that permutes perfectly nested affine loop
+// bands, choosing the valid permutation with the lowest combined cache
+// locality and parallelism cost.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PassDetail.h"
+#include "mlir/Analysis/AffineAnalysis.h"
+#include "mlir/Analysis/AffineStructures.h"
+#include "mlir/Analysis/LoopAnalysis.h"
+#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Analysis/Utils.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/AffineMap.h"
+#include "mlir/IR/BlockSupport.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/IntegerSet.h"
+#include "mlir/IR/Visitors.h"
+#include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Utils.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <limits>
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+#define cache_line_size 64 // cache line size in bytes (512 bits)
+
+using namespace mlir;
+
+llvm::DenseMap<std::pair<std::pair<Operation *, Operation *>, unsigned>, int>
+    hasDependences;
+
+namespace {
+struct LoopInterchange : public AffineLoopInterchangeBase<LoopInterchange> {
+  void runOnFunction() override;
+  void handleNestedLoops(Operation &func);
+  void getAccessMatrix(AffineForOp op, std::vector<AffineForOp> &foropvector,
+                       llvm::DenseMap<Value, unsigned> &index_values);
+  void buildRefGroups(AffineForOp ForOp,
+                      std::vector<std::set<Operation *>> *refGroups,
+                      unsigned max_depth, unsigned innermostlooplevel);
+  int64_t getLoopCost(std::vector<AffineForOp> &foropvector,
+                      int indexofforloop, std::vector<Operation *> &references);
+  void getDependenceMatrix(AffineForOp forOp, unsigned maxLoopDepth,
+                           std::vector<std::vector<int64_t>> *depCompsVec);
+  void getLoopCosts(std::vector<AffineForOp> &forops);
+
+private:
+  // Per-operation access matrices: for each load/store, one row per array
+  // dimension over the loop induction variables and symbols.
+  llvm::DenseMap<Operation *, std::vector<std::vector<int64_t>>>
+      LoopAccessMatrix;
+  // Per-operation constant offsets, one entry per array dimension.
+  llvm::DenseMap<Operation *, std::vector<int64_t>> const_vector;
+  llvm::DenseMap<Operation *, unsigned> elementSize;
+  std::vector<int64_t> iteration_counts;
+  std::vector<int> loop_dependence_vector;
+  std::unordered_map<AffineForOp *, long double> loopcost;
+  double r;
+  std::vector<std::set<Operation *>> groups;
+};
+} // end anonymous namespace
+
+// Returns true if the loop nest rooted at `forOp` contains an affine.if.
+bool hasIfStatement(AffineForOp forOp) {
+  bool ifFound = false;
+  forOp.walk([&](Operation *op) {
+    if (isa<AffineIfOp>(op))
+      ifFound = true;
+  });
+  return ifFound;
+}
+
+// Tests for cases where a lower/upper bound is not a simple constant value,
+// e.g. i <= x for some SSA value x rather than a constant.
+bool testRectangularForloopNest(std::vector<AffineForOp> loopnest) {
+  for (auto a : loopnest) {
+    if (!a.hasConstantUpperBound() || !a.hasConstantLowerBound())
+      return false;
+  }
+  return true;
+}
+
+/// Computes the access matrix of every load/store in the nest. The result is
+/// stored as a map from each operation to its access matrix, together with a
+/// map from each operation to its vector of constant offsets.
+void LoopInterchange::getAccessMatrix(
+    AffineForOp op, std::vector<AffineForOp> &foropvector,
+    llvm::DenseMap<Value, unsigned> &index_values) {
+  // Mod/ceildiv/floordiv operations are treated like accesses to dimensions.
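+  // Illustrative example (not from the test suite): for a nest with
+  // induction variables (i, j, k) and an access A[2*i + j][k + 3], the
+  // access matrix is {{2, 1, 0}, {0, 0, 1}} and the constant vector is
+  // {0, 3}.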
+  this->LoopAccessMatrix.clear();
+  this->const_vector.clear();
+  SmallVector<Operation *, 8> loadAndStoreOpInsts;
+  op.getOperation()->walk([&](Operation *opInst) {
+    if (isa<AffineLoadOp>(opInst) || isa<AffineStoreOp>(opInst))
+      loadAndStoreOpInsts.push_back(opInst);
+  });
+
+  unsigned numOps = loadAndStoreOpInsts.size();
+  for (unsigned i = 0; i < numOps; ++i) {
+    auto *srcOpInst = loadAndStoreOpInsts[i];
+    MemRefAccess srcAccess(srcOpInst);
+    AffineMap map;
+    if (auto loadOp = dyn_cast<AffineLoadOp>(srcAccess.opInst))
+      map = loadOp.getAffineMap();
+    else if (auto storeOp = dyn_cast<AffineStoreOp>(srcAccess.opInst))
+      map = storeOp.getAffineMap();
+    SmallVector<Value, 8> operands(srcAccess.indices.begin(),
+                                   srcAccess.indices.end());
+
+    fullyComposeAffineMapAndOperands(&map, &operands);
+    map = simplifyAffineMap(map);
+    canonicalizeMapAndOperands(&map, &operands);
+    // One result expression per dimension of the accessed array.
+    ArrayRef<AffineExpr> a = map.getResults();
+    uint64_t n_dim = map.getNumDims();
+    uint64_t n_symbols = map.getNumSymbols();
+
+    // Rows: dimensions of the array.
+    // Columns: number of dims + number of symbols.
+    std::vector<std::vector<int64_t>> accessMatrix;
+    std::vector<int64_t> const_vector(a.size(), 0);
+
+    // Walk the affine expressions, all belonging to this one instruction.
+    for (unsigned l = 0; l < a.size(); l++) {
+      // `a` has one entry per dimension of the array being accessed.
+      AffineExpr b = a[l]; // expression in the l-th dimension of the array
+
+      std::vector<int64_t> Row(
+          std::max<size_t>(foropvector.size(), n_dim + n_symbols), 0);
+
+      // If b is a constant expression such as A[5], there is no need to walk
+      // it: 5 = [0 0 0][i j k]' + [5]. Simply record the value in
+      // const_vector and leave the row all zeroes.
+      if (b.getKind() == AffineExprKind::Constant) {
+        auto constant = b.cast<AffineConstantExpr>();
+        const_vector[l] = constant.getValue();
+      } else {
+        // Construct one row of the access matrix. The walk visits the
+        // sub-expressions of the top-level expression b, i.e.
+        // b = expr op expr op expr ...
+        b.walk([&](AffineExpr expr) {
+          switch (expr.getKind()) {
+
+          // A[a + b]: a and b can themselves be expressions, but a constant
+          // operand is always canonicalized to the right. Then
+          // A[a + constant] = [1 0 0 ...][a b c ...]' + [constant].
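+          // Illustrative example: for A[i + j + 4] in an (i, j) nest, the
+          // walk marks the i and j columns with 1 and accumulates 4 into
+          // const_vector[l], giving Row = [1, 1] and const_vector[l] = 4.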
+          case AffineExprKind::Add: {
+            bool insertF = true;
+            AffineBinaryOpExpr addExpr = expr.cast<AffineBinaryOpExpr>();
+            auto lhs = addExpr.getLHS();
+            auto rhs = addExpr.getRHS();
+            unsigned lhs_position = 0;
+            unsigned rhs_position = 0;
+
+            auto lhskind = lhs.getKind();
+            if (lhskind == AffineExprKind::DimId) {
+              auto dim = lhs.cast<AffineDimExpr>();
+              lhs_position = index_values[operands[dim.getPosition()]];
+            } else if (lhskind == AffineExprKind::SymbolId) {
+              auto symbol = lhs.cast<AffineSymbolExpr>();
+              lhs_position = n_dim + symbol.getPosition();
+            } else if (lhskind == AffineExprKind::Constant) {
+              // Canonicalization keeps constants on the right, so this case
+              // should not occur; handle it defensively anyway.
+              const_vector[l] += lhs.cast<AffineConstantExpr>().getValue();
+              insertF = false;
+            }
+
+            // Mark a 1 only if this variable's column is still 0:
+            // A[a + b] = [1 ...][a ...]'. A non-zero entry means the
+            // variable already carries a multiple c from some
+            // sub-expression, A[c*a + ...] = [c ...][a ...]', and must not
+            // be overwritten.
+            if (Row[lhs_position] == 0 && insertF)
+              Row[lhs_position] = 1;
+
+            auto rhskind = rhs.getKind();
+            insertF = true; // flags whether to modify Row; the row is left
+                            // untouched when the operand is a constant.
+            if (rhskind == AffineExprKind::DimId) {
+              auto dim = rhs.cast<AffineDimExpr>();
+              rhs_position = index_values[operands[dim.getPosition()]];
+            } else if (rhskind == AffineExprKind::SymbolId) {
+              auto symbol = rhs.cast<AffineSymbolExpr>();
+              rhs_position = n_dim + symbol.getPosition();
+            } else if (rhskind == AffineExprKind::Constant) {
+              // A constant rhs is simply added to this dimension's offset.
+              const_vector[l] += rhs.cast<AffineConstantExpr>().getValue();
+              insertF = false;
+            }
+
+            if (Row[rhs_position] == 0 && insertF)
+              Row[rhs_position] = 1;
+            break;
+          }
+
+          // A[5 * j] = [0 5 0][i j k]' + [0]; the constant multiple is
+          // always on the right.
+          case AffineExprKind::Mul: {
+            AffineBinaryOpExpr mulExpr = expr.cast<AffineBinaryOpExpr>();
+            auto lhs = mulExpr.getLHS();
+            auto rhs = mulExpr.getRHS();
+            unsigned position = 0;
+            switch (lhs.getKind()) {
+            case AffineExprKind::DimId: {
+              auto dim = lhs.cast<AffineDimExpr>();
+              position = index_values[operands[dim.getPosition()]];
+              break;
+            }
+            case AffineExprKind::SymbolId: {
+              auto symbol = lhs.cast<AffineSymbolExpr>();
+              position = n_dim + symbol.getPosition();
+              break;
+            }
+            default:
+              break;
+            }
+            if (rhs.getKind() == AffineExprKind::Constant)
+              Row[position] = rhs.cast<AffineConstantExpr>().getValue();
+            break;
+          }
+
+          // A dim id such as A[j] = [1 0 0][j k l]' + [0].
+          case AffineExprKind::DimId: {
+            auto dim = expr.cast<AffineDimExpr>();
+            Row[index_values[operands[dim.getPosition()]]] = 1;
+            break;
+          }
+
+          // A symbol id such as A[s] = [0 0 1][j k s]' + [0].
+          case AffineExprKind::SymbolId: {
+            auto symbol = expr.cast<AffineSymbolExpr>();
+            Row[n_dim + symbol.getPosition()] = 1;
+            break;
+          }
+
+          // A[i op c] where op is ceildiv, floordiv, or mod.
+          case AffineExprKind::CeilDiv:
+          case AffineExprKind::FloorDiv:
+          case AffineExprKind::Mod: {
+            auto binExpr = expr.cast<AffineBinaryOpExpr>();
+            auto lhs = binExpr.getLHS();
+            // The rhs is always constant or symbolic.
+            auto rhs = binExpr.getRHS();
+            switch (lhs.getKind()) {
+            case AffineExprKind::DimId: {
+              auto dim = lhs.cast<AffineDimExpr>();
+              Row[index_values[operands[dim.getPosition()]]] = 1;
+              break;
+            }
+            case AffineExprKind::SymbolId: {
+              auto symbol = lhs.cast<AffineSymbolExpr>();
+              Row[n_dim + symbol.getPosition()] = 1;
+              break;
+            }
+            default:
+              break;
+            }
+            // A constant rhs contributes nothing to the row; a symbolic rhs
+            // marks its column.
+            if (rhs.getKind() == AffineExprKind::SymbolId) {
+              auto symbol = rhs.cast<AffineSymbolExpr>();
+              Row[n_dim + symbol.getPosition()] = 1;
+            }
+            break;
+          }
+          default:
+            break;
+          }
+        });
+      }
+
+      // Insert the row into the access matrix. A row may be all zeroes for
+      // accesses such as A[5].
+      accessMatrix.push_back(Row);
+    }
+
+    this->LoopAccessMatrix[srcOpInst] = accessMatrix;
+    this->const_vector[srcOpInst] = const_vector;
+    const_vector.clear();
+    accessMatrix.clear();
+  }
+}
+
+void handleSiblingNestingLoops(AffineForOp parentOp, AffineForOp forOpA,
+                               AffineForOp forOpB) {
+  OpBuilder builder(parentOp.getOperation()->getBlock(),
+                    std::next(Block::iterator(parentOp)));
+  auto copyParentForOp = cast<AffineForOp>(builder.clone(*parentOp));
+  // Remove forOpA from the original loop nest.
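+  // Illustrative example: for an imperfect nest
+  //   affine.for %i { affine.for %j1 {...}  affine.for %j2 {...} }
+  // the parent loop is cloned; the clone keeps only the (%i, %j2) body while
+  // %j2 is erased from the original, leaving two perfect nests (%i, %j1) and
+  // (%i, %j2).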
+  int pos = 0, pos2 = 0;
+  int parentpos = 0;
+  int i = 0;
+  parentOp.getOperation()->walk([&](AffineForOp op) {
+    i++;
+    if (op.getOperation() == forOpA.getOperation())
+      pos = i;
+    if (parentOp.getOperation() == op.getOperation())
+      parentpos = i;
+  });
+  copyParentForOp.getOperation()->walk([&](AffineForOp opp) {
+    pos2++;
+    if ((pos2 != pos) && (pos2 != parentpos))
+      opp.getOperation()->erase();
+  });
+  forOpA.getOperation()->erase();
+}
+
+// Converts an imperfectly nested loop nest into perfectly nested loop nests
+// by loop splitting.
+void LoopInterchange::handleNestedLoops(Operation &func) {
+  std::vector<AffineForOp> foropvector;
+  llvm::DenseMap<Operation *, std::vector<AffineForOp>> fortree;
+  llvm::DenseMap<Operation *, AffineForOp> op_forop;
+
+  func.walk([&](AffineForOp op) {
+    foropvector.push_back(op);
+    if (op.getParentOp()->getName().getStringRef() == "affine.for")
+      fortree[op.getOperation()->getParentOp()].push_back(op);
+    op_forop[op.getOperation()] = op;
+  });
+
+  for (auto a : fortree) {
+    for (auto b = a.second.size() - 1; b > 0; b--) {
+      handleSiblingNestingLoops(op_forop[a.first], a.second[b],
+                                a.second[b - 1]);
+    }
+  }
+}
+
+void getValidPermutations(
+    unsigned n, AffineForOp forop,
+    std::vector<std::vector<unsigned>> &validPermutations) {
+  std::vector<unsigned> permutation;
+  for (unsigned i = 0; i < n; i++)
+    permutation.push_back(i);
+  validPermutations.push_back(permutation);
+  SmallVector<AffineForOp, 4> perfectloopnest;
+  getPerfectlyNestedLoops(perfectloopnest, forop);
+  while (std::next_permutation(permutation.begin(), permutation.end())) {
+    if (isValidLoopInterchangePermutation(
+            ArrayRef<AffineForOp>(perfectloopnest),
+            ArrayRef<unsigned>(permutation)))
+      validPermutations.push_back(permutation);
+  }
+}
+
+void LoopInterchange::getDependenceMatrix(
+    AffineForOp forOp, unsigned maxLoopDepth,
+    std::vector<std::vector<int64_t>> *depCompsVec) {
+  SmallVector<Operation *, 8> loadAndStoreOpInsts;
+  forOp.getOperation()->walk([&](Operation *opInst) {
+    if (isa<AffineLoadOp>(opInst) || isa<AffineStoreOp>(opInst))
+      loadAndStoreOpInsts.push_back(opInst);
+  });
+  unsigned numOps = loadAndStoreOpInsts.size();
+  for (unsigned i = 0; i < numOps; ++i) {
+    auto *srcOpInst = loadAndStoreOpInsts[i];
+    for (unsigned j = 0; j < numOps; ++j) {
+      auto *dstOpInst = loadAndStoreOpInsts[j];
+      for (unsigned d = 1; d <= maxLoopDepth + 1; ++d) {
+        MemRefAccess srcAccess(srcOpInst);
+        MemRefAccess dstAccess(dstOpInst);
+        FlatAffineConstraints dependenceConstraints;
+        SmallVector<DependenceComponent, 2> depComps;
+        DependenceResult result = checkMemrefAccessDependence(
+            srcAccess, dstAccess, d, &dependenceConstraints, &depComps);
+
+        if (mlir::hasDependence(result)) {
+          hasDependences[std::make_pair(std::make_pair(srcOpInst, dstOpInst),
+                                        d)] = 1;
+          // Collapse each [lb, ub] component range into a single value.
+          std::vector<int64_t> components;
+          for (auto a : depComps) {
+            if (a.lb.getValue() * a.ub.getValue() >= 0)
+              components.push_back((a.lb.getValue() + a.ub.getValue()) / 2);
+            else if (a.lb.getValue() < 0 && a.ub.getValue() > 0)
+              components.push_back(a.lb.getValue());
+          }
+          depCompsVec->push_back(components);
+          break;
+        }
+      }
+    }
+  }
+  if (depCompsVec->empty()) {
+    // No dependences: record a zero vector.
+    std::vector<int64_t> depc(maxLoopDepth);
+    depCompsVec->push_back(depc);
+  }
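+  // Illustrative example: dependence component rows {(1, 0), (0, 2)} for a
+  // two-deep nest collapse column-wise into the dependence vector [1, 1];
+  // both loops are then treated as dependence-carrying by the parallelism
+  // cost below.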
+  // Set the per-loop dependence vector: entry i is 1 if any dependence has
+  // a non-zero component at loop depth i.
+  std::vector<int> depVec((*depCompsVec)[0].size());
+  for (unsigned i = 0; i < (*depCompsVec)[0].size(); i++) {
+    for (unsigned j = 0; j < depCompsVec->size(); j++) {
+      if ((*depCompsVec)[j][i] != 0)
+        depVec[i] = 1;
+    }
+  }
+  this->loop_dependence_vector = depVec;
+}
+
+// Cost of a permutation in terms of lost parallelism: for every loop that
+// carries a dependence, add the product of the iteration counts of the loops
+// nested inside it in this permutation.
+int64_t getPermutationParallelismCost(std::vector<unsigned> permutation,
+                                      std::vector<int> loop_dependence_vector,
+                                      std::vector<int64_t> iteration_vector) {
+  int64_t cost = 0;
+  for (unsigned i = 0; i < permutation.size(); i++) {
+    if (loop_dependence_vector[permutation[i]]) {
+      int64_t c = 1;
+      for (unsigned j = i + 1; j < permutation.size(); j++)
+        c *= iteration_vector[permutation[j]];
+      cost += c;
+    }
+  }
+  return cost;
+}
+
+void LoopInterchange::buildRefGroups(
+    AffineForOp ForOp, std::vector<std::set<Operation *>> *refGroups,
+    unsigned max_depth, unsigned innermostlooplevel) {
+  SmallVector<Operation *, 8> loadAndStoreOpInsts;
+  ForOp.getOperation()->walk([&](Operation *op) {
+    if (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op)) {
+      MemRefType memreft;
+      if (auto loadOp = dyn_cast<AffineLoadOp>(op))
+        memreft = loadOp.getMemRefType();
+      else
+        memreft = dyn_cast<AffineStoreOp>(op).getMemRefType();
+
+      // Element size in bytes.
+      auto elementType = memreft.getElementType();
+      unsigned sizeInBits = 0;
+      if (elementType.isIntOrFloat())
+        sizeInBits = elementType.getIntOrFloatBitWidth();
+      this->elementSize[op] = llvm::divideCeil(sizeInBits, 8);
+      loadAndStoreOpInsts.push_back(op);
+    }
+  });
+
+  // Start with one singleton group per reference, then merge groups that
+  // exhibit group reuse.
+  std::vector<std::set<Operation *>> groups(loadAndStoreOpInsts.size());
+  std::unordered_map<Operation *, unsigned> getlocation;
+  for (unsigned i = 0; i < loadAndStoreOpInsts.size(); i++) {
+    getlocation[loadAndStoreOpInsts[i]] = i;
+    groups[i].insert(loadAndStoreOpInsts[i]);
+  }
+
+  unsigned numOps = loadAndStoreOpInsts.size();
+  bool dependencefound = false;
+  for (unsigned i = 0; i < numOps; ++i) {
+    auto *srcOpInst = loadAndStoreOpInsts[i];
+    MemRefAccess srcAccess(srcOpInst);
+    for (unsigned j = i + 1; j < numOps; ++j) {
+      auto *dstOpInst = loadAndStoreOpInsts[j];
+      MemRefAccess dstAccess(dstOpInst);
+      if (srcOpInst == dstOpInst)
+        continue;
+      if (srcAccess.memref == dstAccess.memref) {
+        // Compare the constant vectors of the two references: all entries
+        // must match except possibly the last one.
+        std::vector<int64_t> src_f = this->const_vector[srcOpInst];
+        std::vector<int64_t> dest_f = this->const_vector[dstOpInst];
+
+        bool identicalF_values = true;
+        for (unsigned k = 0; k + 1 < src_f.size(); k++) {
+          if (src_f[k] != dest_f[k]) {
+            identicalF_values = false;
+            break;
+          }
+        }
+
+        if (this->LoopAccessMatrix[srcOpInst] ==
+                this->LoopAccessMatrix[dstOpInst] &&
+            identicalF_values) {
+          // Test the last-dimension accesses: the two references have group
+          // (spatial) reuse if their constant offsets differ by at most a
+          // cache line.
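+          // Illustrative example: with 8-byte elements and 64-byte cache
+          // lines, A[i][j] and A[i][j + 4] (offset difference 4 <= 64 / 8)
+          // are merged into one reference group, whereas A[i][j + 100] is
+          // not.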
+          unsigned elementSize = this->elementSize[srcOpInst];
+          if (std::abs(src_f[src_f.size() - 1] - dest_f[dest_f.size() - 1]) <=
+              cache_line_size / elementSize) {
+            groups[getlocation[srcOpInst]].insert(
+                groups[getlocation[dstOpInst]].begin(),
+                groups[getlocation[dstOpInst]].end());
+            groups.erase(groups.begin() + getlocation[dstOpInst]);
+            getlocation[dstOpInst] = getlocation[srcOpInst];
+          }
+        }
+      } else {
+        for (unsigned d = 1; d <= max_depth + 1; d++) {
+          FlatAffineConstraints dependenceConstraints;
+          SmallVector<DependenceComponent, 2> depComps;
+          DependenceResult result = checkMemrefAccessDependence(
+              srcAccess, dstAccess, d, &dependenceConstraints, &depComps);
+          if (mlir::hasDependence(result)) {
+            if (d == max_depth + 1) {
+              // A loop-independent dependence: merge the two groups.
+              groups[getlocation[srcOpInst]].insert(
+                  groups[getlocation[dstOpInst]].begin(),
+                  groups[getlocation[dstOpInst]].end());
+              groups.erase(groups.begin() + getlocation[dstOpInst]);
+              getlocation[dstOpInst] = getlocation[srcOpInst];
+            } else if (!dependencefound) {
+              // Check that the dependence components at every level other
+              // than the innermost loop are zero.
+              bool gooddependency = true;
+              for (unsigned k = 0; k < max_depth; k++) {
+                if ((depComps[k].lb.getValue() != 0) &&
+                    (k != innermostlooplevel)) {
+                  gooddependency = false;
+                  break;
+                }
+              }
+              if (gooddependency &&
+                  std::abs(depComps[innermostlooplevel].lb.getValue()) <= 2) {
+                groups[getlocation[srcOpInst]].insert(
+                    groups[getlocation[dstOpInst]].begin(),
+                    groups[getlocation[dstOpInst]].end());
+                groups.erase(groups.begin() + getlocation[dstOpInst]);
+                getlocation[dstOpInst] = getlocation[srcOpInst];
+              }
+              dependencefound = true;
+            }
+          }
+        }
+      }
+    }
+  }
+  *refGroups = groups;
+  this->groups = groups;
+}
+
+void LoopInterchange::getLoopCosts(std::vector<AffineForOp> &foropvector) {
+  long double totaliterations = 1;
+  for (unsigned i = 0; i < foropvector.size(); i++)
+    totaliterations *= this->iteration_counts[i];
+  for (unsigned innerloop = 0; innerloop < foropvector.size(); innerloop++) {
+    AffineForOp forop = foropvector[innerloop];
+    float step = forop.getStep();
+    float trip = (forop.getConstantUpperBound() -
+                  forop.getConstantLowerBound() + step) /
+                 step;
+    long double loopcost = 0;
+
+    // Cost one representative reference per group, then divide by the group
+    // size.
+    for (unsigned i = 0; i < this->groups.size(); i++) {
+      Operation *op = *this->groups[i].begin();
+      std::vector<std::vector<int64_t>> &accessMatrix =
+          this->LoopAccessMatrix[op];
+      float stride = step * accessMatrix[accessMatrix.size() - 1][innerloop];
+      long double refcost = trip;
+      // If the candidate innermost loop appears in no dimension of the
+      // access, the reference is loop-invariant and costs a single line.
+      bool invariant = true;
+      for (unsigned j = 0; j < accessMatrix.size(); j++) {
+        if (accessMatrix[j][innerloop] != 0)
+          invariant = false;
+      }
+      if (invariant) {
+        refcost = 1;
+      } else if (stride < 8) {
+        // If the loop appears only in the last (fastest-varying) dimension,
+        // the reference has spatial reuse: trip * stride / 8 lines.
+        bool lastDimOnly = true;
+        for (unsigned j = 0; j + 1 < accessMatrix.size(); j++) {
+          if (accessMatrix[j][innerloop] != 0)
+            lastDimOnly = false;
+        }
+        if (lastDimOnly)
+          refcost = (trip * stride) / 8;
+      }
+      loopcost += refcost / this->groups[i].size();
+    }
+    this->loopcost[&foropvector[innerloop]] =
+        loopcost * totaliterations / trip;
+  }
+
+  // r is the largest ratio between any two loop costs; it weights the
+  // positional term in getPermutationCostLocality.
+  double e = 0;
+  for (unsigned i = 0; i < this->loopcost.size(); i++) {
+    for (unsigned j = 0; j < this->loopcost.size(); j++) {
+      if (this->loopcost[&foropvector[i]] /
+              this->loopcost[&foropvector[j]] > e)
+        e = this->loopcost[&foropvector[i]] / this->loopcost[&foropvector[j]];
+    }
+  }
+  this->r = e;
+}
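+// Illustrative example of the reference cost model above: for a reference
+// A[i][j] in a 100x100 nest, making j innermost gives stride 1, so the
+// reference costs trip * stride / 8 = 12.5 cache lines per sweep; making i
+// innermost breaks spatial reuse and the reference costs the full trip of
+// 100 lines.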
+// Locality cost of a permutation: each loop's innermost-placement cost is
+// weighted by fr raised to (depth - 1 - its position in the permutation).
+long double getPermutationCostLocality(
+    std::vector<unsigned> permutation, std::vector<AffineForOp> &foropvector,
+    std::unordered_map<AffineForOp *, long double> loopcost, double fr) {
+  long double cost = 0;
+  for (unsigned i = 0; i < loopcost.size(); i++) {
+    cost += std::pow(fr, loopcost.size() - 1 - permutation[i]) *
+            loopcost[&foropvector[i]];
+  }
+  return cost;
+}
+
+// TODO: Re-test getAccessMatrix against all the possible cases; take help
+// from the test file.
+void LoopInterchange::runOnFunction() {
+  std::vector<AffineForOp> foropvector;
+  std::vector<std::vector<AffineForOp>> loopnests;
+
+  // Holds the ids of the for-op induction variables and of the other values
+  // passed to the access functions.
+  llvm::DenseMap<Value, unsigned> index_values;
+  unsigned index_count = 0;
+
+  Operation *function = getFunction().getOperation();
+  handleNestedLoops(*function);
+  // The walk visits loops innermost-first.
+  (*function).walk([&](AffineForOp op) {
+    foropvector.push_back(op);
+    if (op.hasConstantUpperBound() && op.hasConstantLowerBound()) {
+      this->iteration_counts.push_back(
+          (op.getConstantUpperBound() - op.getConstantLowerBound()) /
+          op.getStep());
+    }
+    Value loop = op.getInductionVar();
+    index_values[loop] = index_count;
+    index_count++;
+
+    // A for op directly under func is the head of a loop nest.
+    if (op.getParentOp()->getName().getStringRef().str() == "func") {
+      std::reverse(foropvector.begin(), foropvector.end());
+      // Keep iteration_counts in the same outermost-first order as
+      // foropvector.
+      std::reverse(this->iteration_counts.begin(),
+                   this->iteration_counts.end());
+      loopnests.push_back(foropvector);
+
+      // The loop nest must not contain any if statement and must be
+      // rectangular in space.
+      if (!hasIfStatement(op) && testRectangularForloopNest(foropvector)) {
+        // Renumber index_values so that the outermost loop's induction
+        // variable has id 0.
+        for (auto a : index_values)
+          index_values[a.first] = index_count - 1 - a.second;
+
+        std::vector<std::vector<unsigned>> validPermutations;
+        getValidPermutations(foropvector.size(), op, validPermutations);
+        if (validPermutations.size() <= 1) {
+          // Nothing to do; go to the next loop nest.
+          goto newforloop;
+        }
+
+        // Compute the access matrices for the entire loop nest.
+        getAccessMatrix(op, foropvector, index_values);
+
+        std::vector<std::set<Operation *>> refGroups;
+        std::vector<std::vector<int64_t>> dependenceMatrix;
+        getDependenceMatrix(op, foropvector.size(), &dependenceMatrix);
+        buildRefGroups(op, &refGroups, foropvector.size(), 1);
+        getLoopCosts(foropvector);
+
+        long double mincost = std::numeric_limits<long double>::max();
+        std::vector<unsigned> bestPermutation;
+
+        for (auto a : validPermutations) {
+          int64_t parallelCost = getPermutationParallelismCost(
+              a, this->loop_dependence_vector, this->iteration_counts);
+          long double localitycost = getPermutationCostLocality(
+              a, foropvector, this->loopcost, this->r);
+          long double cost = 100 * parallelCost + localitycost;
+          if (cost < mincost) {
+            mincost = cost;
+            bestPermutation = a;
+          }
+        }
+
+        permuteLoops(MutableArrayRef<AffineForOp>(foropvector),
+                     ArrayRef<unsigned>(bestPermutation));
+      }
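+      // Illustrative example: with iteration counts {1024, 1024} and a
+      // dependence carried only by one loop, placing that loop outermost
+      // contributes 100 * 1024 to the combined cost while placing it
+      // innermost contributes only 100, so a locality gain must outweigh
+      // the lost outer-loop parallelism.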
+      // For every loop nest (whether or not it was processed), clear the
+      // per-nest state so that the for ops of the next loop nest are added
+      // afresh. index_values is only valid for this loop nest, since the
+      // next nest has new block arguments and needs new ids; index_count is
+      // used to assign those ids.
+    newforloop:
+      foropvector.clear();
+      index_values.clear();
+      index_count = 0;
+      this->iteration_counts.clear();
+      this->loop_dependence_vector.clear();
+      this->loopcost.clear();
+      this->groups.clear();
+      this->elementSize.clear();
+      this->LoopAccessMatrix.clear();
+      this->const_vector.clear();
+    }
+  });
+}
+
+std::unique_ptr<OperationPass<FuncOp>> mlir::createAffineLoopInterchange() {
+  return std::make_unique<LoopInterchange>();
+}
diff --git a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
index dddcc93adf0d6d..f7d9a237845c4e 100644
--- a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_mlir_dialect_library(MLIRAffineTransforms
   AffineDataCopyGeneration.cpp
   AffineLoopInvariantCodeMotion.cpp
+  AffineLoopInterchange.cpp
   AffineParallelize.cpp
   LoopTiling.cpp
   LoopUnroll.cpp