From 845ed63e045a8ff94f7038ae3be1ce21339784ee Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Mon, 23 Jul 2018 12:58:05 -0400 Subject: [PATCH 01/16] add ptx backend --- include/llvm/Transforms/Tapir/PTXABI.h | 128 ++++ include/llvm/Transforms/Tapir/TapirTypes.h | 3 +- include/llvm/Transforms/Tapir/TapirUtils.h | 1 + include/llvm/Transforms/Utils/TapirUtils.h | 8 +- lib/Transforms/Tapir/CMakeLists.txt | 1 + lib/Transforms/Tapir/LoopSpawning.cpp | 41 +- lib/Transforms/Tapir/OpenMPABI.cpp | 16 +- lib/Transforms/Tapir/PTXABI.cpp | 725 +++++++++++++++++++++ lib/Transforms/Tapir/TapirToTarget.cpp | 5 +- lib/Transforms/Tapir/TapirUtils.cpp | 3 + lib/Transforms/Utils/LoopUnroll.cpp | 2 +- lib/Transforms/Utils/TapirUtils.cpp | 7 +- projects/compiler-rt | 2 +- 13 files changed, 924 insertions(+), 18 deletions(-) create mode 100644 include/llvm/Transforms/Tapir/PTXABI.h create mode 100644 lib/Transforms/Tapir/PTXABI.cpp diff --git a/include/llvm/Transforms/Tapir/PTXABI.h b/include/llvm/Transforms/Tapir/PTXABI.h new file mode 100644 index 00000000000..10698543896 --- /dev/null +++ b/include/llvm/Transforms/Tapir/PTXABI.h @@ -0,0 +1,128 @@ +/** + *************************************************************************** + * Copyright (c) 2017, Los Alamos National Security, LLC. + * All rights reserved. + * + * Copyright 2010. Los Alamos National Security, LLC. This software was + * produced under U.S. Government contract DE-AC52-06NA25396 for Los + * Alamos National Laboratory (LANL), which is operated by Los Alamos + * National Security, LLC for the U.S. Department of Energy. The + * U.S. Government has rights to use, reproduce, and distribute this + * software. NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, + * LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY + * FOR THE USE OF THIS SOFTWARE. 
If software is modified to produce + * derivative works, such modified software should be clearly marked, + * so as not to confuse it with the version available from LANL. + * + * Additionally, redistribution and use in source and binary forms, + * with or without modification, are permitted provided that the + * following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of Los Alamos National Security, LLC, Los + * Alamos National Laboratory, LANL, the U.S. Government, nor the + * names of its contributors may be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + ***************************************************************************/ + +#ifndef PTX_ABI_H_ +#define PTX_ABI_H_ + +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/TypeBuilder.h" +#include "llvm/IR/ValueSymbolTable.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Transforms/Tapir/LoopSpawning.h" +#include "llvm/Transforms/Tapir/TapirUtils.h" +#include + +namespace llvm { + +/// PTXABILoopSpawning uses the PTX ABI to handle Tapir loops. +class PTXABILoopSpawning : public LoopOutline { +public: + PTXABILoopSpawning(Loop *OrigLoop, ScalarEvolution &SE, + LoopInfo *LI, DominatorTree *DT, + AssumptionCache *AC, + OptimizationRemarkEmitter &ORE) + : LoopOutline(OrigLoop, SE, LI, DT, AC, ORE) + {} + + bool processLoop(); + + virtual ~PTXABILoopSpawning() {} + +protected: + +// private: +// /// Report an analysis message to assist the user in diagnosing loops that are +// /// not transformed. These are handled as LoopAccessReport rather than +// /// VectorizationReport because the << operator of LoopSpawningReport returns +// /// LoopAccessReport.
+// void emitAnalysis(const LoopAccessReport &Message) const { +// emitAnalysisDiag(OrigLoop, *ORE, Message); +// } +private: + uint32_t nextKernelId_ = 0; +}; + +class PTXABI : public TapirTarget { +public: + PTXABI(); + Value *GetOrCreateWorker8(Function &F) override final; + void createSync(SyncInst &inst, ValueToValueMapTy &DetachCtxToStackFrame) + override final; + + Function *createDetach(DetachInst &Detach, + ValueToValueMapTy &DetachCtxToStackFrame, + DominatorTree &DT, AssumptionCache &AC) override final; + void preProcessFunction(Function &F) override final; + void postProcessFunction(Function &F) override final; + void postProcessHelper(Function &F) override final; + bool processMain(Function &F) override final; + +}; + +} // end of llvm namespace + +#endif diff --git a/include/llvm/Transforms/Tapir/TapirTypes.h b/include/llvm/Transforms/Tapir/TapirTypes.h index f29b8792a5d..455e32dd8d8 100644 --- a/include/llvm/Transforms/Tapir/TapirTypes.h +++ b/include/llvm/Transforms/Tapir/TapirTypes.h @@ -23,7 +23,8 @@ enum class TapirTargetType { Cilk = 2, OpenMP = 3, CilkR = 4, - Qthreads = 5 + Qthreads = 5, + PTX = 6 }; } // end namespace llvm diff --git a/include/llvm/Transforms/Tapir/TapirUtils.h b/include/llvm/Transforms/Tapir/TapirUtils.h index 65e7f0fe360..f1a6a327804 100644 --- a/include/llvm/Transforms/Tapir/TapirUtils.h +++ b/include/llvm/Transforms/Tapir/TapirUtils.h @@ -49,6 +49,7 @@ Function *extractDetachBodyToFunction(DetachInst &Detach, class TapirTarget { public: + virtual ~TapirTarget() {}; //! 
For use in loopspawning grainsize calculation virtual Value *GetOrCreateWorker8(Function &F) = 0; virtual void createSync(SyncInst &inst, diff --git a/include/llvm/Transforms/Utils/TapirUtils.h b/include/llvm/Transforms/Utils/TapirUtils.h index 4617c738df1..4c2fb19b00a 100644 --- a/include/llvm/Transforms/Utils/TapirUtils.h +++ b/include/llvm/Transforms/Utils/TapirUtils.h @@ -60,6 +60,7 @@ class LoopSpawningHints { enum SpawningStrategy { ST_SEQ, ST_DAC, + ST_GPU, ST_END, }; @@ -93,7 +94,8 @@ class LoopSpawningHints { return "Spawn iterations sequentially"; case LoopSpawningHints::ST_DAC: return "Use divide-and-conquer"; - case LoopSpawningHints::ST_END: + case LoopSpawningHints::ST_GPU: + return "Use gpu"; default: return "Unknown"; } @@ -142,8 +144,8 @@ class LoopSpawningHints { /// 4) The loop only branches to the exit block from the header or the latch. bool isCanonicalTapirLoop(const Loop *L, bool print = false); -//! Identify if a loop could be a DAC loop -bool isDACFor(Loop* L); +//! Identify if a loop should be handled manually by a parallel loop backend +bool isBackendParallelFor(Loop* L); /// canDetach - Return true if the given function can perform a detach, false /// otherwise.
diff --git a/lib/Transforms/Tapir/CMakeLists.txt b/lib/Transforms/Tapir/CMakeLists.txt index 43f0dbe3a2d..2f32875937b 100644 --- a/lib/Transforms/Tapir/CMakeLists.txt +++ b/lib/Transforms/Tapir/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_library(LLVMTapirOpts CilkABI.cpp OpenMPABI.cpp + PTXABI.cpp QthreadsABI.cpp SmallBlock.cpp RedundantSpawn.cpp diff --git a/lib/Transforms/Tapir/LoopSpawning.cpp b/lib/Transforms/Tapir/LoopSpawning.cpp index f2f88d9d438..e24bbdd88bc 100644 --- a/lib/Transforms/Tapir/LoopSpawning.cpp +++ b/lib/Transforms/Tapir/LoopSpawning.cpp @@ -46,6 +46,7 @@ #include "llvm/Transforms/Scalar/LoopDeletion.h" #include "llvm/Transforms/Tapir.h" #include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/Transforms/Tapir/PTXABI.h" #include "llvm/Transforms/Tapir/TapirUtils.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -75,7 +76,9 @@ static cl::opt ClTapirTarget( clEnumValN(TapirTargetType::OpenMP, "openmp", "OpenMP"), clEnumValN(TapirTargetType::Qthreads, - "qthreads", "Qthreads"))); + "qthreads", "Qthreads"), + clEnumValN(TapirTargetType::PTX, + "ptx", "PTX"))); namespace { // /// \brief This modifies LoopAccessReport to initialize message with @@ -115,6 +118,13 @@ static void emitMissedWarning(Function *F, Loop *L, << "Tapir loop not transformed: " << "failed to use divide-and-conquer loop spawning"); break; + case LoopSpawningHints::ST_GPU: + ORE->emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "FailedRequestedSpawning", + L->getStartLoc(), L->getHeader()) + << "Tapir loop not transformed: " + << "failed to use GPU loop spawning"); + break; case LoopSpawningHints::ST_SEQ: ORE->emit(DiagnosticInfoOptimizationFailure( DEBUG_TYPE, "SpawningDisabled", @@ -1417,6 +1427,35 @@ bool LoopSpawningImpl::processLoop(Loop *L) { case LoopSpawningHints::ST_SEQ: DEBUG(dbgs() << "LS: Hints dictate sequential spawning.\n"); break; + case LoopSpawningHints::ST_GPU: + DEBUG(dbgs() << "LS: Hints dictate DAC 
spawning.\n"); + { + DebugLoc DLoc = L->getStartLoc(); + BasicBlock *Header = L->getHeader(); + PTXABILoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE); + // CilkABILoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE); + // DACLoopSpawning DLS(L, SE, LI, DT, TLI, TTI, ORE); + if (DLS.processLoop()) { + DEBUG({ + if (verifyFunction(*L->getHeader()->getParent())) { + dbgs() << "Transformed function is invalid.\n"; + return false; + } + }); + // Report success. + ORE.emit(OptimizationRemark(LS_NAME, "DACSpawning", DLoc, Header) + << "spawning iterations using divide-and-conquer"); + return true; + } else { + // Report failure. + ORE.emit(OptimizationRemarkMissed(LS_NAME, "NoDACSpawning", DLoc, + Header) + << "cannot spawn iterations using divide-and-conquer"); + emitMissedWarning(F, L, Hints, &ORE); + return false; + } + } + break; case LoopSpawningHints::ST_DAC: DEBUG(dbgs() << "LS: Hints dictate DAC spawning.\n"); { diff --git a/lib/Transforms/Tapir/OpenMPABI.cpp b/lib/Transforms/Tapir/OpenMPABI.cpp index 731a0c0cd93..5ddcf9eb174 100644 --- a/lib/Transforms/Tapir/OpenMPABI.cpp +++ b/lib/Transforms/Tapir/OpenMPABI.cpp @@ -488,7 +488,7 @@ Function* formatFunctionToTask(Function* extracted, CallInst* cal) { IRBuilder<> CallerIRBuilder(cal); auto *SharedsTySize = CallerIRBuilder.getInt64(DL.getTypeAllocSize(SharedsTy)); - auto *KmpTaskTTy = createKmpTaskTTy(C); + //unused -- auto *KmpTaskTTy = createKmpTaskTTy(C); auto *KmpTaskTWithPrivatesTy = createKmpTaskTWithPrivatesTy(SharedsTy);//KmpTaskTTy); auto *KmpTaskTWithPrivatesPtrTy = PointerType::getUnqual(KmpTaskTWithPrivatesTy); @@ -496,11 +496,11 @@ Function* formatFunctionToTask(Function* extracted, CallInst* cal) { CallerIRBuilder.getInt64(DL.getTypeAllocSize(KmpTaskTWithPrivatesTy)); auto *VoidTy = Type::getVoidTy(C); - auto *Int8PtrTy = Type::getInt8PtrTy(C); + // unused -- auto *Int8PtrTy = Type::getInt8PtrTy(C); auto *Int32Ty = Type::getInt32Ty(C); - auto *CopyFnTy = FunctionType::get(VoidTy, {Int8PtrTy}, true); - auto 
*CopyFnPtrTy = PointerType::getUnqual(CopyFnTy); + // unused -- auto *CopyFnTy = FunctionType::get(VoidTy, {Int8PtrTy}, true); + // unused -- auto *CopyFnPtrTy = PointerType::getUnqual(CopyFnTy); auto *OutlinedFnTy = FunctionType::get( VoidTy, @@ -593,12 +593,12 @@ Function *llvm::OpenMPABI::createDetach(DetachInst &detach, ValueToValueMapTy &DetachCtxToStackFrame, DominatorTree &DT, AssumptionCache &AC) { BasicBlock *detB = detach.getParent(); - Function &F = *(detB->getParent()); + // unused -- Function &F = *(detB->getParent()); BasicBlock *Spawned = detach.getDetached(); BasicBlock *Continue = detach.getContinue(); - Module *M = F.getParent(); + // unused -- Module *M = F.getParent(); CallInst *cal = nullptr; Function *extracted = extractDetachBodyToFunction(detach, DT, AC, &cal); @@ -676,7 +676,7 @@ void llvm::OpenMPABI::postProcessFunction(Function &F) { } } - for(int i=1; ieraseFromParent(); RegionFn->eraseFromParent(); } diff --git a/lib/Transforms/Tapir/PTXABI.cpp b/lib/Transforms/Tapir/PTXABI.cpp new file mode 100644 index 00000000000..249290ee0d5 --- /dev/null +++ b/lib/Transforms/Tapir/PTXABI.cpp @@ -0,0 +1,725 @@ +/** + *************************************************************************** + * Copyright (c) 2017, Los Alamos National Security, LLC. + * All rights reserved. + * + * Copyright 2010. Los Alamos National Security, LLC. This software was + * produced under U.S. Government contract DE-AC52-06NA25396 for Los + * Alamos National Laboratory (LANL), which is operated by Los Alamos + * National Security, LLC for the U.S. Department of Energy. The + * U.S. Government has rights to use, reproduce, and distribute this + * software. NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, + * LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY + * FOR THE USE OF THIS SOFTWARE. 
If software is modified to produce + * derivative works, such modified software should be clearly marked, + * so as not to confuse it with the version available from LANL. + * + * Additionally, redistribution and use in source and binary forms, + * with or without modification, are permitted provided that the + * following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of Los Alamos National Security, LLC, Los + * Alamos National Laboratory, LANL, the U.S. Government, nor the + * names of its contributors may be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + ***************************************************************************/ + +#include "llvm/Transforms/Tapir/PTXABI.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/TapirUtils.h" +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Vectorize.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/IR/LegacyPassManager.h" + +#include +#include +#include + +#define np(X) \ + std::cout << __FILE__ << ":" << __LINE__ << ": " << __PRETTY_FUNCTION__ \ + << ": " << #X << " = " << (X) << std::endl + +#include +#include +#include + +using namespace llvm; + +namespace{ + + template + Function* getFunction(Module& M, const char* name){ + return cast(M.getOrInsertFunction(name, + TypeBuilder::get(M.getContext()))); + } + + template + Value* convertInteger(B& b, Value* from, Value* to, const std::string& name){ + auto ft = dyn_cast(from->getType()); + assert(ft && "expected from type as integer type"); + + auto tt = dyn_cast(to->getType()); + assert(tt && "expected to type as integer type"); + + if(ft->getBitWidth() > tt->getBitWidth()){ + return b.CreateTrunc(from, tt, name); + } + else if(ft->getBitWidth() < tt->getBitWidth()){ + return b.CreateZExt(from, tt, name); + } + + return from; + } + +} // namespace + + +//############################################################################## + +PTXABI::PTXABI() {} + +/// \brief Get/Create the worker count for the spawning function. 
+Value *PTXABI::GetOrCreateWorker8(Function &F) { + Module *M = F.getParent(); + LLVMContext& C = M->getContext(); + return ConstantInt::get(C, APInt(16, 8)); +} + +void PTXABI::createSync(SyncInst &SI, ValueToValueMapTy &DetachCtxToStackFrame) { +} + +Function *PTXABI::createDetach(DetachInst &detach, + ValueToValueMapTy &DetachCtxToStackFrame, + DominatorTree &DT, AssumptionCache &AC) { + BasicBlock *detB = detach.getParent(); + // unused -- Function &F = *(detB->getParent()); + + BasicBlock *Spawned = detach.getDetached(); + BasicBlock *Continue = detach.getContinue(); + + // unused -- Module *M = F.getParent(); + + CallInst *cal = nullptr; + Function *extracted = extractDetachBodyToFunction(detach, DT, AC, &cal); + //extracted = formatFunctionToTask(extracted, cal); + + // Replace the detach with a branch to the continuation. + BranchInst *ContinueBr = BranchInst::Create(Continue); + ReplaceInstWithInst(&detach, ContinueBr); + + // Rewrite phis in the detached block. + { + BasicBlock::iterator BI = Spawned->begin(); + while (PHINode *P = dyn_cast(BI)) { + P->removeIncomingValue(detB); + ++BI; + } + } + return extracted; +} + +void PTXABI::preProcessFunction(Function &F) { +} + +void PTXABI::postProcessFunction(Function &F) { +} + +void PTXABI::postProcessHelper(Function &F) { +} + +bool PTXABI::processMain(Function &F) { + return true; +} + +bool PTXABILoopSpawning::processLoop(){ + Loop *L = OrigLoop; + + // L->dumpVerbose(); + + // code generation is currently limited to a simple canonical loop structure + // whereby we make the following assumptions and check assertions below + // soon we will expand this extraction mechanism to handle more complex + // loops + + using TypeVec = std::vector; + using ValueVec = std::vector; + + LLVMContext& c = L->getHeader()->getContext(); + + IRBuilder<> b(c); + + Type* voidTy = Type::getVoidTy(c); + IntegerType* i8Ty = Type::getInt8Ty(c); + IntegerType* i16Ty = Type::getInt16Ty(c); + IntegerType* i32Ty = 
Type::getInt32Ty(c); + IntegerType* i64Ty = Type::getInt64Ty(c); + PointerType* voidPtrTy = Type::getInt8PtrTy(c); + + // an LLVM transformation is able in some cases to transform the loop to + // contain a phi node that exists at the entry block + + PHINode* loopNode = L->getCanonicalInductionVariable(); + assert(loopNode && "expected canonical loop"); + + // only handle loops where the induction variable is initialized to a constant + + Value* loopStart = loopNode->getIncomingValue(0); + assert(loopStart && "expected canonical loop start"); + + auto cs = dyn_cast(loopStart); + bool startsAtZero = cs && cs->isZero(); + + BasicBlock* exitBlock = L->getUniqueExitBlock(); + assert(exitBlock && "expected canonical exit block"); + + // and assume that a branch instruction exists here + + BasicBlock* branchBlock = exitBlock->getSinglePredecessor(); + assert(branchBlock && "expected canonical branch block"); + + BranchInst* endBranch = dyn_cast(branchBlock->getTerminator()); + assert(endBranch && "expected canonical end branch instruction"); + + // get the branch condition in order to extract the end loop value + // which we also currently assume is constant + + Value* endBranchCond = endBranch->getCondition(); + CmpInst* cmp = dyn_cast(endBranchCond); + assert(cmp && "expected canonical comparison instruction"); + + Value* loopEnd = cmp->getOperand(1); + assert(loopEnd && "expected canonical loop end"); + + BasicBlock* latchBlock = L->getLoopLatch(); + Instruction* li = latchBlock->getFirstNonPHI(); + unsigned op = li->getOpcode(); + assert(op == Instruction::Add || op == Instruction::Sub && + "expected add or sub in loop latch"); + assert(li->getOperand(0)== loopNode); + Value* stride = li->getOperand(1); + cs = dyn_cast(stride); + bool isUnitStride = cs && cs->isOne(); + + BasicBlock* entryBlock = L->getBlocks()[0]; + + Function* hostFunc = entryBlock->getParent(); + + Module& hostModule = *hostFunc->getParent(); + + // assume a detach exists here and this basic
block contains the body + // of the kernel function we will be generating + + DetachInst* detach = dyn_cast(entryBlock->getTerminator()); + assert(detach && "expected canonical loop entry detach"); + + BasicBlock* Body = detach->getDetached(); + + // extract the externally defined variables + // these will be passed in as CUDA arrays + + std::set values; + values.insert(loopNode); + + std::set extValues; + + for(Instruction& ii : *Body){ + if(dyn_cast(&ii)){ + continue; + } + + for(Use& u : ii.operands()){ + Value* v = u.get(); + + if(isa(v)){ + continue; + } + + if(values.find(v) == values.end()){ + extValues.insert(v); + } + } + + values.insert(&ii); + } + + TypeVec paramTypes; + paramTypes.push_back(i64Ty); + paramTypes.push_back(i64Ty); + paramTypes.push_back(i64Ty); + + for(Value* v : extValues){ + if(auto pt = dyn_cast(v->getType())){ + if(auto at = dyn_cast(pt->getElementType())){ + paramTypes.push_back(PointerType::get(at->getElementType(), 0)); + } + else{ + paramTypes.push_back(pt); + } + } + else{ + v->dump(); + assert(false && "expected a pointer or array type"); + } + } + + // create the GPU function + + FunctionType* funcTy = FunctionType::get(voidTy, paramTypes, false); + + Module ptxModule("ptxModule", c); + + // each kernel function is assigned a unique ID by which the kernel + // entry point function is named e.g. run0 for kernel ID 0 + + size_t kernelRunId = nextKernelId_++; + + std::stringstream kstr; + kstr << "run" << kernelRunId; + + Function* f = Function::Create(funcTy, + Function::ExternalLinkage, kstr.str().c_str(), &ptxModule); + + // the first parameter defines the extent of the index space + // i.e. 
number of threads to launch + auto aitr = f->arg_begin(); + aitr->setName("runSize"); + Value* runSizeParam = aitr; + ++aitr; + + aitr->setName("runStart"); + Value* runStartParam = aitr; + ++aitr; + + aitr->setName("runStride"); + Value* runStrideParam = aitr; + ++aitr; + + std::map m; + + // set parameter names and map values to be replaced + + size_t i = 0; + + for(Value* v : extValues){ + std::stringstream sstr; + sstr << "arg" << i; + + m[v] = aitr; + aitr->setName(sstr.str()); + ++aitr; + ++i; + } + + // create the entry block which will be used to compute the thread ID + // and simply return if the thread ID is beyond the run size + + BasicBlock* br = BasicBlock::Create(c, "entry", f); + + b.SetInsertPoint(br); + + using SREGFunc = uint32_t(); + + // calls to NVPTX intrinsics to get the thread index, block index, + // and block size + + Value* threadIdx = b.CreateCall(getFunction(ptxModule, + "llvm.nvvm.read.ptx.sreg.tid.x")); + + Value* blockIdx = b.CreateCall(getFunction(ptxModule, + "llvm.nvvm.read.ptx.sreg.ctaid.x")); + + Value* blockDim = b.CreateCall(getFunction(ptxModule, + "llvm.nvvm.read.ptx.sreg.ntid.x")); + + Value* threadId = + b.CreateAdd(threadIdx, b.CreateMul(blockIdx, blockDim), "threadId"); + + // convert the thread ID into the proper integer type of the loop variable + + threadId = convertInteger(b, threadId, loopNode, "threadId"); + + if(!isUnitStride){ + threadId = b.CreateMul(threadId, runStrideParam); + } + + if(!startsAtZero){ + threadId = b.CreateAdd(threadId, runStartParam); + } + + // return block to exit if thread ID is greater than or equal to run size + + BasicBlock* rb = BasicBlock::Create(c, "exit", f); + BasicBlock* bb = BasicBlock::Create(c, "body", f); + + Value* cond = b.CreateICmpUGE(threadId, runSizeParam); + b.CreateCondBr(cond, rb, bb); + + b.SetInsertPoint(rb); + b.CreateRetVoid(); + + b.SetInsertPoint(bb); + + // map the thread ID into the new values as we clone the instructions + // of the function + +
m[loopNode] = threadId; + + BasicBlock::InstListType& il = bb->getInstList(); + + // clone instructions of the body basic block, remapping values as needed + + std::set extReads; + std::set extWrites; + std::map extVars; + + for(Instruction& ii : *Body){ + if(dyn_cast(&ii)){ + continue; + } + + // determine if we are reading or writing the external variables + // i.e. those passed as CUDA arrays + + Instruction* ic = ii.clone(); + + if(auto li = dyn_cast(&ii)){ + Value* v = li->getPointerOperand(); + auto itr = extVars.find(v); + if(itr != extVars.end()){ + extReads.insert(itr->second); + } + } + else if(auto si = dyn_cast(&ii)){ + Value* v = si->getPointerOperand(); + auto itr = extVars.find(v); + if(itr != extVars.end()){ + extWrites.insert(itr->second); + } + } + // if this is a GEP into one of the external variables then keep track of + // which external variable it originally came from + else if(auto gi = dyn_cast(&ii)){ + Value* v = gi->getPointerOperand(); + if(extValues.find(v) != extValues.end()){ + extVars[gi] = v; + if(isa(gi->getSourceElementType())){ + auto cgi = dyn_cast(ic); + cgi->setSourceElementType(m[v]->getType()); + } + } + } + + // remap values as we are cloning the instructions + + for(auto& itr : m){ + ic->replaceUsesOfWith(itr.first, itr.second); + } + + il.push_back(ic); + m[&ii] = ic; + } + + b.CreateRetVoid(); + + // add the necessary NVPTX to mark the global function + + NamedMDNode* annotations = + ptxModule.getOrInsertNamedMetadata("nvvm.annotations"); + + SmallVector av; + + av.push_back(ValueAsMetadata::get(f)); + av.push_back(MDString::get(ptxModule.getContext(), "kernel")); + av.push_back(ValueAsMetadata::get(llvm::ConstantInt::get(i32Ty, 1))); + + annotations->addOperand(MDNode::get(ptxModule.getContext(), av)); + + // remove the basic blocks corresponding to the original LLVM loop + + BasicBlock* predecessor = L->getLoopPreheader(); + entryBlock->removePredecessor(predecessor); + BasicBlock* successor = 
exitBlock->getSingleSuccessor(); + + BasicBlock* hostBlock = BasicBlock::Create(c, "host.block", hostFunc); + + b.SetInsertPoint(predecessor->getTerminator()); + b.CreateBr(hostBlock); + predecessor->getTerminator()->removeFromParent(); + + successor->removePredecessor(exitBlock); + + { + std::set visited; + visited.insert(exitBlock); + + std::vector next; + next.push_back(entryBlock); + + while(!next.empty()){ + BasicBlock* b = next.back(); + next.pop_back(); + + for(BasicBlock* bn : b->getTerminator()->successors()){ + if(visited.find(bn) == visited.end()){ + next.push_back(bn); + } + } + + b->dropAllReferences(); + b->removeFromParent(); + visited.insert(b); + } + } + + exitBlock->dropAllReferences(); + exitBlock->removeFromParent(); + + // find the NVPTX module pass which will create the PTX code + + const Target* target = nullptr; + + for(TargetRegistry::iterator itr = TargetRegistry::targets().begin(), + itrEnd = TargetRegistry::targets().end(); itr != itrEnd; ++itr){ + if(std::string(itr->getName()) == "nvptx64"){ + target = &*itr; + break; + } + } + + assert(target && "failed to find NVPTX target"); + + Triple triple(sys::getDefaultTargetTriple()); + triple.setArch(Triple::nvptx64); + + // TODO: the version of LLVM that we are using currently only supports + // up to SM_60 – we need SM_70 for Volta architectures + + TargetMachine* targetMachine = + target->createTargetMachine(triple.getTriple(), + //"sm_35", + //"sm_70", + "sm_60", + "", + TargetOptions(), + Reloc::Static, + CodeModel::Default, + CodeGenOpt::Aggressive); + + DataLayout layout("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:" + "64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:" + "64:64-v128:128:128-n16:32:64"); + + ptxModule.setDataLayout(layout); + + legacy::PassManager* passManager = new legacy::PassManager; + + passManager->add(createVerifierPass()); + + // add in our optimization passes + + passManager->add(createInstructionCombiningPass()); + 
passManager->add(createReassociatePass()); + passManager->add(createGVNPass()); + passManager->add(createCFGSimplificationPass()); + passManager->add(createSLPVectorizerPass()); + passManager->add(createBreakCriticalEdgesPass()); + passManager->add(createConstantPropagationPass()); + passManager->add(createDeadInstEliminationPass()); + passManager->add(createDeadStoreEliminationPass()); + passManager->add(createInstructionCombiningPass()); + passManager->add(createCFGSimplificationPass()); + + SmallVector buf; + raw_svector_ostream ostr(buf); + + bool fail = + targetMachine->addPassesToEmitFile(*passManager, + ostr, + TargetMachine::CGFT_AssemblyFile, + false); + + assert(!fail && "failed to emit PTX"); + + passManager->run(ptxModule); + + delete passManager; + + std::string ptx = ostr.str().str(); + + Constant* pcs = ConstantDataArray::getString(c, ptx); + + // create a global string to hold the PTX code + + GlobalVariable* ptxGlobal = + new GlobalVariable(hostModule, + pcs->getType(), + true, + GlobalValue::PrivateLinkage, + pcs, + "ptx"); + + Value* kernelId = ConstantInt::get(i32Ty, kernelRunId); + + Value* ptxStr = b.CreateBitCast(ptxGlobal, voidPtrTy); + + b.SetInsertPoint(hostBlock); + + // finally, replace where the original loop was with calls to the GPU runtime + + using InitCUDAFunc = void(); + + b.CreateCall(getFunction(hostModule, + "__kitsune_cuda_init"), {}); + + using InitKernelFunc = void(uint32_t, const char*); + + b.CreateCall(getFunction(hostModule, + "__kitsune_gpu_init_kernel"), {kernelId, ptxStr}); + + for(Value* v : extValues){ + Value* elementSize; + Value* vptr; + Value* fieldName; + Value* size; + + // TODO: fix + // this is a temporary hack to get the size of the field + // it will currently only work for a limited case + + if(auto bc = dyn_cast(v)){ + auto ci = dyn_cast(bc->getOperand(0)); + assert(ci && "unable to detect field size"); + + Value* bytes = ci->getOperand(0); + assert(bytes->getType()->isIntegerTy(64)); + + auto pt = 
dyn_cast(v->getType()); + auto it = dyn_cast(pt->getElementType()); + assert(it && "expected integer type"); + + Constant* fn = ConstantDataArray::getString(c, ci->getName()); + + GlobalVariable* fieldNameGlobal = + new GlobalVariable(hostModule, + fn->getType(), + true, + GlobalValue::PrivateLinkage, + fn, + "field.name"); + + fieldName = b.CreateBitCast(fieldNameGlobal, voidPtrTy); + + vptr = b.CreateBitCast(v, voidPtrTy); + + elementSize = ConstantInt::get(i32Ty, it->getBitWidth()/8); + + size = b.CreateUDiv(bytes, ConstantInt::get(i64Ty, it->getBitWidth()/8)); + } + else if(auto ai = dyn_cast(v)){ + Constant* fn = ConstantDataArray::getString(c, ai->getName()); + + GlobalVariable* fieldNameGlobal = + new GlobalVariable(hostModule, + fn->getType(), + true, + GlobalValue::PrivateLinkage, + fn, + "field.name"); + + fieldName = b.CreateBitCast(fieldNameGlobal, voidPtrTy); + + vptr = b.CreateBitCast(v, voidPtrTy); + + auto at = dyn_cast(ai->getAllocatedType()); + assert(at && "expected array type"); + + elementSize = ConstantInt::get(i32Ty, + at->getElementType()->getPrimitiveSizeInBits()/8); + + size = ConstantInt::get(i64Ty, at->getNumElements()); + } + + uint8_t m = 0; + if(extReads.find(v) != extReads.end()){ + m |= 0b01; + } + + if(extWrites.find(v) != extWrites.end()){ + m |= 0b10; + } + + Value* mode = ConstantInt::get(i8Ty, m); + + TypeVec params = {i32Ty, voidPtrTy, voidPtrTy, i32Ty, i64Ty, i8Ty}; + + Function* initFieldFunc = + llvm::Function::Create(FunctionType::get(voidTy, params, false), + llvm::Function::ExternalLinkage, + "__kitsune_gpu_init_field", + &hostModule); + + b.CreateCall(initFieldFunc, + {kernelId, fieldName, vptr, elementSize, size, mode}); + } + + using SetRunSizeFunc = void(uint32_t, uint64_t, uint64_t, uint64_t); + + Value* runSize = b.CreateSub(loopEnd, loopStart); + + runSize = convertInteger(b, runSize, threadId, "run.size"); + + Value* runStart = convertInteger(b, loopStart, threadId, "run.start"); + + 
b.CreateCall(getFunction(hostModule, + "__kitsune_gpu_set_run_size"), {kernelId, runSize, runStart, runStart}); + + using RunKernelFunc = void(uint32_t); + + b.CreateCall(getFunction(hostModule, + "__kitsune_gpu_run_kernel"), {kernelId}); + + using FinishFunc = void(); + + b.CreateCall(getFunction(hostModule, + "__kitsune_gpu_finish"), {}); + + b.CreateBr(successor); + + // hostModule.dump(); + + // ptxModule.dump(); + + return true; +} diff --git a/lib/Transforms/Tapir/TapirToTarget.cpp b/lib/Transforms/Tapir/TapirToTarget.cpp index 17035715568..5a9f6ddb766 100644 --- a/lib/Transforms/Tapir/TapirToTarget.cpp +++ b/lib/Transforms/Tapir/TapirToTarget.cpp @@ -34,7 +34,10 @@ static cl::opt ClTapirTarget( clEnumValN(TapirTargetType::Qthreads, "qthreads", "Qthreads"), clEnumValN(TapirTargetType::OpenMP, - "openmp", "OpenMP"))); + "openmp", "OpenMP"), + clEnumValN(TapirTargetType::PTX, + "ptx", "PTX") + )); namespace { diff --git a/lib/Transforms/Tapir/TapirUtils.cpp b/lib/Transforms/Tapir/TapirUtils.cpp index 6a741532ab2..2583dd8f255 100644 --- a/lib/Transforms/Tapir/TapirUtils.cpp +++ b/lib/Transforms/Tapir/TapirUtils.cpp @@ -14,6 +14,7 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Transforms/Tapir/CilkABI.h" #include "llvm/Transforms/Tapir/OpenMPABI.h" +#include "llvm/Transforms/Tapir/PTXABI.h" #include "llvm/Transforms/Tapir/QthreadsABI.h" #include "llvm/Transforms/Tapir/Outline.h" #include "llvm/Transforms/Utils/EscapeEnumerator.h" @@ -30,6 +31,8 @@ TapirTarget *llvm::getTapirTargetFromType(TapirTargetType Type) { return new CilkABI(); case TapirTargetType::OpenMP: return new OpenMPABI(); + case TapirTargetType::PTX: + return new PTXABI(); case TapirTargetType::Qthreads: return new QthreadsABI(); case TapirTargetType::None: diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index c0f10f85fb1..106f5b14f35 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -372,7 +372,7 @@ bool 
llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force, // Are we eliminating the loop control altogether? bool CompletelyUnroll = Count == TripCount; - if (isDACFor(L) && !CompletelyUnroll) return false; + if (isBackendParallelFor(L) && !CompletelyUnroll) return false; SmallVector ExitBlocks; L->getExitBlocks(ExitBlocks); std::vector OriginalLoopBlocks = L->getBlocks(); diff --git a/lib/Transforms/Utils/TapirUtils.cpp b/lib/Transforms/Utils/TapirUtils.cpp index 8791e70cc09..9707290c426 100644 --- a/lib/Transforms/Utils/TapirUtils.cpp +++ b/lib/Transforms/Utils/TapirUtils.cpp @@ -178,7 +178,7 @@ bool llvm::MoveStaticAllocasInBlock( BranchInst *llvm::SerializeDetachedCFG(DetachInst *DI, DominatorTree *DT) { //TODO allow to work without dominatortree or code workaround //assert(DT && "Requires DominatorTree (could remove by fixing later TODO)"); - + // Get the parent of the detach instruction. BasicBlock *Detacher = DI->getParent(); // Get the detached block and continuation of this detach. @@ -537,11 +537,12 @@ bool llvm::isCanonicalTapirLoop(const Loop *L, bool print) { return true; } -bool llvm::isDACFor(Loop* L) { +bool llvm::isBackendParallelFor(Loop* L) { // TODO: Use a more precise detection of cilk_for loops. 
for (BasicBlock* BB : L->blocks()) if (isa(BB->getTerminator())) - return LoopSpawningHints(L).getStrategy() == LoopSpawningHints::ST_DAC; + return LoopSpawningHints(L).getStrategy() == LoopSpawningHints::ST_DAC + || LoopSpawningHints(L).getStrategy() == LoopSpawningHints::ST_GPU; return false; } diff --git a/projects/compiler-rt b/projects/compiler-rt index b91c085d737..85ff07e6de5 160000 --- a/projects/compiler-rt +++ b/projects/compiler-rt @@ -1 +1 @@ -Subproject commit b91c085d73799d9c6fbea0f2a85c12bd332e2cc4 +Subproject commit 85ff07e6de58834c3c5a739de21b45e0809736b6 From 6b516f5b666cb0dc417ffebbfdb66b93c9ae563c Mon Sep 17 00:00:00 2001 From: George Stelle Date: Mon, 23 Jul 2018 12:46:26 -0600 Subject: [PATCH 02/16] Added unroll test --- test/Transforms/Tapir/unroll.ll | 182 ++++++++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 test/Transforms/Tapir/unroll.ll diff --git a/test/Transforms/Tapir/unroll.ll b/test/Transforms/Tapir/unroll.ll new file mode 100644 index 00000000000..6a34f3a86df --- /dev/null +++ b/test/Transforms/Tapir/unroll.ll @@ -0,0 +1,182 @@ +; Test that we can control unrolling for different tapir backends + +; RUN: opt < %s -loop-unroll -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define void @dac(i32 %n, double* nocapture %a) local_unnamed_addr #0 { +; CHECK-LABEL: dac +; CHECK: detach within +; CHECK-NOT: detach within + +entry: + %syncreg = tail call token @llvm.syncregion.start() + %cmp10 = icmp sgt i32 %n, 0 + br i1 %cmp10, label %pfor.detach.lr.ph, label %pfor.cond.cleanup + +pfor.detach.lr.ph: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %pfor.detach + +pfor.cond.cleanup: ; preds = %pfor.inc, %entry + sync within %syncreg, label %pfor.end.continue + +pfor.end.continue: ; preds = %pfor.cond.cleanup + ret void + +pfor.detach: ; preds = %pfor.inc, %pfor.detach.lr.ph + 
%indvars.iv = phi i64 [ 0, %pfor.detach.lr.ph ], [ %indvars.iv.next, %pfor.inc ] + detach within %syncreg, label %pfor.body, label %pfor.inc + +pfor.body: ; preds = %pfor.detach + %0 = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %0 to double + %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv + store double %conv, double* %arrayidx, align 8, !tbaa !2 + reattach within %syncreg, label %pfor.inc + +pfor.inc: ; preds = %pfor.body, %pfor.detach + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %pfor.cond.cleanup, label %pfor.detach, !llvm.loop !6 +} + +define void @gpu(i32 %n, double* nocapture %a) local_unnamed_addr #0 { +; CHECK-LABEL: gpu +; CHECK: detach within +; CHECK-NOT: detach within + +entry: + %syncreg = tail call token @llvm.syncregion.start() + %cmp10 = icmp sgt i32 %n, 0 + br i1 %cmp10, label %pfor.detach.lr.ph, label %pfor.cond.cleanup + +pfor.detach.lr.ph: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %pfor.detach + +pfor.cond.cleanup: ; preds = %pfor.inc, %entry + sync within %syncreg, label %pfor.end.continue + +pfor.end.continue: ; preds = %pfor.cond.cleanup + ret void + +pfor.detach: ; preds = %pfor.inc, %pfor.detach.lr.ph + %indvars.iv = phi i64 [ 0, %pfor.detach.lr.ph ], [ %indvars.iv.next, %pfor.inc ] + detach within %syncreg, label %pfor.body, label %pfor.inc + +pfor.body: ; preds = %pfor.detach + %0 = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %0 to double + %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv + store double %conv, double* %arrayidx, align 8, !tbaa !2 + reattach within %syncreg, label %pfor.inc + +pfor.inc: ; preds = %pfor.body, %pfor.detach + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %pfor.cond.cleanup, label %pfor.detach, !llvm.loop !8 +} + +define void @seq(i32 %n, double* 
nocapture %a) local_unnamed_addr #0 { +; CHECK-LABEL: seq +; CHECK: detach within +; CHECK: detach within +; CHECK: detach within +; CHECK: detach within +; CHECK: detach within + +entry: + %syncreg = tail call token @llvm.syncregion.start() + %cmp10 = icmp sgt i32 %n, 0 + br i1 %cmp10, label %pfor.detach.lr.ph, label %pfor.cond.cleanup + +pfor.detach.lr.ph: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %pfor.detach + +pfor.cond.cleanup: ; preds = %pfor.inc, %entry + sync within %syncreg, label %pfor.end.continue + +pfor.end.continue: ; preds = %pfor.cond.cleanup + ret void + +pfor.detach: ; preds = %pfor.inc, %pfor.detach.lr.ph + %indvars.iv = phi i64 [ 0, %pfor.detach.lr.ph ], [ %indvars.iv.next, %pfor.inc ] + detach within %syncreg, label %pfor.body, label %pfor.inc + +pfor.body: ; preds = %pfor.detach + %0 = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %0 to double + %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv + store double %conv, double* %arrayidx, align 8, !tbaa !2 + reattach within %syncreg, label %pfor.inc + +pfor.inc: ; preds = %pfor.body, %pfor.detach + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %pfor.cond.cleanup, label %pfor.detach, !llvm.loop !10 +} + +define void @none(i32 %n, double* nocapture %a) local_unnamed_addr #0 { +; CHECK-LABEL: none +; CHECK: detach within +; CHECK: detach within +; CHECK: detach within +; CHECK: detach within +; CHECK: detach within +entry: + %syncreg = tail call token @llvm.syncregion.start() + %cmp10 = icmp sgt i32 %n, 0 + br i1 %cmp10, label %pfor.detach.lr.ph, label %pfor.cond.cleanup + +pfor.detach.lr.ph: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %pfor.detach + +pfor.cond.cleanup: ; preds = %pfor.inc, %entry + sync within %syncreg, label %pfor.end.continue + +pfor.end.continue: ; preds = %pfor.cond.cleanup + ret void + +pfor.detach: ; preds = 
%pfor.inc, %pfor.detach.lr.ph + %indvars.iv = phi i64 [ 0, %pfor.detach.lr.ph ], [ %indvars.iv.next, %pfor.inc ] + detach within %syncreg, label %pfor.body, label %pfor.inc + +pfor.body: ; preds = %pfor.detach + %0 = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %0 to double + %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv + store double %conv, double* %arrayidx, align 8, !tbaa !2 + reattach within %syncreg, label %pfor.inc + +pfor.inc: ; preds = %pfor.body, %pfor.detach + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %pfor.cond.cleanup, label %pfor.detach +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/cilk-clang 5cfdd723a552d2ef151fd8990dec559fa7bd4795) (git@github.com:wsmoses/parallel-ir dfb187fa0b106c5a4f1d96ac14368946cbf50b60)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"double", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} +!6 = distinct !{!6, !7} +!7 = !{!"tapir.loop.spawn.strategy", i32 1} +!8 = distinct !{!8, !9} +!9 = !{!"tapir.loop.spawn.strategy", i32 2} +!10 = distinct !{!10, !11} +!11 = !{!"tapir.loop.spawn.strategy", i32 0} From a1d105c569282df3ef7129801ec59adf9382a0cc Mon Sep 17 00:00:00 2001 From: "William S. 
Moses" Date: Mon, 23 Jul 2018 16:41:55 -0400 Subject: [PATCH 03/16] add codegen test for gpu --- test/Transforms/Tapir/gpu-backend.ll | 76 ++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 test/Transforms/Tapir/gpu-backend.ll diff --git a/test/Transforms/Tapir/gpu-backend.ll b/test/Transforms/Tapir/gpu-backend.ll new file mode 100644 index 00000000000..38e88d33a78 --- /dev/null +++ b/test/Transforms/Tapir/gpu-backend.ll @@ -0,0 +1,76 @@ +; RUN: opt < %s -loop-spawning -S | FileCheck %s +; ModuleID = 'test.fcc' +source_filename = "test.fcc" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK: @ptx = private constant [771 x i8] c"//\0A// Generated by LLVM NVPTX Back-End\0A//\0A\0A.version 5.0\0A.target sm_60\0A.address_size 64\0A\0A\09// .globl\09run0\0A\0A.visible .entry run0(\0A\09.param .u64 run0_param_0,\0A\09.param .u64 run0_param_1,\0A\09.param .u64 run0_param_2,\0A\09.param .u64 run0_param_3\0A)\0A{\0A\09.reg .pred \09%p<2>;\0A\09.reg .b32 \09%r<8>;\0A\09.reg .b64 \09%rd<7>;\0A\0A\09ld.param.u64 \09%rd3, [run0_param_0];\0A\09mov.u32 \09%r1, %tid.x;\0A\09mov.u32 \09%r2, %ctaid.x;\0A\09mov.u32 \09%r3, %ntid.x;\0A\09mad.lo.s32 \09%r4, %r2, %r3, %r1;\0A\09cvt.u64.u32 \09%rd2, %r4;\0A\09setp.lt.u64 \09%p1, %rd2, %rd3;\0A\09@%p1 bra \09LBB0_2;\0A\09ret;\0ALBB0_2:\0A\09ld.param.u64 \09%rd4, [run0_param_3];\0A\09cvta.to.global.u64 \09%rd1, %rd4;\0A\09cvt.u32.u64 \09%r5, %rd2;\0A\09shl.b64 \09%rd5, %rd2, 2;\0A\09add.s64 \09%rd6, %rd1, %rd5;\0A\09ld.global.u32 \09%r6, [%rd6];\0A\09add.s32 \09%r7, %r6, %r5;\0A\09st.global.u32 \09[%rd6], %r7;\0A\09ret;\0A}\0A\0A\0A\00" + +; CHECK: host.block: ; preds = %entry +; CHECK-NEXT: call void @__kitsune_cuda_init() +; CHECK-NEXT: call void @__kitsune_gpu_init_kernel(i32 0, i8* getelementptr inbounds ([771 x i8], [771 x i8]* @ptx, i32 0, i32 0)) +; CHECK-NEXT: %1 = bitcast i32* %0 to i8* +; CHECK-NEXT: call void 
@__kitsune_gpu_init_field(i32 0, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @field.name, i32 0, i32 0), i8* %1, i32 4, i64 1024, i8 3) +; CHECK-NEXT: call void @__kitsune_gpu_set_run_size(i32 0, i64 1024, i64 0, i64 0) +; CHECK-NEXT: call void @__kitsune_gpu_run_kernel(i32 0) +; CHECK-NEXT: call void @__kitsune_gpu_finish() +; CHECK-NEXT: br label %pfor.end.continue + +; Function Attrs: norecurse uwtable +define i32 @main(i32 %argc, i8** nocapture readnone %argv) local_unnamed_addr #0 { +entry: + %syncreg = tail call token @llvm.syncregion.start() + %call = tail call i8* @_Znam(i64 4096) #3 + %0 = bitcast i8* %call to i32* + call void @llvm.memset.p0i8.i64(i8* nonnull %call, i8 0, i64 4096, i32 4, i1 false) + br label %pfor.detach + +pfor.cond.cleanup: ; preds = %pfor.inc + sync within %syncreg, label %pfor.end.continue + +pfor.end.continue: ; preds = %pfor.cond.cleanup + ret i32 0 + +pfor.detach: ; preds = %pfor.inc, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %pfor.inc ] + detach within %syncreg, label %pfor.body, label %pfor.inc + +pfor.body: ; preds = %pfor.detach + %arrayidx4 = getelementptr inbounds i32, i32* %0, i64 %indvars.iv + %1 = load i32, i32* %arrayidx4, align 4, !tbaa !2 + %2 = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %1, %2 + store i32 %add, i32* %arrayidx4, align 4, !tbaa !2 + reattach within %syncreg, label %pfor.inc + +pfor.inc: ; preds = %pfor.body, %pfor.detach + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %pfor.cond.cleanup, label %pfor.detach, !llvm.loop !6 +} + +; Function Attrs: nobuiltin +declare noalias nonnull i8* @_Znam(i64) local_unnamed_addr #1 + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #2 + +; Function Attrs: argmemonly nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #2 + +attributes #0 = { norecurse uwtable 
"correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nobuiltin "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { argmemonly nounwind } +attributes #3 = { builtin } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"Los Alamos National Laboratory clang version 5.0.0 (based on LLVM 5.0.0git-15970c3f598)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C++ TBAA"} +!6 = distinct !{!6, !7} +!7 = !{!"tapir.loop.spawn.strategy", i32 2} From f386baad8cb966c5f021e6fef53dd9a9b17f759b Mon Sep 17 00:00:00 2001 From: "William S. 
Moses" Date: Mon, 23 Jul 2018 16:59:06 -0400 Subject: [PATCH 04/16] resolve error messages --- lib/Transforms/Tapir/LoopSpawning.cpp | 12 +++++------- lib/Transforms/Tapir/OpenMPABI.cpp | 10 ++-------- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/lib/Transforms/Tapir/LoopSpawning.cpp b/lib/Transforms/Tapir/LoopSpawning.cpp index e24bbdd88bc..50e3a0a7c86 100644 --- a/lib/Transforms/Tapir/LoopSpawning.cpp +++ b/lib/Transforms/Tapir/LoopSpawning.cpp @@ -1428,13 +1428,11 @@ bool LoopSpawningImpl::processLoop(Loop *L) { DEBUG(dbgs() << "LS: Hints dictate sequential spawning.\n"); break; case LoopSpawningHints::ST_GPU: - DEBUG(dbgs() << "LS: Hints dictate DAC spawning.\n"); + DEBUG(dbgs() << "LS: Hints dictate GPU spawning.\n"); { DebugLoc DLoc = L->getStartLoc(); BasicBlock *Header = L->getHeader(); PTXABILoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE); - // CilkABILoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE); - // DACLoopSpawning DLS(L, SE, LI, DT, TLI, TTI, ORE); if (DLS.processLoop()) { DEBUG({ if (verifyFunction(*L->getHeader()->getParent())) { @@ -1443,14 +1441,14 @@ bool LoopSpawningImpl::processLoop(Loop *L) { } }); // Report success. - ORE.emit(OptimizationRemark(LS_NAME, "DACSpawning", DLoc, Header) - << "spawning iterations using divide-and-conquer"); + ORE.emit(OptimizationRemark(LS_NAME, "GPUSpawning", DLoc, Header) + << "spawning iterations using direct gpu mapping"); return true; } else { // Report failure. 
- ORE.emit(OptimizationRemarkMissed(LS_NAME, "NoDACSpawning", DLoc, + ORE.emit(OptimizationRemarkMissed(LS_NAME, "NoGPUSpawning", DLoc, Header) - << "cannot spawn iterations using divide-and-conquer"); + << "cannot spawn iterations using direct gpu mapping"); emitMissedWarning(F, L, Hints, &ORE); return false; } diff --git a/lib/Transforms/Tapir/OpenMPABI.cpp b/lib/Transforms/Tapir/OpenMPABI.cpp index 5ddcf9eb174..d80824982ef 100644 --- a/lib/Transforms/Tapir/OpenMPABI.cpp +++ b/lib/Transforms/Tapir/OpenMPABI.cpp @@ -488,19 +488,15 @@ Function* formatFunctionToTask(Function* extracted, CallInst* cal) { IRBuilder<> CallerIRBuilder(cal); auto *SharedsTySize = CallerIRBuilder.getInt64(DL.getTypeAllocSize(SharedsTy)); - //unused -- auto *KmpTaskTTy = createKmpTaskTTy(C); - auto *KmpTaskTWithPrivatesTy = createKmpTaskTWithPrivatesTy(SharedsTy);//KmpTaskTTy); + auto *KmpTaskTWithPrivatesTy = createKmpTaskTWithPrivatesTy(SharedsTy); auto *KmpTaskTWithPrivatesPtrTy = PointerType::getUnqual(KmpTaskTWithPrivatesTy); auto *KmpTaskTWithPrivatesTySize = CallerIRBuilder.getInt64(DL.getTypeAllocSize(KmpTaskTWithPrivatesTy)); auto *VoidTy = Type::getVoidTy(C); - // unused -- auto *Int8PtrTy = Type::getInt8PtrTy(C); auto *Int32Ty = Type::getInt32Ty(C); - // unused -- auto *CopyFnTy = FunctionType::get(VoidTy, {Int8PtrTy}, true); - // unused -- auto *CopyFnPtrTy = PointerType::getUnqual(CopyFnTy); auto *OutlinedFnTy = FunctionType::get( VoidTy, @@ -593,12 +589,10 @@ Function *llvm::OpenMPABI::createDetach(DetachInst &detach, ValueToValueMapTy &DetachCtxToStackFrame, DominatorTree &DT, AssumptionCache &AC) { BasicBlock *detB = detach.getParent(); - // unused -- Function &F = *(detB->getParent()); BasicBlock *Spawned = detach.getDetached(); BasicBlock *Continue = detach.getContinue(); - // unused -- Module *M = F.getParent(); CallInst *cal = nullptr; Function *extracted = extractDetachBodyToFunction(detach, DT, AC, &cal); @@ -803,7 +797,7 @@ void 
llvm::OpenMPABI::postProcessFunction(Function &F) { OpenMPRuntimeFunction::OMPRTL__kmpc_fork_call, F.getParent()); // Replace the old call with __kmpc_fork_call auto *ForkCall = emitRuntimeCall(ForkRTFn, OMPRegionFnArgs, "", b); - assert(ForkCall != 0); // play it safe -- something better to do here? + assert(ForkCall != 0 && "Failed to emit omp runtime call"); ExtractedFnCI->eraseFromParent(); RegionFn->eraseFromParent(); From e5ab01bcc194642eb2da46fa346caa21b2bf591b Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Mon, 23 Jul 2018 18:13:56 -0400 Subject: [PATCH 05/16] Minor cleanups --- include/llvm/Transforms/Tapir/PTXABI.h | 2 +- lib/Transforms/Tapir/PTXABI.cpp | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/include/llvm/Transforms/Tapir/PTXABI.h b/include/llvm/Transforms/Tapir/PTXABI.h index 10698543896..6e1658ff7af 100644 --- a/include/llvm/Transforms/Tapir/PTXABI.h +++ b/include/llvm/Transforms/Tapir/PTXABI.h @@ -78,7 +78,7 @@ namespace llvm { -/// PTXABILoopSpawning uses the Cilk Plus ABI to handle Tapir loops. +/// PTXABILoopSpawning uses the LLVM PTX backend to handle Tapir loops. 
class PTXABILoopSpawning : public LoopOutline { public: PTXABILoopSpawning(Loop *OrigLoop, ScalarEvolution &SE, diff --git a/lib/Transforms/Tapir/PTXABI.cpp b/lib/Transforms/Tapir/PTXABI.cpp index 249290ee0d5..246dfcdc068 100644 --- a/lib/Transforms/Tapir/PTXABI.cpp +++ b/lib/Transforms/Tapir/PTXABI.cpp @@ -123,17 +123,14 @@ void PTXABI::createSync(SyncInst &SI, ValueToValueMapTy &DetachCtxToStackFrame) Function *PTXABI::createDetach(DetachInst &detach, ValueToValueMapTy &DetachCtxToStackFrame, DominatorTree &DT, AssumptionCache &AC) { + //TODO nicely replace with serializeDetach BasicBlock *detB = detach.getParent(); - // unused -- Function &F = *(detB->getParent()); BasicBlock *Spawned = detach.getDetached(); BasicBlock *Continue = detach.getContinue(); - // unused -- Module *M = F.getParent(); - CallInst *cal = nullptr; Function *extracted = extractDetachBodyToFunction(detach, DT, AC, &cal); - //extracted = formatFunctionToTask(extracted, cal); // Replace the detach with a branch to the continuation. BranchInst *ContinueBr = BranchInst::Create(Continue); From 88238abf3404aa2d0fbcb0b24dcad97824004db7 Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Tue, 24 Jul 2018 12:17:34 -0400 Subject: [PATCH 06/16] Add kitsune cmake flags for compiler-rt --- projects/compiler-rt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/compiler-rt b/projects/compiler-rt index 85ff07e6de5..1d01b643c56 160000 --- a/projects/compiler-rt +++ b/projects/compiler-rt @@ -1 +1 @@ -Subproject commit 85ff07e6de58834c3c5a739de21b45e0809736b6 +Subproject commit 1d01b643c561b0ebbd8f20038ad178a4063a65ed From 66d5c31d1781fd336ce205b056a57df0148fc388 Mon Sep 17 00:00:00 2001 From: "William S. 
Moses" Date: Tue, 24 Jul 2018 13:02:42 -0400 Subject: [PATCH 07/16] Finalize kitsune build --- projects/compiler-rt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/compiler-rt b/projects/compiler-rt index 1d01b643c56..fe2f1c8eda5 160000 --- a/projects/compiler-rt +++ b/projects/compiler-rt @@ -1 +1 @@ -Subproject commit 1d01b643c561b0ebbd8f20038ad178a4063a65ed +Subproject commit fe2f1c8eda539dca91edd7ac2f930a13439bbdbf From eb90d5cf19d8d6d8887bcb575a06868cb0c563b7 Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Tue, 24 Jul 2018 21:39:01 -0400 Subject: [PATCH 08/16] TapirTarget restructure --- include/llvm/Transforms/Tapir/CilkABI.h | 15 +- include/llvm/Transforms/Tapir/LoopSpawning.h | 21 +- include/llvm/Transforms/Tapir/OpenMPABI.h | 22 +- include/llvm/Transforms/Tapir/Outline.h | 122 +- include/llvm/Transforms/Tapir/PTXABI.h | 2 + include/llvm/Transforms/Tapir/QthreadsABI.h | 2 + include/llvm/Transforms/Tapir/TapirUtils.h | 89 ++ include/llvm/Transforms/Utils/TapirUtils.h | 89 -- lib/Transforms/Tapir/CilkABI.cpp | 242 +--- lib/Transforms/Tapir/LoopSpawning.cpp | 1196 ++---------------- lib/Transforms/Tapir/OpenMPABI.cpp | 5 + lib/Transforms/Tapir/Outline.cpp | 67 +- lib/Transforms/Tapir/PTXABI.cpp | 44 +- lib/Transforms/Tapir/QthreadsABI.cpp | 4 + lib/Transforms/Tapir/TapirUtils.cpp | 1033 +++++++++++++++ lib/Transforms/Utils/LoopUnroll.cpp | 2 +- lib/Transforms/Utils/TapirUtils.cpp | 150 --- 17 files changed, 1461 insertions(+), 1644 deletions(-) diff --git a/include/llvm/Transforms/Tapir/CilkABI.h b/include/llvm/Transforms/Tapir/CilkABI.h index eb3f635a1cb..60f0c2eddbb 100644 --- a/include/llvm/Transforms/Tapir/CilkABI.h +++ b/include/llvm/Transforms/Tapir/CilkABI.h @@ -54,19 +54,6 @@ class CilkABILoopSpawning : public LoopOutline { bool processLoop(); virtual ~CilkABILoopSpawning() {} - -protected: - // PHINode* canonicalizeIVs(Type *Ty); - Value* canonicalizeLoopLatch(PHINode *IV, Value *Limit); - -// private: -// /// Report 
an analysis message to assist the user in diagnosing loops that are -// /// not transformed. These are handled as LoopAccessReport rather than -// /// VectorizationReport because the << operator of LoopSpawningReport returns -// /// LoopAccessReport. -// void emitAnalysis(const LoopAccessReport &Message) const { -// emitAnalysisDiag(OrigLoop, *ORE, Message); -// } }; class CilkABI : public TapirTarget { @@ -83,6 +70,8 @@ class CilkABI : public TapirTarget { void postProcessFunction(Function &F) override final; void postProcessHelper(Function &F) override final; bool processMain(Function &F) override final; + bool processLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE) override final; struct __cilkrts_pedigree {}; struct __cilkrts_stack_frame {}; diff --git a/include/llvm/Transforms/Tapir/LoopSpawning.h b/include/llvm/Transforms/Tapir/LoopSpawning.h index 947610bae9f..1b658ce685e 100644 --- a/include/llvm/Transforms/Tapir/LoopSpawning.h +++ b/include/llvm/Transforms/Tapir/LoopSpawning.h @@ -36,11 +36,11 @@ namespace llvm { /// lifting a Tapir loop into a separate helper function. class LoopOutline { public: - LoopOutline(Loop *OrigLoop, ScalarEvolution &SE, + inline LoopOutline(Loop *OrigLoop, ScalarEvolution &SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, OptimizationRemarkEmitter &ORE) - : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), AC(AC), ORE(ORE), + : OrigLoop(OrigLoop), OrigFunction(OrigLoop->getHeader()->getParent()), SE(SE), LI(LI), DT(DT), AC(AC), ORE(ORE), ExitBlock(nullptr) { // Use the loop latch to determine the canonical exit block for this loop. 
@@ -59,10 +59,16 @@ class LoopOutline { protected: PHINode* canonicalizeIVs(Type *Ty); Value* canonicalizeLoopLatch(PHINode *IV, Value *Limit); + bool removeNonCanonicalIVs(BasicBlock* Header, BasicBlock* Preheader, PHINode* CanonicalIV, SmallVector &IVs, SCEVExpander &Exp); + //bool setIVStartingValues(); + void unlinkLoop(); /// The original loop. - Loop *OrigLoop; + Loop * const OrigLoop; + + // Function containing original loop + Function * const OrigFunction; /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies /// dynamic knowledge to simplify SCEV expressions and converts them to a @@ -82,15 +88,6 @@ class LoopOutline { /// latch, and handle other exit blocks (i.e., for exception handling) in a /// special manner. BasicBlock *ExitBlock; - -// private: -// /// Report an analysis message to assist the user in diagnosing loops that are -// /// not transformed. These are handled as LoopAccessReport rather than -// /// VectorizationReport because the << operator of LoopSpawningReport returns -// /// LoopAccessReport. -// void emitAnalysis(const LoopAccessReport &Message) const { -// emitAnalysisDiag(OrigLoop, *ORE, Message); -// } }; /// The LoopSpawning Pass. 
diff --git a/include/llvm/Transforms/Tapir/OpenMPABI.h b/include/llvm/Transforms/Tapir/OpenMPABI.h index 1a2c06a3e63..599e517b093 100644 --- a/include/llvm/Transforms/Tapir/OpenMPABI.h +++ b/include/llvm/Transforms/Tapir/OpenMPABI.h @@ -60,17 +60,19 @@ enum OpenMPSchedType { class OpenMPABI : public TapirTarget { public: -OpenMPABI(); -Value *GetOrCreateWorker8(Function &F) override final; -void createSync(SyncInst &inst, ValueToValueMapTy &DetachCtxToStackFrame) override final; + OpenMPABI(); + Value *GetOrCreateWorker8(Function &F) override final; + void createSync(SyncInst &inst, ValueToValueMapTy &DetachCtxToStackFrame) override final; -Function *createDetach(DetachInst &Detach, - ValueToValueMapTy &DetachCtxToStackFrame, - DominatorTree &DT, AssumptionCache &AC) override final; -void preProcessFunction(Function &F) override final; -void postProcessFunction(Function &F) override final; -void postProcessHelper(Function &F) override final; -bool processMain(Function &F) override final; + Function *createDetach(DetachInst &Detach, + ValueToValueMapTy &DetachCtxToStackFrame, + DominatorTree &DT, AssumptionCache &AC) override final; + void preProcessFunction(Function &F) override final; + void postProcessFunction(Function &F) override final; + void postProcessHelper(Function &F) override final; + bool processMain(Function &F) override final; + bool processLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE) override final; }; } // end of llvm namespace diff --git a/include/llvm/Transforms/Tapir/Outline.h b/include/llvm/Transforms/Tapir/Outline.h index c7debe54b8d..6e779fdf719 100644 --- a/include/llvm/Transforms/Tapir/Outline.h +++ b/include/llvm/Transforms/Tapir/Outline.h @@ -29,13 +29,121 @@ namespace llvm { typedef SetVector ValueSet; -/// Find the inputs and outputs for a function outlined from the gives set of -/// basic blocks. 
-void findInputsOutputs( - const SmallPtrSetImpl &Blocks, - ValueSet &Inputs, ValueSet &Outputs, - const SmallPtrSetImpl *ExitBlocks = nullptr, - DominatorTree *DT = nullptr); +/// definedInRegion - Return true if the specified value is defined in the +/// extracted region. +template +static inline bool definedInRegion(const BasicBlockPtrContainer &Blocks, + Value *V) { + if (Instruction *I = dyn_cast(V)) + if (std::find(Blocks.begin(), Blocks.end(), I->getParent()) != Blocks.end()) + return true; + return false; +} + +/// definedInCaller - Return true if the specified value is defined in the +/// function being code extracted, but not in the region being extracted. +/// These values must be passed in as live-ins to the function. +template +static inline bool definedInCaller(const BasicBlockPtrContainer &Blocks, + Value *V) { + if (isa(V)) return true; + if (Instruction *I = dyn_cast(V)) + if (std::find(Blocks.begin(), Blocks.end(), I->getParent()) != Blocks.end()) + return true; + return false; +} + +// findInputsOutputs - Find inputs and outputs for Blocks. Any blocks in +// ExitBlocks are handled in a special manner: PHI nodes in Exit Blocks are +// ignored when determining inputs. +// Handles rvalues (should be equivalent to lvalue code below) +template +static inline void findInputsOutputs(const BasicBlockPtrContainer &&Blocks, + ValueSet &Inputs, ValueSet &Outputs, + const SmallPtrSetImpl *ExitBlocks = nullptr, + DominatorTree *DT = nullptr) { + for (BasicBlock *BB : Blocks) { + // If a used value is defined outside the region, it's an input. If an + // instruction is used outside the region, it's an output. + for (Instruction &II : *BB) { + for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE; + ++OI) { + // The PHI nodes in each exit block will be updated after the exit block + // is cloned. Hence, we don't want to count their uses of values + // defined outside the region. 
+ if (ExitBlocks && ExitBlocks->count(BB)) + if (PHINode *PN = dyn_cast(&II)) + if (std::find(Blocks.begin(), Blocks.end(), PN->getIncomingBlock(*OI)) != Blocks.end()) + continue; + if (definedInCaller(Blocks, *OI)) + Inputs.insert(*OI); + } + + // Ignore outputs from exit blocks. + if (!ExitBlocks || !ExitBlocks->count(BB)) { + for (User *U : II.users()) { + if (!definedInRegion(Blocks, U)) { + // It looks like we have a use outside of the given blocks, but it's + // possible for the use to appear in a basic block that is no longer + // alive. We use the DT to check that this use is still alive. + if (Instruction *I = dyn_cast(U)) { + if (DT && DT->isReachableFromEntry(I->getParent())) { + Outputs.insert(&II); + break; + } + } + } + } + } + } + } +} + +// findInputsOutputs - Find inputs and outputs for Blocks. Any blocks in +// ExitBlocks are handled in a special manner: PHI nodes in Exit Blocks are +// ignored when determining inputs. +// Handles lvalues (should be equivalent to rvalue code above) +template +static inline void findInputsOutputs(const BasicBlockPtrContainer &Blocks, + ValueSet &Inputs, ValueSet &Outputs, + const SmallPtrSetImpl *ExitBlocks = nullptr, + DominatorTree *DT = nullptr) { + for (BasicBlock *BB : Blocks) { + // If a used value is defined outside the region, it's an input. If an + // instruction is used outside the region, it's an output. + for (Instruction &II : *BB) { + for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE; + ++OI) { + // The PHI nodes in each exit block will be updated after the exit block + // is cloned. Hence, we don't want to count their uses of values + // defined outside the region. + if (ExitBlocks && ExitBlocks->count(BB)) + if (PHINode *PN = dyn_cast(&II)) + if (std::find(Blocks.begin(), Blocks.end(), PN->getIncomingBlock(*OI)) != Blocks.end()) + continue; + if (definedInCaller(Blocks, *OI)) + Inputs.insert(*OI); + } + + // Ignore outputs from exit blocks. 
+ if (!ExitBlocks || !ExitBlocks->count(BB)) { + for (User *U : II.users()) { + if (!definedInRegion(Blocks, U)) { + // It looks like we have a use outside of the given blocks, but it's + // possible for the use to appear in a basic block that is no longer + // alive. We use the DT to check that this use is still alive. + if (Instruction *I = dyn_cast(U)) { + if (DT && DT->isReachableFromEntry(I->getParent())) { + Outputs.insert(&II); + break; + } + } + } + } + } + } + } +} /// Clone Blocks into NewFunc, transforming the old arguments into references to /// VMap values. diff --git a/include/llvm/Transforms/Tapir/PTXABI.h b/include/llvm/Transforms/Tapir/PTXABI.h index 6e1658ff7af..829fd46bdcf 100644 --- a/include/llvm/Transforms/Tapir/PTXABI.h +++ b/include/llvm/Transforms/Tapir/PTXABI.h @@ -120,6 +120,8 @@ class PTXABI : public TapirTarget { void postProcessFunction(Function &F) override final; void postProcessHelper(Function &F) override final; bool processMain(Function &F) override final; + bool processLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE) override final; }; diff --git a/include/llvm/Transforms/Tapir/QthreadsABI.h b/include/llvm/Transforms/Tapir/QthreadsABI.h index 2737ffa779c..d4fecbc5b38 100644 --- a/include/llvm/Transforms/Tapir/QthreadsABI.h +++ b/include/llvm/Transforms/Tapir/QthreadsABI.h @@ -56,6 +56,8 @@ class QthreadsABI : public TapirTarget { void postProcessFunction(Function &F) override final; void postProcessHelper(Function &F) override final; bool processMain(Function &F) override final; + bool processLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE) override final; }; } // end of llvm namespace diff --git a/include/llvm/Transforms/Tapir/TapirUtils.h b/include/llvm/Transforms/Tapir/TapirUtils.h index f1a6a327804..0624627dee7 100644 --- 
a/include/llvm/Transforms/Tapir/TapirUtils.h +++ b/include/llvm/Transforms/Tapir/TapirUtils.h @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" @@ -26,6 +27,7 @@ #include "llvm/Transforms/Utils/ValueMapper.h" namespace llvm { +class OptimizationRemarkEmitter; bool verifyDetachedCFG(const DetachInst &Detach, DominatorTree &DT, bool error = true); @@ -47,6 +49,88 @@ Function *extractDetachBodyToFunction(DetachInst &Detach, DominatorTree &DT, AssumptionCache &AC, CallInst **call = nullptr); +/// Utility class for getting and setting loop spawning hints in the form +/// of loop metadata. +/// This class keeps a number of loop annotations locally (as member variables) +/// and can, upon request, write them back as metadata on the loop. It will +/// initially scan the loop for existing metadata, and will update the local +/// values based on information in the loop. +class LoopSpawningHints { +public: + enum SpawningStrategy { + ST_SEQ, + ST_DAC, + ST_GPU, + ST_END, + }; + +private: + enum HintKind { HK_STRATEGY, HK_GRAINSIZE }; + + /// Hint - associates name and validation with the hint value. + struct Hint { + const char *Name; + unsigned Value; // This may have to change for non-numeric values. + HintKind Kind; + + Hint(const char *Name, unsigned Value, HintKind Kind) + : Name(Name), Value(Value), Kind(Kind) {} + + bool validate(unsigned Val); + }; + + /// Spawning strategy + Hint Strategy; + /// Grainsize + Hint Grainsize; + + /// Return the loop metadata prefix. 
+ static inline StringRef Prefix() { return "tapir.loop."; } + +public: + static inline std::string printStrategy(enum SpawningStrategy Strat) { + switch(Strat) { + case LoopSpawningHints::ST_SEQ: + return "Spawn iterations sequentially"; + case LoopSpawningHints::ST_DAC: + return "Use divide-and-conquer"; + case LoopSpawningHints::ST_GPU: + return "Use gpu"; + default: + return "Unknown"; + } + } + + LoopSpawningHints(Loop *L); + + SpawningStrategy getStrategy() const; + + unsigned getGrainsize() const; + + /// The loop these hints belong to. + Loop * const TheLoop; + +private: + /// Find hints specified in the loop metadata and update local values. + void getHintsFromMetadata(); + + /// Checks string hint with one operand and set value if valid. + void setHint(StringRef Name, Metadata *Arg); + + /// Create a new hint from name / value pair. + MDNode *createHintMetadata(StringRef Name, unsigned V) const; + + /// Matches metadata with hint name. + bool matchesHintMetadataName(MDNode *Node, ArrayRef HintTypes); + + /// Sets current hints into loop metadata, keeping other values intact. + void writeHintsToMetadata(ArrayRef HintTypes); + +}; + +//! Identify if a loop could should be handled manually by a parallel loop backend +bool isBackendParallelFor(Loop* L); + class TapirTarget { public: virtual ~TapirTarget() {}; @@ -62,6 +146,11 @@ class TapirTarget { virtual void postProcessFunction(Function &F) = 0; virtual void postProcessHelper(Function &F) = 0; virtual bool processMain(Function &F) = 0; + virtual bool processLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE) = 0; + //! 
Helper to perform DAC + bool processDACLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE); }; TapirTarget *getTapirTargetFromType(TapirTargetType Type); diff --git a/include/llvm/Transforms/Utils/TapirUtils.h b/include/llvm/Transforms/Utils/TapirUtils.h index 4c2fb19b00a..4250a671c95 100644 --- a/include/llvm/Transforms/Utils/TapirUtils.h +++ b/include/llvm/Transforms/Utils/TapirUtils.h @@ -49,92 +49,6 @@ BasicBlock *GetDetachedCtx(BasicBlock *BB); /// - even after ignoring all reattach edges. bool isCriticalContinueEdge(const TerminatorInst *TI, unsigned SuccNum); -/// Utility class for getting and setting loop spawning hints in the form -/// of loop metadata. -/// This class keeps a number of loop annotations locally (as member variables) -/// and can, upon request, write them back as metadata on the loop. It will -/// initially scan the loop for existing metadata, and will update the local -/// values based on information in the loop. -class LoopSpawningHints { -public: - enum SpawningStrategy { - ST_SEQ, - ST_DAC, - ST_GPU, - ST_END, - }; - -private: - enum HintKind { HK_STRATEGY, HK_GRAINSIZE }; - - /// Hint - associates name and validation with the hint value. - struct Hint { - const char *Name; - unsigned Value; // This may have to change for non-numeric values. - HintKind Kind; - - Hint(const char *Name, unsigned Value, HintKind Kind) - : Name(Name), Value(Value), Kind(Kind) {} - - bool validate(unsigned Val); - }; - - /// Spawning strategy - Hint Strategy; - /// Grainsize - Hint Grainsize; - - /// Return the loop metadata prefix. 
- static inline StringRef Prefix() { return "tapir.loop."; } - -public: - static inline std::string printStrategy(enum SpawningStrategy Strat) { - switch(Strat) { - case LoopSpawningHints::ST_SEQ: - return "Spawn iterations sequentially"; - case LoopSpawningHints::ST_DAC: - return "Use divide-and-conquer"; - case LoopSpawningHints::ST_GPU: - return "Use gpu"; - default: - return "Unknown"; - } - } - - LoopSpawningHints(const Loop *L); - - // /// Dumps all the hint information. - // std::string emitRemark() const { - // LoopSpawningReport R; - // R << "Strategy = " << printStrategy(getStrategy()); - - // return R.str(); - // } - - SpawningStrategy getStrategy() const; - - unsigned getGrainsize() const; - -private: - /// Find hints specified in the loop metadata and update local values. - void getHintsFromMetadata(); - - /// Checks string hint with one operand and set value if valid. - void setHint(StringRef Name, Metadata *Arg); - - /// Create a new hint from name / value pair. - MDNode *createHintMetadata(StringRef Name, unsigned V) const; - - /// Matches metadata with hint name. - bool matchesHintMetadataName(MDNode *Node, ArrayRef HintTypes); - - /// Sets current hints into loop metadata, keeping other values intact. - void writeHintsToMetadata(ArrayRef HintTypes); - - /// The loop these hints belong to. - const Loop *TheLoop; -}; - /// Checks if this loop is a Tapir loop. Right now we check that the loop is /// in a canonical form: /// 1) The header detaches the body. @@ -144,9 +58,6 @@ class LoopSpawningHints { /// 4) The loop only branches to the exit block from the header or the latch. bool isCanonicalTapirLoop(const Loop *L, bool print = false); -//! Identify if a loop could should be handled manually by a parallel loop backend -bool isBackendParallelFor(Loop* L); - /// canDetach - Return true if the given function can perform a detach, false /// otherwise. 
bool canDetach(const Function *F); diff --git a/lib/Transforms/Tapir/CilkABI.cpp b/lib/Transforms/Tapir/CilkABI.cpp index 8265693b081..8732f19a0b1 100644 --- a/lib/Transforms/Tapir/CilkABI.cpp +++ b/lib/Transforms/Tapir/CilkABI.cpp @@ -1293,41 +1293,6 @@ bool CilkABI::processMain(Function &F) { return false; } -/// \brief Replace the latch of the loop to check that IV is always less than or -/// equal to the limit. -/// -/// This method assumes that the loop has a single loop latch. -Value* CilkABILoopSpawning::canonicalizeLoopLatch(PHINode *IV, Value *Limit) { - Loop *L = OrigLoop; - - Value *NewCondition; - BasicBlock *Header = L->getHeader(); - BasicBlock *Latch = L->getLoopLatch(); - assert(Latch && "No single loop latch found for loop."); - - IRBuilder<> Builder(&*Latch->getFirstInsertionPt()); - - // This process assumes that IV's increment is in Latch. - - // Create comparison between IV and Limit at top of Latch. - NewCondition = - Builder.CreateICmpULT(Builder.CreateAdd(IV, - ConstantInt::get(IV->getType(), 1)), - Limit); - - // Replace the conditional branch at the end of Latch. - BranchInst *LatchBr = dyn_cast_or_null(Latch->getTerminator()); - assert(LatchBr && LatchBr->isConditional() && - "Latch does not terminate with a conditional branch."); - Builder.SetInsertPoint(Latch->getTerminator()); - Builder.CreateCondBr(NewCondition, Header, ExitBlock); - - // Erase the old conditional branch. - LatchBr->eraseFromParent(); - - return NewCondition; -} - /// Top-level call to convert a Tapir loop to be processed using an appropriate /// Cilk ABI call. 
bool CilkABILoopSpawning::processLoop() { @@ -1363,25 +1328,18 @@ bool CilkABILoopSpawning::processLoop() { } } - Function *F = Header->getParent(); - Module* M = F->getParent(); + Module* M = OrigFunction->getParent(); DEBUG(dbgs() << "LS loop header:" << *Header); DEBUG(dbgs() << "LS loop latch:" << *Latch); - // DEBUG(dbgs() << "LS SE backedge taken count: " << *(SE.getBackedgeTakenCount(L)) << "\n"); - // DEBUG(dbgs() << "LS SE max backedge taken count: " << *(SE.getMaxBackedgeTakenCount(L)) << "\n"); DEBUG(dbgs() << "LS SE exit count: " << *(SE.getExitCount(L, Latch)) << "\n"); /// Get loop limit. const SCEV *BETC = SE.getExitCount(L, Latch); const SCEV *Limit = SE.getAddExpr(BETC, SE.getOne(BETC->getType())); DEBUG(dbgs() << "LS Loop limit: " << *Limit << "\n"); - // PredicatedScalarEvolution PSE(SE, *L); - // const SCEV *PLimit = PSE.getExitCount(L, Latch); - // DEBUG(dbgs() << "LS predicated loop limit: " << *PLimit << "\n"); - // emitAnalysis(LoopSpawningReport() - // << "computed loop limit " << *Limit << "\n"); + if (SE.getCouldNotCompute() == Limit) { DEBUG(dbgs() << "SE could not compute loop limit.\n"); ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "UnknownLoopLimit", @@ -1390,107 +1348,29 @@ bool CilkABILoopSpawning::processLoop() { << "could not compute limit"); return false; } - // ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "LoopLimit", L->getStartLoc(), - // Header) - // << "loop limit: " << NV("Limit", Limit)); - /// Clean up the loop's induction variables. + PHINode *CanonicalIV = canonicalizeIVs(Limit->getType()); if (!CanonicalIV) { DEBUG(dbgs() << "Could not get canonical IV.\n"); - // emitAnalysis(LoopSpawningReport() - // << "Could not get a canonical IV.\n"); ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoCanonicalIV", L->getStartLoc(), Header) << "could not find or create canonical IV"); return false; } - const SCEVAddRecExpr *CanonicalSCEV = - cast(SE.getSCEV(CanonicalIV)); - // Remove all IV's other can CanonicalIV. 
- // First, check that we can do this. - bool CanRemoveIVs = true; - for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { - PHINode *PN = cast(II); - if (CanonicalIV == PN) continue; - // dbgs() << "IV " << *PN; - const SCEV *S = SE.getSCEV(PN); - // dbgs() << " SCEV " << *S << "\n"; - if (SE.getCouldNotCompute() == S) { - // emitAnalysis(LoopSpawningReport(PN) - // << "Could not compute the scalar evolution.\n"); - ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoSCEV", PN) - << "could not compute scalar evolution of " - << NV("PHINode", PN)); - CanRemoveIVs = false; - } - } + // Remove the IV's (other than CanonicalIV) and replace them with + // their stronger forms. + // + // TODO?: We can probably adapt this loop->DAC process such that we + // don't require all IV's to be canonical. + SmallVector IVs; + SCEVExpander Exp(SE, M->getDataLayout(), "ls"); + if (!removeNonCanonicalIVs(Header, Preheader, CanonicalIV, IVs, Exp)) + return false; - if (!CanRemoveIVs) { - DEBUG(dbgs() << "Could not compute scalar evolutions for all IV's.\n"); - return false; - } - - //////////////////////////////////////////////////////////////////////// - // We now have everything we need to extract the loop. It's time to - // do some surgery. - - SCEVExpander Exp(SE, M->getDataLayout(), "ls"); - - // Remove the IV's (other than CanonicalIV) and replace them with - // their stronger forms. - // - // TODO?: We can probably adapt this process such that we don't require all - // IV's to be canonical. - { - SmallVector IVsToRemove; - for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { - PHINode *PN = cast(II); - if (PN == CanonicalIV) continue; - const SCEV *S = SE.getSCEV(PN); - Value *NewIV = Exp.expandCodeFor(S, S->getType(), CanonicalIV); - PN->replaceAllUsesWith(NewIV); - IVsToRemove.push_back(PN); - } - for (PHINode *PN : IVsToRemove) - PN->eraseFromParent(); - } - - // All remaining IV's should be canonical. Collect them. 
- // - // TODO?: We can probably adapt this process such that we don't require all - // IV's to be canonical. - SmallVector IVs; - bool AllCanonical = true; - for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { - PHINode *PN = cast(II); - DEBUG({ - const SCEVAddRecExpr *PNSCEV = - dyn_cast(SE.getSCEV(PN)); - assert(PNSCEV && "PHINode did not have corresponding SCEVAddRecExpr"); - assert(PNSCEV->getStart()->isZero() && - "PHINode SCEV does not start at 0"); - dbgs() << "LS step recurrence for SCEV " << *PNSCEV << " is " - << *(PNSCEV->getStepRecurrence(SE)) << "\n"; - assert(PNSCEV->getStepRecurrence(SE)->isOne() && - "PHINode SCEV step is not 1"); - }); - if (ConstantInt *C = - dyn_cast(PN->getIncomingValueForBlock(Preheader))) { - if (C->isZero()) - IVs.push_back(PN); - } else { - AllCanonical = false; - DEBUG(dbgs() << "Remaining non-canonical PHI Node found: " << *PN << "\n"); - // emitAnalysis(LoopSpawningReport(PN) - // << "Found a remaining non-canonical IV.\n"); - ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "NonCanonicalIV", PN) - << "found a remaining noncanonical IV"); - } - } - if (!AllCanonical) - return false; + const SCEVAddRecExpr *CanonicalSCEV = + cast(SE.getSCEV(CanonicalIV)); // Insert the computation for the loop limit into the Preheader. Value *LimitVar = Exp.expandCodeFor(Limit, Limit->getType(), @@ -1506,45 +1386,11 @@ bool CilkABILoopSpawning::processLoop() { SetVector Inputs, Outputs; SetVector BodyInputs, BodyOutputs; ValueToValueMapTy VMap, InputMap; - std::vector LoopBlocks; AllocaInst* closure; // Add start iteration, end iteration, and grainsize to inputs. { - LoopBlocks = L->getBlocks(); - // // Add exit blocks terminated by unreachable. There should not be any other - // // exit blocks in the loop. 
- // SmallSet UnreachableExits; - // for (BasicBlock *Exit : ExitBlocks) { - // if (Exit == ExitBlock) continue; - // assert(isa(Exit->getTerminator()) && - // "Found problematic exit block."); - // UnreachableExits.insert(Exit); - // } - - // // Add unreachable and exception-handling exits to the set of loop blocks to - // // clone. - // for (BasicBlock *BB : UnreachableExits) - // LoopBlocks.push_back(BB); - // for (BasicBlock *BB : EHExits) - // LoopBlocks.push_back(BB); - - // DEBUG({ - // dbgs() << "LoopBlocks: "; - // for (BasicBlock *LB : LoopBlocks) - // dbgs() << LB->getName() << "(" - // << *(LB->getTerminator()) << "), "; - // dbgs() << "\n"; - // }); - // Get the inputs and outputs for the loop body. - { - // CodeExtractor Ext(LoopBlocks, DT); - // Ext.findInputsOutputs(BodyInputs, BodyOutputs); - SmallPtrSet Blocks; - for (BasicBlock *BB : LoopBlocks) - Blocks.insert(BB); - findInputsOutputs(Blocks, BodyInputs, BodyOutputs); - } + findInputsOutputs(L->getBlocks(), BodyInputs, BodyOutputs); // Add argument for start of CanonicalIV. DEBUG({ @@ -1601,18 +1447,11 @@ bool CilkABILoopSpawning::processLoop() { } } Inputs.insert(closure); - //errs() << "\n"; - //for(auto& a : Inputs) a->dump(); - //errs() << "\n"; - //StartArg->dump(); - //ea->dump(); + Inputs.remove(StartArg); Inputs.insert(StartArg); Inputs.remove(ea); Inputs.insert(ea); - //errs() << "\n"; - //for(auto& a : Inputs) a->dump(); - //errs() << "\n"; for (Value *V : BodyInputsToRemove) BodyInputs.remove(V); assert(0 == BodyOutputs.size() && @@ -1630,19 +1469,17 @@ bool CilkABILoopSpawning::processLoop() { { SmallVector Returns; // Ignore returns cloned. 
- // LowerDbgDeclare(*(Header->getParent())); - Helper = CreateHelper(Inputs, Outputs, L->getBlocks(), Header, Preheader, ExitBlock/*L->getExitBlock()*/, VMap, M, - F->getSubprogram() != nullptr, Returns, ".ls", + OrigFunction->getSubprogram() != nullptr, Returns, ".ls", nullptr, nullptr, nullptr); assert(Returns.empty() && "Returns cloned when cloning loop."); // Use a fast calling convention for the helper. //Helper->setCallingConv(CallingConv::Fast); - // Helper->setCallingConv(Header->getParent()->getCallingConv()); + //Helper->setCallingConv(Header->getParent()->getCallingConv()); } BasicBlock *NewPreheader = cast(VMap[Preheader]); @@ -1782,3 +1619,46 @@ bool CilkABILoopSpawning::processLoop() { return Helper; } + +bool llvm::CilkABI::processLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE) { + if (LSH.getStrategy() != LoopSpawningHints::ST_DAC) + return false; + + if (LSH.getStrategy() == LoopSpawningHints::ST_DAC) + return processDACLoop(LSH, LI, SE, DT, AC, ORE); + + DEBUG(dbgs() << "LS: Using CilkABI spawning.\n"); + + Loop* L = LSH.TheLoop; + + DebugLoc DLoc = L->getStartLoc(); + BasicBlock *Header = L->getHeader(); + CilkABILoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE); + if (DLS.processLoop()) { + DEBUG({ + if (verifyFunction(*L->getHeader()->getParent())) { + dbgs() << "Transformed function is invalid.\n"; + return false; + } + }); + // Report success. + ORE.emit(OptimizationRemark(LS_NAME, "DACSpawning", DLoc, Header) + << "spawning iterations using divide-and-conquer"); + return true; + } else { + // Report failure. 
+ ORE.emit(OptimizationRemarkMissed(LS_NAME, "NoDACSpawning", DLoc, + Header) + << "cannot spawn iterations using divide-and-conquer"); + + ORE.emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "FailedRequestedSpawning", + L->getStartLoc(), L->getHeader()) + << "Tapir loop not transformed: " + << "failed to use divide-and-conquer loop spawning"); + return false; + } + + return false; +} \ No newline at end of file diff --git a/lib/Transforms/Tapir/LoopSpawning.cpp b/lib/Transforms/Tapir/LoopSpawning.cpp index 50e3a0a7c86..0860d173459 100644 --- a/lib/Transforms/Tapir/LoopSpawning.cpp +++ b/lib/Transforms/Tapir/LoopSpawning.cpp @@ -61,8 +61,6 @@ using namespace llvm; #define DEBUG_TYPE LS_NAME STATISTIC(LoopsAnalyzed, "Number of Tapir loops analyzed"); -STATISTIC(LoopsConvertedToDAC, - "Number of Tapir loops converted to divide-and-conquer iteration spawning"); static cl::opt ClTapirTarget( "ls-tapir-target", cl::desc("Target runtime for Tapir"), @@ -81,35 +79,10 @@ static cl::opt ClTapirTarget( "ptx", "PTX"))); namespace { -// /// \brief This modifies LoopAccessReport to initialize message with -// /// tapir-loop-specific part. -// class LoopSpawningReport : public LoopAccessReport { -// public: -// LoopSpawningReport(Instruction *I = nullptr) -// : LoopAccessReport("loop-spawning: ", I) {} - -// /// \brief This allows promotion of the loop-access analysis report into the -// /// loop-spawning report. It modifies the message to add the -// /// loop-spawning-specific part of the message. 
-// explicit LoopSpawningReport(const LoopAccessReport &R) -// : LoopAccessReport(Twine("loop-spawning: ") + R.str(), -// R.getInstr()) {} -// }; - -// static void emitAnalysisDiag(const Loop *TheLoop, -// OptimizationRemarkEmitter &ORE, -// const LoopAccessReport &Message) { -// const char *Name = LS_NAME; -// LoopAccessReport::emitAnalysis(Message, TheLoop, Name, ORE); -// } static void emitMissedWarning(Function *F, Loop *L, const LoopSpawningHints &LH, OptimizationRemarkEmitter *ORE) { - // ORE->emit(OptimizationRemarkMissed( - // LS_NAME, "LSHint", L->getStartLoc(), L->getHeader()) - // << "Strategy = " - // << LoopSpawningHints::printStrategy(LH.getStrategy())); switch (LH.getStrategy()) { case LoopSpawningHints::ST_DAC: ORE->emit(DiagnosticInfoOptimizationFailure( @@ -142,74 +115,7 @@ static void emitMissedWarning(Function *F, Loop *L, } } -/// DACLoopSpawning implements the transformation to spawn the iterations of a -/// Tapir loop in a recursive divide-and-conquer fashion. -class DACLoopSpawning : public LoopOutline { -public: - // DACLoopSpawning(Loop *OrigLoop, ScalarEvolution &SE, - // LoopInfo *LI, DominatorTree *DT, - // const TargetLibraryInfo *TLI, - // const TargetTransformInfo *TTI, - // OptimizationRemarkEmitter *ORE) - // : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), - // TLI(TLI), TTI(TTI), ORE(ORE) - // {} - TapirTarget* tapirTarget; - DACLoopSpawning(Loop *OrigLoop, unsigned Grainsize, - ScalarEvolution &SE, - LoopInfo *LI, DominatorTree *DT, - AssumptionCache *AC, - OptimizationRemarkEmitter &ORE, TapirTarget* tapirTarget) - : LoopOutline(OrigLoop, SE, LI, DT, AC, ORE), - tapirTarget(tapirTarget), - SpecifiedGrainsize(Grainsize) - {} - - bool processLoop(); - - virtual ~DACLoopSpawning() {} - -protected: - Value* computeGrainsize(Value *Limit); - void implementDACIterSpawnOnHelper(Function *Helper, - BasicBlock *Preheader, - BasicBlock *Header, - PHINode *CanonicalIV, - Argument *Limit, - Argument *Grainsize, - Instruction *SyncRegion, - 
DominatorTree *DT, - LoopInfo *LI, - bool CanonicalIVFlagNUW = false, - bool CanonicalIVFlagNSW = false); - unsigned SpecifiedGrainsize; -// private: -// /// Report an analysis message to assist the user in diagnosing loops that are -// /// not transformed. These are handled as LoopAccessReport rather than -// /// VectorizationReport because the << operator of LoopSpawningReport returns -// /// LoopAccessReport. -// void emitAnalysis(const LoopAccessReport &Message) const { -// emitAnalysisDiag(OrigLoop, *ORE, Message); -// } -}; - struct LoopSpawningImpl { - // LoopSpawningImpl(Function &F, LoopInfo &LI, ScalarEvolution &SE, - // DominatorTree &DT, - // const TargetTransformInfo &TTI, - // const TargetLibraryInfo *TLI, - // AliasAnalysis &AA, AssumptionCache &AC, - // OptimizationRemarkEmitter &ORE) - // : F(&F), LI(&LI), SE(&SE), DT(&DT), TTI(&TTI), TLI(TLI), - // AA(&AA), AC(&AC), ORE(&ORE) {} - // LoopSpawningImpl(Function &F, - // function_ref GetLI, - // function_ref GetSE, - // function_ref GetDT, - // OptimizationRemarkEmitter &ORE) - // : F(F), GetLI(GetLI), LI(nullptr), GetSE(GetSE), GetDT(GetDT), - // ORE(ORE) - // {} LoopSpawningImpl(Function &F, LoopInfo &LI, ScalarEvolution &SE, @@ -226,15 +132,9 @@ struct LoopSpawningImpl { bool processLoop(Loop *L); Function &F; - // function_ref GetLI; LoopInfo &LI; - // function_ref GetSE; - // function_ref GetDT; ScalarEvolution &SE; DominatorTree &DT; - // const TargetTransformInfo *TTI; - // const TargetLibraryInfo *TLI; - // AliasAnalysis *AA; AssumptionCache &AC; OptimizationRemarkEmitter &ORE; @@ -267,490 +167,19 @@ PHINode* LoopOutline::canonicalizeIVs(Type *Ty) { return CanonicalIV; } -/// \brief Replace the latch of the loop to check that IV is always less than or -/// equal to the limit. -/// -/// This method assumes that the loop has a single loop latch. 
-Value* LoopOutline::canonicalizeLoopLatch(PHINode *IV, Value *Limit) { - Loop *L = OrigLoop; - - Value *NewCondition; - BasicBlock *Header = L->getHeader(); - BasicBlock *Latch = L->getLoopLatch(); - assert(Latch && "No single loop latch found for loop."); - - IRBuilder<> Builder(&*Latch->getFirstInsertionPt()); - - // This process assumes that IV's increment is in Latch. - - // Create comparison between IV and Limit at top of Latch. - NewCondition = Builder.CreateICmpULT(IV, Limit); - - // Replace the conditional branch at the end of Latch. - BranchInst *LatchBr = dyn_cast_or_null(Latch->getTerminator()); - assert(LatchBr && LatchBr->isConditional() && - "Latch does not terminate with a conditional branch."); - Builder.SetInsertPoint(Latch->getTerminator()); - Builder.CreateCondBr(NewCondition, Header, ExitBlock); - - // Erase the old conditional branch. - Value *OldCond = LatchBr->getCondition(); - LatchBr->eraseFromParent(); - if (!OldCond->hasNUsesOrMore(1)) - if (Instruction *OldCondInst = dyn_cast(OldCond)) - OldCondInst->eraseFromParent(); - - return NewCondition; -} - -/// Unlink the specified loop, and update analysis accordingly. The heavy -/// lifting of deleting the loop is carried out by a run of LoopDeletion after -/// this pass. -void LoopOutline::unlinkLoop() { - Loop *L = OrigLoop; - - // Get components of the old loop. - BasicBlock *Preheader = L->getLoopPreheader(); - assert(Preheader && "Loop does not have a unique preheader."); - BasicBlock *Latch = L->getLoopLatch(); - - // Invalidate the analysis of the old loop. - SE.forgetLoop(L); - - // Redirect the preheader to branch directly to loop exit. - assert(1 == Preheader->getTerminator()->getNumSuccessors() && - "Preheader does not have a unique successor."); - Preheader->getTerminator()->replaceUsesOfWith(L->getHeader(), - ExitBlock); - - // Rewrite phis in the exit block to get their inputs from - // the preheader instead of the exiting block. 
- BasicBlock::iterator BI = ExitBlock->begin(); - while (PHINode *P = dyn_cast(BI)) { - int j = P->getBasicBlockIndex(Latch); - assert(j >= 0 && "Can't find exiting block in exit block's phi node!"); - P->setIncomingBlock(j, Preheader); - P->removeIncomingValue(Latch); - ++BI; - } - - // Rewrite phis in the header block to not receive an input from - // the preheader. - BI = L->getHeader()->begin(); - while (PHINode *P = dyn_cast(BI)) { - P->removeIncomingValue(Preheader); - ++BI; - } -} - -/// \brief Compute the grainsize of the loop, based on the limit. -/// -/// The grainsize is computed by the following equation: -/// -/// Grainsize = min(2048, ceil(Limit / (8 * workers))) -/// -/// This computation is inserted into the preheader of the loop. -/// -/// TODO: This method is the only method that depends on the CilkABI. -/// Generalize this method for other grainsize calculations and to query TLI. -Value* DACLoopSpawning::computeGrainsize(Value *Limit) { - Loop *L = OrigLoop; - - Value *Grainsize; - BasicBlock *Preheader = L->getLoopPreheader(); - assert(Preheader && "No Preheader found for loop."); - - IRBuilder<> Builder(Preheader->getTerminator()); - - // Get 8 * workers - Value *Workers8 = Builder.CreateIntCast(tapirTarget->GetOrCreateWorker8(*Preheader->getParent()), - Limit->getType(), false); - // Compute ceil(limit / 8 * workers) = (limit + 8 * workers - 1) / (8 * workers) - Value *SmallLoopVal = - Builder.CreateUDiv(Builder.CreateSub(Builder.CreateAdd(Limit, Workers8), - ConstantInt::get(Limit->getType(), 1)), - Workers8); - // Compute min - Value *LargeLoopVal = ConstantInt::get(Limit->getType(), 2048); - Value *Cmp = Builder.CreateICmpULT(LargeLoopVal, SmallLoopVal); - Grainsize = Builder.CreateSelect(Cmp, LargeLoopVal, SmallLoopVal); - - return Grainsize; -} - -/// \brief Method to help convertLoopToDACIterSpawn convert the Tapir -/// loop cloned into function Helper to spawn its iterations in a -/// parallel divide-and-conquer fashion. 
-/// -/// Example: Suppose that Helper contains the following Tapir loop: -/// -/// Helper(iter_t start, iter_t end, iter_t grain, ...) { -/// iter_t i = start; -/// ... Other loop setup ... -/// do { -/// spawn { ... loop body ... }; -/// } while (i++ < end); -/// sync; -/// } -/// -/// Then this method transforms Helper into the following form: -/// -/// Helper(iter_t start, iter_t end, iter_t grain, ...) { -/// recur: -/// iter_t itercount = end - start; -/// if (itercount > grain) { -/// // Invariant: itercount >= 2 -/// count_t miditer = start + itercount / 2; -/// spawn Helper(start, miditer, grain, ...); -/// start = miditer + 1; -/// goto recur; -/// } -/// -/// iter_t i = start; -/// ... Other loop setup ... -/// do { -/// ... Loop Body ... -/// } while (i++ < end); -/// sync; -/// } -/// -void DACLoopSpawning::implementDACIterSpawnOnHelper(Function *Helper, - BasicBlock *Preheader, - BasicBlock *Header, - PHINode *CanonicalIV, - Argument *Limit, - Argument *Grainsize, - Instruction *SyncRegion, - DominatorTree *DT, - LoopInfo *LI, - bool CanonicalIVFlagNUW, - bool CanonicalIVFlagNSW) { - // Serialize the cloned copy of the loop. - assert(Preheader->getParent() == Helper && - "Preheader does not belong to helper function."); - assert(Header->getParent() == Helper && - "Header does not belong to helper function."); - assert(CanonicalIV->getParent() == Header && - "CanonicalIV does not belong to header"); - assert(isa(Header->getTerminator()) && - "Cloned header is not terminated by a detach."); - DetachInst *DI = dyn_cast(Header->getTerminator()); - SerializeDetachedCFG(DI, DT); - - // Convert the cloned loop into the strip-mined loop body. - - BasicBlock *DACHead = Preheader; - if (&(Helper->getEntryBlock()) == Preheader) - // Split the entry block. We'll want to create a backedge into - // the split block later. 
- DACHead = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI); - - BasicBlock *RecurHead, *RecurDet, *RecurCont; - Value *IterCount; - Value *CanonicalIVInput; - PHINode *CanonicalIVStart; - { - Instruction *PreheaderOrigFront = &(DACHead->front()); - IRBuilder<> Builder(PreheaderOrigFront); - // Create branch based on grainsize. - DEBUG(dbgs() << "LS CanonicalIV: " << *CanonicalIV << "\n"); - CanonicalIVInput = CanonicalIV->getIncomingValueForBlock(DACHead); - CanonicalIVStart = Builder.CreatePHI(CanonicalIV->getType(), 2, - CanonicalIV->getName()+".dac"); - CanonicalIVInput->replaceAllUsesWith(CanonicalIVStart); - IterCount = Builder.CreateSub(Limit, CanonicalIVStart, - "itercount"); - Value *IterCountCmp = Builder.CreateICmpUGT(IterCount, Grainsize); - TerminatorInst *RecurTerm = - SplitBlockAndInsertIfThen(IterCountCmp, PreheaderOrigFront, - /*Unreachable=*/false, - /*BranchWeights=*/nullptr, - DT); - RecurHead = RecurTerm->getParent(); - // Create skeleton of divide-and-conquer recursion: - // DACHead -> RecurHead -> RecurDet -> RecurCont -> DACHead - RecurDet = SplitBlock(RecurHead, RecurHead->getTerminator(), - DT, LI); - RecurCont = SplitBlock(RecurDet, RecurDet->getTerminator(), - DT, LI); - RecurCont->getTerminator()->replaceUsesOfWith(RecurTerm->getSuccessor(0), - DACHead); - } - - // Compute mid iteration in RecurHead. - Value *MidIter, *MidIterPlusOne; - { - IRBuilder<> Builder(&(RecurHead->front())); - MidIter = Builder.CreateAdd(CanonicalIVStart, - Builder.CreateLShr(IterCount, 1, - "halfcount"), - "miditer", - CanonicalIVFlagNUW, CanonicalIVFlagNSW); - } - - // Create recursive call in RecurDet. - { - // Create input array for recursive call. - IRBuilder<> Builder(&(RecurDet->front())); - SetVector RecurInputs; - Function::arg_iterator AI = Helper->arg_begin(); - // Handle an initial sret argument, if necessary. Based on how - // the Helper function is created, any sret parameter will be the - // first parameter. 
- if (Helper->hasParamAttribute(0, Attribute::StructRet)) - RecurInputs.insert(&*AI++); - assert(cast(CanonicalIVInput) == &*AI && - "First non-sret argument does not match original input to canonical IV."); - RecurInputs.insert(CanonicalIVStart); - ++AI; - assert(Limit == &*AI && - "Second non-sret argument does not match original input to the loop limit."); - RecurInputs.insert(MidIter); - ++AI; - for (Function::arg_iterator AE = Helper->arg_end(); - AI != AE; ++AI) - RecurInputs.insert(&*AI); - DEBUG({ - dbgs() << "RecurInputs: "; - for (Value *Input : RecurInputs) - dbgs() << *Input << ", "; - dbgs() << "\n"; - }); - - // Create call instruction. - CallInst *RecurCall = Builder.CreateCall(Helper, RecurInputs.getArrayRef()); - RecurCall->setDebugLoc(Header->getTerminator()->getDebugLoc()); - // Use a fast calling convention for the helper. - RecurCall->setCallingConv(CallingConv::Fast); - // RecurCall->setCallingConv(Helper->getCallingConv()); - // // Update CG graph with the recursive call we just added. - // CG[Helper]->addCalledFunction(RecurCall, CG[Helper]); - } - - // Set up continuation of detached recursive call. We effectively - // inline this tail call automatically. - { - IRBuilder<> Builder(&(RecurCont->front())); - MidIterPlusOne = Builder.CreateAdd(MidIter, - ConstantInt::get(Limit->getType(), 1), - "miditerplusone", - CanonicalIVFlagNUW, - CanonicalIVFlagNSW); - } - - // Finish setup of new phi node for canonical IV. - { - CanonicalIVStart->addIncoming(CanonicalIVInput, Preheader); - CanonicalIVStart->addIncoming(MidIterPlusOne, RecurCont); - } - - /// Make the recursive DAC parallel. - { - IRBuilder<> Builder(RecurHead->getTerminator()); - // Create the detach. - DetachInst *DI = Builder.CreateDetach(RecurDet, RecurCont, SyncRegion); - DI->setDebugLoc(Header->getTerminator()->getDebugLoc()); - RecurHead->getTerminator()->eraseFromParent(); - // Create the reattach. 
- Builder.SetInsertPoint(RecurDet->getTerminator()); - ReattachInst *RI = Builder.CreateReattach(RecurCont, SyncRegion); - RI->setDebugLoc(Header->getTerminator()->getDebugLoc()); - RecurDet->getTerminator()->eraseFromParent(); - } -} - -/// Helper routine to get all exit blocks of a loop that are unreachable. -static void getEHExits(Loop *L, const BasicBlock *DesignatedExitBlock, - SmallVectorImpl &EHExits) { - SmallVector ExitBlocks; - L->getExitBlocks(ExitBlocks); - - SmallVector WorkList; - for (BasicBlock *Exit : ExitBlocks) { - if (Exit == DesignatedExitBlock) continue; - EHExits.push_back(Exit); - WorkList.push_back(Exit); - } - - // Traverse the CFG from these frontier blocks to find all blocks involved in - // exception-handling exit code. - SmallPtrSet Visited; - while (!WorkList.empty()) { - BasicBlock *BB = WorkList.pop_back_val(); - if (!Visited.insert(BB).second) - continue; - - // Check that the exception handling blocks do not reenter the loop. - assert(!L->contains(BB) && - "Exception handling blocks re-enter loop."); - - for (BasicBlock *Succ : successors(BB)) { - EHExits.push_back(Succ); - WorkList.push_back(Succ); - } - } -} - -/// Convert a pointer to an integer type. -/// -/// Copied from Transforms/Vectorizer/LoopVectorize.cpp. -static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { - if (Ty->isPointerTy()) - return DL.getIntPtrType(Ty); - - // It is possible that char's or short's overflow when we ask for the loop's - // trip count, work around this by changing the type size. - if (Ty->getScalarSizeInBits() < 32) - return Type::getInt32Ty(Ty->getContext()); - - return Ty; -} - -/// Get the wider of two integer types. -/// -/// Copied from Transforms/Vectorizer/LoopVectorize.cpp. 
-static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { - Ty0 = convertPointerToIntegerType(DL, Ty0); - Ty1 = convertPointerToIntegerType(DL, Ty1); - if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) - return Ty0; - return Ty1; -} - -/// Top-level call to convert loop to spawn its iterations in a -/// divide-and-conquer fashion. -bool DACLoopSpawning::processLoop() { - if (!tapirTarget) { - return false; - } - - Loop *L = OrigLoop; - - BasicBlock *Header = L->getHeader(); - BasicBlock *Preheader = L->getLoopPreheader(); - BasicBlock *Latch = L->getLoopLatch(); - - DEBUG({ - LoopBlocksDFS DFS(L); - DFS.perform(LI); - dbgs() << "Blocks in loop (from DFS):\n"; - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) - dbgs() << *BB; - }); - - using namespace ore; - - // Check that this loop has a valid exit block after the latch. - if (!ExitBlock) { - DEBUG(dbgs() << "LS loop does not contain valid exit block after latch.\n"); - ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "InvalidLatchExit", - L->getStartLoc(), - Header) - << "invalid latch exit"); - return false; - } - - // Get special exits from this loop. - SmallVector EHExits; - getEHExits(L, ExitBlock, EHExits); - - // Check the exit blocks of the loop. 
- SmallVector ExitBlocks; - L->getExitBlocks(ExitBlocks); - - for (const BasicBlock *Exit : ExitBlocks) { - if (Exit == ExitBlock) continue; - if (Exit->isLandingPad()) { - DEBUG({ - const LandingPadInst *LPI = Exit->getLandingPadInst(); - dbgs() << "landing pad found: " << *LPI << "\n"; - for (const User *U : LPI->users()) - dbgs() << "\tuser " << *U << "\n"; - }); - } - } - SmallPtrSet HandledExits; - for (BasicBlock *BB : EHExits) - HandledExits.insert(BB); - for (BasicBlock *Exit : ExitBlocks) { - if (Exit == ExitBlock) continue; - if (!HandledExits.count(Exit)) { - DEBUG(dbgs() << "LS loop contains a bad exit block " << *Exit); - ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "BadExit", - L->getStartLoc(), - Header) - << "bad exit block found"); - return false; - } - } - - Function *F = Header->getParent(); - Module* M = F->getParent(); - - DEBUG(dbgs() << "LS loop header:" << *Header); - DEBUG(dbgs() << "LS loop latch:" << *Latch); - DEBUG(dbgs() << "LS SE exit count: " << *(SE.getExitCount(L, Latch)) << "\n"); - - /// Get loop limit. - const SCEV *Limit = SE.getExitCount(L, Latch); - DEBUG(dbgs() << "LS Loop limit: " << *Limit << "\n"); - // PredicatedScalarEvolution PSE(SE, *L); - // const SCEV *PLimit = PSE.getExitCount(L, Latch); - // DEBUG(dbgs() << "LS predicated loop limit: " << *PLimit << "\n"); - // emitAnalysis(LoopSpawningReport() - // << "computed loop limit " << *Limit << "\n"); - if (SE.getCouldNotCompute() == Limit) { - DEBUG(dbgs() << "SE could not compute loop limit.\n"); - ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "UnknownLoopLimit", - L->getStartLoc(), - Header) - << "could not compute limit"); - return false; - } - // ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "LoopLimit", L->getStartLoc(), - // Header) - // << "loop limit: " << NV("Limit", Limit)); - /// Determine the type of the canonical IV. 
- Type *CanonicalIVTy = Limit->getType(); - { - const DataLayout &DL = M->getDataLayout(); - for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { - PHINode *PN = cast(II); - if (PN->getType()->isFloatingPointTy()) continue; - CanonicalIVTy = getWiderType(DL, PN->getType(), CanonicalIVTy); - } - Limit = SE.getNoopOrAnyExtend(Limit, CanonicalIVTy); - } - /// Clean up the loop's induction variables. - PHINode *CanonicalIV = canonicalizeIVs(CanonicalIVTy); - if (!CanonicalIV) { - DEBUG(dbgs() << "Could not get canonical IV.\n"); - // emitAnalysis(LoopSpawningReport() - // << "Could not get a canonical IV.\n"); - ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoCanonicalIV", - L->getStartLoc(), - Header) - << "could not find or create canonical IV"); - return false; - } - const SCEVAddRecExpr *CanonicalSCEV = - cast(SE.getSCEV(CanonicalIV)); - +// IVs is output +bool LoopOutline::removeNonCanonicalIVs(BasicBlock* Header, BasicBlock* Preheader, PHINode* CanonicalIV, SmallVector &IVs, SCEVExpander &Exp) { // Remove all IV's other than CanonicalIV. // First, check that we can do this. bool CanRemoveIVs = true; for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { PHINode *PN = cast(II); if (CanonicalIV == PN) continue; - // dbgs() << "IV " << *PN; const SCEV *S = SE.getSCEV(PN); - // dbgs() << " SCEV " << *S << "\n"; if (SE.getCouldNotCompute() == S) { - // emitAnalysis(LoopSpawningReport(PN) - // << "Could not compute the scalar evolution.\n"); ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoSCEV", PN) << "could not compute scalar evolution of " - << NV("PHINode", PN)); + << ore::NV("PHINode", PN)); CanRemoveIVs = false; } } @@ -760,17 +189,6 @@ bool DACLoopSpawning::processLoop() { return false; } - //////////////////////////////////////////////////////////////////////// - // We now have everything we need to extract the loop. It's time to - // do some surgery. 
- - SCEVExpander Exp(SE, M->getDataLayout(), "ls"); - - // Remove the IV's (other than CanonicalIV) and replace them with - // their stronger forms. - // - // TODO?: We can probably adapt this loop->DAC process such that we - // don't require all IV's to be canonical. { SmallVector IVsToRemove; for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { @@ -780,7 +198,7 @@ bool DACLoopSpawning::processLoop() { DEBUG(dbgs() << "Removing the IV " << *PN << " (" << *S << ")\n"); ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "RemoveIV", PN) << "removing the IV " - << NV("PHINode", PN)); + << ore::NV("PHINode", PN)); Value *NewIV = Exp.expandCodeFor(S, S->getType(), CanonicalIV); PN->replaceAllUsesWith(NewIV); IVsToRemove.push_back(PN); @@ -793,7 +211,6 @@ bool DACLoopSpawning::processLoop() { // // TODO?: We can probably adapt this loop->DAC process such that we // don't require all IV's to be canonical. - SmallVector IVs; bool AllCanonical = true; for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { PHINode *PN = cast(II); @@ -821,7 +238,7 @@ bool DACLoopSpawning::processLoop() { if (PN != CanonicalIV) ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "SaveIV", PN) << "saving the canonical the IV " - << NV("PHINode", PN)); + << ore::NV("PHINode", PN)); IVs.push_back(PN); } } else { @@ -835,277 +252,26 @@ bool DACLoopSpawning::processLoop() { } } if (!AllCanonical) - return false; - - // Insert the computation for the loop limit into the Preheader. - Value *LimitVar = Exp.expandCodeFor(Limit, CanonicalIVTy, - Preheader->getTerminator()); - DEBUG(dbgs() << "LimitVar: " << *LimitVar << "\n"); - - // Canonicalize the loop latch. - assert(SE.isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT, - CanonicalSCEV, Limit) && - "Loop backedge is not guarded by canonical comparison with limit."); - Value *NewCond = canonicalizeLoopLatch(CanonicalIV, LimitVar); - - // Insert computation of grainsize into the Preheader. 
- // For debugging: - // Value *GrainVar = ConstantInt::get(Limit->getType(), 2); - Value *GrainVar; - if (!SpecifiedGrainsize) - GrainVar = computeGrainsize(LimitVar); - else - GrainVar = ConstantInt::get(LimitVar->getType(), SpecifiedGrainsize); - - DEBUG(dbgs() << "GrainVar: " << *GrainVar << "\n"); - // emitAnalysis(LoopSpawningReport() - // << "grainsize value " << *GrainVar << "\n"); - // ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "UsingGrainsize", - // L->getStartLoc(), Header) - // << "grainsize: " << NV("Grainsize", GrainVar)); - - /// Clone the loop into a new function. - - // Get the inputs and outputs for the Loop blocks. - SetVector Inputs, Outputs; - SetVector BodyInputs, BodyOutputs; - ValueToValueMapTy VMap, InputMap; - std::vector LoopBlocks; - SmallPtrSet ExitsToSplit; - Value *SRetInput = nullptr; - - // Get the sync region containing this Tapir loop. - const Instruction *InputSyncRegion; - { - const DetachInst *DI = cast(Header->getTerminator()); - InputSyncRegion = cast(DI->getSyncRegion()); - } + return false; +} - // Add start iteration, end iteration, and grainsize to inputs. - { - LoopBlocks = L->getBlocks(); - // // Add exit blocks terminated by unreachable. There should not be any other - // // exit blocks in the loop. - // SmallSet UnreachableExits; - // for (BasicBlock *Exit : ExitBlocks) { - // if (Exit == ExitBlock) continue; - // assert(isa(Exit->getTerminator()) && - // "Found problematic exit block."); - // UnreachableExits.insert(Exit); - // } - - // Add unreachable and exception-handling exits to the set of loop blocks to - // clone. 
- DEBUG({ - dbgs() << "Handled exits of loop:"; - for (BasicBlock *HE : HandledExits) - dbgs() << *HE; - dbgs() << "\n"; - }); - for (BasicBlock *HE : HandledExits) - LoopBlocks.push_back(HE); - { - const DetachInst *DI = cast(Header->getTerminator()); - BasicBlockEdge DetachEdge(Header, DI->getDetached()); - for (BasicBlock *HE : HandledExits) - if (!DT || !DT->dominates(DetachEdge, HE)) - ExitsToSplit.insert(HE); - DEBUG({ - dbgs() << "Loop exits to split:"; - for (BasicBlock *ETS : ExitsToSplit) - dbgs() << *ETS; - dbgs() << "\n"; - }); +// TODO +/* +bool LoopOutline::setIVStartingValues(Value* newStart, Value* NewCanonicalIV, BasicBlock* NewPreheader) { + if (auto startInst = dyn_cast(NewPreheader)) { + assert(DT->dominates(startInst, NewPreheader->getTerminator())); } - // DEBUG({ - // dbgs() << "LoopBlocks: "; - // for (BasicBlock *LB : LoopBlocks) - // dbgs() << LB->getName() << "(" - // << *(LB->getTerminator()) << "), "; - // dbgs() << "\n"; - // }); - - // Get the inputs and outputs for the loop body. - { - // CodeExtractor Ext(LoopBlocks, DT); - // Ext.findInputsOutputs(BodyInputs, BodyOutputs); - SmallPtrSet Blocks; - for (BasicBlock *BB : LoopBlocks) - Blocks.insert(BB); - findInputsOutputs(Blocks, BodyInputs, BodyOutputs, &ExitsToSplit); - } - - // Scan for any sret parameters in BodyInputs and add them first. - if (F->hasStructRetAttr()) { - Function::arg_iterator ArgIter = F->arg_begin(); - if (F->hasParamAttribute(0, Attribute::StructRet)) - if (BodyInputs.count(&*ArgIter)) - SRetInput = &*ArgIter; - if (F->hasParamAttribute(1, Attribute::StructRet)) { - ++ArgIter; - if (BodyInputs.count(&*ArgIter)) - SRetInput = &*ArgIter; - } - } - if (SRetInput) { - DEBUG(dbgs() << "sret input " << *SRetInput << "\n"); - Inputs.insert(SRetInput); - } - - // Add argument for start of CanonicalIV. - DEBUG({ - Value *CanonicalIVInput = - CanonicalIV->getIncomingValueForBlock(Preheader); - // CanonicalIVInput should be the constant 0. 
- assert(isa(CanonicalIVInput) && - "Input to canonical IV from preheader is not constant."); - }); - Argument *StartArg = new Argument(CanonicalIV->getType(), - CanonicalIV->getName()+".start"); - Inputs.insert(StartArg); - InputMap[CanonicalIV] = StartArg; - - // Add argument for end. - // - // In the general case, the loop limit is the result of some computation - // that the pass added to the loop's preheader. In this case, the variable - // storing the loop limit is used exactly once, in the canonicalized loop - // latch. In this case, the pass wants to prevent outlining from passing - // the loop-limit variable as an arbitrary argument to the outlined - // function. Hence, this pass adds the loop-limit variable as an argument - // manually. - // - // There are two special cases to consider: the loop limit is a constant, or - // the loop limit is used elsewhere within the loop. To handle these two - // cases, this pass adds an explict argument for the end of the loop, to - // supports the subsequent transformation to using recursive - // divide-and-conquer. After the loop is outlined, this pass will rewrite - // the latch in the outlined loop to use this explicit argument. - // Furthermore, this pass does not prevent outliner from recognizing the - // loop limit as a potential argument to the function. - if (isa(LimitVar) || !LimitVar->hasOneUse()) { - Argument *EndArg = new Argument(LimitVar->getType(), "end"); - Inputs.insert(EndArg); - InputMap[LimitVar] = EndArg; - } else { - // If the limit var is not constant and has exactly one use, then the - // limit var is the result of some nontrivial computation, and that one - // use is the new condition inserted. - Inputs.insert(LimitVar); - InputMap[LimitVar] = LimitVar; - } - - // Add argument for grainsize. 
- if (isa(GrainVar)) { - Argument *GrainArg = new Argument(GrainVar->getType(), "grainsize"); - Inputs.insert(GrainArg); - InputMap[GrainVar] = GrainArg; - } else { - Inputs.insert(GrainVar); - InputMap[GrainVar] = GrainVar; - } - - // Put all of the inputs together, and clear redundant inputs from - // the set for the loop body. - SmallVector BodyInputsToRemove; - for (Value *V : BodyInputs) - if (V == InputSyncRegion) - BodyInputsToRemove.push_back(V); - else if (!Inputs.count(V)) - Inputs.insert(V); - else - BodyInputsToRemove.push_back(V); - for (Value *V : BodyInputsToRemove) - BodyInputs.remove(V); - DEBUG({ - for (Value *V : BodyInputs) - dbgs() << "Remaining body input: " << *V << "\n"; - }); - for (Value *V : BodyOutputs) - dbgs() << "EL output: " << *V << "\n"; - assert(0 == BodyOutputs.size() && - "All results from parallel loop should be passed by memory already."); - } - DEBUG({ - for (Value *V : Inputs) - dbgs() << "EL input: " << *V << "\n"; - for (Value *V : Outputs) - dbgs() << "EL output: " << *V << "\n"; - }); - - // Clone the loop blocks into a new helper function. - Function *Helper; - { - SmallVector Returns; // Ignore returns cloned. - - // LowerDbgDeclare(*(Header->getParent())); - - Helper = CreateHelper(Inputs, Outputs, LoopBlocks, - Header, Preheader, ExitBlock, - VMap, M, - F->getSubprogram() != nullptr, Returns, ".ls", - &ExitsToSplit, InputSyncRegion, - nullptr, nullptr, nullptr); - - assert(Returns.empty() && "Returns cloned when cloning loop."); - - // Use a fast calling convention for the helper. - Helper->setCallingConv(CallingConv::Fast); - // Helper->setCallingConv(Header->getParent()->getCallingConv()); - } - - // Add a sync to the helper's return. 
- BasicBlock *HelperHeader = cast(VMap[Header]); - { - BasicBlock *HelperExit = cast(VMap[ExitBlock]); - assert(isa(HelperExit->getTerminator())); - BasicBlock *NewHelperExit = SplitBlock(HelperExit, - HelperExit->getTerminator(), - DT, LI); - IRBuilder<> Builder(&(HelperExit->front())); - SyncInst *NewSync = Builder.CreateSync( - NewHelperExit, - cast(VMap[InputSyncRegion])); - // Set debug info of new sync to match that of terminator of the header of - // the cloned loop. - NewSync->setDebugLoc(HelperHeader->getTerminator()->getDebugLoc()); - HelperExit->getTerminator()->eraseFromParent(); - } - - // // Add syncs to the helper's cloned resume blocks. - // for (BasicBlock *BB : Resumes) { - // BasicBlock *HelperResume = cast(VMap[BB]); - // assert(isa(HelperResume->getTerminator())); - // BasicBlock *NewHelperResume = SplitBlock(HelperResume, - // HelperResume->getTerminator(), - // DT, LI); - // IRBuilder<> Builder(&(HelperResume->front())); - // SyncInst *NewSync = Builder.CreateSync(NewHelperResume); - // // Set debug info of new sync to match that of terminator of the header of - // // the cloned loop. - // NewSync->setDebugLoc(HelperHeader->getTerminator()->getDebugLoc()); - // HelperResume->getTerminator()->eraseFromParent(); - // } - - BasicBlock *NewPreheader = cast(VMap[Preheader]); - PHINode *NewCanonicalIV = cast(VMap[CanonicalIV]); - - // Rewrite the cloned IV's to start at the start iteration argument. - { - // Rewrite clone of canonical IV to start at the start iteration - // argument. 
- Argument *NewCanonicalIVStart = cast(VMap[InputMap[CanonicalIV]]); { int NewPreheaderIdx = NewCanonicalIV->getBasicBlockIndex(NewPreheader); assert(isa(NewCanonicalIV->getIncomingValue(NewPreheaderIdx)) && "Cloned canonical IV does not inherit a constant value from cloned preheader."); - NewCanonicalIV->setIncomingValue(NewPreheaderIdx, NewCanonicalIVStart); + NewCanonicalIV->setIncomingValue(NewPreheaderIdx, newStart); } // Rewrite other cloned IV's to start at their value at the start // iteration. - const SCEV *StartIterSCEV = SE.getSCEV(NewCanonicalIVStart); + const SCEV *StartIterSCEV = SE.getSCEV(newStart); DEBUG(dbgs() << "StartIterSCEV: " << *StartIterSCEV << "\n"); for (PHINode *IV : IVs) { if (CanonicalIV == IV) continue; @@ -1123,7 +289,6 @@ bool DACLoopSpawning::processLoop() { Value *IVStart = Exp.expandCodeFor(IVAtIter, IVAtIter->getType(), NewPreheader->getTerminator()); - // Set the value that the cloned IV inherits from the cloned preheader. PHINode *NewIV = cast(VMap[IV]); int NewPreheaderIdx = NewIV->getBasicBlockIndex(NewPreheader); @@ -1131,179 +296,83 @@ bool DACLoopSpawning::processLoop() { "Cloned IV does not inherit a constant value from cloned preheader."); NewIV->setIncomingValue(NewPreheaderIdx, IVStart); } +} +*/ - // Remap the newly added instructions in the new preheader to use - // values local to the helper. - for (Instruction &II : *NewPreheader) - RemapInstruction(&II, VMap, RF_IgnoreMissingLocals, - /*TypeMapper=*/nullptr, /*Materializer=*/nullptr); - } +/// \brief Replace the latch of the loop to check that IV is always less than or +/// equal to the limit. +/// +/// This method assumes that the loop has a single loop latch. +Value* LoopOutline::canonicalizeLoopLatch(PHINode *IV, Value *Limit) { + Loop *L = OrigLoop; - // The loop has been outlined by this point. 
To handle the special cases - // where the loop limit was constant or used elsewhere within the loop, this - // pass rewrites the outlined loop-latch condition to use the explicit - // end-iteration argument. - if (isa(LimitVar) || !LimitVar->hasOneUse()) { - CmpInst *HelperCond = cast(VMap[NewCond]); - assert(((isa(LimitVar) && - HelperCond->getOperand(1) == LimitVar) || - (!LimitVar->hasOneUse() && - HelperCond->getOperand(1) == VMap[LimitVar])) && - "Unexpected condition in loop latch."); - IRBuilder<> Builder(HelperCond); - Value *NewHelperCond = Builder.CreateICmpULT(HelperCond->getOperand(0), - VMap[InputMap[LimitVar]]); - HelperCond->replaceAllUsesWith(NewHelperCond); - HelperCond->eraseFromParent(); - DEBUG(dbgs() << "Rewritten Latch: " << - *(cast(NewHelperCond)->getParent())); - } + Value *NewCondition; + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + assert(Latch && "No single loop latch found for loop."); - // DEBUGGING: Simply serialize the cloned loop. - // BasicBlock *NewHeader = cast(VMap[Header]); - // SerializeDetachedCFG(cast(NewHeader->getTerminator()), nullptr); - implementDACIterSpawnOnHelper(Helper, NewPreheader, - cast(VMap[Header]), - cast(VMap[CanonicalIV]), - cast(VMap[InputMap[LimitVar]]), - cast(VMap[InputMap[GrainVar]]), - cast(VMap[InputSyncRegion]), - /*DT=*/nullptr, /*LI=*/nullptr, - CanonicalSCEV->getNoWrapFlags(SCEV::FlagNUW), - CanonicalSCEV->getNoWrapFlags(SCEV::FlagNSW)); - - if (verifyFunction(*Helper, &dbgs())) - return false; + IRBuilder<> Builder(&*Latch->getFirstInsertionPt()); - // Update allocas in cloned loop body. - { - // Collect reattach instructions. - SmallVector ReattachPoints; - for (pred_iterator PI = pred_begin(Latch), PE = pred_end(Latch); - PI != PE; ++PI) { - BasicBlock *Pred = *PI; - if (!isa(Pred->getTerminator())) continue; - if (L->contains(Pred)) - ReattachPoints.push_back(cast(VMap[Pred])->getTerminator()); - } - // The cloned loop should be serialized by this point. 
- BasicBlock *ClonedLoopBodyEntry = - cast(VMap[Header])->getSingleSuccessor(); - assert(ClonedLoopBodyEntry && - "Head of cloned loop body has multiple successors."); - bool ContainsDynamicAllocas = - MoveStaticAllocasInBlock(&Helper->getEntryBlock(), ClonedLoopBodyEntry, - ReattachPoints); - - // If the cloned loop contained dynamic alloca instructions, wrap the cloned - // loop with llvm.stacksave/llvm.stackrestore intrinsics. - if (ContainsDynamicAllocas) { - Module *M = Helper->getParent(); - // Get the two intrinsics we care about. - Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave); - Function *StackRestore = - Intrinsic::getDeclaration(M,Intrinsic::stackrestore); - - // Insert the llvm.stacksave. - CallInst *SavedPtr = IRBuilder<>(&*ClonedLoopBodyEntry, - ClonedLoopBodyEntry->begin()) - .CreateCall(StackSave, {}, "savedstack"); - - // Insert a call to llvm.stackrestore before the reattaches in the - // original Tapir loop. - for (Instruction *ExitPoint : ReattachPoints) - IRBuilder<>(ExitPoint).CreateCall(StackRestore, SavedPtr); - } - } + // This process assumes that IV's increment is in Latch. - if (verifyFunction(*Helper, &dbgs())) - return false; + // Create comparison between IV and Limit at top of Latch. + NewCondition = Builder.CreateICmpULT(IV, Limit); - // Add alignment assumptions to arguments of helper, based on alignment of - // values in old function. - AddAlignmentAssumptions(F, Inputs, VMap, - Preheader->getTerminator(), AC, DT); + // Replace the conditional branch at the end of Latch. + BranchInst *LatchBr = dyn_cast_or_null(Latch->getTerminator()); + assert(LatchBr && LatchBr->isConditional() && + "Latch does not terminate with a conditional branch."); + Builder.SetInsertPoint(Latch->getTerminator()); + Builder.CreateCondBr(NewCondition, Header, ExitBlock); - // Add call to new helper function in original function. - { - // Setup arguments for call. - SmallVector TopCallArgs; - // Add sret input, if it exists. 
- if (SRetInput) - TopCallArgs.push_back(SRetInput); - // Add start iteration 0. - assert(CanonicalSCEV->getStart()->isZero() && - "Canonical IV does not start at zero."); - TopCallArgs.push_back(ConstantInt::get(CanonicalIV->getType(), 0)); - // Add loop limit. - TopCallArgs.push_back(LimitVar); - // Add grainsize. - TopCallArgs.push_back(GrainVar); - // Add the rest of the arguments. - for (Value *V : BodyInputs) - TopCallArgs.push_back(V); - DEBUG({ - for (Value *TCArg : TopCallArgs) - dbgs() << "Top call arg: " << *TCArg << "\n"; - }); + // Erase the old conditional branch. + Value *OldCond = LatchBr->getCondition(); + LatchBr->eraseFromParent(); + if (!OldCond->hasNUsesOrMore(1)) + if (Instruction *OldCondInst = dyn_cast(OldCond)) + OldCondInst->eraseFromParent(); - // Create call instruction. - IRBuilder<> Builder(Preheader->getTerminator()); - CallInst *TopCall = Builder.CreateCall(Helper, - ArrayRef(TopCallArgs)); - - // Use a fast calling convention for the helper. - TopCall->setCallingConv(CallingConv::Fast); - // TopCall->setCallingConv(Helper->getCallingConv()); - TopCall->setDebugLoc(Header->getTerminator()->getDebugLoc()); - // // Update CG graph with the call we just added. - // CG[F]->addCalledFunction(TopCall, CG[Helper]); - } + return NewCondition; +} - // Remove sync of loop in parent. - { - // Get the sync region for this loop's detached iterations. - DetachInst *HeadDetach = cast(Header->getTerminator()); - Value *SyncRegion = HeadDetach->getSyncRegion(); - // Check the Tapir instructions contained in this sync region. Look for a - // single sync instruction among those Tapir instructions. Meanwhile, - // verify that the only detach instruction in this sync region is the detach - // in theloop header. If these conditions are met, then we assume that the - // sync applies to this loop. Otherwise, something more complicated is - // going on, and we give up. 
- SyncInst *LoopSync = nullptr; - bool SingleSyncJustForLoop = true; - for (User *U : SyncRegion->users()) { - // Skip the detach in the loop header. - if (HeadDetach == U) continue; - // Remember the first sync instruction we find. If we find multiple sync - // instructions, then something nontrivial is going on. - if (SyncInst *SI = dyn_cast(U)) { - if (!LoopSync) - LoopSync = SI; - else - SingleSyncJustForLoop = false; - } - // If we find a detach instruction that is not the loop header's, then - // something nontrivial is going on. - if (isa(U)) - SingleSyncJustForLoop = false; - } - if (LoopSync && SingleSyncJustForLoop) - // Replace the sync with a branch. - ReplaceInstWithInst(LoopSync, - BranchInst::Create(LoopSync->getSuccessor(0))); - else if (!LoopSync) - DEBUG(dbgs() << "No sync found for this loop."); - else - DEBUG(dbgs() << "No single sync found that only affects this loop."); - } +/// Unlink the specified loop, and update analysis accordingly. The heavy +/// lifting of deleting the loop is carried out by a run of LoopDeletion after +/// this pass. +void LoopOutline::unlinkLoop() { + Loop *L = OrigLoop; - ++LoopsConvertedToDAC; + // Get components of the old loop. + BasicBlock *Preheader = L->getLoopPreheader(); + assert(Preheader && "Loop does not have a unique preheader."); + BasicBlock *Latch = L->getLoopLatch(); + + // Invalidate the analysis of the old loop. + SE.forgetLoop(L); + + // Redirect the preheader to branch directly to loop exit. + assert(1 == Preheader->getTerminator()->getNumSuccessors() && + "Preheader does not have a unique successor."); + Preheader->getTerminator()->replaceUsesOfWith(L->getHeader(), + ExitBlock); - unlinkLoop(); + // Rewrite phis in the exit block to get their inputs from + // the preheader instead of the exiting block. 
+ BasicBlock::iterator BI = ExitBlock->begin(); + while (PHINode *P = dyn_cast(BI)) { + int j = P->getBasicBlockIndex(Latch); + assert(j >= 0 && "Can't find exiting block in exit block's phi node!"); + P->setIncomingBlock(j, Preheader); + P->removeIncomingValue(Latch); + ++BI; + } - return Helper; + // Rewrite phis in the header block to not receive an input from + // the preheader. + BI = L->getHeader()->begin(); + while (PHINode *P = dyn_cast(BI)) { + P->removeIncomingValue(Preheader); + ++BI; + } } /// This routine recursively examines all descendants of the specified loop and @@ -1382,6 +451,7 @@ bool LoopSpawningImpl::run() { return Changed; } + // Top-level routine to process a given loop. bool LoopSpawningImpl::processLoop(Loop *L) { #ifndef NDEBUG @@ -1427,62 +497,8 @@ bool LoopSpawningImpl::processLoop(Loop *L) { case LoopSpawningHints::ST_SEQ: DEBUG(dbgs() << "LS: Hints dictate sequential spawning.\n"); break; - case LoopSpawningHints::ST_GPU: - DEBUG(dbgs() << "LS: Hints dictate GPU spawning.\n"); - { - DebugLoc DLoc = L->getStartLoc(); - BasicBlock *Header = L->getHeader(); - PTXABILoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE); - if (DLS.processLoop()) { - DEBUG({ - if (verifyFunction(*L->getHeader()->getParent())) { - dbgs() << "Transformed function is invalid.\n"; - return false; - } - }); - // Report success. - ORE.emit(OptimizationRemark(LS_NAME, "GPUSpawning", DLoc, Header) - << "spawning iterations using direct gpu mapping"); - return true; - } else { - // Report failure. 
- ORE.emit(OptimizationRemarkMissed(LS_NAME, "NoGPUSpawning", DLoc, - Header) - << "cannot spawn iterations using direct gpu mapping"); - emitMissedWarning(F, L, Hints, &ORE); - return false; - } - } - break; - case LoopSpawningHints::ST_DAC: - DEBUG(dbgs() << "LS: Hints dictate DAC spawning.\n"); - { - DebugLoc DLoc = L->getStartLoc(); - BasicBlock *Header = L->getHeader(); - DACLoopSpawning DLS(L, Hints.getGrainsize(), SE, &LI, &DT, &AC, ORE, tapirTarget); - // CilkABILoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE); - // DACLoopSpawning DLS(L, SE, LI, DT, TLI, TTI, ORE); - if (DLS.processLoop()) { - DEBUG({ - if (verifyFunction(*L->getHeader()->getParent())) { - dbgs() << "Transformed function is invalid.\n"; - return false; - } - }); - // Report success. - ORE.emit(OptimizationRemark(LS_NAME, "DACSpawning", DLoc, Header) - << "spawning iterations using divide-and-conquer"); - return true; - } else { - // Report failure. - ORE.emit(OptimizationRemarkMissed(LS_NAME, "NoDACSpawning", DLoc, - Header) - << "cannot spawn iterations using divide-and-conquer"); - emitMissedWarning(F, L, Hints, &ORE); - return false; - } - } - break; + default: + return tapirTarget->processLoop(Hints, LI, SE, DT, AC, ORE); case LoopSpawningHints::ST_END: dbgs() << "LS: Hints specify unknown spawning strategy.\n"; break; @@ -1490,39 +506,6 @@ bool LoopSpawningImpl::processLoop(Loop *L) { return false; } -// PreservedAnalyses LoopSpawningPass::run(Module &M, ModuleAnalysisManager &AM) { -// // Find functions that detach for processing. 
-// SmallVector WorkList; -// for (Function &F : M) -// for (BasicBlock &BB : F) -// if (isa(BB.getTerminator())) -// WorkList.push_back(&F); - -// if (WorkList.empty()) -// return PreservedAnalyses::all(); - -// bool Changed = false; -// while (!WorkList.empty()) { -// Function *F = WorkList.back(); -// auto &TLI = AM.getResult(M); -// auto &FAM = AM.getResult(M).getManager(); -// auto &LI = FAM.getResult(*F); -// auto &SE = FAM.getResult(*F); -// auto &DT = FAM.getResult(*F); -// auto &TTI = FAM.getResult(*F); -// auto &AA = FAM.getResult(*F); -// auto &AC = FAM.getResult(*F); -// auto &ORE = FAM.getResult(*F); -// LoopSpawningImpl Impl(*F, LI, SE, DT, TTI, &TLI, AA, AC, ORE); -// Changed |= Impl.run(); -// WorkList.pop_back(); -// } - -// if (Changed) -// return PreservedAnalyses::none(); -// return PreservedAnalyses::all(); -// } - PreservedAnalyses LoopSpawningPass::run(Function &F, FunctionAnalysisManager &AM) { // Determine if function detaches. @@ -1537,13 +520,9 @@ PreservedAnalyses LoopSpawningPass::run(Function &F, auto &LI = AM.getResult(F); auto &SE = AM.getResult(F); auto &DT = AM.getResult(F); - // auto &TTI = AM.getResult(F); - // auto &TLI = AM.getResult(M); - // auto &AA = AM.getResult(F); auto &AC = AM.getResult(F); auto &ORE = AM.getResult(F); - // OptimizationRemarkEmitter ORE(F); bool Changed = LoopSpawningImpl(F, LI, SE, DT, AC, ORE, tapirTarget).run(); @@ -1583,11 +562,6 @@ struct LoopSpawning : public FunctionPass { auto &LI = getAnalysis().getLoopInfo(); auto &SE = getAnalysis().getSE(); auto &DT = getAnalysis().getDomTree(); - // auto *TTI = &getAnalysis().getTTI(*F); - // auto *TLIP = getAnalysisIfAvailable(); - // auto *TLI = TLIP ? 
&TLIP->getTLI() : nullptr; - // auto *TLI = &getAnalysis().getTLI(); - // auto *AA = &getAnalysis(*F).getAAResults(); auto &AC = getAnalysis().getAssumptionCache(F); auto &ORE = getAnalysis().getORE(); @@ -1605,16 +579,12 @@ struct LoopSpawning : public FunctionPass { AU.addRequired(); AU.addRequired(); AU.addRequired(); - // AU.addRequired(); - // getAAResultsAnalysisUsage(AU); - // AU.addRequired(); AU.addRequired(); } }; } char LoopSpawning::ID = 0; -// static RegisterPass X(LS_NAME, "Transform Tapir loops to spawn iterations efficiently", false, false); static const char ls_name[] = "Loop Spawning"; INITIALIZE_PASS_BEGIN(LoopSpawning, LS_NAME, ls_name, false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) @@ -1625,8 +595,6 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -// INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) -// INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(LoopSpawning, LS_NAME, ls_name, false, false) diff --git a/lib/Transforms/Tapir/OpenMPABI.cpp b/lib/Transforms/Tapir/OpenMPABI.cpp index d80824982ef..c62a5c670ac 100644 --- a/lib/Transforms/Tapir/OpenMPABI.cpp +++ b/lib/Transforms/Tapir/OpenMPABI.cpp @@ -808,3 +808,8 @@ void llvm::OpenMPABI::postProcessHelper(Function &F) {} bool llvm::OpenMPABI::processMain(Function &F) { return false; } + +bool llvm::OpenMPABI::processLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE) { + return false; +} diff --git a/lib/Transforms/Tapir/Outline.cpp b/lib/Transforms/Tapir/Outline.cpp index 561133c37e8..6e87c3ffaa9 100644 --- a/lib/Transforms/Tapir/Outline.cpp +++ b/lib/Transforms/Tapir/Outline.cpp @@ -21,76 +21,11 @@ #include 
"llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/TapirUtils.h" +#include using namespace llvm; #define DEBUG_TYPE "outlining" -/// definedInRegion - Return true if the specified value is defined in the -/// extracted region. -static bool definedInRegion(const SmallPtrSetImpl &Blocks, - Value *V) { - if (Instruction *I = dyn_cast(V)) - if (Blocks.count(I->getParent())) - return true; - return false; -} - -/// definedInCaller - Return true if the specified value is defined in the -/// function being code extracted, but not in the region being extracted. -/// These values must be passed in as live-ins to the function. -static bool definedInCaller(const SmallPtrSetImpl &Blocks, - Value *V) { - if (isa(V)) return true; - if (Instruction *I = dyn_cast(V)) - if (!Blocks.count(I->getParent())) - return true; - return false; -} - -// findInputsOutputs - Find inputs and outputs for Blocks. Any blocks in -// ExitBlocks are handled in a special manner: PHI nodes in Exit Blocks are -// ignored when determining inputs. -void llvm::findInputsOutputs(const SmallPtrSetImpl &Blocks, - ValueSet &Inputs, ValueSet &Outputs, - const SmallPtrSetImpl *ExitBlocks, - DominatorTree *DT) { - for (BasicBlock *BB : Blocks) { - // If a used value is defined outside the region, it's an input. If an - // instruction is used outside the region, it's an output. - for (Instruction &II : *BB) { - for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE; - ++OI) { - // The PHI nodes in each exit block will be updated after the exit block - // is cloned. Hence, we don't want to count their uses of values - // defined outside the region. - if (ExitBlocks && ExitBlocks->count(BB)) - if (PHINode *PN = dyn_cast(&II)) - if (!Blocks.count(PN->getIncomingBlock(*OI))) - continue; - if (definedInCaller(Blocks, *OI)) - Inputs.insert(*OI); - } - - // Ignore outputs from exit blocks. 
- if (!ExitBlocks || !ExitBlocks->count(BB)) { - for (User *U : II.users()) { - if (!definedInRegion(Blocks, U)) { - // It looks like we have a use outside of the given blocks, but it's - // possible for the use to appear in a basic block that is no longer - // alive. We use the DT to check that this use is still alive. - if (Instruction *I = dyn_cast(U)) { - if (DT && DT->isReachableFromEntry(I->getParent())) { - Outputs.insert(&II); - break; - } - } - } - } - } - } - } -} - // Clone Blocks into NewFunc, transforming the old arguments into references to // VMap values. // diff --git a/lib/Transforms/Tapir/PTXABI.cpp b/lib/Transforms/Tapir/PTXABI.cpp index 246dfcdc068..a0e03f061dc 100644 --- a/lib/Transforms/Tapir/PTXABI.cpp +++ b/lib/Transforms/Tapir/PTXABI.cpp @@ -77,7 +77,9 @@ using namespace llvm; -namespace{ +#define DEBUG_TYPE "ptxabi" + +namespace { template Function* getFunction(Module& M, const char* name){ @@ -720,3 +722,43 @@ bool PTXABILoopSpawning::processLoop(){ return true; } + +bool llvm::PTXABI::processLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE) { + if (LSH.getStrategy() != LoopSpawningHints::ST_GPU) + return false; + + Loop* L = LSH.TheLoop; + DEBUG(dbgs() << "LS: Hints dictate GPU spawning.\n"); + { + DebugLoc DLoc = L->getStartLoc(); + BasicBlock *Header = L->getHeader(); + PTXABILoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE); + if (DLS.processLoop()) { + DEBUG({ + if (verifyFunction(*L->getHeader()->getParent())) { + dbgs() << "Transformed function is invalid.\n"; + return false; + } + }); + // Report success. + ORE.emit(OptimizationRemark(LS_NAME, "GPUSpawning", DLoc, Header) + << "spawning iterations using direct gpu mapping"); + return true; + } else { + // Report failure. 
+ ORE.emit(OptimizationRemarkMissed(LS_NAME, "NoGPUSpawning", DLoc, + Header) + << "cannot spawn iterations using direct gpu mapping"); + + ORE.emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "FailedRequestedGPUSpawning", + L->getStartLoc(), L->getHeader()) + << "Tapir loop not transformed: " + << "failed to use direct gpu mapping"); + return false; + } + } + + return false; +} diff --git a/lib/Transforms/Tapir/QthreadsABI.cpp b/lib/Transforms/Tapir/QthreadsABI.cpp index 5320a58335f..83cee9bd887 100644 --- a/lib/Transforms/Tapir/QthreadsABI.cpp +++ b/lib/Transforms/Tapir/QthreadsABI.cpp @@ -267,3 +267,7 @@ bool QthreadsABI::processMain(Function &F) { return true; } +bool llvm::QthreadsABI::processLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE) { + return false; +} diff --git a/lib/Transforms/Tapir/TapirUtils.cpp b/lib/Transforms/Tapir/TapirUtils.cpp index 2583dd8f255..9439b8e7eea 100644 --- a/lib/Transforms/Tapir/TapirUtils.cpp +++ b/lib/Transforms/Tapir/TapirUtils.cpp @@ -11,12 +11,15 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Verifier.h" #include "llvm/Transforms/Tapir/CilkABI.h" #include "llvm/Transforms/Tapir/OpenMPABI.h" #include "llvm/Transforms/Tapir/PTXABI.h" #include "llvm/Transforms/Tapir/QthreadsABI.h" #include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/Transforms/Tapir/LoopSpawning.h" #include "llvm/Transforms/Utils/EscapeEnumerator.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/TapirUtils.h" @@ -683,3 +686,1033 @@ bool llvm::attemptSyncRegionElimination(Instruction *SyncRegion) { SyncRegion->eraseFromParent(); return true; } + +llvm::LoopSpawningHints::LoopSpawningHints(Loop *L) + : Strategy("spawn.strategy", ST_SEQ, HK_STRATEGY), + Grainsize("grainsize", 0, HK_GRAINSIZE), 
+ TheLoop(L) { + // Populate values with existing loop metadata. + getHintsFromMetadata(); +} + +LoopSpawningHints::SpawningStrategy +llvm::LoopSpawningHints::getStrategy() const { + return (SpawningStrategy)Strategy.Value; +} + +unsigned llvm::LoopSpawningHints::getGrainsize() const { + return Grainsize.Value; +} + +void llvm::LoopSpawningHints::getHintsFromMetadata() { + MDNode *LoopID = TheLoop->getLoopID(); + if (!LoopID) + return; + + // First operand should refer to the loop id itself. + assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + const MDString *S = nullptr; + SmallVector Args; + + // The expected hint is either a MDString or a MDNode with the first + // operand a MDString. + if (const MDNode *MD = dyn_cast(LoopID->getOperand(i))) { + if (!MD || MD->getNumOperands() == 0) + continue; + S = dyn_cast(MD->getOperand(0)); + for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) + Args.push_back(MD->getOperand(i)); + } else { + S = dyn_cast(LoopID->getOperand(i)); + assert(Args.size() == 0 && "too many arguments for MDString"); + } + + if (!S) + continue; + + // Check if the hint starts with the loop metadata prefix. + StringRef Name = S->getString(); + if (Args.size() == 1) + setHint(Name, Args[0]); + } +} + +/// Checks string hint with one operand and set value if valid. 
+void llvm::LoopSpawningHints::setHint(StringRef Name, Metadata *Arg) { + if (!Name.startswith(Prefix())) + return; + Name = Name.substr(Prefix().size(), StringRef::npos); + + const ConstantInt *C = mdconst::dyn_extract(Arg); + if (!C) + return; + unsigned Val = C->getZExtValue(); + + Hint *Hints[] = {&Strategy, &Grainsize}; + for (auto H : Hints) { + if (Name == H->Name) { + if (H->validate(Val)) + H->Value = Val; + else + DEBUG(dbgs() << " ignoring invalid hint '" << + Name << "'\n"); + break; + } + } +} + +/// Create a new hint from name / value pair. +MDNode *llvm::LoopSpawningHints::createHintMetadata(StringRef Name, + unsigned V) const { + LLVMContext &Context = TheLoop->getHeader()->getContext(); + Metadata *MDs[] = {MDString::get(Context, Name), + ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Context), V))}; + return MDNode::get(Context, MDs); +} + +/// Matches metadata with hint name. +bool llvm::LoopSpawningHints::matchesHintMetadataName( + MDNode *Node, ArrayRef HintTypes) { + MDString *Name = dyn_cast(Node->getOperand(0)); + if (!Name) + return false; + + for (auto H : HintTypes) + if (Name->getString().endswith(H.Name)) + return true; + return false; +} + +/// Sets current hints into loop metadata, keeping other values intact. +void llvm::LoopSpawningHints::writeHintsToMetadata(ArrayRef HintTypes) { + if (HintTypes.size() == 0) + return; + + // Reserve the first element to LoopID (see below). + SmallVector MDs(1); + // If the loop already has metadata, then ignore the existing operands. + MDNode *LoopID = TheLoop->getLoopID(); + if (LoopID) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + MDNode *Node = cast(LoopID->getOperand(i)); + // If node in update list, ignore old value. + if (!matchesHintMetadataName(Node, HintTypes)) + MDs.push_back(Node); + } + } + + // Now, add the missing hints. 
+ for (auto H : HintTypes) + MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); + + // Replace current metadata node with new one. + LLVMContext &Context = TheLoop->getHeader()->getContext(); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + + TheLoop->setLoopID(NewLoopID); +} + +bool llvm::LoopSpawningHints::Hint::validate(unsigned Val) { + switch (Kind) { + case HK_STRATEGY: + return (Val < ST_END); + case HK_GRAINSIZE: + return true; + } + return false; +} + +bool llvm::isBackendParallelFor(Loop* L) { + return LoopSpawningHints(L).getStrategy() != LoopSpawningHints::ST_SEQ; +} + + +/// Helper routine to get all exit blocks of a loop that are unreachable. +static void getEHExits(Loop *L, const BasicBlock *DesignatedExitBlock, + SmallVectorImpl &EHExits) { + SmallVector ExitBlocks; + L->getExitBlocks(ExitBlocks); + + SmallVector WorkList; + for (BasicBlock *Exit : ExitBlocks) { + if (Exit == DesignatedExitBlock) continue; + EHExits.push_back(Exit); + WorkList.push_back(Exit); + } + + // Traverse the CFG from these frontier blocks to find all blocks involved in + // exception-handling exit code. + SmallPtrSet Visited; + while (!WorkList.empty()) { + BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // Check that the exception handling blocks do not reenter the loop. + assert(!L->contains(BB) && + "Exception handling blocks re-enter loop."); + + for (BasicBlock *Succ : successors(BB)) { + EHExits.push_back(Succ); + WorkList.push_back(Succ); + } + } +} + +/// Convert a pointer to an integer type. +/// +/// Copied from Transforms/Vectorizer/LoopVectorize.cpp. 
+static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { + if (Ty->isPointerTy()) + return DL.getIntPtrType(Ty); + + // It is possible that char's or short's overflow when we ask for the loop's + // trip count, work around this by changing the type size. + if (Ty->getScalarSizeInBits() < 32) + return Type::getInt32Ty(Ty->getContext()); + + return Ty; +} + +/// Get the wider of two integer types. +/// +/// Copied from Transforms/Vectorizer/LoopVectorize.cpp. +static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { + Ty0 = convertPointerToIntegerType(DL, Ty0); + Ty1 = convertPointerToIntegerType(DL, Ty1); + if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) + return Ty0; + return Ty1; +} + +#include "llvm/Analysis/LoopIterator.h" + +STATISTIC(LoopsConvertedToDAC, + "Number of Tapir loops converted to divide-and-conquer iteration spawning"); + +/// DACLoopSpawning implements the transformation to spawn the iterations of a +/// Tapir loop in a recursive divide-and-conquer fashion. +class DACLoopSpawning : public LoopOutline { +public: + TapirTarget* tapirTarget; + DACLoopSpawning(Loop *OrigLoop, unsigned Grainsize, + ScalarEvolution &SE, + LoopInfo *LI, DominatorTree *DT, + AssumptionCache *AC, + OptimizationRemarkEmitter &ORE, TapirTarget* tapirTarget) + : LoopOutline(OrigLoop, SE, LI, DT, AC, ORE), + tapirTarget(tapirTarget), + SpecifiedGrainsize(Grainsize) + {} + + /// Top-level call to convert loop to spawn its iterations in a + /// divide-and-conquer fashion. + bool processLoop() { + Loop *L = OrigLoop; + + BasicBlock *Header = L->getHeader(); + BasicBlock *Preheader = L->getLoopPreheader(); + BasicBlock *Latch = L->getLoopLatch(); + + DEBUG({ + LoopBlocksDFS DFS(L); + DFS.perform(LI); + dbgs() << "Blocks in loop (from DFS):\n"; + for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) + dbgs() << *BB; + }); + + using namespace ore; + + // Check that this loop has a valid exit block after the latch. 
+ if (!ExitBlock) { + DEBUG(dbgs() << "LS loop does not contain valid exit block after latch.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "InvalidLatchExit", + L->getStartLoc(), + Header) + << "invalid latch exit"); + return false; + } + + // Get special exits from this loop. + SmallVector EHExits; + getEHExits(L, ExitBlock, EHExits); + + // Check the exit blocks of the loop. + SmallVector ExitBlocks; + L->getExitBlocks(ExitBlocks); + + for (const BasicBlock *Exit : ExitBlocks) { + if (Exit == ExitBlock) continue; + if (Exit->isLandingPad()) { + DEBUG({ + const LandingPadInst *LPI = Exit->getLandingPadInst(); + dbgs() << "landing pad found: " << *LPI << "\n"; + for (const User *U : LPI->users()) + dbgs() << "\tuser " << *U << "\n"; + }); + } + } + SmallPtrSet HandledExits; + for (BasicBlock *BB : EHExits) + HandledExits.insert(BB); + for (BasicBlock *Exit : ExitBlocks) { + if (Exit == ExitBlock) continue; + if (!HandledExits.count(Exit)) { + DEBUG(dbgs() << "LS loop contains a bad exit block " << *Exit); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "BadExit", + L->getStartLoc(), + Header) + << "bad exit block found"); + return false; + } + } + + Module* M = OrigFunction->getParent(); + + DEBUG(dbgs() << "LS loop header:" << *Header); + DEBUG(dbgs() << "LS loop latch:" << *Latch); + DEBUG(dbgs() << "LS SE exit count: " << *(SE.getExitCount(L, Latch)) << "\n"); + + /// Get loop limit. 
+ const SCEV *Limit = SE.getExitCount(L, Latch); + DEBUG(dbgs() << "LS Loop limit: " << *Limit << "\n"); + // PredicatedScalarEvolution PSE(SE, *L); + // const SCEV *PLimit = PSE.getExitCount(L, Latch); + // DEBUG(dbgs() << "LS predicated loop limit: " << *PLimit << "\n"); + // emitAnalysis(LoopSpawningReport() + // << "computed loop limit " << *Limit << "\n"); + if (SE.getCouldNotCompute() == Limit) { + DEBUG(dbgs() << "SE could not compute loop limit.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "UnknownLoopLimit", + L->getStartLoc(), + Header) + << "could not compute limit"); + return false; + } + + /// Determine the type of the canonical IV. + Type *CanonicalIVTy = Limit->getType(); + { + const DataLayout &DL = M->getDataLayout(); + for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { + PHINode *PN = cast(II); + if (PN->getType()->isFloatingPointTy()) continue; + CanonicalIVTy = getWiderType(DL, PN->getType(), CanonicalIVTy); + } + Limit = SE.getNoopOrAnyExtend(Limit, CanonicalIVTy); + } + /// Clean up the loop's induction variables. + PHINode *CanonicalIV = canonicalizeIVs(CanonicalIVTy); + if (!CanonicalIV) { + DEBUG(dbgs() << "Could not get canonical IV.\n"); + // emitAnalysis(LoopSpawningReport() + // << "Could not get a canonical IV.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoCanonicalIV", + L->getStartLoc(), + Header) + << "could not find or create canonical IV"); + return false; + } + + // Remove the IV's (other than CanonicalIV) and replace them with + // their stronger forms. + // + // TODO?: We can probably adapt this loop->DAC process such that we + // don't require all IV's to be canonical. + SmallVector IVs; + SCEVExpander Exp(SE, M->getDataLayout(), "ls"); + if (!removeNonCanonicalIVs(Header, Preheader, CanonicalIV, IVs, Exp)) + return false; + + const SCEVAddRecExpr *CanonicalSCEV = + cast(SE.getSCEV(CanonicalIV)); + + // Insert the computation for the loop limit into the Preheader. 
+ Value *LimitVar = Exp.expandCodeFor(Limit, CanonicalIVTy, + Preheader->getTerminator()); + DEBUG(dbgs() << "LimitVar: " << *LimitVar << "\n"); + + // Canonicalize the loop latch. + assert(SE.isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT, + CanonicalSCEV, Limit) && + "Loop backedge is not guarded by canonical comparison with limit."); + Value *NewCond = canonicalizeLoopLatch(CanonicalIV, LimitVar); + + // Insert computation of grainsize into the Preheader. + Value *GrainVar; + if (!SpecifiedGrainsize) + GrainVar = computeGrainsize(LimitVar); + else + GrainVar = ConstantInt::get(LimitVar->getType(), SpecifiedGrainsize); + + DEBUG(dbgs() << "GrainVar: " << *GrainVar << "\n"); + /// Clone the loop into a new function. + + // Get the inputs and outputs for the Loop blocks. + SetVector Inputs, Outputs; + SetVector BodyInputs, BodyOutputs; + ValueToValueMapTy VMap, InputMap; + std::vector LoopBlocks; + SmallPtrSet ExitsToSplit; + Value *SRetInput = nullptr; + + // Get the sync region containing this Tapir loop. + const Instruction *InputSyncRegion; + { + const DetachInst *DI = cast(Header->getTerminator()); + InputSyncRegion = cast(DI->getSyncRegion()); + } + + // Add start iteration, end iteration, and grainsize to inputs. + { + LoopBlocks = L->getBlocks(); + + // Add unreachable and exception-handling exits to the set of loop blocks to + // clone. + DEBUG({ + dbgs() << "Handled exits of loop:"; + for (BasicBlock *HE : HandledExits) + dbgs() << *HE; + dbgs() << "\n"; + }); + + for (BasicBlock *HE : HandledExits) + LoopBlocks.push_back(HE); + + { + const DetachInst *DI = cast(Header->getTerminator()); + BasicBlockEdge DetachEdge(Header, DI->getDetached()); + for (BasicBlock *HE : HandledExits) + if (!DT || !DT->dominates(DetachEdge, HE)) + ExitsToSplit.insert(HE); + DEBUG({ + dbgs() << "Loop exits to split:"; + for (BasicBlock *ETS : ExitsToSplit) + dbgs() << *ETS; + dbgs() << "\n"; + }); + } + + // Get the inputs and outputs for the loop body. 
+ findInputsOutputs(LoopBlocks, BodyInputs, BodyOutputs, &ExitsToSplit); + + // Scan for any sret parameters in BodyInputs and add them first. + if (OrigFunction->hasStructRetAttr()) { + Function::arg_iterator ArgIter = OrigFunction->arg_begin(); + if (OrigFunction->hasParamAttribute(0, Attribute::StructRet)) + if (BodyInputs.count(&*ArgIter)) + SRetInput = &*ArgIter; + if (OrigFunction->hasParamAttribute(1, Attribute::StructRet)) { + ++ArgIter; + if (BodyInputs.count(&*ArgIter)) + SRetInput = &*ArgIter; + } + } + if (SRetInput) { + DEBUG(dbgs() << "sret input " << *SRetInput << "\n"); + Inputs.insert(SRetInput); + } + + // Add argument for start of CanonicalIV. + DEBUG({ + Value *CanonicalIVInput = + CanonicalIV->getIncomingValueForBlock(Preheader); + // CanonicalIVInput should be the constant 0. + assert(isa(CanonicalIVInput) && + "Input to canonical IV from preheader is not constant."); + }); + Argument *StartArg = new Argument(CanonicalIV->getType(), + CanonicalIV->getName()+".start"); + Inputs.insert(StartArg); + InputMap[CanonicalIV] = StartArg; + + // Add argument for end. + // + // In the general case, the loop limit is the result of some computation + // that the pass added to the loop's preheader. In this case, the variable + // storing the loop limit is used exactly once, in the canonicalized loop + // latch. In this case, the pass wants to prevent outlining from passing + // the loop-limit variable as an arbitrary argument to the outlined + // function. Hence, this pass adds the loop-limit variable as an argument + // manually. + // + // There are two special cases to consider: the loop limit is a constant, or + // the loop limit is used elsewhere within the loop. To handle these two + // cases, this pass adds an explicit argument for the end of the loop, to + // support the subsequent transformation to using recursive + // divide-and-conquer.
After the loop is outlined, this pass will rewrite + // the latch in the outlined loop to use this explicit argument. + // Furthermore, this pass does not prevent outliner from recognizing the + // loop limit as a potential argument to the function. + if (isa(LimitVar) || !LimitVar->hasOneUse()) { + Argument *EndArg = new Argument(LimitVar->getType(), "end"); + Inputs.insert(EndArg); + InputMap[LimitVar] = EndArg; + } else { + // If the limit var is not constant and has exactly one use, then the + // limit var is the result of some nontrivial computation, and that one + // use is the new condition inserted. + Inputs.insert(LimitVar); + InputMap[LimitVar] = LimitVar; + } + + // Add argument for grainsize. + if (isa(GrainVar)) { + Argument *GrainArg = new Argument(GrainVar->getType(), "grainsize"); + Inputs.insert(GrainArg); + InputMap[GrainVar] = GrainArg; + } else { + Inputs.insert(GrainVar); + InputMap[GrainVar] = GrainVar; + } + + // Put all of the inputs together, and clear redundant inputs from + // the set for the loop body. + SmallVector BodyInputsToRemove; + for (Value *V : BodyInputs) + if (V == InputSyncRegion) + BodyInputsToRemove.push_back(V); + else if (!Inputs.count(V)) + Inputs.insert(V); + else + BodyInputsToRemove.push_back(V); + for (Value *V : BodyInputsToRemove) + BodyInputs.remove(V); + DEBUG({ + for (Value *V : BodyInputs) + dbgs() << "Remaining body input: " << *V << "\n"; + }); + for (Value *V : BodyOutputs) + dbgs() << "EL output: " << *V << "\n"; + assert(0 == BodyOutputs.size() && + "All results from parallel loop should be passed by memory already."); + } + DEBUG({ + for (Value *V : Inputs) + dbgs() << "EL input: " << *V << "\n"; + for (Value *V : Outputs) + dbgs() << "EL output: " << *V << "\n"; + }); + + // Clone the loop blocks into a new helper function. + Function *Helper; + { + SmallVector Returns; // Ignore returns cloned. 
+ + // LowerDbgDeclare(*(Header->getParent())); + + Helper = CreateHelper(Inputs, Outputs, LoopBlocks, + Header, Preheader, ExitBlock, + VMap, M, + OrigFunction->getSubprogram() != nullptr, Returns, ".ls", + &ExitsToSplit, InputSyncRegion, + nullptr, nullptr, nullptr); + + assert(Returns.empty() && "Returns cloned when cloning loop."); + + // Use a fast calling convention for the helper. + Helper->setCallingConv(CallingConv::Fast); + // Helper->setCallingConv(Header->getParent()->getCallingConv()); + } + + // Add a sync to the helper's return. + BasicBlock *HelperHeader = cast(VMap[Header]); + { + BasicBlock *HelperExit = cast(VMap[ExitBlock]); + assert(isa(HelperExit->getTerminator())); + BasicBlock *NewHelperExit = SplitBlock(HelperExit, + HelperExit->getTerminator(), + DT, LI); + IRBuilder<> Builder(&(HelperExit->front())); + SyncInst *NewSync = Builder.CreateSync( + NewHelperExit, + cast(VMap[InputSyncRegion])); + // Set debug info of new sync to match that of terminator of the header of + // the cloned loop. + NewSync->setDebugLoc(HelperHeader->getTerminator()->getDebugLoc()); + HelperExit->getTerminator()->eraseFromParent(); + } + + BasicBlock *NewPreheader = cast(VMap[Preheader]); + PHINode *NewCanonicalIV = cast(VMap[CanonicalIV]); + + // Rewrite the cloned IV's to start at the start iteration argument. + { + // Rewrite clone of canonical IV to start at the start iteration + // argument. + Argument *NewCanonicalIVStart = cast(VMap[InputMap[CanonicalIV]]); + + { + int NewPreheaderIdx = NewCanonicalIV->getBasicBlockIndex(NewPreheader); + assert(isa(NewCanonicalIV->getIncomingValue(NewPreheaderIdx)) && + "Cloned canonical IV does not inherit a constant value from cloned preheader."); + NewCanonicalIV->setIncomingValue(NewPreheaderIdx, NewCanonicalIVStart); + } + + // Rewrite other cloned IV's to start at their value at the start + // iteration. 
+ const SCEV *StartIterSCEV = SE.getSCEV(NewCanonicalIVStart); + DEBUG(dbgs() << "StartIterSCEV: " << *StartIterSCEV << "\n"); + for (PHINode *IV : IVs) { + if (CanonicalIV == IV) continue; + + // Get the value of the IV at the start iteration. + DEBUG(dbgs() << "IV " << *IV); + const SCEV *IVSCEV = SE.getSCEV(IV); + DEBUG(dbgs() << " (SCEV " << *IVSCEV << ")"); + const SCEVAddRecExpr *IVSCEVAddRec = cast(IVSCEV); + const SCEV *IVAtIter = IVSCEVAddRec->evaluateAtIteration(StartIterSCEV, SE); + DEBUG(dbgs() << " expands at iter " << *StartIterSCEV << + " to " << *IVAtIter << "\n"); + + // NOTE: Expanded code should not refer to other IV's. + Value *IVStart = Exp.expandCodeFor(IVAtIter, IVAtIter->getType(), + NewPreheader->getTerminator()); + + // Set the value that the cloned IV inherits from the cloned preheader. + PHINode *NewIV = cast(VMap[IV]); + int NewPreheaderIdx = NewIV->getBasicBlockIndex(NewPreheader); + assert(isa(NewIV->getIncomingValue(NewPreheaderIdx)) && + "Cloned IV does not inherit a constant value from cloned preheader."); + NewIV->setIncomingValue(NewPreheaderIdx, IVStart); + } + + // Remap the newly added instructions in the new preheader to use + // values local to the helper. + for (Instruction &II : *NewPreheader) + RemapInstruction(&II, VMap, RF_IgnoreMissingLocals, + /*TypeMapper=*/nullptr, /*Materializer=*/nullptr); + } + + // The loop has been outlined by this point. To handle the special cases + // where the loop limit was constant or used elsewhere within the loop, this + // pass rewrites the outlined loop-latch condition to use the explicit + // end-iteration argument. 
+ if (isa(LimitVar) || !LimitVar->hasOneUse()) { + CmpInst *HelperCond = cast(VMap[NewCond]); + assert(((isa(LimitVar) && + HelperCond->getOperand(1) == LimitVar) || + (!LimitVar->hasOneUse() && + HelperCond->getOperand(1) == VMap[LimitVar])) && + "Unexpected condition in loop latch."); + IRBuilder<> Builder(HelperCond); + Value *NewHelperCond = Builder.CreateICmpULT(HelperCond->getOperand(0), + VMap[InputMap[LimitVar]]); + HelperCond->replaceAllUsesWith(NewHelperCond); + HelperCond->eraseFromParent(); + DEBUG(dbgs() << "Rewritten Latch: " << + *(cast(NewHelperCond)->getParent())); + } + + // DEBUGGING: Simply serialize the cloned loop. + // BasicBlock *NewHeader = cast(VMap[Header]); + // SerializeDetachedCFG(cast(NewHeader->getTerminator()), nullptr); + implementDACIterSpawnOnHelper(Helper, NewPreheader, + cast(VMap[Header]), + cast(VMap[CanonicalIV]), + cast(VMap[InputMap[LimitVar]]), + cast(VMap[InputMap[GrainVar]]), + cast(VMap[InputSyncRegion]), + /*DT=*/nullptr, /*LI=*/nullptr, + CanonicalSCEV->getNoWrapFlags(SCEV::FlagNUW), + CanonicalSCEV->getNoWrapFlags(SCEV::FlagNSW)); + + if (verifyFunction(*Helper, &dbgs())) + return false; + + // Update allocas in cloned loop body. + { + // Collect reattach instructions. + SmallVector ReattachPoints; + for (pred_iterator PI = pred_begin(Latch), PE = pred_end(Latch); + PI != PE; ++PI) { + BasicBlock *Pred = *PI; + if (!isa(Pred->getTerminator())) continue; + if (L->contains(Pred)) + ReattachPoints.push_back(cast(VMap[Pred])->getTerminator()); + } + // The cloned loop should be serialized by this point. 
+ BasicBlock *ClonedLoopBodyEntry = + cast(VMap[Header])->getSingleSuccessor(); + assert(ClonedLoopBodyEntry && + "Head of cloned loop body has multiple successors."); + bool ContainsDynamicAllocas = + MoveStaticAllocasInBlock(&Helper->getEntryBlock(), ClonedLoopBodyEntry, + ReattachPoints); + + // If the cloned loop contained dynamic alloca instructions, wrap the cloned + // loop with llvm.stacksave/llvm.stackrestore intrinsics. + if (ContainsDynamicAllocas) { + Module *M = Helper->getParent(); + // Get the two intrinsics we care about. + Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave); + Function *StackRestore = + Intrinsic::getDeclaration(M,Intrinsic::stackrestore); + + // Insert the llvm.stacksave. + CallInst *SavedPtr = IRBuilder<>(&*ClonedLoopBodyEntry, + ClonedLoopBodyEntry->begin()) + .CreateCall(StackSave, {}, "savedstack"); + + // Insert a call to llvm.stackrestore before the reattaches in the + // original Tapir loop. + for (Instruction *ExitPoint : ReattachPoints) + IRBuilder<>(ExitPoint).CreateCall(StackRestore, SavedPtr); + } + } + + if (verifyFunction(*Helper, &dbgs())) + return false; + + // Add alignment assumptions to arguments of helper, based on alignment of + // values in old function. + AddAlignmentAssumptions(OrigFunction, Inputs, VMap, + Preheader->getTerminator(), AC, DT); + + // Add call to new helper function in original function. + { + // Setup arguments for call. + SmallVector TopCallArgs; + // Add sret input, if it exists. + if (SRetInput) + TopCallArgs.push_back(SRetInput); + // Add start iteration 0. + assert(CanonicalSCEV->getStart()->isZero() && + "Canonical IV does not start at zero."); + TopCallArgs.push_back(ConstantInt::get(CanonicalIV->getType(), 0)); + // Add loop limit. + TopCallArgs.push_back(LimitVar); + // Add grainsize. + TopCallArgs.push_back(GrainVar); + // Add the rest of the arguments. 
+ for (Value *V : BodyInputs) + TopCallArgs.push_back(V); + DEBUG({ + for (Value *TCArg : TopCallArgs) + dbgs() << "Top call arg: " << *TCArg << "\n"; + }); + + // Create call instruction. + IRBuilder<> Builder(Preheader->getTerminator()); + CallInst *TopCall = Builder.CreateCall(Helper, + ArrayRef(TopCallArgs)); + + // Use a fast calling convention for the helper. + TopCall->setCallingConv(CallingConv::Fast); + // TopCall->setCallingConv(Helper->getCallingConv()); + TopCall->setDebugLoc(Header->getTerminator()->getDebugLoc()); + // // Update CG graph with the call we just added. + // CG[F]->addCalledFunction(TopCall, CG[Helper]); + } + + // Remove sync of loop in parent. + { + // Get the sync region for this loop's detached iterations. + DetachInst *HeadDetach = cast(Header->getTerminator()); + Value *SyncRegion = HeadDetach->getSyncRegion(); + // Check the Tapir instructions contained in this sync region. Look for a + // single sync instruction among those Tapir instructions. Meanwhile, + // verify that the only detach instruction in this sync region is the detach + // in the loop header. If these conditions are met, then we assume that the + // sync applies to this loop. Otherwise, something more complicated is + // going on, and we give up. + SyncInst *LoopSync = nullptr; + bool SingleSyncJustForLoop = true; + for (User *U : SyncRegion->users()) { + // Skip the detach in the loop header. + if (HeadDetach == U) continue; + // Remember the first sync instruction we find. If we find multiple sync + // instructions, then something nontrivial is going on. + if (SyncInst *SI = dyn_cast(U)) { + if (!LoopSync) + LoopSync = SI; + else + SingleSyncJustForLoop = false; + } + // If we find a detach instruction that is not the loop header's, then + // something nontrivial is going on. + if (isa(U)) + SingleSyncJustForLoop = false; + } + if (LoopSync && SingleSyncJustForLoop) + // Replace the sync with a branch.
+ ReplaceInstWithInst(LoopSync, + BranchInst::Create(LoopSync->getSuccessor(0))); + else if (!LoopSync) + DEBUG(dbgs() << "No sync found for this loop."); + else + DEBUG(dbgs() << "No single sync found that only affects this loop."); + } + + ++LoopsConvertedToDAC; + + unlinkLoop(); + + return Helper; + } + + virtual ~DACLoopSpawning() {} + +protected: + /// \brief Compute the grainsize of the loop, based on the limit. + /// + /// The grainsize is computed by the following equation: + /// + /// Grainsize = min(2048, ceil(Limit / (8 * workers))) + /// + /// This computation is inserted into the preheader of the loop. + /// + /// TODO: This method is the only method that depends on the CilkABI. + /// Generalize this method for other grainsize calculations and to query TLI. + Value* computeGrainsize(Value *Limit) { + Loop *L = OrigLoop; + + Value *Grainsize; + BasicBlock *Preheader = L->getLoopPreheader(); + assert(Preheader && "No Preheader found for loop."); + + IRBuilder<> Builder(Preheader->getTerminator()); + + // Get 8 * workers + Value *Workers8 = Builder.CreateIntCast(tapirTarget->GetOrCreateWorker8(*Preheader->getParent()), + Limit->getType(), false); + // Compute ceil(limit / 8 * workers) = (limit + 8 * workers - 1) / (8 * workers) + Value *SmallLoopVal = + Builder.CreateUDiv(Builder.CreateSub(Builder.CreateAdd(Limit, Workers8), + ConstantInt::get(Limit->getType(), 1)), + Workers8); + // Compute min + Value *LargeLoopVal = ConstantInt::get(Limit->getType(), 2048); + Value *Cmp = Builder.CreateICmpULT(LargeLoopVal, SmallLoopVal); + Grainsize = Builder.CreateSelect(Cmp, LargeLoopVal, SmallLoopVal); + + return Grainsize; + } + +/// \brief Method to help convertLoopToDACIterSpawn convert the Tapir +/// loop cloned into function Helper to spawn its iterations in a +/// parallel divide-and-conquer fashion. +/// +/// Example: Suppose that Helper contains the following Tapir loop: +/// +/// Helper(iter_t start, iter_t end, iter_t grain, ...) 
{ +/// iter_t i = start; +/// ... Other loop setup ... +/// do { +/// spawn { ... loop body ... }; +/// } while (i++ < end); +/// sync; +/// } +/// +/// Then this method transforms Helper into the following form: +/// +/// Helper(iter_t start, iter_t end, iter_t grain, ...) { +/// recur: +/// iter_t itercount = end - start; +/// if (itercount > grain) { +/// // Invariant: itercount >= 2 +/// count_t miditer = start + itercount / 2; +/// spawn Helper(start, miditer, grain, ...); +/// start = miditer + 1; +/// goto recur; +/// } +/// +/// iter_t i = start; +/// ... Other loop setup ... +/// do { +/// ... Loop Body ... +/// } while (i++ < end); +/// sync; +/// } +/// +void implementDACIterSpawnOnHelper(Function *Helper, + BasicBlock *Preheader, + BasicBlock *Header, + PHINode *CanonicalIV, + Argument *Limit, + Argument *Grainsize, + Instruction *SyncRegion, + DominatorTree *DT, + LoopInfo *LI, + bool CanonicalIVFlagNUW = false, + bool CanonicalIVFlagNSW = false) { + // Serialize the cloned copy of the loop. + assert(Preheader->getParent() == Helper && + "Preheader does not belong to helper function."); + assert(Header->getParent() == Helper && + "Header does not belong to helper function."); + assert(CanonicalIV->getParent() == Header && + "CanonicalIV does not belong to header"); + assert(isa(Header->getTerminator()) && + "Cloned header is not terminated by a detach."); + DetachInst *DI = dyn_cast(Header->getTerminator()); + SerializeDetachedCFG(DI, DT); + + // Convert the cloned loop into the strip-mined loop body. + + BasicBlock *DACHead = Preheader; + if (&(Helper->getEntryBlock()) == Preheader) + // Split the entry block. We'll want to create a backedge into + // the split block later. 
+ DACHead = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI); + + BasicBlock *RecurHead, *RecurDet, *RecurCont; + Value *IterCount; + Value *CanonicalIVInput; + PHINode *CanonicalIVStart; + { + Instruction *PreheaderOrigFront = &(DACHead->front()); + IRBuilder<> Builder(PreheaderOrigFront); + // Create branch based on grainsize. + DEBUG(dbgs() << "LS CanonicalIV: " << *CanonicalIV << "\n"); + CanonicalIVInput = CanonicalIV->getIncomingValueForBlock(DACHead); + CanonicalIVStart = Builder.CreatePHI(CanonicalIV->getType(), 2, + CanonicalIV->getName()+".dac"); + CanonicalIVInput->replaceAllUsesWith(CanonicalIVStart); + IterCount = Builder.CreateSub(Limit, CanonicalIVStart, + "itercount"); + Value *IterCountCmp = Builder.CreateICmpUGT(IterCount, Grainsize); + TerminatorInst *RecurTerm = + SplitBlockAndInsertIfThen(IterCountCmp, PreheaderOrigFront, + /*Unreachable=*/false, + /*BranchWeights=*/nullptr, + DT); + RecurHead = RecurTerm->getParent(); + // Create skeleton of divide-and-conquer recursion: + // DACHead -> RecurHead -> RecurDet -> RecurCont -> DACHead + RecurDet = SplitBlock(RecurHead, RecurHead->getTerminator(), + DT, LI); + RecurCont = SplitBlock(RecurDet, RecurDet->getTerminator(), + DT, LI); + RecurCont->getTerminator()->replaceUsesOfWith(RecurTerm->getSuccessor(0), + DACHead); + } + + // Compute mid iteration in RecurHead. + Value *MidIter, *MidIterPlusOne; + { + IRBuilder<> Builder(&(RecurHead->front())); + MidIter = Builder.CreateAdd(CanonicalIVStart, + Builder.CreateLShr(IterCount, 1, + "halfcount"), + "miditer", + CanonicalIVFlagNUW, CanonicalIVFlagNSW); + } + + // Create recursive call in RecurDet. + { + // Create input array for recursive call. + IRBuilder<> Builder(&(RecurDet->front())); + SetVector RecurInputs; + Function::arg_iterator AI = Helper->arg_begin(); + // Handle an initial sret argument, if necessary. Based on how + // the Helper function is created, any sret parameter will be the + // first parameter. 
+ if (Helper->hasParamAttribute(0, Attribute::StructRet)) + RecurInputs.insert(&*AI++); + assert(cast(CanonicalIVInput) == &*AI && + "First non-sret argument does not match original input to canonical IV."); + RecurInputs.insert(CanonicalIVStart); + ++AI; + assert(Limit == &*AI && + "Second non-sret argument does not match original input to the loop limit."); + RecurInputs.insert(MidIter); + ++AI; + for (Function::arg_iterator AE = Helper->arg_end(); + AI != AE; ++AI) + RecurInputs.insert(&*AI); + DEBUG({ + dbgs() << "RecurInputs: "; + for (Value *Input : RecurInputs) + dbgs() << *Input << ", "; + dbgs() << "\n"; + }); + + // Create call instruction. + CallInst *RecurCall = Builder.CreateCall(Helper, RecurInputs.getArrayRef()); + RecurCall->setDebugLoc(Header->getTerminator()->getDebugLoc()); + // Use a fast calling convention for the helper. + RecurCall->setCallingConv(CallingConv::Fast); + // RecurCall->setCallingConv(Helper->getCallingConv()); + // // Update CG graph with the recursive call we just added. + // CG[Helper]->addCalledFunction(RecurCall, CG[Helper]); + } + + // Set up continuation of detached recursive call. We effectively + // inline this tail call automatically. + { + IRBuilder<> Builder(&(RecurCont->front())); + MidIterPlusOne = Builder.CreateAdd(MidIter, + ConstantInt::get(Limit->getType(), 1), + "miditerplusone", + CanonicalIVFlagNUW, + CanonicalIVFlagNSW); + } + + // Finish setup of new phi node for canonical IV. + { + CanonicalIVStart->addIncoming(CanonicalIVInput, Preheader); + CanonicalIVStart->addIncoming(MidIterPlusOne, RecurCont); + } + + /// Make the recursive DAC parallel. + { + IRBuilder<> Builder(RecurHead->getTerminator()); + // Create the detach. + DetachInst *DI = Builder.CreateDetach(RecurDet, RecurCont, SyncRegion); + DI->setDebugLoc(Header->getTerminator()->getDebugLoc()); + RecurHead->getTerminator()->eraseFromParent(); + // Create the reattach. 
+ Builder.SetInsertPoint(RecurDet->getTerminator()); + ReattachInst *RI = Builder.CreateReattach(RecurCont, SyncRegion); + RI->setDebugLoc(Header->getTerminator()->getDebugLoc()); + RecurDet->getTerminator()->eraseFromParent(); + } +} + + unsigned SpecifiedGrainsize; +}; + +bool llvm::TapirTarget::processDACLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE) { + + DEBUG(dbgs() << "LS: Hints dictate DAC spawning.\n"); + + Loop* L = LSH.TheLoop; + + DebugLoc DLoc = L->getStartLoc(); + BasicBlock *Header = L->getHeader(); + DACLoopSpawning DLS(L, LSH.getGrainsize(), SE, &LI, &DT, &AC, ORE, this); + if (DLS.processLoop()) { + DEBUG({ + if (verifyFunction(*L->getHeader()->getParent())) { + dbgs() << "Transformed function is invalid.\n"; + return false; + } + }); + // Report success. + ORE.emit(OptimizationRemark(LS_NAME, "DACSpawning", DLoc, Header) + << "spawning iterations using divide-and-conquer"); + return true; + } else { + // Report failure. 
+ ORE.emit(OptimizationRemarkMissed(LS_NAME, "NoDACSpawning", DLoc, + Header) + << "cannot spawn iterations using divide-and-conquer"); + ORE.emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "FailedRequestedSpawning", + L->getStartLoc(), L->getHeader()) + << "Tapir loop not transformed: " + << "failed to use divide-and-conquer loop spawning"); + return false; + } + + return false; +} diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index 106f5b14f35..ed67b4dec6f 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -32,13 +32,13 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Tapir/TapirUtils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" -#include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" using namespace llvm; diff --git a/lib/Transforms/Utils/TapirUtils.cpp b/lib/Transforms/Utils/TapirUtils.cpp index 9707290c426..69e976897ff 100644 --- a/lib/Transforms/Utils/TapirUtils.cpp +++ b/lib/Transforms/Utils/TapirUtils.cpp @@ -322,147 +322,6 @@ bool llvm::isCriticalContinueEdge(const TerminatorInst *TI, unsigned SuccNum) { return false; } -llvm::LoopSpawningHints::LoopSpawningHints(const Loop *L) - : Strategy("spawn.strategy", ST_SEQ, HK_STRATEGY), - Grainsize("grainsize", 0, HK_GRAINSIZE), - TheLoop(L) { - // Populate values with existing loop metadata. 
- getHintsFromMetadata(); -} - -LoopSpawningHints::SpawningStrategy -llvm::LoopSpawningHints::getStrategy() const { - return (SpawningStrategy)Strategy.Value; -} - -unsigned llvm::LoopSpawningHints::getGrainsize() const { - return Grainsize.Value; -} - -void llvm::LoopSpawningHints::getHintsFromMetadata() { - MDNode *LoopID = TheLoop->getLoopID(); - if (!LoopID) - return; - - // First operand should refer to the loop id itself. - assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); - assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); - - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - const MDString *S = nullptr; - SmallVector Args; - - // The expected hint is either a MDString or a MDNode with the first - // operand a MDString. - if (const MDNode *MD = dyn_cast(LoopID->getOperand(i))) { - if (!MD || MD->getNumOperands() == 0) - continue; - S = dyn_cast(MD->getOperand(0)); - for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) - Args.push_back(MD->getOperand(i)); - } else { - S = dyn_cast(LoopID->getOperand(i)); - assert(Args.size() == 0 && "too many arguments for MDString"); - } - - if (!S) - continue; - - // Check if the hint starts with the loop metadata prefix. - StringRef Name = S->getString(); - if (Args.size() == 1) - setHint(Name, Args[0]); - } -} - -/// Checks string hint with one operand and set value if valid. -void llvm::LoopSpawningHints::setHint(StringRef Name, Metadata *Arg) { - if (!Name.startswith(Prefix())) - return; - Name = Name.substr(Prefix().size(), StringRef::npos); - - const ConstantInt *C = mdconst::dyn_extract(Arg); - if (!C) - return; - unsigned Val = C->getZExtValue(); - - Hint *Hints[] = {&Strategy, &Grainsize}; - for (auto H : Hints) { - if (Name == H->Name) { - if (H->validate(Val)) - H->Value = Val; - else - DEBUG(dbgs() << " ignoring invalid hint '" << - Name << "'\n"); - break; - } - } -} - -/// Create a new hint from name / value pair. 
-MDNode *llvm::LoopSpawningHints::createHintMetadata(StringRef Name, - unsigned V) const { - LLVMContext &Context = TheLoop->getHeader()->getContext(); - Metadata *MDs[] = {MDString::get(Context, Name), - ConstantAsMetadata::get( - ConstantInt::get(Type::getInt32Ty(Context), V))}; - return MDNode::get(Context, MDs); -} - -/// Matches metadata with hint name. -bool llvm::LoopSpawningHints::matchesHintMetadataName( - MDNode *Node, ArrayRef HintTypes) { - MDString *Name = dyn_cast(Node->getOperand(0)); - if (!Name) - return false; - - for (auto H : HintTypes) - if (Name->getString().endswith(H.Name)) - return true; - return false; -} - -/// Sets current hints into loop metadata, keeping other values intact. -void llvm::LoopSpawningHints::writeHintsToMetadata(ArrayRef HintTypes) { - if (HintTypes.size() == 0) - return; - - // Reserve the first element to LoopID (see below). - SmallVector MDs(1); - // If the loop already has metadata, then ignore the existing operands. - MDNode *LoopID = TheLoop->getLoopID(); - if (LoopID) { - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - MDNode *Node = cast(LoopID->getOperand(i)); - // If node in update list, ignore old value. - if (!matchesHintMetadataName(Node, HintTypes)) - MDs.push_back(Node); - } - } - - // Now, add the missing hints. - for (auto H : HintTypes) - MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); - - // Replace current metadata node with new one. - LLVMContext &Context = TheLoop->getHeader()->getContext(); - MDNode *NewLoopID = MDNode::get(Context, MDs); - // Set operand 0 to refer to the loop id itself. - NewLoopID->replaceOperandWith(0, NewLoopID); - - TheLoop->setLoopID(NewLoopID); -} - -bool llvm::LoopSpawningHints::Hint::validate(unsigned Val) { - switch (Kind) { - case HK_STRATEGY: - return (Val < ST_END); - case HK_GRAINSIZE: - return true; - } - return false; -} - /// Checks if this loop is a Tapir loop. 
Right now we check that the loop is /// in a canonical form: /// 1) The header detaches the body. @@ -537,15 +396,6 @@ bool llvm::isCanonicalTapirLoop(const Loop *L, bool print) { return true; } -bool llvm::isBackendParallelFor(Loop* L) { - // TODO: Use a more precise detection of cilk_for loops. - for (BasicBlock* BB : L->blocks()) - if (isa(BB->getTerminator())) - return LoopSpawningHints(L).getStrategy() == LoopSpawningHints::ST_DAC - || LoopSpawningHints(L).getStrategy() == LoopSpawningHints::ST_GPU; - return false; -} - /// canDetach - Return true if the given function can perform a detach, false /// otherwise. bool llvm::canDetach(const Function *F) { From 73db2e40a3405d33f728cb4c07b67c30ad4bd235 Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Wed, 25 Jul 2018 10:26:11 -0400 Subject: [PATCH 09/16] Further cleanups / gpu movement --- .circleci/config.yml | 6 +- include/llvm/Transforms/Tapir/CilkABI.h | 15 - include/llvm/Transforms/Tapir/LoopSpawning.h | 41 +- include/llvm/Transforms/Tapir/Outline.h | 17 + include/llvm/Transforms/Tapir/TapirUtils.h | 71 ++- lib/Transforms/Tapir/CilkABI.cpp | 257 +++++------ lib/Transforms/Tapir/LoopSpawning.cpp | 231 +++++++++- lib/Transforms/Tapir/TapirUtils.cpp | 447 +++---------------- lib/Transforms/Utils/LLVMBuild.txt | 2 +- tools/polly | 2 +- 10 files changed, 534 insertions(+), 555 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 10181e73889..5353ecb84bc 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -17,7 +17,8 @@ jobs: command: | mkdir build cd build - cmake .. -DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD=host -DLLVM_BUILD_TESTS=ON -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_ENABLE_LTO=OFF -DLLVM_USE_LINKER=gold -DLLVM_PARALLEL_COMPILE_JOBS=2 -DLLVM_PARALLEL_LINK_JOBS=1 -DLLVM_BINUTILS_INCDIR=/usr/include -DLLVM_LIT_ARGS="-sv -j 2" + cmake .. 
-DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" -DLLVM_BUILD_TESTS=ON -DLLVM_ENABLE_ASSERTIONS=ON -DCOMPILER_RT_BUILD_KITSUNE=OFF -DLLVM_ENABLE_LTO=OFF -DLLVM_USE_LINKER=gold -DLLVM_PARALLEL_COMPILE_JOBS=2 -DLLVM_PARALLEL_LINK_JOBS=1 -DLLVM_BINUTILS_INCDIR=/usr/include -DLLVM_LIT_ARGS="-sv -j 2" + #cmake .. -DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" -DLLVM_BUILD_TESTS=ON -DLLVM_ENABLE_ASSERTIONS=ON -DCOMPILER_RT_BUILD_KITSUNE=ON -DLLVM_ENABLE_LTO=OFF -DLLVM_USE_LINKER=gold -DLLVM_PARALLEL_COMPILE_JOBS=2 -DLLVM_PARALLEL_LINK_JOBS=1 -DLLVM_BINUTILS_INCDIR=/usr/include -DLLVM_LIT_ARGS="-sv -j 2" - run: name: make command: | @@ -58,7 +59,8 @@ jobs: command: | mkdir build cd build - cmake .. -DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD=host -DLLVM_BUILD_TESTS=ON -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_ENABLE_LTO=OFF -DLLVM_USE_LINKER=gold -DLLVM_PARALLEL_COMPILE_JOBS=2 -DLLVM_PARALLEL_LINK_JOBS=1 -DLLVM_BINUTILS_INCDIR=/usr/include -DLLVM_LIT_ARGS="-sv -j 2" + cmake .. -DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" -DLLVM_BUILD_TESTS=ON -DLLVM_ENABLE_ASSERTIONS=ON -DCOMPILER_RT_BUILD_KITSUNE=OFF -DLLVM_ENABLE_LTO=OFF -DLLVM_USE_LINKER=gold -DLLVM_PARALLEL_COMPILE_JOBS=2 -DLLVM_PARALLEL_LINK_JOBS=1 -DLLVM_BINUTILS_INCDIR=/usr/include -DLLVM_LIT_ARGS="-sv -j 2" + #cmake .. 
-DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" -DLLVM_BUILD_TESTS=ON -DLLVM_ENABLE_ASSERTIONS=ON -DCOMPILER_RT_BUILD_KITSUNE=ON -DLLVM_ENABLE_LTO=OFF -DLLVM_USE_LINKER=gold -DLLVM_PARALLEL_COMPILE_JOBS=2 -DLLVM_PARALLEL_LINK_JOBS=1 -DLLVM_BINUTILS_INCDIR=/usr/include -DLLVM_LIT_ARGS="-sv -j 2" - run: name: make command: | diff --git a/include/llvm/Transforms/Tapir/CilkABI.h b/include/llvm/Transforms/Tapir/CilkABI.h index 60f0c2eddbb..61f1a0b878e 100644 --- a/include/llvm/Transforms/Tapir/CilkABI.h +++ b/include/llvm/Transforms/Tapir/CilkABI.h @@ -41,21 +41,6 @@ namespace llvm { -/// CilkABILoopSpawning uses the Cilk Plus ABI to handle Tapir loops. -class CilkABILoopSpawning : public LoopOutline { -public: - CilkABILoopSpawning(Loop *OrigLoop, ScalarEvolution &SE, - LoopInfo *LI, DominatorTree *DT, - AssumptionCache *AC, - OptimizationRemarkEmitter &ORE) - : LoopOutline(OrigLoop, SE, LI, DT, AC, ORE) - {} - - bool processLoop(); - - virtual ~CilkABILoopSpawning() {} -}; - class CilkABI : public TapirTarget { public: CilkABI(); diff --git a/include/llvm/Transforms/Tapir/LoopSpawning.h b/include/llvm/Transforms/Tapir/LoopSpawning.h index 1b658ce685e..7da595679c6 100644 --- a/include/llvm/Transforms/Tapir/LoopSpawning.h +++ b/include/llvm/Transforms/Tapir/LoopSpawning.h @@ -58,9 +58,46 @@ class LoopOutline { protected: PHINode* canonicalizeIVs(Type *Ty); + const SCEV* getLimit(); + + /// \brief Compute the grainsize of the loop, based on the limit. + /// + /// The grainsize is computed by the following equation: + /// + /// Grainsize = min(2048, ceil(Limit / (8 * workers))) + /// + /// This computation is inserted into the preheader of the loop. + /// + /// TODO: This method is the only method that depends on the CilkABI. + /// Generalize this method for other grainsize calculations and to query TLI. 
+ Value* computeGrainsize(Value *Limit, TapirTarget* tapirTarget); + Value* canonicalizeLoopLatch(PHINode *IV, Value *Limit); - bool removeNonCanonicalIVs(BasicBlock* Header, BasicBlock* Preheader, PHINode* CanonicalIV, SmallVector &IVs, SCEVExpander &Exp); - //bool setIVStartingValues(); + + bool getHandledExits(BasicBlock* Header, SmallPtrSetImpl &HandledExits); + + bool removeNonCanonicalIVs(BasicBlock* Header, BasicBlock* Preheader, PHINode* CanonicalIV, SmallVectorImpl &IVs); + bool setIVStartingValues(Value* newStart, Value* CanonicalIV, const SmallVectorImpl &IVs, BasicBlock* NewPreheader, ValueToValueMapTy &VMap); + + // In the general case, var is the result of some computation + // in the loop's preheader. The pass wants to prevent outlining from passing + // var as an arbitrary argument to the outlined function, but one that is + // potentially in a specific place for ABI reasons. + // Hence, this pass adds the loop-limit variable as an argument + // manually. + // + // There are two special cases to consider: the var is a constant, or + // the var is used elsewhere within the loop. To handle these two + // cases, this pass adds an explict argument for var, to ensure it isn't + // clobberred by the other use or not passed because it is constant. + static inline Value* ensureDistinctArgument(Value* var, const Twine &name="") { + if (isa(var) || !var->hasOneUse()) { + Argument *argument = new Argument(var->getType(), name); + return argument; + } else { + return var; + } + } void unlinkLoop(); diff --git a/include/llvm/Transforms/Tapir/Outline.h b/include/llvm/Transforms/Tapir/Outline.h index 6e779fdf719..546dac007c5 100644 --- a/include/llvm/Transforms/Tapir/Outline.h +++ b/include/llvm/Transforms/Tapir/Outline.h @@ -29,6 +29,23 @@ namespace llvm { typedef SetVector ValueSet; +/// definedInRegion - Return true if the specified value is used in the +/// extracted region. 
+template +static inline bool usedInRegion(const BasicBlockPtrContainer &Blocks, + Value *V) { + if (Instruction *I = dyn_cast(V)) { + for (User *U : I->users()) { + if (Instruction *Inst = dyn_cast(U)) { + if (std::find(Blocks.begin(), Blocks.end(), Inst->getParent()) != Blocks.end()) { + return true; + } + } + } + } + return false; +} + /// definedInRegion - Return true if the specified value is defined in the /// extracted region. template diff --git a/include/llvm/Transforms/Tapir/TapirUtils.h b/include/llvm/Transforms/Tapir/TapirUtils.h index 0624627dee7..31c7e15f769 100644 --- a/include/llvm/Transforms/Tapir/TapirUtils.h +++ b/include/llvm/Transforms/Tapir/TapirUtils.h @@ -23,6 +23,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" #include "llvm/Transforms/Tapir/TapirTypes.h" #include "llvm/Transforms/Utils/ValueMapper.h" @@ -84,10 +85,13 @@ class LoopSpawningHints { /// Grainsize Hint Grainsize; + +public: /// Return the loop metadata prefix. static inline StringRef Prefix() { return "tapir.loop."; } + static inline const char* StrategyPrefix() { return "spawn.strategy"; } + static inline const char* GrainsizePrefix() { return "grainsize"; } -public: static inline std::string printStrategy(enum SpawningStrategy Strat) { switch(Strat) { case LoopSpawningHints::ST_SEQ: @@ -110,10 +114,51 @@ class LoopSpawningHints { /// The loop these hints belong to. Loop * const TheLoop; -private: /// Find hints specified in the loop metadata and update local values. - void getHintsFromMetadata(); + static inline std::vector> getHintsFromMetadata(Loop* L) { + MDNode *LoopID = L->getLoopID(); + std::vector> hints; + + if (!LoopID) + return hints; + + // First operand should refer to the loop id itself. 
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + const MDString *S = nullptr; + SmallVector Args; + + // The expected hint is either a MDString or a MDNode with the first + // operand a MDString. + if (const MDNode *MD = dyn_cast(LoopID->getOperand(i))) { + if (!MD || MD->getNumOperands() == 0) + continue; + S = dyn_cast(MD->getOperand(0)); + for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) + Args.push_back(MD->getOperand(i)); + } else { + S = dyn_cast(LoopID->getOperand(i)); + assert(Args.size() == 0 && "too many arguments for MDString"); + } + + if (!S) + continue; + + // Check if the hint starts with the loop metadata prefix. + StringRef Name = S->getString(); + if (Args.size() == 1) { + if (!Name.startswith(Prefix())) + continue; + hints.emplace_back(std::make_pair(Name, Args[0])); + } + } + return hints; + } +private: /// Checks string hint with one operand and set value if valid. void setHint(StringRef Name, Metadata *Arg); @@ -129,7 +174,25 @@ class LoopSpawningHints { }; //! 
Identify if a loop could should be handled manually by a parallel loop backend -bool isBackendParallelFor(Loop* L); +static inline bool isBackendParallelFor(Loop* L) { + for(auto& hints: LoopSpawningHints::getHintsFromMetadata(L)) { + auto Name = hints.first; + auto Arg = hints.second; + + Name = Name.substr(LoopSpawningHints::Prefix().size(), StringRef::npos); + if (Name != LoopSpawningHints::StrategyPrefix()) continue; + + const ConstantInt *C = mdconst::dyn_extract(Arg); + if (!C) continue; + + unsigned Val = C->getZExtValue(); + + if (Val >= LoopSpawningHints::ST_END) continue; + if (Val != LoopSpawningHints::ST_SEQ) return true; + } + return false; +} + class TapirTarget { public: diff --git a/lib/Transforms/Tapir/CilkABI.cpp b/lib/Transforms/Tapir/CilkABI.cpp index 8732f19a0b1..1eb9197a40e 100644 --- a/lib/Transforms/Tapir/CilkABI.cpp +++ b/lib/Transforms/Tapir/CilkABI.cpp @@ -1293,6 +1293,26 @@ bool CilkABI::processMain(Function &F) { return false; } +/// CilkABILoopSpawning uses the Cilk Plus ABI to handle Tapir loops. +class CilkABILoopSpawning : public LoopOutline { +public: + TapirTarget* tapirTarget; + unsigned SpecifiedGrainsize; + CilkABILoopSpawning(Loop *OrigLoop, unsigned Grainsize, + ScalarEvolution &SE, + LoopInfo *LI, DominatorTree *DT, + AssumptionCache *AC, + OptimizationRemarkEmitter &ORE, TapirTarget* tapirTarget) + : LoopOutline(OrigLoop, SE, LI, DT, AC, ORE), + tapirTarget(tapirTarget), + SpecifiedGrainsize(Grainsize) + {} + + bool processLoop(); + + virtual ~CilkABILoopSpawning() {} +}; + /// Top-level call to convert a Tapir loop to be processed using an appropriate /// Cilk ABI call. bool CilkABILoopSpawning::processLoop() { @@ -1304,118 +1324,106 @@ bool CilkABILoopSpawning::processLoop() { using namespace ore; - // Check the exit blocks of the loop. 
- if (!ExitBlock) { - DEBUG(dbgs() << "LS loop does not contain valid exit block after latch.\n"); - ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "InvalidLatchExit", - L->getStartLoc(), - Header) - << "invalid latch exit"); + SmallPtrSet HandledExits; + if (!getHandledExits(Header, HandledExits)) return false; - } - - SmallVector ExitBlocks; - L->getExitBlocks(ExitBlocks); - for (const BasicBlock *Exit : ExitBlocks) { - if (Exit == ExitBlock) continue; - if (!isa(Exit->getTerminator())) { - DEBUG(dbgs() << "LS loop contains a bad exit block " << *Exit); - ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "BadExit", - L->getStartLoc(), - Header) - << "bad exit block found"); - return false; - } - } Module* M = OrigFunction->getParent(); DEBUG(dbgs() << "LS loop header:" << *Header); DEBUG(dbgs() << "LS loop latch:" << *Latch); - DEBUG(dbgs() << "LS SE exit count: " << *(SE.getExitCount(L, Latch)) << "\n"); /// Get loop limit. - const SCEV *BETC = SE.getExitCount(L, Latch); - const SCEV *Limit = SE.getAddExpr(BETC, SE.getOne(BETC->getType())); - DEBUG(dbgs() << "LS Loop limit: " << *Limit << "\n"); - - if (SE.getCouldNotCompute() == Limit) { - DEBUG(dbgs() << "SE could not compute loop limit.\n"); - ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "UnknownLoopLimit", - L->getStartLoc(), - Header) - << "could not compute limit"); - return false; - } + const SCEV *Limit = getLimit(); + if (!Limit) return false; + /// Clean up the loop's induction variable. PHINode *CanonicalIV = canonicalizeIVs(Limit->getType()); - if (!CanonicalIV) { - DEBUG(dbgs() << "Could not get canonical IV.\n"); - ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoCanonicalIV", - L->getStartLoc(), - Header) - << "could not find or create canonical IV"); + if (!CanonicalIV) return false; + + // Remove the IV's (other than CanonicalIV) and replace them with + // their stronger forms. + // + // TODO?: We can probably adapt this loop->DAC process such that we + // don't require all IV's to be canonical. 
+ SmallVector IVs; + if (!removeNonCanonicalIVs(Header, Preheader, CanonicalIV, IVs)) return false; - } - - // Remove the IV's (other than CanonicalIV) and replace them with - // their stronger forms. - // - // TODO?: We can probably adapt this loop->DAC process such that we - // don't require all IV's to be canonical. - SmallVector IVs; - SCEVExpander Exp(SE, M->getDataLayout(), "ls"); - if (!removeNonCanonicalIVs(Header, Preheader, CanonicalIV, IVs, Exp)) - return false; const SCEVAddRecExpr *CanonicalSCEV = cast(SE.getSCEV(CanonicalIV)); // Insert the computation for the loop limit into the Preheader. + SCEVExpander Exp(SE, M->getDataLayout(), "ls"); Value *LimitVar = Exp.expandCodeFor(Limit, Limit->getType(), Preheader->getTerminator()); DEBUG(dbgs() << "LimitVar: " << *LimitVar << "\n"); // Canonicalize the loop latch. + assert(SE.isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT, + CanonicalSCEV, Limit) && + "Loop backedge is not guarded by canonical comparison with limit."); Value *NewCond = canonicalizeLoopLatch(CanonicalIV, LimitVar); + // Insert computation of grainsize into the Preheader. + Value *GrainVar; + if (!SpecifiedGrainsize) + GrainVar = computeGrainsize(LimitVar, tapirTarget); + else + GrainVar = ConstantInt::get(LimitVar->getType(), SpecifiedGrainsize); + + DEBUG(dbgs() << "GrainVar: " << *GrainVar << "\n"); + /// Clone the loop into a new function. // Get the inputs and outputs for the Loop blocks. SetVector Inputs, Outputs; SetVector BodyInputs, BodyOutputs; - ValueToValueMapTy VMap, InputMap; + ValueToValueMapTy VMap; + std::vector LoopBlocks; + SmallPtrSet ExitsToSplit; AllocaInst* closure; + // Add start iteration, end iteration, and grainsize to inputs. - { - // Get the inputs and outputs for the loop body. - findInputsOutputs(L->getBlocks(), BodyInputs, BodyOutputs); - - // Add argument for start of CanonicalIV. 
- DEBUG({ - Value *CanonicalIVInput = - CanonicalIV->getIncomingValueForBlock(Preheader); - // CanonicalIVInput should be the constant 0. - assert(isa(CanonicalIVInput) && - "Input to canonical IV from preheader is not constant."); - }); - Argument *StartArg = new Argument(CanonicalIV->getType(), - CanonicalIV->getName()+".start"); - Inputs.insert(StartArg); - InputMap[CanonicalIV] = StartArg; - - // Add argument for end. - Value* ea; - if (isa(LimitVar)) { - Argument *EndArg = new Argument(LimitVar->getType(), "end"); - Inputs.insert(EndArg); - ea = InputMap[LimitVar] = EndArg; - } else { - Inputs.insert(LimitVar); - ea = InputMap[LimitVar] = LimitVar; + LoopBlocks = L->getBlocks(); + + // Add unreachable and exception-handling exits to the set of loop blocks to + // clone. + for (BasicBlock *HE : HandledExits) + LoopBlocks.push_back(HE); + + { + const DetachInst *DI = cast(Header->getTerminator()); + BasicBlockEdge DetachEdge(Header, DI->getDetached()); + for (BasicBlock *HE : HandledExits) + if (!DT || !DT->dominates(DetachEdge, HE)) + ExitsToSplit.insert(HE); + DEBUG({ + dbgs() << "Loop exits to split:"; + for (BasicBlock *ETS : ExitsToSplit) + dbgs() << *ETS; + dbgs() << "\n"; + }); } + // Get the inputs and outputs for the loop body. + findInputsOutputs(LoopBlocks, BodyInputs, BodyOutputs, &ExitsToSplit); + + + Value *CanonicalIVInput = CanonicalIV->getIncomingValueForBlock(Preheader); + + // CanonicalIVInput should be the constant 0. + assert(isa(CanonicalIVInput) && + "Input to canonical IV from preheader is not constant."); + + // Add explicit argument for loop start. + Value* startArg = ensureDistinctArgument(CanonicalIVInput, "start"); + + // Add explicit argument for loop end. + Value* limitArg = ensureDistinctArgument(LimitVar, "end"); + + { // Put all of the inputs together, and clear redundant inputs from // the set for the loop body. 
SmallVector BodyInputsToRemove; @@ -1446,17 +1454,16 @@ bool CilkABILoopSpawning::processLoop() { U.set(l2); } } + Inputs.insert(closure); + Inputs.insert(startArg); + Inputs.insert(limitArg); - Inputs.remove(StartArg); - Inputs.insert(StartArg); - Inputs.remove(ea); - Inputs.insert(ea); for (Value *V : BodyInputsToRemove) BodyInputs.remove(V); assert(0 == BodyOutputs.size() && "All results from parallel loop should be passed by memory already."); - } + } DEBUG({ for (Value *V : Inputs) dbgs() << "EL input: " << *V << "\n"; @@ -1469,11 +1476,11 @@ bool CilkABILoopSpawning::processLoop() { { SmallVector Returns; // Ignore returns cloned. - Helper = CreateHelper(Inputs, Outputs, L->getBlocks(), - Header, Preheader, ExitBlock/*L->getExitBlock()*/, + Helper = CreateHelper(Inputs, Outputs, LoopBlocks, + Header, Preheader, ExitBlock, VMap, M, OrigFunction->getSubprogram() != nullptr, Returns, ".ls", - nullptr, nullptr, nullptr); + &ExitsToSplit, nullptr, nullptr); assert(Returns.empty() && "Returns cloned when cloning loop."); @@ -1483,66 +1490,32 @@ bool CilkABILoopSpawning::processLoop() { } BasicBlock *NewPreheader = cast(VMap[Preheader]); - PHINode *NewCanonicalIV = cast(VMap[CanonicalIV]); // Rewrite the cloned IV's to start at the start iteration argument. - { - // Rewrite clone of canonical IV to start at the start iteration - // argument. - Argument *NewCanonicalIVStart = cast(VMap[InputMap[CanonicalIV]]); - { - int NewPreheaderIdx = NewCanonicalIV->getBasicBlockIndex(NewPreheader); - assert(isa(NewCanonicalIV->getIncomingValue(NewPreheaderIdx)) && - "Cloned canonical IV does not inherit a constant value from cloned preheader."); - NewCanonicalIV->setIncomingValue(NewPreheaderIdx, NewCanonicalIVStart); - } - - // Rewrite other cloned IV's to start at their value at the start - // iteration. 
- const SCEV *StartIterSCEV = SE.getSCEV(NewCanonicalIVStart); - DEBUG(dbgs() << "StartIterSCEV: " << *StartIterSCEV << "\n"); - for (PHINode *IV : IVs) { - if (CanonicalIV == IV) continue; - - // Get the value of the IV at the start iteration. - DEBUG(dbgs() << "IV " << *IV); - const SCEV *IVSCEV = SE.getSCEV(IV); - DEBUG(dbgs() << " (SCEV " << *IVSCEV << ")"); - const SCEVAddRecExpr *IVSCEVAddRec = cast(IVSCEV); - const SCEV *IVAtIter = IVSCEVAddRec->evaluateAtIteration(StartIterSCEV, SE); - DEBUG(dbgs() << " expands at iter " << *StartIterSCEV << - " to " << *IVAtIter << "\n"); - - // NOTE: Expanded code should not refer to other IV's. - Value *IVStart = Exp.expandCodeFor(IVAtIter, IVAtIter->getType(), - NewPreheader->getTerminator()); - - - // Set the value that the cloned IV inherits from the cloned preheader. - PHINode *NewIV = cast(VMap[IV]); - int NewPreheaderIdx = NewIV->getBasicBlockIndex(NewPreheader); - assert(isa(NewIV->getIncomingValue(NewPreheaderIdx)) && - "Cloned IV does not inherit a constant value from cloned preheader."); - NewIV->setIncomingValue(NewPreheaderIdx, IVStart); - } - - // Remap the newly added instructions in the new preheader to use - // values local to the helper. - for (Instruction &II : *NewPreheader) - RemapInstruction(&II, VMap, RF_IgnoreMissingLocals, - /*TypeMapper=*/nullptr, /*Materializer=*/nullptr); - } - - // If the loop limit is constant, then rewrite the loop latch - // condition to use the end-iteration argument. - if (isa(LimitVar)) { + Argument *NewCanonicalIVStart = cast(VMap[startArg]); + setIVStartingValues(NewCanonicalIVStart, CanonicalIV, IVs, NewPreheader, VMap); + + // The loop has been outlined by this point. To handle the special cases + // where the loop limit was constant or used elsewhere within the loop, this + // pass rewrites the outlined loop-latch condition to use the explicit + // end-iteration argument. 
+ if (isa(LimitVar) || !LimitVar->hasOneUse()) { CmpInst *HelperCond = cast(VMap[NewCond]); - assert(HelperCond->getOperand(1) == LimitVar); + assert(((isa(LimitVar) && + HelperCond->getOperand(1) == LimitVar) || + (!LimitVar->hasOneUse() && + HelperCond->getOperand(1) == limitArg)) && + "Unexpected condition in loop latch."); IRBuilder<> Builder(HelperCond); Value *NewHelperCond = Builder.CreateICmpULT(HelperCond->getOperand(0), - VMap[InputMap[LimitVar]]); + VMap[limitArg]); HelperCond->replaceAllUsesWith(NewHelperCond); HelperCond->eraseFromParent(); + DEBUG(dbgs() << "Rewritten Latch: " << + *(cast(NewHelperCond)->getParent())); + } else { + CmpInst *HelperCond = cast(VMap[NewCond]); + assert(HelperCond->getOperand(1) == VMap[limitArg]); } // For debugging: @@ -1600,7 +1573,7 @@ bool CilkABILoopSpawning::processLoop() { Builder.CreatePointerCast(Helper, F->getFunctionType()->getParamType(0)), Builder.CreatePointerCast(closure, F->getFunctionType()->getParamType(1)), LimitVar, - ConstantInt::get(IntegerType::get(F->getContext(), sizeof(int)*8),0) + GrainVar }; /*CallInst *TopCall = */Builder.CreateCall(F, args); @@ -1634,7 +1607,7 @@ bool llvm::CilkABI::processLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolu DebugLoc DLoc = L->getStartLoc(); BasicBlock *Header = L->getHeader(); - CilkABILoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE); + CilkABILoopSpawning DLS(L, LSH.getGrainsize(), SE, &LI, &DT, &AC, ORE, this); if (DLS.processLoop()) { DEBUG({ if (verifyFunction(*L->getHeader()->getParent())) { diff --git a/lib/Transforms/Tapir/LoopSpawning.cpp b/lib/Transforms/Tapir/LoopSpawning.cpp index 0860d173459..4b1d6cb3948 100644 --- a/lib/Transforms/Tapir/LoopSpawning.cpp +++ b/lib/Transforms/Tapir/LoopSpawning.cpp @@ -146,14 +146,23 @@ struct LoopSpawningImpl { /// induction variable created or inserted by the scalar evolution expander. 
PHINode* LoopOutline::canonicalizeIVs(Type *Ty) { Loop *L = OrigLoop; - BasicBlock* Header = L->getHeader(); - Module* M = Header->getParent()->getParent(); + + Module* M = OrigFunction->getParent(); const DataLayout &DL = M->getDataLayout(); SCEVExpander Exp(SE, DL, "ls"); PHINode *CanonicalIV = Exp.getOrInsertCanonicalInductionVariable(L, Ty); + if (!CanonicalIV) { + DEBUG(dbgs() << "Could not get canonical IV.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoCanonicalIV", + L->getStartLoc(), + Header) + << "could not find or create canonical IV"); + return nullptr; + } + DEBUG(dbgs() << "LS Canonical induction variable " << *CanonicalIV << "\n"); SmallVector DeadInsts; @@ -167,8 +176,123 @@ PHINode* LoopOutline::canonicalizeIVs(Type *Ty) { return CanonicalIV; } +/// Helper routine to get all exit blocks of a loop that are unreachable. +static void getEHExits(Loop *L, const BasicBlock *DesignatedExitBlock, + SmallVectorImpl &EHExits) { + SmallVector ExitBlocks; + L->getExitBlocks(ExitBlocks); + + SmallVector WorkList; + for (BasicBlock *Exit : ExitBlocks) { + if (Exit == DesignatedExitBlock) continue; + EHExits.push_back(Exit); + WorkList.push_back(Exit); + } + + // Traverse the CFG from these frontier blocks to find all blocks involved in + // exception-handling exit code. + SmallPtrSet Visited; + while (!WorkList.empty()) { + BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // Check that the exception handling blocks do not reenter the loop. 
+ assert(!L->contains(BB) && + "Exception handling blocks re-enter loop."); + + for (BasicBlock *Succ : successors(BB)) { + EHExits.push_back(Succ); + WorkList.push_back(Succ); + } + } +} + +Value* LoopOutline::computeGrainsize(Value *Limit, TapirTarget* tapirTarget) { + Loop *L = OrigLoop; + + Value *Grainsize; + BasicBlock *Preheader = L->getLoopPreheader(); + assert(Preheader && "No Preheader found for loop."); + + IRBuilder<> Builder(Preheader->getTerminator()); + + // Get 8 * workers + Value *Workers8 = Builder.CreateIntCast(tapirTarget->GetOrCreateWorker8(*Preheader->getParent()), + Limit->getType(), false); + // Compute ceil(limit / 8 * workers) = (limit + 8 * workers - 1) / (8 * workers) + Value *SmallLoopVal = + Builder.CreateUDiv(Builder.CreateSub(Builder.CreateAdd(Limit, Workers8), + ConstantInt::get(Limit->getType(), 1)), + Workers8); + // Compute min + Value *LargeLoopVal = ConstantInt::get(Limit->getType(), 2048); + Value *Cmp = Builder.CreateICmpULT(LargeLoopVal, SmallLoopVal); + Grainsize = Builder.CreateSelect(Cmp, LargeLoopVal, SmallLoopVal); + + return Grainsize; +} + +bool LoopOutline::getHandledExits(BasicBlock* Header, SmallPtrSetImpl &HandledExits) { + + // Check that this loop has a valid exit block after the latch. + if (!ExitBlock) { + DEBUG(dbgs() << "LS loop does not contain valid exit block after latch.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "InvalidLatchExit", + OrigLoop->getStartLoc(), + Header) + << "invalid latch exit"); + return false; + } + + assert(HandledExits.size() == 0); + // Get special exits from this loop. + SmallVector EHExits; + getEHExits(OrigLoop, ExitBlock, EHExits); + + // Check the exit blocks of the loop. 
+ SmallVector ExitBlocks; + OrigLoop->getExitBlocks(ExitBlocks); + + for (const BasicBlock *Exit : ExitBlocks) { + if (Exit == ExitBlock) continue; + if (Exit->isLandingPad()) { + DEBUG({ + const LandingPadInst *LPI = Exit->getLandingPadInst(); + dbgs() << "landing pad found: " << *LPI << "\n"; + for (const User *U : LPI->users()) + dbgs() << "\tuser " << *U << "\n"; + }); + } + } + for (BasicBlock *BB : EHExits) + HandledExits.insert(BB); + for (BasicBlock *Exit : ExitBlocks) { + if (Exit == ExitBlock) continue; + if (!HandledExits.count(Exit)) { + DEBUG(dbgs() << "LS loop contains a bad exit block " << *Exit); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "BadExit", + OrigLoop->getStartLoc(), + Header) + << "bad exit block found"); + return false; + } + } + + DEBUG({ + dbgs() << "Handled exits of loop:"; + for (BasicBlock *HE : HandledExits) + dbgs() << *HE; + dbgs() << "\n"; + }); + + return true; +} + // IVs is output -bool LoopOutline::removeNonCanonicalIVs(BasicBlock* Header, BasicBlock* Preheader, PHINode* CanonicalIV, SmallVector &IVs, SCEVExpander &Exp) { +bool LoopOutline::removeNonCanonicalIVs(BasicBlock* Header, BasicBlock* Preheader, PHINode* CanonicalIV, SmallVectorImpl &IVs) { + assert(IVs.size() == 0); + // Remove all IV's other than CanonicalIV. // First, check that we can do this. 
bool CanRemoveIVs = true; @@ -190,6 +314,7 @@ bool LoopOutline::removeNonCanonicalIVs(BasicBlock* Header, BasicBlock* Preheade } { + SCEVExpander Exp(SE, OrigFunction->getParent()->getDataLayout(), "ls"); SmallVector IVsToRemove; for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { PHINode *PN = cast(II); @@ -245,34 +370,99 @@ bool LoopOutline::removeNonCanonicalIVs(BasicBlock* Header, BasicBlock* Preheade AllCanonical = false; DEBUG(dbgs() << "Remaining non-canonical PHI Node found: " << *PN << "\n"); - // emitAnalysis(LoopSpawningReport(PN) - // << "Found a remaining non-canonical IV.\n"); ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "NonCanonicalIV", PN) << "found a remaining noncanonical IV"); } } if (!AllCanonical) return false; + + return true; +} + +/// Begin copied from + +/// Convert a pointer to an integer type. +static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { + if (Ty->isPointerTy()) + return DL.getIntPtrType(Ty); + + // It is possible that char's or short's overflow when we ask for the loop's + // trip count, work around this by changing the type size. + if (Ty->getScalarSizeInBits() < 32) + return Type::getInt32Ty(Ty->getContext()); + + return Ty; } -// TODO -/* -bool LoopOutline::setIVStartingValues(Value* newStart, Value* NewCanonicalIV, BasicBlock* NewPreheader) { +/// Get the wider of two integer types. 
+static inline Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { + Ty0 = convertPointerToIntegerType(DL, Ty0); + Ty1 = convertPointerToIntegerType(DL, Ty1); + if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) + return Ty0; + return Ty1; +} +/// End copied from + + +const SCEV* LoopOutline::getLimit() { + Loop* L = OrigLoop; + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + + const SCEV *Limit = SE.getExitCount(L, Latch); + DEBUG(dbgs() << "LS Loop limit: " << *Limit << "\n"); + + if (SE.getCouldNotCompute() == Limit) { + DEBUG(dbgs() << "SE could not compute loop limit.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "UnknownLoopLimit", + L->getStartLoc(), + Header) + << "could not compute limit"); + return nullptr; + } + + /// Determine the type of the canonical IV. + Type *CanonicalIVTy = Limit->getType(); + const DataLayout &DL = OrigFunction->getParent()->getDataLayout(); + + for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { + PHINode *PN = cast(II); + if (PN->getType()->isFloatingPointTy()) continue; + CanonicalIVTy = getWiderType(DL, PN->getType(), CanonicalIVTy); + } + + Limit = SE.getNoopOrAnyExtend(Limit, CanonicalIVTy); + return Limit; +} + +bool LoopOutline::setIVStartingValues(Value* newStart, Value* CanonicalIV, const SmallVectorImpl &IVs, BasicBlock* NewPreheader, ValueToValueMapTy &VMap) { if (auto startInst = dyn_cast(NewPreheader)) { assert(DT->dominates(startInst, NewPreheader->getTerminator())); } + PHINode *NewCanonicalIV = cast(VMap[CanonicalIV]); + Value* startingValue = nullptr; { int NewPreheaderIdx = NewCanonicalIV->getBasicBlockIndex(NewPreheader); - assert(isa(NewCanonicalIV->getIncomingValue(NewPreheaderIdx)) && - "Cloned canonical IV does not inherit a constant value from cloned preheader."); + startingValue = NewCanonicalIV->getIncomingValue(NewPreheaderIdx); + if (Constant* C = dyn_cast(startingValue)) { + if (C->isZeroValue()) + startingValue = nullptr; + } 
+ //assert(isa(NewCanonicalIV->getIncomingValue(NewPreheaderIdx)) && + // "Cloned canonical IV does not inherit a constant value from cloned preheader."); NewCanonicalIV->setIncomingValue(NewPreheaderIdx, newStart); } + SCEVExpander Exp(SE, OrigFunction->getParent()->getDataLayout(), "ls"); + // Rewrite other cloned IV's to start at their value at the start // iteration. const SCEV *StartIterSCEV = SE.getSCEV(newStart); DEBUG(dbgs() << "StartIterSCEV: " << *StartIterSCEV << "\n"); + for (PHINode *IV : IVs) { if (CanonicalIV == IV) continue; @@ -289,6 +479,11 @@ bool LoopOutline::setIVStartingValues(Value* newStart, Value* NewCanonicalIV, Ba Value *IVStart = Exp.expandCodeFor(IVAtIter, IVAtIter->getType(), NewPreheader->getTerminator()); + if (startingValue) { + IRBuilder<> B(NewPreheader->getTerminator()); + IVStart = B.CreateSub(IVStart, startingValue); + } + // Set the value that the cloned IV inherits from the cloned preheader. PHINode *NewIV = cast(VMap[IV]); int NewPreheaderIdx = NewIV->getBasicBlockIndex(NewPreheader); @@ -296,8 +491,14 @@ bool LoopOutline::setIVStartingValues(Value* newStart, Value* NewCanonicalIV, Ba "Cloned IV does not inherit a constant value from cloned preheader."); NewIV->setIncomingValue(NewPreheaderIdx, IVStart); } + + // Remap the newly added instructions in the new preheader to use + // values local to the helper. + for (Instruction &II : *NewPreheader) + RemapInstruction(&II, VMap, RF_IgnoreMissingLocals, + /*TypeMapper=*/nullptr, /*Materializer=*/nullptr); + return true; } -*/ /// \brief Replace the latch of the loop to check that IV is always less than or /// equal to the limit. 
@@ -498,6 +699,14 @@ bool LoopSpawningImpl::processLoop(Loop *L) { DEBUG(dbgs() << "LS: Hints dictate sequential spawning.\n"); break; default: + DEBUG({ + llvm::LoopBlocksDFS DFS(L); + DFS.perform(&LI); + dbgs() << "Blocks in loop (from DFS):\n"; + for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) + dbgs() << *BB; + }); + return tapirTarget->processLoop(Hints, LI, SE, DT, AC, ORE); case LoopSpawningHints::ST_END: dbgs() << "LS: Hints specify unknown spawning strategy.\n"; diff --git a/lib/Transforms/Tapir/TapirUtils.cpp b/lib/Transforms/Tapir/TapirUtils.cpp index 9439b8e7eea..abdb5c551c9 100644 --- a/lib/Transforms/Tapir/TapirUtils.cpp +++ b/lib/Transforms/Tapir/TapirUtils.cpp @@ -688,11 +688,13 @@ bool llvm::attemptSyncRegionElimination(Instruction *SyncRegion) { } llvm::LoopSpawningHints::LoopSpawningHints(Loop *L) - : Strategy("spawn.strategy", ST_SEQ, HK_STRATEGY), - Grainsize("grainsize", 0, HK_GRAINSIZE), + : Strategy(StrategyPrefix(), ST_SEQ, HK_STRATEGY), + Grainsize(GrainsizePrefix(), 0, HK_GRAINSIZE), TheLoop(L) { // Populate values with existing loop metadata. - getHintsFromMetadata(); + for(auto& pair: getHintsFromMetadata(TheLoop)) { + setHint(pair.first, pair.second); + } } LoopSpawningHints::SpawningStrategy @@ -704,42 +706,6 @@ unsigned llvm::LoopSpawningHints::getGrainsize() const { return Grainsize.Value; } -void llvm::LoopSpawningHints::getHintsFromMetadata() { - MDNode *LoopID = TheLoop->getLoopID(); - if (!LoopID) - return; - - // First operand should refer to the loop id itself. - assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); - assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); - - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - const MDString *S = nullptr; - SmallVector Args; - - // The expected hint is either a MDString or a MDNode with the first - // operand a MDString. 
- if (const MDNode *MD = dyn_cast(LoopID->getOperand(i))) { - if (!MD || MD->getNumOperands() == 0) - continue; - S = dyn_cast(MD->getOperand(0)); - for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) - Args.push_back(MD->getOperand(i)); - } else { - S = dyn_cast(LoopID->getOperand(i)); - assert(Args.size() == 0 && "too many arguments for MDString"); - } - - if (!S) - continue; - - // Check if the hint starts with the loop metadata prefix. - StringRef Name = S->getString(); - if (Args.size() == 1) - setHint(Name, Args[0]); - } -} - /// Checks string hint with one operand and set value if valid. void llvm::LoopSpawningHints::setHint(StringRef Name, Metadata *Arg) { if (!Name.startswith(Prefix())) @@ -828,71 +794,6 @@ bool llvm::LoopSpawningHints::Hint::validate(unsigned Val) { return false; } -bool llvm::isBackendParallelFor(Loop* L) { - return LoopSpawningHints(L).getStrategy() != LoopSpawningHints::ST_SEQ; -} - - -/// Helper routine to get all exit blocks of a loop that are unreachable. -static void getEHExits(Loop *L, const BasicBlock *DesignatedExitBlock, - SmallVectorImpl &EHExits) { - SmallVector ExitBlocks; - L->getExitBlocks(ExitBlocks); - - SmallVector WorkList; - for (BasicBlock *Exit : ExitBlocks) { - if (Exit == DesignatedExitBlock) continue; - EHExits.push_back(Exit); - WorkList.push_back(Exit); - } - - // Traverse the CFG from these frontier blocks to find all blocks involved in - // exception-handling exit code. - SmallPtrSet Visited; - while (!WorkList.empty()) { - BasicBlock *BB = WorkList.pop_back_val(); - if (!Visited.insert(BB).second) - continue; - - // Check that the exception handling blocks do not reenter the loop. - assert(!L->contains(BB) && - "Exception handling blocks re-enter loop."); - - for (BasicBlock *Succ : successors(BB)) { - EHExits.push_back(Succ); - WorkList.push_back(Succ); - } - } -} - -/// Convert a pointer to an integer type. -/// -/// Copied from Transforms/Vectorizer/LoopVectorize.cpp. 
-static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { - if (Ty->isPointerTy()) - return DL.getIntPtrType(Ty); - - // It is possible that char's or short's overflow when we ask for the loop's - // trip count, work around this by changing the type size. - if (Ty->getScalarSizeInBits() < 32) - return Type::getInt32Ty(Ty->getContext()); - - return Ty; -} - -/// Get the wider of two integer types. -/// -/// Copied from Transforms/Vectorizer/LoopVectorize.cpp. -static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { - Ty0 = convertPointerToIntegerType(DL, Ty0); - Ty1 = convertPointerToIntegerType(DL, Ty1); - if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) - return Ty0; - return Ty1; -} - -#include "llvm/Analysis/LoopIterator.h" - STATISTIC(LoopsConvertedToDAC, "Number of Tapir loops converted to divide-and-conquer iteration spawning"); @@ -901,6 +802,7 @@ STATISTIC(LoopsConvertedToDAC, class DACLoopSpawning : public LoopOutline { public: TapirTarget* tapirTarget; + unsigned SpecifiedGrainsize; DACLoopSpawning(Loop *OrigLoop, unsigned Grainsize, ScalarEvolution &SE, LoopInfo *LI, DominatorTree *DT, @@ -920,59 +822,11 @@ class DACLoopSpawning : public LoopOutline { BasicBlock *Preheader = L->getLoopPreheader(); BasicBlock *Latch = L->getLoopLatch(); - DEBUG({ - LoopBlocksDFS DFS(L); - DFS.perform(LI); - dbgs() << "Blocks in loop (from DFS):\n"; - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) - dbgs() << *BB; - }); - using namespace ore; - // Check that this loop has a valid exit block after the latch. - if (!ExitBlock) { - DEBUG(dbgs() << "LS loop does not contain valid exit block after latch.\n"); - ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "InvalidLatchExit", - L->getStartLoc(), - Header) - << "invalid latch exit"); - return false; - } - - // Get special exits from this loop. - SmallVector EHExits; - getEHExits(L, ExitBlock, EHExits); - - // Check the exit blocks of the loop. 
- SmallVector ExitBlocks; - L->getExitBlocks(ExitBlocks); - - for (const BasicBlock *Exit : ExitBlocks) { - if (Exit == ExitBlock) continue; - if (Exit->isLandingPad()) { - DEBUG({ - const LandingPadInst *LPI = Exit->getLandingPadInst(); - dbgs() << "landing pad found: " << *LPI << "\n"; - for (const User *U : LPI->users()) - dbgs() << "\tuser " << *U << "\n"; - }); - } - } SmallPtrSet HandledExits; - for (BasicBlock *BB : EHExits) - HandledExits.insert(BB); - for (BasicBlock *Exit : ExitBlocks) { - if (Exit == ExitBlock) continue; - if (!HandledExits.count(Exit)) { - DEBUG(dbgs() << "LS loop contains a bad exit block " << *Exit); - ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "BadExit", - L->getStartLoc(), - Header) - << "bad exit block found"); - return false; - } - } + if (!getHandledExits(Header, HandledExits)) + return false; Module* M = OrigFunction->getParent(); @@ -981,61 +835,28 @@ class DACLoopSpawning : public LoopOutline { DEBUG(dbgs() << "LS SE exit count: " << *(SE.getExitCount(L, Latch)) << "\n"); /// Get loop limit. - const SCEV *Limit = SE.getExitCount(L, Latch); - DEBUG(dbgs() << "LS Loop limit: " << *Limit << "\n"); - // PredicatedScalarEvolution PSE(SE, *L); - // const SCEV *PLimit = PSE.getExitCount(L, Latch); - // DEBUG(dbgs() << "LS predicated loop limit: " << *PLimit << "\n"); - // emitAnalysis(LoopSpawningReport() - // << "computed loop limit " << *Limit << "\n"); - if (SE.getCouldNotCompute() == Limit) { - DEBUG(dbgs() << "SE could not compute loop limit.\n"); - ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "UnknownLoopLimit", - L->getStartLoc(), - Header) - << "could not compute limit"); - return false; - } - - /// Determine the type of the canonical IV. 
- Type *CanonicalIVTy = Limit->getType(); - { - const DataLayout &DL = M->getDataLayout(); - for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { - PHINode *PN = cast(II); - if (PN->getType()->isFloatingPointTy()) continue; - CanonicalIVTy = getWiderType(DL, PN->getType(), CanonicalIVTy); - } - Limit = SE.getNoopOrAnyExtend(Limit, CanonicalIVTy); - } - /// Clean up the loop's induction variables. - PHINode *CanonicalIV = canonicalizeIVs(CanonicalIVTy); - if (!CanonicalIV) { - DEBUG(dbgs() << "Could not get canonical IV.\n"); - // emitAnalysis(LoopSpawningReport() - // << "Could not get a canonical IV.\n"); - ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoCanonicalIV", - L->getStartLoc(), - Header) - << "could not find or create canonical IV"); - return false; - } - - // Remove the IV's (other than CanonicalIV) and replace them with - // their stronger forms. - // - // TODO?: We can probably adapt this loop->DAC process such that we - // don't require all IV's to be canonical. + const SCEV *Limit = getLimit(); + if (!Limit) return false; + + /// Clean up the loop's induction variable. + PHINode *CanonicalIV = canonicalizeIVs(Limit->getType()); + if (!CanonicalIV) return false; + + // Remove the IV's (other than CanonicalIV) and replace them with + // their stronger forms. + // + // TODO?: We can probably adapt this loop->DAC process such that we + // don't require all IV's to be canonical. SmallVector IVs; - SCEVExpander Exp(SE, M->getDataLayout(), "ls"); - if (!removeNonCanonicalIVs(Header, Preheader, CanonicalIV, IVs, Exp)) + if (!removeNonCanonicalIVs(Header, Preheader, CanonicalIV, IVs)) return false; const SCEVAddRecExpr *CanonicalSCEV = cast(SE.getSCEV(CanonicalIV)); // Insert the computation for the loop limit into the Preheader. 
- Value *LimitVar = Exp.expandCodeFor(Limit, CanonicalIVTy, + SCEVExpander Exp(SE, M->getDataLayout(), "ls"); + Value *LimitVar = Exp.expandCodeFor(Limit, Limit->getType(), Preheader->getTerminator()); DEBUG(dbgs() << "LimitVar: " << *LimitVar << "\n"); @@ -1048,18 +869,18 @@ class DACLoopSpawning : public LoopOutline { // Insert computation of grainsize into the Preheader. Value *GrainVar; if (!SpecifiedGrainsize) - GrainVar = computeGrainsize(LimitVar); + GrainVar = computeGrainsize(LimitVar, tapirTarget); else GrainVar = ConstantInt::get(LimitVar->getType(), SpecifiedGrainsize); DEBUG(dbgs() << "GrainVar: " << *GrainVar << "\n"); + /// Clone the loop into a new function. // Get the inputs and outputs for the Loop blocks. SetVector Inputs, Outputs; SetVector BodyInputs, BodyOutputs; - ValueToValueMapTy VMap, InputMap; - std::vector LoopBlocks; + ValueToValueMapTy VMap; SmallPtrSet ExitsToSplit; Value *SRetInput = nullptr; @@ -1071,20 +892,10 @@ class DACLoopSpawning : public LoopOutline { } // Add start iteration, end iteration, and grainsize to inputs. - { - LoopBlocks = L->getBlocks(); - // Add unreachable and exception-handling exits to the set of loop blocks to - // clone. 
- DEBUG({ - dbgs() << "Handled exits of loop:"; - for (BasicBlock *HE : HandledExits) - dbgs() << *HE; - dbgs() << "\n"; - }); - - for (BasicBlock *HE : HandledExits) - LoopBlocks.push_back(HE); + // Blocks to clone are all those in loop and unreachable / exception-handling exits + std::vector LoopBlocks(L->getBlocks()); + LoopBlocks.insert(LoopBlocks.end(), HandledExits.begin(), HandledExits.end()); { const DetachInst *DI = cast(Header->getTerminator()); @@ -1107,12 +918,12 @@ class DACLoopSpawning : public LoopOutline { if (OrigFunction->hasStructRetAttr()) { Function::arg_iterator ArgIter = OrigFunction->arg_begin(); if (OrigFunction->hasParamAttribute(0, Attribute::StructRet)) - if (BodyInputs.count(&*ArgIter)) - SRetInput = &*ArgIter; + if (BodyInputs.count(&*ArgIter)) + SRetInput = &*ArgIter; if (OrigFunction->hasParamAttribute(1, Attribute::StructRet)) { - ++ArgIter; - if (BodyInputs.count(&*ArgIter)) - SRetInput = &*ArgIter; + ++ArgIter; + if (BodyInputs.count(&*ArgIter)) + SRetInput = &*ArgIter; } } if (SRetInput) { @@ -1120,93 +931,50 @@ class DACLoopSpawning : public LoopOutline { Inputs.insert(SRetInput); } - // Add argument for start of CanonicalIV. - DEBUG({ - Value *CanonicalIVInput = - CanonicalIV->getIncomingValueForBlock(Preheader); - // CanonicalIVInput should be the constant 0. - assert(isa(CanonicalIVInput) && - "Input to canonical IV from preheader is not constant."); - }); - Argument *StartArg = new Argument(CanonicalIV->getType(), - CanonicalIV->getName()+".start"); - Inputs.insert(StartArg); - InputMap[CanonicalIV] = StartArg; - - // Add argument for end. - // - // In the general case, the loop limit is the result of some computation - // that the pass added to the loop's preheader. In this case, the variable - // storing the loop limit is used exactly once, in the canonicalized loop - // latch. In this case, the pass wants to prevent outlining from passing - // the loop-limit variable as an arbitrary argument to the outlined - // function. 
Hence, this pass adds the loop-limit variable as an argument - // manually. - // - // There are two special cases to consider: the loop limit is a constant, or - // the loop limit is used elsewhere within the loop. To handle these two - // cases, this pass adds an explict argument for the end of the loop, to - // supports the subsequent transformation to using recursive - // divide-and-conquer. After the loop is outlined, this pass will rewrite - // the latch in the outlined loop to use this explicit argument. - // Furthermore, this pass does not prevent outliner from recognizing the - // loop limit as a potential argument to the function. - if (isa(LimitVar) || !LimitVar->hasOneUse()) { - Argument *EndArg = new Argument(LimitVar->getType(), "end"); - Inputs.insert(EndArg); - InputMap[LimitVar] = EndArg; - } else { - // If the limit var is not constant and has exactly one use, then the - // limit var is the result of some nontrivial computation, and that one - // use is the new condition inserted. - Inputs.insert(LimitVar); - InputMap[LimitVar] = LimitVar; - } + Value *CanonicalIVInput = CanonicalIV->getIncomingValueForBlock(Preheader); - // Add argument for grainsize. - if (isa(GrainVar)) { - Argument *GrainArg = new Argument(GrainVar->getType(), "grainsize"); - Inputs.insert(GrainArg); - InputMap[GrainVar] = GrainArg; - } else { - Inputs.insert(GrainVar); - InputMap[GrainVar] = GrainVar; - } + // CanonicalIVInput should be the constant 0. + assert(isa(CanonicalIVInput) && + "Input to canonical IV from preheader is not constant."); + + // Add explicit argument for loop start. + Value* startArg = ensureDistinctArgument(CanonicalIVInput, "start"); + Inputs.insert(startArg); + + // Add explicit argument for loop end. + Value* limitArg = ensureDistinctArgument(LimitVar, "end"); + Inputs.insert(limitArg); + + // Add explicit argument for grainsize. 
+ Value* grainArg = ensureDistinctArgument(GrainVar, "grainsize"); + Inputs.insert(grainArg); // Put all of the inputs together, and clear redundant inputs from // the set for the loop body. - SmallVector BodyInputsToRemove; for (Value *V : BodyInputs) - if (V == InputSyncRegion) - BodyInputsToRemove.push_back(V); - else if (!Inputs.count(V)) + if (V != InputSyncRegion && !Inputs.count(V)) { Inputs.insert(V); - else - BodyInputsToRemove.push_back(V); - for (Value *V : BodyInputsToRemove) - BodyInputs.remove(V); - DEBUG({ - for (Value *V : BodyInputs) - dbgs() << "Remaining body input: " << *V << "\n"; - }); - for (Value *V : BodyOutputs) - dbgs() << "EL output: " << *V << "\n"; + DEBUG({ dbgs() << "Remaining body input: " << *V << "\n"; }); + } + + DEBUG({ + for (Value *V : BodyOutputs) + dbgs() << "EL output: " << *V << "\n"; + }); assert(0 == BodyOutputs.size() && "All results from parallel loop should be passed by memory already."); - } + DEBUG({ for (Value *V : Inputs) dbgs() << "EL input: " << *V << "\n"; for (Value *V : Outputs) dbgs() << "EL output: " << *V << "\n"; - }); + }); // Clone the loop blocks into a new helper function. Function *Helper; { - SmallVector Returns; // Ignore returns cloned. - - // LowerDbgDeclare(*(Header->getParent())); + SmallVector Returns; // Ignore returns cloned. Helper = CreateHelper(Inputs, Outputs, LoopBlocks, Header, Preheader, ExitBlock, @@ -1244,52 +1012,8 @@ class DACLoopSpawning : public LoopOutline { PHINode *NewCanonicalIV = cast(VMap[CanonicalIV]); // Rewrite the cloned IV's to start at the start iteration argument. - { - // Rewrite clone of canonical IV to start at the start iteration - // argument. 
- Argument *NewCanonicalIVStart = cast(VMap[InputMap[CanonicalIV]]); - - { - int NewPreheaderIdx = NewCanonicalIV->getBasicBlockIndex(NewPreheader); - assert(isa(NewCanonicalIV->getIncomingValue(NewPreheaderIdx)) && - "Cloned canonical IV does not inherit a constant value from cloned preheader."); - NewCanonicalIV->setIncomingValue(NewPreheaderIdx, NewCanonicalIVStart); - } - - // Rewrite other cloned IV's to start at their value at the start - // iteration. - const SCEV *StartIterSCEV = SE.getSCEV(NewCanonicalIVStart); - DEBUG(dbgs() << "StartIterSCEV: " << *StartIterSCEV << "\n"); - for (PHINode *IV : IVs) { - if (CanonicalIV == IV) continue; - - // Get the value of the IV at the start iteration. - DEBUG(dbgs() << "IV " << *IV); - const SCEV *IVSCEV = SE.getSCEV(IV); - DEBUG(dbgs() << " (SCEV " << *IVSCEV << ")"); - const SCEVAddRecExpr *IVSCEVAddRec = cast(IVSCEV); - const SCEV *IVAtIter = IVSCEVAddRec->evaluateAtIteration(StartIterSCEV, SE); - DEBUG(dbgs() << " expands at iter " << *StartIterSCEV << - " to " << *IVAtIter << "\n"); - - // NOTE: Expanded code should not refer to other IV's. - Value *IVStart = Exp.expandCodeFor(IVAtIter, IVAtIter->getType(), - NewPreheader->getTerminator()); - - // Set the value that the cloned IV inherits from the cloned preheader. - PHINode *NewIV = cast(VMap[IV]); - int NewPreheaderIdx = NewIV->getBasicBlockIndex(NewPreheader); - assert(isa(NewIV->getIncomingValue(NewPreheaderIdx)) && - "Cloned IV does not inherit a constant value from cloned preheader."); - NewIV->setIncomingValue(NewPreheaderIdx, IVStart); - } - - // Remap the newly added instructions in the new preheader to use - // values local to the helper. 
- for (Instruction &II : *NewPreheader) - RemapInstruction(&II, VMap, RF_IgnoreMissingLocals, - /*TypeMapper=*/nullptr, /*Materializer=*/nullptr); - } + Argument *NewCanonicalIVStart = cast(VMap[startArg]); + setIVStartingValues(NewCanonicalIVStart, CanonicalIV, IVs, NewPreheader, VMap); // The loop has been outlined by this point. To handle the special cases // where the loop limit was constant or used elsewhere within the loop, this @@ -1300,15 +1024,18 @@ class DACLoopSpawning : public LoopOutline { assert(((isa(LimitVar) && HelperCond->getOperand(1) == LimitVar) || (!LimitVar->hasOneUse() && - HelperCond->getOperand(1) == VMap[LimitVar])) && + HelperCond->getOperand(1) == limitArg)) && "Unexpected condition in loop latch."); IRBuilder<> Builder(HelperCond); Value *NewHelperCond = Builder.CreateICmpULT(HelperCond->getOperand(0), - VMap[InputMap[LimitVar]]); + VMap[limitArg]); HelperCond->replaceAllUsesWith(NewHelperCond); HelperCond->eraseFromParent(); DEBUG(dbgs() << "Rewritten Latch: " << *(cast(NewHelperCond)->getParent())); + } else { + CmpInst *HelperCond = cast(VMap[NewCond]); + assert(HelperCond->getOperand(1) == VMap[limitArg]); } // DEBUGGING: Simply serialize the cloned loop. @@ -1317,8 +1044,8 @@ class DACLoopSpawning : public LoopOutline { implementDACIterSpawnOnHelper(Helper, NewPreheader, cast(VMap[Header]), cast(VMap[CanonicalIV]), - cast(VMap[InputMap[LimitVar]]), - cast(VMap[InputMap[GrainVar]]), + cast(VMap[limitArg]), + cast(VMap[grainArg]), cast(VMap[InputSyncRegion]), /*DT=*/nullptr, /*LI=*/nullptr, CanonicalSCEV->getNoWrapFlags(SCEV::FlagNUW), @@ -1461,40 +1188,7 @@ class DACLoopSpawning : public LoopOutline { virtual ~DACLoopSpawning() {} protected: - /// \brief Compute the grainsize of the loop, based on the limit. - /// - /// The grainsize is computed by the following equation: - /// - /// Grainsize = min(2048, ceil(Limit / (8 * workers))) - /// - /// This computation is inserted into the preheader of the loop. 
- /// - /// TODO: This method is the only method that depends on the CilkABI. - /// Generalize this method for other grainsize calculations and to query TLI. - Value* computeGrainsize(Value *Limit) { - Loop *L = OrigLoop; - Value *Grainsize; - BasicBlock *Preheader = L->getLoopPreheader(); - assert(Preheader && "No Preheader found for loop."); - - IRBuilder<> Builder(Preheader->getTerminator()); - - // Get 8 * workers - Value *Workers8 = Builder.CreateIntCast(tapirTarget->GetOrCreateWorker8(*Preheader->getParent()), - Limit->getType(), false); - // Compute ceil(limit / 8 * workers) = (limit + 8 * workers - 1) / (8 * workers) - Value *SmallLoopVal = - Builder.CreateUDiv(Builder.CreateSub(Builder.CreateAdd(Limit, Workers8), - ConstantInt::get(Limit->getType(), 1)), - Workers8); - // Compute min - Value *LargeLoopVal = ConstantInt::get(Limit->getType(), 2048); - Value *Cmp = Builder.CreateICmpULT(LargeLoopVal, SmallLoopVal); - Grainsize = Builder.CreateSelect(Cmp, LargeLoopVal, SmallLoopVal); - - return Grainsize; - } /// \brief Method to help convertLoopToDACIterSpawn convert the Tapir /// loop cloned into function Helper to spawn its iterations in a @@ -1677,7 +1371,6 @@ void implementDACIterSpawnOnHelper(Function *Helper, } } - unsigned SpecifiedGrainsize; }; bool llvm::TapirTarget::processDACLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, diff --git a/lib/Transforms/Utils/LLVMBuild.txt b/lib/Transforms/Utils/LLVMBuild.txt index ece0ad4dbf4..df7f4f438e1 100644 --- a/lib/Transforms/Utils/LLVMBuild.txt +++ b/lib/Transforms/Utils/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = TransformUtils parent = Transforms -required_libraries = Analysis Core Support +required_libraries = Analysis Core Support \ No newline at end of file diff --git a/tools/polly b/tools/polly index c34815ffbe3..0f95b7d575e 160000 --- a/tools/polly +++ b/tools/polly @@ -1 +1 @@ -Subproject commit c34815ffbe3bf448cf1a16f46aa342b574e477a8 +Subproject commit 
0f95b7d575ea43eb36bb0279610d51154f1c761d From ce4802c730ba54828b73d3ef494f5adfbec1821b Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Thu, 26 Jul 2018 10:10:51 -0400 Subject: [PATCH 10/16] continued rework w EH test --- include/llvm/Transforms/Tapir/LoopSpawning.h | 19 ++-- include/llvm/Transforms/Tapir/Outline.h | 25 ++--- include/llvm/Transforms/Tapir/PTXABI.h | 4 +- lib/Transforms/Tapir/CilkABI.cpp | 41 ++++--- lib/Transforms/Tapir/LoopSpawning.cpp | 18 +++- lib/Transforms/Tapir/Outline.cpp | 3 +- lib/Transforms/Tapir/PTXABI.cpp | 58 +++++----- lib/Transforms/Tapir/TapirUtils.cpp | 86 ++++++++------- test/Transforms/Tapir/loopspawning-eh.ll | 106 +++++++++++++++++++ test/Transforms/Tapir/sret-param.ll | 8 +- 10 files changed, 245 insertions(+), 123 deletions(-) create mode 100644 test/Transforms/Tapir/loopspawning-eh.ll diff --git a/include/llvm/Transforms/Tapir/LoopSpawning.h b/include/llvm/Transforms/Tapir/LoopSpawning.h index 7da595679c6..8ad52762c6c 100644 --- a/include/llvm/Transforms/Tapir/LoopSpawning.h +++ b/include/llvm/Transforms/Tapir/LoopSpawning.h @@ -37,8 +37,8 @@ namespace llvm { class LoopOutline { public: inline LoopOutline(Loop *OrigLoop, ScalarEvolution &SE, - LoopInfo *LI, DominatorTree *DT, - AssumptionCache *AC, + LoopInfo &LI, DominatorTree &DT, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE) : OrigLoop(OrigLoop), OrigFunction(OrigLoop->getHeader()->getParent()), SE(SE), LI(LI), DT(DT), AC(AC), ORE(ORE), ExitBlock(nullptr) @@ -90,14 +90,7 @@ class LoopOutline { // the var is used elsewhere within the loop. To handle these two // cases, this pass adds an explict argument for var, to ensure it isn't // clobberred by the other use or not passed because it is constant. 
- static inline Value* ensureDistinctArgument(Value* var, const Twine &name="") { - if (isa(var) || !var->hasOneUse()) { - Argument *argument = new Argument(var->getType(), name); - return argument; - } else { - return var; - } - } + Value* ensureDistinctArgument(const std::vector &LoopBlocks, Value* var, const Twine &name=""); void unlinkLoop(); @@ -113,11 +106,11 @@ class LoopOutline { // PredicatedScalarEvolution &PSE; ScalarEvolution &SE; /// Loop info. - LoopInfo *LI; + LoopInfo &LI; /// Dominator tree. - DominatorTree *DT; + DominatorTree &DT; /// Assumption cache. - AssumptionCache *AC; + AssumptionCache &AC; /// Interface to emit optimization remarks. OptimizationRemarkEmitter &ORE; diff --git a/include/llvm/Transforms/Tapir/Outline.h b/include/llvm/Transforms/Tapir/Outline.h index 546dac007c5..6d20d1e20ee 100644 --- a/include/llvm/Transforms/Tapir/Outline.h +++ b/include/llvm/Transforms/Tapir/Outline.h @@ -32,18 +32,19 @@ typedef SetVector ValueSet; /// definedInRegion - Return true if the specified value is used in the /// extracted region. 
template -static inline bool usedInRegion(const BasicBlockPtrContainer &Blocks, +static inline size_t countUseInRegion(const BasicBlockPtrContainer &Blocks, Value *V) { + size_t count = 0; if (Instruction *I = dyn_cast(V)) { for (User *U : I->users()) { if (Instruction *Inst = dyn_cast(U)) { if (std::find(Blocks.begin(), Blocks.end(), Inst->getParent()) != Blocks.end()) { - return true; + count++; } } } } - return false; + return count; } /// definedInRegion - Return true if the specified value is defined in the @@ -65,7 +66,7 @@ static inline bool definedInCaller(const BasicBlockPtrContainer &Blocks, Value *V) { if (isa(V)) return true; if (Instruction *I = dyn_cast(V)) - if (std::find(Blocks.begin(), Blocks.end(), I->getParent()) != Blocks.end()) + if (std::find(Blocks.begin(), Blocks.end(), I->getParent()) == Blocks.end()) return true; return false; } @@ -77,8 +78,8 @@ static inline bool definedInCaller(const BasicBlockPtrContainer &Blocks, template static inline void findInputsOutputs(const BasicBlockPtrContainer &&Blocks, ValueSet &Inputs, ValueSet &Outputs, - const SmallPtrSetImpl *ExitBlocks = nullptr, - DominatorTree *DT = nullptr) { + DominatorTree& DT, + const SmallPtrSetImpl *ExitBlocks = nullptr) { for (BasicBlock *BB : Blocks) { // If a used value is defined outside the region, it's an input. If an // instruction is used outside the region, it's an output. @@ -90,7 +91,7 @@ static inline void findInputsOutputs(const BasicBlockPtrContainer &&Blocks, // defined outside the region. 
if (ExitBlocks && ExitBlocks->count(BB)) if (PHINode *PN = dyn_cast(&II)) - if (std::find(Blocks.begin(), Blocks.end(), PN->getIncomingBlock(*OI)) != Blocks.end()) + if (std::find(Blocks.begin(), Blocks.end(), PN->getIncomingBlock(*OI)) == Blocks.end()) continue; if (definedInCaller(Blocks, *OI)) Inputs.insert(*OI); @@ -104,7 +105,7 @@ static inline void findInputsOutputs(const BasicBlockPtrContainer &&Blocks, // possible for the use to appear in a basic block that is no longer // alive. We use the DT to check that this use is still alive. if (Instruction *I = dyn_cast(U)) { - if (DT && DT->isReachableFromEntry(I->getParent())) { + if (DT.isReachableFromEntry(I->getParent())) { Outputs.insert(&II); break; } @@ -123,8 +124,8 @@ static inline void findInputsOutputs(const BasicBlockPtrContainer &&Blocks, template static inline void findInputsOutputs(const BasicBlockPtrContainer &Blocks, ValueSet &Inputs, ValueSet &Outputs, - const SmallPtrSetImpl *ExitBlocks = nullptr, - DominatorTree *DT = nullptr) { + DominatorTree& DT, + const SmallPtrSetImpl *ExitBlocks = nullptr) { for (BasicBlock *BB : Blocks) { // If a used value is defined outside the region, it's an input. If an // instruction is used outside the region, it's an output. @@ -136,7 +137,7 @@ static inline void findInputsOutputs(const BasicBlockPtrContainer &Blocks, // defined outside the region. if (ExitBlocks && ExitBlocks->count(BB)) if (PHINode *PN = dyn_cast(&II)) - if (std::find(Blocks.begin(), Blocks.end(), PN->getIncomingBlock(*OI)) != Blocks.end()) + if (std::find(Blocks.begin(), Blocks.end(), PN->getIncomingBlock(*OI)) == Blocks.end()) continue; if (definedInCaller(Blocks, *OI)) Inputs.insert(*OI); @@ -150,7 +151,7 @@ static inline void findInputsOutputs(const BasicBlockPtrContainer &Blocks, // possible for the use to appear in a basic block that is no longer // alive. We use the DT to check that this use is still alive. 
if (Instruction *I = dyn_cast(U)) { - if (DT && DT->isReachableFromEntry(I->getParent())) { + if (DT.isReachableFromEntry(I->getParent())) { Outputs.insert(&II); break; } diff --git a/include/llvm/Transforms/Tapir/PTXABI.h b/include/llvm/Transforms/Tapir/PTXABI.h index 829fd46bdcf..1bcd7fb227f 100644 --- a/include/llvm/Transforms/Tapir/PTXABI.h +++ b/include/llvm/Transforms/Tapir/PTXABI.h @@ -82,8 +82,8 @@ namespace llvm { class PTXABILoopSpawning : public LoopOutline { public: PTXABILoopSpawning(Loop *OrigLoop, ScalarEvolution &SE, - LoopInfo *LI, DominatorTree *DT, - AssumptionCache *AC, + LoopInfo &LI, DominatorTree &DT, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE) : LoopOutline(OrigLoop, SE, LI, DT, AC, ORE) {} diff --git a/lib/Transforms/Tapir/CilkABI.cpp b/lib/Transforms/Tapir/CilkABI.cpp index 1eb9197a40e..750ed7aff5f 100644 --- a/lib/Transforms/Tapir/CilkABI.cpp +++ b/lib/Transforms/Tapir/CilkABI.cpp @@ -408,8 +408,8 @@ static CallInst *EmitCilkSetJmp(IRBuilder<> &B, Value *SF, Module& M) { LLVMContext &Ctx = M.getContext(); // We always want to save the floating point state too - Triple T(M.getTargetTriple()); - if(T.getArch() == Triple::x86 || T.getArch() == Triple::x86_64) + Triple T(M.getTargetTriple()); + if(T.getArch() == Triple::x86 || T.getArch() == Triple::x86_64) EmitSaveFloatingPointState(B, SF); Type *Int32Ty = Type::getInt32Ty(Ctx); @@ -1300,8 +1300,8 @@ class CilkABILoopSpawning : public LoopOutline { unsigned SpecifiedGrainsize; CilkABILoopSpawning(Loop *OrigLoop, unsigned Grainsize, ScalarEvolution &SE, - LoopInfo *LI, DominatorTree *DT, - AssumptionCache *AC, + LoopInfo &LI, DominatorTree &DT, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE, TapirTarget* tapirTarget) : LoopOutline(OrigLoop, SE, LI, DT, AC, ORE), tapirTarget(tapirTarget), @@ -1385,6 +1385,13 @@ bool CilkABILoopSpawning::processLoop() { SmallPtrSet ExitsToSplit; AllocaInst* closure; + // Get the sync region containing this Tapir loop. 
+ Instruction *InputSyncRegion; + { + const DetachInst *DI = cast(Header->getTerminator()); + InputSyncRegion = cast(DI->getSyncRegion()); + } + // Add start iteration, end iteration, and grainsize to inputs. LoopBlocks = L->getBlocks(); @@ -1397,7 +1404,7 @@ bool CilkABILoopSpawning::processLoop() { const DetachInst *DI = cast(Header->getTerminator()); BasicBlockEdge DetachEdge(Header, DI->getDetached()); for (BasicBlock *HE : HandledExits) - if (!DT || !DT->dominates(DetachEdge, HE)) + if (!DT.dominates(DetachEdge, HE)) ExitsToSplit.insert(HE); DEBUG({ dbgs() << "Loop exits to split:"; @@ -1408,8 +1415,8 @@ bool CilkABILoopSpawning::processLoop() { } // Get the inputs and outputs for the loop body. - findInputsOutputs(LoopBlocks, BodyInputs, BodyOutputs, &ExitsToSplit); - + findInputsOutputs(LoopBlocks, BodyInputs, BodyOutputs, DT, &ExitsToSplit); + BodyInputs.remove(InputSyncRegion); Value *CanonicalIVInput = CanonicalIV->getIncomingValueForBlock(Preheader); @@ -1418,10 +1425,10 @@ bool CilkABILoopSpawning::processLoop() { "Input to canonical IV from preheader is not constant."); // Add explicit argument for loop start. - Value* startArg = ensureDistinctArgument(CanonicalIVInput, "start"); + Value* startArg = ensureDistinctArgument(LoopBlocks, CanonicalIVInput, "start"); // Add explicit argument for loop end. 
- Value* limitArg = ensureDistinctArgument(LimitVar, "end"); + Value* limitArg = ensureDistinctArgument(LoopBlocks, LimitVar, "end"); { // Put all of the inputs together, and clear redundant inputs from @@ -1480,7 +1487,7 @@ bool CilkABILoopSpawning::processLoop() { Header, Preheader, ExitBlock, VMap, M, OrigFunction->getSubprogram() != nullptr, Returns, ".ls", - &ExitsToSplit, nullptr, nullptr); + &ExitsToSplit, InputSyncRegion, nullptr, nullptr, nullptr); assert(Returns.empty() && "Returns cloned when cloning loop."); @@ -1499,12 +1506,12 @@ bool CilkABILoopSpawning::processLoop() { // where the loop limit was constant or used elsewhere within the loop, this // pass rewrites the outlined loop-latch condition to use the explicit // end-iteration argument. - if (isa(LimitVar) || !LimitVar->hasOneUse()) { + if (isa(LimitVar) || countUseInRegion(LoopBlocks, LimitVar) != 1) { CmpInst *HelperCond = cast(VMap[NewCond]); assert(((isa(LimitVar) && HelperCond->getOperand(1) == LimitVar) || - (!LimitVar->hasOneUse() && - HelperCond->getOperand(1) == limitArg)) && + (countUseInRegion(LoopBlocks, LimitVar) != 1 && + HelperCond->getOperand(1) == VMap[LimitVar] )) && "Unexpected condition in loop latch."); IRBuilder<> Builder(HelperCond); Value *NewHelperCond = Builder.CreateICmpULT(HelperCond->getOperand(0), @@ -1594,7 +1601,7 @@ bool CilkABILoopSpawning::processLoop() { } bool llvm::CilkABI::processLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, - AssumptionCache &AC, OptimizationRemarkEmitter &ORE) { + AssumptionCache &AC, OptimizationRemarkEmitter &ORE) { if (LSH.getStrategy() != LoopSpawningHints::ST_DAC) return false; @@ -1607,7 +1614,7 @@ bool llvm::CilkABI::processLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolu DebugLoc DLoc = L->getStartLoc(); BasicBlock *Header = L->getHeader(); - CilkABILoopSpawning DLS(L, LSH.getGrainsize(), SE, &LI, &DT, &AC, ORE, this); + CilkABILoopSpawning DLS(L, LSH.getGrainsize(), SE, LI, DT, AC, ORE, 
this); if (DLS.processLoop()) { DEBUG({ if (verifyFunction(*L->getHeader()->getParent())) { @@ -1633,5 +1640,5 @@ bool llvm::CilkABI::processLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolu return false; } - return false; -} \ No newline at end of file + return false; +} diff --git a/lib/Transforms/Tapir/LoopSpawning.cpp b/lib/Transforms/Tapir/LoopSpawning.cpp index 4b1d6cb3948..d28b91ea5e5 100644 --- a/lib/Transforms/Tapir/LoopSpawning.cpp +++ b/lib/Transforms/Tapir/LoopSpawning.cpp @@ -166,7 +166,7 @@ PHINode* LoopOutline::canonicalizeIVs(Type *Ty) { DEBUG(dbgs() << "LS Canonical induction variable " << *CanonicalIV << "\n"); SmallVector DeadInsts; - Exp.replaceCongruentIVs(L, DT, DeadInsts); + Exp.replaceCongruentIVs(L, &DT, DeadInsts); for (WeakTrackingVH V : DeadInsts) { DEBUG(dbgs() << "LS erasing dead inst " << *V << "\n"); Instruction *I = cast(V); @@ -289,6 +289,16 @@ bool LoopOutline::getHandledExits(BasicBlock* Header, SmallPtrSetImpl &LoopBlocks, Value* var, const Twine &name) { + if (isa(var) || countUseInRegion(LoopBlocks, var) != 1) { + Argument *argument = new Argument(var->getType(), name); + return argument; + } else { + return var; + } + } + // IVs is output bool LoopOutline::removeNonCanonicalIVs(BasicBlock* Header, BasicBlock* Preheader, PHINode* CanonicalIV, SmallVectorImpl &IVs) { assert(IVs.size() == 0); @@ -375,7 +385,7 @@ bool LoopOutline::removeNonCanonicalIVs(BasicBlock* Header, BasicBlock* Preheade } } if (!AllCanonical) - return false; + return false; return true; } @@ -426,7 +436,7 @@ const SCEV* LoopOutline::getLimit() { /// Determine the type of the canonical IV. 
Type *CanonicalIVTy = Limit->getType(); const DataLayout &DL = OrigFunction->getParent()->getDataLayout(); - + for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { PHINode *PN = cast(II); if (PN->getType()->isFloatingPointTy()) continue; @@ -439,7 +449,7 @@ const SCEV* LoopOutline::getLimit() { bool LoopOutline::setIVStartingValues(Value* newStart, Value* CanonicalIV, const SmallVectorImpl &IVs, BasicBlock* NewPreheader, ValueToValueMapTy &VMap) { if (auto startInst = dyn_cast(NewPreheader)) { - assert(DT->dominates(startInst, NewPreheader->getTerminator())); + assert(DT.dominates(startInst, NewPreheader->getTerminator())); } PHINode *NewCanonicalIV = cast(VMap[CanonicalIV]); diff --git a/lib/Transforms/Tapir/Outline.cpp b/lib/Transforms/Tapir/Outline.cpp index 6e87c3ffaa9..928647fdd18 100644 --- a/lib/Transforms/Tapir/Outline.cpp +++ b/lib/Transforms/Tapir/Outline.cpp @@ -114,10 +114,11 @@ void llvm::CloneIntoFunction( for (const BasicBlock *BB : Blocks) { BasicBlock *CBB = cast(VMap[BB]); // Loop over all instructions, fixing each one as we find it... - for (Instruction &II : *CBB) + for (Instruction &II : *CBB) { RemapInstruction(&II, VMap, ModuleLevelChanges ? 
RF_None : RF_NoModuleLevelChanges, TypeMapper, Materializer); + } } } diff --git a/lib/Transforms/Tapir/PTXABI.cpp b/lib/Transforms/Tapir/PTXABI.cpp index a0e03f061dc..6386e31579e 100644 --- a/lib/Transforms/Tapir/PTXABI.cpp +++ b/lib/Transforms/Tapir/PTXABI.cpp @@ -59,8 +59,8 @@ #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Vectorize.h" #include "llvm/Target/TargetSubtargetInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/TargetRegistry.h" #include "llvm/IR/LegacyPassManager.h" #include @@ -85,7 +85,7 @@ namespace { Function* getFunction(Module& M, const char* name){ return cast(M.getOrInsertFunction(name, TypeBuilder::get(M.getContext()))); - } + } template Value* convertInteger(B& b, Value* from, Value* to, const std::string& name){ @@ -104,7 +104,7 @@ namespace { return from; } - + } // namespace @@ -114,7 +114,7 @@ PTXABI::PTXABI() {} /// \brief Get/Create the worker count for the spawning function. 
Value *PTXABI::GetOrCreateWorker8(Function &F) { - Module *M = F.getParent(); + Module *M = F.getParent(); LLVMContext& C = M->getContext(); return ConstantInt::get(C, APInt(16, 8)); } @@ -186,7 +186,7 @@ bool PTXABILoopSpawning::processLoop(){ IntegerType* i64Ty = Type::getInt64Ty(c); PointerType* voidPtrTy = Type::getInt8PtrTy(c); - // and LLVM transformation is able in some cases to transform the loop to + // and LLVM transformation is able in some cases to transform the loop to // contain a phi node that exists at the entry block PHINode* loopNode = L->getCanonicalInductionVariable(); @@ -269,7 +269,7 @@ bool PTXABILoopSpawning::processLoop(){ extValues.insert(v); } } - + values.insert(&ii); } @@ -345,7 +345,7 @@ bool PTXABILoopSpawning::processLoop(){ // and simply return if the thread ID is beyond the run size BasicBlock* br = BasicBlock::Create(c, "entry", f); - + b.SetInsertPoint(br); using SREGFunc = uint32_t(); @@ -355,14 +355,14 @@ bool PTXABILoopSpawning::processLoop(){ Value* threadIdx = b.CreateCall(getFunction(ptxModule, "llvm.nvvm.read.ptx.sreg.tid.x")); - + Value* blockIdx = b.CreateCall(getFunction(ptxModule, "llvm.nvvm.read.ptx.sreg.ctaid.x")); - + Value* blockDim = b.CreateCall(getFunction(ptxModule, "llvm.nvvm.read.ptx.sreg.ntid.x")); - Value* threadId = + Value* threadId = b.CreateAdd(threadIdx, b.CreateMul(blockIdx, blockDim), "threadId"); // convert the thread ID into the proper integer type of the loop variable @@ -408,7 +408,7 @@ bool PTXABILoopSpawning::processLoop(){ continue; } - // determine if we are reading or writing the external variables + // determine if we are reading or writing the external variables // i.e. 
those passed as CUDA arrays Instruction* ic = ii.clone(); @@ -435,7 +435,7 @@ bool PTXABILoopSpawning::processLoop(){ extVars[gi] = v; if(isa(gi->getSourceElementType())){ auto cgi = dyn_cast(ic); - cgi->setSourceElementType(m[v]->getType()); + cgi->setSourceElementType(m[v]->getType()); } } } @@ -454,12 +454,12 @@ bool PTXABILoopSpawning::processLoop(){ // add the necessary NVPTX to mark the global function - NamedMDNode* annotations = + NamedMDNode* annotations = ptxModule.getOrInsertNamedMetadata("nvvm.annotations"); - + SmallVector av; - av.push_back(ValueAsMetadata::get(f)); + av.push_back(ValueAsMetadata::get(f)); av.push_back(MDString::get(ptxModule.getContext(), "kernel")); av.push_back(ValueAsMetadata::get(llvm::ConstantInt::get(i32Ty, 1))); @@ -493,7 +493,7 @@ bool PTXABILoopSpawning::processLoop(){ for(BasicBlock* bn : b->getTerminator()->successors()){ if(visited.find(bn) == visited.end()){ next.push_back(bn); - } + } } b->dropAllReferences(); @@ -521,11 +521,11 @@ bool PTXABILoopSpawning::processLoop(){ Triple triple(sys::getDefaultTargetTriple()); triple.setArch(Triple::nvptx64); - + // TODO: the version of LLVM that we are using currently only supports // up to SM_60 – we need SM_70 for Volta architectures - TargetMachine* targetMachine = + TargetMachine* targetMachine = target->createTargetMachine(triple.getTriple(), //"sm_35", //"sm_70", @@ -562,7 +562,7 @@ bool PTXABILoopSpawning::processLoop(){ SmallVector buf; raw_svector_ostream ostr(buf); - + bool fail = targetMachine->addPassesToEmitFile(*passManager, ostr, @@ -570,9 +570,9 @@ bool PTXABILoopSpawning::processLoop(){ false); assert(!fail && "failed to emit PTX"); - + passManager->run(ptxModule); - + delete passManager; std::string ptx = ostr.str().str(); @@ -581,7 +581,7 @@ bool PTXABILoopSpawning::processLoop(){ // create a global string to hold the PTX code - GlobalVariable* ptxGlobal = + GlobalVariable* ptxGlobal = new GlobalVariable(hostModule, pcs->getType(), true, @@ -630,7 +630,7 @@ 
bool PTXABILoopSpawning::processLoop(){ Constant* fn = ConstantDataArray::getString(c, ci->getName()); - GlobalVariable* fieldNameGlobal = + GlobalVariable* fieldNameGlobal = new GlobalVariable(hostModule, fn->getType(), true, @@ -649,7 +649,7 @@ bool PTXABILoopSpawning::processLoop(){ else if(auto ai = dyn_cast(v)){ Constant* fn = ConstantDataArray::getString(c, ai->getName()); - GlobalVariable* fieldNameGlobal = + GlobalVariable* fieldNameGlobal = new GlobalVariable(hostModule, fn->getType(), true, @@ -666,7 +666,7 @@ bool PTXABILoopSpawning::processLoop(){ elementSize = ConstantInt::get(i32Ty, at->getElementType()->getPrimitiveSizeInBits()/8); - + size = ConstantInt::get(i64Ty, at->getNumElements()); } @@ -724,7 +724,7 @@ bool PTXABILoopSpawning::processLoop(){ } bool llvm::PTXABI::processLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, - AssumptionCache &AC, OptimizationRemarkEmitter &ORE) { + AssumptionCache &AC, OptimizationRemarkEmitter &ORE) { if (LSH.getStrategy() != LoopSpawningHints::ST_GPU) return false; @@ -733,7 +733,7 @@ bool llvm::PTXABI::processLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolut { DebugLoc DLoc = L->getStartLoc(); BasicBlock *Header = L->getHeader(); - PTXABILoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE); + PTXABILoopSpawning DLS(L, SE, LI, DT, AC, ORE); if (DLS.processLoop()) { DEBUG({ if (verifyFunction(*L->getHeader()->getParent())) { @@ -760,5 +760,5 @@ bool llvm::PTXABI::processLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolut } } - return false; + return false; } diff --git a/lib/Transforms/Tapir/TapirUtils.cpp b/lib/Transforms/Tapir/TapirUtils.cpp index abdb5c551c9..6f8e5e4e2b7 100644 --- a/lib/Transforms/Tapir/TapirUtils.cpp +++ b/lib/Transforms/Tapir/TapirUtils.cpp @@ -291,7 +291,7 @@ Function *llvm::extractDetachBodyToFunction(DetachInst &detach, // Get the inputs and outputs for the detached CFG. 
SetVector Inputs, Outputs; SetVector BodyInputs; - findInputsOutputs(functionPieces, BodyInputs, Outputs, &ExitBlocks, &DT); + findInputsOutputs(functionPieces, BodyInputs, Outputs, DT, &ExitBlocks); assert(Outputs.empty() && "All results from detached CFG should be passed by memory already."); { @@ -405,7 +405,7 @@ bool llvm::isConstantMemoryFreeOperation(Instruction* I, bool allowsyncregion) { auto id = call->getCalledFunction()->getIntrinsicID(); return (id == Intrinsic::lifetime_start || id == Intrinsic::lifetime_end || - allowsyncregion && (id == Intrinsic::syncregion_start)); + (allowsyncregion && (id == Intrinsic::syncregion_start))); } return isa(I) || isa(I) || @@ -429,7 +429,7 @@ bool llvm::isConstantOperation(Instruction* I, bool allowsyncregion) { auto id = call->getCalledFunction()->getIntrinsicID(); return (id == Intrinsic::lifetime_start || id == Intrinsic::lifetime_end || - allowsyncregion && (id == Intrinsic::syncregion_start)); + (allowsyncregion && (id == Intrinsic::syncregion_start))); } return isa(I) || @@ -805,8 +805,8 @@ class DACLoopSpawning : public LoopOutline { unsigned SpecifiedGrainsize; DACLoopSpawning(Loop *OrigLoop, unsigned Grainsize, ScalarEvolution &SE, - LoopInfo *LI, DominatorTree *DT, - AssumptionCache *AC, + LoopInfo &LI, DominatorTree &DT, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE, TapirTarget* tapirTarget) : LoopOutline(OrigLoop, SE, LI, DT, AC, ORE), tapirTarget(tapirTarget), @@ -850,7 +850,7 @@ class DACLoopSpawning : public LoopOutline { SmallVector IVs; if (!removeNonCanonicalIVs(Header, Preheader, CanonicalIV, IVs)) return false; - + const SCEVAddRecExpr *CanonicalSCEV = cast(SE.getSCEV(CanonicalIV)); @@ -885,7 +885,7 @@ class DACLoopSpawning : public LoopOutline { Value *SRetInput = nullptr; // Get the sync region containing this Tapir loop. 
- const Instruction *InputSyncRegion; + Instruction *InputSyncRegion; { const DetachInst *DI = cast(Header->getTerminator()); InputSyncRegion = cast(DI->getSyncRegion()); @@ -900,9 +900,10 @@ class DACLoopSpawning : public LoopOutline { { const DetachInst *DI = cast(Header->getTerminator()); BasicBlockEdge DetachEdge(Header, DI->getDetached()); - for (BasicBlock *HE : HandledExits) - if (!DT || !DT->dominates(DetachEdge, HE)) + for (BasicBlock *HE : HandledExits) { + if (!DT.dominates(DetachEdge, HE)) ExitsToSplit.insert(HE); + } DEBUG({ dbgs() << "Loop exits to split:"; for (BasicBlock *ETS : ExitsToSplit) @@ -912,7 +913,26 @@ class DACLoopSpawning : public LoopOutline { } // Get the inputs and outputs for the loop body. - findInputsOutputs(LoopBlocks, BodyInputs, BodyOutputs, &ExitsToSplit); + findInputsOutputs(LoopBlocks, BodyInputs, BodyOutputs, DT, &ExitsToSplit); + BodyInputs.remove(InputSyncRegion); + + Value *CanonicalIVInput = CanonicalIV->getIncomingValueForBlock(Preheader); + + // CanonicalIVInput should be the constant 0. + assert(isa(CanonicalIVInput) && + "Input to canonical IV from preheader is not constant."); + + // Add explicit argument for loop start, removing from inputs if didn't make new var + Value* startArg = ensureDistinctArgument(LoopBlocks, CanonicalIVInput, "start"); + BodyInputs.remove(startArg); + + // Add explicit argument for loop end, removing from inputs if didn't make new var + Value* limitArg = ensureDistinctArgument(LoopBlocks, LimitVar, "end"); + BodyInputs.remove(limitArg); + + // Add explicit argument for grainsize, removing from inputs if didn't make new var + Value* grainArg = ensureDistinctArgument(LoopBlocks, GrainVar, "grainsize"); + BodyInputs.remove(grainArg); // Scan for any sret parameters in BodyInputs and add them first. 
if (OrigFunction->hasStructRetAttr()) { @@ -925,39 +945,24 @@ class DACLoopSpawning : public LoopOutline { if (BodyInputs.count(&*ArgIter)) SRetInput = &*ArgIter; } + if (SRetInput) BodyInputs.remove(SRetInput); } + + // Put all of the inputs together if (SRetInput) { DEBUG(dbgs() << "sret input " << *SRetInput << "\n"); Inputs.insert(SRetInput); } - Value *CanonicalIVInput = CanonicalIV->getIncomingValueForBlock(Preheader); - - // CanonicalIVInput should be the constant 0. - assert(isa(CanonicalIVInput) && - "Input to canonical IV from preheader is not constant."); - - // Add explicit argument for loop start. - Value* startArg = ensureDistinctArgument(CanonicalIVInput, "start"); Inputs.insert(startArg); - - // Add explicit argument for loop end. - Value* limitArg = ensureDistinctArgument(LimitVar, "end"); Inputs.insert(limitArg); - - // Add explicit argument for grainsize. - Value* grainArg = ensureDistinctArgument(GrainVar, "grainsize"); Inputs.insert(grainArg); - - // Put all of the inputs together, and clear redundant inputs from - // the set for the loop body. - for (Value *V : BodyInputs) - if (V != InputSyncRegion && !Inputs.count(V)) { + for (Value *V : BodyInputs) { Inputs.insert(V); DEBUG({ dbgs() << "Remaining body input: " << *V << "\n"; }); - } + } - DEBUG({ + DEBUG({ for (Value *V : BodyOutputs) dbgs() << "EL output: " << *V << "\n"; }); @@ -975,7 +980,6 @@ class DACLoopSpawning : public LoopOutline { Function *Helper; { SmallVector Returns; // Ignore returns cloned. 
- Helper = CreateHelper(Inputs, Outputs, LoopBlocks, Header, Preheader, ExitBlock, VMap, M, @@ -997,7 +1001,7 @@ class DACLoopSpawning : public LoopOutline { assert(isa(HelperExit->getTerminator())); BasicBlock *NewHelperExit = SplitBlock(HelperExit, HelperExit->getTerminator(), - DT, LI); + &DT, &LI); IRBuilder<> Builder(&(HelperExit->front())); SyncInst *NewSync = Builder.CreateSync( NewHelperExit, @@ -1019,12 +1023,12 @@ class DACLoopSpawning : public LoopOutline { // where the loop limit was constant or used elsewhere within the loop, this // pass rewrites the outlined loop-latch condition to use the explicit // end-iteration argument. - if (isa(LimitVar) || !LimitVar->hasOneUse()) { + if (isa(LimitVar) || countUseInRegion(LoopBlocks, LimitVar) != 1) { CmpInst *HelperCond = cast(VMap[NewCond]); assert(((isa(LimitVar) && HelperCond->getOperand(1) == LimitVar) || - (!LimitVar->hasOneUse() && - HelperCond->getOperand(1) == limitArg)) && + (countUseInRegion(LoopBlocks, LimitVar) != 1 && + HelperCond->getOperand(1) == VMap[LimitVar] )) && "Unexpected condition in loop latch."); IRBuilder<> Builder(HelperCond); Value *NewHelperCond = Builder.CreateICmpULT(HelperCond->getOperand(0), @@ -1101,7 +1105,7 @@ class DACLoopSpawning : public LoopOutline { // Add alignment assumptions to arguments of helper, based on alignment of // values in old function. AddAlignmentAssumptions(OrigFunction, Inputs, VMap, - Preheader->getTerminator(), AC, DT); + Preheader->getTerminator(), &AC, &DT); // Add call to new helper function in original function. { @@ -1119,8 +1123,9 @@ class DACLoopSpawning : public LoopOutline { // Add grainsize. TopCallArgs.push_back(GrainVar); // Add the rest of the arguments. 
- for (Value *V : BodyInputs) + for (Value *V : BodyInputs) { TopCallArgs.push_back(V); + } DEBUG({ for (Value *TCArg : TopCallArgs) dbgs() << "Top call arg: " << *TCArg << "\n"; @@ -1133,7 +1138,6 @@ class DACLoopSpawning : public LoopOutline { // Use a fast calling convention for the helper. TopCall->setCallingConv(CallingConv::Fast); - // TopCall->setCallingConv(Helper->getCallingConv()); TopCall->setDebugLoc(Header->getTerminator()->getDebugLoc()); // // Update CG graph with the call we just added. // CG[F]->addCalledFunction(TopCall, CG[Helper]); @@ -1382,7 +1386,7 @@ bool llvm::TapirTarget::processDACLoop(LoopSpawningHints LSH, LoopInfo &LI, Scal DebugLoc DLoc = L->getStartLoc(); BasicBlock *Header = L->getHeader(); - DACLoopSpawning DLS(L, LSH.getGrainsize(), SE, &LI, &DT, &AC, ORE, this); + DACLoopSpawning DLS(L, LSH.getGrainsize(), SE, LI, DT, AC, ORE, this); if (DLS.processLoop()) { DEBUG({ if (verifyFunction(*L->getHeader()->getParent())) { @@ -1407,5 +1411,5 @@ bool llvm::TapirTarget::processDACLoop(LoopSpawningHints LSH, LoopInfo &LI, Scal return false; } - return false; + return false; } diff --git a/test/Transforms/Tapir/loopspawning-eh.ll b/test/Transforms/Tapir/loopspawning-eh.ll new file mode 100644 index 00000000000..74632852395 --- /dev/null +++ b/test/Transforms/Tapir/loopspawning-eh.ll @@ -0,0 +1,106 @@ +; RUN: opt < %s -loop-spawning -ls-tapir-target=cilk -simplifycfg -S | FileCheck %s + +; CHECK: define internal fastcc void @foo_pfor.detach.ls(i64 %start.ls, i64 %.ls, i64 %grainsize.ls) #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + +; ModuleID = 'newstart.ll' +source_filename = "sret-test.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%"class.std::vector.0" = type { %"struct.std::_Vector_base.1" } +%"struct.std::_Vector_base.1" = type { %"struct.std::_Vector_base, std::allocator > >::_Vector_impl" } +%"struct.std::_Vector_base, std::allocator > 
>::_Vector_impl" = type { %"class.std::tuple"*, %"class.std::tuple"*, %"class.std::tuple"* } +%"class.std::tuple" = type { %"struct.std::_Tuple_impl.base", [4 x i8] } +%"struct.std::_Tuple_impl.base" = type <{ %"struct.std::_Tuple_impl.5", %"struct.std::_Head_base.8" }> +%"struct.std::_Tuple_impl.5" = type { %"struct.std::_Tuple_impl.6", %"struct.std::_Head_base.7" } +%"struct.std::_Tuple_impl.6" = type { %"struct.std::_Head_base" } +%"struct.std::_Head_base" = type { i32 } +%"struct.std::_Head_base.7" = type { double } +%"struct.std::_Head_base.8" = type { i32 } +%"class.std::vector" = type { %"struct.std::_Vector_base" } +%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base >::_Vector_impl" } +%"struct.std::_Vector_base >::_Vector_impl" = type { %struct.params*, %struct.params*, %struct.params* } +%struct.params = type { i32, i32, float, float, float, i32 } + +; Function Attrs: uwtable +define void @foo(%"class.std::vector.0"* noalias sret %agg.result, i64 %numiters, i64 %numiters2, i64 %numiters3, i32 %trials, %"class.std::vector"* nocapture readonly dereferenceable(24) %ps) #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %syncreg = tail call token @llvm.syncregion.start() + br label %pfor.detach + +pfor.detach: ; preds = %pfor.inc78, %entry + %indvars.iv395 = phi i64 [ 0, %entry ], [ %indvars.iv.next396, %pfor.inc78 ] + detach within %syncreg, label %pfor.body, label %pfor.inc78 + +pfor.body: ; preds = %pfor.detach + %cmp.i.i = call i1 @a() + br i1 %cmp.i.i, label %if.else.i.i, label %exit2601 + +if.else.i.i: ; preds = %pfor.body + invoke void @invokable2() + to label %exit260 unwind label %lpad64 + +lpad64: ; preds = %if.else.i.i + %lpad64v0 = landingpad { i8*, i32 } + cleanup + br label %invoke.cont.i + +exit260: ; preds = %if.else.i.i + reattach within %syncreg, label %pfor.inc78 + +exit2601: ; preds = %pfor.body + reattach within %syncreg, label %pfor.inc78 + +pfor.inc78: ; preds = %exit2601, %exit260, 
%pfor.detach + %indvars.iv.next396 = add nuw nsw i64 %indvars.iv395, 1 + %cmp = icmp slt i64 %indvars.iv.next396, %numiters + br i1 %cmp, label %pfor.detach, label %pfor.cond.cleanup, !llvm.loop !2 + +pfor.cond.cleanup: ; preds = %pfor.inc78 + sync within %syncreg, label %for.body90 + +for.body90: ; preds = %pfor.cond.cleanup + invoke void @invokable() + to label %exit220 unwind label %lpad103 + +lpad103: ; preds = %for.body90 + %lpad103v0 = landingpad { i8*, i32 } + cleanup + %lpad103v1 = extractvalue { i8*, i32 } %lpad103v0, 0 + %lpad103v2 = extractvalue { i8*, i32 } %lpad103v0, 1 + br label %invoke.cont.i + +invoke.cont.i: ; preds = %lpad103, %lpad64 + %ehselector.slot.0 = phi i32 [ %lpad103v2, %lpad103 ], [ undef, %lpad64 ] + %exn.slot.0 = phi i8* [ %lpad103v1, %lpad103 ], [ undef, %lpad64 ] + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn.slot.0, 0 + %lpad.val117 = insertvalue { i8*, i32 } %lpad.val, i32 %ehselector.slot.0, 1 + resume { i8*, i32 } %lpad.val117 + +exit220: ; preds = %for.body90 + ret void +} + +declare i1 @a() + +declare i32 @__gxx_personality_v0(...) 
+ +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +; Function Attrs: uwtable +declare void @invokable() #0 + +; Function Attrs: uwtable +declare void @invokable2() #0 + +attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Tapir-Clang.git 245c29d5cb99796c4107fd83f9bbe668c130b275) (git@github.com:wsmoses/Tapir-LLVM.git 7352407d063c8bac796926ca618e14d8eca87735)"} +!2 = distinct !{!2, !3} +!3 = !{!"tapir.loop.spawn.strategy", i32 1} diff --git a/test/Transforms/Tapir/sret-param.ll b/test/Transforms/Tapir/sret-param.ll index bc22de67c64..2ddce025cf9 100644 --- a/test/Transforms/Tapir/sret-param.ll +++ b/test/Transforms/Tapir/sret-param.ll @@ -887,16 +887,16 @@ _ZNSt12_Vector_baseISt5tupleIJidiEESaIS1_EE13_M_deallocateEPS1_m.exit64: ; preds ret void } -; LS-LABEL: define internal fastcc void @_Z14func_with_sretidRSt6vectorI6paramsSaIS0_EE_pfor.detach.ls(%"class.std::vector.0"* noalias sret align 8 %agg.result.ls, i64 %indvars.iv395.start.ls, i64 %end.ls, i64 %.ls, +; LS-LABEL: define internal fastcc void @_Z14func_with_sretidRSt6vectorI6paramsSaIS0_EE_pfor.detach.ls(%"class.std::vector.0"* noalias sret align 8 %agg.result.ls, ; LS: {{^.split:}} -; LS-NEXT: call fastcc void @_Z14func_with_sretidRSt6vectorI6paramsSaIS0_EE_pfor.detach.ls(%"class.std::vector.0"* %agg.result.ls, i64 %indvars.iv395.ls.dac, i64 %miditer, i64 %.ls, +; LS-NEXT: call fastcc void 
@_Z14func_with_sretidRSt6vectorI6paramsSaIS0_EE_pfor.detach.ls(%"class.std::vector.0"* %agg.result.ls, ; LS: {{^pfor.detach30.preheader.ls:}} ; LS: call fastcc void @_Z14func_with_sretidRSt6vectorI6paramsSaIS0_EE_pfor.detach.ls_pfor.detach30.ls.ls(%"class.std::vector.0"* %agg.result.ls, i64 0, -; LS-LABEL: define internal fastcc void @_Z14func_with_sretidRSt6vectorI6paramsSaIS0_EE_pfor.detach.ls_pfor.detach30.ls.ls(%"class.std::vector.0"* noalias sret align 8 %agg.result.ls.ls, i64 %indvars.iv391.ls.start.ls, i64 %end.ls, i64 %.ls, +; LS-LABEL: define internal fastcc void @_Z14func_with_sretidRSt6vectorI6paramsSaIS0_EE_pfor.detach.ls_pfor.detach30.ls.ls(%"class.std::vector.0"* noalias sret align 8 %agg.result.ls.ls, ; LS: {{^.split:}} -; LS: call fastcc void @_Z14func_with_sretidRSt6vectorI6paramsSaIS0_EE_pfor.detach.ls_pfor.detach30.ls.ls(%"class.std::vector.0"* %agg.result.ls.ls, i64 %indvars.iv391.ls.ls.dac, i64 %miditer, i64 %.ls, +; LS: call fastcc void @_Z14func_with_sretidRSt6vectorI6paramsSaIS0_EE_pfor.detach.ls_pfor.detach30.ls.ls(%"class.std::vector.0"* %agg.result.ls.ls, ; TT-LABEL: define internal fastcc void @_Z14func_with_sretidRSt6vectorI6paramsSaIS0_EE_pfor.body.cilk(%"class.std::vector.0"* noalias sret align 8 %agg.result.cilk, ; TT: {{^pfor.detach30.cilk.split:}} From 37605e1b4b822421b170d43188259a18997eeb09 Mon Sep 17 00:00:00 2001 From: "William S. 
Moses" Date: Thu, 26 Jul 2018 12:36:01 -0400 Subject: [PATCH 11/16] Continue refactor --- include/llvm/Transforms/Tapir/CilkABI.h | 3 +- include/llvm/Transforms/Tapir/LoopSpawning.h | 5 - include/llvm/Transforms/Tapir/TapirTypes.h | 4 +- lib/Transforms/Tapir/CilkABI.cpp | 5 +- lib/Transforms/Tapir/LoopSpawning.cpp | 309 +++++++----------- lib/Transforms/Tapir/TapirToTarget.cpp | 4 +- lib/Transforms/Tapir/TapirUtils.cpp | 6 +- .../Tapir/{looplimit.ll => dac-looplimit.ll} | 0 ...pspawning-eh.ll => dac-loopspawning-eh.ll} | 0 .../{sret-param.ll => dac-sret-param.ll} | 0 test/Transforms/Tapir/oldcilk-looplimit.ll | 96 ++++++ .../Tapir/oldcilk-loopspawning-eh.ll | 106 ++++++ .../Tapir/oldcilk-loopspawning-simple.ll | 98 ++++++ .../Tapir/oldcilk-loopspawning-vec.ll | 51 +++ tools/clang | 2 +- 15 files changed, 488 insertions(+), 201 deletions(-) rename test/Transforms/Tapir/{looplimit.ll => dac-looplimit.ll} (100%) rename test/Transforms/Tapir/{loopspawning-eh.ll => dac-loopspawning-eh.ll} (100%) rename test/Transforms/Tapir/{sret-param.ll => dac-sret-param.ll} (100%) create mode 100644 test/Transforms/Tapir/oldcilk-looplimit.ll create mode 100644 test/Transforms/Tapir/oldcilk-loopspawning-eh.ll create mode 100644 test/Transforms/Tapir/oldcilk-loopspawning-simple.ll create mode 100644 test/Transforms/Tapir/oldcilk-loopspawning-vec.ll diff --git a/include/llvm/Transforms/Tapir/CilkABI.h b/include/llvm/Transforms/Tapir/CilkABI.h index 61f1a0b878e..4ae7da31214 100644 --- a/include/llvm/Transforms/Tapir/CilkABI.h +++ b/include/llvm/Transforms/Tapir/CilkABI.h @@ -42,8 +42,9 @@ namespace llvm { class CilkABI : public TapirTarget { + const bool _useRuntimeForLoop; public: - CilkABI(); + CilkABI(bool useRuntimeForLoop); Value *GetOrCreateWorker8(Function &F) override final; void createSync(SyncInst &inst, ValueToValueMapTy &DetachCtxToStackFrame) override final; diff --git a/include/llvm/Transforms/Tapir/LoopSpawning.h b/include/llvm/Transforms/Tapir/LoopSpawning.h index 
8ad52762c6c..50de34e807b 100644 --- a/include/llvm/Transforms/Tapir/LoopSpawning.h +++ b/include/llvm/Transforms/Tapir/LoopSpawning.h @@ -120,11 +120,6 @@ class LoopOutline { BasicBlock *ExitBlock; }; -/// The LoopSpawning Pass. -struct LoopSpawningPass : public PassInfoMixin { - TapirTarget* tapirTarget; - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); -}; } #endif // LLVM_TRANSFORMS_TAPIR_LOOPSPAWNING_H diff --git a/include/llvm/Transforms/Tapir/TapirTypes.h b/include/llvm/Transforms/Tapir/TapirTypes.h index 455e32dd8d8..6cd76e97c52 100644 --- a/include/llvm/Transforms/Tapir/TapirTypes.h +++ b/include/llvm/Transforms/Tapir/TapirTypes.h @@ -21,8 +21,8 @@ enum class TapirTargetType { None = 0, Serial = 1, Cilk = 2, - OpenMP = 3, - CilkR = 4, + CilkLegacy = 3, + OpenMP = 4, Qthreads = 5, PTX = 6 }; diff --git a/lib/Transforms/Tapir/CilkABI.cpp b/lib/Transforms/Tapir/CilkABI.cpp index 750ed7aff5f..a75f499298a 100644 --- a/lib/Transforms/Tapir/CilkABI.cpp +++ b/lib/Transforms/Tapir/CilkABI.cpp @@ -1156,7 +1156,8 @@ bool makeFunctionDetachable(Function &extracted, //############################################################################## -CilkABI::CilkABI() {} +CilkABI::CilkABI(bool useRuntimeForLoop) : + _useRuntimeForLoop(useRuntimeForLoop) {} /// \brief Get/Create the worker count for the spawning function. 
Value *CilkABI::GetOrCreateWorker8(Function &F) { @@ -1605,7 +1606,7 @@ bool llvm::CilkABI::processLoop(LoopSpawningHints LSH, LoopInfo &LI, ScalarEvolu if (LSH.getStrategy() != LoopSpawningHints::ST_DAC) return false; - if (LSH.getStrategy() == LoopSpawningHints::ST_DAC) + if (!_useRuntimeForLoop) return processDACLoop(LSH, LI, SE, DT, AC, ORE); DEBUG(dbgs() << "LS: Using CilkABI spawning.\n"); diff --git a/lib/Transforms/Tapir/LoopSpawning.cpp b/lib/Transforms/Tapir/LoopSpawning.cpp index d28b91ea5e5..b84cbb97046 100644 --- a/lib/Transforms/Tapir/LoopSpawning.cpp +++ b/lib/Transforms/Tapir/LoopSpawning.cpp @@ -70,7 +70,9 @@ static cl::opt ClTapirTarget( clEnumValN(TapirTargetType::Serial, "serial", "Serial code"), clEnumValN(TapirTargetType::Cilk, - "cilk", "Cilk Plus"), + "cilk", "Cilk Plus (with new loop backend)"), + clEnumValN(TapirTargetType::CilkLegacy, + "cilklegacy", "Cilk Plus (with ABI loop backend)"), clEnumValN(TapirTargetType::OpenMP, "openmp", "OpenMP"), clEnumValN(TapirTargetType::Qthreads, @@ -115,32 +117,7 @@ static void emitMissedWarning(Function *F, Loop *L, } } -struct LoopSpawningImpl { - LoopSpawningImpl(Function &F, - LoopInfo &LI, - ScalarEvolution &SE, - DominatorTree &DT, - AssumptionCache &AC, - OptimizationRemarkEmitter &ORE, - TapirTarget* tapirTarget) - : F(F), LI(LI), SE(SE), DT(DT), AC(AC), ORE(ORE), tapirTarget(tapirTarget) {} - - bool run(); - -private: - void addTapirLoop(Loop *L, SmallVectorImpl &V); - bool processLoop(Loop *L); - - Function &F; - LoopInfo &LI; - ScalarEvolution &SE; - DominatorTree &DT; - AssumptionCache &AC; - OptimizationRemarkEmitter &ORE; - - TapirTarget* tapirTarget; -}; -} // end anonymous namespace +} /// Canonicalize the induction variables in the loop. Return the canonical /// induction variable created or inserted by the scalar evolution expander. 
@@ -289,15 +266,14 @@ bool LoopOutline::getHandledExits(BasicBlock* Header, SmallPtrSetImpl &LoopBlocks, Value* var, const Twine &name) { - if (isa(var) || countUseInRegion(LoopBlocks, var) != 1) { - Argument *argument = new Argument(var->getType(), name); - return argument; - } else { - return var; - } +Value* LoopOutline::ensureDistinctArgument(const std::vector &LoopBlocks, Value* var, const Twine &name) { + if (isa(var) || countUseInRegion(LoopBlocks, var) != 1) { + Argument *argument = new Argument(var->getType(), name); + return argument; + } else { + return var; } +} // IVs is output bool LoopOutline::removeNonCanonicalIVs(BasicBlock* Header, BasicBlock* Preheader, PHINode* CanonicalIV, SmallVectorImpl &IVs) { @@ -586,39 +562,6 @@ void LoopOutline::unlinkLoop() { } } -/// This routine recursively examines all descendants of the specified loop and -/// adds all Tapir loops in that tree to the vector. This routine performs a -/// pre-order traversal of the tree of loops and pushes each Tapir loop found -/// onto the end of the vector. -void LoopSpawningImpl::addTapirLoop(Loop *L, SmallVectorImpl &V) { - if (isCanonicalTapirLoop(L)) { - V.push_back(L); - return; - } - - LoopSpawningHints Hints(L); - - DEBUG(dbgs() << "LS: Loop hints:" - << " strategy = " << Hints.printStrategy(Hints.getStrategy()) - << " grainsize = " << Hints.getGrainsize() - << "\n"); - - using namespace ore; - - if (LoopSpawningHints::ST_SEQ != Hints.getStrategy()) { - DEBUG(dbgs() << "LS: Marked loop is not a valid Tapir loop.\n" - << "\tLoop hints:" - << " strategy = " << Hints.printStrategy(Hints.getStrategy()) - << "\n"); - ORE.emit(OptimizationRemarkMissed(LS_NAME, "NotTapir", - L->getStartLoc(), L->getHeader()) - << "marked loop is not a valid Tapir loop"); - } - - for (Loop *InnerL : *L) - addTapirLoop(InnerL, V); -} - #ifndef NDEBUG /// \return string containing a file name and a line # for the given loop. 
static std::string getDebugLocString(const Loop *L) { @@ -636,134 +579,103 @@ static std::string getDebugLocString(const Loop *L) { } #endif -bool LoopSpawningImpl::run() { - // Build up a worklist of inner-loops to vectorize. This is necessary as - // the act of vectorizing or partially unrolling a loop creates new loops - // and can invalidate iterators across the loops. - SmallVector Worklist; - - // Examine all top-level loops in this function, and call addTapirLoop to push - // those loops onto the work list. - for (Loop *L : LI) - addTapirLoop(L, Worklist); - - LoopsAnalyzed += Worklist.size(); - - // Now walk the identified inner loops. - bool Changed = false; - while (!Worklist.empty()) - // Process the work list of loops backwards. For each tree of loops in this - // function, addTapirLoop pushed those loops onto the work list according to - // a pre-order tree traversal. Therefore, processing the work list - // backwards leads us to process innermost loops first. - Changed |= processLoop(Worklist.pop_back_val()); - - // Process each loop nest in the function. - return Changed; -} - - -// Top-level routine to process a given loop. 
-bool LoopSpawningImpl::processLoop(Loop *L) { -#ifndef NDEBUG - const std::string DebugLocStr = getDebugLocString(L); -#endif /* NDEBUG */ +namespace { +struct LoopSpawning : public FunctionPass { + /// Pass identification, replacement for typeid + static char ID; + TapirTarget* tapirTarget; + explicit LoopSpawning(TapirTarget* tapirTarget = nullptr) + : FunctionPass(ID), tapirTarget(tapirTarget) { + if (!this->tapirTarget) + this->tapirTarget = getTapirTargetFromType(ClTapirTarget); - // Function containing loop - Function *F = L->getHeader()->getParent(); + assert(this->tapirTarget); + initializeLoopSpawningPass(*PassRegistry::getPassRegistry()); + } - DEBUG(dbgs() << "\nLS: Checking a Tapir loop in \"" - << L->getHeader()->getParent()->getName() << "\" from " - << DebugLocStr << ": " << *L << "\n"); + /// This routine recursively examines all descendants of the specified loop and + /// adds all Tapir loops in that tree to the vector. This routine performs a + /// pre-order traversal of the tree of loops and pushes each Tapir loop found + /// onto the end of the vector. + void addTapirLoop(Loop *L, SmallVectorImpl &V, OptimizationRemarkEmitter &ORE) { + if (isCanonicalTapirLoop(L)) { + V.push_back(L); + return; + } - LoopSpawningHints Hints(L); + LoopSpawningHints Hints(L); - DEBUG(dbgs() << "LS: Loop hints:" - << " strategy = " << Hints.printStrategy(Hints.getStrategy()) - << " grainsize = " << Hints.getGrainsize() - << "\n"); + DEBUG(dbgs() << "LS: Loop hints:" + << " strategy = " << Hints.printStrategy(Hints.getStrategy()) + << " grainsize = " << Hints.getGrainsize() + << "\n"); - using namespace ore; + using namespace ore; - // Get the loop preheader. LoopSimplify should guarantee that the loop - // preheader is not terminated by a sync. 
- BasicBlock *Preheader = L->getLoopPreheader(); - if (!Preheader) { - DEBUG(dbgs() << "LS: Loop lacks a preheader.\n"); - ORE.emit(OptimizationRemarkMissed(LS_NAME, "NoPreheader", - L->getStartLoc(), L->getHeader()) - << "loop lacks a preheader"); - emitMissedWarning(F, L, Hints, &ORE); - return false; - } else if (!isa(Preheader->getTerminator())) { - DEBUG(dbgs() << "LS: Loop preheader is not terminated by a branch.\n"); - ORE.emit(OptimizationRemarkMissed(LS_NAME, "ComplexPreheader", - L->getStartLoc(), L->getHeader()) - << "loop preheader not terminated by a branch"); - emitMissedWarning(F, L, Hints, &ORE); - return false; - } + if (LoopSpawningHints::ST_SEQ != Hints.getStrategy()) { + DEBUG(dbgs() << "LS: Marked loop is not a valid Tapir loop.\n" + << "\tLoop hints:" + << " strategy = " << Hints.printStrategy(Hints.getStrategy()) + << "\n"); + ORE.emit(OptimizationRemarkMissed(LS_NAME, "NotTapir", + L->getStartLoc(), L->getHeader()) + << "marked loop is not a valid Tapir loop"); + } - switch(Hints.getStrategy()) { - case LoopSpawningHints::ST_SEQ: - DEBUG(dbgs() << "LS: Hints dictate sequential spawning.\n"); - break; - default: - DEBUG({ - llvm::LoopBlocksDFS DFS(L); - DFS.perform(&LI); - dbgs() << "Blocks in loop (from DFS):\n"; - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) - dbgs() << *BB; - }); - - return tapirTarget->processLoop(Hints, LI, SE, DT, AC, ORE); - case LoopSpawningHints::ST_END: - dbgs() << "LS: Hints specify unknown spawning strategy.\n"; - break; + for (Loop *InnerL : *L) + addTapirLoop(InnerL, V, ORE); } - return false; -} -PreservedAnalyses LoopSpawningPass::run(Function &F, - FunctionAnalysisManager &AM) { - // Determine if function detaches. - bool DetachingFunction = false; - for (BasicBlock &BB : F) - if (isa(BB.getTerminator())) - DetachingFunction = true; + // Top-level routine to process a given loop. 
+ bool processLoop(Loop *L, LoopInfo &LI, ScalarEvolution &SE, + DominatorTree &DT, AssumptionCache &AC, OptimizationRemarkEmitter &ORE) { - if (!DetachingFunction) - return PreservedAnalyses::all(); + // Function containing loop + Function *F = L->getHeader()->getParent(); - auto &LI = AM.getResult(F); - auto &SE = AM.getResult(F); - auto &DT = AM.getResult(F); - auto &AC = AM.getResult(F); - auto &ORE = - AM.getResult(F); + DEBUG(dbgs() << "\nLS: Checking a Tapir loop in \"" + << L->getHeader()->getParent()->getName() << "\" from " + << getDebugLocString(L) << ": " << *L << "\n"); - bool Changed = LoopSpawningImpl(F, LI, SE, DT, AC, ORE, tapirTarget).run(); + LoopSpawningHints Hints(L); - AM.invalidate(F); + DEBUG(dbgs() << "LS: Loop hints:" + << " strategy = " << Hints.printStrategy(Hints.getStrategy()) + << " grainsize = " << Hints.getGrainsize() + << "\n"); - if (Changed) - return PreservedAnalyses::none(); - return PreservedAnalyses::all(); -} + using namespace ore; -namespace { -struct LoopSpawning : public FunctionPass { - /// Pass identification, replacement for typeid - static char ID; - TapirTarget* tapirTarget; - explicit LoopSpawning(TapirTarget* tapirTarget = nullptr) - : FunctionPass(ID), tapirTarget(tapirTarget) { - if (!this->tapirTarget) - this->tapirTarget = getTapirTargetFromType(ClTapirTarget); + // Get the loop preheader. LoopSimplify should guarantee that the loop + // preheader is not terminated by a sync. 
+ BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) { + DEBUG(dbgs() << "LS: Loop lacks a preheader.\n"); + ORE.emit(OptimizationRemarkMissed(LS_NAME, "NoPreheader", + L->getStartLoc(), L->getHeader()) + << "loop lacks a preheader"); + emitMissedWarning(F, L, Hints, &ORE); + return false; + } else if (!isa(Preheader->getTerminator())) { + DEBUG(dbgs() << "LS: Loop preheader is not terminated by a branch.\n"); + ORE.emit(OptimizationRemarkMissed(LS_NAME, "ComplexPreheader", + L->getStartLoc(), L->getHeader()) + << "loop preheader not terminated by a branch"); + emitMissedWarning(F, L, Hints, &ORE); + return false; + } - assert(this->tapirTarget); - initializeLoopSpawningPass(*PassRegistry::getPassRegistry()); + switch(Hints.getStrategy()) { + case LoopSpawningHints::ST_SEQ: + DEBUG(dbgs() << "LS: Hints dictate sequential spawning.\n"); + break; + default: + return tapirTarget->processLoop(Hints, LI, SE, DT, AC, ORE); + case LoopSpawningHints::ST_END: + dbgs() << "LS: Hints specify unknown spawning strategy.\n"; + break; + } + return false; } bool runOnFunction(Function &F) override { @@ -778,15 +690,36 @@ struct LoopSpawning : public FunctionPass { if (!DetachingFunction) return false; - auto &LI = getAnalysis().getLoopInfo(); - auto &SE = getAnalysis().getSE(); - auto &DT = getAnalysis().getDomTree(); - auto &AC = getAnalysis().getAssumptionCache(F); - auto &ORE = - getAnalysis().getORE(); - // OptimizationRemarkEmitter ORE(F); + auto &LI = getAnalysis().getLoopInfo(); + auto &SE = getAnalysis().getSE(); + auto &DT = getAnalysis().getDomTree(); + auto &AC = getAnalysis().getAssumptionCache(F); + auto &ORE = getAnalysis().getORE(); + + + // Build up a worklist of inner-loops to vectorize. This is necessary as + // the act of vectorizing or partially unrolling a loop creates new loops + // and can invalidate iterators across the loops. 
+ SmallVector Worklist; + + // Examine all top-level loops in this function, and call addTapirLoop to push + // those loops onto the work list. + for (Loop *L : LI) + addTapirLoop(L, Worklist, ORE); + + LoopsAnalyzed += Worklist.size(); + + // Now walk the identified inner loops. + bool Changed = false; + while (!Worklist.empty()) + // Process the work list of loops backwards. For each tree of loops in this + // function, addTapirLoop pushed those loops onto the work list according to + // a pre-order tree traversal. Therefore, processing the work list + // backwards leads us to process innermost loops first. + Changed |= processLoop(Worklist.pop_back_val(), LI, SE, DT, AC, ORE); - return LoopSpawningImpl(F, LI, SE, DT, AC, ORE, tapirTarget).run(); + // Process each loop nest in the function. + return Changed; } void getAnalysisUsage(AnalysisUsage &AU) const override { diff --git a/lib/Transforms/Tapir/TapirToTarget.cpp b/lib/Transforms/Tapir/TapirToTarget.cpp index 5a9f6ddb766..f96f3352e22 100644 --- a/lib/Transforms/Tapir/TapirToTarget.cpp +++ b/lib/Transforms/Tapir/TapirToTarget.cpp @@ -30,7 +30,9 @@ static cl::opt ClTapirTarget( clEnumValN(TapirTargetType::Serial, "serial", "Serial code"), clEnumValN(TapirTargetType::Cilk, - "cilk", "Cilk Plus"), + "cilk", "Cilk Plus (with new loop backend)"), + clEnumValN(TapirTargetType::CilkLegacy, + "cilklegacy", "Cilk Plus (with ABI loop backend)"), clEnumValN(TapirTargetType::Qthreads, "qthreads", "Qthreads"), clEnumValN(TapirTargetType::OpenMP, diff --git a/lib/Transforms/Tapir/TapirUtils.cpp b/lib/Transforms/Tapir/TapirUtils.cpp index 6f8e5e4e2b7..289645bb0eb 100644 --- a/lib/Transforms/Tapir/TapirUtils.cpp +++ b/lib/Transforms/Tapir/TapirUtils.cpp @@ -31,7 +31,9 @@ using namespace llvm; TapirTarget *llvm::getTapirTargetFromType(TapirTargetType Type) { switch(Type) { case TapirTargetType::Cilk: - return new CilkABI(); + return new CilkABI(/*useRuntimeForLoop=*/false); + case TapirTargetType::CilkLegacy: + return new 
CilkABI(/*useRuntimeForLoop=*/true); case TapirTargetType::OpenMP: return new OpenMPABI(); case TapirTargetType::PTX: @@ -39,8 +41,10 @@ TapirTarget *llvm::getTapirTargetFromType(TapirTargetType Type) { case TapirTargetType::Qthreads: return new QthreadsABI(); case TapirTargetType::None: + return nullptr; case TapirTargetType::Serial: default: + assert(0 && "Tapir target not implemented"); return nullptr; } } diff --git a/test/Transforms/Tapir/looplimit.ll b/test/Transforms/Tapir/dac-looplimit.ll similarity index 100% rename from test/Transforms/Tapir/looplimit.ll rename to test/Transforms/Tapir/dac-looplimit.ll diff --git a/test/Transforms/Tapir/loopspawning-eh.ll b/test/Transforms/Tapir/dac-loopspawning-eh.ll similarity index 100% rename from test/Transforms/Tapir/loopspawning-eh.ll rename to test/Transforms/Tapir/dac-loopspawning-eh.ll diff --git a/test/Transforms/Tapir/sret-param.ll b/test/Transforms/Tapir/dac-sret-param.ll similarity index 100% rename from test/Transforms/Tapir/sret-param.ll rename to test/Transforms/Tapir/dac-sret-param.ll diff --git a/test/Transforms/Tapir/oldcilk-looplimit.ll b/test/Transforms/Tapir/oldcilk-looplimit.ll new file mode 100644 index 00000000000..4d6e00ef0f8 --- /dev/null +++ b/test/Transforms/Tapir/oldcilk-looplimit.ll @@ -0,0 +1,96 @@ +; Test that Tapir's loop spawning pass correctly transforms a loop +; that reads its original end iteration count. 
+ +; RUN: opt < %s -loop-spawning -S -ls-tapir-target=cilklegacy | FileCheck %s + +source_filename = "looplimittest.c" + +@.str = private unnamed_addr constant [13 x i8] c"Limit is %d\0A\00", align 1 +@str = private unnamed_addr constant [9 x i8] c"Starting\00" +@str.3 = private unnamed_addr constant [9 x i8] c"Finished\00" + +; Function Attrs: noinline nounwind uwtable +define void @foo(i32 %limit) local_unnamed_addr #0 { +entry: + %syncreg = tail call token @llvm.syncregion.start() + %cmp9 = icmp slt i32 %limit, 0 + br i1 %cmp9, label %pfor.cond.cleanup, label %pfor.detach + +; CHECK: pfor.detach.preheader: +; CHECK: call fastcc void @[[OUTLINED:[a-zA-Z0-9._]+]]( +; CHECK: [[TYPE:i[0-9]+]] 0 +; CHECK: [[TYPE]] [[LOOPLIMIT:%[a-zA-Z0-9._]+]] +; CHECK: [[TYPE]] {{[%]?[a-zA-Z0-9._]+}} +; CHECK: i32 %limit + +pfor.cond.cleanup: ; preds = %pfor.inc, %entry + sync within %syncreg, label %pfor.end.continue + +pfor.end.continue: ; preds = %pfor.cond.cleanup + ret void + +; CHECK: define internal fastcc void @[[OUTLINED]]( +; CHECK: [[TYPE]] [[START:%[a-zA-Z0-9._]+]] +; CHECK: [[TYPE]] [[END:%[a-zA-Z0-9._]+]] +; CHECK: [[TYPE]] [[GRAIN:%[a-zA-Z0-9._]+]] +; CHECK: i32 [[LIMITARG:%[a-zA-Z0-9._]+]] + +; CHECK: [[NEWSYNCREG:%[a-zA-Z0-9._]+]] = tail call token @llvm.syncregion.start( + +; CHECK: {{^(;