diff --git a/include/llvm/Transforms/Tapir/PTXABI.h b/include/llvm/Transforms/Tapir/PTXABI.h new file mode 100644 index 00000000000..10698543896 --- /dev/null +++ b/include/llvm/Transforms/Tapir/PTXABI.h @@ -0,0 +1,128 @@ +/** + *************************************************************************** + * Copyright (c) 2017, Los Alamos National Security, LLC. + * All rights reserved. + * + * Copyright 2010. Los Alamos National Security, LLC. This software was + * produced under U.S. Government contract DE-AC52-06NA25396 for Los + * Alamos National Laboratory (LANL), which is operated by Los Alamos + * National Security, LLC for the U.S. Department of Energy. The + * U.S. Government has rights to use, reproduce, and distribute this + * software. NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, + * LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY + * FOR THE USE OF THIS SOFTWARE. If software is modified to produce + * derivative works, such modified software should be clearly marked, + * so as not to confuse it with the version available from LANL. + * + * Additionally, redistribution and use in source and binary forms, + * with or without modification, are permitted provided that the + * following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of Los Alamos National Security, LLC, Los + * Alamos National Laboratory, LANL, the U.S. Government, nor the + * names of its contributors may be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ ***************************************************************************/
+
+#ifndef PTX_ABI_H_
+#define PTX_ABI_H_
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/TypeBuilder.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Transforms/Tapir/LoopSpawning.h"
+#include "llvm/Transforms/Tapir/TapirUtils.h"
+#include
+
+namespace llvm {
+
+/// PTXABILoopSpawning lowers Tapir loops to PTX (GPU) kernels launched
+/// through the Kitsune GPU runtime.
+class PTXABILoopSpawning : public LoopOutline {
+public:
+  PTXABILoopSpawning(Loop *OrigLoop, ScalarEvolution &SE,
+                     LoopInfo *LI, DominatorTree *DT,
+                     AssumptionCache *AC,
+                     OptimizationRemarkEmitter &ORE)
+    : LoopOutline(OrigLoop, SE, LI, DT, AC, ORE)
+  {}
+
+  bool processLoop();
+
+  virtual ~PTXABILoopSpawning() {}
+
+protected:
+
+// private:
+//   /// Report an analysis message to assist the user in diagnosing loops that are
+//   /// not transformed. These are handled as LoopAccessReport rather than
+//   /// VectorizationReport because the << operator of LoopSpawningReport returns
+//   /// LoopAccessReport.
+//   void emitAnalysis(const LoopAccessReport &Message) const {
+//     emitAnalysisDiag(OrigLoop, *ORE, Message);
+//   }
+private:
+  uint32_t nextKernelId_ = 0;
+};
+
+class PTXABI : public TapirTarget {
+public:
+  PTXABI();
+  Value *GetOrCreateWorker8(Function &F) override final;
+  void createSync(SyncInst &inst, ValueToValueMapTy &DetachCtxToStackFrame)
+    override final;
+
+  Function *createDetach(DetachInst &Detach,
+                         ValueToValueMapTy &DetachCtxToStackFrame,
+                         DominatorTree &DT, AssumptionCache &AC) override final;
+  void preProcessFunction(Function &F) override final;
+  void postProcessFunction(Function &F) override final;
+  void postProcessHelper(Function &F) override final;
+  bool processMain(Function &F) override final;
+
+};
+
+} // end of llvm namespace
+
+#endif
diff --git a/include/llvm/Transforms/Tapir/TapirTypes.h b/include/llvm/Transforms/Tapir/TapirTypes.h
index f29b8792a5d..6b553104b5b 100644
--- a/include/llvm/Transforms/Tapir/TapirTypes.h
+++ b/include/llvm/Transforms/Tapir/TapirTypes.h
@@ -23,7 +23,10 @@ enum class TapirTargetType {
   Cilk = 2,
   OpenMP = 3,
   CilkR = 4,
-  Qthreads = 5
+  Qthreads = 5,
+  // +===== Kitsune
+  PTX = 6
+  // ==============
 };
 
 } // end namespace llvm
diff --git a/include/llvm/Transforms/Tapir/TapirUtils.h b/include/llvm/Transforms/Tapir/TapirUtils.h
index 65e7f0fe360..f1a6a327804 100644
--- a/include/llvm/Transforms/Tapir/TapirUtils.h
+++ b/include/llvm/Transforms/Tapir/TapirUtils.h
@@ -49,6 +49,7 @@ Function *extractDetachBodyToFunction(DetachInst &Detach,
 
 class TapirTarget {
 public:
+  virtual ~TapirTarget() {};
   //! For use in loopspawning grainsize calculation
   virtual Value *GetOrCreateWorker8(Function &F) = 0;
   virtual void createSync(SyncInst &inst,
diff --git a/include/llvm/Transforms/Utils/TapirUtils.h b/include/llvm/Transforms/Utils/TapirUtils.h
index 4617c738df1..4c2fb19b00a 100644
--- a/include/llvm/Transforms/Utils/TapirUtils.h
+++ b/include/llvm/Transforms/Utils/TapirUtils.h
@@ -60,6 +60,7 @@ class LoopSpawningHints {
   enum SpawningStrategy {
     ST_SEQ,
     ST_DAC,
+    ST_GPU,
     ST_END,
   };
 
@@ -93,7 +94,8 @@ class LoopSpawningHints {
       return "Spawn iterations sequentially";
     case LoopSpawningHints::ST_DAC:
       return "Use divide-and-conquer";
-    case LoopSpawningHints::ST_END:
+    case LoopSpawningHints::ST_GPU:
+      return "Use GPU";
     default:
       return "Unknown";
     }
@@ -142,8 +144,8 @@ class LoopSpawningHints {
 /// 4) The loop only branches to the exit block from the header or the latch.
 bool isCanonicalTapirLoop(const Loop *L, bool print = false);
 
-//! Identify if a loop could be a DAC loop
-bool isDACFor(Loop* L);
+//! Identify whether a loop should be handled directly by a parallel-loop backend
+bool isBackendParallelFor(Loop* L);
 
 /// canDetach - Return true if the given function can perform a detach, false
 /// otherwise.
diff --git a/lib/Transforms/Tapir/CMakeLists.txt b/lib/Transforms/Tapir/CMakeLists.txt
index 43f0dbe3a2d..2f32875937b 100644
--- a/lib/Transforms/Tapir/CMakeLists.txt
+++ b/lib/Transforms/Tapir/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_llvm_library(LLVMTapirOpts
   CilkABI.cpp
   OpenMPABI.cpp
+  PTXABI.cpp
   QthreadsABI.cpp
   SmallBlock.cpp
   RedundantSpawn.cpp
diff --git a/lib/Transforms/Tapir/LoopSpawning.cpp b/lib/Transforms/Tapir/LoopSpawning.cpp
index f2f88d9d438..712bf853324 100644
--- a/lib/Transforms/Tapir/LoopSpawning.cpp
+++ b/lib/Transforms/Tapir/LoopSpawning.cpp
@@ -53,6 +53,12 @@
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <utility>
 
+// +===== Kitsune
+
+#include "llvm/Transforms/Tapir/PTXABI.h"
+
+// ==============
+
 using std::make_pair;
 
 using namespace llvm;
@@ -75,7 +81,9 @@ static cl::opt<TapirTargetType> ClTapirTarget(
                clEnumValN(TapirTargetType::OpenMP,
                           "openmp", "OpenMP"),
                clEnumValN(TapirTargetType::Qthreads,
-                          "qthreads", "Qthreads")));
+                          "qthreads", "Qthreads"),
+               clEnumValN(TapirTargetType::PTX,
+                          "ptx", "PTX")));
 
 namespace {
 // /// \brief This modifies LoopAccessReport to initialize message with
@@ -115,6 +123,13 @@ static void emitMissedWarning(Function *F, Loop *L,
                << "Tapir loop not transformed: "
                << "failed to use divide-and-conquer loop spawning");
     break;
+  case LoopSpawningHints::ST_GPU:
+    ORE->emit(DiagnosticInfoOptimizationFailure(
+                  DEBUG_TYPE, "FailedRequestedSpawning",
+                  L->getStartLoc(), L->getHeader())
+              << "Tapir loop not transformed: "
+              << "failed to use GPU loop spawning");
+    break;
   case LoopSpawningHints::ST_SEQ:
     ORE->emit(DiagnosticInfoOptimizationFailure(
                   DEBUG_TYPE, "SpawningDisabled",
@@ -1417,6 +1432,35 @@ bool LoopSpawningImpl::processLoop(Loop *L) {
   case LoopSpawningHints::ST_SEQ:
     DEBUG(dbgs() << "LS: Hints dictate sequential spawning.\n");
     break;
+  case LoopSpawningHints::ST_GPU:
+    DEBUG(dbgs() << "LS: Hints dictate GPU spawning.\n");
+    {
+      DebugLoc DLoc = L->getStartLoc();
+      BasicBlock *Header = L->getHeader();
+      PTXABILoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE);
+      // CilkABILoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE);
+      // DACLoopSpawning DLS(L, SE, LI, DT, TLI, TTI, ORE);
+      if (DLS.processLoop()) {
+        DEBUG({
+          if (verifyFunction(*L->getHeader()->getParent())) {
+            dbgs() << "Transformed function is invalid.\n";
+            return false;
+          }
+        });
+        // Report success.
+ ORE.emit(OptimizationRemark(LS_NAME, "DACSpawning", DLoc, Header) + << "spawning iterations using divide-and-conquer"); + return true; + } else { + // Report failure. + ORE.emit(OptimizationRemarkMissed(LS_NAME, "NoDACSpawning", DLoc, + Header) + << "cannot spawn iterations using divide-and-conquer"); + emitMissedWarning(F, L, Hints, &ORE); + return false; + } + } + break; case LoopSpawningHints::ST_DAC: DEBUG(dbgs() << "LS: Hints dictate DAC spawning.\n"); { diff --git a/lib/Transforms/Tapir/OpenMPABI.cpp b/lib/Transforms/Tapir/OpenMPABI.cpp index 731a0c0cd93..5ddcf9eb174 100644 --- a/lib/Transforms/Tapir/OpenMPABI.cpp +++ b/lib/Transforms/Tapir/OpenMPABI.cpp @@ -488,7 +488,7 @@ Function* formatFunctionToTask(Function* extracted, CallInst* cal) { IRBuilder<> CallerIRBuilder(cal); auto *SharedsTySize = CallerIRBuilder.getInt64(DL.getTypeAllocSize(SharedsTy)); - auto *KmpTaskTTy = createKmpTaskTTy(C); + //unused -- auto *KmpTaskTTy = createKmpTaskTTy(C); auto *KmpTaskTWithPrivatesTy = createKmpTaskTWithPrivatesTy(SharedsTy);//KmpTaskTTy); auto *KmpTaskTWithPrivatesPtrTy = PointerType::getUnqual(KmpTaskTWithPrivatesTy); @@ -496,11 +496,11 @@ Function* formatFunctionToTask(Function* extracted, CallInst* cal) { CallerIRBuilder.getInt64(DL.getTypeAllocSize(KmpTaskTWithPrivatesTy)); auto *VoidTy = Type::getVoidTy(C); - auto *Int8PtrTy = Type::getInt8PtrTy(C); + // unused -- auto *Int8PtrTy = Type::getInt8PtrTy(C); auto *Int32Ty = Type::getInt32Ty(C); - auto *CopyFnTy = FunctionType::get(VoidTy, {Int8PtrTy}, true); - auto *CopyFnPtrTy = PointerType::getUnqual(CopyFnTy); + // unused -- auto *CopyFnTy = FunctionType::get(VoidTy, {Int8PtrTy}, true); + // unused -- auto *CopyFnPtrTy = PointerType::getUnqual(CopyFnTy); auto *OutlinedFnTy = FunctionType::get( VoidTy, @@ -593,12 +593,12 @@ Function *llvm::OpenMPABI::createDetach(DetachInst &detach, ValueToValueMapTy &DetachCtxToStackFrame, DominatorTree &DT, AssumptionCache &AC) { BasicBlock *detB = detach.getParent(); - Function &F = *(detB->getParent()); + // unused -- Function &F = *(detB->getParent()); BasicBlock *Spawned = detach.getDetached(); BasicBlock *Continue = detach.getContinue(); - Module *M = F.getParent(); + // unused -- Module *M = F.getParent(); CallInst *cal = nullptr; Function *extracted = extractDetachBodyToFunction(detach, DT, AC, &cal); @@ -676,7 +676,7 @@ void llvm::OpenMPABI::postProcessFunction(Function &F) { } } - for(int i=1; ieraseFromParent(); RegionFn->eraseFromParent(); } diff --git a/lib/Transforms/Tapir/PTXABI.cpp b/lib/Transforms/Tapir/PTXABI.cpp new file mode 100644 index 00000000000..249290ee0d5 --- /dev/null +++ b/lib/Transforms/Tapir/PTXABI.cpp @@ -0,0 +1,725 @@ +/** + *************************************************************************** + * Copyright (c) 2017, Los Alamos National Security, LLC. + * All rights reserved. + * + * Copyright 2010. Los Alamos National Security, LLC. This software was + * produced under U.S. Government contract DE-AC52-06NA25396 for Los + * Alamos National Laboratory (LANL), which is operated by Los Alamos + * National Security, LLC for the U.S. Department of Energy. The + * U.S. Government has rights to use, reproduce, and distribute this + * software. NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, + * LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY + * FOR THE USE OF THIS SOFTWARE. 
If software is modified to produce + * derivative works, such modified software should be clearly marked, + * so as not to confuse it with the version available from LANL. + * + * Additionally, redistribution and use in source and binary forms, + * with or without modification, are permitted provided that the + * following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of Los Alamos National Security, LLC, Los + * Alamos National Laboratory, LANL, the U.S. Government, nor the + * names of its contributors may be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + ***************************************************************************/ + +#include "llvm/Transforms/Tapir/PTXABI.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/TapirUtils.h" +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Vectorize.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/IR/LegacyPassManager.h" + +#include +#include +#include + +#define np(X) \ + std::cout << __FILE__ << ":" << __LINE__ << ": " << __PRETTY_FUNCTION__ \ + << ": " << #X << " = " << (X) << std::endl + +#include +#include +#include + +using namespace llvm; + +namespace{ + + template + Function* getFunction(Module& M, const char* name){ + return cast(M.getOrInsertFunction(name, + TypeBuilder::get(M.getContext()))); + } + + template + Value* convertInteger(B& b, Value* from, Value* to, const std::string& name){ + auto ft = dyn_cast(from->getType()); + assert(ft && "expected from type as integer type"); + + auto tt = dyn_cast(to->getType()); + assert(tt && "expected to type as integer type"); + + if(ft->getBitWidth() > tt->getBitWidth()){ + return b.CreateTrunc(from, tt, name); + } + else if(ft->getBitWidth() < tt->getBitWidth()){ + return b.CreateZExt(from, tt, name); + } + + return from; + } + +} // namespace + + +//############################################################################## + +PTXABI::PTXABI() {} + +/// \brief Get/Create the worker count for the spawning function. +Value *PTXABI::GetOrCreateWorker8(Function &F) { + Module *M = F.getParent(); + LLVMContext& C = M->getContext(); + return ConstantInt::get(C, APInt(16, 8)); +} + +void PTXABI::createSync(SyncInst &SI, ValueToValueMapTy &DetachCtxToStackFrame) { +} + +Function *PTXABI::createDetach(DetachInst &detach, + ValueToValueMapTy &DetachCtxToStackFrame, + DominatorTree &DT, AssumptionCache &AC) { + BasicBlock *detB = detach.getParent(); + // unused -- Function &F = *(detB->getParent()); + + BasicBlock *Spawned = detach.getDetached(); + BasicBlock *Continue = detach.getContinue(); + + // unused -- Module *M = F.getParent(); + + CallInst *cal = nullptr; + Function *extracted = extractDetachBodyToFunction(detach, DT, AC, &cal); + //extracted = formatFunctionToTask(extracted, cal); + + // Replace the detach with a branch to the continuation. + BranchInst *ContinueBr = BranchInst::Create(Continue); + ReplaceInstWithInst(&detach, ContinueBr); + + // Rewrite phis in the detached block. 
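+  // (The detach terminator in `detB` was just replaced by an unconditional
+  // branch to `Continue`, so `detB` is no longer a predecessor of the detached
+  // block `Spawned`; each PHI node there must therefore drop its incoming
+  // value for `detB` to keep the IR consistent with the new CFG.)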
+ { + BasicBlock::iterator BI = Spawned->begin(); + while (PHINode *P = dyn_cast(BI)) { + P->removeIncomingValue(detB); + ++BI; + } + } + return extracted; +} + +void PTXABI::preProcessFunction(Function &F) { +} + +void PTXABI::postProcessFunction(Function &F) { +} + +void PTXABI::postProcessHelper(Function &F) { +} + +bool PTXABI::processMain(Function &F) { + return true; +} + +bool PTXABILoopSpawning::processLoop(){ + Loop *L = OrigLoop; + + // L->dumpVerbose(); + + // code generation is currently limited to a simple canonical loop structure + // whereby we make the following assumptions and check assertions below + // soon we will expand this extraction mechanism to handle more complex + // loops + + using TypeVec = std::vector; + using ValueVec = std::vector; + + LLVMContext& c = L->getHeader()->getContext(); + + IRBuilder<> b(c); + + Type* voidTy = Type::getVoidTy(c); + IntegerType* i8Ty = Type::getInt8Ty(c); + IntegerType* i16Ty = Type::getInt16Ty(c); + IntegerType* i32Ty = Type::getInt32Ty(c); + IntegerType* i64Ty = Type::getInt64Ty(c); + PointerType* voidPtrTy = Type::getInt8PtrTy(c); + + // and LLVM transformation is able in some cases to transform the loop to + // contain a phi node that exists at the entry block + + PHINode* loopNode = L->getCanonicalInductionVariable(); + assert(loopNode && "expected canonical loop"); + + // only handle loops where the induction variable is initialized to a constant + + Value* loopStart = loopNode->getIncomingValue(0); + assert(loopStart && "expected canonical loop start"); + + auto cs = dyn_cast(loopStart); + bool startsAtZero = cs && cs->isZero(); + + BasicBlock* exitBlock = L->getUniqueExitBlock(); + assert(exitBlock && "expected canonical exit block"); + + // and assume that a branch instruction exists here + + BasicBlock* branchBlock = exitBlock->getSinglePredecessor(); + assert(branchBlock && "expected canonical branch block"); + + BranchInst* endBranch = dyn_cast(branchBlock->getTerminator()); + assert(endBranch && "expected canonical end branch instruction"); + + // get the branch condition in order to extract the end loop value + // which we also currently assume is constant + + Value* endBranchCond = endBranch->getCondition(); + CmpInst* cmp = dyn_cast(endBranchCond); + assert(cmp && "expected canonical comparison instruction"); + + Value* loopEnd = cmp->getOperand(1); + assert(loopEnd && "expected canonical loop end"); + + BasicBlock* latchBlock = L->getLoopLatch(); + Instruction* li = latchBlock->getFirstNonPHI(); + unsigned op = li->getOpcode(); + assert(op == Instruction::Add || op == Instruction::Sub && + "expected add or sub in loop latch"); + assert(li->getOperand(0)== loopNode); + Value* stride = li->getOperand(1); + cs = dyn_cast(stride); + bool isUnitStride = cs && cs->isOne(); + + BasicBlock* entryBlock = L->getBlocks()[0]; + + Function* hostFunc = entryBlock->getParent(); + + Module& hostModule = *hostFunc->getParent(); + + // assume a detach exists here and this basic block contains the body + // of the kernel function we will be generating + + DetachInst* detach = dyn_cast(entryBlock->getTerminator()); + assert(detach && "expected canonical loop entry detach"); + + BasicBlock* Body = detach->getDetached(); + + // extract the externally defined variables + // these will be passed in as CUDA arrays + + std::set values; + values.insert(loopNode); + + std::set extValues; + + for(Instruction& ii : *Body){ + if(dyn_cast(&ii)){ + continue; + } + + for(Use& u : ii.operands()){ + Value* v = u.get(); + + if(isa(v)){ + continue; + 
} + + if(values.find(v) == values.end()){ + extValues.insert(v); + } + } + + values.insert(&ii); + } + + TypeVec paramTypes; + paramTypes.push_back(i64Ty); + paramTypes.push_back(i64Ty); + paramTypes.push_back(i64Ty); + + for(Value* v : extValues){ + if(auto pt = dyn_cast(v->getType())){ + if(auto at = dyn_cast(pt->getElementType())){ + paramTypes.push_back(PointerType::get(at->getElementType(), 0)); + } + else{ + paramTypes.push_back(pt); + } + } + else{ + v->dump(); + assert(false && "expected a pointer or array type"); + } + } + + // create the GPU function + + FunctionType* funcTy = FunctionType::get(voidTy, paramTypes, false); + + Module ptxModule("ptxModule", c); + + // each kernel function is assigned a unique ID by which the kernel + // entry point function is named e.g. run0 for kernel ID 0 + + size_t kernelRunId = nextKernelId_++; + + std::stringstream kstr; + kstr << "run" << kernelRunId; + + Function* f = Function::Create(funcTy, + Function::ExternalLinkage, kstr.str().c_str(), &ptxModule); + + // the first parameter defines the extent of the index space + // i.e. number of threads to launch + auto aitr = f->arg_begin(); + aitr->setName("runSize"); + Value* runSizeParam = aitr; + ++aitr; + + aitr->setName("runStart"); + Value* runStartParam = aitr; + ++aitr; + + aitr->setName("runStride"); + Value* runStrideParam = aitr; + ++aitr; + + std::map m; + + // set and parameter names and map values to be replaced + + size_t i = 0; + + for(Value* v : extValues){ + std::stringstream sstr; + sstr << "arg" << i; + + m[v] = aitr; + aitr->setName(sstr.str()); + ++aitr; + ++i; + } + + // create the entry block which will be used to compute the thread ID + // and simply return if the thread ID is beyond the run size + + BasicBlock* br = BasicBlock::Create(c, "entry", f); + + b.SetInsertPoint(br); + + using SREGFunc = uint32_t(); + + // calls to NVPTX intrinsics to get the thread index, block size, + // and grid dimensions + + Value* threadIdx = b.CreateCall(getFunction(ptxModule, + "llvm.nvvm.read.ptx.sreg.tid.x")); + + Value* blockIdx = b.CreateCall(getFunction(ptxModule, + "llvm.nvvm.read.ptx.sreg.ctaid.x")); + + Value* blockDim = b.CreateCall(getFunction(ptxModule, + "llvm.nvvm.read.ptx.sreg.ntid.x")); + + Value* threadId = + b.CreateAdd(threadIdx, b.CreateMul(blockIdx, blockDim), "threadId"); + + // convert the thread ID into the proper integer type of the loop variable + + threadId = convertInteger(b, threadId, loopNode, "threadId"); + + if(!isUnitStride){ + threadId = b.CreateMul(threadId, runStrideParam); + } + + if(!startsAtZero){ + threadId = b.CreateAdd(threadId, runStartParam); + } + + // return block to exit if thread ID is greater than or equal to run size + + BasicBlock* rb = BasicBlock::Create(c, "exit", f); + BasicBlock* bb = BasicBlock::Create(c, "body", f); + + Value* cond = b.CreateICmpUGE(threadId, runSizeParam); + b.CreateCondBr(cond, rb, bb); + + b.SetInsertPoint(rb); + b.CreateRetVoid(); + + b.SetInsertPoint(bb); + + // map the thread ID into the new values as we clone the instructions + // of the function + + m[loopNode] = threadId; + + BasicBlock::InstListType& il = bb->getInstList(); + + // clone instructions of the body basic block, remapping values as needed + + std::set extReads; + std::set extWrites; + std::map extVars; + + for(Instruction& ii : *Body){ + if(dyn_cast(&ii)){ + continue; + } + + // determine if we are reading or writing the external variables + // i.e. 
those passed as CUDA arrays + + Instruction* ic = ii.clone(); + + if(auto li = dyn_cast(&ii)){ + Value* v = li->getPointerOperand(); + auto itr = extVars.find(v); + if(itr != extVars.end()){ + extReads.insert(itr->second); + } + } + else if(auto si = dyn_cast(&ii)){ + Value* v = si->getPointerOperand(); + auto itr = extVars.find(v); + if(itr != extVars.end()){ + extWrites.insert(itr->second); + } + } + // if this is a GEP into one of the external variables then keep track of + // which external variable it originally came from + else if(auto gi = dyn_cast(&ii)){ + Value* v = gi->getPointerOperand(); + if(extValues.find(v) != extValues.end()){ + extVars[gi] = v; + if(isa(gi->getSourceElementType())){ + auto cgi = dyn_cast(ic); + cgi->setSourceElementType(m[v]->getType()); + } + } + } + + // remap values as we are cloning the instructions + + for(auto& itr : m){ + ic->replaceUsesOfWith(itr.first, itr.second); + } + + il.push_back(ic); + m[&ii] = ic; + } + + b.CreateRetVoid(); + + // add the necessary NVPTX to mark the global function + + NamedMDNode* annotations = + ptxModule.getOrInsertNamedMetadata("nvvm.annotations"); + + SmallVector av; + + av.push_back(ValueAsMetadata::get(f)); + av.push_back(MDString::get(ptxModule.getContext(), "kernel")); + av.push_back(ValueAsMetadata::get(llvm::ConstantInt::get(i32Ty, 1))); + + annotations->addOperand(MDNode::get(ptxModule.getContext(), av)); + + // remove the basic blocks corresponding to the original LLVM loop + + BasicBlock* predecessor = L->getLoopPreheader(); + entryBlock->removePredecessor(predecessor); + BasicBlock* successor = exitBlock->getSingleSuccessor(); + + BasicBlock* hostBlock = BasicBlock::Create(c, "host.block", hostFunc); + + b.SetInsertPoint(predecessor->getTerminator()); + b.CreateBr(hostBlock); + predecessor->getTerminator()->removeFromParent(); + + successor->removePredecessor(exitBlock); + + { + std::set visited; + visited.insert(exitBlock); + + std::vector next; + next.push_back(entryBlock); + + while(!next.empty()){ + BasicBlock* b = next.back(); + next.pop_back(); + + for(BasicBlock* bn : b->getTerminator()->successors()){ + if(visited.find(bn) == visited.end()){ + next.push_back(bn); + } + } + + b->dropAllReferences(); + b->removeFromParent(); + visited.insert(b); + } + } + + exitBlock->dropAllReferences(); + exitBlock->removeFromParent(); + + // find the NVPTX module pass which will create the PTX code + + const Target* target = nullptr; + + for(TargetRegistry::iterator itr = TargetRegistry::targets().begin(), + itrEnd = TargetRegistry::targets().end(); itr != itrEnd; ++itr){ + if(std::string(itr->getName()) == "nvptx64"){ + target = &*itr; + break; + } + } + + assert(target && "failed to find NVPTX target"); + + Triple triple(sys::getDefaultTargetTriple()); + triple.setArch(Triple::nvptx64); + + // TODO: the version of LLVM that we are using currently only supports + // up to SM_60 – we need SM_70 for Volta architectures + + TargetMachine* targetMachine = + target->createTargetMachine(triple.getTriple(), + //"sm_35", + //"sm_70", + "sm_60", + "", + TargetOptions(), + Reloc::Static, + CodeModel::Default, + CodeGenOpt::Aggressive); + + DataLayout layout("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:" + "64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:" + "64:64-v128:128:128-n16:32:64"); + + ptxModule.setDataLayout(layout); + + legacy::PassManager* passManager = new legacy::PassManager; + + passManager->add(createVerifierPass()); + + // add in our optimization passes + + 
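+  // (All of the following run only on the extracted kernel module: an
+  // instcombine / reassociate / GVN / CFG-simplification sequence, SLP
+  // vectorization, critical-edge splitting, constant propagation, and
+  // dead-instruction/dead-store elimination, followed by a final instcombine
+  // and CFG simplification before the PTX is emitted.)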
passManager->add(createInstructionCombiningPass()); + passManager->add(createReassociatePass()); + passManager->add(createGVNPass()); + passManager->add(createCFGSimplificationPass()); + passManager->add(createSLPVectorizerPass()); + passManager->add(createBreakCriticalEdgesPass()); + passManager->add(createConstantPropagationPass()); + passManager->add(createDeadInstEliminationPass()); + passManager->add(createDeadStoreEliminationPass()); + passManager->add(createInstructionCombiningPass()); + passManager->add(createCFGSimplificationPass()); + + SmallVector buf; + raw_svector_ostream ostr(buf); + + bool fail = + targetMachine->addPassesToEmitFile(*passManager, + ostr, + TargetMachine::CGFT_AssemblyFile, + false); + + assert(!fail && "failed to emit PTX"); + + passManager->run(ptxModule); + + delete passManager; + + std::string ptx = ostr.str().str(); + + Constant* pcs = ConstantDataArray::getString(c, ptx); + + // create a global string to hold the PTX code + + GlobalVariable* ptxGlobal = + new GlobalVariable(hostModule, + pcs->getType(), + true, + GlobalValue::PrivateLinkage, + pcs, + "ptx"); + + Value* kernelId = ConstantInt::get(i32Ty, kernelRunId); + + Value* ptxStr = b.CreateBitCast(ptxGlobal, voidPtrTy); + + b.SetInsertPoint(hostBlock); + + // finally, replace where the original loop was with calls to the GPU runtime + + using InitCUDAFunc = void(); + + b.CreateCall(getFunction(hostModule, + "__kitsune_cuda_init"), {}); + + using InitKernelFunc = void(uint32_t, const char*); + + b.CreateCall(getFunction(hostModule, + "__kitsune_gpu_init_kernel"), {kernelId, ptxStr}); + + for(Value* v : extValues){ + Value* elementSize; + Value* vptr; + Value* fieldName; + Value* size; + + // TODO: fix + // this is a temporary hack to get the size of the field + // it will currently only work for a limited case + + if(auto bc = dyn_cast(v)){ + auto ci = dyn_cast(bc->getOperand(0)); + assert(ci && "unable to detect field size"); + + Value* bytes = ci->getOperand(0); + assert(bytes->getType()->isIntegerTy(64)); + + auto pt = dyn_cast(v->getType()); + auto it = dyn_cast(pt->getElementType()); + assert(it && "expected integer type"); + + Constant* fn = ConstantDataArray::getString(c, ci->getName()); + + GlobalVariable* fieldNameGlobal = + new GlobalVariable(hostModule, + fn->getType(), + true, + GlobalValue::PrivateLinkage, + fn, + "field.name"); + + fieldName = b.CreateBitCast(fieldNameGlobal, voidPtrTy); + + vptr = b.CreateBitCast(v, voidPtrTy); + + elementSize = ConstantInt::get(i32Ty, it->getBitWidth()/8); + + size = b.CreateUDiv(bytes, ConstantInt::get(i64Ty, it->getBitWidth()/8)); + } + else if(auto ai = dyn_cast(v)){ + Constant* fn = ConstantDataArray::getString(c, ai->getName()); + + GlobalVariable* fieldNameGlobal = + new GlobalVariable(hostModule, + fn->getType(), + true, + GlobalValue::PrivateLinkage, + fn, + "field.name"); + + fieldName = b.CreateBitCast(fieldNameGlobal, voidPtrTy); + + vptr = b.CreateBitCast(v, voidPtrTy); + + auto at = dyn_cast(ai->getAllocatedType()); + assert(at && "expected array type"); + + elementSize = ConstantInt::get(i32Ty, + at->getElementType()->getPrimitiveSizeInBits()/8); + + size = ConstantInt::get(i64Ty, at->getNumElements()); + } + + uint8_t m = 0; + if(extReads.find(v) != extReads.end()){ + m |= 0b01; + } + + if(extWrites.find(v) != extWrites.end()){ + m |= 0b10; + } + + Value* mode = ConstantInt::get(i8Ty, m); + + TypeVec params = {i32Ty, voidPtrTy, voidPtrTy, i32Ty, i64Ty, i8Ty}; + + Function* initFieldFunc = + 
llvm::Function::Create(FunctionType::get(voidTy, params, false), + llvm::Function::ExternalLinkage, + "__kitsune_gpu_init_field", + &hostModule); + + b.CreateCall(initFieldFunc, + {kernelId, fieldName, vptr, elementSize, size, mode}); + } + + using SetRunSizeFunc = void(uint32_t, uint64_t, uint64_t, uint64_t); + + Value* runSize = b.CreateSub(loopEnd, loopStart); + + runSize = convertInteger(b, runSize, threadId, "run.size"); + + Value* runStart = convertInteger(b, loopStart, threadId, "run.start"); + + b.CreateCall(getFunction(hostModule, + "__kitsune_gpu_set_run_size"), {kernelId, runSize, runStart, runStart}); + + using RunKernelFunc = void(uint32_t); + + b.CreateCall(getFunction(hostModule, + "__kitsune_gpu_run_kernel"), {kernelId}); + + using FinishFunc = void(); + + b.CreateCall(getFunction(hostModule, + "__kitsune_gpu_finish"), {}); + + b.CreateBr(successor); + + // hostModule.dump(); + + // ptxModule.dump(); + + return true; +} diff --git a/lib/Transforms/Tapir/TapirToTarget.cpp b/lib/Transforms/Tapir/TapirToTarget.cpp index 17035715568..5a9f6ddb766 100644 --- a/lib/Transforms/Tapir/TapirToTarget.cpp +++ b/lib/Transforms/Tapir/TapirToTarget.cpp @@ -34,7 +34,10 @@ static cl::opt ClTapirTarget( clEnumValN(TapirTargetType::Qthreads, "qthreads", "Qthreads"), clEnumValN(TapirTargetType::OpenMP, - "openmp", "OpenMP"))); + "openmp", "OpenMP"), + clEnumValN(TapirTargetType::PTX, + "ptx", "PTX") + )); namespace { diff --git a/lib/Transforms/Tapir/TapirUtils.cpp b/lib/Transforms/Tapir/TapirUtils.cpp index 6a741532ab2..2583dd8f255 100644 --- a/lib/Transforms/Tapir/TapirUtils.cpp +++ b/lib/Transforms/Tapir/TapirUtils.cpp @@ -14,6 +14,7 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Transforms/Tapir/CilkABI.h" #include "llvm/Transforms/Tapir/OpenMPABI.h" +#include "llvm/Transforms/Tapir/PTXABI.h" #include "llvm/Transforms/Tapir/QthreadsABI.h" #include "llvm/Transforms/Tapir/Outline.h" #include "llvm/Transforms/Utils/EscapeEnumerator.h" @@ -30,6 +31,8 @@ TapirTarget *llvm::getTapirTargetFromType(TapirTargetType Type) { return new CilkABI(); case TapirTargetType::OpenMP: return new OpenMPABI(); + case TapirTargetType::PTX: + return new PTXABI(); case TapirTargetType::Qthreads: return new QthreadsABI(); case TapirTargetType::None: diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index c0f10f85fb1..106f5b14f35 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -372,7 +372,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force, // Are we eliminating the loop control altogether? bool CompletelyUnroll = Count == TripCount; - if (isDACFor(L) && !CompletelyUnroll) return false; + if (isBackendParallelFor(L) && !CompletelyUnroll) return false; SmallVector ExitBlocks; L->getExitBlocks(ExitBlocks); std::vector OriginalLoopBlocks = L->getBlocks(); diff --git a/lib/Transforms/Utils/TapirUtils.cpp b/lib/Transforms/Utils/TapirUtils.cpp index 8791e70cc09..9707290c426 100644 --- a/lib/Transforms/Utils/TapirUtils.cpp +++ b/lib/Transforms/Utils/TapirUtils.cpp @@ -178,7 +178,7 @@ bool llvm::MoveStaticAllocasInBlock( BranchInst *llvm::SerializeDetachedCFG(DetachInst *DI, DominatorTree *DT) { //TODO allow to work without dominatortree or code workaround //assert(DT && "Requires DominatorTree (could remove by fixing later TODO)"); - + // Get the parent of the detach instruction. BasicBlock *Detacher = DI->getParent(); // Get the detached block and continuation of this detach. 
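Taken together, the plumbing above exposes the new backend end to end: TapirTargetType::PTX (value 6) becomes selectable as the "ptx" option value, and the factory in lib/Transforms/Tapir/TapirUtils.cpp hands back a PTXABI instance. A minimal sketch of that flow, assuming getTapirTargetFromType is declared alongside TapirTarget in the Tapir headers (the selectPTXTarget helper below is illustrative only, not part of the patch):

    #include "llvm/Transforms/Tapir/TapirTypes.h"  // TapirTargetType::PTX == 6
    #include "llvm/Transforms/Tapir/TapirUtils.h"  // TapirTarget, getTapirTargetFromType
    using namespace llvm;

    // Choosing the "ptx" value for the ClTapirTarget option selects
    // TapirTargetType::PTX; lowering code then obtains the backend like this:
    static TapirTarget *selectPTXTarget() {
      return getTapirTargetFromType(TapirTargetType::PTX);  // returns new PTXABI()
    }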
@@ -537,11 +537,12 @@ bool llvm::isCanonicalTapirLoop(const Loop *L, bool print) {
   return true;
 }
 
-bool llvm::isDACFor(Loop* L) {
+bool llvm::isBackendParallelFor(Loop* L) {
   // TODO: Use a more precise detection of cilk_for loops.
   for (BasicBlock* BB : L->blocks())
     if (isa<DetachInst>(BB->getTerminator()))
-      return LoopSpawningHints(L).getStrategy() == LoopSpawningHints::ST_DAC;
+      return LoopSpawningHints(L).getStrategy() == LoopSpawningHints::ST_DAC
+        || LoopSpawningHints(L).getStrategy() == LoopSpawningHints::ST_GPU;
   return false;
 }
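As a closing illustration of what PTXABILoopSpawning::processLoop leaves in the host function, here is a minimal C++ sketch of the runtime call sequence it emits in place of the original loop. The extern "C" signatures are inferred from the `using ...Func` aliases and call sites in PTXABI.cpp; the launchKernel0 helper, its field names, and its float arrays are illustrative only, and the real Kitsune runtime declarations may differ.

    #include <cstdint>

    // Inferred Kitsune GPU runtime entry points (see the getFunction<...> calls
    // and the "__kitsune_gpu_init_field" creation in PTXABI.cpp).
    extern "C" {
    void __kitsune_cuda_init();
    void __kitsune_gpu_init_kernel(uint32_t kernelId, const char *ptx);
    void __kitsune_gpu_init_field(uint32_t kernelId, const char *fieldName,
                                  void *hostPtr, uint32_t elementSize,
                                  uint64_t count, uint8_t mode);
    void __kitsune_gpu_set_run_size(uint32_t kernelId, uint64_t runSize,
                                    uint64_t runStart, uint64_t /*runStart again*/);
    void __kitsune_gpu_run_kernel(uint32_t kernelId);
    void __kitsune_gpu_finish();
    }

    // Host-side shape of the code emitted in place of the original Tapir loop
    // (kernel id 0, one field read by the kernel and one field written by it).
    void launchKernel0(const char *ptx, float *a, float *b, uint64_t n) {
      __kitsune_cuda_init();
      __kitsune_gpu_init_kernel(0, ptx);
      // Mode bits as in PTXABI.cpp: 0b01 = read, 0b10 = write.
      __kitsune_gpu_init_field(0, "a", a, sizeof(float), n, 0b01);
      __kitsune_gpu_init_field(0, "b", b, sizeof(float), n, 0b11);
      // The pass currently passes the run start for the final argument as well
      // (see the {kernelId, runSize, runStart, runStart} call above).
      __kitsune_gpu_set_run_size(0, /*runSize=*/n, /*runStart=*/0, 0);
      __kitsune_gpu_run_kernel(0);
      __kitsune_gpu_finish();
      // Inside the generated kernel each thread computes
      //   threadId = tid.x + ctaid.x * ntid.x      (NVVM sreg intrinsics),
      // scales by the stride / adds the start when they are not 1 / 0, and
      // returns early when threadId >= runSize before executing the loop body.
    }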