diff --git a/src/Makefile b/src/Makefile index 9e2bf245d8c43..70c0444f1f2a3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -50,7 +50,9 @@ endif LLVMLINK := ifeq ($(JULIACODEGEN),LLVM) -SRCS += codegen jitlayers disasm debuginfo llvm-simdloop llvm-ptls llvm-muladd llvm-late-gc-lowering llvm-lower-handlers llvm-gc-invariant-verifier llvm-propagate-addrspaces cgmemmgr +SRCS += codegen jitlayers disasm debuginfo llvm-simdloop llvm-ptls llvm-muladd \ + llvm-late-gc-lowering llvm-lower-handlers llvm-gc-invariant-verifier \ + llvm-propagate-addrspaces llvm-alloc-opt cgmemmgr FLAGS += -I$(shell $(LLVM_CONFIG_HOST) --includedir) LLVM_LIBS := all ifeq ($(USE_POLLY),1) diff --git a/src/ccall.cpp b/src/ccall.cpp index e60525b5cf56d..fd47edd1c78a7 100644 --- a/src/ccall.cpp +++ b/src/ccall.cpp @@ -2146,7 +2146,7 @@ jl_cgval_t function_sig_t::emit_a_ccall( size_t rtsz = jl_datatype_size(rt); assert(rtsz > 0); Value *strct = emit_allocobj(ctx, rtsz, runtime_bt); - int boxalign = jl_gc_alignment(rtsz); + int boxalign = jl_datatype_align(rt); #ifndef JL_NDEBUG #if JL_LLVM_VERSION >= 40000 const DataLayout &DL = jl_data_layout; diff --git a/src/cgutils.cpp b/src/cgutils.cpp index 268c82579b3b7..f6676eb77f2d6 100644 --- a/src/cgutils.cpp +++ b/src/cgutils.cpp @@ -2182,25 +2182,12 @@ static Value *emit_allocobj(jl_codectx_t &ctx, size_t static_size, Value *jt) { JL_FEAT_REQUIRE(ctx, dynamic_alloc); JL_FEAT_REQUIRE(ctx, runtime); - - int osize; - int offset = jl_gc_classify_pools(static_size, &osize); Value *ptls_ptr = emit_bitcast(ctx, ctx.ptlsStates, T_pint8); - Value *v; - if (offset < 0) { - Value *args[] = {ptls_ptr, - ConstantInt::get(T_size, static_size + sizeof(void*))}; - v = ctx.builder.CreateCall(prepare_call(jlalloc_big_func), - ArrayRef(args, 2)); - } - else { - Value *pool_offs = ConstantInt::get(T_int32, offset); - Value *args[] = {ptls_ptr, pool_offs, ConstantInt::get(T_int32, osize)}; - v = ctx.builder.CreateCall(prepare_call(jlalloc_pool_func), - ArrayRef(args, 3)); - } - tbaa_decorate(tbaa_tag, ctx.builder.CreateStore(maybe_decay_untracked(jt), emit_typeptr_addr(ctx, v))); - return v; + auto call = ctx.builder.CreateCall(prepare_call(jl_alloc_obj_func), + {ptls_ptr, ConstantInt::get(T_size, static_size), + maybe_decay_untracked(jt)}); + call->setAttributes(jl_alloc_obj_func->getAttributes()); + return call; } // if ptr is NULL this emits a write barrier _back_ diff --git a/src/codegen.cpp b/src/codegen.cpp index a0c701acbfe6f..3d7fed309b83d 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -314,8 +314,7 @@ static Function *jlgenericfunction_func; static Function *jlenter_func; static Function *jlleave_func; static Function *jlegal_func; -static Function *jlalloc_pool_func; -static Function *jlalloc_big_func; +static Function *jl_alloc_obj_func; static Function *jlisa_func; static Function *jlsubtype_func; static Function *jlapplytype_func; @@ -6614,24 +6613,19 @@ static void init_julia_llvm_env(Module *m) "jl_instantiate_type_in_env", m); add_named_global(jlapplytype_func, &jl_instantiate_type_in_env); - std::vector alloc_pool_args(0); - alloc_pool_args.push_back(T_pint8); - alloc_pool_args.push_back(T_int32); - alloc_pool_args.push_back(T_int32); - jlalloc_pool_func = - Function::Create(FunctionType::get(T_prjlvalue, alloc_pool_args, false), - Function::ExternalLinkage, - "jl_gc_pool_alloc", m); - add_named_global(jlalloc_pool_func, &jl_gc_pool_alloc); - - std::vector alloc_big_args(0); - alloc_big_args.push_back(T_pint8); - alloc_big_args.push_back(T_size); - jlalloc_big_func = - Function::Create(FunctionType::get(T_prjlvalue, alloc_big_args, false), - Function::ExternalLinkage, - "jl_gc_big_alloc", m); - add_named_global(jlalloc_big_func, &jl_gc_big_alloc); + std::vector gc_alloc_args(0); + gc_alloc_args.push_back(T_pint8); + gc_alloc_args.push_back(T_size); + gc_alloc_args.push_back(T_prjlvalue); + jl_alloc_obj_func = Function::Create(FunctionType::get(T_prjlvalue, gc_alloc_args, false), + Function::ExternalLinkage, + "julia.gc_alloc_obj"); +#if JL_LLVM_VERSION >= 50000 + jl_alloc_obj_func->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); +#else + jl_alloc_obj_func->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias); +#endif + add_named_global(jl_alloc_obj_func, (void*)NULL, /*dllimport*/false); std::vector dlsym_args(0); dlsym_args.push_back(T_pint8); diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index 7c9ef29df2f9e..5b7397ddc80d0 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -325,11 +325,7 @@ static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_va } int alignment; - if (x.isboxed) { - // julia's gc gives 16-byte aligned addresses - alignment = 16; - } - else if (jt) { + if (jt) { alignment = julia_alignment(p, jt, 0); } else { diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index b816e3a6f2c56..991b4cbc2447b 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -108,6 +108,7 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level) #endif if (opt_level == 0) { PM->add(createCFGSimplificationPass()); // Clean up disgusting code + PM->add(createAllocOptPass(false)); #if JL_LLVM_VERSION < 50000 PM->add(createBarrierNoopPass()); PM->add(createLowerExcHandlersPass()); @@ -147,6 +148,7 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level) // effectiveness of the optimization, but should retain correctness. #if JL_LLVM_VERSION < 50000 PM->add(createLowerExcHandlersPass()); + PM->add(createAllocOptPass(true)); PM->add(createLateLowerGCFramePass()); // Remove dead use of ptls PM->add(createDeadCodeEliminationPass()); @@ -161,6 +163,12 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level) PM->add(createAlwaysInlinerPass()); // Respect always_inline #endif +#if JL_LLVM_VERSION >= 50000 + // Running `memcpyopt` between this and `sroa` seems to give `sroa` a hard time + // merging the `alloca` for the unboxed data and the `alloca` created by the `alloc_opt` + // pass. + PM->add(createAllocOptPass(true)); +#endif PM->add(createInstructionCombiningPass()); // Cleanup for scalarrepl. PM->add(createSROAPass()); // Break up aggregate allocas PM->add(createInstructionCombiningPass()); // Cleanup for scalarrepl. diff --git a/src/jitlayers.h b/src/jitlayers.h index 5703a35efd29d..739e944a36f81 100644 --- a/src/jitlayers.h +++ b/src/jitlayers.h @@ -175,6 +175,7 @@ Pass *createLateLowerGCFramePass(); Pass *createLowerExcHandlersPass(); Pass *createGCInvariantVerifierPass(bool Strong); Pass *createPropagateJuliaAddrspaces(); +Pass *createAllocOptPass(bool); // Whether the Function is an llvm or julia intrinsic. static inline bool isIntrinsicFunction(Function *F) { diff --git a/src/llvm-alloc-opt.cpp b/src/llvm-alloc-opt.cpp new file mode 100644 index 0000000000000..30d5c3d5e8994 --- /dev/null +++ b/src/llvm-alloc-opt.cpp @@ -0,0 +1,729 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#define DEBUG_TYPE "alloc_opt" +#undef DEBUG +#include "llvm-version.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fix_llvm_assert.h" + +#include "codegen_shared.h" +#include "julia.h" +#include "julia_internal.h" + +#include +#include + +using namespace llvm; + +extern std::pair tbaa_make_child(const char *name, MDNode *parent=nullptr, bool isConstant=false); + +namespace { + +static void copyMetadata(Instruction *dest, const Instruction *src) +{ +#if JL_LLVM_VERSION < 40000 + if (!src->hasMetadata()) + return; + SmallVector,4> TheMDs; + src->getAllMetadataOtherThanDebugLoc(TheMDs); + for (const auto &MD : TheMDs) + dest->setMetadata(MD.first, MD.second); + dest->setDebugLoc(src->getDebugLoc()); +#else + dest->copyMetadata(*src); +#endif +} + +static bool isBundleOperand(CallInst *call, unsigned idx) +{ +#if JL_LLVM_VERSION < 40000 + return call->hasOperandBundles() && idx >= call->getBundleOperandsStartIndex() && + idx < call->getBundleOperandsEndIndex(); +#else + return call->isBundleOperand(idx); +#endif +} + +/** + * Promote `julia.gc_alloc_obj` which do not have escaping root to a alloca and + * lower other ones to real GC allocation. + * Uses that are not considered to escape the object (i.e. heap address) includes, + * + * * load + * * `pointer_from_objref` + * * `ccall` gcroot array (`jl_roots` operand bundle) + * * store (as address) + * * addrspacecast, bitcast, getelementptr + * + * The results of these cast instructions will be scanned recursively. + * + * All other uses are considered to escape conservatively. + */ + +struct AllocOpt : public FunctionPass { + static char ID; + AllocOpt(bool opt=true) + : FunctionPass(ID), + optimize(opt) + {} + +private: + bool optimize; + LLVMContext *ctx; + + const DataLayout *DL; + + Function *alloc_obj; + Function *pool_alloc; + Function *big_alloc; + Function *ptr_from_objref; + Function *lifetime_start; + Function *lifetime_end; + + Type *T_int8; + Type *T_int32; + Type *T_int64; + Type *T_size; + Type *T_pint8; + Type *T_prjlvalue; + Type *T_pjlvalue; + Type *T_pjlvalue_der; + Type *T_pprjlvalue; + Type *T_ppjlvalue_der; + + MDNode *tbaa_tag; + + struct CheckInstFrame { + Instruction *parent; + uint64_t offset; + Instruction::use_iterator use_it; + Instruction::use_iterator use_end; + }; + typedef SmallVector CheckInstStack; + struct ReplaceUsesFrame { + Instruction *orig_i; + Instruction *new_i; + ReplaceUsesFrame(Instruction *orig_i, Instruction *new_i) + : orig_i(orig_i), + new_i(new_i) + {} + }; + typedef SmallVector ReplaceUsesStack; + + struct LifetimeMarker { + LifetimeMarker(AllocOpt &pass) + : pass(pass), + first_safepoint{}, + stack{} + {} + // insert llvm.lifetime.* calls for `ptr` with size `sz` + // based on the use of `orig` given in `alloc_uses`. + void insert(Instruction *ptr, Constant *sz, Instruction *orig, + const std::set &alloc_uses); + private: + Instruction *getFirstSafepoint(BasicBlock *bb); + void insertEnd(Instruction *ptr, Constant *sz, Instruction *insert); + struct Frame { + BasicBlock *bb; + pred_iterator p_cur; + pred_iterator p_end; + Frame(BasicBlock *bb) + : bb(bb), + p_cur(pred_begin(bb)), + p_end(pred_end(bb)) + {} + }; + AllocOpt &pass; + std::map first_safepoint; + SmallVector stack; + }; + + bool doInitialization(Module &m) override; + bool runOnFunction(Function &F) override; + bool checkInst(Instruction *I, CheckInstStack &stack, std::set &uses, + bool &ignore_tag); + void replaceUsesWith(Instruction *orig_i, Instruction *new_i, ReplaceUsesStack &stack); + void lowerAlloc(CallInst *I, size_t sz); + bool isSafepoint(Instruction *inst); + void getAnalysisUsage(AnalysisUsage &AU) const override + { + if (optimize) { + FunctionPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addPreserved(); + AU.setPreservesCFG(); + } + } +}; + +Instruction *AllocOpt::LifetimeMarker::getFirstSafepoint(BasicBlock *bb) +{ + auto it = first_safepoint.find(bb); + if (it != first_safepoint.end()) + return it->second; + Instruction *first = nullptr; + for (auto &I: *bb) { + if (pass.isSafepoint(&I)) { + first = &I; + break; + } + } + first_safepoint[bb] = first; + return first; +} + +void AllocOpt::LifetimeMarker::insertEnd(Instruction *ptr, Constant *sz, Instruction *insert) +{ + BasicBlock::iterator it(insert); + BasicBlock::iterator begin(insert->getParent()->begin()); + // Makes sure that the end is inserted before nearby start. + // We insert start before the allocation call, if it is the first safepoint we find for + // another instruction, it's better if we insert the end before the start instead of the + // allocation so that the two allocations do not have overlapping lifetime. + while (it != begin) { + --it; + if (auto II = dyn_cast(&*it)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + insert = II; + continue; + } + } + break; + } + CallInst::Create(pass.lifetime_end, {sz, ptr}, "", insert); +} + +void AllocOpt::LifetimeMarker::insert(Instruction *ptr, Constant *sz, Instruction *orig, + const std::set &alloc_uses) +{ + CallInst::Create(pass.lifetime_start, {sz, ptr}, "", orig); + BasicBlock *def_bb = orig->getParent(); + std::set bbs{def_bb}; + auto &DT = pass.getAnalysis().getDomTree(); + // Collect all BB where the allocation is live + for (auto use: alloc_uses) { + auto bb = use->getParent(); + if (!bbs.insert(bb).second) + continue; + assert(stack.empty()); + Frame cur{bb}; + while (true) { + assert(cur.p_cur != cur.p_end); + auto pred = *cur.p_cur; + ++cur.p_cur; + if (bbs.insert(pred).second) { + if (cur.p_cur != cur.p_end) + stack.push_back(cur); + cur = Frame(pred); + } + if (cur.p_cur == cur.p_end) { + if (stack.empty()) + break; + cur = stack.back(); + stack.pop_back(); + } + } + } +#ifndef JL_NDEBUG + for (auto bb: bbs) { + if (bb == def_bb) + continue; + if (DT.dominates(orig, bb)) + continue; + auto F = bb->getParent(); +#if JL_LLVM_VERSION >= 50000 + F->print(llvm::dbgs(), nullptr, false, true); + orig->print(llvm::dbgs(), true); + jl_safe_printf("Does not dominate BB:\n"); + bb->print(llvm::dbgs(), true); +#else + F->dump(); + orig->dump(); + jl_safe_printf("Does not dominate BB:\n"); + bb->dump(); +#endif + abort(); + } +#endif + // For each BB, find the first instruction(s) where the allocation is possibly dead. + // If all successors are live, then there isn't one. + // If all successors are dead, then it's the first instruction after the last use + // within the BB. + // If some successors are live and others are dead, it's the first instruction in + // the successors that are dead. + std::vector first_dead; + for (auto bb: bbs) { + bool has_use = false; + for (auto succ: successors(bb)) { + // def_bb is the only bb in bbs that's not dominated by orig + if (succ != def_bb && bbs.count(succ)) { + has_use = true; + break; + } + } + if (has_use) { + for (auto succ: successors(bb)) { + if (!bbs.count(succ)) { + first_dead.push_back(&*succ->begin()); + } + } + } + else { + for (auto it = bb->rbegin(), end = bb->rend(); it != end; ++it) { + if (alloc_uses.count(&*it)) { + --it; + first_dead.push_back(&*it); + break; + } + } + } + } + bbs.clear(); + // There can/need only be one lifetime.end for each allocation in each bb, use bbs + // to record that. + // Iterate through the first dead and find the first safepoint following each of them. + while (!first_dead.empty()) { + auto I = first_dead.back(); + first_dead.pop_back(); + auto bb = I->getParent(); + if (!bbs.insert(bb).second) + continue; + if (I == &*bb->begin()) { + // There's no use in or after this bb. If this bb is not dominated by + // the def then it has to be dead on entering this bb. + // Otherwise, there could be use that we don't track + // before hitting the next safepoint. + if (!DT.dominates(orig, bb)) { + insertEnd(ptr, sz, &*bb->getFirstInsertionPt()); + continue; + } + else if (auto insert = getFirstSafepoint(bb)) { + insertEnd(ptr, sz, insert); + } + } + else { + assert(bb == def_bb || DT.dominates(orig, I)); + BasicBlock::iterator it(I); + BasicBlock::iterator end = bb->end(); + bool safepoint_found = false; + for (; it != end; ++it) { + auto insert = &*it; + if (pass.isSafepoint(insert)) { + insertEnd(ptr, sz, insert); + safepoint_found = true; + break; + } + } + if (safepoint_found) { + continue; + } + } + for (auto succ: successors(bb)) { + first_dead.push_back(&*succ->begin()); + } + } +} + +static void addRetNoAlias(Function *F) +{ +#if JL_LLVM_VERSION >= 50000 + F->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); +#else + F->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias); +#endif +} + +bool AllocOpt::doInitialization(Module &M) +{ + ctx = &M.getContext(); + DL = &M.getDataLayout(); + + alloc_obj = M.getFunction("julia.gc_alloc_obj"); + if (!alloc_obj) + return false; + + ptr_from_objref = M.getFunction("julia.pointer_from_objref"); + + T_prjlvalue = alloc_obj->getReturnType(); + T_pjlvalue = PointerType::get(cast(T_prjlvalue)->getElementType(), 0); + T_pjlvalue_der = PointerType::get(cast(T_prjlvalue)->getElementType(), + AddressSpace::Derived); + T_pprjlvalue = PointerType::get(T_prjlvalue, 0); + T_ppjlvalue_der = PointerType::get(T_prjlvalue, AddressSpace::Derived); + + T_int8 = Type::getInt8Ty(*ctx); + T_int32 = Type::getInt32Ty(*ctx); + T_int64 = Type::getInt64Ty(*ctx); + T_size = sizeof(void*) == 8 ? T_int64 : T_int32; + T_pint8 = PointerType::get(T_int8, 0); + + if (!(pool_alloc = M.getFunction("jl_gc_pool_alloc"))) { + std::vector alloc_pool_args(0); + alloc_pool_args.push_back(T_pint8); + alloc_pool_args.push_back(T_int32); + alloc_pool_args.push_back(T_int32); + pool_alloc = Function::Create(FunctionType::get(T_prjlvalue, alloc_pool_args, false), + Function::ExternalLinkage, "jl_gc_pool_alloc", &M); + addRetNoAlias(pool_alloc); + } + if (!(big_alloc = M.getFunction("jl_gc_big_alloc"))) { + std::vector alloc_big_args(0); + alloc_big_args.push_back(T_pint8); + alloc_big_args.push_back(T_size); + big_alloc = Function::Create(FunctionType::get(T_prjlvalue, alloc_big_args, false), + Function::ExternalLinkage, "jl_gc_big_alloc", &M); + addRetNoAlias(big_alloc); + } + +#if JL_LLVM_VERSION >= 50000 + lifetime_start = Intrinsic::getDeclaration(&M, Intrinsic::lifetime_start, { T_pint8 }); + lifetime_end = Intrinsic::getDeclaration(&M, Intrinsic::lifetime_end, { T_pint8 }); +#else + lifetime_start = Intrinsic::getDeclaration(&M, Intrinsic::lifetime_start); + lifetime_end = Intrinsic::getDeclaration(&M, Intrinsic::lifetime_end); +#endif + + MDNode *tbaa_data; + MDNode *tbaa_data_scalar; + std::tie(tbaa_data, tbaa_data_scalar) = tbaa_make_child("jtbaa_data"); + tbaa_tag = tbaa_make_child("jtbaa_tag", tbaa_data_scalar).first; + + return true; +} + +bool AllocOpt::checkInst(Instruction *I, CheckInstStack &stack, std::set &uses, + bool &ignore_tag) +{ + uses.clear(); + if (I->use_empty()) + return true; + CheckInstFrame cur{I, 0, I->use_begin(), I->use_end()}; + stack.clear(); + + // Recursion + auto push_inst = [&] (Instruction *inst) { + if (cur.use_it != cur.use_end) + stack.push_back(cur); + cur.parent = inst; + cur.use_it = inst->use_begin(); + cur.use_end = inst->use_end(); + }; + + auto check_inst = [&] (Instruction *inst, Use *use) { + if (isa(inst)) + return true; + if (auto call = dyn_cast(inst)) { + // TODO: on LLVM 5.0 we may need to handle certain llvm intrinsics + // including `memcpy`, `memset` etc. We might also need to handle + // `memcmp` by coverting to our own intrinsic and lower it after the gc root pass. + if (ptr_from_objref && ptr_from_objref == call->getCalledFunction()) + return true; + auto opno = use->getOperandNo(); + // Uses in `jl_roots` operand bundle are not counted as escaping, everything else do. + if (!isBundleOperand(call, opno)) + return false; + return call->getOperandBundleForOperand(opno).getTagName() == "jl_roots"; + } + if (auto store = dyn_cast(inst)) { + // Only store value count + if (use->getOperandNo() != StoreInst::getPointerOperandIndex()) + return false; + auto storev = store->getValueOperand(); + // There's GC root in this object. + if (auto ptrtype = dyn_cast(storev->getType())) { + if (ptrtype->getAddressSpace() == AddressSpace::Tracked) { + return false; + } + } + return true; + } + if (isa(inst) || isa(inst)) { + push_inst(inst); + return true; + } + if (auto gep = dyn_cast(inst)) { + APInt apoffset(sizeof(void*) * 8, cur.offset, true); + if (ignore_tag && (!gep->accumulateConstantOffset(*DL, apoffset) || + apoffset.isNegative())) + ignore_tag = false; + push_inst(inst); + cur.offset = apoffset.getLimitedValue(); + // Check overflow + if (cur.offset == UINT64_MAX) + ignore_tag = false; + return true; + } + return false; + }; + + while (true) { + assert(cur.use_it != cur.use_end); + auto use = &*cur.use_it; + auto inst = dyn_cast(use->getUser()); + ++cur.use_it; + if (!inst) + return false; + if (!check_inst(inst, use)) + return false; + uses.insert(inst); + if (cur.use_it == cur.use_end) { + if (stack.empty()) + return true; + cur = stack.back(); + stack.pop_back(); + } + } +} + +// This function needs to handle all cases `AllocOpt::checkInst` can handle. +// This function should not erase any safepoint so that the lifetime marker can find and cache +// all the original safepoints. +void AllocOpt::replaceUsesWith(Instruction *orig_inst, Instruction *new_inst, + ReplaceUsesStack &stack) +{ + auto simple_replace = [&] (Instruction *orig_i, Instruction *new_i) { + if (orig_i->user_empty()) { + if (orig_i != orig_inst) + orig_i->eraseFromParent(); + return true; + } + Type *orig_t = orig_i->getType(); + Type *new_t = new_i->getType(); + if (orig_t == new_t) { + orig_i->replaceAllUsesWith(new_i); + if (orig_i != orig_inst) + orig_i->eraseFromParent(); + return true; + } + return false; + }; + if (simple_replace(orig_inst, new_inst)) + return; + assert(stack.empty()); + ReplaceUsesFrame cur{orig_inst, new_inst}; + auto finish_cur = [&] () { + assert(cur.orig_i->user_empty()); + if (cur.orig_i != orig_inst) { + cur.orig_i->eraseFromParent(); + } + }; + auto push_frame = [&] (Instruction *orig_i, Instruction *new_i) { + if (simple_replace(orig_i, new_i)) + return; + stack.push_back(cur); + cur = {orig_i, new_i}; + }; + // Both `orig_i` and `new_i` should be pointer of the same type + // but possibly different address spaces. `new_i` is always in addrspace 0. + auto replace_inst = [&] (Instruction *user) { + Instruction *orig_i = cur.orig_i; + Instruction *new_i = cur.new_i; + if (isa(user) || isa(user)) { + user->replaceUsesOfWith(orig_i, new_i); + } + else if (auto call = dyn_cast(user)) { + if (ptr_from_objref && ptr_from_objref == call->getCalledFunction()) { + call->replaceAllUsesWith(new_i); + call->eraseFromParent(); + return; + } + // remove from operand bundle + Type *new_t = new_i->getType(); + user->replaceUsesOfWith(orig_i, ConstantPointerNull::get(cast(new_t))); + } + else if (isa(user) || isa(user)) { + auto cast_t = PointerType::get(cast(user->getType())->getElementType(), + 0); + auto replace_i = new_i; + Type *new_t = new_i->getType(); + if (cast_t != new_t) { + replace_i = new BitCastInst(replace_i, cast_t, "", user); + replace_i->setDebugLoc(user->getDebugLoc()); + replace_i->takeName(user); + } + push_frame(user, replace_i); + } + else if (auto gep = dyn_cast(user)) { + SmallVector IdxOperands(gep->idx_begin(), gep->idx_end()); + auto new_gep = GetElementPtrInst::Create(gep->getSourceElementType(), + new_i, IdxOperands, + gep->getName(), gep); + new_gep->setIsInBounds(gep->isInBounds()); + new_gep->takeName(gep); + copyMetadata(new_gep, gep); + push_frame(gep, new_gep); + } + else { + abort(); + } + }; + + while (true) { + replace_inst(cast(*cur.orig_i->user_begin())); + while (cur.orig_i->use_empty()) { + finish_cur(); + if (stack.empty()) + return; + cur = stack.back(); + stack.pop_back(); + } + } +} + +void AllocOpt::lowerAlloc(CallInst *I, size_t sz) +{ + int osize; + int offset = jl_gc_classify_pools(sz, &osize); + IRBuilder<> builder(I); + builder.SetCurrentDebugLocation(I->getDebugLoc()); + auto ptls = I->getArgOperand(0); + CallInst *newI; + if (offset < 0) { + newI = builder.CreateCall(big_alloc, {ptls, ConstantInt::get(T_size, sz + sizeof(void*))}); + } + else { + auto pool_offs = ConstantInt::get(T_int32, offset); + auto pool_osize = ConstantInt::get(T_int32, osize); + newI = builder.CreateCall(pool_alloc, {ptls, pool_offs, pool_osize}); + } + newI->setAttributes(I->getAttributes()); + newI->takeName(I); + copyMetadata(newI, I); + auto derived = builder.CreateAddrSpaceCast(newI, T_pjlvalue_der); + auto cast = builder.CreateBitCast(derived, T_ppjlvalue_der); + auto tagaddr = builder.CreateGEP(T_prjlvalue, cast, {ConstantInt::get(T_size, -1)}); + auto store = builder.CreateStore(I->getArgOperand(2), tagaddr); + store->setMetadata(LLVMContext::MD_tbaa, tbaa_tag); + I->replaceAllUsesWith(newI); +} + +bool AllocOpt::isSafepoint(Instruction *inst) +{ + auto call = dyn_cast(inst); + if (!call) + return false; + if (isa(call)) + return false; + if (auto callee = call->getCalledFunction()) { + // Known functions emitted in codegen that are not safepoints + if (callee == ptr_from_objref || callee->getName() == "memcmp") { + return false; + } + } + return true; +} + +bool AllocOpt::runOnFunction(Function &F) +{ + if (!alloc_obj) + return false; + std::map allocs; + for (auto &bb: F) { + for (auto &I: bb) { + auto call = dyn_cast(&I); + if (!call) + continue; + auto callee = call->getCalledFunction(); + if (!callee) + continue; + size_t sz; + if (callee == alloc_obj) { + assert(call->getNumArgOperands() == 3); + sz = (size_t)cast(call->getArgOperand(1))->getZExtValue(); + } + else { + continue; + } + allocs[call] = sz; + } + } + + auto &entry = F.getEntryBlock(); + CheckInstStack check_stack; + ReplaceUsesStack replace_stack; + std::set alloc_uses; + LifetimeMarker lifetime(*this); + for (auto it: allocs) { + bool ignore_tag = true; + auto orig = it.first; + size_t sz = it.second; + if (optimize && sz < IntegerType::MAX_INT_BITS / 8 && + checkInst(orig, check_stack, alloc_uses, ignore_tag)) { + // The allocation does not escape or get used in a phi node so none of the derived + // SSA from it are live when we run the allocation again. + // It is now safe to promote the allocation to an entry block alloca. + size_t align = 1; + // TODO make codegen handling of alignment consistent and pass that as a parameter + // to the allocation function directly. + if (!ignore_tag) { + align = sz <= 8 ? 8 : JL_SMALL_BYTE_ALIGNMENT; + sz += align; + } + else if (sz > 1) { + align = JL_SMALL_BYTE_ALIGNMENT; + while (sz < align) { + align = align / 2; + } + } + // No debug info for prolog instructions + IRBuilder<> prolog_builder(&entry.front()); + AllocaInst *buff; + Instruction *ptr; + if (sz == 0) { + buff = prolog_builder.CreateAlloca(T_int8, 0); + ptr = buff; + } + else { + buff = prolog_builder.CreateAlloca(Type::getIntNTy(*ctx, sz * 8)); + buff->setAlignment(align); + ptr = cast(prolog_builder.CreateBitCast(buff, T_pint8)); + } + lifetime.insert(ptr, ConstantInt::get(T_int64, sz), orig, alloc_uses); + // Someone might be reading the tag, initialize it. + if (!ignore_tag) { + ptr = cast(prolog_builder.CreateConstGEP1_32(T_int8, ptr, align)); + auto casti = prolog_builder.CreateBitCast(ptr, T_pprjlvalue); + auto tagaddr = prolog_builder.CreateGEP(T_prjlvalue, casti, + {ConstantInt::get(T_size, -1)}); + // Store should be created at the callsite and not in the prolog + auto store = new StoreInst(orig->getArgOperand(2), tagaddr, orig); + store->setMetadata(LLVMContext::MD_tbaa, tbaa_tag); + store->setDebugLoc(orig->getDebugLoc()); + } + auto casti = cast(prolog_builder.CreateBitCast(ptr, T_pjlvalue)); + casti->takeName(orig); + replaceUsesWith(orig, cast(casti), replace_stack); + } + else { + lowerAlloc(orig, sz); + } + } + for (auto it: allocs) + it.first->eraseFromParent(); + return true; +} + +char AllocOpt::ID = 0; +static RegisterPass X("AllocOpt", "Promote heap allocation to stack", + false /* Only looks at CFG */, + false /* Analysis Pass */); + +} + +Pass *createAllocOptPass(bool opt) +{ + return new AllocOpt(opt); +} diff --git a/test/codegen.jl b/test/codegen.jl index 980f8d97e9d0f..4310db7879c3d 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -146,6 +146,25 @@ Base.unsafe_convert(::Type{Ptr{BadRef}}, ar::BadRef) = Ptr{BadRef}(pointer_from_ breakpoint_badref(a::MutableStruct) = ccall(:jl_breakpoint, Void, (Ptr{BadRef},), a) +struct PtrStruct + a::Ptr{Void} + b::Int +end + +mutable struct RealStruct + a::Float64 + b::Int +end + +function Base.cconvert(::Type{Ref{PtrStruct}}, a::RealStruct) + (a, Ref(PtrStruct(pointer_from_objref(a), a.b))) +end +Base.unsafe_convert(::Type{Ref{PtrStruct}}, at::Tuple) = + Base.unsafe_convert(Ref{PtrStruct}, at[2]) + +breakpoint_ptrstruct(a::RealStruct) = + ccall(:jl_breakpoint, Void, (Ref{PtrStruct},), a) + if opt_level > 0 @test !contains(get_llvm(isequal, Tuple{Nullable{BigFloat}, Nullable{BigFloat}}), "%gcframe") @test !contains(get_llvm(pointer_not_safepoint, Tuple{}), "%gcframe") @@ -161,4 +180,25 @@ if opt_level > 0 breakpoint_badref_ir = get_llvm(breakpoint_badref, Tuple{MutableStruct}) @test !contains(breakpoint_badref_ir, "%gcframe") @test !contains(breakpoint_badref_ir, "jl_gc_pool_alloc") + + breakpoint_ptrstruct_ir = get_llvm(breakpoint_ptrstruct, Tuple{RealStruct}) + @test !contains(breakpoint_ptrstruct_ir, "%gcframe") + @test !contains(breakpoint_ptrstruct_ir, "jl_gc_pool_alloc") +end + +function two_breakpoint(a::Float64) + ccall(:jl_breakpoint, Void, (Ref{Float64},), a) + ccall(:jl_breakpoint, Void, (Ref{Float64},), a) +end + +if opt_level > 0 + breakpoint_f64_ir = get_llvm((a)->ccall(:jl_breakpoint, Void, (Ref{Float64},), a), + Tuple{Float64}) + @test !contains(breakpoint_f64_ir, "jl_gc_pool_alloc") + breakpoint_any_ir = get_llvm((a)->ccall(:jl_breakpoint, Void, (Ref{Any},), a), + Tuple{Float64}) + @test contains(breakpoint_any_ir, "jl_gc_pool_alloc") + two_breakpoint_ir = get_llvm(two_breakpoint, Tuple{Float64}) + @test !contains(two_breakpoint_ir, "jl_gc_pool_alloc") + @test contains(two_breakpoint_ir, "llvm.lifetime.end") end diff --git a/test/llvmpasses/.gitignore b/test/llvmpasses/.gitignore new file mode 100644 index 0000000000000..0e62bc85774ee --- /dev/null +++ b/test/llvmpasses/.gitignore @@ -0,0 +1 @@ +/Output/ \ No newline at end of file diff --git a/test/llvmpasses/alloc-opt.jl b/test/llvmpasses/alloc-opt.jl new file mode 100644 index 0000000000000..d267abdec8480 --- /dev/null +++ b/test/llvmpasses/alloc-opt.jl @@ -0,0 +1,204 @@ +# RUN: julia --startup-file=no %s | opt -load libjulia.so -AllocOpt -S - | FileCheck %s + +## WIP + +isz = sizeof(UInt) == 8 ? "i64" : "i32" + +println(""" +%jl_value_t = type opaque +@tag = external addrspace(10) global %jl_value_t +""") + +# CHECK-LABEL: @return_obj +# CHECK-NOT: @julia.gc_alloc_obj +# CHECK: %v = call noalias %jl_value_t addrspace(10)* @jl_gc_pool_alloc +# CHECK: store %jl_value_t addrspace(10)* @tag, %jl_value_t addrspace(10)* addrspace(11)* {{.*}}, !tbaa !0 +println(""" +define %jl_value_t addrspace(10)* @return_obj() { + %ptls = call %jl_value_t*** @jl_get_ptls_states() + %ptls_i8 = bitcast %jl_value_t*** %ptls to i8* + %v = call noalias %jl_value_t addrspace(10)* @julia.gc_alloc_obj(i8* %ptls_i8, $isz 8, %jl_value_t addrspace(10)* @tag) + ret %jl_value_t addrspace(10)* %v +} +""") +# CHECK-LABEL: } + +# CHECK-LABEL: @return_load +# CHECK: alloca i64, align 8 +# CHECK-NOT: @julia.gc_alloc_obj +# CHECK-NOT: @jl_gc_pool_alloc +# CHECK: call void @llvm.lifetime.start(i64 8, i8* +# CHECK-NEXT: %v64 = bitcast %jl_value_t* %v to i64* +# CHECK-NOT: @tag +# CHECK-NOT: @llvm.lifetime.end +println(""" +define i64 @return_load(i64 %i) { + %ptls = call %jl_value_t*** @jl_get_ptls_states() + %ptls_i8 = bitcast %jl_value_t*** %ptls to i8* + %v = call noalias %jl_value_t addrspace(10)* @julia.gc_alloc_obj(i8* %ptls_i8, $isz 8, %jl_value_t addrspace(10)* @tag) + %v64 = bitcast %jl_value_t addrspace(10)* %v to i64 addrspace(10)* + %v64a11 = addrspacecast i64 addrspace(10)* %v64 to i64 addrspace(11)* + store i64 %i, i64 addrspace(11)* %v64a11, align 16, !tbaa !4 + call void @external_function() + %l = load i64, i64 addrspace(11)* %v64a11, align 16, !tbaa !4 + ret i64 %l +} +""") +# CHECK-LABEL: } + +# CHECK-LABEL: @return_tag +# CHECK: alloca i128, align 8 +# CHECK: call %jl_value_t*** @jl_get_ptls_states() +# CHECK-NOT: @julia.gc_alloc_obj +# CHECK-NOT: @jl_gc_pool_alloc +# CHECK: call void @llvm.lifetime.start(i64 16, i8* +# CHECK: store %jl_value_t addrspace(10)* @tag, %jl_value_t addrspace(10)** {{.*}}, !tbaa !0 +# CHECK-NOT: @llvm.lifetime.end +println(""" +define %jl_value_t addrspace(10)* @return_tag() { + %ptls = call %jl_value_t*** @jl_get_ptls_states() + %ptls_i8 = bitcast %jl_value_t*** %ptls to i8* + %v = call noalias %jl_value_t addrspace(10)* @julia.gc_alloc_obj(i8* %ptls_i8, $isz 8, %jl_value_t addrspace(10)* @tag) + %va = addrspacecast %jl_value_t addrspace(10)* %v to %jl_value_t addrspace(11)* + %vab = bitcast %jl_value_t addrspace(11)* %va to %jl_value_t addrspace(10)* addrspace(11)* + %tagaddr = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)* addrspace(11)* %vab, i64 -1 + %tag = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)* addrspace(11)* %tagaddr, align 8, !tbaa !0 + ret %jl_value_t addrspace(10)* %tag +} +""") +# CHECK-LABEL: } + +# CHECK-LABEL: @ccall_obj +# CHECK: call %jl_value_t*** @jl_get_ptls_states() +# CHECK-NOT: @julia.gc_alloc_obj +# CHECK: @jl_gc_pool_alloc +# CHECK: store %jl_value_t addrspace(10)* @tag, %jl_value_t addrspace(10)* addrspace(11)* {{.*}}, !tbaa !0 +println(""" +define void @ccall_obj(i8* %fptr) { + %ptls = call %jl_value_t*** @jl_get_ptls_states() + %ptls_i8 = bitcast %jl_value_t*** %ptls to i8* + %v = call noalias %jl_value_t addrspace(10)* @julia.gc_alloc_obj(i8* %ptls_i8, $isz 8, %jl_value_t addrspace(10)* @tag) + %f = bitcast i8* %fptr to void (%jl_value_t addrspace(10)*)* + call void %f(%jl_value_t addrspace(10)* %v) + ret void +} +""") +# CHECK-LABEL: } + +# CHECK-LABEL: @ccall_ptr +# CHECK: alloca i64 +# CHECK: call %jl_value_t*** @jl_get_ptls_states() +# CHECK-NOT: @julia.gc_alloc_obj +# CHECK-NOT: @jl_gc_pool_alloc +# CHECK: call void @llvm.lifetime.start(i64 8, i8* +# CHECK-NEXT: %f = bitcast i8* %fptr to void (%jl_value_t*)* +# CHECK-NEXT: call void %f(%jl_value_t* %v) [ "jl_roots"(%jl_value_t* null), "unknown_bundle"(%jl_value_t* %v) ] +# CHECK-NEXT: ret void +println(""" +define void @ccall_ptr(i8* %fptr) { + %ptls = call %jl_value_t*** @jl_get_ptls_states() + %ptls_i8 = bitcast %jl_value_t*** %ptls to i8* + %v = call noalias %jl_value_t addrspace(10)* @julia.gc_alloc_obj(i8* %ptls_i8, $isz 8, %jl_value_t addrspace(10)* @tag) + %va = addrspacecast %jl_value_t addrspace(10)* %v to %jl_value_t addrspace(11)* + %ptr = call %jl_value_t* @julia.pointer_from_objref(%jl_value_t addrspace(11)* %va) + %f = bitcast i8* %fptr to void (%jl_value_t*)* + call void %f(%jl_value_t* %ptr) [ "jl_roots"(%jl_value_t addrspace(10)* %v), "unknown_bundle"(%jl_value_t* %ptr) ] + ret void +} +""") +# CHECK-LABEL: } + +# CHECK-LABEL: @ccall_unknown_bundle +# CHECK: call %jl_value_t*** @jl_get_ptls_states() +# CHECK-NOT: @julia.gc_alloc_obj +# CHECK: @jl_gc_pool_alloc +# CHECK: store %jl_value_t addrspace(10)* @tag, %jl_value_t addrspace(10)* addrspace(11)* {{.*}}, !tbaa !0 +println(""" +define void @ccall_unknown_bundle(i8* %fptr) { + %ptls = call %jl_value_t*** @jl_get_ptls_states() + %ptls_i8 = bitcast %jl_value_t*** %ptls to i8* + %v = call noalias %jl_value_t addrspace(10)* @julia.gc_alloc_obj(i8* %ptls_i8, $isz 8, %jl_value_t addrspace(10)* @tag) + %va = addrspacecast %jl_value_t addrspace(10)* %v to %jl_value_t addrspace(11)* + %ptr = call %jl_value_t* @julia.pointer_from_objref(%jl_value_t addrspace(11)* %va) + %f = bitcast i8* %fptr to void (%jl_value_t*)* + call void %f(%jl_value_t* %ptr) [ "jl_not_jl_roots"(%jl_value_t addrspace(10)* %v) ] + ret void +} +""") +# CHECK-LABEL: } + +# CHECK-LABEL: @lifetime_branches +# CHECK: alloca i64 +# CHECK: call %jl_value_t*** @jl_get_ptls_states() +# CHECK: L1: +# CHECK-NEXT: call void @llvm.lifetime.start(i64 8, +# CHECK-NEXT: %f = bitcast i8* %fptr to void (%jl_value_t*)* +# CHECK-NEXT: call void %f(%jl_value_t* %v) [ "jl_roots"(%jl_value_t* null) ] +# CHECK-NEXT: br i1 %b2, label %L2, label %L3 + +# CHECK: L2: +# CHECK-NEXT: %f2 = bitcast i8* %fptr to void (%jl_value_t*)* +# CHECK-NEXT: call void @llvm.lifetime.end(i64 8, +# CHECK-NEXT: call void %f2(%jl_value_t* null) + +# CHECK: L3: +# CHECK-NEXT: call void @llvm.lifetime.end(i64 8, +println(""" +define void @lifetime_branches(i8* %fptr, i1 %b, i1 %b2) { + %ptls = call %jl_value_t*** @jl_get_ptls_states() + %ptls_i8 = bitcast %jl_value_t*** %ptls to i8* + br i1 %b, label %L1, label %L3 + +L1: + %v = call noalias %jl_value_t addrspace(10)* @julia.gc_alloc_obj(i8* %ptls_i8, $isz 8, %jl_value_t addrspace(10)* @tag) + %va = addrspacecast %jl_value_t addrspace(10)* %v to %jl_value_t addrspace(11)* + %ptr = call %jl_value_t* @julia.pointer_from_objref(%jl_value_t addrspace(11)* %va) + %f = bitcast i8* %fptr to void (%jl_value_t*)* + call void %f(%jl_value_t* %ptr) [ "jl_roots"(%jl_value_t addrspace(10)* %v) ] + br i1 %b2, label %L2, label %L3 + +L2: + %f2 = bitcast i8* %fptr to void (%jl_value_t*)* + call void %f2(%jl_value_t* null) + br label %L3 + +L3: + ret void +} +""") +# CHECK-LABEL: } + +# CHECK-LABEL: @object_field +# CHECK: call %jl_value_t*** @jl_get_ptls_states() +# CHECK-NOT: @julia.gc_alloc_obj +# CHECK: @jl_gc_pool_alloc +# CHECK: store %jl_value_t addrspace(10)* @tag, %jl_value_t addrspace(10)* addrspace(11)* {{.*}}, !tbaa !0 +println(""" +define void @object_field(%jl_value_t addrspace(10)* %field) { + %ptls = call %jl_value_t*** @jl_get_ptls_states() + %ptls_i8 = bitcast %jl_value_t*** %ptls to i8* + %v = call noalias %jl_value_t addrspace(10)* @julia.gc_alloc_obj(i8* %ptls_i8, $isz 8, %jl_value_t addrspace(10)* @tag) + %va = addrspacecast %jl_value_t addrspace(10)* %v to %jl_value_t addrspace(11)* + %vab = bitcast %jl_value_t addrspace(11)* %va to %jl_value_t addrspace(10)* addrspace(11)* + store %jl_value_t addrspace(10)* %field, %jl_value_t addrspace(10)* addrspace(11)* %vab + ret void +} +""") +# CHECK-LABEL: } + +# CHECK: declare noalias %jl_value_t addrspace(10)* @jl_gc_pool_alloc(i8*, +# CHECK: declare noalias %jl_value_t addrspace(10)* @jl_gc_big_alloc(i8*, +println(""" +declare void @external_function() +declare %jl_value_t*** @jl_get_ptls_states() +declare noalias %jl_value_t addrspace(10)* @julia.gc_alloc_obj(i8*, $isz, %jl_value_t addrspace(10)*) +declare %jl_value_t* @julia.pointer_from_objref(%jl_value_t addrspace(11)*) + +!0 = !{!1, !1, i64 0} +!1 = !{!"jtbaa_tag", !2, i64 0} +!2 = !{!"jtbaa_data", !3, i64 0} +!3 = !{!"jtbaa"} +!4 = !{!5, !5, i64 0} +!5 = !{!"jtbaa_mutab", !6, i64 0} +!6 = !{!"jtbaa_value", !2, i64 0} +""")