From b2ba1717a139be143f78900b3cc989e34fa26b5b Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi Date: Sun, 3 Dec 2023 14:12:35 -0500 Subject: [PATCH 1/9] Stack allocate some genericmemory --- src/ccall.cpp | 21 +- src/codegen.cpp | 16 +- src/genericmemory.c | 60 +++-- src/julia_internal.h | 12 + src/llvm-alloc-helpers.cpp | 2 + src/llvm-alloc-opt.cpp | 433 +++++++++++++++++++++++++++------- src/llvm-late-gc-lowering.cpp | 9 + src/llvm-pass-helpers.cpp | 1 + src/llvm-pass-helpers.h | 1 + 9 files changed, 437 insertions(+), 118 deletions(-) diff --git a/src/ccall.cpp b/src/ccall.cpp index ece0ee24908e8..79294b5ac1f21 100644 --- a/src/ccall.cpp +++ b/src/ccall.cpp @@ -1783,8 +1783,8 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs) assert(!isVa && !llvmcall && nccallargs == 2); const jl_cgval_t &typ = argv[0]; const jl_cgval_t &nel = argv[1]; + auto istyp = argv[0].constant; auto arg_typename = [&] JL_NOTSAFEPOINT { - auto istyp = argv[0].constant; std::string type_str; if (istyp && jl_is_datatype(istyp) && jl_is_genericmemory_type(istyp)){ auto eltype = jl_tparam1(istyp); @@ -1798,8 +1798,23 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs) else type_str = ""; return "Memory{" + type_str + "}[]"; - }; - auto alloc = ctx.builder.CreateCall(prepare_call(jl_allocgenericmemory), { boxed(ctx,typ), emit_unbox(ctx, ctx.types().T_size, nel, (jl_value_t*)jl_ulong_type)}); + }; + auto elsize = emit_unbox(ctx, ctx.types().T_size, nel, (jl_value_t*)jl_ulong_type); + jl_genericmemory_info_t info; + if (istyp && jl_is_datatype(istyp) && jl_is_genericmemory_type(istyp)) { + info = jl_get_genericmemory_info(istyp); + } else { + info = {0, 0, 0, 0}; + } + auto alloc = ctx.builder.CreateCall(prepare_call(jl_allocgenericmemory), + { + boxed(ctx,typ), + elsize, + static_cast(ConstantInt::get(ctx.types().T_size, info.elsize)), + static_cast(ConstantInt::get(getInt8Ty(ctx.builder.getContext()), info.isunion)), + static_cast(ConstantInt::get(getInt8Ty(ctx.builder.getContext()), info.zeroinit)), + static_cast(ConstantInt::get(getInt8Ty(ctx.builder.getContext()), info.isboxed)), + }); setName(ctx.emission_context, alloc, arg_typename); JL_GC_POP(); return mark_julia_type(ctx, alloc, true, jl_any_type); diff --git a/src/codegen.cpp b/src/codegen.cpp index bc7f30f49626d..29495bf36786e 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -1303,12 +1303,20 @@ static const auto sync_gc_total_bytes_func = new JuliaFunction<>{ nullptr, }; static const auto jl_allocgenericmemory = new JuliaFunction{ - XSTR(jl_alloc_genericmemory), + "julia.gc_alloc_genericmemory", [](LLVMContext &C, Type *T_Size) { auto T_prjlvalue = JuliaType::get_prjlvalue_ty(C); return FunctionType::get(T_prjlvalue, // new Memory {T_prjlvalue, // type - T_Size // nelements + T_Size, // nelements + // these fields are for alloc-opt, because + // when compiling for images we need to know these + // to stack allocate arrays + // if it's dynamic, we just set everything to 0 + T_Size, // elsize + getInt8Ty(C), // isunion + getInt8Ty(C), // zeroinit + getInt8Ty(C), // boxed }, false); }, [](LLVMContext &C) { AttrBuilder FnAttrs(C); @@ -1426,7 +1434,7 @@ static const auto gc_loaded_func = new JuliaFunction<>{ AttributeSet FnAttrs = Attributes(C, {Attribute::ReadNone, Attribute::NoSync, Attribute::NoUnwind, Attribute::Speculatable, Attribute::WillReturn, Attribute::NoRecurse}); AttributeSet RetAttrs = Attributes(C, {Attribute::NonNull, Attribute::NoUndef}); return AttributeList::get(C, FnAttrs, 
RetAttrs, - { Attributes(C, {Attribute::NonNull, Attribute::NoUndef, Attribute::ReadNone, Attribute::NoCapture}), + { Attributes(C, {Attribute::NoUndef, Attribute::ReadNone, Attribute::NoCapture}), Attributes(C, {Attribute::NonNull, Attribute::NoUndef, Attribute::ReadNone}) }); }, }; @@ -9432,7 +9440,7 @@ static void init_jit_functions(void) add_named_global(jlfieldindex_func, &jl_field_index); add_named_global(diff_gc_total_bytes_func, &jl_gc_diff_total_bytes); add_named_global(sync_gc_total_bytes_func, &jl_gc_sync_total_bytes); - add_named_global(jl_allocgenericmemory, &jl_alloc_genericmemory); + add_named_global(jl_allocgenericmemory, (void*)NULL); add_named_global(gcroot_flush_func, (void*)NULL); add_named_global(gc_preserve_begin_func, (void*)NULL); add_named_global(gc_preserve_end_func, (void*)NULL); diff --git a/src/genericmemory.c b/src/genericmemory.c index 24b2bac6b2ac1..57c0485037436 100644 --- a/src/genericmemory.c +++ b/src/genericmemory.c @@ -56,19 +56,40 @@ typedef uint64_t wideint_t; #define MAXINTVAL (((size_t)-1)>>1) -jl_genericmemory_t *_new_genericmemory_(jl_value_t *mtype, size_t nel, int8_t isunion, int8_t zeroinit, size_t elsz) +// used by alloc-opt +JL_DLLEXPORT size_t jl_genericmemory_bytesize(const jl_genericmemory_info_t *info, size_t nel) +{ + wideint_t prod = (wideint_t)nel * info->elsize; + if (info->isunion) { + // an extra byte for each isbits union memory element, stored at m->ptr + m->length + prod += nel; + } + if (nel >= MAXINTVAL || prod >= (wideint_t) MAXINTVAL) + return MAXINTVAL; + return (size_t) prod; +} + +// used by codegen to give info to alloc-opt +JL_DLLEXPORT jl_genericmemory_info_t jl_get_genericmemory_info(jl_value_t *mtype) +{ + assert(jl_is_datatype(mtype)); + jl_genericmemory_info_t info; + info.isboxed = ((jl_datatype_t*)mtype)->layout->flags.arrayelem_isboxed; + info.elsize = info.isboxed ? 
sizeof(void*) : ((jl_datatype_t*)mtype)->layout->size; + info.isunion = ((jl_datatype_t*)mtype)->layout->flags.arrayelem_isunion; + info.zeroinit = ((jl_datatype_t*)mtype)->zeroinit; + return info; +} + +jl_genericmemory_t *_new_genericmemory_(jl_value_t *mtype, size_t nel, const jl_genericmemory_info_t *info) { jl_task_t *ct = jl_current_task; char *data; jl_genericmemory_t *m; if (nel == 0) // zero-sized allocation optimization return (jl_genericmemory_t*)((jl_datatype_t*)mtype)->instance; - wideint_t prod = (wideint_t)nel * elsz; - if (isunion) { - // an extra byte for each isbits union memory element, stored at m->ptr + m->length - prod += nel; - } - if (nel >= MAXINTVAL || prod >= (wideint_t) MAXINTVAL) + size_t prod = jl_genericmemory_bytesize(info, nel); + if (prod == MAXINTVAL) jl_exceptionf(jl_argumenterror_type, "invalid GenericMemory size"); size_t tot = (size_t)prod + LLT_ALIGN(sizeof(jl_genericmemory_t),JL_SMALL_BYTE_ALIGNMENT); @@ -89,7 +110,7 @@ jl_genericmemory_t *_new_genericmemory_(jl_value_t *mtype, size_t nel, int8_t is m->length = nel; m->ptr = data; - if (zeroinit) + if (info->zeroinit) memset(data, 0, (size_t)prod); return m; } @@ -114,13 +135,8 @@ JL_DLLEXPORT jl_genericmemory_t *jl_alloc_genericmemory(jl_value_t *mtype, size_ if (nel == 0) // zero-sized allocation optimization fast path return m; - size_t elsz = layout->size; - int isboxed = layout->flags.arrayelem_isboxed; - int isunion = layout->flags.arrayelem_isunion; - int zi = ((jl_datatype_t*)mtype)->zeroinit; - if (isboxed) - elsz = sizeof(void*); - return _new_genericmemory_(mtype, nel, isunion, zi, elsz); + jl_genericmemory_info_t info = jl_get_genericmemory_info(mtype); + return _new_genericmemory_(mtype, nel, &info); } JL_DLLEXPORT jl_genericmemory_t *jl_string_to_genericmemory(jl_value_t *str) @@ -447,18 +463,18 @@ JL_DLLEXPORT jl_genericmemory_t *jl_genericmemory_copy_slice(jl_genericmemory_t { jl_value_t *mtype = (jl_value_t*)jl_typetagof(mem); const jl_datatype_layout_t *layout = ((jl_datatype_t*)mtype)->layout; - size_t elsz = layout->size; - int isunion = layout->flags.arrayelem_isunion; - jl_genericmemory_t *new_mem = _new_genericmemory_(mtype, len, isunion, 0, elsz); - if (isunion) { - memcpy(new_mem->ptr, (char*)mem->ptr + (size_t)data * elsz, len * elsz); + jl_genericmemory_info_t info = jl_get_genericmemory_info(mtype); + info.zeroinit = 0; + jl_genericmemory_t *new_mem = _new_genericmemory_(mtype, len, &info); + if (info.isunion) { + memcpy(new_mem->ptr, (char*)mem->ptr + (size_t)data * info.elsize, len * info.elsize); memcpy(jl_genericmemory_typetagdata(new_mem), jl_genericmemory_typetagdata(mem) + (size_t)data, len); } else if (layout->first_ptr != -1) { - memmove_refs((_Atomic(void*)*)new_mem->ptr, (_Atomic(void*)*)data, len * elsz / sizeof(void*)); + memmove_refs((_Atomic(void*)*)new_mem->ptr, (_Atomic(void*)*)data, len * info.elsize / sizeof(void*)); } else if (data != NULL) { - memcpy(new_mem->ptr, data, len * elsz); + memcpy(new_mem->ptr, data, len * info.elsize); } return new_mem; } diff --git a/src/julia_internal.h b/src/julia_internal.h index da025a900400e..1c5ec1a89f7e4 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -639,6 +639,13 @@ typedef union { uint8_t packed; } jl_code_info_flags_t; +typedef struct { + size_t elsize; + uint8_t isunion; + uint8_t zeroinit; + uint8_t isboxed; +} jl_genericmemory_info_t; + // -- functions -- // JL_DLLEXPORT jl_code_info_t *jl_type_infer(jl_method_instance_t *li, size_t world, int force); @@ -988,6 +995,11 @@ size_t 
external_blob_index(jl_value_t *v) JL_NOTSAFEPOINT; uint8_t jl_object_in_image(jl_value_t* v) JL_NOTSAFEPOINT; +// used by alloc-opt +JL_DLLEXPORT size_t jl_genericmemory_bytesize(const jl_genericmemory_info_t *info, size_t nel); +// used by codegen to give info to alloc-opt +JL_DLLEXPORT jl_genericmemory_info_t jl_get_genericmemory_info(jl_value_t *mtype); + // the first argument to jl_idtable_rehash is used to return a value // make sure it is rooted if it is used after the function returns JL_DLLEXPORT jl_genericmemory_t *jl_idtable_rehash(jl_genericmemory_t *a, size_t newsz); diff --git a/src/llvm-alloc-helpers.cpp b/src/llvm-alloc-helpers.cpp index 953ecc1830142..4578f09d4c558 100644 --- a/src/llvm-alloc-helpers.cpp +++ b/src/llvm-alloc-helpers.cpp @@ -249,6 +249,8 @@ void jl_alloc::runEscapeAnalysis(llvm::CallInst *I, EscapeAnalysisRequiredArgs r } if (required.pass.write_barrier_func == callee) return true; + if (required.pass.gc_loaded_func == callee) + return true; auto opno = use->getOperandNo(); // Uses in `jl_roots` operand bundle are not counted as escaping, everything else is. if (!call->isBundleOperand(opno) || diff --git a/src/llvm-alloc-opt.cpp b/src/llvm-alloc-opt.cpp index 5df4f52aca425..3b1a149021b9d 100644 --- a/src/llvm-alloc-opt.cpp +++ b/src/llvm-alloc-opt.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -109,17 +110,18 @@ struct AllocOpt : public JuliaPassContext { Function *lifetime_start; Function *lifetime_end; + bool cfgChanged = false; bool doInitialization(Module &m); - bool runOnFunction(Function &F, function_ref GetDT); + bool runOnFunction(Function &F, FunctionAnalysisManager &AM); }; struct Optimizer { - Optimizer(Function &F, AllocOpt &pass, function_ref GetDT) + Optimizer(Function &F, AllocOpt &pass, FunctionAnalysisManager &AM) : F(F), ORE(&F), pass(pass), - GetDT(std::move(GetDT)) + AM(AM) {} void initialize(); @@ -145,16 +147,23 @@ struct Optimizer { void splitOnStack(CallInst *orig_inst); void optimizeTag(CallInst *orig_inst); + void optimizeObject(CallInst *orig_inst, size_t sz); + void optimizeArray(CallInst *orig_inst, jl_genericmemory_info_t info); + + void moveSizedArrayToStack(CallInst *orig, jl_genericmemory_info_t info); + void moveUnsizedArrayToStack(CallInst *orig, jl_genericmemory_info_t info); + void replaceArrayUses(CallInst *orig, Value *root, function_ref shell, Value *data); + Function &F; OptimizationRemarkEmitter ORE; AllocOpt &pass; DominatorTree *_DT = nullptr; - function_ref GetDT; + FunctionAnalysisManager &AM; DominatorTree &getDomTree() { if (!_DT) - _DT = &GetDT(); + _DT = &AM.getResult(F); return *_DT; } struct Lifetime { @@ -190,6 +199,7 @@ struct Optimizer { }; SetVector> worklist; + DenseMap arrays; SmallVector removed; AllocUseInfo use_info; CheckInst::Stack check_stack; @@ -215,81 +225,308 @@ void Optimizer::initialize() } } -void Optimizer::optimizeAll() -{ - while (!worklist.empty()) { - auto item = worklist.pop_back_val(); - auto orig = item.first; - size_t sz = item.second; - checkInst(orig); - if (use_info.escaped) { - REMARK([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) - << "GC allocation escaped " << ore::NV("GC Allocation", orig); - }); - if (use_info.hastypeof) - optimizeTag(orig); - continue; - } - if (use_info.haserror || use_info.returned) { - REMARK([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) - << "GC allocation has error or was returned " << ore::NV("GC Allocation", orig); - }); - if (use_info.hastypeof) - 
optimizeTag(orig); - continue; - } - if (!use_info.addrescaped && !use_info.hasload && (!use_info.haspreserve || - !use_info.refstore)) { - REMARK([&]() { - return OptimizationRemark(DEBUG_TYPE, "Dead Allocation", orig) - << "GC allocation removed " << ore::NV("GC Allocation", orig); - }); - // No one took the address, no one reads anything and there's no meaningful - // preserve of fields (either no preserve/ccall or no object reference fields) - // We can just delete all the uses. - removeAlloc(orig); - continue; +void Optimizer::optimizeObject(CallInst *orig, size_t sz) { + checkInst(orig); + if (use_info.escaped) { + REMARK([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) + << "GC allocation escaped " << ore::NV("GC Allocation", orig); + }); + if (use_info.hastypeof) + optimizeTag(orig); + return; + } + if (use_info.haserror || use_info.returned) { + REMARK([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) + << "GC allocation has error or was returned " << ore::NV("GC Allocation", orig); + }); + if (use_info.hastypeof) + optimizeTag(orig); + return; + } + if (!use_info.addrescaped && !use_info.hasload && (!use_info.haspreserve || + !use_info.refstore)) { + REMARK([&]() { + return OptimizationRemark(DEBUG_TYPE, "Dead Allocation", orig) + << "GC allocation removed " << ore::NV("GC Allocation", orig); + }); + // No one took the address, no one reads anything and there's no meaningful + // preserve of fields (either no preserve/ccall or no object reference fields) + // We can just delete all the uses. + removeAlloc(orig); + return; + } + bool has_ref = use_info.has_unknown_objref; + bool has_refaggr = use_info.has_unknown_objrefaggr; + for (auto memop: use_info.memops) { + auto &field = memop.second; + if (field.hasobjref) { + has_ref = true; + // This can be relaxed a little based on hasload + // TODO: add support for hasaggr load/store + if (field.hasaggr || field.multiloc || field.size != sizeof(void*)) { + has_refaggr = true; + break; + } } - bool has_ref = use_info.has_unknown_objref; - bool has_refaggr = use_info.has_unknown_objrefaggr; - for (auto memop: use_info.memops) { - auto &field = memop.second; - if (field.hasobjref) { - has_ref = true; - // This can be relaxed a little based on hasload - // TODO: add support for hasaggr load/store - if (field.hasaggr || field.multiloc || field.size != sizeof(void*)) { - has_refaggr = true; - break; + } + if (has_refaggr) { + REMARK([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) + << "GC allocation has unusual object reference, unable to move to stack " << ore::NV("GC Allocation", orig); + }); + if (use_info.hastypeof) + optimizeTag(orig); + return; + } + if (!use_info.hasunknownmem && !use_info.addrescaped) { + REMARK([&](){ + return OptimizationRemark(DEBUG_TYPE, "Stack Split Allocation", orig) + << "GC allocation split on stack " << ore::NV("GC Allocation", orig); + }); + // No one actually care about the memory layout of this object, split it. 
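// A minimal sketch (hypothetical IR, heavily simplified) of the split performed
// below: a 16-byte allocation whose two 8-byte fields are only ever loaded or
// stored individually,
//   %obj = call {} addrspace(10)* @julia.gc_alloc_obj(ptr %ct, i64 16, ptr %ty)
//   store i64 %x, ... (field at offset 0) ; %v = load i64, ... (offset 8)
// becomes one alloca per field, which later SROA/mem2reg can erase entirely:
//   %field0 = alloca i64
//   %field1 = alloca i64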
+ splitOnStack(orig); + return; + } + REMARK([&](){ + return OptimizationRemark(DEBUG_TYPE, "Stack Move Allocation", orig) + << "GC allocation moved to stack " << ore::NV("GC Allocation", orig); + }); + // The object has no fields with mixed reference accesses + moveToStack(orig, sz, has_ref, use_info.allockind); +} + +void Optimizer::moveSizedArrayToStack(CallInst *orig, jl_genericmemory_info_t info) { + auto length = orig->getArgOperand(1); + auto ilen = cast<ConstantInt>(length)->getZExtValue(); + size_t bytes = jl_genericmemory_bytesize(&info, ilen); + size_t maxStackAlloc = 4096; // TODO parameterize by module flag/ctor param + if (bytes > maxStackAlloc) { + dbgs() << "Array was too large to stack allocate\n"; + REMARK([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) + << "GC genericmemory allocation size is too large " << ore::NV("GC GenericMemory Allocation", orig); + }); + return; + } + auto align = orig->getRetAlign().valueOrOne(); + IRBuilder<> builder(&*F.getEntryBlock().getFirstInsertionPt()); + auto T_size = pass.DL->getIntPtrType(builder.getContext()); + auto data = builder.CreateAlloca(Type::getInt8Ty(builder.getContext()), ConstantInt::get(T_size, bytes)); + data->setAlignment(align); + data->takeName(orig); + auto root = Constant::getNullValue(orig->getType()); // technically valid to root this + Value *shellp2i = nullptr; + + auto shell = [&]() mutable { + if (shellp2i) + return shellp2i; + auto shellData = builder.CreateAlloca(Type::getInt8PtrTy(builder.getContext()), ConstantInt::get(T_size, 2)); + shellData->setAlignment(align); + shellData->setName(data->getName() + ".shell_data"); + auto lenptr = builder.CreateBitCast(shellData, length->getType()->getPointerTo(shellData->getType()->getPointerAddressSpace())); + builder.CreateAlignedStore(length, lenptr, align); + auto dataptr = builder.CreateConstGEP1_64(Type::getInt8PtrTy(builder.getContext()), shellData, 1); + builder.CreateAlignedStore(data, dataptr, Align(std::min(align.value(), (uint64_t) pass.DL->getPointerSize()))); + shellp2i = builder.CreatePtrToInt(shellData, pass.DL->getIntPtrType(builder.getContext())); + return shellp2i; + }; + + // This is kind of a dirty cleanup, but subsequent DCE should clean up all the + // nullptr manipulations + replaceArrayUses(orig, root, shell, data); + orig->eraseFromParent(); +} + +void Optimizer::moveUnsizedArrayToStack(CallInst *orig, jl_genericmemory_info_t info) { + size_t maxStackAlloc = 4096; // TODO parameterize by module flag/ctor param + IRBuilder<> builder(&*F.getEntryBlock().getFirstInsertionPt()); + StringRef origName = orig->getName(); + auto T_size = pass.DL->getIntPtrType(builder.getContext()); + auto data = builder.CreateAlloca(Type::getInt8Ty(builder.getContext()), ConstantInt::get(T_size, maxStackAlloc)); + auto align = orig->getRetAlign().valueOrOne(); + data->setAlignment(align); + data->setName(origName + ".stack_data"); + // don't store the length pointer yet, since it might not be computed here + + // this makes sure we update domtree when splitting the bb, so we preserve the analysis + _DT = AM.getCachedResult<DominatorTreeAnalysis>(F); + auto origBB = orig->getParent(); + builder.SetInsertPoint(orig); + auto length = orig->getArgOperand(1); + auto fallback = SplitBlockAndInsertIfThen(builder.CreateICmpUGT(length, ConstantInt::get(length->getType(), maxStackAlloc)), orig, false, nullptr, _DT); + pass.cfgChanged = true; + fallback->getParent()->setName("stack_alloc_fallback"); + builder.SetInsertPoint(orig); + auto ownerPhi = builder.CreatePHI(orig->getType(), 2); + auto
dataPhi = builder.CreatePHI(Type::getInt8PtrTy(builder.getContext()), 2); + PHINode *shellPhi = nullptr; + + auto shell = [&]() mutable { + if (shellPhi) + return shellPhi; + builder.SetInsertPoint(origBB->getTerminator()); + auto shellData = builder.CreateAlloca(Type::getInt8PtrTy(builder.getContext()), ConstantInt::get(T_size, 2)); + shellData->setAlignment(align); + shellData->setName(data->getName() + ".shell_data"); + auto lenptr = builder.CreateBitCast(shellData, length->getType()->getPointerTo(shellData->getType()->getPointerAddressSpace())); + builder.CreateAlignedStore(length, lenptr, align); + auto dataptr = builder.CreateConstGEP1_64(Type::getInt8PtrTy(builder.getContext()), shellData, 1); + builder.CreateAlignedStore(data, dataptr, Align(std::min(align.value(), (uint64_t) pass.DL->getPointerSize()))); + auto shellStack = builder.CreatePtrToInt(shellData, pass.DL->getIntPtrType(builder.getContext())); + builder.SetInsertPoint(ownerPhi); + shellPhi = builder.CreatePHI(shellData->getType(), 2); + shellPhi->addIncoming(shellStack, origBB); + return shellPhi; + }; + + // Replace all the uses now, before we make the original instruction conditional on array size + replaceArrayUses(orig, ownerPhi, shell, dataPhi); + + orig->moveBefore(fallback); + builder.SetInsertPoint(fallback); + auto casted = builder.CreateBitCast(orig, Type::getInt8PtrTy(builder.getContext())->getPointerTo(orig->getType()->getPointerAddressSpace())); + auto fallbackDataPtr = builder.CreateConstGEP1_64(Type::getInt8PtrTy(builder.getContext()), casted, 1, origName + ".fallback_data_ptr"); + auto fallbackData = builder.CreateAlignedLoad(Type::getInt8PtrTy(builder.getContext()), fallbackDataPtr, Align(pass.DL->getPointerSize()), origName + ".fallback_data"); + auto datacast = builder.CreateBitCast(data, Type::getInt8PtrTy(builder.getContext())); + ownerPhi->addIncoming(Constant::getNullValue(orig->getType()), origBB); + ownerPhi->addIncoming(orig, fallback->getParent()); + dataPhi->addIncoming(datacast, origBB); + dataPhi->addIncoming(fallbackData, fallback->getParent()); + if (shellPhi) { + assert(pass.pointer_from_objref_func); + auto shell = builder.CreateCall(pass.pointer_from_objref_func, {orig}); + shellPhi->addIncoming(shell, fallback->getParent()); + } + + dataPhi->setName(origName + ".data"); + ownerPhi->takeName(orig); + ownerPhi->getParent()->setName("allocated_array"); +} + +void Optimizer::replaceArrayUses(CallInst *alloc, Value *root, function_refshell, Value *data) { + IRBuilder<> builder(alloc->getContext()); + auto type = alloc->getArgOperand(0); + auto length = alloc->getArgOperand(1); + // we need to replace all of the length/data accesses upfront, because in the case of an unsized array alloc + // it's not legal to derive it from the phi node (may be nullptr) + for (auto &field : use_info.memops) { + for (auto &access : field.second.accesses) { + assert(isa(access.inst) && "Should only have loads of array length/data"); + auto load = cast(access.inst); + auto offset = access.offset; + assert((offset == 0 || offset == pass.DL->getPointerSize()) && "Should only have loads of array length/data"); + builder.SetInsertPoint(load); + if (offset == 0) { + assert(load->getType()->isIntegerTy() && "Should only have loads of array length from offset 0"); + assert(load->getType()->getIntegerBitWidth() <= length->getType()->getIntegerBitWidth() && "Should only have loads of array length from offset 0"); + auto len = builder.CreateTrunc(length, load->getType()); // llvm may load a smaller int, but hopefully 
shouldn't go larger + if (len != length) { + len->takeName(length); + if (auto I = dyn_cast(len)) + if (auto leni = dyn_cast(length)) + I->copyMetadata(*leni); + } + load->replaceAllUsesWith(len); + load->eraseFromParent(); + } else { + if (load->getType()->isIntegerTy()) { + assert(cast(load->getType())->getBitWidth() == pass.DL->getPointerSizeInBits() && "Should only have loads of array data from offset 8"); + auto p2i = builder.CreatePtrToInt(data, load->getType()); + load->replaceAllUsesWith(p2i); + load->eraseFromParent(); + } else { + assert(load->getType()->isPointerTy() && "Should only have loads of array data from offset 8"); + auto atype = PointerType::getWithSamePointeeType(cast(data->getType()), load->getType()->getPointerAddressSpace()); + auto acast = builder.CreateAddrSpaceCast(data, atype); + auto bcast = builder.CreateBitCast(acast, load->getType()); + load->replaceAllUsesWith(bcast); + load->eraseFromParent(); } } } - if (has_refaggr) { - REMARK([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) - << "GC allocation has unusual object reference, unable to move to stack " << ore::NV("GC Allocation", orig); - }); - if (use_info.hastypeof) - optimizeTag(orig); - continue; - } - if (!use_info.hasunknownmem && !use_info.addrescaped) { - REMARK([&](){ - return OptimizationRemark(DEBUG_TYPE, "Stack Split Allocation", orig) - << "GC allocation split on stack " << ore::NV("GC Allocation", orig); - }); - // No one actually care about the memory layout of this object, split it. - splitOnStack(orig); - continue; + } + + while (!alloc->use_empty()) { + auto &use = *alloc->use_begin(); + auto user = cast(use.getUser()); + if (auto CI = dyn_cast(user)) { + auto callee = CI->getCalledFunction(); + if (callee == pass.pointer_from_objref_func) { + use.set(shell()); + continue; + } else if (callee == pass.typeof_func) { + use.set(type); + continue; + } + if (CI->isArgOperand(&use)) { + auto arg = CI->getArgOperandNo(&use); + CI->removeParamAttr(arg, Attribute::NonNull); // can actually be null now + } } - REMARK([&](){ - return OptimizationRemark(DEBUG_TYPE, "Stack Move Allocation", orig) - << "GC allocation moved to stack " << ore::NV("GC Allocation", orig); + use.set(root); + } +} + +void Optimizer::optimizeArray(CallInst *orig, jl_genericmemory_info_t info) { + checkInst(orig); + dbgs() << "checking array allocation\n"; + if (use_info.escaped) { + REMARK([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) + << "GC genericmemory allocation escaped " << ore::NV("GC GenericMemory Allocation", orig); + }); + return; + } + if (use_info.haserror || use_info.returned) { + REMARK([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) + << "GC allocation has error or was returned " << ore::NV("GC GenericMemory Allocation", orig); }); - // The object has no fields with mix reference access - moveToStack(orig, sz, has_ref, use_info.allockind); + return; + } + if (info.zeroinit || info.isunion || info.isboxed) { + // This is a hack to detect arrays of possibly-pointers, which must always be zero initialized. + // TODO actually support this maybe? 
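// A minimal sketch of what the flag check above admits, assuming the usual
// element-layout flags (illustrative cases, not taken from this patch):
//   Memory{Float64}          -> eligible: elsize = 8, isboxed = isunion = zeroinit = 0
//   Memory{Any}              -> rejected: isboxed (and zeroinit) are set
//   Memory{Union{Int8,Bool}} -> rejected: isunion is set
// so only plain isbits element data ever reaches the stack-allocation path.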
+ // I think in the future we may be able to support arrays of pointers by hooking into + // the gc roots alloca and adding our own "roots" (stack-allocated pointer arrays) + // We can technically do exactly one pointer-isbits array as well since the flag + // bits are at the end of the array (so we pretend the roots array is shorter than + // it actually is in late-gc-lowering), but that won't scale to multiple of those. + REMARK([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) + << "GC genericmemory allocation is probably a pointer array " << ore::NV("GC GenericMemory Allocation", orig); + }); + return; + } + // at this point the only valid real operations on orig are loading the length, + // loading the data pointer, and getting a tracked pointer via gc_loaded. + // we will assume that if the data pointer or a tracked pointer escapes, then the + // original array must have also escaped, and therefore we would not have gotten here. + + // since we are here, we're free to turn the whole thing into a stack allocation + // and remove the original allocation. + dbgs() << "Moving array allocation to stack\n"; + if (isa(orig->getArgOperand(1))) { + dbgs() << "allocation was sized\n"; + moveSizedArrayToStack(orig, info); + } else { + dbgs() << "allocation was unsized\n"; + moveUnsizedArrayToStack(orig, info); + } +} + +void Optimizer::optimizeAll() +{ + while (!worklist.empty()) { + auto item = worklist.pop_back_val(); + auto orig = item.first; + size_t sz = item.second; + optimizeObject(orig, sz); + } + for (auto &item: arrays) { + optimizeArray(item.first, item.second); } } @@ -344,15 +581,33 @@ ssize_t Optimizer::getGCAllocSize(Instruction *I) auto call = dyn_cast(I); if (!call) return -1; - if (call->getCalledOperand() != pass.alloc_obj_func) - return -1; - assert(call->arg_size() == 3); - auto CI = dyn_cast(call->getArgOperand(1)); - if (!CI) + if (!call->getCalledOperand()) return -1; - size_t sz = (size_t)CI->getZExtValue(); - if (sz < IntegerType::MAX_INT_BITS / 8 && sz < INT32_MAX) - return sz; + if (call->getCalledOperand() == pass.alloc_obj_func) { + assert(call->arg_size() == 3); + if (auto CI = dyn_cast(call->getArgOperand(1))) { + size_t sz = (size_t)CI->getZExtValue(); + if (sz < IntegerType::MAX_INT_BITS / 8 && sz < INT32_MAX) + return sz; + } + } + if (call->getCalledOperand() == pass.alloc_genericmemory_func) { + assert(call->arg_size() == 6); + if (auto CI = dyn_cast(call->getArgOperand(2))) { + size_t elsz = (size_t)CI->getZExtValue(); + if (elsz != 0) { + auto isunion = dyn_cast(call->getArgOperand(3)); + auto zeroinit = dyn_cast(call->getArgOperand(4)); + auto isboxed = dyn_cast(call->getArgOperand(5)); + if (isunion && zeroinit && isboxed) { + jl_genericmemory_info_t info{elsz, (uint8_t)isunion->getZExtValue(), + (uint8_t)zeroinit->getZExtValue(), + (uint8_t)isboxed->getZExtValue()}; + arrays[call] = info; + } + } + } + } return -1; } @@ -1252,7 +1507,7 @@ void Optimizer::splitOnStack(CallInst *orig_inst) bool AllocOpt::doInitialization(Module &M) { initAll(M); - if (!alloc_obj_func) + if (!alloc_obj_func && !alloc_genericmemory_func) return false; DL = &M.getDataLayout(); @@ -1263,13 +1518,13 @@ bool AllocOpt::doInitialization(Module &M) return true; } -bool AllocOpt::runOnFunction(Function &F, function_ref GetDT) +bool AllocOpt::runOnFunction(Function &F, FunctionAnalysisManager &AM) { - if (!alloc_obj_func) { - LLVM_DEBUG(dbgs() << "AllocOpt: no alloc_obj function found, skipping pass\n"); + if (!alloc_obj_func && !alloc_genericmemory_func) { + 
LLVM_DEBUG(dbgs() << "AllocOpt: no alloc_obj/alloc_genericmemory function found, skipping pass\n"); return false; } - Optimizer optimizer(F, *this, std::move(GetDT)); + Optimizer optimizer(F, *this, AM); optimizer.initialize(); optimizer.optimizeAll(); bool modified = optimizer.finalize(); @@ -1284,11 +1539,11 @@ bool AllocOpt::runOnFunction(Function &F, function_ref GetDT) PreservedAnalyses AllocOptPass::run(Function &F, FunctionAnalysisManager &AM) { AllocOpt opt; bool modified = opt.doInitialization(*F.getParent()); - if (opt.runOnFunction(F, [&]()->DominatorTree &{ return AM.getResult(F); })) { + if (opt.runOnFunction(F, AM)) { modified = true; } if (modified) { - auto preserved = PreservedAnalyses::allInSet(); + auto preserved = opt.cfgChanged ? PreservedAnalyses::none() : PreservedAnalyses::allInSet(); preserved.preserve(); return preserved; } else { diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index edb3aad8f2328..35af1c04271c8 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2458,6 +2458,15 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { // Update the pointer numbering. UpdatePtrNumbering(CI, newI, S); + } else if (alloc_genericmemory_func && callee == alloc_genericmemory_func) { + assert(CI->arg_size() == 6); + auto gmalloc = F.getParent()->getOrInsertFunction("jl_alloc_genericmemory", T_prjlvalue, T_prjlvalue, T_size); + IRBuilder<> builder(CI); + builder.SetCurrentDebugLocation(CI->getDebugLoc()); + auto newI = builder.CreateCall(gmalloc, {CI->getArgOperand(0), CI->getArgOperand(1)}); + newI->takeName(CI); + CI->replaceAllUsesWith(newI); + UpdatePtrNumbering(CI, newI, S); } else if (typeof_func && callee == typeof_func) { assert(CI->arg_size() == 1); IRBuilder<> builder(CI); diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index d17ce3105135c..9db9424956461 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -53,6 +53,7 @@ void JuliaPassContext::initFunctions(Module &M) typeof_func = M.getFunction("julia.typeof"); write_barrier_func = M.getFunction("julia.write_barrier"); alloc_obj_func = M.getFunction("julia.gc_alloc_obj"); + alloc_genericmemory_func = M.getFunction("julia.gc_alloc_genericmemory"); call_func = M.getFunction("julia.call"); call2_func = M.getFunction("julia.call2"); call3_func = M.getFunction("julia.call3"); diff --git a/src/llvm-pass-helpers.h b/src/llvm-pass-helpers.h index 346500df51ca1..76613cf04f2bb 100644 --- a/src/llvm-pass-helpers.h +++ b/src/llvm-pass-helpers.h @@ -58,6 +58,7 @@ struct JuliaPassContext { llvm::Function *pointer_from_objref_func; llvm::Function *gc_loaded_func; llvm::Function *alloc_obj_func; + llvm::Function *alloc_genericmemory_func; llvm::Function *typeof_func; llvm::Function *write_barrier_func; llvm::Function *call_func; From 21cefdb01bd4ef2a1ee2a73aa44de4b5653684d7 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi Date: Sun, 3 Dec 2023 22:49:44 -0500 Subject: [PATCH 2/9] Some fixes, make the doctest optimize properly --- src/llvm-alloc-opt.cpp | 26 ++++++++++++++------------ src/llvm-julia-licm.cpp | 16 ++++++++++++++++ src/pipeline.cpp | 22 ++++++++++++++++------ 3 files changed, 46 insertions(+), 18 deletions(-) diff --git a/src/llvm-alloc-opt.cpp b/src/llvm-alloc-opt.cpp index 3b1a149021b9d..756ebb4d5f3c9 100644 --- a/src/llvm-alloc-opt.cpp +++ b/src/llvm-alloc-opt.cpp @@ -150,9 +150,9 @@ struct Optimizer { void optimizeObject(CallInst *orig_inst, size_t sz); void optimizeArray(CallInst 
*orig_inst, jl_genericmemory_info_t info); - void moveSizedArrayToStack(CallInst *orig, jl_genericmemory_info_t info); - void moveUnsizedArrayToStack(CallInst *orig, jl_genericmemory_info_t info); - void replaceArrayUses(CallInst *orig, Value *root, function_ref shell, Value *data); + void moveSizedBitsArrayToStack(CallInst *orig, jl_genericmemory_info_t info); + void moveUnsizedBitsArrayToStack(CallInst *orig, jl_genericmemory_info_t info); + void replaceBitsArrayUses(CallInst *orig, Value *root, function_ref shell, Value *data); Function &F; OptimizationRemarkEmitter ORE; @@ -297,7 +297,7 @@ void Optimizer::optimizeObject(CallInst *orig, size_t sz) { moveToStack(orig, sz, has_ref, use_info.allockind); } -void Optimizer::moveSizedArrayToStack(CallInst *orig, jl_genericmemory_info_t info) { +void Optimizer::moveSizedBitsArrayToStack(CallInst *orig, jl_genericmemory_info_t info) { auto length = orig->getArgOperand(1); auto ilen = cast(length)->getZExtValue(); size_t bytes = jl_genericmemory_bytesize(&info, ilen); @@ -335,11 +335,11 @@ void Optimizer::moveSizedArrayToStack(CallInst *orig, jl_genericmemory_info_t in // This is kind of a dirty cleanup, but subsequent DCE should clean up all the // nullptr manipulations - replaceArrayUses(orig, root, shell, data); + replaceBitsArrayUses(orig, root, shell, data); orig->eraseFromParent(); } -void Optimizer::moveUnsizedArrayToStack(CallInst *orig, jl_genericmemory_info_t info) { +void Optimizer::moveUnsizedBitsArrayToStack(CallInst *orig, jl_genericmemory_info_t info) { size_t maxStackAlloc = 4096; // TODO parameterize by module flag/ctor param IRBuilder<> builder(&*F.getEntryBlock().getFirstInsertionPt()); StringRef origName = orig->getName(); @@ -355,7 +355,8 @@ void Optimizer::moveUnsizedArrayToStack(CallInst *orig, jl_genericmemory_info_t auto origBB = orig->getParent(); builder.SetInsertPoint(orig); auto length = orig->getArgOperand(1); - auto fallback = SplitBlockAndInsertIfThen(builder.CreateICmpUGT(length, ConstantInt::get(length->getType(), maxStackAlloc)), orig, false, nullptr, _DT); + auto maxElements = ConstantInt::get(length->getType(), maxStackAlloc / info.elsize); + auto fallback = SplitBlockAndInsertIfThen(builder.CreateICmpUGT(length, maxElements), orig, false, nullptr, _DT); pass.cfgChanged = true; fallback->getParent()->setName("stack_alloc_fallback"); builder.SetInsertPoint(orig); @@ -380,9 +381,9 @@ void Optimizer::moveUnsizedArrayToStack(CallInst *orig, jl_genericmemory_info_t shellPhi->addIncoming(shellStack, origBB); return shellPhi; }; - + // Replace all the uses now, before we make the original instruction conditional on array size - replaceArrayUses(orig, ownerPhi, shell, dataPhi); + replaceBitsArrayUses(orig, ownerPhi, shell, dataPhi); orig->moveBefore(fallback); builder.SetInsertPoint(fallback); @@ -405,7 +406,7 @@ void Optimizer::moveUnsizedArrayToStack(CallInst *orig, jl_genericmemory_info_t ownerPhi->getParent()->setName("allocated_array"); } -void Optimizer::replaceArrayUses(CallInst *alloc, Value *root, function_refshell, Value *data) { +void Optimizer::replaceBitsArrayUses(CallInst *alloc, Value *root, function_refshell, Value *data) { IRBuilder<> builder(alloc->getContext()); auto type = alloc->getArgOperand(0); auto length = alloc->getArgOperand(1); @@ -470,6 +471,7 @@ void Optimizer::replaceArrayUses(CallInst *alloc, Value *root, function_ref(orig->getArgOperand(1))) { dbgs() << "allocation was sized\n"; - moveSizedArrayToStack(orig, info); + moveSizedBitsArrayToStack(orig, info); } else { dbgs() << 
"allocation was unsized\n"; - moveUnsizedArrayToStack(orig, info); + moveUnsizedBitsArrayToStack(orig, info); } } diff --git a/src/llvm-julia-licm.cpp b/src/llvm-julia-licm.cpp index e76beaa3df44f..25fc914c555ae 100644 --- a/src/llvm-julia-licm.cpp +++ b/src/llvm-julia-licm.cpp @@ -339,6 +339,22 @@ struct JuliaLICM : public JuliaPassContext { MSSAU.insertDef(cast(clear_mdef), true); } changed = true; + } else if (callee == gc_loaded_func) { + bool valid = true; + for (std::size_t i = 0; i < call->arg_size(); i++) { + if (!makeLoopInvariant(L, call->getArgOperand(i), + changed, preheader->getTerminator(), + MSSAU, SE)) { + valid = false; + LLVM_DEBUG(dbgs() << "Failed to hoist gc_loaded argument: " << *call->getArgOperand(i) << "\n"); + break; + } + } + if (!valid) { + LLVM_DEBUG(dbgs() << "Failed to hoist gc_loaded: " << *call << "\n"); + continue; + } + moveInstructionBefore(*call, *preheader->getTerminator(), MSSAU, SE); } } } diff --git a/src/pipeline.cpp b/src/pipeline.cpp index 4b099521d33f9..194324cf407e7 100644 --- a/src/pipeline.cpp +++ b/src/pipeline.cpp @@ -480,20 +480,30 @@ static void buildScalarOptimizerPipeline(FunctionPassManager &FPM, PassBuilder * FPM.addPass(IRCEPass()); FPM.addPass(InstCombinePass()); FPM.addPass(JumpThreadingPass()); - } - if (O.getSpeedupLevel() >= 3) { + // TODO we traded gvn later for this gvn and replaced it with earlycse, + // is this a good trade? it really helps with eliding array allocations FPM.addPass(GVNPass()); - } - if (O.getSpeedupLevel() >= 2) { FPM.addPass(DSEPass()); invokePeepholeEPCallbacks(FPM, PB, O); FPM.addPass(SimplifyCFGPass(aggressiveSimplifyCFGOptions())); + // TODO is it worth updating memoryssa in alloc-opt? it would mean at O3 we don't have a + // guaranteed recompute of mssa for LICM JULIA_PASS(FPM.addPass(AllocOptPass())); { + // last-chance loop optimizations + // most array allocations that are elided end up wanting these LoopPassManager LPM; + // TODO reenable the O3 guard if this is too expensive + // if (O.getSpeedupLevel() >= 3) { + LPM.addPass(LICMPass(LICMOptions())); + LPM.addPass(JuliaLICMPass()); + LPM.addPass(IndVarSimplifyPass()); + LPM.addPass(LoopIdiomRecognizePass()); + // LPM.addPass(LoopFullUnrollPass()); // doesn't support memoryssa preservation in LLVM 15 + // } LPM.addPass(LoopDeletionPass()); LPM.addPass(LoopInstSimplifyPass()); - FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM))); + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), true)); } FPM.addPass(LoopDistributePass()); } @@ -572,7 +582,7 @@ static void buildCleanupPipeline(ModulePassManager &MPM, PassBuilder *PB, Optimi FunctionPassManager FPM; JULIA_PASS(FPM.addPass(DemoteFloat16Pass())); if (O.getSpeedupLevel() >= 2) { - FPM.addPass(GVNPass()); + FPM.addPass(EarlyCSEPass()); } MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } From 5d0464cbc5d71c53db7f1f3a1da607a3deaa04ed Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi Date: Sat, 9 Dec 2023 08:45:31 -0500 Subject: [PATCH 3/9] Restore pipeline --- src/pipeline.cpp | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/src/pipeline.cpp b/src/pipeline.cpp index 194324cf407e7..4b099521d33f9 100644 --- a/src/pipeline.cpp +++ b/src/pipeline.cpp @@ -480,30 +480,20 @@ static void buildScalarOptimizerPipeline(FunctionPassManager &FPM, PassBuilder * FPM.addPass(IRCEPass()); FPM.addPass(InstCombinePass()); FPM.addPass(JumpThreadingPass()); - // TODO we traded gvn later for this gvn and replaced it with earlycse, - // is this a good 
trade? it really helps with eliding array allocations + } + if (O.getSpeedupLevel() >= 3) { FPM.addPass(GVNPass()); + } + if (O.getSpeedupLevel() >= 2) { FPM.addPass(DSEPass()); invokePeepholeEPCallbacks(FPM, PB, O); FPM.addPass(SimplifyCFGPass(aggressiveSimplifyCFGOptions())); - // TODO is it worth updating memoryssa in alloc-opt? it would mean at O3 we don't have a - // guaranteed recompute of mssa for LICM JULIA_PASS(FPM.addPass(AllocOptPass())); { - // last-chance loop optimizations - // most array allocations that are elided end up wanting these LoopPassManager LPM; - // TODO reenable the O3 guard if this is too expensive - // if (O.getSpeedupLevel() >= 3) { - LPM.addPass(LICMPass(LICMOptions())); - LPM.addPass(JuliaLICMPass()); - LPM.addPass(IndVarSimplifyPass()); - LPM.addPass(LoopIdiomRecognizePass()); - // LPM.addPass(LoopFullUnrollPass()); // doesn't support memoryssa preservation in LLVM 15 - // } LPM.addPass(LoopDeletionPass()); LPM.addPass(LoopInstSimplifyPass()); - FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), true)); + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM))); } FPM.addPass(LoopDistributePass()); } @@ -582,7 +572,7 @@ static void buildCleanupPipeline(ModulePassManager &MPM, PassBuilder *PB, Optimi FunctionPassManager FPM; JULIA_PASS(FPM.addPass(DemoteFloat16Pass())); if (O.getSpeedupLevel() >= 2) { - FPM.addPass(EarlyCSEPass()); + FPM.addPass(GVNPass()); } MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } From a595924581bb17feda21f611059463b56fb04dbb Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi Date: Sat, 9 Dec 2023 10:44:32 -0500 Subject: [PATCH 4/9] Fix getField to be more accurate --- src/llvm-alloc-helpers.cpp | 74 ++++++++++++++++++++------------------ src/llvm-alloc-helpers.h | 8 ----- 2 files changed, 40 insertions(+), 42 deletions(-) diff --git a/src/llvm-alloc-helpers.cpp b/src/llvm-alloc-helpers.cpp index 4578f09d4c558..d36387995b5ed 100644 --- a/src/llvm-alloc-helpers.cpp +++ b/src/llvm-alloc-helpers.cpp @@ -31,51 +31,57 @@ static bool hasObjref(Type *ty) std::pair& AllocUseInfo::getField(uint32_t offset, uint32_t size, Type *elty) { - auto it = findLowerField(offset); - auto end = memops.end(); - auto lb = end; // first overlap - auto ub = end; // last overlap - if (it != end) { - // The slot found contains the current location - if (it->first + it->second.size >= offset + size) { - if (it->second.elty != elty) - it->second.elty = nullptr; - assert(it->second.elty == nullptr || (it->first == offset && it->second.size == size)); - return *it; + // get next slot beyond containing slots (-1 to ignore slot starting at offset + size) + auto ub = memops.upper_bound(offset + size - 1); + if (ub == memops.begin()) { + // We need to create a new slot, since there is no slot containing offset + size + return *memops.emplace(offset, Field(size, elty)).first; + } + assert(!memops.empty()); + auto lb = memops.upper_bound(offset); + if (lb == memops.begin()) { + // must create an entry that contains lb + if (size <= lb->first - offset) { + // create entry for entire range + return *memops.emplace(offset, Field(size, elty)).first; } - if (it->first + it->second.size > offset) { - lb = it; - ub = it; + lb = memops.emplace(offset, Field(lb->first - offset, elty)).first; + } else { + --lb; + // lb is dereferenceable since we know memops is not empty + if (lb->first + lb->second.size <= offset) { + // lb does not actually contain offset + ++lb; + if (lb == memops.end()) { + // create entry for entire range + return 
*memops.emplace(offset, Field(size, elty)).first; + } else { + // create entry for range between offset and lb + if (size <= lb->first - offset) { + return *memops.emplace(offset, Field(size, elty)).first; + } + lb = memops.emplace(offset, Field(lb->first - offset, elty)).first; + } } } - else { - it = memops.begin(); - } - // Now find the last slot that overlaps with the current memory location. - // Also set `lb` if we didn't find any above. - for (; it != end && it->first < offset + size; ++it) { - if (lb == end) - lb = it; - ub = it; + } + // lb must definitely contain offset at this point + assert(lb->first <= offset && lb->first + lb->second.size > offset); + assert(lb != ub); + if (lb->first + lb->second.size >= offset + size) { + // lb contains entire range + return *lb; } - // no overlap found just create a new one. - if (lb == end) - return *memops.emplace(offset, Field(size, elty)).first; - // We find overlapping but not containing slot we need to merge slot/create new one - uint32_t new_offset = std::min(offset, lb->first); - uint32_t new_addrub = std::max(offset + uint32_t(size), ub->first + ub->second.size); - uint32_t new_size = new_addrub - new_offset; - Field field(new_size, nullptr); + size_t off = lb->first; + Field field(offset - lb->first + size, elty); field.multiloc = true; - ++ub; - for (it = lb; it != ub; ++it) { + for (auto it = lb; it != ub; ++it) { field.hasobjref |= it->second.hasobjref; field.hasload |= it->second.hasload; field.hasaggr |= it->second.hasaggr; field.accesses.append(it->second.accesses.begin(), it->second.accesses.end()); } memops.erase(lb, ub); - return *memops.emplace(new_offset, std::move(field)).first; + return *memops.emplace(off, std::move(field)).first; } bool AllocUseInfo::addMemOp(Instruction *inst, unsigned opno, uint32_t offset, diff --git a/src/llvm-alloc-helpers.h b/src/llvm-alloc-helpers.h index 49c3b15332a56..3c1f1b617b1b4 100644 --- a/src/llvm-alloc-helpers.h +++ b/src/llvm-alloc-helpers.h @@ -119,14 +119,6 @@ namespace jl_alloc { bool addMemOp(llvm::Instruction *inst, unsigned opno, uint32_t offset, llvm::Type *elty, bool isstore, const llvm::DataLayout &DL); std::pair<const uint32_t, Field> &getField(uint32_t offset, uint32_t size, llvm::Type *elty); - std::map<uint32_t, Field>::iterator findLowerField(uint32_t offset) - { - // Find the last field that starts no higher than `offset`. 
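// Worked example of the merge rule in the new getField above (values assumed
// for illustration): with one existing slot at [0, 8) of type i64, a second
// access getField(4, 8, i64) overlaps it without being contained, so the two
// collapse into a single slot {offset 0, size 12} whose multiloc flag is set;
// non-reference multiloc slots later fall back to opaque i8-array allocas
// instead of typed per-field splits.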
- auto it = memops.upper_bound(offset); - if (it != memops.begin()) - return --it; - return memops.end(); - } }; struct EscapeAnalysisRequiredArgs { From eb7632cc2d68ac850571752503a03bffb99babb5 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi Date: Sat, 9 Dec 2023 10:51:57 -0500 Subject: [PATCH 5/9] Remove fcas before first alloc-opt --- src/pipeline.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pipeline.cpp b/src/pipeline.cpp index 4b099521d33f9..1a218a004b0b7 100644 --- a/src/pipeline.cpp +++ b/src/pipeline.cpp @@ -371,6 +371,8 @@ static void buildEarlyOptimizerPipeline(ModulePassManager &MPM, PassBuilder *PB, invokeCGSCCCallbacks(CGPM, PB, O); if (O.getSpeedupLevel() >= 2) { FunctionPassManager FPM; + // get rid of random FCAs that confuse alloc-opt + FPM.addPass(InstCombinePass()); JULIA_PASS(FPM.addPass(AllocOptPass())); FPM.addPass(Float2IntPass()); FPM.addPass(LowerConstantIntrinsicsPass()); From 776c4c420d741ad47a22c4ba112c8633a372682d Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi Date: Sat, 9 Dec 2023 13:08:32 -0500 Subject: [PATCH 6/9] Partial escape analysis for objects --- src/llvm-alloc-helpers.cpp | 20 ++++- src/llvm-alloc-helpers.h | 5 +- src/llvm-alloc-opt.cpp | 166 +++++++++++++++++++++++++++++++------ src/llvm-julia-licm.cpp | 12 +++ 4 files changed, 170 insertions(+), 33 deletions(-) diff --git a/src/llvm-alloc-helpers.cpp b/src/llvm-alloc-helpers.cpp index d36387995b5ed..884262ca97530 100644 --- a/src/llvm-alloc-helpers.cpp +++ b/src/llvm-alloc-helpers.cpp @@ -94,7 +94,7 @@ bool AllocUseInfo::addMemOp(Instruction *inst, unsigned opno, uint32_t offset, memop.isaggr = isa(elty) || isa(elty) || isa(elty); memop.isobjref = hasObjref(elty); auto &field = getField(offset, size, elty); - if (field.second.hasobjref != memop.isobjref) + if (field.second.hasobjref != memop.isobjref && !field.second.accesses.empty()) field.second.multiloc = true; // can't split this field, since it contains a mix of references and bits if (!isstore) field.second.hasload = true; @@ -124,7 +124,6 @@ JL_USED_FUNC void AllocUseInfo::dump(llvm::raw_ostream &OS) OS << "escaped: " << escaped << '\n'; OS << "addrescaped: " << addrescaped << '\n'; OS << "returned: " << returned << '\n'; - OS << "haserror: " << haserror << '\n'; OS << "hasload: " << hasload << '\n'; OS << "haspreserve: " << haspreserve << '\n'; OS << "hasunknownmem: " << hasunknownmem << '\n'; @@ -140,10 +139,19 @@ JL_USED_FUNC void AllocUseInfo::dump(llvm::raw_ostream &OS) OS << "Uses: " << uses.size() << '\n'; for (auto inst: uses) inst->print(OS); + OS << '\n'; if (!preserves.empty()) { OS << "Preserves: " << preserves.size() << '\n'; for (auto inst: preserves) inst->print(OS); + OS << '\n'; + } + if (!errorbbs.empty()) { + OS << "ErrorBBs: " << errorbbs.size() << '\n'; + for (auto bb: errorbbs) { + bb->printAsOperand(OS); + OS << '\n'; + } } OS << "MemOps: " << memops.size() << '\n'; for (auto &field: memops) { @@ -152,6 +160,7 @@ JL_USED_FUNC void AllocUseInfo::dump(llvm::raw_ostream &OS) OS << " hasobjref: " << field.second.hasobjref << '\n'; OS << " hasload: " << field.second.hasload << '\n'; OS << " hasaggr: " << field.second.hasaggr << '\n'; + OS << " multiloc: " << field.second.multiloc << '\n'; OS << " accesses: " << field.second.accesses.size() << '\n'; for (auto &memop: field.second.accesses) { OS << " "; @@ -263,7 +272,7 @@ void jl_alloc::runEscapeAnalysis(llvm::CallInst *I, EscapeAnalysisRequiredArgs r call->getOperandBundleForOperand(opno).getTagName() != "jl_roots") { if 
(isa(call->getParent()->getTerminator())) { LLVM_DEBUG(dbgs() << "Detected use of allocation in block terminating with unreachable, likely error function\n"); - required.use_info.haserror = true; + required.use_info.errorbbs.insert(call->getParent()); return true; } LLVM_DEBUG(dbgs() << "Unknown call, marking escape\n"); @@ -282,6 +291,11 @@ void jl_alloc::runEscapeAnalysis(llvm::CallInst *I, EscapeAnalysisRequiredArgs r if (auto store = dyn_cast(inst)) { // Only store value count if (use->getOperandNo() != StoreInst::getPointerOperandIndex()) { + if (isa(store->getParent()->getTerminator())) { + LLVM_DEBUG(dbgs() << "Detected use of allocation in block terminating with unreachable, likely error function\n"); + required.use_info.errorbbs.insert(store->getParent()); + return true; + } LLVM_DEBUG(dbgs() << "Object address is stored somewhere, marking escape\n"); REMARK([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "StoreObjAddr", diff --git a/src/llvm-alloc-helpers.h b/src/llvm-alloc-helpers.h index 3c1f1b617b1b4..adb428c8e9cc9 100644 --- a/src/llvm-alloc-helpers.h +++ b/src/llvm-alloc-helpers.h @@ -62,6 +62,7 @@ namespace jl_alloc { struct AllocUseInfo { llvm::SmallSet uses; llvm::SmallSet preserves; + llvm::SmallSet errorbbs; std::map memops; // Completely unknown use bool escaped:1; @@ -85,8 +86,6 @@ namespace jl_alloc { bool hasunknownmem:1; // The object is returned bool returned:1; - // The object is used in an error function - bool haserror:1; // For checking attributes of "uninitialized" or "zeroed" or unknown llvm::AllocFnKind allockind; @@ -106,12 +105,12 @@ namespace jl_alloc { hastypeof = false; hasunknownmem = false; returned = false; - haserror = false; allockind = llvm::AllocFnKind::Unknown; has_unknown_objref = false; has_unknown_objrefaggr = false; uses.clear(); preserves.clear(); + errorbbs.clear(); memops.clear(); } void dump(llvm::raw_ostream &OS); diff --git a/src/llvm-alloc-opt.cpp b/src/llvm-alloc-opt.cpp index 756ebb4d5f3c9..89262e4f5e2d7 100644 --- a/src/llvm-alloc-opt.cpp +++ b/src/llvm-alloc-opt.cpp @@ -219,6 +219,8 @@ void Optimizer::pushInstruction(Instruction *I) void Optimizer::initialize() { for (auto &bb: F) { + if (isa(bb.getTerminator())) + continue; for (auto &I: bb) { pushInstruction(&I); } @@ -236,17 +238,17 @@ void Optimizer::optimizeObject(CallInst *orig, size_t sz) { optimizeTag(orig); return; } - if (use_info.haserror || use_info.returned) { + if (use_info.returned) { REMARK([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) - << "GC allocation has error or was returned " << ore::NV("GC Allocation", orig); + << "GC allocation was returned " << ore::NV("GC Allocation", orig); }); if (use_info.hastypeof) optimizeTag(orig); return; } - if (!use_info.addrescaped && !use_info.hasload && (!use_info.haspreserve || - !use_info.refstore)) { + if (!use_info.addrescaped && !use_info.hasload && use_info.errorbbs.empty() + && (!use_info.haspreserve || !use_info.refstore)) { REMARK([&]() { return OptimizationRemark(DEBUG_TYPE, "Dead Allocation", orig) << "GC allocation removed " << ore::NV("GC Allocation", orig); @@ -289,6 +291,15 @@ void Optimizer::optimizeObject(CallInst *orig, size_t sz) { splitOnStack(orig); return; } + if (!use_info.errorbbs.empty()) { + REMARK([&](){ + return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) + << "GC allocation has error " << ore::NV("GC Allocation", orig); + }); + if (use_info.hastypeof) + optimizeTag(orig); + return; + } REMARK([&](){ return OptimizationRemark(DEBUG_TYPE, "Stack Move 
Allocation", orig) << "GC allocation moved to stack " << ore::NV("GC Allocation", orig); @@ -471,7 +482,6 @@ void Optimizer::replaceBitsArrayUses(CallInst *alloc, Value *root, function_ref< } void Optimizer::optimizeArray(CallInst *orig, jl_genericmemory_info_t info) { - return; checkInst(orig); dbgs() << "checking array allocation\n"; if (use_info.escaped) { @@ -481,7 +491,7 @@ void Optimizer::optimizeArray(CallInst *orig, jl_genericmemory_info_t info) { }); return; } - if (use_info.haserror || use_info.returned) { + if (!use_info.errorbbs.empty() || use_info.returned) { REMARK([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) << "GC allocation has error or was returned " << ore::NV("GC GenericMemory Allocation", orig); @@ -1090,7 +1100,8 @@ void Optimizer::removeAlloc(CallInst *orig_inst) // The stored value might be an gc pointer in which case deleting the object // might open more optimization opportunities. if (auto stored_inst = dyn_cast(store->getValueOperand())) - pushInstruction(stored_inst); + if (!isa(stored_inst->getParent()->getTerminator())) + pushInstruction(stored_inst); user->eraseFromParent(); return; } @@ -1184,6 +1195,7 @@ void Optimizer::splitOnStack(CallInst *orig_inst) uint32_t size; }; SmallVector slots; + auto align = orig_inst->getRetAlign().valueOrOne(); for (auto memop: use_info.memops) { auto offset = memop.first; auto &field = memop.second; @@ -1205,12 +1217,99 @@ void Optimizer::splitOnStack(CallInst *orig_inst) allocty = ArrayType::get(Type::getInt8Ty(pass.getLLVMContext()), field.size); } slot.slot = prolog_builder.CreateAlloca(allocty); + slot.slot->setAlignment(Align(MinAlign(align.value(), slot.offset))); IRBuilder<> builder(orig_inst); insertLifetime(prolog_builder.CreateBitCast(slot.slot, Type::getInt8PtrTy(prolog_builder.getContext())), ConstantInt::get(Type::getInt64Ty(prolog_builder.getContext()), field.size), orig_inst); initializeAlloca(builder, slot.slot, use_info.allockind); slots.push_back(std::move(slot)); } + struct ErrorBBInfo { + CallInst *sunk; + Instruction *insertpt; + Instruction *p11i8; + DenseMap p11i8_offsets; + + ErrorBBInfo(CallInst *sunk) : sunk(sunk), insertpt(sunk->getNextNonDebugInstruction()), p11i8(nullptr) {} + + Instruction *gep(uint32_t offset, Type *elty, IRBuilder<> &builder) { + builder.SetInsertPoint(insertpt); + if (!p11i8) { + auto p11jlvalue = builder.CreateAddrSpaceCast(sunk, PointerType::getWithSamePointeeType(cast(sunk->getType()), AddressSpace::Derived)); + p11i8 = cast(builder.CreateBitCast(p11jlvalue, Type::getInt8PtrTy(builder.getContext(), AddressSpace::Derived))); + } + auto it = p11i8_offsets.find(offset); + if (it == p11i8_offsets.end()) { + auto gep = builder.CreateConstInBoundsGEP1_32(Type::getInt8Ty(builder.getContext()), p11i8, offset); + it = p11i8_offsets.insert(std::make_pair(offset, cast(gep))).first; + } + return cast(builder.CreateBitCast(it->second, elty->getPointerTo(AddressSpace::Derived))); + } + }; + auto slot_gep = [&] (SplitSlot &slot, uint32_t offset, Type *elty, IRBuilder<> &builder) { + assert(slot.offset <= offset); + offset -= slot.offset; + auto size = pass.DL->getTypeAllocSize(elty); + Value *addr; + if (offset % size == 0) { + addr = builder.CreateBitCast(slot.slot, elty->getPointerTo()); + if (offset != 0) { + addr = builder.CreateConstInBoundsGEP1_32(elty, addr, offset / size); + } + } + else { + addr = builder.CreateBitCast(slot.slot, Type::getInt8PtrTy(builder.getContext())); + addr = builder.CreateConstInBoundsGEP1_32(Type::getInt8Ty(builder.getContext()), 
addr, offset); + addr = builder.CreateBitCast(addr, elty->getPointerTo()); + } + return addr; + }; + DenseMap partial_escapes; + IRBuilder<> errbuilder(orig_inst->getContext()); + // sink allocation into error blocks, copy fields + for (auto errbb : use_info.errorbbs) { + auto sunk = cast(orig_inst->clone()); + sunk->insertBefore(&*errbb->getFirstInsertionPt()); + auto &info = partial_escapes.insert(std::make_pair(errbb, ErrorBBInfo(sunk))).first->second; + // note that this also sets the insert point of errbuilder + auto p11i8 = info.gep(0, Type::getInt8Ty(errbuilder.getContext()), errbuilder); + // conservatively just clear the whole thing + errbuilder.CreateMemSet(p11i8, ConstantInt::get(Type::getInt8Ty(errbuilder.getContext()), 0), orig_inst->getArgOperand(1), align); + for (auto &slot : slots) { + auto psize = pass.DL->getPointerSize(); + if (slot.isref) { + auto copyt = pass.T_prjlvalue; + assert(slot.size % psize == 0); + for (uint32_t offset = 0; offset < slot.size; offset += psize) { + auto dest = info.gep(slot.offset + offset, copyt, errbuilder); + auto src = slot_gep(slot, slot.offset + offset, copyt, errbuilder); + auto load = errbuilder.CreateAlignedLoad(copyt, src, Align(MinAlign(slot.slot->getAlign().value(), offset))); + errbuilder.CreateAlignedStore(load, dest, Align(MinAlign(align.value(), slot.offset + offset))); + } + } else { + auto copyt = pass.DL->getIntPtrType(errbuilder.getContext()); + for (uint32_t offset = 0; offset < slot.size; offset += psize) { + auto dest = info.gep(slot.offset + offset, copyt, errbuilder); + auto src = slot_gep(slot, slot.offset + offset, copyt, errbuilder); + auto load = errbuilder.CreateAlignedLoad(copyt, src, Align(MinAlign(slot.slot->getAlign().value(), offset))); + errbuilder.CreateAlignedStore(load, dest, Align(MinAlign(align.value(), slot.offset + offset))); + } + auto remainder = slot.size % psize; + if (remainder != 0) { + copyt = cast(pass.DL->getSmallestLegalIntType(errbuilder.getContext(), 8)); + assert(copyt->getBitWidth() % 8 == 0); + auto copysize = copyt->getBitWidth() / 8; + assert(remainder % copysize == 0); + for (size_t offset = slot.size - remainder; offset < slot.size; offset += copysize) { + auto dest = info.gep(slot.offset + offset, copyt, errbuilder); + auto src = slot_gep(slot, slot.offset + offset, copyt, errbuilder); + auto load = errbuilder.CreateAlignedLoad(copyt, src, Align(MinAlign(slot.slot->getAlign().value(), offset))); + errbuilder.CreateAlignedStore(load, dest, Align(MinAlign(align.value(), slot.offset + offset))); + } + } + } + } + } const auto nslots = slots.size(); auto find_slot = [&] (uint32_t offset) { if (offset == 0) @@ -1254,29 +1353,18 @@ void Optimizer::splitOnStack(CallInst *orig_inst) replace_stack.push_back(cur); cur = {orig_i, offset}; }; - auto slot_gep = [&] (SplitSlot &slot, uint32_t offset, Type *elty, IRBuilder<> &builder) { - assert(slot.offset <= offset); - offset -= slot.offset; - auto size = pass.DL->getTypeAllocSize(elty); - Value *addr; - if (offset % size == 0) { - addr = builder.CreateBitCast(slot.slot, elty->getPointerTo()); - if (offset != 0) { - addr = builder.CreateConstInBoundsGEP1_32(elty, addr, offset / size); - } - } - else { - addr = builder.CreateBitCast(slot.slot, Type::getInt8PtrTy(builder.getContext())); - addr = builder.CreateConstInBoundsGEP1_32(Type::getInt8Ty(builder.getContext()), addr, offset); - addr = builder.CreateBitCast(addr, elty->getPointerTo()); - } - return addr; - }; auto replace_inst = [&] (Use *use) { Instruction *user = cast(use->getUser()); 
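// A sketch (hypothetical IR) of the partial escape handled by the errinfo
// path below: a use sitting in a block that ends in `unreachable` (a throw
// path) no longer pins the whole object to the GC heap. The allocation is
// cloned into that block and the stack slots are copied in before the error
// call observes it:
//   err:
//     %sunk = call {} addrspace(10)* @julia.gc_alloc_obj(...) ; sunken clone
//     ; ...copy each split slot into %sunk...
//     call void @ijl_throw(...)
//     unreachable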
Instruction *orig_i = cur.orig_i; uint32_t offset = cur.offset; + auto errit = partial_escapes.find(user->getParent()); + ErrorBBInfo *errinfo = errit == partial_escapes.end() ? nullptr : &errit->second; if (auto load = dyn_cast(user)) { + if (errinfo) { + auto gep = errinfo->gep(offset, load->getType(), errbuilder); + use->set(gep); + return; + } auto slot_idx = find_slot(offset); auto &slot = slots[slot_idx]; assert(slot.offset <= offset && slot.offset + slot.size >= offset); @@ -1304,7 +1392,13 @@ void Optimizer::splitOnStack(CallInst *orig_inst) } else if (auto store = dyn_cast(user)) { if (auto stored_inst = dyn_cast(store->getValueOperand())) - pushInstruction(stored_inst); + if (!isa(stored_inst->getParent()->getTerminator())) + pushInstruction(stored_inst); // may be able to stack allocate this object too + if (errinfo) { + auto gep = errinfo->gep(offset, store->getValueOperand()->getType(), errbuilder); + use->set(gep); + return; + } auto slot_idx = find_slot(offset); auto &slot = slots[slot_idx]; if (slot.offset > offset || slot.offset + slot.size <= offset) { @@ -1342,6 +1436,11 @@ void Optimizer::splitOnStack(CallInst *orig_inst) return; } else if (isa(user) || isa(user)) { + if (errinfo) { + auto gep = errinfo->gep(offset, user->getType(), errbuilder); + use->set(gep); + return; + } auto slot_idx = find_slot(offset); auto &slot = slots[slot_idx]; assert(slot.offset <= offset && slot.offset + slot.size >= offset); @@ -1358,6 +1457,19 @@ void Optimizer::splitOnStack(CallInst *orig_inst) *use = newptr; } else if (auto call = dyn_cast(user)) { + if (errinfo) { + auto pt = cast(use->get()->getType()); + if (pt->getAddressSpace() == AddressSpace::Tracked) { + assert(offset == 0); // can't have tracked gep + auto bc = errbuilder.CreateBitCast(errinfo->sunk, pt); + use->set(bc); + } else { + auto eltype = pt->isOpaque() ? Type::getInt8Ty(pt->getContext()) : pt->getNonOpaquePointerElementType(); + auto gep = errinfo->gep(offset, eltype, errbuilder); + use->set(gep); + } + return; + } auto callee = call->getCalledOperand(); assert(callee); // makes it clear for clang analyser that `callee` is not NULL if (auto intrinsic = dyn_cast(call)) { @@ -1479,7 +1591,7 @@ void Optimizer::splitOnStack(CallInst *orig_inst) push_frame(user, offset); } else if (auto gep = dyn_cast(user)) { - APInt apoffset(sizeof(void*) * 8, offset, true); + APInt apoffset(pass.DL->getPointerSizeInBits(), offset, true); gep->accumulateConstantOffset(*pass.DL, apoffset); push_frame(gep, apoffset.getLimitedValue()); } diff --git a/src/llvm-julia-licm.cpp b/src/llvm-julia-licm.cpp index 25fc914c555ae..081bb3370f6ba 100644 --- a/src/llvm-julia-licm.cpp +++ b/src/llvm-julia-licm.cpp @@ -146,6 +146,7 @@ struct JuliaLICM : public JuliaPassContext { BasicBlock *header = L->getHeader(); const llvm::DataLayout &DL = header->getModule()->getDataLayout(); initFunctions(*header->getModule()); + Function *except_enter_func = header->getModule()->getFunction("julia.except_enter"); // Also require `gc_preserve_begin_func` whereas // `gc_preserve_end_func` is optional since the input to // `gc_preserve_end_func` must be from `gc_preserve_begin_func`. 
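        // Hedged illustration (Julia pseudocode inside this comment, not from the
        // patch): an allocation whose only escape is a throwing path must not be
        // hoisted out of a loop when the function has a matching `julia.except_enter`,
        // or the handler could observe one shared object instead of a fresh one per
        // iteration:
        //     try
        //         for i in 1:n
        //             m = Memory{Int}(undef, 4)               # hoisting candidate
        //             i > limit && throw(BoundsError(m, i))   # m escapes to the handler
        //         end
        //     catch err
        //     end
        // The two checks added in the hunks below implement this: skip blocks that
        // terminate in unreachable, and refuse to hoist when errorbbs is non-empty
        // and `julia.except_enter` is present.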
@@ -182,6 +183,8 @@ struct JuliaLICM : public JuliaPassContext { LoopBlocksRPO worklist(L); worklist.perform(LI); for (auto *bb : worklist) { + if (isa(bb->getTerminator())) + continue; for (BasicBlock::iterator II = bb->begin(), E = bb->end(); II != E;) { auto call = dyn_cast(&*II++); if (!call) @@ -322,6 +325,15 @@ struct JuliaLICM : public JuliaPassContext { }); continue; } + if (!use_info.errorbbs.empty() && except_enter_func) { + // If we escape via error handling, we don't want to catch the error inside the loop + REMARK([&](){ + return OptimizationRemarkMissed(DEBUG_TYPE, "Escape", call) + << "not hoisting gc allocation " << ore::NV("GC Allocation", call) + << " because it may escape via error handling"; + }); + continue; + } REMARK([&](){ return OptimizationRemark(DEBUG_TYPE, "Hoist", call) << "hoisting gc allocation " << ore::NV("GC Allocation", call); From 07c10314647b8790341a8ca531be08357eacc73b Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi Date: Sat, 9 Dec 2023 22:51:32 -0500 Subject: [PATCH 7/9] Partial escape analysis for arrays --- src/codegen.cpp | 20 +-- src/llvm-alloc-helpers.cpp | 10 +- src/llvm-alloc-helpers.h | 2 +- src/llvm-alloc-opt.cpp | 341 +++++++++++++++++++++++++++++++++---- src/llvm-codegen-shared.h | 21 +++ 5 files changed, 335 insertions(+), 59 deletions(-) diff --git a/src/codegen.cpp b/src/codegen.cpp index 29495bf36786e..eddfc117dc9a7 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -603,16 +603,6 @@ static inline void add_named_global(StringRef name, T *addr) add_named_global(name, (void*)(uintptr_t)addr); } -AttributeSet Attributes(LLVMContext &C, std::initializer_list attrkinds, std::initializer_list extra={}) -{ - SmallVector attrs(attrkinds.size() + extra.size()); - for (size_t i = 0; i < attrkinds.size(); i++) - attrs[i] = Attribute::get(C, attrkinds.begin()[i]); - for (size_t i = 0; i < extra.size(); i++) - attrs[attrkinds.size() + i] = extra.begin()[i]; - return AttributeSet::get(C, ArrayRef(attrs)); -} - static Type *get_pjlvalue(LLVMContext &C) { return JuliaType::get_pjlvalue_ty(C); } static FunctionType *get_func_sig(LLVMContext &C) { return JuliaType::get_jlfunc_ty(C); } @@ -1428,14 +1418,8 @@ static const auto gc_loaded_func = new JuliaFunction<>{ // top: // %metadata GC base pointer is ptr(Tracked) // ret addrspacecast ptr to ptr(Loaded) - [](LLVMContext &C) { return FunctionType::get(PointerType::get(JuliaType::get_prjlvalue_ty(C), AddressSpace::Loaded), - {JuliaType::get_prjlvalue_ty(C), PointerType::get(JuliaType::get_prjlvalue_ty(C), 0)}, false); }, - [](LLVMContext &C) { - AttributeSet FnAttrs = Attributes(C, {Attribute::ReadNone, Attribute::NoSync, Attribute::NoUnwind, Attribute::Speculatable, Attribute::WillReturn, Attribute::NoRecurse}); - AttributeSet RetAttrs = Attributes(C, {Attribute::NonNull, Attribute::NoUndef}); - return AttributeList::get(C, FnAttrs, RetAttrs, - { Attributes(C, {Attribute::NoUndef, Attribute::ReadNone, Attribute::NoCapture}), - Attributes(C, {Attribute::NonNull, Attribute::NoUndef, Attribute::ReadNone}) }); }, + [](LLVMContext &C) { return get_gc_loaded_decl(C).first; }, + [](LLVMContext &C) { return get_gc_loaded_decl(C).second; }, }; // julia.call represents a call with julia calling convention, it is used as diff --git a/src/llvm-alloc-helpers.cpp b/src/llvm-alloc-helpers.cpp index 884262ca97530..66a285dbfb378 100644 --- a/src/llvm-alloc-helpers.cpp +++ b/src/llvm-alloc-helpers.cpp @@ -185,11 +185,13 @@ JL_USED_FUNC void AllocUseInfo::dump() #define REMARK(remark) #endif -void 
jl_alloc::runEscapeAnalysis(llvm::CallInst *I, EscapeAnalysisRequiredArgs required, EscapeAnalysisOptionalArgs options) { +void jl_alloc::runEscapeAnalysis(llvm::Instruction *I, EscapeAnalysisRequiredArgs required, EscapeAnalysisOptionalArgs options) { required.use_info.reset(); - Attribute allockind = I->getFnAttr(Attribute::AllocKind); - if (allockind.isValid()) - required.use_info.allockind = allockind.getAllocKind(); + if (auto CI = dyn_cast(I)) { + Attribute allockind = CI->getFnAttr(Attribute::AllocKind); + if (allockind.isValid()) + required.use_info.allockind = allockind.getAllocKind(); + } if (I->use_empty()) return; CheckInst::Frame cur{I, 0, I->use_begin(), I->use_end()}; diff --git a/src/llvm-alloc-helpers.h b/src/llvm-alloc-helpers.h index adb428c8e9cc9..3a20b3f19dda3 100644 --- a/src/llvm-alloc-helpers.h +++ b/src/llvm-alloc-helpers.h @@ -147,7 +147,7 @@ namespace jl_alloc { } }; - void runEscapeAnalysis(llvm::CallInst *I, EscapeAnalysisRequiredArgs required, EscapeAnalysisOptionalArgs options=EscapeAnalysisOptionalArgs()); + void runEscapeAnalysis(llvm::Instruction *I, EscapeAnalysisRequiredArgs required, EscapeAnalysisOptionalArgs options=EscapeAnalysisOptionalArgs()); } diff --git a/src/llvm-alloc-opt.cpp b/src/llvm-alloc-opt.cpp index 89262e4f5e2d7..0831f146b05db 100644 --- a/src/llvm-alloc-opt.cpp +++ b/src/llvm-alloc-opt.cpp @@ -152,7 +152,28 @@ struct Optimizer { void moveSizedBitsArrayToStack(CallInst *orig, jl_genericmemory_info_t info); void moveUnsizedBitsArrayToStack(CallInst *orig, jl_genericmemory_info_t info); - void replaceBitsArrayUses(CallInst *orig, Value *root, function_ref shell, Value *data); + void replaceBitsArrayUses(CallInst *orig, Value *conditional, Value *root, function_ref shell, Instruction *data); + + + // for deoptimizing array allocations in errors (usually BoundsErrors) + struct SunkenArray { + Instruction *root; // technically also shell at this point + Instruction *data; + Instruction *loaded; + Instruction *err_insertpt; + + struct Frame { + Instruction *orig_i; + Value::use_iterator next; + size_t offset_frame; // index of frame with offset data in stack + // dynamic offset of orig_i from data + MapVector variables; + APInt constant; + bool loaded; + }; + }; + bool canDeoptimizeErrorBlocks(CallInst *orig); + void sinkArrayDataPointer(CallInst *orig, DenseMap &sunken, Value *root, Instruction *data, LoadInst *load); Function &F; OptimizationRemarkEmitter ORE; @@ -202,6 +223,7 @@ struct Optimizer { DenseMap arrays; SmallVector removed; AllocUseInfo use_info; + AllocUseInfo array_data_info; CheckInst::Stack check_stack; Lifetime::Stack lifetime_stack; ReplaceUses::Stack replace_stack; @@ -308,31 +330,64 @@ void Optimizer::optimizeObject(CallInst *orig, size_t sz) { moveToStack(orig, sz, has_ref, use_info.allockind); } -void Optimizer::moveSizedBitsArrayToStack(CallInst *orig, jl_genericmemory_info_t info) { - auto length = orig->getArgOperand(1); - auto ilen = cast(length)->getZExtValue(); - size_t bytes = jl_genericmemory_bytesize(&info, ilen); - size_t maxStackAlloc = 4096; // TODO parameterize by module flag/ctor param - if (bytes > maxStackAlloc) { - dbgs() << "Array was too large to stack allocate\n"; - REMARK([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) - << "GC genericmemory allocation size is too large " << ore::NV("GC GenericMemory Allocation", orig); +bool Optimizer::canDeoptimizeErrorBlocks(CallInst *orig) { + SmallSet data_pointers; + for (auto &field : use_info.memops) { + for (auto &acc : 
field.second.accesses) {
+            assert(isa<LoadInst>(acc.inst) && "Should only have loads of array length/data");
+        }
+        if (field.first == 0) {
+            if (field.second.size > pass.DL->getPointerSize()) {
+                dbgs() << "Can't deoptimize error blocks because of large field 0\n";
+                return false;
+            }
+        } else {
+            assert(field.first == pass.DL->getPointerSize() && "Got a nonzero/non-data load?");
+            assert(field.second.size == pass.DL->getPointerSize() && "Got a load of array data for less than pointer size?");
+            for (auto &acc : field.second.accesses) {
+                data_pointers.insert(cast<LoadInst>(acc.inst));
+            }
+        }
+    }
+    jl_alloc::EscapeAnalysisRequiredArgs required{array_data_info, check_stack, pass, *pass.DL};
+    for (auto I : data_pointers) {
+        LLVM_DEBUG(dbgs() << "Running escape analysis on " << *I << "\n");
+        jl_alloc::runEscapeAnalysis(I, required, jl_alloc::EscapeAnalysisOptionalArgs().with_optimization_remark_emitter(&ORE));
+        REMARK([&](){
+            std::string suse_info;
+            llvm::raw_string_ostream osuse_info(suse_info);
+            array_data_info.dump(osuse_info);
+            return OptimizationRemarkAnalysis(DEBUG_TYPE, "EscapeAnalysis", I) << "escape analysis for " << ore::NV("GC Allocation", I) << "\n" << ore::NV("UseInfo", osuse_info.str());
         });
-        return;
+        // This implicitly relies on PHI nodes of the data pointer escaping; if escape analysis ever changes
+        // to not do that, we'll need to update GEP offset calculations (probably everywhere)
+        if (array_data_info.escaped) {
+            REMARK([&]() {
+                return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig)
+                    << "GC allocation escaped " << ore::NV("GC Allocation", orig);
+            });
+            return false;
+        }
+        assert(!array_data_info.returned);
+        assert(!array_data_info.hastypeof);
     }
+    return true;
+}
+
+void Optimizer::moveSizedBitsArrayToStack(CallInst *orig, jl_genericmemory_info_t info) {
+    auto length = orig->getArgOperand(1);
     auto align = orig->getRetAlign().valueOrOne();
     IRBuilder<> builder(&*F.getEntryBlock().getFirstInsertionPt());
     auto T_size = pass.DL->getIntPtrType(builder.getContext());
-    auto data = builder.CreateAlloca(Type::getInt8Ty(builder.getContext()), ConstantInt::get(T_size, bytes));
+    auto data = builder.CreateAlloca(Type::getInt8Ty(builder.getContext()), length);
     data->setAlignment(align);
     data->takeName(orig);
     auto root = Constant::getNullValue(orig->getType()); // technically valid to root this
-    Value *shellp2i = nullptr;
+    Instruction *shell0 = nullptr;
-    auto shell = [&]() mutable {
-        if (shellp2i)
-            return shellp2i;
+    auto shell = [&]() -> Instruction * {
+        if (shell0)
+            return shell0;
         auto shellData = builder.CreateAlloca(Type::getInt8PtrTy(builder.getContext()), ConstantInt::get(T_size, 2));
         shellData->setAlignment(align);
         shellData->setName(data->getName() + ".shell_data");
@@ -340,18 +395,17 @@ void Optimizer::moveSizedBitsArrayToStack(CallInst *orig, jl_genericmemory_info_t info)
         builder.CreateAlignedStore(length, lenptr, align);
         auto dataptr = builder.CreateConstGEP1_64(Type::getInt8PtrTy(builder.getContext()), shellData, 1);
         builder.CreateAlignedStore(data, dataptr, Align(std::min(align.value(), (uint64_t) pass.DL->getPointerSize())));
-        shellp2i = builder.CreatePtrToInt(shellData, pass.DL->getIntPtrType(builder.getContext()));
-        return shellp2i;
+        auto asc = builder.CreateAddrSpaceCast(shellData, Type::getInt8PtrTy(builder.getContext())->getPointerTo(0)); // pointer_to_objref always returns addrspace 0
+        shell0 = cast<Instruction>(builder.CreateBitCast(asc, JuliaType::get_pjlvalue_ty(builder.getContext())));
+        return shell0;
     };
-    // This is kind of a dirty cleanup, but subsequent DCE should
clean up all the - // nullptr manipulations - replaceBitsArrayUses(orig, root, shell, data); + replaceBitsArrayUses(orig, nullptr, root, shell, data); orig->eraseFromParent(); } void Optimizer::moveUnsizedBitsArrayToStack(CallInst *orig, jl_genericmemory_info_t info) { - size_t maxStackAlloc = 4096; // TODO parameterize by module flag/ctor param + size_t maxStackAlloc = 512; // TODO parameterize by module flag/ctor param IRBuilder<> builder(&*F.getEntryBlock().getFirstInsertionPt()); StringRef origName = orig->getName(); auto T_size = pass.DL->getIntPtrType(builder.getContext()); @@ -367,7 +421,8 @@ void Optimizer::moveUnsizedBitsArrayToStack(CallInst *orig, jl_genericmemory_inf builder.SetInsertPoint(orig); auto length = orig->getArgOperand(1); auto maxElements = ConstantInt::get(length->getType(), maxStackAlloc / info.elsize); - auto fallback = SplitBlockAndInsertIfThen(builder.CreateICmpUGT(length, maxElements), orig, false, nullptr, _DT); + auto tooBig = builder.CreateICmpUGT(length, maxElements); + auto fallback = SplitBlockAndInsertIfThen(tooBig, orig, false, nullptr, _DT); pass.cfgChanged = true; fallback->getParent()->setName("stack_alloc_fallback"); builder.SetInsertPoint(orig); @@ -375,7 +430,7 @@ void Optimizer::moveUnsizedBitsArrayToStack(CallInst *orig, jl_genericmemory_inf auto dataPhi = builder.CreatePHI(Type::getInt8PtrTy(builder.getContext()), 2); PHINode *shellPhi = nullptr; - auto shell = [&]() mutable { + auto shell = [&]() -> Instruction * { if (shellPhi) return shellPhi; builder.SetInsertPoint(origBB->getTerminator()); @@ -386,15 +441,16 @@ void Optimizer::moveUnsizedBitsArrayToStack(CallInst *orig, jl_genericmemory_inf builder.CreateAlignedStore(length, lenptr, align); auto dataptr = builder.CreateConstGEP1_64(Type::getInt8PtrTy(builder.getContext()), shellData, 1); builder.CreateAlignedStore(data, dataptr, Align(std::min(align.value(), (uint64_t) pass.DL->getPointerSize()))); - auto shellStack = builder.CreatePtrToInt(shellData, pass.DL->getIntPtrType(builder.getContext())); + auto asc = builder.CreateAddrSpaceCast(shellData, Type::getInt8PtrTy(builder.getContext())->getPointerTo(0)); // pointer_to_objref always returns addrspace 0 + auto bc = builder.CreateBitCast(asc, JuliaType::get_pjlvalue_ty(builder.getContext())); builder.SetInsertPoint(ownerPhi); - shellPhi = builder.CreatePHI(shellData->getType(), 2); - shellPhi->addIncoming(shellStack, origBB); + shellPhi = builder.CreatePHI(bc->getType(), 2); + shellPhi->addIncoming(bc, origBB); return shellPhi; }; // Replace all the uses now, before we make the original instruction conditional on array size - replaceBitsArrayUses(orig, ownerPhi, shell, dataPhi); + replaceBitsArrayUses(orig, tooBig, ownerPhi, shell, dataPhi); orig->moveBefore(fallback); builder.SetInsertPoint(fallback); @@ -417,10 +473,188 @@ void Optimizer::moveUnsizedBitsArrayToStack(CallInst *orig, jl_genericmemory_inf ownerPhi->getParent()->setName("allocated_array"); } -void Optimizer::replaceBitsArrayUses(CallInst *alloc, Value *root, function_refshell, Value *data) { +void Optimizer::sinkArrayDataPointer(CallInst *orig, DenseMap &sunken, Value *root, Instruction *data, LoadInst *load) { + SmallVector stack; + auto BitWidth = pass.DL->getPointerSizeInBits(); + IRBuilder<> builder(orig->getContext()); + auto replace_error_use = [&](Use *use, Instruction *user, SunkenArray &sunk, SunkenArray::Frame &frame) { + auto src = frame.loaded ? 
sunk.loaded : sunk.data; + if (frame.constant.isZero() && frame.variables.empty()) { + use->set(src); + return; + } + auto addrspace = frame.loaded ? AddressSpace::Loaded : AddressSpace::Generic; + builder.SetInsertPoint(user); + // 0 or 13, both are legal to gep in directly + Value *offset = ConstantInt::get(Type::getIntNTy(builder.getContext(), BitWidth), frame.constant); + for (auto &var : stack[frame.offset_frame].variables) { + Value *idx = var.first; + auto multiplier = var.second; + if (!multiplier.isOne()) { + idx = builder.CreateMul(idx, ConstantInt::get(idx->getType(), multiplier), "", true, true); // geps implicitly have nuw nsw + } + offset = builder.CreateAdd(offset, idx, "", true, true); // geps implicitly have nuw nsw + } + auto bc = builder.CreateBitCast(src, Type::getInt8PtrTy(builder.getContext())->getPointerTo(addrspace)); + auto gep = builder.CreateInBoundsGEP(Type::getInt8Ty(builder.getContext()), bc, {offset}); + auto cast = builder.CreateBitCast(gep, use->get()->getType()); + use->set(cast); + }; + auto replace_use = [&](Use *use) { + auto &cur = stack.back(); + auto inst = cast(use->getUser()); + auto it = sunken.find(inst->getParent()); + if (it != sunken.end()) { + replace_error_use(use, inst, it->second, cur); + return; + } + // we don't actually need to replace any other uses; + // we just need to follow them to see if they end up + // in an error block. + SunkenArray::Frame frame; + frame.orig_i = inst; + frame.loaded = cur.loaded; + switch (inst->getOpcode()) { + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + { + frame.offset_frame = cur.offset_frame; + break; + } + // Nothing to do for these + case Instruction::Load: + case Instruction::Store: + case Instruction::AtomicCmpXchg: + case Instruction::AtomicRMW: + { + return; + } + case Instruction::GetElementPtr: + { + auto gep = cast(inst); + // Copy the current offsets to the new frame + frame.variables = stack[cur.offset_frame].variables; + frame.constant = stack[cur.offset_frame].constant; + bool success = gep->collectOffset(*pass.DL, BitWidth, frame.variables, frame.constant); + assert(success); // TODO this may not work on ARM with scalable vectors, + // but for now let's just start with this + frame.offset_frame = stack.size(); + break; + } + case Instruction::Call: + { + auto call = cast(inst); + auto callee = call->getCalledOperand(); + // Pretty much the only function we'd care about is + // gc_loaded, since everything else only applies to the + // gc-tracked pointer (except for write barrier, which + // doesn't apply since we're not stack allocating + // arrays with gc-tracked pointers) + if (callee != pass.gc_loaded_func) + return; + assert(!frame.loaded); + frame.loaded = true; + break; + } + default: + { + llvm_dump(inst); + llvm_unreachable("Unexpected instruction"); + } + } + frame.next = inst->use_begin(); + stack.push_back(frame); + }; + if (load->use_empty()) { + load->eraseFromParent(); + return; + } + while (true) { + auto &cur = stack.back(); + auto use = &*cur.next; + ++cur.next; + replace_use(use); + while (cur.next == cur.orig_i->use_end()) { + stack.pop_back(); + if (stack.empty()) + return; + } + } + load->replaceAllUsesWith(data); + load->eraseFromParent(); +} + +void Optimizer::replaceBitsArrayUses(CallInst *alloc, Value *conditional, Value *root, function_refshell, Instruction *data) { IRBuilder<> builder(alloc->getContext()); auto type = alloc->getArgOperand(0); auto length = alloc->getArgOperand(1); + auto align = alloc->getRetAlign().valueOrOne(); + DenseMap 
sunken; + if (!use_info.errorbbs.empty()) { + if (!pass.gc_loaded_func) { + auto decl = get_gc_loaded_decl(builder.getContext()); + auto FC = alloc->getModule()->getOrInsertFunction("julia.gc_loaded", decl.first, decl.second); + pass.gc_loaded_func = cast(FC.getCallee()); + } + IRBuilder<> builder(data->getNextNode()); + auto loaded = builder.CreateCall(pass.gc_loaded_func, {root, data}); + for (auto bb : use_info.errorbbs) { + auto sunk = alloc->clone(); + auto insertpt = &*bb->getFirstInsertionPt(); + Instruction *rootsunk; + Instruction *datasunk; + Instruction *loadedsunk; + if (!conditional) { + sunk->insertBefore(insertpt); + builder.SetInsertPoint(insertpt); + auto asc = builder.CreateAddrSpaceCast(sunk, PointerType::getWithSamePointeeType(cast(sunk->getType()), AddressSpace::Derived)); + auto bc = builder.CreateBitCast(asc, Type::getInt8PtrTy(builder.getContext())->getPointerTo(AddressSpace::Derived)); + auto gep = builder.CreateConstGEP1_64(Type::getInt8PtrTy(builder.getContext()), bc, 1); + datasunk = builder.CreateAlignedLoad(Type::getInt8PtrTy(builder.getContext()), gep, Align(MinAlign(align.value(), pass.DL->getPointerSize()))); + rootsunk = sunk; + loadedsunk = builder.CreateCall(pass.gc_loaded_func, {rootsunk, datasunk}); + // length must dominate here, since the alloc is not in a phi node, + // length must dominate the allocation for obvious reasons, + // and the allocation must dominate its uses (including those in this bb) + // also we know nuw nsw because original alloc would have thrown if not, and no exc handlers + auto memcpy_length = builder.CreateMul(length, alloc->getArgOperand(2), "", true, true); // 2 is elsize + // TODO can i get alignment guarantees for these? + builder.CreateMemCpy(loadedsunk, MaybeAlign(), loaded, MaybeAlign(), memcpy_length); + builder.SetInsertPoint(insertpt); + } else { + assert(pass.cfgChanged); // should have changed it above to get the runtime alloc branch anyways + _DT = AM.getCachedResult(F); + auto term = SplitBlockAndInsertIfThen(conditional, insertpt, false, nullptr, _DT); + term->getParent()->setName("sunk_alloc"); + pass.cfgChanged = true; // should have already been set, but just being consistent + sunk->insertBefore(term); + builder.SetInsertPoint(term); + auto asc = builder.CreateAddrSpaceCast(sunk, PointerType::getWithSamePointeeType(cast(sunk->getType()), AddressSpace::Derived)); + auto sunkdata = builder.CreateBitCast(asc, Type::getInt8PtrTy(builder.getContext())->getPointerTo(AddressSpace::Derived)); + sunkdata = builder.CreateConstGEP1_64(Type::getInt8PtrTy(builder.getContext()), sunkdata, 1); + sunkdata = builder.CreateAlignedLoad(Type::getInt8PtrTy(builder.getContext()), sunkdata, Align(MinAlign(align.value(), pass.DL->getPointerSize()))); + // we have to do this a second time because we need to do the memcpy inside the conditional + auto sunkloaded = builder.CreateCall(pass.gc_loaded_func, {sunk, sunkdata}); + // length must dominate here, since the alloc is not in a phi node, + // length must dominate the allocation for obvious reasons, + // and the allocation must dominate its uses (including those in this bb) + auto memcpy_length = builder.CreateMul(length, alloc->getArgOperand(2)); // 2 is elsize + // TODO can i get alignment guarantees for these? 
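            // Worked example (hedged; concrete numbers for illustration only):
            // getArgOperand(1) is the element count and getArgOperand(2) the element
            // size baked in at codegen, so for a hypothetical Memory{Int64}(undef, n)
            // the copy below moves n * 8 bytes, e.g. n = 3 gives
            //     memcpy(sunkloaded, loaded, 3 * 8 /* = 24 bytes */)
            // from the stack buffer into the heap clone on the error path.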
+ builder.CreateMemCpy(sunkloaded, MaybeAlign(), loaded, MaybeAlign(), memcpy_length); + builder.SetInsertPoint(insertpt); + auto rootphi = builder.CreatePHI(alloc->getType(), 2); + auto dataphi = builder.CreatePHI(Type::getInt8PtrTy(builder.getContext()), 2); + rootsunk = rootphi; + datasunk = dataphi; + rootphi->addIncoming(root, bb); + rootphi->addIncoming(sunk, term->getParent()); + dataphi->addIncoming(data, bb); + dataphi->addIncoming(sunkdata, term->getParent()); + loadedsunk = builder.CreateCall(pass.gc_loaded_func, {rootsunk, datasunk}); + } + sunken[bb] = {rootsunk, datasunk, loadedsunk, insertpt}; + } + } // we need to replace all of the length/data accesses upfront, because in the case of an unsized array alloc // it's not legal to derive it from the phi node (may be nullptr) for (auto &field : use_info.memops) { @@ -450,6 +684,10 @@ void Optimizer::replaceBitsArrayUses(CallInst *alloc, Value *root, function_ref< load->eraseFromParent(); } else { assert(load->getType()->isPointerTy() && "Should only have loads of array data from offset 8"); + if (!sunken.empty()) { + sinkArrayDataPointer(alloc, sunken, root, data, load); + continue; + } auto atype = PointerType::getWithSamePointeeType(cast(data->getType()), load->getType()->getPointerAddressSpace()); auto acast = builder.CreateAddrSpaceCast(data, atype); auto bcast = builder.CreateBitCast(acast, load->getType()); @@ -463,6 +701,12 @@ void Optimizer::replaceBitsArrayUses(CallInst *alloc, Value *root, function_ref< while (!alloc->use_empty()) { auto &use = *alloc->use_begin(); auto user = cast(use.getUser()); + auto it = sunken.find(user->getParent()); + if (it != sunken.end()) { + auto &sunk = it->second; + use.set(sunk.root); + continue; + } if (auto CI = dyn_cast(user)) { auto callee = CI->getCalledFunction(); if (callee == pass.pointer_from_objref_func) { @@ -491,10 +735,17 @@ void Optimizer::optimizeArray(CallInst *orig, jl_genericmemory_info_t info) { }); return; } - if (!use_info.errorbbs.empty() || use_info.returned) { + if (use_info.returned) { REMARK([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) - << "GC allocation has error or was returned " << ore::NV("GC GenericMemory Allocation", orig); + << "GC allocation was returned " << ore::NV("GC GenericMemory Allocation", orig); + }); + return; + } + if (use_info.hasunknownmem) { + REMARK([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) + << "GC genericmemory allocation has weird IR uses " << ore::NV("GC GenericMemory Allocation", orig); }); return; } @@ -512,6 +763,15 @@ void Optimizer::optimizeArray(CallInst *orig, jl_genericmemory_info_t info) { }); return; } + if (!use_info.errorbbs.empty()) { + if (!canDeoptimizeErrorBlocks(orig)) { + REMARK([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) + << "GC genericmemory allocation has error " << ore::NV("GC GenericMemory Allocation", orig); + }); + return; + } + } // at this point the only valid real operations on orig are loading the length, // loading the data pointer, and getting a tracked pointer via gc_loaded. // we will assume that if the data pointer or a tracked pointer escapes, then the @@ -521,6 +781,17 @@ void Optimizer::optimizeArray(CallInst *orig, jl_genericmemory_info_t info) { // and remove the original allocation. 
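    // Shape of the two strategies that follow (illustrative summary, not new logic):
    //  * sized (constant length): bytes = jl_genericmemory_bytesize(&info, len) is
    //    checked against a byte cap, then the data pointer becomes a plain
    //    entry-block alloca (moveSizedBitsArrayToStack);
    //  * unsized (runtime length): moveUnsizedBitsArrayToStack guards the alloca:
    //        %toobig = icmp ugt i64 %len, %maxElements
    //        br i1 %toobig, label %stack_alloc_fallback, label %stack_path
    //    and phi nodes select between the stack buffer and the runtime allocation.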
dbgs() << "Moving array allocation to stack\n"; if (isa(orig->getArgOperand(1))) { + size_t maxSizedStackBytes = 4096; // TODO parameterize by module flag/ctor param + auto length = orig->getArgOperand(1); + auto ilen = cast(length)->getZExtValue(); + size_t bytes = jl_genericmemory_bytesize(&info, ilen); + if (bytes > maxSizedStackBytes) { + REMARK([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) + << "GC genericmemory allocation size is too large " << ore::NV("GC GenericMemory Allocation", orig); + }); + return; + } dbgs() << "allocation was sized\n"; moveSizedBitsArrayToStack(orig, info); } else { @@ -1271,10 +1542,8 @@ void Optimizer::splitOnStack(CallInst *orig_inst) auto sunk = cast(orig_inst->clone()); sunk->insertBefore(&*errbb->getFirstInsertionPt()); auto &info = partial_escapes.insert(std::make_pair(errbb, ErrorBBInfo(sunk))).first->second; - // note that this also sets the insert point of errbuilder - auto p11i8 = info.gep(0, Type::getInt8Ty(errbuilder.getContext()), errbuilder); - // conservatively just clear the whole thing - errbuilder.CreateMemSet(p11i8, ConstantInt::get(Type::getInt8Ty(errbuilder.getContext()), 0), orig_inst->getArgOperand(1), align); + errbuilder.SetInsertPoint(info.insertpt); + // copy every slot into the final object for (auto &slot : slots) { auto psize = pass.DL->getPointerSize(); if (slot.isref) { diff --git a/src/llvm-codegen-shared.h b/src/llvm-codegen-shared.h index b78a3bea9c0ef..8eb521b0a715a 100644 --- a/src/llvm-codegen-shared.h +++ b/src/llvm-codegen-shared.h @@ -343,6 +343,27 @@ static inline llvm::Value *emit_gc_safe_leave(llvm::IRBuilder<> &builder, llvm:: return emit_gc_state_set(builder, T_size, ptls, state, old_state, final); } +static inline llvm::AttributeSet Attributes(llvm::LLVMContext &C, std::initializer_list attrkinds, std::initializer_list extra={}) +{ + llvm::SmallVector attrs(attrkinds.size() + extra.size()); + for (size_t i = 0; i < attrkinds.size(); i++) + attrs[i] = llvm::Attribute::get(C, attrkinds.begin()[i]); + for (size_t i = 0; i < extra.size(); i++) + attrs[attrkinds.size() + i] = extra.begin()[i]; + return llvm::AttributeSet::get(C, llvm::ArrayRef(attrs)); +} + +static inline std::pair get_gc_loaded_decl(llvm::LLVMContext &C) { + auto FnAttrs = Attributes(C, {llvm::Attribute::ReadNone, llvm::Attribute::NoSync, llvm::Attribute::NoUnwind, llvm::Attribute::Speculatable, llvm::Attribute::WillReturn, llvm::Attribute::NoRecurse}); + auto RetAttrs = Attributes(C, {llvm::Attribute::NonNull, llvm::Attribute::NoUndef}); + auto AL = llvm::AttributeList::get(C, FnAttrs, RetAttrs, + { Attributes(C, {llvm::Attribute::NoUndef, llvm::Attribute::ReadNone, llvm::Attribute::NoCapture}), + Attributes(C, {llvm::Attribute::NonNull, llvm::Attribute::NoUndef, llvm::Attribute::ReadNone}) }); + auto FT = llvm::FunctionType::get(llvm::PointerType::get(JuliaType::get_prjlvalue_ty(C), AddressSpace::Loaded), + {JuliaType::get_prjlvalue_ty(C), llvm::PointerType::get(JuliaType::get_prjlvalue_ty(C), 0)}, false); + return std::make_pair(FT, AL); +} + // Compatibility shims for LLVM attribute APIs that were renamed in LLVM 14. 
// // Once we no longer support LLVM < 14, these can be mechanically removed by From 3823f89c70aa61443f1278436726afc2c96a8e0f Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi Date: Sun, 10 Dec 2023 00:10:18 -0500 Subject: [PATCH 8/9] Cleanups, fixes --- src/llvm-alloc-helpers.cpp | 29 +++++++++++++++++ src/llvm-alloc-opt.cpp | 66 +++++++++++++++++++++++++++++--------- src/pipeline.cpp | 5 +++ 3 files changed, 85 insertions(+), 15 deletions(-) diff --git a/src/llvm-alloc-helpers.cpp b/src/llvm-alloc-helpers.cpp index 66a285dbfb378..6de0ebdefe380 100644 --- a/src/llvm-alloc-helpers.cpp +++ b/src/llvm-alloc-helpers.cpp @@ -372,6 +372,35 @@ void jl_alloc::runEscapeAnalysis(llvm::Instruction *I, EscapeAnalysisRequiredArg required.use_info.returned = true; return true; } + if (isa(inst)) { + // PHI nodes are immediate, always escapes + // many parts of alloc-opt and julia-licm assume no phis exist, + // so the whole infrastructure would have to be rewritten for it + LLVM_DEBUG(dbgs() << "PHI node, marking escape\n"); + REMARK([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "PhiNode", + inst) + << "PHI node, marking escape (" << ore::NV("Phi", inst) << ")"; + }); + required.use_info.escaped = true; + return false; + } + switch (inst->getOpcode()) { + case Instruction::Select: + case Instruction::PtrToInt: + case Instruction::Freeze: + // These are safe ops that just don't have handling yet in alloc opt, they're fine in error blocks + { + if (isa(inst->getParent()->getTerminator())) { + LLVM_DEBUG(dbgs() << "Detected use of allocation in block terminating with unreachable, likely error function\n"); + required.use_info.errorbbs.insert(inst->getParent()); + return true; + } + break; + } + default: + break; + } LLVM_DEBUG(dbgs() << "Unknown instruction, marking escape\n"); REMARK([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "UnknownInst", diff --git a/src/llvm-alloc-opt.cpp b/src/llvm-alloc-opt.cpp index 0831f146b05db..e93df210688da 100644 --- a/src/llvm-alloc-opt.cpp +++ b/src/llvm-alloc-opt.cpp @@ -166,10 +166,10 @@ struct Optimizer { Instruction *orig_i; Value::use_iterator next; size_t offset_frame; // index of frame with offset data in stack + bool loaded; // dynamic offset of orig_i from data MapVector variables; APInt constant; - bool loaded; }; }; bool canDeoptimizeErrorBlocks(CallInst *orig); @@ -180,6 +180,7 @@ struct Optimizer { AllocOpt &pass; DominatorTree *_DT = nullptr; FunctionAnalysisManager &AM; + Function *except_enter_func = nullptr; DominatorTree &getDomTree() { @@ -247,6 +248,7 @@ void Optimizer::initialize() pushInstruction(&I); } } + except_enter_func = F.getParent()->getFunction("julia.except_enter"); } void Optimizer::optimizeObject(CallInst *orig, size_t sz) { @@ -269,6 +271,15 @@ void Optimizer::optimizeObject(CallInst *orig, size_t sz) { optimizeTag(orig); return; } + if (except_enter_func && !use_info.errorbbs.empty()) { + REMARK([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) + << "GC allocation has error " << ore::NV("GC Allocation", orig); + }); + if (use_info.hastypeof) + optimizeTag(orig); + return; + } if (!use_info.addrescaped && !use_info.hasload && use_info.errorbbs.empty() && (!use_info.haspreserve || !use_info.refstore)) { REMARK([&]() { @@ -558,6 +569,10 @@ void Optimizer::sinkArrayDataPointer(CallInst *orig, DenseMapprintAsOperand(dbgs(), true); + dbgs() << "\n"; + } llvm_dump(inst); llvm_unreachable("Unexpected instruction"); } @@ -569,19 +584,23 @@ void Optimizer::sinkArrayDataPointer(CallInst *orig, 
DenseMaperaseFromParent(); return; } + stack.push_back(SunkenArray::Frame{load, load->use_begin(), 0, false, {}, APInt(BitWidth, 0)}); while (true) { - auto &cur = stack.back(); - auto use = &*cur.next; - ++cur.next; + auto cur = &stack.back(); + auto use = &*cur->next; + ++cur->next; replace_use(use); - while (cur.next == cur.orig_i->use_end()) { + while (cur->next == cur->orig_i->use_end()) { stack.pop_back(); - if (stack.empty()) + if (stack.empty()) { + builder.SetInsertPoint(load); + load->replaceAllUsesWith(builder.CreateBitCast(data, load->getType())); + load->eraseFromParent(); return; + } + cur = &stack.back(); } } - load->replaceAllUsesWith(data); - load->eraseFromParent(); } void Optimizer::replaceBitsArrayUses(CallInst *alloc, Value *conditional, Value *root, function_refshell, Instruction *data) { @@ -597,7 +616,7 @@ void Optimizer::replaceBitsArrayUses(CallInst *alloc, Value *conditional, Value pass.gc_loaded_func = cast(FC.getCallee()); } IRBuilder<> builder(data->getNextNode()); - auto loaded = builder.CreateCall(pass.gc_loaded_func, {root, data}); + auto loaded = builder.CreateCall(pass.gc_loaded_func, {root, builder.CreateBitCast(data, JuliaType::get_pprjlvalue_ty(builder.getContext()))}); for (auto bb : use_info.errorbbs) { auto sunk = alloc->clone(); auto insertpt = &*bb->getFirstInsertionPt(); @@ -612,7 +631,7 @@ void Optimizer::replaceBitsArrayUses(CallInst *alloc, Value *conditional, Value auto gep = builder.CreateConstGEP1_64(Type::getInt8PtrTy(builder.getContext()), bc, 1); datasunk = builder.CreateAlignedLoad(Type::getInt8PtrTy(builder.getContext()), gep, Align(MinAlign(align.value(), pass.DL->getPointerSize()))); rootsunk = sunk; - loadedsunk = builder.CreateCall(pass.gc_loaded_func, {rootsunk, datasunk}); + loadedsunk = builder.CreateCall(pass.gc_loaded_func, {rootsunk, builder.CreateBitCast(datasunk, JuliaType::get_pprjlvalue_ty(builder.getContext()))}); // length must dominate here, since the alloc is not in a phi node, // length must dominate the allocation for obvious reasons, // and the allocation must dominate its uses (including those in this bb) @@ -634,7 +653,7 @@ void Optimizer::replaceBitsArrayUses(CallInst *alloc, Value *conditional, Value sunkdata = builder.CreateConstGEP1_64(Type::getInt8PtrTy(builder.getContext()), sunkdata, 1); sunkdata = builder.CreateAlignedLoad(Type::getInt8PtrTy(builder.getContext()), sunkdata, Align(MinAlign(align.value(), pass.DL->getPointerSize()))); // we have to do this a second time because we need to do the memcpy inside the conditional - auto sunkloaded = builder.CreateCall(pass.gc_loaded_func, {sunk, sunkdata}); + auto sunkloaded = builder.CreateCall(pass.gc_loaded_func, {sunk, builder.CreateBitCast(sunkdata, JuliaType::get_pprjlvalue_ty(builder.getContext()))}); // length must dominate here, since the alloc is not in a phi node, // length must dominate the allocation for obvious reasons, // and the allocation must dominate its uses (including those in this bb) @@ -650,9 +669,10 @@ void Optimizer::replaceBitsArrayUses(CallInst *alloc, Value *conditional, Value rootphi->addIncoming(sunk, term->getParent()); dataphi->addIncoming(data, bb); dataphi->addIncoming(sunkdata, term->getParent()); - loadedsunk = builder.CreateCall(pass.gc_loaded_func, {rootsunk, datasunk}); + loadedsunk = builder.CreateCall(pass.gc_loaded_func, {rootsunk, builder.CreateBitCast(datasunk, JuliaType::get_pprjlvalue_ty(builder.getContext()))}); } - sunken[bb] = {rootsunk, datasunk, loadedsunk, insertpt}; + // note that bb != insertpt->getParent() 
in the unsized case + sunken[insertpt->getParent()] = {rootsunk, datasunk, loadedsunk, insertpt}; } } // we need to replace all of the length/data accesses upfront, because in the case of an unsized array alloc @@ -727,7 +747,6 @@ void Optimizer::replaceBitsArrayUses(CallInst *alloc, Value *conditional, Value void Optimizer::optimizeArray(CallInst *orig, jl_genericmemory_info_t info) { checkInst(orig); - dbgs() << "checking array allocation\n"; if (use_info.escaped) { REMARK([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) @@ -742,6 +761,15 @@ void Optimizer::optimizeArray(CallInst *orig, jl_genericmemory_info_t info) { }); return; } + if (except_enter_func && !use_info.errorbbs.empty()) { + REMARK([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) + << "GC allocation has error " << ore::NV("GC Allocation", orig); + }); + if (use_info.hastypeof) + optimizeTag(orig); + return; + } if (use_info.hasunknownmem) { REMARK([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig) @@ -779,7 +807,6 @@ void Optimizer::optimizeArray(CallInst *orig, jl_genericmemory_info_t info) { // since we are here, we're free to turn the whole thing into a stack allocation // and remove the original allocation. - dbgs() << "Moving array allocation to stack\n"; if (isa(orig->getArgOperand(1))) { size_t maxSizedStackBytes = 4096; // TODO parameterize by module flag/ctor param auto length = orig->getArgOperand(1); @@ -1865,6 +1892,15 @@ void Optimizer::splitOnStack(CallInst *orig_inst) push_frame(gep, apoffset.getLimitedValue()); } else { + // We don't know what this is + // but some instructions might just occur in paths that are leading to errors, + // so we can just replace the use with a gep'ed pointer. + if (errinfo) { + auto gep = errinfo->gep(offset, Type::getInt8Ty(errbuilder.getContext()), errbuilder); + use->set(errbuilder.CreateBitCast(gep, use->get()->getType())); + return; + } + llvm_dump(user); abort(); } }; diff --git a/src/pipeline.cpp b/src/pipeline.cpp index 1a218a004b0b7..70d619840ad01 100644 --- a/src/pipeline.cpp +++ b/src/pipeline.cpp @@ -428,6 +428,11 @@ static void buildLoopOptimizerPipeline(FunctionPassManager &FPM, PassBuilder *PB //We don't know if the loop callbacks support MSSA FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA = */false)); } + if (O.getSpeedupLevel() >= 2) { + LoopPassManager LPM; + LPM.addPass(LoopFullUnrollPass()); + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM))); + } if (O.getSpeedupLevel() >= 2) { LoopPassManager LPM; LPM.addPass(BeforeLICMMarkerPass()); From b5a86d8005d6897b60c332f9e59124664d1d99bc Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi Date: Sun, 10 Dec 2023 11:30:23 -0500 Subject: [PATCH 9/9] Fix warnings, remove confusing phi node --- base/boot.jl | 6 +++--- src/llvm-alloc-opt.cpp | 9 +++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/base/boot.jl b/base/boot.jl index 218e4e2e533b1..3a1488ec4e9b9 100644 --- a/base/boot.jl +++ b/base/boot.jl @@ -507,10 +507,10 @@ const undef = UndefInitializer() # type and dimensionality specified (self::Type{GenericMemory{kind,T,addrspace}})(::UndefInitializer, m::Int) where {T,addrspace,kind} = - if isdefined(self, :instance) && m === 0 - self.instance - else + if (kind === :not_atomic && addrspace === CPU) || (!isdefined(self, :instance) || m !== 0) ccall(:jl_alloc_genericmemory, Ref{GenericMemory{kind,T,addrspace}}, (Any, Int), self, m) + else + self.instance end 
 (self::Type{GenericMemory{kind,T,addrspace}})(::UndefInitializer, d::NTuple{1,Int}) where {T,kind,addrspace} =
     self(undef, getfield(d,1))
 
 # empty vector constructor
diff --git a/src/llvm-alloc-opt.cpp b/src/llvm-alloc-opt.cpp
index e93df210688da..dad29922933be 100644
--- a/src/llvm-alloc-opt.cpp
+++ b/src/llvm-alloc-opt.cpp
@@ -346,6 +346,7 @@ bool Optimizer::canDeoptimizeErrorBlocks(CallInst *orig) {
     for (auto &field : use_info.memops) {
         for (auto &acc : field.second.accesses) {
             assert(isa<LoadInst>(acc.inst) && "Should only have loads of array length/data");
+            (void) acc;
         }
         if (field.first == 0) {
             if (field.second.size > pass.DL->getPointerSize()) {
@@ -549,6 +550,7 @@ void Optimizer::sinkArrayDataPointer(CallInst *orig, DenseMap<BasicBlock *, SunkenArray> &sunken, Value *root, Instruction *data, LoadInst *load)
                 bool success = gep->collectOffset(*pass.DL, BitWidth, frame.variables, frame.constant);
                 assert(success); // TODO this may not work on ARM with scalable vectors,
                                  // but for now let's just start with this
+                (void) success;
                 frame.offset_frame = stack.size();
                 break;
             }
@@ -819,6 +821,13 @@ void Optimizer::optimizeArray(CallInst *orig, jl_genericmemory_info_t info) {
             });
             return;
         }
+        if (bytes == 0) {
+            REMARK([&]() {
+                return OptimizationRemarkMissed(DEBUG_TYPE, "Escaped", orig)
+                    << "GC genericmemory allocation size was 0 " << ore::NV("GC GenericMemory Allocation", orig);
+            });
+            return;
+        }
         dbgs() << "allocation was sized\n";
         moveSizedBitsArrayToStack(orig, info);
     } else {
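        // (Hedged sketch; these builder calls echo the unsized path defined earlier.)
        // The branch taken here re-checks the length at runtime:
        //     auto maxElements = ConstantInt::get(length->getType(), maxStackAlloc / info.elsize);
        //     auto tooBig = builder.CreateICmpUGT(length, maxElements);
        // so lengths above the cap fall back to the original GC allocation.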