From 615b142c88a074399bac08a0e8fd8f48b491c1fd Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Tue, 3 Jan 2023 16:03:18 -0500 Subject: [PATCH 01/34] Simplify multiversioning --- src/llvm-multiversioning.cpp | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index 242b0c454ad0a..68042700bb1d0 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -222,8 +222,6 @@ struct CloneCtx { int idx; uint32_t flags; std::unique_ptr<ValueToValueMapTy> vmap; // ValueToValueMapTy is not movable.... - // function ids that needs relocation to be initialized - std::set<uint32_t> relocs{}; Target(int idx, const jl_target_spec_t &spec) : idx(idx), flags(spec.flags), @@ -290,8 +288,6 @@ struct CloneCtx { std::vector<std::pair<Constant*,uint32_t>> gv_relocs{}; // Mapping from function id (i.e. 0-based index in `fvars`) to GVs to be initialized. std::map<uint32_t,GlobalVariable*> const_relocs; - // Functions that were referred to by a global alias, and might not have other uses. - std::set<uint32_t> alias_relocs; bool has_veccall{false}; bool has_cloneall{false}; bool allow_bad_fvars{false}; @@ -734,13 +730,6 @@ void CloneCtx::rewrite_alias(GlobalAlias *alias, Function *F) uint32_t id; GlobalVariable *slot; std::tie(id, slot) = get_reloc_slot(F); - for (auto &grp: groups) { - grp.relocs.insert(id); - for (auto &tgt: grp.clones) { - tgt.relocs.insert(id); - } - } - alias_relocs.insert(id); auto BB = BasicBlock::Create(F->getContext(), "top", trampoline); IRBuilder<> irbuilder(BB); @@ -884,15 +873,6 @@ void CloneCtx::fix_inst_uses() if (!use_f->getName().endswith(suffix)) return nullptr; std::tie(id, slot) = get_reloc_slot(orig_f); - - grp.relocs.insert(id); - for (auto &tgt: grp.clones) { - // The enclosing function of the use is cloned, - // no need to deal with this use on this target. 
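// Context for the per-target `relocs` sets removed in this patch: every id that
// was inserted into them came from get_reloc_slot(), which already records the
// same id in `const_relocs`. The emit_metadata() hunk further below therefore
// treats every `const_relocs` entry as shared instead of consulting per-target
// sets, roughly (a restatement of the invariant this commit relies on, not code
// taken verbatim from the patch):
//
//     auto it = const_relocs.find(id);
//     if (it != const_relocs.end())
//         shared_relocs.insert(id);   // replaces the old shared_relocs / tgt->relocs checks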
- if (map_get(*tgt.vmap, use_f)) - continue; - tgt.relocs.insert(id); - } return slot; }, tbaa_const); } @@ -1018,12 +998,10 @@ void CloneCtx::emit_metadata() } auto it = const_relocs.find(id); if (it != const_relocs.end()) { + shared_relocs.insert(id); values.push_back(id_v); values.push_back(get_ptrdiff32(it->second, gbase)); } - if (alias_relocs.find(id) != alias_relocs.end()) { - shared_relocs.insert(id); - } } values[0] = ConstantInt::get(T_int32, values.size() / 2); ArrayType *vars_type = ArrayType::get(T_int32, values.size()); @@ -1046,7 +1024,7 @@ void CloneCtx::emit_metadata() auto grp = static_cast<Group*>(tgt); count = jl_sysimg_tag_mask; for (uint32_t j = 0; j < nfvars; j++) { - if (shared_relocs.count(j) || tgt->relocs.count(j)) { + if (shared_relocs.count(j)) { count++; idxs.push_back(j); } @@ -1061,7 +1039,7 @@ void CloneCtx::emit_metadata() idxs.push_back(baseidx); for (uint32_t j = 0; j < nfvars; j++) { auto base_f = grp->base_func(fvars[j]); - if (shared_relocs.count(j) || tgt->relocs.count(j)) { + if (shared_relocs.count(j)) { count++; idxs.push_back(jl_sysimg_tag_mask | j); auto f = map_get(*tgt->vmap, base_f, base_f); From 27808e136757c19c9d7accfafae3958f3f48b7f1 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Tue, 3 Jan 2023 18:17:46 -0500 Subject: [PATCH 02/34] Refactor aotcompile --- src/aotcompile.cpp | 231 ++++++++++++++++++++--------------- src/llvm-multiversioning.cpp | 31 +---- 2 files changed, 140 insertions(+), 122 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 907735dfa0128..d3d4529d32c30 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -496,7 +496,8 @@ static void injectCRTAlias(Module &M, StringRef name, StringRef alias, FunctionT if (!target) { target = Function::Create(FT, Function::ExternalLinkage, alias, M); } - Function *interposer = Function::Create(FT, Function::InternalLinkage, name, M); + Function *interposer = Function::Create(FT, Function::ExternalLinkage, name, M); + interposer->setVisibility(GlobalValue::HiddenVisibility); appendToCompilerUsed(M, {interposer}); llvm::IRBuilder<> builder(BasicBlock::Create(M.getContext(), "top", interposer)); @@ -532,7 +533,7 @@ void jl_dump_native_impl(void *native_code, TheTriple.setObjectFormat(Triple::MachO); TheTriple.setOS(llvm::Triple::MacOSX); #endif - std::unique_ptr<TargetMachine> TM( + std::unique_ptr<TargetMachine> SourceTM( jl_ExecutionEngine->getTarget().createTargetMachine( TheTriple.getTriple(), jl_ExecutionEngine->getTargetCPU(), @@ -554,53 +555,16 @@ void jl_dump_native_impl(void *native_code, )); - // set up optimization passes - SmallVector<char, 0> bc_Buffer; - SmallVector<char, 0> obj_Buffer; - SmallVector<char, 0> asm_Buffer; - SmallVector<char, 0> unopt_bc_Buffer; - raw_svector_ostream bc_OS(bc_Buffer); - raw_svector_ostream obj_OS(obj_Buffer); - raw_svector_ostream asm_OS(asm_Buffer); - raw_svector_ostream unopt_bc_OS(unopt_bc_Buffer); std::vector<NewArchiveMember> bc_Archive; std::vector<NewArchiveMember> obj_Archive; std::vector<NewArchiveMember> asm_Archive; std::vector<NewArchiveMember> unopt_bc_Archive; std::vector<std::string> outputs; - PassBuilder emptyPB; - AnalysisManagers empty(emptyPB); - ModulePassManager preopt, postopt; - legacy::PassManager emitter; // MC emission is only supported on legacy PM - - if (unopt_bc_fname) - preopt.addPass(BitcodeWriterPass(unopt_bc_OS)); - - if (bc_fname) - postopt.addPass(BitcodeWriterPass(bc_OS)); - //Is this necessary for TM? 
- addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis()); - if (obj_fname) - if (TM->addPassesToEmitFile(emitter, obj_OS, nullptr, CGFT_ObjectFile, false)) - jl_safe_printf("ERROR: target does not support generation of object files\n"); - if (asm_fname) - if (TM->addPassesToEmitFile(emitter, asm_OS, nullptr, CGFT_AssemblyFile, false)) - jl_safe_printf("ERROR: target does not support generation of object files\n"); - // Reset the target triple to make sure it matches the new target machine auto dataM = data->M.getModuleUnlocked(); - dataM->setTargetTriple(TM->getTargetTriple().str()); - dataM->setDataLayout(jl_create_datalayout(*TM)); - -#ifndef JL_USE_NEW_PM - legacy::PassManager optimizer; - addTargetPasses(&optimizer, TM->getTargetTriple(), TM->getTargetIRAnalysis()); - addOptimizationPasses(&optimizer, jl_options.opt_level, true, true); - addMachinePasses(&optimizer, jl_options.opt_level); -#else - NewPM optimizer{std::move(TM), getOptLevel(jl_options.opt_level), OptimizationOptions::defaults(true, true)}; -#endif + dataM->setTargetTriple(SourceTM->getTargetTriple().str()); + dataM->setDataLayout(jl_create_datalayout(*SourceTM)); Type *T_size; if (sizeof(size_t) == 8) @@ -609,8 +573,10 @@ void jl_dump_native_impl(void *native_code, T_size = Type::getInt32Ty(Context); Type *T_psize = T_size->getPointerTo(); + bool imaging_mode = imaging_default() || jl_options.outputo; + // add metadata information - if (imaging_default() || jl_options.outputo) { + if (imaging_mode) { emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_sysimg_gvars", T_psize); emit_offset_table(*dataM, data->jl_sysimg_fvars, "jl_sysimg_fvars", T_psize); @@ -626,70 +592,87 @@ void jl_dump_native_impl(void *native_code, } // do the actual work - auto add_output = [&] (Module &M, StringRef unopt_bc_Name, StringRef bc_Name, StringRef obj_Name, StringRef asm_Name, bool inject_crt) { - preopt.run(M, empty.MAM); - if (bc_fname || obj_fname || asm_fname) { - assert(!verifyModule(M, &errs())); - optimizer.run(M); - assert(!verifyModule(M, &errs())); + auto add_output = [&] (Module &M, StringRef unopt_bc_Name, StringRef bc_Name, StringRef obj_Name, StringRef asm_Name) { + + auto TM = std::unique_ptr<TargetMachine>( + SourceTM->getTarget().createTargetMachine( + SourceTM->getTargetTriple().str(), + SourceTM->getTargetCPU(), + SourceTM->getTargetFeatureString(), + SourceTM->Options, + SourceTM->getRelocationModel(), + SourceTM->getCodeModel(), + SourceTM->getOptLevel())); + + if (unopt_bc_fname) { + SmallVector<char, 0> Buffer; + raw_svector_ostream OS(Buffer); + PassBuilder PB; + AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; + ModulePassManager MPM; + MPM.addPass(BitcodeWriterPass(OS)); + emit_result(unopt_bc_Archive, Buffer, unopt_bc_Name, outputs); } + if (!bc_fname && !obj_fname && !asm_fname) { + return; + } + assert(!verifyModule(M, &errs())); - if (inject_crt) { - // We would like to emit an alias or an weakref alias to redirect these symbols - // but LLVM doesn't let us emit a GlobalAlias to a declaration... - // So for now we inject a definition of these functions that calls our runtime - // functions. We do so after optimization to avoid cloning these functions. 
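// For reference, the interposer that injectCRTAlias() emits (see the hunk above
// in this patch) is just a hidden forwarding thunk; at the C level its body is
// roughly equivalent to the sketch below. Illustrative only -- the real code
// builds the body with IRBuilder, and the parameter/return types follow the
// FunctionType passed in:
//
//     float julia__gnu_h2f_ieee(_Float16);      /* runtime implementation */
//     float __gnu_h2f_ieee(_Float16 x) {        /* injected interposer    */
//         return julia__gnu_h2f_ieee(x);
//     }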
- injectCRTAlias(M, "__gnu_h2f_ieee", "julia__gnu_h2f_ieee", - FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false)); - injectCRTAlias(M, "__extendhfsf2", "julia__gnu_h2f_ieee", - FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false)); - injectCRTAlias(M, "__gnu_f2h_ieee", "julia__gnu_f2h_ieee", - FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false)); - injectCRTAlias(M, "__truncsfhf2", "julia__gnu_f2h_ieee", - FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false)); - injectCRTAlias(M, "__truncdfhf2", "julia__truncdfhf2", - FunctionType::get(Type::getHalfTy(Context), { Type::getDoubleTy(Context) }, false)); +#ifndef JL_USE_NEW_PM + legacy::PassManager optimizer; + addTargetPasses(&optimizer, TM->getTargetTriple(), TM->getTargetIRAnalysis()); + addOptimizationPasses(&optimizer, jl_options.opt_level, true, true); + addMachinePasses(&optimizer, jl_options.opt_level); +#else -#if defined(_OS_WINDOWS_) - // Windows expect that the function `_DllMainStartup` is present in an dll. - // Normal compilers use something like Zig's crtdll.c instead we provide a - // a stub implementation. - auto T_pvoid = Type::getInt8Ty(Context)->getPointerTo(); - auto T_int32 = Type::getInt32Ty(Context); - auto FT = FunctionType::get(T_int32, {T_pvoid, T_int32, T_pvoid}, false); - auto F = Function::Create(FT, Function::ExternalLinkage, "_DllMainCRTStartup", M); - F->setCallingConv(CallingConv::X86_StdCall); - - llvm::IRBuilder<> builder(BasicBlock::Create(M.getContext(), "top", F)); - builder.CreateRet(ConstantInt::get(T_int32, 1)); + auto PMTM = std::unique_ptr<TargetMachine>( + SourceTM->getTarget().createTargetMachine( + SourceTM->getTargetTriple().str(), + SourceTM->getTargetCPU(), + SourceTM->getTargetFeatureString(), + SourceTM->Options, + SourceTM->getRelocationModel(), + SourceTM->getCodeModel(), + SourceTM->getOptLevel())); + NewPM optimizer{std::move(PMTM), getOptLevel(jl_options.opt_level), OptimizationOptions::defaults(true, true)}; #endif + optimizer.run(M); + assert(!verifyModule(M, &errs())); + + if (bc_fname) { + SmallVector<char, 0> Buffer; + raw_svector_ostream OS(Buffer); + PassBuilder PB; + AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; + ModulePassManager MPM; + MPM.addPass(BitcodeWriterPass(OS)); + emit_result(bc_Archive, Buffer, bc_Name, outputs); } - postopt.run(M, empty.MAM); - - // Get target by snooping on multiversioning - GlobalVariable *target_ids = M.getNamedGlobal("jl_dispatch_target_ids"); - if (s && target_ids) { - if(auto targets = dyn_cast<ConstantDataArray>(target_ids->getInitializer())) { - auto rawTargets = targets->getRawDataValues(); - write_int32(s, rawTargets.size()); - ios_write(s, rawTargets.data(), rawTargets.size()); - }; + if (obj_fname) { + SmallVector<char, 0> Buffer; + raw_svector_ostream OS(Buffer); + legacy::PassManager emitter; + addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis()); + if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_ObjectFile, false)) + jl_safe_printf("ERROR: target does not support generation of object files\n"); + emitter.run(M); + emit_result(obj_Archive, Buffer, obj_Name, outputs); } - emitter.run(M); - - if (unopt_bc_fname) - emit_result(unopt_bc_Archive, unopt_bc_Buffer, unopt_bc_Name, outputs); - if (bc_fname) - emit_result(bc_Archive, bc_Buffer, bc_Name, outputs); - if (obj_fname) - emit_result(obj_Archive, obj_Buffer, obj_Name, outputs); - if (asm_fname) - emit_result(asm_Archive, 
asm_Buffer, asm_Name, outputs); + if (asm_fname) { + SmallVector<char, 0> Buffer; + raw_svector_ostream OS(Buffer); + legacy::PassManager emitter; + addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis()); + if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_AssemblyFile, false)) + jl_safe_printf("ERROR: target does not support generation of assembly files\n"); + emitter.run(M); + emit_result(asm_Archive, Buffer, asm_Name, outputs); + } }; - add_output(*dataM, "unopt.bc", "text.bc", "text.o", "text.s", true); + add_output(*dataM, "unopt.bc", "text.bc", "text.o", "text.s"); orc::ThreadSafeModule sysimage(std::make_unique<Module>("sysimage", Context), TSCtx); auto sysimageM = sysimage.getModuleUnlocked(); @@ -699,6 +682,35 @@ void jl_dump_native_impl(void *native_code, sysimageM->setStackProtectorGuard(dataM->getStackProtectorGuard()); sysimageM->setOverrideStackAlignment(dataM->getOverrideStackAlignment()); #endif + // We would like to emit an alias or an weakref alias to redirect these symbols + // but LLVM doesn't let us emit a GlobalAlias to a declaration... + // So for now we inject a definition of these functions that calls our runtime + // functions. We do so after optimization to avoid cloning these functions. + injectCRTAlias(*sysimageM, "__gnu_h2f_ieee", "julia__gnu_h2f_ieee", + FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false)); + injectCRTAlias(*sysimageM, "__extendhfsf2", "julia__gnu_h2f_ieee", + FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false)); + injectCRTAlias(*sysimageM, "__gnu_f2h_ieee", "julia__gnu_f2h_ieee", + FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false)); + injectCRTAlias(*sysimageM, "__truncsfhf2", "julia__gnu_f2h_ieee", + FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false)); + injectCRTAlias(*sysimageM, "__truncdfhf2", "julia__truncdfhf2", + FunctionType::get(Type::getHalfTy(Context), { Type::getDoubleTy(Context) }, false)); + + if (TheTriple.isOSWindows()) { + // Windows expect that the function `_DllMainStartup` is present in an dll. + // Normal compilers use something like Zig's crtdll.c instead we provide a + // a stub implementation. + auto T_pvoid = Type::getInt8Ty(Context)->getPointerTo(); + auto T_int32 = Type::getInt32Ty(Context); + auto FT = FunctionType::get(T_int32, {T_pvoid, T_int32, T_pvoid}, false); + auto F = Function::Create(FT, Function::ExternalLinkage, "_DllMainCRTStartup", *sysimageM); + F->setCallingConv(CallingConv::X86_StdCall); + + llvm::IRBuilder<> builder(BasicBlock::Create(Context, "top", F)); + builder.CreateRet(ConstantInt::get(T_int32, 1)); + } + bool has_veccall = dataM->getModuleFlag("julia.mv.veccall"); data->M = orc::ThreadSafeModule(); // free memory for data->M if (sysimg_data) { @@ -712,7 +724,32 @@ void jl_dump_native_impl(void *native_code, GlobalVariable::ExternalLinkage, len, "jl_system_image_size")); } - add_output(*sysimageM, "data.bc", "data.bc", "data.o", "data.s", false); + if (imaging_mode) { + auto specs = jl_get_llvm_clone_targets(); + const uint32_t base_flags = has_veccall ? 
JL_TARGET_VEC_CALL : 0; + std::vector<uint8_t> data; + auto push_i32 = [&] (uint32_t v) { + uint8_t buff[4]; + memcpy(buff, &v, 4); + data.insert(data.end(), buff, buff + 4); + }; + push_i32(specs.size()); + for (uint32_t i = 0; i < specs.size(); i++) { + push_i32(base_flags | (specs[i].flags & JL_TARGET_UNKNOWN_NAME)); + auto &specdata = specs[i].data; + data.insert(data.end(), specdata.begin(), specdata.end()); + } + auto value = ConstantDataArray::get(Context, data); + addComdat(new GlobalVariable(*sysimageM, value->getType(), true, + GlobalVariable::ExternalLinkage, + value, "jl_dispatch_target_ids")); + + if (s) { + write_int32(s, data.size()); + ios_write(s, (const char *)data.data(), data.size()); + } + } + add_output(*sysimageM, "data.bc", "data.bc", "data.o", "data.s"); object::Archive::Kind Kind = getDefaultForHost(TheTriple); if (unopt_bc_fname) diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index 68042700bb1d0..c94aee9927540 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -289,7 +289,6 @@ struct CloneCtx { // Mapping from function id (i.e. 0-based index in `fvars`) to GVs to be initialized. std::map<uint32_t,GlobalVariable*> const_relocs; bool has_veccall{false}; - bool has_cloneall{false}; bool allow_bad_fvars{false}; }; @@ -345,7 +344,6 @@ CloneCtx::CloneCtx(Module &M, function_ref<LoopInfo&(Function&)> GetLI, function for (uint32_t i = 1; i < ntargets; i++) { auto &spec = specs[i]; if (spec.flags & JL_TARGET_CLONE_ALL) { - has_cloneall = true; groups.emplace_back(i, spec); } else { @@ -404,7 +402,7 @@ void CloneCtx::clone_function(Function *F, Function *new_f, ValueToValueMapTy &v // Clone all clone_all targets. Makes sure that the base targets are all available. void CloneCtx::clone_bases() { - if (!has_cloneall) + if (groups.size() == 1) return; uint32_t ngrps = groups.size(); for (uint32_t gid = 1; gid < ngrps; gid++) { @@ -553,7 +551,7 @@ void CloneCtx::check_partial(Group &grp, Target &tgt) F->getName() + suffix, &M); new_f->copyAttributesFrom(F); vmap[F] = new_f; - if (!has_cloneall) + if (groups.size() == 1) cloned.insert(orig_f); grp.clone_fs.insert(i); all_origs.insert(orig_f); @@ -607,7 +605,7 @@ void CloneCtx::check_partial(Group &grp, Target &tgt) continue; auto orig_f = orig_funcs[i]; if (all_origs.count(orig_f)) { - if (!has_cloneall) + if (groups.size() == 1) cloned.insert(orig_f); grp.clone_fs.insert(i); } @@ -787,7 +785,7 @@ void CloneCtx::fix_gv_uses() return changed; }; for (auto orig_f: orig_funcs) { - if (!has_cloneall && !cloned.count(orig_f)) + if (groups.size() == 1 && !cloned.count(orig_f)) continue; while (single_pass(orig_f)) { } @@ -952,25 +950,8 @@ void CloneCtx::emit_metadata() } } - // Generate `jl_dispatch_target_ids` - { - const uint32_t base_flags = has_veccall ? 
JL_TARGET_VEC_CALL : 0; - std::vector<uint8_t> data; - auto push_i32 = [&] (uint32_t v) { - uint8_t buff[4]; - memcpy(buff, &v, 4); - data.insert(data.end(), buff, buff + 4); - }; - push_i32(ntargets); - for (uint32_t i = 0; i < ntargets; i++) { - push_i32(base_flags | (specs[i].flags & JL_TARGET_UNKNOWN_NAME)); - auto &specdata = specs[i].data; - data.insert(data.end(), specdata.begin(), specdata.end()); - } - auto value = ConstantDataArray::get(M.getContext(), data); - add_comdat(new GlobalVariable(M, value->getType(), true, - GlobalVariable::ExternalLinkage, - value, "jl_dispatch_target_ids")); + if (has_veccall) { + M.addModuleFlag(Module::Max, "julia.mv.veccall", 1); } // Generate `jl_dispatch_reloc_slots` From 4524987a384f444e02c0e21afacfc3c4f4d68a4e Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Tue, 3 Jan 2023 18:41:07 -0500 Subject: [PATCH 03/34] Timing print statements --- src/aotcompile.cpp | 47 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index d3d4529d32c30..2c9edecae7df7 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -273,6 +273,8 @@ void replaceUsesWithLoad(Function &F, function_ref<GlobalVariable *(Instruction extern "C" JL_DLLEXPORT void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvmmod, const jl_cgparams_t *cgparams, int _policy, int _imaging_mode, int _external_linkage, size_t _world) { + uint64_t start = jl_hrtime(); + uint64_t end = 0; ++CreateNativeCalls; CreateNativeMax.updateMax(jl_array_len(methods)); if (cgparams == NULL) @@ -464,6 +466,8 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm if (ctx.getContext()) { jl_ExecutionEngine->releaseContext(std::move(ctx)); } + end = jl_hrtime(); + dbgs() << "jl_create_native: " << (end - start) / 1e9 << "s\n"; return (void*)data; } @@ -517,6 +521,8 @@ void jl_dump_native_impl(void *native_code, const char *asm_fname, const char *sysimg_data, size_t sysimg_len, ios_t *s) { + uint64_t start = jl_hrtime(); + uint64_t end = 0; JL_TIMING(NATIVE_DUMP); jl_native_code_desc_t *data = (jl_native_code_desc_t*)native_code; auto TSCtx = data->M.getContext(); @@ -575,6 +581,12 @@ void jl_dump_native_impl(void *native_code, bool imaging_mode = imaging_default() || jl_options.outputo; + end = jl_hrtime(); + + dbgs() << "setup time: " << (end - start) / 1e9 << "s\n"; + + start = jl_hrtime(); + // add metadata information if (imaging_mode) { emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_sysimg_gvars", T_psize); @@ -591,6 +603,12 @@ void jl_dump_native_impl(void *native_code, "jl_RTLD_DEFAULT_handle_pointer")); } + end = jl_hrtime(); + + dbgs() << "metadata time: " << (end - start) / 1e9 << "s\n"; + + start = jl_hrtime(); + // do the actual work auto add_output = [&] (Module &M, StringRef unopt_bc_Name, StringRef bc_Name, StringRef obj_Name, StringRef asm_Name) { @@ -618,6 +636,9 @@ void jl_dump_native_impl(void *native_code, } assert(!verifyModule(M, &errs())); + uint64_t start = jl_hrtime(); + end = 0; + #ifndef JL_USE_NEW_PM legacy::PassManager optimizer; addTargetPasses(&optimizer, TM->getTargetTriple(), TM->getTargetIRAnalysis()); @@ -639,6 +660,10 @@ void jl_dump_native_impl(void *native_code, optimizer.run(M); assert(!verifyModule(M, &errs())); + end = jl_hrtime(); + + dbgs() << "optimize time: " << (end - start) / 1e9 << "s\n"; + if (bc_fname) { SmallVector<char, 0> Buffer; raw_svector_ostream OS(Buffer); @@ -649,6 +674,8 @@ void 
jl_dump_native_impl(void *native_code, emit_result(bc_Archive, Buffer, bc_Name, outputs); } + start = jl_hrtime(); + if (obj_fname) { SmallVector<char, 0> Buffer; raw_svector_ostream OS(Buffer); @@ -660,6 +687,10 @@ void jl_dump_native_impl(void *native_code, emit_result(obj_Archive, Buffer, obj_Name, outputs); } + end = jl_hrtime(); + + dbgs() << "codegen time: " << (end - start) / 1e9 << "s\n"; + if (asm_fname) { SmallVector<char, 0> Buffer; raw_svector_ostream OS(Buffer); @@ -674,6 +705,12 @@ void jl_dump_native_impl(void *native_code, add_output(*dataM, "unopt.bc", "text.bc", "text.o", "text.s"); + end = jl_hrtime(); + + dbgs() << "text output time: " << (end - start) / 1e9 << "s\n"; + + start = jl_hrtime(); + orc::ThreadSafeModule sysimage(std::make_unique<Module>("sysimage", Context), TSCtx); auto sysimageM = sysimage.getModuleUnlocked(); sysimageM->setTargetTriple(dataM->getTargetTriple()); @@ -751,6 +788,12 @@ void jl_dump_native_impl(void *native_code, } add_output(*sysimageM, "data.bc", "data.bc", "data.o", "data.s"); + end = jl_hrtime(); + + dbgs() << "data module time: " << (end - start) / 1e9 << "s\n"; + + start = jl_hrtime(); + object::Archive::Kind Kind = getDefaultForHost(TheTriple); if (unopt_bc_fname) handleAllErrors(writeArchive(unopt_bc_fname, unopt_bc_Archive, true, @@ -764,6 +807,10 @@ void jl_dump_native_impl(void *native_code, if (asm_fname) handleAllErrors(writeArchive(asm_fname, asm_Archive, true, Kind, true, false), reportWriterError); + + end = jl_hrtime(); + + dbgs() << "archive time: " << (end - start) / 1e9 << "s\n"; delete data; } From 094269c8c1e506e36f0b4bd7ddc6ec38f279bb3c Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Thu, 5 Jan 2023 14:36:16 -0500 Subject: [PATCH 04/34] Move image init to processor.cpp --- src/llvm-multiversioning.cpp | 67 ++++++++++++++---------------- src/processor.cpp | 79 +++++++++++++++++++++++++----------- src/processor.h | 11 ++++- src/processor_arm.cpp | 4 +- src/processor_fallback.cpp | 4 +- src/processor_x86.cpp | 4 +- src/staticdata.c | 69 ++----------------------------- 7 files changed, 103 insertions(+), 135 deletions(-) diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index c94aee9927540..3325cb47147a6 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -253,21 +253,14 @@ struct CloneCtx { void emit_metadata(); private: void prepare_vmap(ValueToValueMapTy &vmap); - bool is_vector(FunctionType *ty) const; void clone_function(Function *F, Function *new_f, ValueToValueMapTy &vmap); uint32_t collect_func_info(Function &F); void check_partial(Group &grp, Target &tgt); void clone_partial(Group &grp, Target &tgt); - void add_features(Function *F, StringRef name, StringRef features, uint32_t flags) const; - template<typename T> - T *add_comdat(T *G) const; uint32_t get_func_id(Function *F); template<typename Stack> Constant *rewrite_gv_init(const Stack& stack); std::pair<uint32_t,GlobalVariable*> get_reloc_slot(Function *F); - Constant *get_ptrdiff32(Constant *ptr, Constant *base) const; - template<typename T> - Constant *emit_offset_table(const std::vector<T*> &vars, StringRef name) const; void rewrite_alias(GlobalAlias *alias, Function* F); MDNode *tbaa_const; @@ -424,7 +417,7 @@ void CloneCtx::clone_bases() } } -bool CloneCtx::is_vector(FunctionType *ty) const +static bool is_vector(FunctionType *ty) { if (ty->getReturnType()->isVectorTy()) return true; @@ -507,6 +500,29 @@ void CloneCtx::collect_func_infos() } } +static void 
add_features(Function *F, StringRef name, StringRef features, uint32_t flags) +{ + auto attr = F->getFnAttribute("target-features"); + if (attr.isStringAttribute()) { + std::string new_features(attr.getValueAsString()); + new_features += ","; + new_features += features; + F->addFnAttr("target-features", new_features); + } + else { + F->addFnAttr("target-features", features); + } + F->addFnAttr("target-cpu", name); + if (!F->hasFnAttribute(Attribute::OptimizeNone)) { + if (flags & JL_TARGET_OPTSIZE) { + F->addFnAttr(Attribute::OptimizeForSize); + } + else if (flags & JL_TARGET_MINSIZE) { + F->addFnAttr(Attribute::MinSize); + } + } +} + void CloneCtx::clone_all_partials() { // First decide what to clone @@ -632,29 +648,6 @@ void CloneCtx::clone_partial(Group &grp, Target &tgt) } } -void CloneCtx::add_features(Function *F, StringRef name, StringRef features, uint32_t flags) const -{ - auto attr = F->getFnAttribute("target-features"); - if (attr.isStringAttribute()) { - std::string new_features(attr.getValueAsString()); - new_features += ","; - new_features += features; - F->addFnAttr("target-features", new_features); - } - else { - F->addFnAttr("target-features", features); - } - F->addFnAttr("target-cpu", name); - if (!F->hasFnAttribute(Attribute::OptimizeNone)) { - if (flags & JL_TARGET_OPTSIZE) { - F->addFnAttr(Attribute::OptimizeForSize); - } - else if (flags & JL_TARGET_MINSIZE) { - F->addFnAttr(Attribute::MinSize); - } - } -} - uint32_t CloneCtx::get_func_id(Function *F) { auto &ref = func_ids[F]; @@ -878,7 +871,7 @@ void CloneCtx::fix_inst_uses() } template<typename T> -inline T *CloneCtx::add_comdat(T *G) const +static inline T *add_comdat(T *G) { #if defined(_OS_WINDOWS_) // add __declspec(dllexport) to everything marked for export @@ -890,7 +883,7 @@ inline T *CloneCtx::add_comdat(T *G) const return G; } -Constant *CloneCtx::get_ptrdiff32(Constant *ptr, Constant *base) const +static Constant *get_ptrdiff32(Constant *ptr, Constant *base) { if (ptr->getType()->isPointerTy()) ptr = ConstantExpr::getPtrToInt(ptr, getSizeTy(ptr->getContext())); @@ -899,7 +892,7 @@ Constant *CloneCtx::get_ptrdiff32(Constant *ptr, Constant *base) const } template<typename T> -Constant *CloneCtx::emit_offset_table(const std::vector<T*> &vars, StringRef name) const +static Constant *emit_offset_table(Module &M, const std::vector<T*> &vars, StringRef name) { auto T_int32 = Type::getInt32Ty(M.getContext()); auto T_size = getSizeTy(M.getContext()); @@ -911,7 +904,7 @@ Constant *CloneCtx::emit_offset_table(const std::vector<T*> &vars, StringRef nam name + "_base", base, &M)); } else { - base = ConstantExpr::getNullValue(T_size->getPointerTo()); + base = add_comdat(new GlobalVariable(M, T_size, true, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), name + "_base")); } auto vbase = ConstantExpr::getPtrToInt(base, T_size); std::vector<Constant*> offsets(nvars + 1); @@ -938,8 +931,8 @@ void CloneCtx::emit_metadata() } // Store back the information about exported functions. 
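// Layout of the tables produced by emit_offset_table(), reconstructed from the
// loader side (parse_sysimg in processor.cpp); a sketch for orientation, not
// literal output:
//
//     <name>_base    : an alias pointing at the first entry (or a
//                      null-initialized global when the list is empty), used as
//                      the reference address
//     <name>_offsets : i32 array [ nvars, ptr0 - base, ptr1 - base, ... ]
//
// The loader reads element 0 as the count and recovers each pointer as
// `base + offset`, so the image stores 32-bit deltas rather than absolute
// addresses that would need load-time relocations.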
- auto fbase = emit_offset_table(fvars, "jl_sysimg_fvars"); - auto gbase = emit_offset_table(gvars, "jl_sysimg_gvars"); + auto fbase = emit_offset_table(M, fvars, "jl_sysimg_fvars"); + auto gbase = emit_offset_table(M, gvars, "jl_sysimg_gvars"); uint32_t ntargets = specs.size(); SmallVector<Target*, 8> targets(ntargets); diff --git a/src/processor.cpp b/src/processor.cpp index 13b40ec4f7363..a8aca2a64ab19 100644 --- a/src/processor.cpp +++ b/src/processor.cpp @@ -17,6 +17,10 @@ #include "julia_assert.h" +#ifndef _OS_WINDOWS_ +#include <dlfcn.h> +#endif + // CPU target string is a list of strings separated by `;` each string starts with a CPU // or architecture name and followed by an optional list of features separated by `,`. // A "generic" or empty CPU name means the basic required feature set of the target ISA @@ -621,44 +625,53 @@ static inline std::vector<TargetData<n>> &get_cmdline_targets(F &&feature_cb) // Load sysimg, use the `callback` for dispatch and perform all relocations // for the selected target. template<typename F> -static inline jl_image_fptrs_t parse_sysimg(void *hdl, F &&callback) +static inline jl_image_t parse_sysimg(void *hdl, F &&callback) { - jl_image_fptrs_t res = {nullptr, 0, nullptr, 0, nullptr, nullptr}; + jl_image_t res{}; // .data base char *data_base; - if (!jl_dlsym(hdl, "jl_sysimg_gvars_base", (void**)&data_base, 0)) { - data_base = NULL; + jl_dlsym(hdl, "jl_sysimg_gvars_base", (void**)&data_base, 1); + + { + void *pgcstack_func_slot; + if (jl_dlsym(hdl, "jl_pgcstack_func_slot", &pgcstack_func_slot, 0)) { + void *pgcstack_key_slot; + jl_dlsym(hdl, "jl_pgcstack_key_slot", &pgcstack_key_slot, 1); + jl_pgcstack_getkey((jl_get_pgcstack_func**)pgcstack_func_slot, (jl_pgcstack_key_t*)pgcstack_key_slot); + + size_t *tls_offset_idx; + jl_dlsym(hdl, "jl_tls_offset", (void **)&tls_offset_idx, 1); + *tls_offset_idx = (uintptr_t)(jl_tls_offset == -1 ? 
0 : jl_tls_offset); + } } + // .text base char *text_base; - if (!jl_dlsym(hdl, "jl_sysimg_fvars_base", (void**)&text_base, 0)) { - text_base = NULL; - } - res.base = text_base; + jl_dlsym(hdl, "jl_sysimg_fvars_base", (void**)&text_base, 1); - int32_t *offsets; + const int32_t *offsets; jl_dlsym(hdl, "jl_sysimg_fvars_offsets", (void**)&offsets, 1); uint32_t nfunc = offsets[0]; - res.offsets = offsets + 1; + offsets++; - void *ids; - jl_dlsym(hdl, "jl_dispatch_target_ids", &ids, 1); + const void *ids; + jl_dlsym(hdl, "jl_dispatch_target_ids", (void**)&ids, 1); uint32_t target_idx = callback(ids); - int32_t *reloc_slots; + const int32_t *reloc_slots; jl_dlsym(hdl, "jl_dispatch_reloc_slots", (void **)&reloc_slots, 1); const uint32_t nreloc = reloc_slots[0]; reloc_slots += 1; - uint32_t *clone_idxs; - int32_t *clone_offsets; + const uint32_t *clone_idxs; + const int32_t *clone_offsets; jl_dlsym(hdl, "jl_dispatch_fvars_idxs", (void**)&clone_idxs, 1); jl_dlsym(hdl, "jl_dispatch_fvars_offsets", (void**)&clone_offsets, 1); uint32_t tag_len = clone_idxs[0]; clone_idxs += 1; assert(tag_len & jl_sysimg_tag_mask); - std::vector<const int32_t*> base_offsets = {res.offsets}; + std::vector<const int32_t*> base_offsets = {offsets}; // Find target for (uint32_t i = 0;i < target_idx;i++) { uint32_t len = jl_sysimg_val_mask & tag_len; @@ -680,20 +693,20 @@ static inline jl_image_fptrs_t parse_sysimg(void *hdl, F &&callback) if (clone_all) { // clone_all if (target_idx != 0) { - res.offsets = clone_offsets; + offsets = clone_offsets; } } else { uint32_t base_idx = clone_idxs[0]; assert(base_idx < target_idx); if (target_idx != 0) { - res.offsets = base_offsets[base_idx]; - assert(res.offsets); + offsets = base_offsets[base_idx]; + assert(offsets); } clone_idxs++; - res.nclones = tag_len; - res.clone_offsets = clone_offsets; - res.clone_idxs = clone_idxs; + res.fptrs.nclones = tag_len; + res.fptrs.clone_offsets = clone_offsets; + res.fptrs.clone_idxs = clone_idxs; } // Do relocation uint32_t reloc_i = 0; @@ -702,7 +715,7 @@ static inline jl_image_fptrs_t parse_sysimg(void *hdl, F &&callback) uint32_t idx = clone_idxs[i]; int32_t offset; if (clone_all) { - offset = res.offsets[idx]; + offset = offsets[idx]; } else if (idx & jl_sysimg_tag_mask) { idx = idx & jl_sysimg_val_mask; @@ -718,7 +731,7 @@ static inline jl_image_fptrs_t parse_sysimg(void *hdl, F &&callback) found = true; auto slot = (const void**)(data_base + reloc_slots[reloc_i * 2 + 1]); assert(slot); - *slot = offset + res.base; + *slot = offset + text_base; } else if (reloc_idx > idx) { break; @@ -728,6 +741,24 @@ static inline jl_image_fptrs_t parse_sysimg(void *hdl, F &&callback) (void)found; } + res.fptrs.base = text_base; + res.fptrs.offsets = offsets; + res.gvars_base = (uintptr_t *)data_base; + jl_dlsym(hdl, "jl_sysimg_gvars_offsets", (void **)&res.gvars_offsets, 1); + res.gvars_offsets += 1; + +#ifdef _OS_WINDOWS_ + res.base = (intptr_t)hdl; +#else + Dl_info dlinfo; + if (dladdr((void*)res.gvars_base, &dlinfo) != 0) { + res.base = (intptr_t)dlinfo.dli_fbase; + } + else { + res.base = 0; + } +#endif + return res; } diff --git a/src/processor.h b/src/processor.h index e3f3bd512c910..f76722e885a1d 100644 --- a/src/processor.h +++ b/src/processor.h @@ -155,6 +155,13 @@ typedef struct _jl_image_fptrs_t { const uint32_t *clone_idxs; } jl_image_fptrs_t; +typedef struct { + uint64_t base; + uintptr_t *gvars_base; + const int32_t *gvars_offsets; + jl_image_fptrs_t fptrs; +} jl_image_t; + /** * Initialize the processor dispatch system with sysimg `hdl` 
(also initialize the sysimg itself). * The dispatch system will find the best implementation to be used in this session. @@ -165,8 +172,8 @@ typedef struct _jl_image_fptrs_t { * * Return the data about the function pointers selected. */ -jl_image_fptrs_t jl_init_processor_sysimg(void *hdl); -jl_image_fptrs_t jl_init_processor_pkgimg(void *hdl); +jl_image_t jl_init_processor_sysimg(void *hdl); +jl_image_t jl_init_processor_pkgimg(void *hdl); // Return the name of the host CPU as a julia string. JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void); diff --git a/src/processor_arm.cpp b/src/processor_arm.cpp index 3e7b22caf00d4..0797fa4381f9d 100644 --- a/src/processor_arm.cpp +++ b/src/processor_arm.cpp @@ -1802,14 +1802,14 @@ JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void) return jl_cstr_to_string(host_cpu_name().c_str()); } -jl_image_fptrs_t jl_init_processor_sysimg(void *hdl) +jl_image_t jl_init_processor_sysimg(void *hdl) { if (!jit_targets.empty()) jl_error("JIT targets already initialized"); return parse_sysimg(hdl, sysimg_init_cb); } -jl_image_fptrs_t jl_init_processor_pkgimg(void *hdl) +jl_image_t jl_init_processor_pkgimg(void *hdl) { if (jit_targets.empty()) jl_error("JIT targets not initialized"); diff --git a/src/processor_fallback.cpp b/src/processor_fallback.cpp index c1353e1bb43b0..1aebde6dab90a 100644 --- a/src/processor_fallback.cpp +++ b/src/processor_fallback.cpp @@ -112,14 +112,14 @@ get_llvm_target_str(const TargetData<1> &data) using namespace Fallback; -jl_image_fptrs_t jl_init_processor_sysimg(void *hdl) +jl_image_t jl_init_processor_sysimg(void *hdl) { if (!jit_targets.empty()) jl_error("JIT targets already initialized"); return parse_sysimg(hdl, sysimg_init_cb); } -jl_image_fptrs_t jl_init_processor_pkgimg(void *hdl) +jl_image_t jl_init_processor_pkgimg(void *hdl) { if (jit_targets.empty()) jl_error("JIT targets not initialized"); diff --git a/src/processor_x86.cpp b/src/processor_x86.cpp index 6b3e7d5b63678..30a6ff9b3dede 100644 --- a/src/processor_x86.cpp +++ b/src/processor_x86.cpp @@ -1039,14 +1039,14 @@ JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void) return jl_cstr_to_string(host_cpu_name().c_str()); } -jl_image_fptrs_t jl_init_processor_sysimg(void *hdl) +jl_image_t jl_init_processor_sysimg(void *hdl) { if (!jit_targets.empty()) jl_error("JIT targets already initialized"); return parse_sysimg(hdl, sysimg_init_cb); } -jl_image_fptrs_t jl_init_processor_pkgimg(void *hdl) +jl_image_t jl_init_processor_pkgimg(void *hdl) { if (jit_targets.empty()) jl_error("JIT targets not initialized"); diff --git a/src/staticdata.c b/src/staticdata.c index cd9ed8b0db088..94e93f4198b4c 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -315,13 +315,6 @@ void *native_functions; // opaque jl_native_code_desc_t blob used for fetching // table of struct field addresses to rewrite during saving static htable_t field_replace; -typedef struct { - uint64_t base; - uintptr_t *gvars_base; - int32_t *gvars_offsets; - jl_image_fptrs_t fptrs; -} jl_image_t; - // array of definitions for the predefined function pointers // (reverse of fptr_to_id) // This is a manually constructed dual of the fvars array, which would be produced by codegen for Julia code, for C. 
@@ -446,7 +439,7 @@ typedef struct { static void *jl_sysimg_handle = NULL; static jl_image_t sysimage; -static inline uintptr_t *sysimg_gvars(uintptr_t *base, int32_t *offsets, size_t idx) +static inline uintptr_t *sysimg_gvars(uintptr_t *base, const int32_t *offsets, size_t idx) { return base + offsets[idx] / sizeof(base[0]); } @@ -461,32 +454,7 @@ static void jl_load_sysimg_so(void) int imaging_mode = jl_generating_output() && !jl_options.incremental; // in --build mode only use sysimg data, not precompiled native code if (!imaging_mode && jl_options.use_sysimage_native_code==JL_OPTIONS_USE_SYSIMAGE_NATIVE_CODE_YES) { - jl_dlsym(jl_sysimg_handle, "jl_sysimg_gvars_base", (void **)&sysimage.gvars_base, 1); - jl_dlsym(jl_sysimg_handle, "jl_sysimg_gvars_offsets", (void **)&sysimage.gvars_offsets, 1); - sysimage.gvars_offsets += 1; assert(sysimage.fptrs.base); - - void *pgcstack_func_slot; - jl_dlsym(jl_sysimg_handle, "jl_pgcstack_func_slot", &pgcstack_func_slot, 1); - void *pgcstack_key_slot; - jl_dlsym(jl_sysimg_handle, "jl_pgcstack_key_slot", &pgcstack_key_slot, 1); - jl_pgcstack_getkey((jl_get_pgcstack_func**)pgcstack_func_slot, (jl_pgcstack_key_t*)pgcstack_key_slot); - - size_t *tls_offset_idx; - jl_dlsym(jl_sysimg_handle, "jl_tls_offset", (void **)&tls_offset_idx, 1); - *tls_offset_idx = (uintptr_t)(jl_tls_offset == -1 ? 0 : jl_tls_offset); - -#ifdef _OS_WINDOWS_ - sysimage.base = (intptr_t)jl_sysimg_handle; -#else - Dl_info dlinfo; - if (dladdr((void*)sysimage.gvars_base, &dlinfo) != 0) { - sysimage.base = (intptr_t)dlinfo.dli_fbase; - } - else { - sysimage.base = 0; - } -#endif } else { memset(&sysimage.fptrs, 0, sizeof(sysimage.fptrs)); @@ -2693,7 +2661,7 @@ JL_DLLEXPORT void jl_set_sysimg_so(void *handle) if (jl_options.cpu_target == NULL) jl_options.cpu_target = "native"; jl_sysimg_handle = handle; - sysimage.fptrs = jl_init_processor_sysimg(handle); + sysimage = jl_init_processor_sysimg(handle); } #ifndef JL_NDEBUG @@ -3391,38 +3359,7 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j size_t *plen; jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1); - jl_image_t pkgimage; - pkgimage.fptrs = jl_init_processor_pkgimg(pkgimg_handle); - if (!jl_dlsym(pkgimg_handle, "jl_sysimg_gvars_base", (void **)&pkgimage.gvars_base, 0)) { - pkgimage.gvars_base = NULL; - } - - jl_dlsym(pkgimg_handle, "jl_sysimg_gvars_offsets", (void **)&pkgimage.gvars_offsets, 1); - pkgimage.gvars_offsets += 1; - - void *pgcstack_func_slot; - jl_dlsym(pkgimg_handle, "jl_pgcstack_func_slot", &pgcstack_func_slot, 0); - if (pgcstack_func_slot) { // Empty package images might miss these - void *pgcstack_key_slot; - jl_dlsym(pkgimg_handle, "jl_pgcstack_key_slot", &pgcstack_key_slot, 1); - jl_pgcstack_getkey((jl_get_pgcstack_func**)pgcstack_func_slot, (jl_pgcstack_key_t*)pgcstack_key_slot); - - size_t *tls_offset_idx; - jl_dlsym(pkgimg_handle, "jl_tls_offset", (void **)&tls_offset_idx, 1); - *tls_offset_idx = (uintptr_t)(jl_tls_offset == -1 ? 
0 : jl_tls_offset); - } - - #ifdef _OS_WINDOWS_ - pkgimage.base = (intptr_t)pkgimg_handle; - #else - Dl_info dlinfo; - if (dladdr((void*)pkgimage.gvars_base, &dlinfo) != 0) { - pkgimage.base = (intptr_t)dlinfo.dli_fbase; - } - else { - pkgimage.base = 0; - } - #endif + jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle); jl_value_t* mod = jl_restore_incremental_from_buf(pkgimg_data, &pkgimage, *plen, depmods, completeinfo); From 2c7375cbb0c5ab7d331829d7a55d97881cd33255 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Thu, 5 Jan 2023 19:33:30 -0500 Subject: [PATCH 05/34] Annotate information before running optimization --- src/aotcompile.cpp | 12 + src/llvm-multiversioning.cpp | 667 +++++++++++++++++------------------ 2 files changed, 344 insertions(+), 335 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 2c9edecae7df7..527b793f142c8 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -512,6 +512,7 @@ static void injectCRTAlias(Module &M, StringRef name, StringRef alias, FunctionT builder.CreateRet(val); } +void multiversioning_preannotate(Module &M); // takes the running content that has collected in the shadow module and dump it to disk // this builds the object file portion of the sysimage files for fast startup @@ -589,6 +590,17 @@ void jl_dump_native_impl(void *native_code, // add metadata information if (imaging_mode) { + multiversioning_preannotate(*dataM); + { + DenseSet<GlobalValue *> fvars(data->jl_sysimg_fvars.begin(), data->jl_sysimg_fvars.end()); + for (auto &F : *dataM) { + if (F.hasFnAttribute("julia.mv.reloc") || F.hasFnAttribute("julia.mv.fvar")) { + if (fvars.insert(&F).second) { + data->jl_sysimg_fvars.push_back(&F); + } + } + } + } emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_sysimg_gvars", T_psize); emit_offset_table(*dataM, data->jl_sysimg_fvars, "jl_sysimg_fvars", T_psize); diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index 3325cb47147a6..1a1dc297b2702 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -10,6 +10,7 @@ #include <llvm-c/Types.h> #include <llvm/Pass.h> +#include <llvm/ADT/BitVector.h> #include <llvm/ADT/Statistic.h> #include <llvm/IR/Module.h> #include <llvm/IR/LegacyPassManager.h> @@ -217,25 +218,211 @@ void ConstantUses<U>::forward() } } +static bool is_vector(FunctionType *ty) +{ + if (ty->getReturnType()->isVectorTy()) + return true; + for (auto arg: ty->params()) { + if (arg->isVectorTy()) { + return true; + } + } + return false; +} + +static uint32_t collect_func_info(Function &F, bool &has_veccall) +{ + DominatorTree DT(F); + LoopInfo LI(DT); + uint32_t flag = 0; + if (!LI.empty()) + flag |= JL_TARGET_CLONE_LOOP; + if (is_vector(F.getFunctionType())) { + flag |= JL_TARGET_CLONE_SIMD; + has_veccall = true; + } + for (auto &bb: F) { + for (auto &I: bb) { + if (auto call = dyn_cast<CallInst>(&I)) { + if (is_vector(call->getFunctionType())) { + has_veccall = true; + flag |= JL_TARGET_CLONE_SIMD; + } + if (auto callee = call->getCalledFunction()) { + auto name = callee->getName(); + if (name.startswith("llvm.muladd.") || name.startswith("llvm.fma.")) { + flag |= JL_TARGET_CLONE_MATH; + } + else if (name.startswith("julia.cpu.")) { + if (name.startswith("julia.cpu.have_fma.")) { + // for some platforms we know they always do (or don't) support + // FMA. in those cases we don't need to clone the function. 
+ if (!always_have_fma(*callee).hasValue()) + flag |= JL_TARGET_CLONE_CPU; + } else { + flag |= JL_TARGET_CLONE_CPU; + } + } + } + } + else if (auto store = dyn_cast<StoreInst>(&I)) { + if (store->getValueOperand()->getType()->isVectorTy()) { + flag |= JL_TARGET_CLONE_SIMD; + } + } + else if (I.getType()->isVectorTy()) { + flag |= JL_TARGET_CLONE_SIMD; + } + if (auto mathOp = dyn_cast<FPMathOperator>(&I)) { + if (mathOp->getFastMathFlags().any()) { + flag |= JL_TARGET_CLONE_MATH; + } + } + + for (size_t i = 0; i < I.getNumOperands(); i++) { + if(I.getOperand(i)->getType()->isHalfTy()){ + flag |= JL_TARGET_CLONE_FLOAT16; + } + // Check for BFloat16 when they are added to julia can be done here + } + if (has_veccall && (flag & JL_TARGET_CLONE_SIMD) && (flag & JL_TARGET_CLONE_MATH)) { + return flag; + } + } + } + return flag; +} + +static void annotate_module_clones(Module &M) { + CallGraph CG(M); + std::vector<Function *> orig_funcs; + for (auto &F: M) { + if (F.isDeclaration()) + continue; + orig_funcs.push_back(&F); + } + bool has_veccall = false; + auto specs = jl_get_llvm_clone_targets(); + std::vector<APInt> clones(orig_funcs.size(), APInt(specs.size(), 0)); + BitVector subtarget_cloned(orig_funcs.size()); + bool check_relocs = false; + + std::vector<unsigned> func_infos(orig_funcs.size()); + for (unsigned i = 0; i < orig_funcs.size(); i++) { + func_infos[i] = collect_func_info(*orig_funcs[i], has_veccall); + } + for (unsigned i = 1; i < specs.size(); i++) { + if (specs[i].flags & JL_TARGET_CLONE_ALL) { + for (unsigned j = 0; j < orig_funcs.size(); j++) { + clones[j].setBit(i); + } + check_relocs = true; + } else { + unsigned flag = specs[i].flags & clone_mask; + std::set<Function*> sets[2]; + for (unsigned j = 0; j < orig_funcs.size(); j++) { + if (!(func_infos[j] & flag)) { + continue; + } + sets[0].insert(orig_funcs[j]); + } + std::set<Function*> all_origs(sets[0]); + auto *cur_set = &sets[0]; + auto *next_set = &sets[1]; + // Reduce dispatch by expand the cloning set to functions that are directly called by + // and calling cloned functions. 
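// Worked example for the seeding above and the call-graph expansion below
// (hypothetical flags, for illustration only): if target i requests
// JL_TARGET_CLONE_SIMD | JL_TARGET_CLONE_MATH, then any function whose collected
// info has either bit set seeds the clone set via
//
//     (func_infos[j] & flag) != 0
//
// and the while-loop that follows grows the set with functions that are called
// by already-selected functions and themselves call selected functions, so that
// calls between clones stay inside one target instead of going through a
// dispatch slot.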
+ while (!cur_set->empty()) { + for (auto orig_f: *cur_set) { + // Use the uncloned function since it's already in the call graph + auto node = CG[orig_f]; + for (const auto &I: *node) { + auto child_node = I.second; + auto orig_child_f = child_node->getFunction(); + if (!orig_child_f) + continue; + // Already cloned + if (all_origs.count(orig_child_f)) + continue; + bool calling_clone = false; + for (const auto &I2: *child_node) { + auto orig_child_f2 = I2.second->getFunction(); + if (!orig_child_f2) + continue; + if (all_origs.count(orig_child_f2)) { + calling_clone = true; + break; + } + } + if (!calling_clone) + continue; + next_set->insert(orig_child_f); + all_origs.insert(orig_child_f); + } + } + std::swap(cur_set, next_set); + next_set->clear(); + } + for (unsigned j = 0; j < orig_funcs.size(); j++) { + if (all_origs.count(orig_funcs[j])) { + clones[j].setBit(i); + subtarget_cloned.set(j); + } + } + } + } + if (check_relocs) { + for (unsigned i = 0; i < orig_funcs.size(); i++) { + auto &F = *orig_funcs[i]; + if (subtarget_cloned[i] && !ConstantUses<Instruction>(orig_funcs[i], M).done()) { + F.addFnAttr("julia.mv.reloc", ""); + } else { + auto uses = ConstantUses<GlobalValue>(orig_funcs[i], M); + if (!uses.done()) { + bool slot = false; + for (; !uses.done(); uses.next()) { + if (isa<GlobalAlias>(uses.get_info().val)) { + slot = true; + break; + } + } + if (slot) { + F.addFnAttr("julia.mv.reloc", ""); + } else { + F.addFnAttr("julia.mv.fvar", ""); + } + } + } + } + } + SmallString<128> cloneset; + for (unsigned i = 0; i < orig_funcs.size(); i++) { + if (!clones[i].isZero()) { + auto &F = *orig_funcs[i]; + cloneset.clear(); + clones[i].toStringUnsigned(cloneset, 16); + F.addFnAttr("julia.mv.clones", cloneset); + } + } + if (has_veccall) { + M.addModuleFlag(Module::Max, "julia.mv.veccall", 1); + } +} + struct CloneCtx { struct Target { int idx; - uint32_t flags; std::unique_ptr<ValueToValueMapTy> vmap; // ValueToValueMapTy is not movable.... 
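// Summary of the annotations that annotate_module_clones() (above) leaves for
// the cloning machinery below; the names are from the patch, the example mask
// is illustrative:
//
//   "julia.mv.clones"  -- per-function bitmask over clone-target indices,
//                         serialized as a hex string. With three targets, a
//                         function cloned for targets 1 and 2 carries mask
//                         0b110, i.e. the attribute value "6".
//   "julia.mv.reloc"   -- the function needs a relocation slot: roughly, it was
//                         cloned for a partial target and has its address taken
//                         in code, or it is referenced through a GlobalAlias.
//   "julia.mv.fvar"    -- the function is referenced from a global initializer
//                         and only needs an entry in the fvars table.
//   "julia.mv.veccall" -- module flag recording that vector calls were seen
//                         (replaces the old has_veccall bookkeeping).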
- Target(int idx, const jl_target_spec_t &spec) : + explicit Target(int idx) : idx(idx), - flags(spec.flags), vmap(new ValueToValueMapTy) { } }; struct Group : Target { std::vector<Target> clones; - std::set<uint32_t> clone_fs; - Group(int base, const jl_target_spec_t &spec) : - Target(base, spec), - clones{}, - clone_fs{} + explicit Group(int base) : + Target(base), + clones{} {} Function *base_func(Function *orig_f) const { @@ -243,34 +430,38 @@ struct CloneCtx { return orig_f; return cast<Function>(vmap->lookup(orig_f)); } + + bool has_subtarget_clone(Function *orig_f) const + { + auto base = base_func(orig_f); + for (auto &clone: clones) { + if (map_get(*clone.vmap, base)) + return true; + } + return false; + } }; - CloneCtx(Module &M, function_ref<LoopInfo&(Function&)> GetLI, function_ref<CallGraph&()> GetCG, bool allow_bad_fvars); - void clone_bases(); - void collect_func_infos(); - void clone_all_partials(); + CloneCtx(Module &M, bool allow_bad_fvars); + void prepare_slots(); + void clone_decls(); + void clone_bodies(); void fix_gv_uses(); void fix_inst_uses(); void emit_metadata(); private: void prepare_vmap(ValueToValueMapTy &vmap); - void clone_function(Function *F, Function *new_f, ValueToValueMapTy &vmap); - uint32_t collect_func_info(Function &F); - void check_partial(Group &grp, Target &tgt); void clone_partial(Group &grp, Target &tgt); - uint32_t get_func_id(Function *F); - template<typename Stack> - Constant *rewrite_gv_init(const Stack& stack); - std::pair<uint32_t,GlobalVariable*> get_reloc_slot(Function *F); + uint32_t get_func_id(Function *F) const; + std::pair<uint32_t,GlobalVariable*> get_reloc_slot(Function *F) const; void rewrite_alias(GlobalAlias *alias, Function* F); MDNode *tbaa_const; std::vector<jl_target_spec_t> specs; std::vector<Group> groups{}; + std::vector<Target *> linearized; std::vector<Function*> fvars; std::vector<Constant*> gvars; Module &M; - function_ref<LoopInfo&(Function&)> GetLI; - function_ref<CallGraph&()> GetCG; // Map from original function to one based index in `fvars` std::map<const Function*,uint32_t> func_ids{}; @@ -281,7 +472,7 @@ struct CloneCtx { std::vector<std::pair<Constant*,uint32_t>> gv_relocs{}; // Mapping from function id (i.e. 0-based index in `fvars`) to GVs to be initialized. std::map<uint32_t,GlobalVariable*> const_relocs; - bool has_veccall{false}; + std::map<Function *, GlobalVariable*> extern_relocs; bool allow_bad_fvars{false}; }; @@ -322,36 +513,36 @@ static inline std::vector<T*> consume_gv(Module &M, const char *name, bool allow } // Collect basic information about targets and functions. 
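// How the rewritten constructor below organizes targets (illustrative example
// with hypothetical specs): given specs = [0: default, 1: clone_all, 2: partial
// with base 0, 3: partial with base 1], it builds
//
//     groups           = { Group(0){ clones = [Target(2)] },
//                          Group(1){ clones = [Target(3)] } }
//     linearized[0..3] = { &groups[0], &groups[1],
//                          &groups[0].clones[0], &groups[1].clones[0] }
//
// i.e. the default target and each clone_all spec get a Group, each partial spec
// becomes a Target attached to its base's Group, and `linearized` maps a spec
// index straight back to its Target.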
-CloneCtx::CloneCtx(Module &M, function_ref<LoopInfo&(Function&)> GetLI, function_ref<CallGraph&()> GetCG, bool allow_bad_fvars) +CloneCtx::CloneCtx(Module &M, bool allow_bad_fvars) : tbaa_const(tbaa_make_child_with_context(M.getContext(), "jtbaa_const", nullptr, true).first), specs(jl_get_llvm_clone_targets()), fvars(consume_gv<Function>(M, "jl_sysimg_fvars", allow_bad_fvars)), gvars(consume_gv<Constant>(M, "jl_sysimg_gvars", false)), M(M), - GetLI(GetLI), - GetCG(GetCG), allow_bad_fvars(allow_bad_fvars) { - groups.emplace_back(0, specs[0]); + groups.emplace_back(0); + linearized.resize(specs.size()); + linearized[0] = &groups[0]; + std::vector<unsigned> group_ids(specs.size(), 0); uint32_t ntargets = specs.size(); for (uint32_t i = 1; i < ntargets; i++) { auto &spec = specs[i]; if (spec.flags & JL_TARGET_CLONE_ALL) { - groups.emplace_back(i, spec); + group_ids[i] = groups.size(); + groups.emplace_back(i); } else { - auto base = spec.base; - bool found = false; - for (auto &grp: groups) { - if (grp.idx == base) { - found = true; - grp.clones.emplace_back(i, spec); - break; - } - } - (void)found; + assert(0 <= spec.base && (unsigned) spec.base < i); + group_ids[i] = group_ids[spec.base]; + groups[group_ids[i]].clones.emplace_back(i); } } + for (auto &grp: groups) { + for (auto &tgt: grp.clones) + linearized[tgt.idx] = &tgt; + linearized[grp.idx] = &grp; + } uint32_t nfvars = fvars.size(); for (uint32_t i = 0; i < nfvars; i++) func_ids[fvars[i]] = i + 1; @@ -376,128 +567,64 @@ void CloneCtx::prepare_vmap(ValueToValueMapTy &vmap) } } -void CloneCtx::clone_function(Function *F, Function *new_f, ValueToValueMapTy &vmap) -{ - Function::arg_iterator DestI = new_f->arg_begin(); - for (Function::const_arg_iterator J = F->arg_begin(); J != F->arg_end(); ++J) { - DestI->setName(J->getName()); - vmap[&*J] = &*DestI++; - } - SmallVector<ReturnInst*,8> Returns; -#if JL_LLVM_VERSION >= 130000 - // We are cloning into the same module - CloneFunctionInto(new_f, F, vmap, CloneFunctionChangeType::GlobalChanges, Returns); -#else - CloneFunctionInto(new_f, F, vmap, true, Returns); -#endif -} - -// Clone all clone_all targets. Makes sure that the base targets are all available. -void CloneCtx::clone_bases() +void CloneCtx::prepare_slots() { - if (groups.size() == 1) - return; - uint32_t ngrps = groups.size(); - for (uint32_t gid = 1; gid < ngrps; gid++) { - auto &grp = groups[gid]; - auto suffix = ".clone_" + std::to_string(grp.idx); - auto &vmap = *grp.vmap; - // Fill in old->new mapping. We need to do this before cloning the function so that - // the intra target calls are automatically fixed up on cloning. 
- for (auto F: orig_funcs) { - Function *new_f = Function::Create(F->getFunctionType(), F->getLinkage(), - F->getName() + suffix, &M); - new_f->copyAttributesFrom(F); - vmap[F] = new_f; - } - prepare_vmap(vmap); - for (auto F: orig_funcs) { - clone_function(F, cast<Function>(vmap.lookup(F)), vmap); - } - } -} - -static bool is_vector(FunctionType *ty) -{ - if (ty->getReturnType()->isVectorTy()) - return true; - for (auto arg: ty->params()) { - if (arg->isVectorTy()) { - return true; + for (auto &F : orig_funcs) { + if (F->hasFnAttribute("julia.mv.reloc")) { + assert(F->hasFnAttribute("julia.mv.clones")); + if (F->isDeclaration()) { + auto GV = new GlobalVariable(M, F->getType(), false, GlobalValue::ExternalLinkage, nullptr, F->getName() + ".reloc_slot"); + GV->setVisibility(GlobalValue::HiddenVisibility); + extern_relocs[F] = GV; + } else { + auto id = get_func_id(F); + auto GV = new GlobalVariable(M, F->getType(), false, GlobalValue::InternalLinkage, Constant::getNullValue(F->getType()), F->getName() + ".reloc_slot"); + GV->setVisibility(GlobalValue::HiddenVisibility); + const_relocs[id] = GV; + } } } - return false; } -uint32_t CloneCtx::collect_func_info(Function &F) +void CloneCtx::clone_decls() { - uint32_t flag = 0; - if (!GetLI(F).empty()) - flag |= JL_TARGET_CLONE_LOOP; - if (is_vector(F.getFunctionType())) { - flag |= JL_TARGET_CLONE_SIMD; - has_veccall = true; + std::vector<std::string> suffixes(specs.size()); + for (unsigned i = 1; i < specs.size(); i++) { + suffixes[i] = "." + std::to_string(i); } - for (auto &bb: F) { - for (auto &I: bb) { - if (auto call = dyn_cast<CallInst>(&I)) { - if (is_vector(call->getFunctionType())) { - has_veccall = true; - flag |= JL_TARGET_CLONE_SIMD; - } - if (auto callee = call->getCalledFunction()) { - auto name = callee->getName(); - if (name.startswith("llvm.muladd.") || name.startswith("llvm.fma.")) { - flag |= JL_TARGET_CLONE_MATH; - } - else if (name.startswith("julia.cpu.")) { - if (name.startswith("julia.cpu.have_fma.")) { - // for some platforms we know they always do (or don't) support - // FMA. in those cases we don't need to clone the function. 
- if (!always_have_fma(*callee).hasValue()) - flag |= JL_TARGET_CLONE_CPU; - } else { - flag |= JL_TARGET_CLONE_CPU; - } - } - } - } - else if (auto store = dyn_cast<StoreInst>(&I)) { - if (store->getValueOperand()->getType()->isVectorTy()) { - flag |= JL_TARGET_CLONE_SIMD; - } - } - else if (I.getType()->isVectorTy()) { - flag |= JL_TARGET_CLONE_SIMD; - } - if (auto mathOp = dyn_cast<FPMathOperator>(&I)) { - if (mathOp->getFastMathFlags().any()) { - flag |= JL_TARGET_CLONE_MATH; - } - } - - for (size_t i = 0; i < I.getNumOperands(); i++) { - if(I.getOperand(i)->getType()->isHalfTy()){ - flag |= JL_TARGET_CLONE_FLOAT16; - } - // Check for BFloat16 when they are added to julia can be done here - } - if (has_veccall && (flag & JL_TARGET_CLONE_SIMD) && (flag & JL_TARGET_CLONE_MATH) && - (flag & JL_TARGET_CLONE_CPU) && (flag & JL_TARGET_CLONE_FLOAT16)) { - return flag; + for (auto &F : orig_funcs) { + if (!F->hasFnAttribute("julia.mv.clones")) + continue; + APInt clones(specs.size(), F->getFnAttribute("julia.mv.clones").getValueAsString(), 16); + for (unsigned i = 1; i < specs.size(); i++) { + if (!clones[i]) { + continue; } + auto new_F = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName() + suffixes[i], &M); + new_F->copyAttributesFrom(F); + new_F->setVisibility(F->getVisibility()); + auto base_func = F; + if (specs[i].flags & JL_TARGET_CLONE_ALL) + base_func = static_cast<Group*>(linearized[specs[i].base])->base_func(F); + (*linearized[i]->vmap)[base_func] = new_F; } } - return flag; } -void CloneCtx::collect_func_infos() +static void clone_function(Function *F, Function *new_f, ValueToValueMapTy &vmap) { - uint32_t nfuncs = orig_funcs.size(); - func_infos.resize(nfuncs); - for (uint32_t i = 0; i < nfuncs; i++) { - func_infos[i] = collect_func_info(*orig_funcs[i]); + Function::arg_iterator DestI = new_f->arg_begin(); + for (Function::const_arg_iterator J = F->arg_begin(); J != F->arg_end(); ++J) { + DestI->setName(J->getName()); + vmap[&*J] = &*DestI++; } + SmallVector<ReturnInst*,8> Returns; +#if JL_LLVM_VERSION >= 130000 + // We are cloning into the same module + CloneFunctionInto(new_f, F, vmap, CloneFunctionChangeType::GlobalChanges, Returns); +#else + CloneFunctionInto(new_f, F, vmap, true, Returns); +#endif } static void add_features(Function *F, StringRef name, StringRef features, uint32_t flags) @@ -523,149 +650,48 @@ static void add_features(Function *F, StringRef name, StringRef features, uint32 } } -void CloneCtx::clone_all_partials() -{ - // First decide what to clone - // Do this before actually cloning the functions - // so that the call graph is easier to understand - for (auto &grp: groups) { - for (auto &tgt: grp.clones) { - check_partial(grp, tgt); - } - } - for (auto &grp: groups) { - for (auto &tgt: grp.clones) - clone_partial(grp, tgt); - // Also set feature strings for base target functions - // now that all the actual cloning is done. - auto &base_spec = specs[grp.idx]; - for (auto orig_f: orig_funcs) { - add_features(grp.base_func(orig_f), base_spec.cpu_name, - base_spec.cpu_features, base_spec.flags); - } - } - func_infos.clear(); // We don't need this anymore -} - -void CloneCtx::check_partial(Group &grp, Target &tgt) +void CloneCtx::clone_bodies() { - auto flag = specs[tgt.idx].flags & clone_mask; - auto suffix = ".clone_" + std::to_string(tgt.idx); - auto &vmap = *tgt.vmap; - uint32_t nfuncs = func_infos.size(); - - std::set<Function*> all_origs; - // Use a simple heuristic to decide which function we need to clone. 
- for (uint32_t i = 0; i < nfuncs; i++) { - if (!(func_infos[i] & flag)) - continue; - auto orig_f = orig_funcs[i]; - // Fill in old->new mapping. We need to do this before cloning the function so that - // the intra target calls are automatically fixed up on cloning. - auto F = grp.base_func(orig_f); - Function *new_f = Function::Create(F->getFunctionType(), F->getLinkage(), - F->getName() + suffix, &M); - new_f->copyAttributesFrom(F); - vmap[F] = new_f; - if (groups.size() == 1) - cloned.insert(orig_f); - grp.clone_fs.insert(i); - all_origs.insert(orig_f); - } - std::set<Function*> sets[2]{all_origs, std::set<Function*>{}}; - auto *cur_set = &sets[0]; - auto *next_set = &sets[1]; - // Reduce dispatch by expand the cloning set to functions that are directly called by - // and calling cloned functions. - auto &graph = GetCG(); - while (!cur_set->empty()) { - for (auto orig_f: *cur_set) { - // Use the uncloned function since it's already in the call graph - auto node = graph[orig_f]; - for (const auto &I: *node) { - auto child_node = I.second; - auto orig_child_f = child_node->getFunction(); - if (!orig_child_f) - continue; - // Already cloned - if (all_origs.count(orig_child_f)) - continue; - bool calling_clone = false; - for (const auto &I2: *child_node) { - auto orig_child_f2 = I2.second->getFunction(); - if (!orig_child_f2) - continue; - if (all_origs.count(orig_child_f2)) { - calling_clone = true; - break; + for (auto F : orig_funcs) { + for (unsigned i = 0; i < groups.size(); i++) { + Function *group_F = F; + if (i != 0) { + group_F = groups[i].base_func(F); + if (!F->isDeclaration()) { + clone_function(F, group_F, *groups[i].vmap); + } + } + for (auto &target : groups[i].clones) { + prepare_vmap(*target.vmap); + auto target_F = cast_or_null<Function>(map_get(*target.vmap, F)); + if (target_F) { + if (!F->isDeclaration()) { + clone_function(group_F, target_F, *target.vmap); } + add_features(target_F, specs[target.idx].cpu_name, + specs[target.idx].cpu_features, specs[target.idx].flags); + target_F->addFnAttr("julia.mv.clone", std::to_string(i)); } - if (!calling_clone) - continue; - next_set->insert(orig_child_f); - all_origs.insert(orig_child_f); - auto child_f = grp.base_func(orig_child_f); - Function *new_f = Function::Create(child_f->getFunctionType(), - child_f->getLinkage(), - child_f->getName() + suffix, &M); - new_f->copyAttributesFrom(child_f); - vmap[child_f] = new_f; } - } - std::swap(cur_set, next_set); - next_set->clear(); - } - for (uint32_t i = 0; i < nfuncs; i++) { - // Only need to handle expanded functions - if (func_infos[i] & flag) - continue; - auto orig_f = orig_funcs[i]; - if (all_origs.count(orig_f)) { - if (groups.size() == 1) - cloned.insert(orig_f); - grp.clone_fs.insert(i); - } - } -} - -void CloneCtx::clone_partial(Group &grp, Target &tgt) -{ - auto &spec = specs[tgt.idx]; - auto &vmap = *tgt.vmap; - uint32_t nfuncs = orig_funcs.size(); - prepare_vmap(vmap); - for (uint32_t i = 0; i < nfuncs; i++) { - auto orig_f = orig_funcs[i]; - auto F = grp.base_func(orig_f); - if (auto new_v = map_get(vmap, F)) { - auto new_f = cast<Function>(new_v); - assert(new_f != F); - clone_function(F, new_f, vmap); - // We can set the feature strings now since no one is going to - // clone these functions again. - add_features(new_f, spec.cpu_name, spec.cpu_features, spec.flags); + if (i != 0) { + //TODO should we also do this for target 0? 
+ add_features(group_F, specs[groups[i].idx].cpu_name, + specs[groups[i].idx].cpu_features, specs[groups[i].idx].flags); + } + group_F->addFnAttr("julia.mv.clone", std::to_string(i)); } } } -uint32_t CloneCtx::get_func_id(Function *F) +uint32_t CloneCtx::get_func_id(Function *F) const { - auto &ref = func_ids[F]; - if (!ref) { - if (allow_bad_fvars && F->isDeclaration()) { - // This should never happen in regular use, but can happen if - // bugpoint deletes the function. Just do something here to - // allow bugpoint to proceed. - return (uint32_t)-1; - } - fvars.push_back(F); - ref = fvars.size(); - } - return ref - 1; + auto ref = func_ids.find(F); + assert(ref != func_ids.end() && "Requesting id of non-fvar!"); + return ref->second - 1; } template<typename Stack> -Constant *CloneCtx::rewrite_gv_init(const Stack& stack) +static Constant *rewrite_gv_init(const Stack& stack) { // Null initialize so that LLVM put it in the correct section. SmallVector<Constant*, 8> args; @@ -785,16 +811,18 @@ void CloneCtx::fix_gv_uses() } } -std::pair<uint32_t,GlobalVariable*> CloneCtx::get_reloc_slot(Function *F) +std::pair<uint32_t,GlobalVariable*> CloneCtx::get_reloc_slot(Function *F) const { - // Null initialize so that LLVM put it in the correct section. - auto id = get_func_id(F); - auto &slot = const_relocs[id]; - if (!slot) - slot = new GlobalVariable(M, F->getType(), false, GlobalVariable::InternalLinkage, - ConstantPointerNull::get(F->getType()), - F->getName() + ".reloc_slot"); - return std::make_pair(id, slot); + if (F->isDeclaration()) { + auto extern_decl = extern_relocs.find(F); + assert(extern_decl != extern_relocs.end() && "Missing extern relocation slot!"); + return {(uint32_t)-1, extern_decl->second}; + } else { + auto id = get_func_id(F); + auto slot = const_relocs.find(id); + assert(slot != const_relocs.end() && "Missing relocation slot!"); + return {id, slot->second}; + } } template<typename Stack> @@ -851,17 +879,17 @@ void CloneCtx::fix_inst_uses() { uint32_t nfuncs = orig_funcs.size(); for (auto &grp: groups) { - auto suffix = ".clone_" + std::to_string(grp.idx); for (uint32_t i = 0; i < nfuncs; i++) { - if (!grp.clone_fs.count(i)) - continue; auto orig_f = orig_funcs[i]; + if (!grp.has_subtarget_clone(orig_f)) + continue; auto F = grp.base_func(orig_f); + auto grpidx = std::to_string(grp.idx); replaceUsesWithLoad(*F, [&](Instruction &I) -> GlobalVariable * { uint32_t id; GlobalVariable *slot; auto use_f = I.getFunction(); - if (!use_f->getName().endswith(suffix)) + if (!use_f->hasFnAttribute("julia.mv.clone") || use_f->getFnAttribute("julia.mv.clone").getValueAsString() != grpidx) return nullptr; std::tie(id, slot) = get_reloc_slot(orig_f); return slot; @@ -935,17 +963,6 @@ void CloneCtx::emit_metadata() auto gbase = emit_offset_table(M, gvars, "jl_sysimg_gvars"); uint32_t ntargets = specs.size(); - SmallVector<Target*, 8> targets(ntargets); - for (auto &grp: groups) { - targets[grp.idx] = &grp; - for (auto &tgt: grp.clones) { - targets[tgt.idx] = &tgt; - } - } - - if (has_veccall) { - M.addModuleFlag(Module::Max, "julia.mv.veccall", 1); - } // Generate `jl_dispatch_reloc_slots` std::set<uint32_t> shared_relocs; @@ -989,7 +1006,7 @@ void CloneCtx::emit_metadata() std::vector<uint32_t> idxs; std::vector<Constant*> offsets; for (uint32_t i = 0; i < ntargets; i++) { - auto tgt = targets[i]; + auto tgt = linearized[i]; auto &spec = specs[i]; uint32_t len_idx = idxs.size(); idxs.push_back(0); // We will fill in the real value later. 
@@ -1009,7 +1026,7 @@ void CloneCtx::emit_metadata() } else { auto baseidx = spec.base; - auto grp = static_cast<Group*>(targets[baseidx]); + auto grp = static_cast<Group*>(linearized[baseidx]); idxs.push_back(baseidx); for (uint32_t j = 0; j < nfvars; j++) { auto base_f = grp->base_func(fvars[j]); @@ -1040,7 +1057,7 @@ void CloneCtx::emit_metadata() } } -static bool runMultiVersioning(Module &M, function_ref<LoopInfo&(Function&)> GetLI, function_ref<CallGraph&()> GetCG, bool allow_bad_fvars) +static bool runMultiVersioning(Module &M, bool allow_bad_fvars) { // Group targets and identify cloning bases. // Also initialize function info maps (we'll update these maps as we go) @@ -1059,19 +1076,13 @@ static bool runMultiVersioning(Module &M, function_ref<LoopInfo&(Function&)> Get !gvars || !gvars->hasInitializer() || !isa<ConstantArray>(gvars->getInitializer()))) return false; - CloneCtx clone(M, GetLI, GetCG, allow_bad_fvars); + CloneCtx clone(M, allow_bad_fvars); + + clone.prepare_slots(); - // Collect a list of original functions and clone base functions - clone.clone_bases(); + clone.clone_decls(); - // Collect function info (type of instruction used) - clone.collect_func_infos(); - - // If any partially cloned target exist decide which functions to clone for these targets. - // Clone functions for each group and collect a list of them. - // We can also add feature strings for cloned functions - // now that no additional cloning needs to be done. - clone.clone_all_partials(); + clone.clone_bodies(); // Scan **ALL** cloned functions (including full cloning for base target) // for global variables initialization use. @@ -1108,24 +1119,12 @@ struct MultiVersioningLegacy: public ModulePass { private: bool runOnModule(Module &M) override; - void getAnalysisUsage(AnalysisUsage &AU) const override - { - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<CallGraphWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - } bool allow_bad_fvars; }; bool MultiVersioningLegacy::runOnModule(Module &M) { - auto GetLI = [this](Function &F) -> LoopInfo & { - return getAnalysis<LoopInfoWrapperPass>(F).getLoopInfo(); - }; - auto GetCG = [this]() -> CallGraph & { - return getAnalysis<CallGraphWrapperPass>().getCallGraph(); - }; - return runMultiVersioning(M, GetLI, GetCG, allow_bad_fvars); + return runMultiVersioning(M, allow_bad_fvars); } @@ -1136,6 +1135,11 @@ static RegisterPass<MultiVersioningLegacy> X("JuliaMultiVersioning", "JuliaMulti } // anonymous namespace +void multiversioning_preannotate(Module &M) +{ + annotate_module_clones(M); +} + void replaceUsesWithLoad(Function &F, function_ref<GlobalVariable *(Instruction &I)> should_replace, MDNode *tbaa_const) { bool changed; do { @@ -1162,14 +1166,7 @@ void replaceUsesWithLoad(Function &F, function_ref<GlobalVariable *(Instruction PreservedAnalyses MultiVersioning::run(Module &M, ModuleAnalysisManager &AM) { - auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); - auto GetLI = [&](Function &F) -> LoopInfo & { - return FAM.getResult<LoopAnalysis>(F); - }; - auto GetCG = [&]() -> CallGraph & { - return AM.getResult<CallGraphAnalysis>(M); - }; - if (runMultiVersioning(M, GetLI, GetCG, external_use)) { + if (runMultiVersioning(M, external_use)) { auto preserved = PreservedAnalyses::allInSet<CFGAnalyses>(); preserved.preserve<LoopAnalysis>(); return preserved; From 6ab1862106bc7f48afa54bac792cb7909df35cd7 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Thu, 5 Jan 2023 22:17:52 -0500 
Subject: [PATCH 06/34] Table-based dlsym --- src/aotcompile.cpp | 112 ++++++++++++++++++++++++++++++++--- src/llvm-multiversioning.cpp | 68 ++++++++++----------- src/llvm-ptls.cpp | 19 +----- src/processor.cpp | 72 ++++++++++++---------- src/processor.h | 32 ++++++++++ 5 files changed, 214 insertions(+), 89 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 527b793f142c8..5873c1ca56477 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -424,7 +424,8 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm //Safe b/c context is locked by params GlobalVariable *G = cast<GlobalVariable>(clone.getModuleUnlocked()->getNamedValue(global)); G->setInitializer(ConstantPointerNull::get(cast<PointerType>(G->getValueType()))); - G->setLinkage(GlobalVariable::InternalLinkage); + G->setLinkage(GlobalValue::ExternalLinkage); + G->setVisibility(GlobalValue::HiddenVisibility); data->jl_sysimg_gvars.push_back(G); } CreateNativeGlobals += gvars.size(); @@ -446,9 +447,9 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm //Safe b/c context is locked by params for (GlobalObject &G : clone.getModuleUnlocked()->global_objects()) { if (!G.isDeclaration()) { - G.setLinkage(Function::InternalLinkage); + G.setLinkage(GlobalValue::ExternalLinkage); + G.setVisibility(GlobalValue::HiddenVisibility); makeSafeName(G); - addComdat(&G); #if defined(_OS_WINDOWS_) && defined(_CPU_X86_64_) // Add unwind exception personalities to functions to handle async exceptions if (Function *F = dyn_cast<Function>(&G)) @@ -514,6 +515,63 @@ static void injectCRTAlias(Module &M, StringRef name, StringRef alias, FunctionT void multiversioning_preannotate(Module &M); +static GlobalVariable *emit_shard_table(Module &M, Type *T_size, Type *T_psize, unsigned threads) { + SmallVector<Constant *, 0> tables(sizeof(jl_image_shard_t) / sizeof(void *) * threads); + for (unsigned i = 0; i < threads; i++) { + auto suffix = "_" + std::to_string(i); + auto create_gv = [&](StringRef name, bool constant) { + auto gv = new GlobalVariable(M, T_size, constant, + GlobalValue::ExternalLinkage, nullptr, name + suffix); + gv->setVisibility(GlobalValue::HiddenVisibility); + return gv; + }; + auto table = tables.data() + i * sizeof(jl_image_shard_t) / sizeof(void *); + table[offsetof(jl_image_shard_t, fvar_base) / sizeof(void*)] = create_gv("jl_fvar_base", false); + table[offsetof(jl_image_shard_t, fvar_offsets) / sizeof(void*)] = create_gv("jl_fvar_offsets", true); + table[offsetof(jl_image_shard_t, fvar_idxs) / sizeof(void*)] = create_gv("jl_fvar_idxs", true); + table[offsetof(jl_image_shard_t, gvar_base) / sizeof(void*)] = create_gv("jl_gvar_base", false); + table[offsetof(jl_image_shard_t, gvar_offsets) / sizeof(void*)] = create_gv("jl_gvar_offsets", true); + table[offsetof(jl_image_shard_t, gvar_idxs) / sizeof(void*)] = create_gv("jl_gvar_idxs", true); + table[offsetof(jl_image_shard_t, clone_slots) / sizeof(void*)] = create_gv("jl_clone_slots", true); + table[offsetof(jl_image_shard_t, clone_offsets) / sizeof(void*)] = create_gv("jl_clone_offsets", true); + table[offsetof(jl_image_shard_t, clone_idxs) / sizeof(void*)] = create_gv("jl_clone_idxs", true); + } + auto tables_arr = ConstantArray::get(ArrayType::get(T_psize, tables.size()), tables); + auto tables_gv = new GlobalVariable(M, tables_arr->getType(), false, + GlobalValue::ExternalLinkage, tables_arr, "jl_shard_tables"); + tables_gv->setVisibility(GlobalValue::HiddenVisibility); + return tables_gv; +} + +static 
GlobalVariable *emit_ptls_table(Module &M, Type *T_size, Type *T_psize) { + std::array<Constant *, 3> ptls_table{ + new GlobalVariable(M, T_size, false, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), "jl_pgcstack_func_slot"), + new GlobalVariable(M, T_size, false, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), "jl_pgcstack_key_slot"), + new GlobalVariable(M, T_size, false, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), "jl_tls_offset"), + }; + for (auto &gv : ptls_table) + cast<GlobalVariable>(gv)->setVisibility(GlobalValue::HiddenVisibility); + auto ptls_table_arr = ConstantArray::get(ArrayType::get(T_psize, ptls_table.size()), ptls_table); + auto ptls_table_gv = new GlobalVariable(M, ptls_table_arr->getType(), false, + GlobalValue::ExternalLinkage, ptls_table_arr, "jl_ptls_table"); + ptls_table_gv->setVisibility(GlobalValue::HiddenVisibility); + return ptls_table_gv; +} + +static GlobalVariable *emit_image_header(Module &M, unsigned threads, unsigned nfvars, unsigned ngvars) { + constexpr uint32_t version = 1; + std::array<uint32_t, 4> header{ + version, + threads, + nfvars, + ngvars, + }; + auto header_arr = ConstantDataArray::get(M.getContext(), header); + auto header_gv = new GlobalVariable(M, header_arr->getType(), false, + GlobalValue::InternalLinkage, header_arr, "jl_image_header"); + return header_gv; +} + // takes the running content that has collected in the shadow module and dump it to disk // this builds the object file portion of the sysimage files for fast startup extern "C" JL_DLLEXPORT @@ -588,6 +646,10 @@ void jl_dump_native_impl(void *native_code, start = jl_hrtime(); + unsigned threads = 1; + unsigned nfvars = 0; + unsigned ngvars = 0; + // add metadata information if (imaging_mode) { multiversioning_preannotate(*dataM); @@ -601,8 +663,27 @@ void jl_dump_native_impl(void *native_code, } } } - emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_sysimg_gvars", T_psize); - emit_offset_table(*dataM, data->jl_sysimg_fvars, "jl_sysimg_fvars", T_psize); + nfvars = data->jl_sysimg_fvars.size(); + ngvars = data->jl_sysimg_gvars.size(); + emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_gvars", T_psize); + emit_offset_table(*dataM, data->jl_sysimg_fvars, "jl_fvars", T_psize); + std::vector<uint32_t> idxs; + idxs.resize(data->jl_sysimg_gvars.size()); + std::iota(idxs.begin(), idxs.end(), 0); + auto gidxs = ConstantDataArray::get(Context, idxs); + auto gidxs_var = new GlobalVariable(*dataM, gidxs->getType(), true, + GlobalVariable::ExternalLinkage, + gidxs, "jl_gvar_idxs"); + gidxs_var->setVisibility(GlobalValue::HiddenVisibility); + idxs.clear(); + idxs.resize(data->jl_sysimg_fvars.size()); + std::iota(idxs.begin(), idxs.end(), 0); + auto fidxs = ConstantDataArray::get(Context, idxs); + auto fidxs_var = new GlobalVariable(*dataM, fidxs->getType(), true, + GlobalVariable::ExternalLinkage, + fidxs, "jl_fvar_idxs"); + fidxs_var->setVisibility(GlobalValue::HiddenVisibility); + dataM->addModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(Context, "_0")); // reflect the address of the jl_RTLD_DEFAULT_handle variable // back to the caller, so that we can check for consistency issues @@ -789,10 +870,23 @@ void jl_dump_native_impl(void *native_code, data.insert(data.end(), specdata.begin(), specdata.end()); } auto value = ConstantDataArray::get(Context, data); - addComdat(new GlobalVariable(*sysimageM, value->getType(), true, - GlobalVariable::ExternalLinkage, - value, "jl_dispatch_target_ids")); - + auto target_ids = new 
GlobalVariable(*sysimageM, value->getType(), true, + GlobalVariable::InternalLinkage, + value, "jl_dispatch_target_ids"); + auto shards = emit_shard_table(*sysimageM, T_size, T_psize, threads); + auto ptls = emit_ptls_table(*sysimageM, T_size, T_psize); + auto header = emit_image_header(*sysimageM, threads, nfvars, ngvars); + auto AT = ArrayType::get(T_psize, 4); + auto pointers = new GlobalVariable(*sysimageM, AT, false, + GlobalVariable::ExternalLinkage, + ConstantArray::get(AT, { + ConstantExpr::getBitCast(header, T_psize), + ConstantExpr::getBitCast(shards, T_psize), + ConstantExpr::getBitCast(ptls, T_psize), + ConstantExpr::getBitCast(target_ids, T_psize) + }), + "jl_image_pointers"); + addComdat(pointers); if (s) { write_int32(s, data.size()); ios_write(s, (const char *)data.data(), data.size()); diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index 1a1dc297b2702..44c83502e0537 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -516,8 +516,8 @@ static inline std::vector<T*> consume_gv(Module &M, const char *name, bool allow CloneCtx::CloneCtx(Module &M, bool allow_bad_fvars) : tbaa_const(tbaa_make_child_with_context(M.getContext(), "jtbaa_const", nullptr, true).first), specs(jl_get_llvm_clone_targets()), - fvars(consume_gv<Function>(M, "jl_sysimg_fvars", allow_bad_fvars)), - gvars(consume_gv<Constant>(M, "jl_sysimg_gvars", false)), + fvars(consume_gv<Function>(M, "jl_fvars", allow_bad_fvars)), + gvars(consume_gv<Constant>(M, "jl_gvars", false)), M(M), allow_bad_fvars(allow_bad_fvars) { @@ -547,7 +547,7 @@ CloneCtx::CloneCtx(Module &M, bool allow_bad_fvars) for (uint32_t i = 0; i < nfvars; i++) func_ids[fvars[i]] = i + 1; for (auto &F: M) { - if (F.empty()) + if (F.empty() && !F.hasFnAttribute("julia.mv.clones")) continue; orig_funcs.push_back(&F); } @@ -898,19 +898,6 @@ void CloneCtx::fix_inst_uses() } } -template<typename T> -static inline T *add_comdat(T *G) -{ -#if defined(_OS_WINDOWS_) - // add __declspec(dllexport) to everything marked for export - if (G->getLinkage() == GlobalValue::ExternalLinkage) - G->setDLLStorageClass(GlobalValue::DLLExportStorageClass); - else - G->setDLLStorageClass(GlobalValue::DefaultStorageClass); -#endif - return G; -} - static Constant *get_ptrdiff32(Constant *ptr, Constant *base) { if (ptr->getType()->isPointerTy()) @@ -920,7 +907,7 @@ static Constant *get_ptrdiff32(Constant *ptr, Constant *base) } template<typename T> -static Constant *emit_offset_table(Module &M, const std::vector<T*> &vars, StringRef name) +static Constant *emit_offset_table(Module &M, const std::vector<T*> &vars, StringRef name, StringRef suffix) { auto T_int32 = Type::getInt32Ty(M.getContext()); auto T_size = getSizeTy(M.getContext()); @@ -928,11 +915,14 @@ static Constant *emit_offset_table(Module &M, const std::vector<T*> &vars, Strin Constant *base = nullptr; if (nvars > 0) { base = ConstantExpr::getBitCast(vars[0], T_size->getPointerTo()); - add_comdat(GlobalAlias::create(T_size, 0, GlobalVariable::ExternalLinkage, - name + "_base", - base, &M)); + auto ga = GlobalAlias::create(T_size, 0, GlobalVariable::ExternalLinkage, + name + "_base" + suffix, + base, &M); + ga->setVisibility(GlobalValue::HiddenVisibility); } else { - base = add_comdat(new GlobalVariable(M, T_size, true, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), name + "_base")); + auto gv = new GlobalVariable(M, T_size, true, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), name + "_base" + suffix); + 
gv->setVisibility(GlobalValue::HiddenVisibility); + base = gv; } auto vbase = ConstantExpr::getPtrToInt(base, T_size); std::vector<Constant*> offsets(nvars + 1); @@ -943,10 +933,11 @@ static Constant *emit_offset_table(Module &M, const std::vector<T*> &vars, Strin offsets[i + 1] = get_ptrdiff32(vars[i], vbase); } ArrayType *vars_type = ArrayType::get(T_int32, nvars + 1); - add_comdat(new GlobalVariable(M, vars_type, true, + auto gv = new GlobalVariable(M, vars_type, true, GlobalVariable::ExternalLinkage, ConstantArray::get(vars_type, offsets), - name + "_offsets")); + name + "_offsets" + suffix); + gv->setVisibility(GlobalValue::HiddenVisibility); return vbase; } @@ -958,9 +949,17 @@ void CloneCtx::emit_metadata() return; } + StringRef suffix; + if (auto suffix_md = M.getModuleFlag("julia.mv.suffix")) { + suffix = cast<MDString>(suffix_md)->getString(); + } + // Store back the information about exported functions. - auto fbase = emit_offset_table(M, fvars, "jl_sysimg_fvars"); - auto gbase = emit_offset_table(M, gvars, "jl_sysimg_gvars"); + auto fbase = emit_offset_table(M, fvars, "jl_fvar", suffix); + auto gbase = emit_offset_table(M, gvars, "jl_gvar", suffix); + + M.getGlobalVariable("jl_fvar_idxs")->setName("jl_fvar_idxs" + suffix); + M.getGlobalVariable("jl_gvar_idxs")->setName("jl_gvar_idxs" + suffix); uint32_t ntargets = specs.size(); @@ -996,9 +995,10 @@ void CloneCtx::emit_metadata() } values[0] = ConstantInt::get(T_int32, values.size() / 2); ArrayType *vars_type = ArrayType::get(T_int32, values.size()); - add_comdat(new GlobalVariable(M, vars_type, true, GlobalVariable::ExternalLinkage, + auto gv = new GlobalVariable(M, vars_type, true, GlobalVariable::ExternalLinkage, ConstantArray::get(vars_type, values), - "jl_dispatch_reloc_slots")); + "jl_clone_slots" + suffix); + gv->setVisibility(GlobalValue::HiddenVisibility); } // Generate `jl_dispatch_fvars_idxs` and `jl_dispatch_fvars_offsets` @@ -1046,14 +1046,16 @@ void CloneCtx::emit_metadata() idxs[len_idx] = count; } auto idxval = ConstantDataArray::get(M.getContext(), idxs); - add_comdat(new GlobalVariable(M, idxval->getType(), true, + auto gv1 = new GlobalVariable(M, idxval->getType(), true, GlobalVariable::ExternalLinkage, - idxval, "jl_dispatch_fvars_idxs")); + idxval, "jl_clone_idxs" + suffix); + gv1->setVisibility(GlobalValue::HiddenVisibility); ArrayType *offsets_type = ArrayType::get(Type::getInt32Ty(M.getContext()), offsets.size()); - add_comdat(new GlobalVariable(M, offsets_type, true, + auto gv2 = new GlobalVariable(M, offsets_type, true, GlobalVariable::ExternalLinkage, ConstantArray::get(offsets_type, offsets), - "jl_dispatch_fvars_offsets")); + "jl_clone_offsets" + suffix); + gv2->setVisibility(GlobalValue::HiddenVisibility); } } @@ -1070,8 +1072,8 @@ static bool runMultiVersioning(Module &M, bool allow_bad_fvars) if (M.getName() == "sysimage") return false; - GlobalVariable *fvars = M.getGlobalVariable("jl_sysimg_fvars"); - GlobalVariable *gvars = M.getGlobalVariable("jl_sysimg_gvars"); + GlobalVariable *fvars = M.getGlobalVariable("jl_fvars"); + GlobalVariable *gvars = M.getGlobalVariable("jl_gvars"); if (allow_bad_fvars && (!fvars || !fvars->hasInitializer() || !isa<ConstantArray>(fvars->getInitializer()) || !gvars || !gvars->hasInitializer() || !isa<ConstantArray>(gvars->getInitializer()))) return false; diff --git a/src/llvm-ptls.cpp b/src/llvm-ptls.cpp index ea92e1709c597..e49b992ded50f 100644 --- a/src/llvm-ptls.cpp +++ b/src/llvm-ptls.cpp @@ -140,26 +140,11 @@ GlobalVariable 
*LowerPTLS::create_aliased_global(Type *T, StringRef name) const // the address is visible externally but LLVM can still assume that the // address of this variable doesn't need dynamic relocation // (can be accessed with a single PC-rel load). - auto GV = new GlobalVariable(*M, T, false, GlobalVariable::InternalLinkage, - Constant::getNullValue(T), name + ".real"); - add_comdat(GlobalAlias::create(T, 0, GlobalVariable::ExternalLinkage, - name, GV, M)); + auto GV = new GlobalVariable(*M, T, false, GlobalVariable::ExternalLinkage, + nullptr, name); return GV; } -template<typename T> -inline T *LowerPTLS::add_comdat(T *G) const -{ -#if defined(_OS_WINDOWS_) - // add __declspec(dllexport) to everything marked for export - if (G->getLinkage() == GlobalValue::ExternalLinkage) - G->setDLLStorageClass(GlobalValue::DLLExportStorageClass); - else - G->setDLLStorageClass(GlobalValue::DefaultStorageClass); -#endif - return G; -} - void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter, bool or_new, bool *CFGModified) { if (pgcstack->use_empty()) { diff --git a/src/processor.cpp b/src/processor.cpp index a8aca2a64ab19..ea8e4101e6c33 100644 --- a/src/processor.cpp +++ b/src/processor.cpp @@ -21,6 +21,8 @@ #include <dlfcn.h> #endif +#include <iostream> + // CPU target string is a list of strings separated by `;` each string starts with a CPU // or architecture name and followed by an optional list of features separated by `,`. // A "generic" or empty CPU name means the basic required feature set of the target ISA @@ -629,47 +631,42 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback) { jl_image_t res{}; - // .data base - char *data_base; - jl_dlsym(hdl, "jl_sysimg_gvars_base", (void**)&data_base, 1); + const jl_image_pointers_t *pointers; + jl_dlsym(hdl, "jl_image_pointers", (void**)&pointers, 1); - { - void *pgcstack_func_slot; - if (jl_dlsym(hdl, "jl_pgcstack_func_slot", &pgcstack_func_slot, 0)) { - void *pgcstack_key_slot; - jl_dlsym(hdl, "jl_pgcstack_key_slot", &pgcstack_key_slot, 1); - jl_pgcstack_getkey((jl_get_pgcstack_func**)pgcstack_func_slot, (jl_pgcstack_key_t*)pgcstack_key_slot); - - size_t *tls_offset_idx; - jl_dlsym(hdl, "jl_tls_offset", (void **)&tls_offset_idx, 1); - *tls_offset_idx = (uintptr_t)(jl_tls_offset == -1 ? 
0 : jl_tls_offset); - } - } + const void *ids = pointers->target_data; + uint32_t target_idx = callback(ids); + + std::cout << "Finished callback\n"; + + auto shard = pointers->shards[0]; + + std::cout << "Shard access is ok\n"; + + // .data base + char *data_base = (char *)shard.gvar_base; // .text base - char *text_base; - jl_dlsym(hdl, "jl_sysimg_fvars_base", (void**)&text_base, 1); + const char *text_base = shard.fvar_base; - const int32_t *offsets; - jl_dlsym(hdl, "jl_sysimg_fvars_offsets", (void**)&offsets, 1); + const int32_t *offsets = shard.fvar_offsets; uint32_t nfunc = offsets[0]; offsets++; - const void *ids; - jl_dlsym(hdl, "jl_dispatch_target_ids", (void**)&ids, 1); - uint32_t target_idx = callback(ids); + std::cout << "Initial offsets\n"; - const int32_t *reloc_slots; - jl_dlsym(hdl, "jl_dispatch_reloc_slots", (void **)&reloc_slots, 1); + const int32_t *reloc_slots = shard.clone_slots; + std::cout << reloc_slots << "\n"; const uint32_t nreloc = reloc_slots[0]; reloc_slots += 1; - const uint32_t *clone_idxs; - const int32_t *clone_offsets; - jl_dlsym(hdl, "jl_dispatch_fvars_idxs", (void**)&clone_idxs, 1); - jl_dlsym(hdl, "jl_dispatch_fvars_offsets", (void**)&clone_offsets, 1); + std::cout << "Set reloc_slots\n"; + const uint32_t *clone_idxs = shard.clone_idxs; + const int32_t *clone_offsets = shard.clone_offsets; uint32_t tag_len = clone_idxs[0]; clone_idxs += 1; + std::cout << "Set clone_idxs\n"; + assert(tag_len & jl_sysimg_tag_mask); std::vector<const int32_t*> base_offsets = {offsets}; // Find target @@ -688,6 +685,8 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback) base_offsets.push_back(tag_len & jl_sysimg_tag_mask ? clone_offsets : nullptr); } + std::cout << "Set offsets\n"; + bool clone_all = (tag_len & jl_sysimg_tag_mask) != 0; // Fill in return value if (clone_all) { @@ -741,17 +740,19 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback) (void)found; } + std::cout << "Finished relocation\n"; + res.fptrs.base = text_base; res.fptrs.offsets = offsets; res.gvars_base = (uintptr_t *)data_base; - jl_dlsym(hdl, "jl_sysimg_gvars_offsets", (void **)&res.gvars_offsets, 1); + res.gvars_offsets = shard.gvar_offsets; res.gvars_offsets += 1; #ifdef _OS_WINDOWS_ res.base = (intptr_t)hdl; #else Dl_info dlinfo; - if (dladdr((void*)res.gvars_base, &dlinfo) != 0) { + if (dladdr((void*)pointers, &dlinfo) != 0) { res.base = (intptr_t)dlinfo.dli_fbase; } else { @@ -759,6 +760,17 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback) } #endif + std::cout << "Starting ptls\n"; + + { + void *pgcstack_func_slot = pointers->ptls->pgcstack_func_slot; + void *pgcstack_key_slot = pointers->ptls->pgcstack_key_slot; + jl_pgcstack_getkey((jl_get_pgcstack_func**)pgcstack_func_slot, (jl_pgcstack_key_t*)pgcstack_key_slot); + + size_t *tls_offset_idx = pointers->ptls->tls_offset; + *tls_offset_idx = (uintptr_t)(jl_tls_offset == -1 ? 
0 : jl_tls_offset);
+    }
+
     return res;
 }
 
diff --git a/src/processor.h b/src/processor.h
index f76722e885a1d..73271290eff76 100644
--- a/src/processor.h
+++ b/src/processor.h
@@ -162,6 +162,38 @@ typedef struct {
     jl_image_fptrs_t fptrs;
 } jl_image_t;
 
+typedef struct {
+    uint32_t version;
+    uint32_t nshards;
+    uint32_t nfvars;
+    uint32_t ngvars;
+} jl_image_header_t;
+
+typedef struct {
+    const char *fvar_base;
+    const int32_t *fvar_offsets;
+    const uint32_t *fvar_idxs;
+    uintptr_t *gvar_base;
+    const int32_t *gvar_offsets;
+    const uint32_t *gvar_idxs;
+    const int32_t *clone_slots;
+    const int32_t *clone_offsets;
+    const uint32_t *clone_idxs;
+} jl_image_shard_t;
+
+typedef struct {
+    void *pgcstack_func_slot;
+    void *pgcstack_key_slot;
+    size_t *tls_offset;
+} jl_image_ptls_t;
+
+typedef struct {
+    const jl_image_header_t *header;
+    const jl_image_shard_t *shards; // nshards-length array
+    const jl_image_ptls_t *ptls;
+    const void *target_data;
+} jl_image_pointers_t;
+
 /**
  * Initialize the processor dispatch system with sysimg `hdl` (also initialize the sysimg itself).
  * The dispatch system will find the best implementation to be used in this session.

From 798ee2245b6aae597a99d25f27aa3ed96cf3c2aa Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Thu, 5 Jan 2023 23:54:39 -0500
Subject: [PATCH 07/34] Allow loader to deal with multiple shards

---
 src/processor.cpp | 232 ++++++++++++++++++++++++++--------------------
 1 file changed, 133 insertions(+), 99 deletions(-)

diff --git a/src/processor.cpp b/src/processor.cpp
index ea8e4101e6c33..55b2cd2b4ab55 100644
--- a/src/processor.cpp
+++ b/src/processor.cpp
@@ -636,117 +636,153 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback)
 
     const void *ids = pointers->target_data;
     uint32_t target_idx = callback(ids);
+
+    if (pointers->header->version != 1) {
+        jl_error("Image file is not compatible with this version of Julia");
+    }
 
-    std::cout << "Finished callback\n";
-
-    auto shard = pointers->shards[0];
-
-    std::cout << "Shard access is ok\n";
-
-    // .data base
-    char *data_base = (char *)shard.gvar_base;
-
-    // .text base
-    const char *text_base = shard.fvar_base;
-
-    const int32_t *offsets = shard.fvar_offsets;
-    uint32_t nfunc = offsets[0];
-    offsets++;
-
-    std::cout << "Initial offsets\n";
-
-    const int32_t *reloc_slots = shard.clone_slots;
-    std::cout << reloc_slots << "\n";
-    const uint32_t nreloc = reloc_slots[0];
-    reloc_slots += 1;
-    std::cout << "Set reloc_slots\n";
-    const uint32_t *clone_idxs = shard.clone_idxs;
-    const int32_t *clone_offsets = shard.clone_offsets;
-    uint32_t tag_len = clone_idxs[0];
-    clone_idxs += 1;
-
-    std::cout << "Set clone_idxs\n";
+    std::vector<const char *> fvars(pointers->header->nfvars);
+    std::vector<const char *> gvars(pointers->header->ngvars);
+
+    std::vector<std::pair<uint32_t, const char *>> clones;
+
+    for (unsigned i = 0; i < pointers->header->nshards; i++) {
+        auto shard = pointers->shards[i];
+
+        // .data base
+        char *data_base = (char *)shard.gvar_base;
+
+        // .text base
+        const char *text_base = shard.fvar_base;
+
+        const int32_t *offsets = shard.fvar_offsets;
+        uint32_t nfunc = offsets[0];
+        offsets++;
+        const int32_t *reloc_slots = shard.clone_slots;
+        const uint32_t nreloc = reloc_slots[0];
+        reloc_slots += 1;
+        const uint32_t *clone_idxs = shard.clone_idxs;
+        const int32_t *clone_offsets = shard.clone_offsets;
+        uint32_t tag_len = clone_idxs[0];
+        clone_idxs += 1;
+
+        assert(tag_len & jl_sysimg_tag_mask);
+        std::vector<const int32_t*> base_offsets = 
{offsets}; + // Find target + for (uint32_t i = 0;i < target_idx;i++) { + uint32_t len = jl_sysimg_val_mask & tag_len; + if (jl_sysimg_tag_mask & tag_len) { + if (i != 0) + clone_offsets += nfunc; + clone_idxs += len + 1; + } + else { + clone_offsets += len; + clone_idxs += len + 2; + } + tag_len = clone_idxs[-1]; + base_offsets.push_back(tag_len & jl_sysimg_tag_mask ? clone_offsets : nullptr); + } - assert(tag_len & jl_sysimg_tag_mask); - std::vector<const int32_t*> base_offsets = {offsets}; - // Find target - for (uint32_t i = 0;i < target_idx;i++) { - uint32_t len = jl_sysimg_val_mask & tag_len; - if (jl_sysimg_tag_mask & tag_len) { - if (i != 0) - clone_offsets += nfunc; - clone_idxs += len + 1; + bool clone_all = (tag_len & jl_sysimg_tag_mask) != 0; + // Fill in return value + if (clone_all) { + // clone_all + if (target_idx != 0) { + offsets = clone_offsets; + } } else { - clone_offsets += len; - clone_idxs += len + 2; + uint32_t base_idx = clone_idxs[0]; + assert(base_idx < target_idx); + if (target_idx != 0) { + offsets = base_offsets[base_idx]; + assert(offsets); + } + clone_idxs++; + unsigned start = clones.size(); + clones.resize(start + tag_len); + auto idxs = shard.fvar_idxs; + for (unsigned i = 0; i < tag_len; i++) { + clones[start + i] = {(clone_idxs[i] & ~jl_sysimg_val_mask) | idxs[clone_idxs[i] & jl_sysimg_val_mask], clone_offsets[i] + text_base}; + } + } + // Do relocation + uint32_t reloc_i = 0; + uint32_t len = jl_sysimg_val_mask & tag_len; + for (uint32_t i = 0; i < len; i++) { + uint32_t idx = clone_idxs[i]; + int32_t offset; + if (clone_all) { + offset = offsets[idx]; + } + else if (idx & jl_sysimg_tag_mask) { + idx = idx & jl_sysimg_val_mask; + offset = clone_offsets[i]; + } + else { + continue; + } + bool found = false; + for (; reloc_i < nreloc; reloc_i++) { + auto reloc_idx = ((const uint32_t*)reloc_slots)[reloc_i * 2]; + if (reloc_idx == idx) { + found = true; + auto slot = (const void**)(data_base + reloc_slots[reloc_i * 2 + 1]); + assert(slot); + *slot = offset + text_base; + } + else if (reloc_idx > idx) { + break; + } + } + assert(found && "Cannot find GOT entry for cloned function."); + (void)found; } - tag_len = clone_idxs[-1]; - base_offsets.push_back(tag_len & jl_sysimg_tag_mask ? 
clone_offsets : nullptr); - } - std::cout << "Set offsets\n"; + auto fidxs = shard.fvar_idxs; + for (uint32_t i = 0; i < nfunc; i++) { + fvars[fidxs[i]] = text_base + offsets[i]; + } - bool clone_all = (tag_len & jl_sysimg_tag_mask) != 0; - // Fill in return value - if (clone_all) { - // clone_all - if (target_idx != 0) { - offsets = clone_offsets; + auto gidxs = shard.gvar_idxs; + unsigned ngvars = shard.gvar_offsets[0]; + for (uint32_t i = 0; i < ngvars; i++) { + gvars[gidxs[i]] = data_base + shard.gvar_offsets[i+1]; } } - else { - uint32_t base_idx = clone_idxs[0]; - assert(base_idx < target_idx); - if (target_idx != 0) { - offsets = base_offsets[base_idx]; - assert(offsets); + + if (!fvars.empty()) { + auto offsets = (int32_t *) malloc(sizeof(int32_t) * fvars.size()); + res.fptrs.base = fvars[0]; + for (size_t i = 0; i < fvars.size(); i++) { + offsets[i] = fvars[i] - res.fptrs.base; } - clone_idxs++; - res.fptrs.nclones = tag_len; - res.fptrs.clone_offsets = clone_offsets; - res.fptrs.clone_idxs = clone_idxs; + res.fptrs.offsets = offsets; + res.fptrs.noffsets = fvars.size(); } - // Do relocation - uint32_t reloc_i = 0; - uint32_t len = jl_sysimg_val_mask & tag_len; - for (uint32_t i = 0; i < len; i++) { - uint32_t idx = clone_idxs[i]; - int32_t offset; - if (clone_all) { - offset = offsets[idx]; - } - else if (idx & jl_sysimg_tag_mask) { - idx = idx & jl_sysimg_val_mask; - offset = clone_offsets[i]; - } - else { - continue; - } - bool found = false; - for (; reloc_i < nreloc; reloc_i++) { - auto reloc_idx = ((const uint32_t*)reloc_slots)[reloc_i * 2]; - if (reloc_idx == idx) { - found = true; - auto slot = (const void**)(data_base + reloc_slots[reloc_i * 2 + 1]); - assert(slot); - *slot = offset + text_base; - } - else if (reloc_idx > idx) { - break; - } + + if (!gvars.empty()) { + auto offsets = (int32_t *) malloc(sizeof(int32_t) * gvars.size()); + res.gvars_base = (uintptr_t *)gvars[0]; + for (size_t i = 0; i < gvars.size(); i++) { + offsets[i] = gvars[i] - (const char *)res.gvars_base; } - assert(found && "Cannot find GOT entry for cloned function."); - (void)found; + res.gvars_offsets = offsets; } - std::cout << "Finished relocation\n"; - - res.fptrs.base = text_base; - res.fptrs.offsets = offsets; - res.gvars_base = (uintptr_t *)data_base; - res.gvars_offsets = shard.gvar_offsets; - res.gvars_offsets += 1; + if (!clones.empty()) { + std::sort(clones.begin(), clones.end()); + auto clone_offsets = (int32_t *) malloc(sizeof(int32_t) * clones.size()); + auto clone_idxs = (uint32_t *) malloc(sizeof(uint32_t) * clones.size()); + for (size_t i = 0; i < clones.size(); i++) { + clone_idxs[i] = clones[i].first; + clone_offsets[i] = clones[i].second - res.fptrs.base; + } + res.fptrs.clone_idxs = clone_idxs; + res.fptrs.clone_offsets = clone_offsets; + res.fptrs.nclones = clones.size(); + } #ifdef _OS_WINDOWS_ res.base = (intptr_t)hdl; @@ -760,8 +796,6 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback) } #endif - std::cout << "Starting ptls\n"; - { void *pgcstack_func_slot = pointers->ptls->pgcstack_func_slot; void *pgcstack_key_slot = pointers->ptls->pgcstack_key_slot; From 3915101dc65d3d0844cf8e0f5d5a1e39ddf97407 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Fri, 6 Jan 2023 19:21:47 -0500 Subject: [PATCH 08/34] Multithreaded image builder --- src/aotcompile.cpp | 729 +++++++++++++++++++++++++++++------ src/llvm-codegen-shared.h | 152 ++++++++ src/llvm-multiversioning.cpp | 155 -------- src/processor.cpp | 7 +- 4 files changed, 764 
insertions(+), 279 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 5873c1ca56477..8ef715235fb04 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -51,6 +51,7 @@ // for outputting code #include <llvm/Bitcode/BitcodeWriter.h> #include <llvm/Bitcode/BitcodeWriterPass.h> +#include <llvm/Bitcode/BitcodeReader.h> #include "llvm/Object/ArchiveWriter.h" #include <llvm/IR/IRPrintingPasses.h> @@ -74,19 +75,13 @@ STATISTIC(CreateNativeMethods, "Number of methods compiled for jl_create_native" STATISTIC(CreateNativeMax, "Max number of methods compiled at once for jl_create_native"); STATISTIC(CreateNativeGlobals, "Number of globals compiled for jl_create_native"); -template<class T> // for GlobalObject's -static T *addComdat(T *G) +static void addComdat(GlobalValue *G, Triple &T) { -#if defined(_OS_WINDOWS_) - if (!G->isDeclaration()) { + if (T.isOSBinFormatCOFF() && !G->isDeclaration()) { // add __declspec(dllexport) to everything marked for export - if (G->getLinkage() == GlobalValue::ExternalLinkage) - G->setDLLStorageClass(GlobalValue::DLLExportStorageClass); - else - G->setDLLStorageClass(GlobalValue::DefaultStorageClass); + assert(G->hasExternalLinkage() && "Cannot set DLLExport on non-external linkage!"); + G->setDLLStorageClass(GlobalValue::DLLExportStorageClass); } -#endif - return G; } @@ -472,15 +467,6 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm return (void*)data; } - -static void emit_result(std::vector<NewArchiveMember> &Archive, SmallVectorImpl<char> &OS, - StringRef Name, std::vector<std::string> &outputs) -{ - outputs.push_back({ OS.data(), OS.size() }); - Archive.push_back(NewArchiveMember(MemoryBufferRef(outputs.back(), Name))); - OS.clear(); -} - static object::Archive::Kind getDefaultForHost(Triple &triple) { if (triple.isOSDarwin()) @@ -572,6 +558,584 @@ static GlobalVariable *emit_image_header(Module &M, unsigned threads, unsigned n return header_gv; } +struct Partition { + StringSet<> globals; + StringMap<unsigned> fvars; + StringMap<unsigned> gvars; + size_t weight; +}; + +static void get_fvars_gvars(Module &M, DenseMap<GlobalValue *, unsigned> &fvars, DenseMap<GlobalValue *, unsigned> &gvars) { + auto fvars_gv = M.getGlobalVariable("jl_fvars"); + auto gvars_gv = M.getGlobalVariable("jl_gvars"); + assert(fvars_gv); + assert(gvars_gv); + auto fvars_init = cast<ConstantArray>(fvars_gv->getInitializer()); + auto gvars_init = cast<ConstantArray>(gvars_gv->getInitializer()); + std::string suffix; + if (auto md = M.getModuleFlag("julia.mv.suffix")) { + suffix = cast<MDString>(md)->getString().str(); + } + auto fvars_idxs = M.getGlobalVariable("jl_fvar_idxs"); + auto gvars_idxs = M.getGlobalVariable("jl_gvar_idxs"); + assert(fvars_idxs); + assert(gvars_idxs); + auto fvars_idxs_init = cast<ConstantDataArray>(fvars_idxs->getInitializer()); + auto gvars_idxs_init = cast<ConstantDataArray>(gvars_idxs->getInitializer()); + for (unsigned i = 0; i < fvars_init->getNumOperands(); ++i) { + auto gv = cast<GlobalValue>(fvars_init->getOperand(i)->stripPointerCasts()); + auto idx = fvars_idxs_init->getElementAsInteger(i); + fvars[gv] = idx; + } + for (unsigned i = 0; i < gvars_init->getNumOperands(); ++i) { + auto gv = cast<GlobalValue>(gvars_init->getOperand(i)->stripPointerCasts()); + auto idx = gvars_idxs_init->getElementAsInteger(i); + gvars[gv] = idx; + } + fvars_gv->eraseFromParent(); + gvars_gv->eraseFromParent(); + fvars_idxs->eraseFromParent(); + gvars_idxs->eraseFromParent(); +} + +static size_t 
getFunctionWeight(const Function &F) +{ + size_t weight = 1; + for (const BasicBlock &BB : F) { + weight += BB.size(); + } + // more basic blocks = more complex than just sum of insts, + // add some weight to it + weight += F.size(); + if (F.hasFnAttribute("julia.mv.clones")) { + weight *= F.getFnAttribute("julia.mv.clones").getValueAsString().count(',') + 1; + } + return weight; +} + + +static bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M) { + StringMap<uint32_t> GVNames; + bool bad = false; + for (uint32_t i = 0; i < partitions.size(); i++) { + for (auto &name : partitions[i].globals) { + if (GVNames.count(name.getKey())) { + bad = true; + dbgs() << "Duplicate global name " << name.getKey() << " in partitions " << i << " and " << GVNames[name.getKey()] << "\n"; + } + GVNames[name.getKey()] = i; + } + dbgs() << "partition: " << i << " fvars: " << partitions[i].fvars.size() << " gvars: " << partitions[i].gvars.size() << "\n"; + } + for (auto &GV : M.globals()) { + if (GV.isDeclaration()) { + if (GVNames.count(GV.getName())) { + bad = true; + dbgs() << "Global " << GV.getName() << " is a declaration but is in partition " << GVNames[GV.getName()] << "\n"; + } + } else { + if (!GVNames.count(GV.getName())) { + bad = true; + dbgs() << "Global " << GV << " not in any partition\n"; + } + if (!GV.hasExternalLinkage()) { + bad = true; + dbgs() << "Global " << GV << " has non-external linkage " << GV.getLinkage() << " but is in partition " << GVNames[GV.getName()] << "\n"; + } + } + } + return !bad; +} + +// Chop a module up as equally as possible into threads partitions +static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) { + //Start by stripping fvars and gvars, which helpfully removes their uses as well + DenseMap<GlobalValue *, unsigned> fvars, gvars; + get_fvars_gvars(M, fvars, gvars); + + // Partition by union-find, since we only have def->use traversal right now + struct Partitioner { + struct Node { + GlobalValue *GV; + unsigned parent; + unsigned size; + size_t weight; + }; + std::vector<Node> nodes; + DenseMap<GlobalValue *, unsigned> node_map; + unsigned merged; + + unsigned make(GlobalValue *GV, size_t weight) { + unsigned idx = nodes.size(); + nodes.push_back({GV, idx, 1, weight}); + node_map[GV] = idx; + return idx; + } + + unsigned find(unsigned idx) { + while (nodes[idx].parent != idx) { + nodes[idx].parent = nodes[nodes[idx].parent].parent; + idx = nodes[idx].parent; + } + return idx; + } + + unsigned merge(unsigned x, unsigned y) { + x = find(x); + y = find(y); + if (x == y) + return x; + if (nodes[x].size < nodes[y].size) + std::swap(x, y); + nodes[y].parent = x; + nodes[x].size += nodes[y].size; + nodes[x].weight += nodes[y].weight; + merged++; + return x; + } + }; + + Partitioner partitioner; + + for (auto &G : M.global_values()) { + if (G.isDeclaration()) + continue; + if (isa<Function>(G)) { + partitioner.make(&G, getFunctionWeight(cast<Function>(G))); + } else { + partitioner.make(&G, 1); + } + } + + // Merge all uses to go together into the same partition + for (unsigned i = 0; i < partitioner.nodes.size(); ++i) { + for (ConstantUses<GlobalValue> uses(partitioner.nodes[i].GV, M); !uses.done(); uses.next()) { + auto val = uses.get_info().val; + auto idx = partitioner.node_map.find(val); + assert(idx != partitioner.node_map.end()); + partitioner.merge(i, idx->second); + } + } + + SmallVector<Partition, 32> partitions(threads); + // always get the smallest partition first + auto pcomp = [](const Partition *p1, 
const Partition *p2) { + return p1->weight > p2->weight; + }; + std::priority_queue<Partition *, std::vector<Partition *>, decltype(pcomp)> pq(pcomp); + for (unsigned i = 0; i < threads; ++i) { + pq.push(&partitions[i]); + } + + // Assign the root of each partition to a partition, then assign its children to the same one + for (unsigned i = 0; i < partitioner.nodes.size(); ++i) { + auto root = partitioner.find(i); + if (partitioner.nodes[root].GV) { + auto &node = partitioner.nodes[root]; + auto &P = *pq.top(); + pq.pop(); + auto name = node.GV->getName(); + P.globals.insert(name); + if (fvars.count(node.GV)) + P.fvars[name] = fvars[node.GV]; + if (gvars.count(node.GV)) + P.gvars[name] = gvars[node.GV]; + P.weight += node.weight; + node.GV = nullptr; + node.size = &P - partitions.data(); + pq.push(&P); + } + if (root != i) { + auto &node = partitioner.nodes[i]; + assert(node.GV != nullptr); + // we assigned its root already, so just add it to the root's partition + // don't touch the priority queue, since we're not changing the weight + auto &P = partitions[partitioner.nodes[root].size]; + auto name = node.GV->getName(); + P.globals.insert(name); + if (fvars.count(node.GV)) + P.fvars[name] = fvars[node.GV]; + if (gvars.count(node.GV)) + P.gvars[name] = gvars[node.GV]; + node.GV = nullptr; + node.size = partitioner.nodes[root].size; + } + } + + assert(verify_partitioning(partitions, M) && "Partitioning failed to partition globals correctly"); + + return partitions; +} + +static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *outputs, StringRef name, + NewArchiveMember *unopt, NewArchiveMember *opt, NewArchiveMember *obj, NewArchiveMember *asm_) { + auto TM = std::unique_ptr<TargetMachine>( + SourceTM.getTarget().createTargetMachine( + SourceTM.getTargetTriple().str(), + SourceTM.getTargetCPU(), + SourceTM.getTargetFeatureString(), + SourceTM.Options, + SourceTM.getRelocationModel(), + SourceTM.getCodeModel(), + SourceTM.getOptLevel())); + + if (unopt) { + raw_string_ostream OS(*outputs); + PassBuilder PB; + AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; + ModulePassManager MPM; + MPM.addPass(BitcodeWriterPass(OS)); + outputs++; + *outputs = (name + "_unopt.bc").str(); + *unopt = NewArchiveMember(MemoryBufferRef(OS.str(), *outputs)); + outputs++; + } + if (!opt && !obj && !asm_) { + return; + } + assert(!verifyModule(M, &errs())); + + uint64_t start = jl_hrtime(); + uint64_t end = 0; + +#ifndef JL_USE_NEW_PM + legacy::PassManager optimizer; + addTargetPasses(&optimizer, TM->getTargetTriple(), TM->getTargetIRAnalysis()); + addOptimizationPasses(&optimizer, jl_options.opt_level, true, true); + addMachinePasses(&optimizer, jl_options.opt_level); +#else + + auto PMTM = std::unique_ptr<TargetMachine>( + SourceTM.getTarget().createTargetMachine( + SourceTM.getTargetTriple().str(), + SourceTM.getTargetCPU(), + SourceTM.getTargetFeatureString(), + SourceTM.Options, + SourceTM.getRelocationModel(), + SourceTM.getCodeModel(), + SourceTM.getOptLevel())); + NewPM optimizer{std::move(PMTM), getOptLevel(jl_options.opt_level), OptimizationOptions::defaults(true, true)}; +#endif + optimizer.run(M); + assert(!verifyModule(M, &errs())); + + end = jl_hrtime(); + + dbgs() << "optimize time: " << (end - start) / 1e9 << "s\n"; + + if (opt) { + raw_string_ostream OS(*outputs); + PassBuilder PB; + AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; + ModulePassManager MPM; + MPM.addPass(BitcodeWriterPass(OS)); + outputs++; + *outputs = (name + "_opt.bc").str(); + *opt = 
NewArchiveMember(MemoryBufferRef(OS.str(), *outputs)); + outputs++; + } + + start = jl_hrtime(); + + if (obj) { + SmallVector<char, 0> Buffer; + raw_svector_ostream OS(Buffer); + legacy::PassManager emitter; + addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis()); + if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_ObjectFile, false)) + jl_safe_printf("ERROR: target does not support generation of object files\n"); + emitter.run(M); + *outputs = { Buffer.data(), Buffer.size() }; + outputs++; + *outputs = (name + ".o").str(); + *obj = NewArchiveMember(MemoryBufferRef(outputs[-1], *outputs)); + outputs++; + } + + end = jl_hrtime(); + + dbgs() << "codegen time: " << (end - start) / 1e9 << "s\n"; + + if (asm_) { + SmallVector<char, 0> Buffer; + raw_svector_ostream OS(Buffer); + legacy::PassManager emitter; + addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis()); + if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_AssemblyFile, false)) + jl_safe_printf("ERROR: target does not support generation of assembly files\n"); + emitter.run(M); + *outputs = { Buffer.data(), Buffer.size() }; + outputs++; + *outputs = (name + ".s").str(); + *asm_ = NewArchiveMember(MemoryBufferRef(outputs[-1], *outputs)); + outputs++; + } +} + +static auto serializeModule(const Module &M) { + SmallVector<char, 0> ClonedModuleBuffer; + BitcodeWriter BCWriter(ClonedModuleBuffer); + BCWriter.writeModule(M); + BCWriter.writeSymtab(); + BCWriter.writeStrtab(); + return ClonedModuleBuffer; +} + +static void materializePreserved(Module &M, Partition &partition) { + DenseSet<GlobalValue *> Preserve; + for (auto &GV : M.global_values()) { + if (!GV.isDeclaration()) { + if (partition.globals.count(GV.getName())) { + Preserve.insert(&GV); + } + } + } + for (auto &F : M.functions()) { + if (!F.isDeclaration()) { + if (!Preserve.contains(&F)) { + F.deleteBody(); + F.setLinkage(GlobalValue::ExternalLinkage); + } + } + } + for (auto &GV : M.globals()) { + if (!GV.isDeclaration()) { + if (!Preserve.contains(&GV)) { + GV.setInitializer(nullptr); + GV.setLinkage(GlobalValue::ExternalLinkage); + } + } + } + SmallVector<std::pair<GlobalAlias *, GlobalValue *>> DeletedAliases; + for (auto &GA : M.aliases()) { + if (!GA.isDeclaration()) { + if (!Preserve.contains(&GA)) { + if (GA.getValueType()->isFunctionTy()) { + DeletedAliases.push_back({ &GA, Function::Create(cast<FunctionType>(GA.getValueType()), GlobalValue::ExternalLinkage, "", &M) }); + } else { + DeletedAliases.push_back({ &GA, new GlobalVariable(M, GA.getValueType(), false, GlobalValue::ExternalLinkage, nullptr) }); + } + } + } + } + cantFail(M.materializeAll()); + for (auto &Deleted : DeletedAliases) { + Deleted.second->takeName(Deleted.first); + Deleted.first->replaceAllUsesWith(Deleted.second); + Deleted.first->eraseFromParent(); + } +} + +static void construct_vars(Module &M, Partition &partition) { + std::vector<std::pair<uint32_t, GlobalValue *>> fvar_pairs; + fvar_pairs.reserve(partition.fvars.size()); + for (auto &fvar : partition.fvars) { + auto F = M.getFunction(fvar.first()); + assert(F); + assert(!F->isDeclaration()); + fvar_pairs.push_back({ fvar.second, F }); + } + std::vector<GlobalValue *> fvars; + std::vector<uint32_t> fvar_idxs; + fvars.reserve(fvar_pairs.size()); + fvar_idxs.reserve(fvar_pairs.size()); + std::sort(fvar_pairs.begin(), fvar_pairs.end()); + for (auto &fvar : fvar_pairs) { + fvars.push_back(fvar.second); + fvar_idxs.push_back(fvar.first); + } + std::vector<std::pair<uint32_t, GlobalValue *>> 
gvar_pairs; + gvar_pairs.reserve(partition.gvars.size()); + for (auto &gvar : partition.gvars) { + auto GV = M.getGlobalVariable(gvar.first()); + assert(GV); + assert(!GV->isDeclaration()); + gvar_pairs.push_back({ gvar.second, GV }); + } + std::vector<GlobalValue *> gvars; + std::vector<uint32_t> gvar_idxs; + gvars.reserve(gvar_pairs.size()); + gvar_idxs.reserve(gvar_pairs.size()); + std::sort(gvar_pairs.begin(), gvar_pairs.end()); + for (auto &gvar : gvar_pairs) { + gvars.push_back(gvar.second); + gvar_idxs.push_back(gvar.first); + } + + // Now commit the fvars, gvars, and idxs + auto T_psize = M.getDataLayout().getIntPtrType(M.getContext())->getPointerTo(); + emit_offset_table(M, fvars, "jl_fvars", T_psize); + emit_offset_table(M, gvars, "jl_gvars", T_psize); + auto fidxs = ConstantDataArray::get(M.getContext(), fvar_idxs); + auto fidxs_var = new GlobalVariable(M, fidxs->getType(), true, + GlobalVariable::ExternalLinkage, + fidxs, "jl_fvar_idxs"); + fidxs_var->setVisibility(GlobalValue::HiddenVisibility); + auto gidxs = ConstantDataArray::get(M.getContext(), gvar_idxs); + auto gidxs_var = new GlobalVariable(M, gidxs->getType(), true, + GlobalVariable::ExternalLinkage, + gidxs, "jl_gvar_idxs"); + gidxs_var->setVisibility(GlobalValue::HiddenVisibility); +} + +static void dropUnusedDeclarations(Module &M) { + SmallVector<GlobalValue *> unused; + for (auto &G : M.global_values()) { + if (G.isDeclaration()) { + if (G.use_empty()) { + unused.push_back(&G); + } else { + G.setDSOLocal(false); // These are never going to be seen in the same module again + G.setVisibility(GlobalValue::DefaultVisibility); + } + } + } + for (auto &G : unused) + G->eraseFromParent(); +} + +static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &outputs, StringRef name, + std::vector<NewArchiveMember> &unopt, std::vector<NewArchiveMember> &opt, + std::vector<NewArchiveMember> &obj, std::vector<NewArchiveMember> &asm_, + bool unopt_out, bool opt_out, bool obj_out, bool asm_out, + unsigned threads) { + uint64_t start = 0, end = 0; + unsigned outcount = unopt_out + opt_out + obj_out + asm_out; + assert(outcount); + outputs.resize(outputs.size() + outcount * threads * 2); + unopt.resize(unopt.size() + unopt_out * threads); + opt.resize(opt.size() + opt_out * threads); + obj.resize(obj.size() + obj_out * threads); + asm_.resize(asm_.size() + asm_out * threads); + if (threads == 1) { + start = jl_hrtime(); + add_output_impl(M, TM, outputs.data() + outputs.size() - outcount * 2, name, + unopt_out ? unopt.data() + unopt.size() - 1 : nullptr, + opt_out ? opt.data() + opt.size() - 1 : nullptr, + obj_out ? obj.data() + obj.size() - 1 : nullptr, + asm_out ? asm_.data() + asm_.size() - 1 : nullptr); + end = jl_hrtime(); + dbgs() << "Time to add output: " << (end - start) / 1e9 << "s\n"; + return; + } + + start = jl_hrtime(); + uint64_t counter = 0; + for (auto &G : M.global_values()) { + if (!G.isDeclaration() && !G.hasName()) { + G.setName("jl_ext_" + Twine(counter++)); + } + } + auto partitions = partitionModule(M, threads); + end = jl_hrtime(); + dbgs() << "Time to partition module: " << (end - start) / 1e9 << "s\n"; + start = jl_hrtime(); + auto serialized = serializeModule(M); + end = jl_hrtime(); + dbgs() << "Time to serialize module: " << (end - start) / 1e9 << "s\n"; + + auto outstart = outputs.data() + outputs.size() - outcount * threads * 2; + auto unoptstart = unopt_out ? unopt.data() + unopt.size() - threads : nullptr; + auto optstart = opt_out ? 
opt.data() + opt.size() - threads : nullptr; + auto objstart = obj_out ? obj.data() + obj.size() - threads : nullptr; + auto asmstart = asm_out ? asm_.data() + asm_.size() - threads : nullptr; + + std::vector<std::thread> workers(threads); + for (unsigned i = 0; i < threads; i++) { + workers[i] = std::thread([&, i](){ + LLVMContext ctx; + uint64_t start = 0; + uint64_t end = 0; + start = jl_hrtime(); + auto M = cantFail(getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx), "Error loading module"); + end = jl_hrtime(); + dbgs() << "Deserialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; + + dbgs() << "Starting shard " << i << " with weight=" << partitions[i].weight << "\n"; + + start = jl_hrtime(); + materializePreserved(*M, partitions[i]); + end = jl_hrtime(); + dbgs() << "Materialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; + + start = jl_hrtime(); + construct_vars(*M, partitions[i]); + M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), "_" + std::to_string(i))); + end = jl_hrtime(); + + dbgs() << "Construction time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; + + start = jl_hrtime(); + dropUnusedDeclarations(*M); + end = jl_hrtime(); + + dbgs() << "Declaration deletion time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; + + start = jl_hrtime(); + add_output_impl(*M, TM, outstart + i * outcount * 2, name, + unoptstart ? unoptstart + i : nullptr, + optstart ? optstart + i : nullptr, + objstart ? objstart + i : nullptr, + asmstart ? asmstart + i : nullptr); + end = jl_hrtime(); + + dbgs() << "Output time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; + }); + } + + start = jl_hrtime(); + for (auto &w : workers) + w.join(); + end = jl_hrtime(); + + dbgs() << "Total time for parallel output: " << (end - start) / 1e9 << "s\n"; +} + +unsigned compute_image_thread_count(Module &M) { + // 32-bit systems are very memory-constrained +#ifdef _P32 + return 1; +#endif + unsigned threads = std::max(llvm::hardware_concurrency().compute_thread_count() / 2, 1u); + + // memory limit check + // many threads use a lot of memory, so limit on constrained memory systems + size_t available = uv_get_available_memory(); + size_t weight = 0; + for (auto &GV : M.global_values()) { + if (GV.isDeclaration()) + continue; + if (isa<Function>(GV)) { + weight += getFunctionWeight(cast<Function>(GV)); + } else { + weight += 1; + } + } + if (weight == 0) { + dbgs() << "No globals in module, using 1 thread\n"; + return 1; + } + // crude estimate, available / (weight * fudge factor) = max threads + size_t fudge = 10; + unsigned max_threads = std::max(available / (weight * fudge), (size_t)1); + dbgs() << "Weight: " << weight << ", available: " << available << ", wanted: " << threads << ", max threads: " << max_threads << "\n"; + threads = std::min(threads, max_threads); + + // environment variable override + const char *env_threads = getenv("JULIA_IMAGE_THREADS"); + if (env_threads) { + char *endptr; + unsigned long requested = strtoul(env_threads, &endptr, 10); + if (*endptr || !requested) { + jl_safe_printf("WARNING: invalid value '%s' for JULIA_IMAGE_THREADS\n", env_threads); + } else { + threads = requested; + } + } + + return threads; +} + // takes the running content that has collected in the shadow module and dump it to disk // this builds the object file portion of the sysimage files for fast startup extern "C" JL_DLLEXPORT @@ -584,6 +1148,11 @@ void 
jl_dump_native_impl(void *native_code, uint64_t end = 0; JL_TIMING(NATIVE_DUMP); jl_native_code_desc_t *data = (jl_native_code_desc_t*)native_code; + if (!bc_fname && !unopt_bc_fname && !obj_fname && !asm_fname) { + dbgs() << "No output requested, skipping native code dump?\n"; + delete data; + return; + } auto TSCtx = data->M.getContext(); auto lock = TSCtx.getLock(); LLVMContext &Context = *TSCtx.getContext(); @@ -646,7 +1215,7 @@ void jl_dump_native_impl(void *native_code, start = jl_hrtime(); - unsigned threads = 1; + unsigned threads = compute_image_thread_count(*dataM); unsigned nfvars = 0; unsigned ngvars = 0; @@ -693,7 +1262,7 @@ void jl_dump_native_impl(void *native_code, true, GlobalVariable::ExternalLinkage, jlRTLD_DEFAULT_var, - "jl_RTLD_DEFAULT_handle_pointer")); + "jl_RTLD_DEFAULT_handle_pointer"), TheTriple); } end = jl_hrtime(); @@ -702,101 +1271,14 @@ void jl_dump_native_impl(void *native_code, start = jl_hrtime(); - // do the actual work - auto add_output = [&] (Module &M, StringRef unopt_bc_Name, StringRef bc_Name, StringRef obj_Name, StringRef asm_Name) { - - auto TM = std::unique_ptr<TargetMachine>( - SourceTM->getTarget().createTargetMachine( - SourceTM->getTargetTriple().str(), - SourceTM->getTargetCPU(), - SourceTM->getTargetFeatureString(), - SourceTM->Options, - SourceTM->getRelocationModel(), - SourceTM->getCodeModel(), - SourceTM->getOptLevel())); - - if (unopt_bc_fname) { - SmallVector<char, 0> Buffer; - raw_svector_ostream OS(Buffer); - PassBuilder PB; - AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; - ModulePassManager MPM; - MPM.addPass(BitcodeWriterPass(OS)); - emit_result(unopt_bc_Archive, Buffer, unopt_bc_Name, outputs); - } - if (!bc_fname && !obj_fname && !asm_fname) { - return; - } - assert(!verifyModule(M, &errs())); - - uint64_t start = jl_hrtime(); - end = 0; - -#ifndef JL_USE_NEW_PM - legacy::PassManager optimizer; - addTargetPasses(&optimizer, TM->getTargetTriple(), TM->getTargetIRAnalysis()); - addOptimizationPasses(&optimizer, jl_options.opt_level, true, true); - addMachinePasses(&optimizer, jl_options.opt_level); -#else - - auto PMTM = std::unique_ptr<TargetMachine>( - SourceTM->getTarget().createTargetMachine( - SourceTM->getTargetTriple().str(), - SourceTM->getTargetCPU(), - SourceTM->getTargetFeatureString(), - SourceTM->Options, - SourceTM->getRelocationModel(), - SourceTM->getCodeModel(), - SourceTM->getOptLevel())); - NewPM optimizer{std::move(PMTM), getOptLevel(jl_options.opt_level), OptimizationOptions::defaults(true, true)}; -#endif - optimizer.run(M); - assert(!verifyModule(M, &errs())); - - end = jl_hrtime(); - - dbgs() << "optimize time: " << (end - start) / 1e9 << "s\n"; - - if (bc_fname) { - SmallVector<char, 0> Buffer; - raw_svector_ostream OS(Buffer); - PassBuilder PB; - AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; - ModulePassManager MPM; - MPM.addPass(BitcodeWriterPass(OS)); - emit_result(bc_Archive, Buffer, bc_Name, outputs); - } - - start = jl_hrtime(); - - if (obj_fname) { - SmallVector<char, 0> Buffer; - raw_svector_ostream OS(Buffer); - legacy::PassManager emitter; - addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis()); - if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_ObjectFile, false)) - jl_safe_printf("ERROR: target does not support generation of object files\n"); - emitter.run(M); - emit_result(obj_Archive, Buffer, obj_Name, outputs); - } - - end = jl_hrtime(); - - dbgs() << "codegen time: " << (end - start) / 1e9 << "s\n"; - - if (asm_fname) { - SmallVector<char, 0> 
Buffer; - raw_svector_ostream OS(Buffer); - legacy::PassManager emitter; - addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis()); - if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_AssemblyFile, false)) - jl_safe_printf("ERROR: target does not support generation of assembly files\n"); - emitter.run(M); - emit_result(asm_Archive, Buffer, asm_Name, outputs); - } - }; - - add_output(*dataM, "unopt.bc", "text.bc", "text.o", "text.s"); + auto compile = [&](Module &M, StringRef name, unsigned threads) { add_output( + M, *SourceTM, outputs, name, + unopt_bc_Archive, bc_Archive, obj_Archive, asm_Archive, + !!unopt_bc_fname, !!bc_fname, !!obj_fname, !!asm_fname, + threads + ); }; + + compile(*dataM, "text", threads); end = jl_hrtime(); @@ -804,8 +1286,7 @@ void jl_dump_native_impl(void *native_code, start = jl_hrtime(); - orc::ThreadSafeModule sysimage(std::make_unique<Module>("sysimage", Context), TSCtx); - auto sysimageM = sysimage.getModuleUnlocked(); + auto sysimageM = std::make_unique<Module>("sysimage", Context); sysimageM->setTargetTriple(dataM->getTargetTriple()); sysimageM->setDataLayout(dataM->getDataLayout()); #if JL_LLVM_VERSION >= 130000 @@ -846,13 +1327,15 @@ void jl_dump_native_impl(void *native_code, if (sysimg_data) { Constant *data = ConstantDataArray::get(Context, ArrayRef<uint8_t>((const unsigned char*)sysimg_data, sysimg_len)); - addComdat(new GlobalVariable(*sysimageM, data->getType(), false, + auto sysdata = new GlobalVariable(*sysimageM, data->getType(), false, GlobalVariable::ExternalLinkage, - data, "jl_system_image_data"))->setAlignment(Align(64)); + data, "jl_system_image_data"); + sysdata->setAlignment(Align(64)); + addComdat(sysdata, TheTriple); Constant *len = ConstantInt::get(T_size, sysimg_len); addComdat(new GlobalVariable(*sysimageM, len->getType(), true, GlobalVariable::ExternalLinkage, - len, "jl_system_image_size")); + len, "jl_system_image_size"), TheTriple); } if (imaging_mode) { auto specs = jl_get_llvm_clone_targets(); @@ -886,13 +1369,13 @@ void jl_dump_native_impl(void *native_code, ConstantExpr::getBitCast(target_ids, T_psize) }), "jl_image_pointers"); - addComdat(pointers); + addComdat(pointers, TheTriple); if (s) { write_int32(s, data.size()); ios_write(s, (const char *)data.data(), data.size()); } } - add_output(*sysimageM, "data.bc", "data.bc", "data.o", "data.s"); + compile(*sysimageM, "data", 1); end = jl_hrtime(); diff --git a/src/llvm-codegen-shared.h b/src/llvm-codegen-shared.h index e0edb792d7645..732871b12ff23 100644 --- a/src/llvm-codegen-shared.h +++ b/src/llvm-codegen-shared.h @@ -449,4 +449,156 @@ inline Attribute getAttributeAtIndex(const AttributeList &L, unsigned Index, Att return L.getAttribute(Index, Kind); #endif } + +// Iterate through uses of a particular type. +// Recursively scan through `ConstantExpr` and `ConstantAggregate` uses. +template<typename U> +struct ConstantUses { + template<typename T> + struct Info { + llvm::Use *use; + T *val; + // If `samebits == true`, the offset at which the original value appears in the constant. + size_t offset; + // This specifies whether the original value appears in the current value in exactly + // the same bit pattern (with possibly an offset determined by `offset`).
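+    // For example (illustrative): a use of `@f` that is reached through
+    // `ptrtoint (ptr @f to i64)` placed in the second i64 field of a constant struct
+    // still contains the bits of `@f` unchanged, so `samebits` is true and `offset`
+    // is 8 (the field's byte offset); a use only reachable through a truncating or
+    // arithmetic ConstantExpr has `samebits == false`, and `offset` is then meaningless.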
+ bool samebits; + Info(llvm::Use *use, T *val, size_t offset, bool samebits) : + use(use), + val(val), + offset(offset), + samebits(samebits) + { + } + Info(llvm::Use *use, size_t offset, bool samebits) : + use(use), + val(cast<T>(use->getUser())), + offset(offset), + samebits(samebits) + { + } + }; + using UseInfo = Info<U>; + struct Frame : Info<llvm::Constant> { + template<typename... Args> + Frame(Args &&... args) : + Info<llvm::Constant>(std::forward<Args>(args)...), + cur(this->val->use_empty() ? nullptr : &*this->val->use_begin()), + _next(cur ? cur->getNext() : nullptr) + { + } + private: + void next() + { + cur = _next; + if (!cur) + return; + _next = cur->getNext(); + } + llvm::Use *cur; + llvm::Use *_next; + friend struct ConstantUses; + }; + ConstantUses(llvm::Constant *c, llvm::Module &M) + : stack{Frame(nullptr, c, 0u, true)}, + M(M) + { + forward(); + } + UseInfo get_info() const + { + auto &top = stack.back(); + return UseInfo(top.cur, top.offset, top.samebits); + } + const auto &get_stack() const + { + return stack; + } + void next() + { + stack.back().next(); + forward(); + } + bool done() + { + return stack.empty(); + } +private: + void forward(); + llvm::SmallVector<Frame, 4> stack; + llvm::Module &M; +}; + +template<typename U> +void ConstantUses<U>::forward() +{ + assert(!stack.empty()); + auto frame = &stack.back(); + const auto &DL = M.getDataLayout(); + auto pop = [&] { + stack.pop_back(); + if (stack.empty()) { + return false; + } + frame = &stack.back(); + return true; + }; + auto push = [&] (llvm::Use *use, llvm::Constant *c, size_t offset, bool samebits) { + stack.emplace_back(use, c, offset, samebits); + frame = &stack.back(); + }; + auto handle_constaggr = [&] (llvm::Use *use, llvm::ConstantAggregate *aggr) { + if (!frame->samebits) { + push(use, aggr, 0, false); + return; + } + if (auto strct = dyn_cast<llvm::ConstantStruct>(aggr)) { + auto layout = DL.getStructLayout(strct->getType()); + push(use, strct, frame->offset + layout->getElementOffset(use->getOperandNo()), true); + } + else if (auto ary = dyn_cast<llvm::ConstantArray>(aggr)) { + auto elty = ary->getType()->getElementType(); + push(use, ary, frame->offset + DL.getTypeAllocSize(elty) * use->getOperandNo(), true); + } + else if (auto vec = dyn_cast<llvm::ConstantVector>(aggr)) { + auto elty = vec->getType()->getElementType(); + push(use, vec, frame->offset + DL.getTypeAllocSize(elty) * use->getOperandNo(), true); + } + else { + abort(); + } + }; + auto handle_constexpr = [&] (llvm::Use *use, llvm::ConstantExpr *expr) { + if (!frame->samebits) { + push(use, expr, 0, false); + return; + } + auto opcode = expr->getOpcode(); + if (opcode == llvm::Instruction::PtrToInt || opcode == llvm::Instruction::IntToPtr || + opcode == llvm::Instruction::AddrSpaceCast || opcode == llvm::Instruction::BitCast) { + push(use, expr, frame->offset, true); + } + else { + push(use, expr, 0, false); + } + }; + while (true) { + auto use = frame->cur; + if (!use) { + if (!pop()) + return; + continue; + } + auto user = use->getUser(); + if (isa<U>(user)) + return; + frame->next(); + if (auto aggr = dyn_cast<llvm::ConstantAggregate>(user)) { + handle_constaggr(use, aggr); + } + else if (auto expr = dyn_cast<llvm::ConstantExpr>(user)) { + handle_constexpr(use, expr); + } + } +} } diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index 44c83502e0537..a172579f8ae4b 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -64,160 +64,6 @@ Value *map_get(T &&vmap, Value *key, Value 
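This llvm-multiversioning.cpp hunk drops the file's private copy of ConstantUses now that it lives in llvm-codegen-shared.h. For orientation, the pass drives that iterator roughly like this (a usage sketch patterned after the `ConstantUses<Instruction>(...).done()` check later in this series; `C` and `M` stand for any constant and its enclosing module):

    // Walk every use of C whose (possibly indirect) user is an Instruction,
    // looking through ConstantExpr and ConstantAggregate wrappers.
    for (ConstantUses<Instruction> uses(C, M); !uses.done(); uses.next()) {
        auto info = uses.get_info();
        Instruction *user = info.val; // the instruction at the end of the chain
        (void)user;
    }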
*def=nullptr) return val; } -// Iterate through uses of a particular type. -// Recursively scan through `ConstantExpr` and `ConstantAggregate` use. -template<typename U> -struct ConstantUses { - template<typename T> - struct Info { - Use *use; - T *val; - // If `samebits == true`, the offset the original value appears in the constant. - size_t offset; - // This specify whether the original value appears in the current value in exactly - // the same bit pattern (with possibly an offset determined by `offset`). - bool samebits; - Info(Use *use, T *val, size_t offset, bool samebits) : - use(use), - val(val), - offset(offset), - samebits(samebits) - { - } - Info(Use *use, size_t offset, bool samebits) : - use(use), - val(cast<T>(use->getUser())), - offset(offset), - samebits(samebits) - { - } - }; - using UseInfo = Info<U>; - struct Frame : Info<Constant> { - template<typename... Args> - Frame(Args &&... args) : - Info<Constant>(std::forward<Args>(args)...), - cur(this->val->use_empty() ? nullptr : &*this->val->use_begin()), - _next(cur ? cur->getNext() : nullptr) - { - } - private: - void next() - { - cur = _next; - if (!cur) - return; - _next = cur->getNext(); - } - Use *cur; - Use *_next; - friend struct ConstantUses; - }; - ConstantUses(Constant *c, Module &M) - : stack{Frame(nullptr, c, 0u, true)}, - M(M) - { - forward(); - } - UseInfo get_info() const - { - auto &top = stack.back(); - return UseInfo(top.cur, top.offset, top.samebits); - } - const SmallVector<Frame, 4> &get_stack() const - { - return stack; - } - void next() - { - stack.back().next(); - forward(); - } - bool done() - { - return stack.empty(); - } -private: - void forward(); - SmallVector<Frame, 4> stack; - Module &M; -}; - -template<typename U> -void ConstantUses<U>::forward() -{ - assert(!stack.empty()); - auto frame = &stack.back(); - const DataLayout &DL = M.getDataLayout(); - auto pop = [&] { - stack.pop_back(); - if (stack.empty()) { - return false; - } - frame = &stack.back(); - return true; - }; - auto push = [&] (Use *use, Constant *c, size_t offset, bool samebits) { - stack.emplace_back(use, c, offset, samebits); - frame = &stack.back(); - }; - auto handle_constaggr = [&] (Use *use, ConstantAggregate *aggr) { - if (!frame->samebits) { - push(use, aggr, 0, false); - return; - } - if (auto strct = dyn_cast<ConstantStruct>(aggr)) { - auto layout = DL.getStructLayout(strct->getType()); - push(use, strct, frame->offset + layout->getElementOffset(use->getOperandNo()), true); - } - else if (auto ary = dyn_cast<ConstantArray>(aggr)) { - auto elty = ary->getType()->getElementType(); - push(use, ary, frame->offset + DL.getTypeAllocSize(elty) * use->getOperandNo(), true); - } - else if (auto vec = dyn_cast<ConstantVector>(aggr)) { - auto elty = vec->getType()->getElementType(); - push(use, vec, frame->offset + DL.getTypeAllocSize(elty) * use->getOperandNo(), true); - } - else { - jl_safe_printf("Unknown ConstantAggregate:\n"); - llvm_dump(aggr); - abort(); - } - }; - auto handle_constexpr = [&] (Use *use, ConstantExpr *expr) { - if (!frame->samebits) { - push(use, expr, 0, false); - return; - } - auto opcode = expr->getOpcode(); - if (opcode == Instruction::PtrToInt || opcode == Instruction::IntToPtr || - opcode == Instruction::AddrSpaceCast || opcode == Instruction::BitCast) { - push(use, expr, frame->offset, true); - } - else { - push(use, expr, 0, false); - } - }; - while (true) { - auto use = frame->cur; - if (!use) { - if (!pop()) - return; - continue; - } - auto user = use->getUser(); - if (isa<U>(user)) - return; - 
frame->next(); - if (auto aggr = dyn_cast<ConstantAggregate>(user)) { - handle_constaggr(use, aggr); - } - else if (auto expr = dyn_cast<ConstantExpr>(user)) { - handle_constexpr(use, expr); - } - } -} - static bool is_vector(FunctionType *ty) { if (ty->getReturnType()->isVectorTy()) @@ -574,7 +420,6 @@ void CloneCtx::prepare_slots() assert(F->hasFnAttribute("julia.mv.clones")); if (F->isDeclaration()) { auto GV = new GlobalVariable(M, F->getType(), false, GlobalValue::ExternalLinkage, nullptr, F->getName() + ".reloc_slot"); - GV->setVisibility(GlobalValue::HiddenVisibility); extern_relocs[F] = GV; } else { auto id = get_func_id(F); diff --git a/src/processor.cpp b/src/processor.cpp index 55b2cd2b4ab55..3a791778a3b21 100644 --- a/src/processor.cpp +++ b/src/processor.cpp @@ -647,7 +647,7 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback) std::vector<std::pair<uint32_t, const char *>> clones; for (unsigned i = 0; i < pointers->header->nshards; i++) { - auto shard = pointers->shards[0]; + auto shard = pointers->shards[i]; // .data base char *data_base = (char *)shard.gvar_base; @@ -657,6 +657,7 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback) const int32_t *offsets = shard.fvar_offsets; uint32_t nfunc = offsets[0]; + assert(nfunc <= pointers->header->nfvars); offsets++; const int32_t *reloc_slots = shard.clone_slots; const uint32_t nreloc = reloc_slots[0]; @@ -747,6 +748,7 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback) auto gidxs = shard.gvar_idxs; unsigned ngvars = shard.gvar_offsets[0]; + assert(ngvars <= pointers->header->ngvars); for (uint32_t i = 0; i < ngvars; i++) { gvars[gidxs[i]] = data_base + shard.gvar_offsets[i+1]; } @@ -756,6 +758,7 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback) auto offsets = (int32_t *) malloc(sizeof(int32_t) * fvars.size()); res.fptrs.base = fvars[0]; for (size_t i = 0; i < fvars.size(); i++) { + assert(fvars[i] && "Missing function pointer!"); offsets[i] = fvars[i] - res.fptrs.base; } res.fptrs.offsets = offsets; @@ -766,12 +769,14 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback) auto offsets = (int32_t *) malloc(sizeof(int32_t) * gvars.size()); res.gvars_base = (uintptr_t *)gvars[0]; for (size_t i = 0; i < gvars.size(); i++) { + assert(gvars[i] && "Missing global variable pointer!"); offsets[i] = gvars[i] - (const char *)res.gvars_base; } res.gvars_offsets = offsets; } if (!clones.empty()) { + assert(!fvars.empty()); std::sort(clones.begin(), clones.end()); auto clone_offsets = (int32_t *) malloc(sizeof(int32_t) * clones.size()); auto clone_idxs = (uint32_t *) malloc(sizeof(uint32_t) * clones.size()); From 4ad943da621f5d696ebfdf853ac03aa742edec65 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Fri, 6 Jan 2023 20:19:12 -0500 Subject: [PATCH 09/34] Don't try to extract indexes during partitioning --- src/aotcompile.cpp | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 8ef715235fb04..85e7481b21722 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -568,34 +568,27 @@ struct Partition { static void get_fvars_gvars(Module &M, DenseMap<GlobalValue *, unsigned> &fvars, DenseMap<GlobalValue *, unsigned> &gvars) { auto fvars_gv = M.getGlobalVariable("jl_fvars"); auto gvars_gv = M.getGlobalVariable("jl_gvars"); - assert(fvars_gv); - assert(gvars_gv); - auto fvars_init = cast<ConstantArray>(fvars_gv->getInitializer()); - auto gvars_init = 
cast<ConstantArray>(gvars_gv->getInitializer()); - std::string suffix; - if (auto md = M.getModuleFlag("julia.mv.suffix")) { - suffix = cast<MDString>(md)->getString().str(); - } auto fvars_idxs = M.getGlobalVariable("jl_fvar_idxs"); auto gvars_idxs = M.getGlobalVariable("jl_gvar_idxs"); + assert(fvars_gv); + assert(gvars_gv); assert(fvars_idxs); assert(gvars_idxs); - auto fvars_idxs_init = cast<ConstantDataArray>(fvars_idxs->getInitializer()); - auto gvars_idxs_init = cast<ConstantDataArray>(gvars_idxs->getInitializer()); + auto fvars_init = cast<ConstantArray>(fvars_gv->getInitializer()); + auto gvars_init = cast<ConstantArray>(gvars_gv->getInitializer()); for (unsigned i = 0; i < fvars_init->getNumOperands(); ++i) { auto gv = cast<GlobalValue>(fvars_init->getOperand(i)->stripPointerCasts()); - auto idx = fvars_idxs_init->getElementAsInteger(i); - fvars[gv] = idx; + fvars[gv] = i; } for (unsigned i = 0; i < gvars_init->getNumOperands(); ++i) { auto gv = cast<GlobalValue>(gvars_init->getOperand(i)->stripPointerCasts()); - auto idx = gvars_idxs_init->getElementAsInteger(i); - gvars[gv] = idx; + gvars[gv] = i; } fvars_gv->eraseFromParent(); gvars_gv->eraseFromParent(); fvars_idxs->eraseFromParent(); gvars_idxs->eraseFromParent(); + dbgs() << "Finished getting fvars/gvars\n"; } static size_t getFunctionWeight(const Function &F) From d717fa7023ab104290a136b955511c38e1416a4c Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Fri, 6 Jan 2023 20:28:00 -0500 Subject: [PATCH 10/34] Fix whitespace --- src/aotcompile.cpp | 10 +++++----- src/llvm-multiversioning.cpp | 2 +- src/processor.cpp | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 85e7481b21722..233e94bf13346 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -719,7 +719,7 @@ static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) { for (unsigned i = 0; i < threads; ++i) { pq.push(&partitions[i]); } - + // Assign the root of each partition to a partition, then assign its children to the same one for (unsigned i = 0; i < partitioner.nodes.size(); ++i) { auto root = partitioner.find(i); @@ -1011,7 +1011,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o dbgs() << "Time to add output: " << (end - start) / 1e9 << "s\n"; return; } - + start = jl_hrtime(); uint64_t counter = 0; for (auto &G : M.global_values()) { @@ -1050,7 +1050,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o materializePreserved(*M, partitions[i]); end = jl_hrtime(); dbgs() << "Materialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; - + start = jl_hrtime(); construct_vars(*M, partitions[i]); M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), "_" + std::to_string(i))); @@ -1270,7 +1270,7 @@ void jl_dump_native_impl(void *native_code, !!unopt_bc_fname, !!bc_fname, !!obj_fname, !!asm_fname, threads ); }; - + compile(*dataM, "text", threads); end = jl_hrtime(); @@ -1389,7 +1389,7 @@ void jl_dump_native_impl(void *native_code, if (asm_fname) handleAllErrors(writeArchive(asm_fname, asm_Archive, true, Kind, true, false), reportWriterError); - + end = jl_hrtime(); dbgs() << "archive time: " << (end - start) / 1e9 << "s\n"; diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index a172579f8ae4b..418201f0825a1 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -924,7 +924,7 @@ static 
bool runMultiVersioning(Module &M, bool allow_bad_fvars) return false; CloneCtx clone(M, allow_bad_fvars); - + clone.prepare_slots(); clone.clone_decls(); diff --git a/src/processor.cpp b/src/processor.cpp index 3a791778a3b21..851cbec62560a 100644 --- a/src/processor.cpp +++ b/src/processor.cpp @@ -636,7 +636,7 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback) const void *ids = pointers->target_data; uint32_t target_idx = callback(ids); - + if (pointers->header->version != 1) { jl_error("Image file is not compatible with this version of Julia"); } From fe0600d2dc74e1381ac3018fe12835fafc25d529 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Fri, 6 Jan 2023 20:38:39 -0500 Subject: [PATCH 11/34] Fix warnings --- src/aotcompile.cpp | 4 ++-- src/llvm-multiversioning.cpp | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 233e94bf13346..323577c693b51 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -606,8 +606,8 @@ static size_t getFunctionWeight(const Function &F) return weight; } - -static bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M) { +//Inline to fool gcc into not complaining about unused function when asserts are disabled +static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M) { StringMap<uint32_t> GVNames; bool bad = false; for (uint32_t i = 0; i < partitions.size(); i++) { diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index 418201f0825a1..971b0f338bdf8 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -131,7 +131,8 @@ static uint32_t collect_func_info(Function &F, bool &has_veccall) } // Check for BFloat16 when they are added to julia can be done here } - if (has_veccall && (flag & JL_TARGET_CLONE_SIMD) && (flag & JL_TARGET_CLONE_MATH)) { + uint32_t veccall_flags = JL_TARGET_CLONE_SIMD | JL_TARGET_CLONE_MATH | JL_TARGET_CLONE_CPU | JL_TARGET_CLONE_FLOAT16; + if (has_veccall && (flag & veccall_flags) == veccall_flags) { return flag; } } From bdf65f4b4e8a1c8f1a058fe8fc69a6b4c56acf80 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Fri, 6 Jan 2023 21:01:56 -0500 Subject: [PATCH 12/34] Set reloc slot to be external linkage --- src/llvm-multiversioning.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index 971b0f338bdf8..1a38511f34ffb 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -424,7 +424,7 @@ void CloneCtx::prepare_slots() extern_relocs[F] = GV; } else { auto id = get_func_id(F); - auto GV = new GlobalVariable(M, F->getType(), false, GlobalValue::InternalLinkage, Constant::getNullValue(F->getType()), F->getName() + ".reloc_slot"); + auto GV = new GlobalVariable(M, F->getType(), false, GlobalValue::ExternalLinkage, Constant::getNullValue(F->getType()), F->getName() + ".reloc_slot"); GV->setVisibility(GlobalValue::HiddenVisibility); const_relocs[id] = GV; } From 4fc5bed6b5fa58f056ee5ee87730bea2ac17fa8c Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Tue, 10 Jan 2023 00:41:53 -0500 Subject: [PATCH 13/34] Formalize printing more, correct module weight estimation with multiversioning --- src/aotcompile.cpp | 75 +++++++++++++++++++++++++----------- src/llvm-multiversioning.cpp | 6 +-- 2 files changed, 55 insertions(+), 26 deletions(-) diff 
--git a/src/aotcompile.cpp b/src/aotcompile.cpp index 323577c693b51..701ecdfc925e8 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -601,7 +601,10 @@ static size_t getFunctionWeight(const Function &F) // add some weight to it weight += F.size(); if (F.hasFnAttribute("julia.mv.clones")) { - weight *= F.getFnAttribute("julia.mv.clones").getValueAsString().count(',') + 1; + auto val = F.getFnAttribute("julia.mv.clones").getValueAsString(); + // base16, so must be at most 4 * length bits long + // popcount gives number of clones + weight *= APInt(val.size() * 4, val, 16).countPopulation() + 1; } return weight; } @@ -761,7 +764,8 @@ static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) { } static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *outputs, StringRef name, - NewArchiveMember *unopt, NewArchiveMember *opt, NewArchiveMember *obj, NewArchiveMember *asm_) { + NewArchiveMember *unopt, NewArchiveMember *opt, NewArchiveMember *obj, NewArchiveMember *asm_, + std::stringstream &stream, unsigned i) { auto TM = std::unique_ptr<TargetMachine>( SourceTM.getTarget().createTargetMachine( SourceTM.getTargetTriple().str(), @@ -814,7 +818,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out end = jl_hrtime(); - dbgs() << "optimize time: " << (end - start) / 1e9 << "s\n"; + stream << "optimize time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; if (opt) { raw_string_ostream OS(*outputs); @@ -847,7 +851,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out end = jl_hrtime(); - dbgs() << "codegen time: " << (end - start) / 1e9 << "s\n"; + stream << "codegen time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; if (asm_) { SmallVector<char, 0> Buffer; @@ -1002,11 +1006,14 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o asm_.resize(asm_.size() + asm_out * threads); if (threads == 1) { start = jl_hrtime(); + std::stringstream stream; add_output_impl(M, TM, outputs.data() + outputs.size() - outcount * 2, name, unopt_out ? unopt.data() + unopt.size() - 1 : nullptr, opt_out ? opt.data() + opt.size() - 1 : nullptr, obj_out ? obj.data() + obj.size() - 1 : nullptr, - asm_out ? asm_.data() + asm_.size() - 1 : nullptr); + asm_out ? asm_.data() + asm_.size() - 1 : nullptr, + stream, 0); + dbgs() << stream.str(); end = jl_hrtime(); dbgs() << "Time to add output: " << (end - start) / 1e9 << "s\n"; return; @@ -1034,6 +1041,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o auto asmstart = asm_out ? 
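The clone-count weighting added to getFunctionWeight at the top of this patch is easy to sanity-check by hand; a small self-contained example of the same arithmetic (the attribute value "11" is invented for illustration):

    #include <llvm/ADT/APInt.h>
    #include <llvm/ADT/StringRef.h>

    // Mirrors the expression in getFunctionWeight: "julia.mv.clones" is a hex bitmask
    // of clone targets, so popcount(bitmask) + 1 scales the base weight.
    static unsigned clone_weight_multiplier(llvm::StringRef val) {
        return llvm::APInt(val.size() * 4, val, 16).countPopulation() + 1;
    }
    // clone_weight_multiplier("11") == 3: 0x11 has two bits set (two clones) plus the original.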
asm_.data() + asm_.size() - threads : nullptr; std::vector<std::thread> workers(threads); + std::vector<std::stringstream> stderrs(threads); for (unsigned i = 0; i < threads; i++) { workers[i] = std::thread([&, i](){ LLVMContext ctx; @@ -1042,43 +1050,46 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o start = jl_hrtime(); auto M = cantFail(getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx), "Error loading module"); end = jl_hrtime(); - dbgs() << "Deserialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; + stderrs[i] << "Deserialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; - dbgs() << "Starting shard " << i << " with weight=" << partitions[i].weight << "\n"; + stderrs[i] << "Starting shard " << i << " with weight=" << partitions[i].weight << "\n"; start = jl_hrtime(); materializePreserved(*M, partitions[i]); end = jl_hrtime(); - dbgs() << "Materialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; + stderrs[i] << "Materialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; start = jl_hrtime(); construct_vars(*M, partitions[i]); M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), "_" + std::to_string(i))); end = jl_hrtime(); - dbgs() << "Construction time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; + stderrs[i] << "Construction time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; start = jl_hrtime(); dropUnusedDeclarations(*M); end = jl_hrtime(); - dbgs() << "Declaration deletion time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; + stderrs[i] << "Declaration deletion time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; start = jl_hrtime(); add_output_impl(*M, TM, outstart + i * outcount * 2, name, unoptstart ? unoptstart + i : nullptr, optstart ? optstart + i : nullptr, objstart ? objstart + i : nullptr, - asmstart ? asmstart + i : nullptr); + asmstart ? 
asmstart + i : nullptr, + stderrs[i], i); end = jl_hrtime(); - dbgs() << "Output time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; + stderrs[i] << "Output time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; }); } start = jl_hrtime(); for (auto &w : workers) w.join(); + for (auto &str : stderrs) + dbgs() << str.str(); end = jl_hrtime(); dbgs() << "Total time for parallel output: " << (end - start) / 1e9 << "s\n"; @@ -1087,32 +1098,46 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o unsigned compute_image_thread_count(Module &M) { // 32-bit systems are very memory-constrained #ifdef _P32 + dbgs() << "Threads: 1\n"; return 1; #endif - unsigned threads = std::max(llvm::hardware_concurrency().compute_thread_count() / 2, 1u); - - // memory limit check - // many threads use a lot of memory, so limit on constrained memory systems - size_t available = uv_get_available_memory(); size_t weight = 0; + size_t globals = 0; for (auto &GV : M.global_values()) { if (GV.isDeclaration()) continue; + globals++; if (isa<Function>(GV)) { weight += getFunctionWeight(cast<Function>(GV)); } else { weight += 1; } } - if (weight == 0) { - dbgs() << "No globals in module, using 1 thread\n"; + dbgs() << "Module weight: " << weight << "\n"; + if (weight < 1000) { + dbgs() << "Low module complexity bailout\n"; + dbgs() << "Threads: 1\n"; return 1; } + + unsigned threads = std::max(llvm::hardware_concurrency().compute_thread_count() / 2, 1u); + + // memory limit check + // many threads use a lot of memory, so limit on constrained memory systems + size_t available = uv_get_available_memory(); // crude estimate, available / (weight * fudge factor) = max threads size_t fudge = 10; unsigned max_threads = std::max(available / (weight * fudge), (size_t)1); - dbgs() << "Weight: " << weight << ", available: " << available << ", wanted: " << threads << ", max threads: " << max_threads << "\n"; - threads = std::min(threads, max_threads); + if (max_threads < threads) { + dbgs() << "Memory limiting threads to " << max_threads << "\n"; + threads = max_threads; + } + + max_threads = globals / 100; + if (max_threads < threads) { + dbgs() << "Low global count limiting threads to " << max_threads << " (" << globals << "globals)\n"; + threads = max_threads; + } // environment variable override const char *env_threads = getenv("JULIA_IMAGE_THREADS"); @@ -1122,10 +1147,15 @@ unsigned compute_image_thread_count(Module &M) { if (*endptr || !requested) { jl_safe_printf("WARNING: invalid value '%s' for JULIA_IMAGE_THREADS\n", env_threads); } else { + dbgs() << "Overriding threads to " << requested << "\n"; threads = requested; } } + threads = std::max(threads, 1u); + + dbgs() << "Threads: " << threads << "\n"; + return threads; } @@ -1208,7 +1238,7 @@ void jl_dump_native_impl(void *native_code, start = jl_hrtime(); - unsigned threads = compute_image_thread_count(*dataM); + unsigned threads = 1; unsigned nfvars = 0; unsigned ngvars = 0; @@ -1225,6 +1255,7 @@ void jl_dump_native_impl(void *native_code, } } } + threads = compute_image_thread_count(*dataM); nfvars = data->jl_sysimg_fvars.size(); ngvars = data->jl_sysimg_gvars.size(); emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_gvars", T_psize); diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index 1a38511f34ffb..527c17e826ce9 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -313,8 +313,6 @@ struct CloneCtx { // Map from original function to one based index in `fvars` 
std::map<const Function*,uint32_t> func_ids{}; std::vector<Function*> orig_funcs{}; - std::vector<uint32_t> func_infos{}; - std::set<Function*> cloned{}; // GV addresses and their corresponding function id (i.e. 0-based index in `fvars`) std::vector<std::pair<Constant*,uint32_t>> gv_relocs{}; // Mapping from function id (i.e. 0-based index in `fvars`) to GVs to be initialized. @@ -650,7 +648,7 @@ void CloneCtx::fix_gv_uses() return changed; }; for (auto orig_f: orig_funcs) { - if (groups.size() == 1 && !cloned.count(orig_f)) + if (!orig_f->hasFnAttribute("julia.mv.clones")) continue; while (single_pass(orig_f)) { } @@ -813,7 +811,7 @@ void CloneCtx::emit_metadata() std::set<uint32_t> shared_relocs; { auto T_int32 = Type::getInt32Ty(M.getContext()); - std::stable_sort(gv_relocs.begin(), gv_relocs.end(), + std::sort(gv_relocs.begin(), gv_relocs.end(), [] (const std::pair<Constant*,uint32_t> &lhs, const std::pair<Constant*,uint32_t> &rhs) { return lhs.second < rhs.second; From 0c68e4af4d80e4dccfb682274b3130a699be2309 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Thu, 26 Jan 2023 16:14:42 -0500 Subject: [PATCH 14/34] Alter naming, sort partitions --- src/Makefile | 2 +- src/aotcompile.cpp | 64 +++++++++++++++++++++++++++++----------------- 2 files changed, 42 insertions(+), 24 deletions(-) diff --git a/src/Makefile b/src/Makefile index bb98f6766f470..dea033c0661d9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -287,7 +287,7 @@ $(BUILDDIR)/julia_flisp.boot: $(addprefix $(SRCDIR)/,jlfrontend.scm flisp/aliase # additional dependency links $(BUILDDIR)/codegen-stubs.o $(BUILDDIR)/codegen-stubs.dbg.obj: $(SRCDIR)/intrinsics.h -$(BUILDDIR)/aotcompile.o $(BUILDDIR)/aotcompile.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/llvm-codegen-shared.h +$(BUILDDIR)/aotcompile.o $(BUILDDIR)/aotcompile.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/llvm-codegen-shared.h $(SRCDIR)/processor.h $(BUILDDIR)/ast.o $(BUILDDIR)/ast.dbg.obj: $(BUILDDIR)/julia_flisp.boot.inc $(SRCDIR)/flisp/*.h $(BUILDDIR)/builtins.o $(BUILDDIR)/builtins.dbg.obj: $(SRCDIR)/iddict.c $(SRCDIR)/builtin_proto.h $(BUILDDIR)/codegen.o $(BUILDDIR)/codegen.dbg.obj: $(addprefix $(SRCDIR)/,\ diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 701ecdfc925e8..7eeaeb94cf2da 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -66,6 +66,7 @@ using namespace llvm; #include "serialize.h" #include "julia_assert.h" #include "llvm-codegen-shared.h" +#include "processor.h" #define DEBUG_TYPE "julia_aotcompile" @@ -723,9 +724,19 @@ static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) { pq.push(&partitions[i]); } + std::vector<unsigned> idxs(partitioner.nodes.size()); + std::iota(idxs.begin(), idxs.end(), 0); + std::sort(idxs.begin(), idxs.end(), [&](unsigned a, unsigned b) { + //because roots have more weight than their children, + //we can sort by weight and get the roots first + return partitioner.nodes[a].weight > partitioner.nodes[b].weight; + }); + // Assign the root of each partition to a partition, then assign its children to the same one - for (unsigned i = 0; i < partitioner.nodes.size(); ++i) { + for (unsigned idx = 0; idx < idxs.size(); ++idx) { + auto i = idxs[idx]; auto root = partitioner.find(i); + assert(root == i || partitioner.nodes[root].GV == nullptr); if (partitioner.nodes[root].GV) { auto &node = partitioner.nodes[root]; auto &P = *pq.top(); @@ -763,9 +774,10 @@ static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) { return partitions; } -static 
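The root-sorting hunk earlier in this patch turns the assignment loop into classic greedy load balancing: the heaviest remaining root always goes to the currently lightest partition. A minimal standalone sketch of just that balancing step, with the union-find and GlobalValue bookkeeping stripped out (names invented):

    #include <algorithm>
    #include <functional>
    #include <queue>
    #include <utility>
    #include <vector>

    // Returns the total weight that ends up in each of `threads` bins.
    static std::vector<size_t> greedy_balance(std::vector<size_t> weights, unsigned threads) {
        using Bin = std::pair<size_t, unsigned>;            // (current weight, bin index)
        auto heavier = [](const Bin &a, const Bin &b) { return a.first > b.first; };
        std::priority_queue<Bin, std::vector<Bin>, decltype(heavier)> lightest_first(heavier);
        for (unsigned i = 0; i < threads; i++)
            lightest_first.push({0, i});
        std::sort(weights.begin(), weights.end(), std::greater<size_t>()); // heaviest root first
        std::vector<size_t> totals(threads, 0);
        for (size_t w : weights) {
            Bin bin = lightest_first.top();                 // least-loaded bin so far
            lightest_first.pop();
            totals[bin.second] += w;
            lightest_first.push({totals[bin.second], bin.second});
        }
        return totals;
    }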
void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *outputs, StringRef name, +static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *outputs, ArrayRef<StringRef> names, NewArchiveMember *unopt, NewArchiveMember *opt, NewArchiveMember *obj, NewArchiveMember *asm_, std::stringstream &stream, unsigned i) { + assert(names.size() == 4); auto TM = std::unique_ptr<TargetMachine>( SourceTM.getTarget().createTargetMachine( SourceTM.getTargetTriple().str(), @@ -782,9 +794,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; ModulePassManager MPM; MPM.addPass(BitcodeWriterPass(OS)); - outputs++; - *outputs = (name + "_unopt.bc").str(); - *unopt = NewArchiveMember(MemoryBufferRef(OS.str(), *outputs)); + *unopt = NewArchiveMember(MemoryBufferRef(*outputs, names[0])); outputs++; } if (!opt && !obj && !asm_) { @@ -826,9 +836,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; ModulePassManager MPM; MPM.addPass(BitcodeWriterPass(OS)); - outputs++; - *outputs = (name + "_opt.bc").str(); - *opt = NewArchiveMember(MemoryBufferRef(OS.str(), *outputs)); + *opt = NewArchiveMember(MemoryBufferRef(*outputs, names[1])); outputs++; } @@ -843,9 +851,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out jl_safe_printf("ERROR: target does not support generation of object files\n"); emitter.run(M); *outputs = { Buffer.data(), Buffer.size() }; - outputs++; - *outputs = (name + ".o").str(); - *obj = NewArchiveMember(MemoryBufferRef(outputs[-1], *outputs)); + *obj = NewArchiveMember(MemoryBufferRef(*outputs, names[2])); outputs++; } @@ -862,9 +868,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out jl_safe_printf("ERROR: target does not support generation of assembly files\n"); emitter.run(M); *outputs = { Buffer.data(), Buffer.size() }; - outputs++; - *outputs = (name + ".s").str(); - *asm_ = NewArchiveMember(MemoryBufferRef(outputs[-1], *outputs)); + *asm_ = NewArchiveMember(MemoryBufferRef(*outputs, names[3])); outputs++; } } @@ -991,7 +995,7 @@ static void dropUnusedDeclarations(Module &M) { G->eraseFromParent(); } -static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &outputs, StringRef name, +static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &outputs, ArrayRef<StringRef> names, std::vector<NewArchiveMember> &unopt, std::vector<NewArchiveMember> &opt, std::vector<NewArchiveMember> &obj, std::vector<NewArchiveMember> &asm_, bool unopt_out, bool opt_out, bool obj_out, bool asm_out, @@ -999,7 +1003,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o uint64_t start = 0, end = 0; unsigned outcount = unopt_out + opt_out + obj_out + asm_out; assert(outcount); - outputs.resize(outputs.size() + outcount * threads * 2); + outputs.resize(outputs.size() + outcount * threads); unopt.resize(unopt.size() + unopt_out * threads); opt.resize(opt.size() + opt_out * threads); obj.resize(obj.size() + obj_out * threads); @@ -1007,7 +1011,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o if (threads == 1) { start = jl_hrtime(); std::stringstream stream; - add_output_impl(M, TM, outputs.data() + outputs.size() - outcount * 2, name, + add_output_impl(M, TM, outputs.data() + outputs.size() - outcount, names, unopt_out ? 
unopt.data() + unopt.size() - 1 : nullptr, opt_out ? opt.data() + opt.size() - 1 : nullptr, obj_out ? obj.data() + obj.size() - 1 : nullptr, @@ -1034,7 +1038,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o end = jl_hrtime(); dbgs() << "Time to serialize module: " << (end - start) / 1e9 << "s\n"; - auto outstart = outputs.data() + outputs.size() - outcount * threads * 2; + auto outstart = outputs.data() + outputs.size() - outcount * threads; auto unoptstart = unopt_out ? unopt.data() + unopt.size() - threads : nullptr; auto optstart = opt_out ? opt.data() + opt.size() - threads : nullptr; auto objstart = obj_out ? obj.data() + obj.size() - threads : nullptr; @@ -1073,7 +1077,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o stderrs[i] << "Declaration deletion time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; start = jl_hrtime(); - add_output_impl(*M, TM, outstart + i * outcount * 2, name, + add_output_impl(*M, TM, outstart + i * outcount, names, unoptstart ? unoptstart + i : nullptr, optstart ? optstart + i : nullptr, objstart ? objstart + i : nullptr, @@ -1295,14 +1299,21 @@ void jl_dump_native_impl(void *native_code, start = jl_hrtime(); - auto compile = [&](Module &M, StringRef name, unsigned threads) { add_output( - M, *SourceTM, outputs, name, + auto compile = [&](Module &M, ArrayRef<StringRef> names, unsigned threads) { add_output( + M, *SourceTM, outputs, names, unopt_bc_Archive, bc_Archive, obj_Archive, asm_Archive, !!unopt_bc_fname, !!bc_fname, !!obj_fname, !!asm_fname, threads ); }; + + std::array<StringRef, 4> text_names = { + "text_unopt.bc", + "text_opt.bc", + "text.o", + "text.s" + }; - compile(*dataM, "text", threads); + compile(*dataM, text_names, threads); end = jl_hrtime(); @@ -1399,7 +1410,14 @@ void jl_dump_native_impl(void *native_code, ios_write(s, (const char *)data.data(), data.size()); } } - compile(*sysimageM, "data", 1); + + std::array<StringRef, 4> data_names = { + "data_unopt.bc", + "data_opt.bc", + "data.o", + "data.s" + }; + compile(*sysimageM, data_names, 1); end = jl_hrtime(); From f9da0e261abf505af4a5841d7114efd7f499c755 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Thu, 26 Jan 2023 16:21:58 -0500 Subject: [PATCH 15/34] Fix whitespace --- src/aotcompile.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 7eeaeb94cf2da..28b13445c8e2a 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -1305,7 +1305,7 @@ void jl_dump_native_impl(void *native_code, !!unopt_bc_fname, !!bc_fname, !!obj_fname, !!asm_fname, threads ); }; - + std::array<StringRef, 4> text_names = { "text_unopt.bc", "text_opt.bc", From 9a72be669b5cac14cd9a2228563e12494d6d7717 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Thu, 26 Jan 2023 23:33:41 -0500 Subject: [PATCH 16/34] Avoid unused function warning --- src/aotcompile.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 28b13445c8e2a..6d0509ac05bbc 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -610,7 +610,8 @@ static size_t getFunctionWeight(const Function &F) return weight; } -//Inline to fool gcc into not complaining about unused function when asserts are disabled +#ifndef NDEBUG + static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M) { StringMap<uint32_t> GVNames; bool bad = false; 
@@ -644,6 +645,8 @@ static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partiti return !bad; } +#endif + // Chop a module up as equally as possible into threads partitions static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) { //Start by stripping fvars and gvars, which helpfully removes their uses as well From 1f07ea51faecfdcad80aeafa99e93bb65249f369 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Fri, 27 Jan 2023 02:48:12 -0500 Subject: [PATCH 17/34] Check relocations for generic target as well --- src/llvm-multiversioning.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index 527c17e826ce9..cd90699e05aad 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -152,7 +152,6 @@ static void annotate_module_clones(Module &M) { auto specs = jl_get_llvm_clone_targets(); std::vector<APInt> clones(orig_funcs.size(), APInt(specs.size(), 0)); BitVector subtarget_cloned(orig_funcs.size()); - bool check_relocs = false; std::vector<unsigned> func_infos(orig_funcs.size()); for (unsigned i = 0; i < orig_funcs.size(); i++) { @@ -163,7 +162,6 @@ static void annotate_module_clones(Module &M) { for (unsigned j = 0; j < orig_funcs.size(); j++) { clones[j].setBit(i); } - check_relocs = true; } else { unsigned flag = specs[i].flags & clone_mask; std::set<Function*> sets[2]; @@ -217,7 +215,11 @@ static void annotate_module_clones(Module &M) { } } } - if (check_relocs) { + // if there's only one target, we won't need any relocation slots + // but even if there is one clone_all and one non-clone_all, we still need + // to check for relocation slots because we must fixup instruction uses to + // point at the right function. 
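+    // Concretely (an illustrative sketch of the existing machinery, not new behavior
+    // in this patch): a subtarget-cloned function `f` that is referenced from an
+    // instruction gets a pointer-sized global from prepare_slots, roughly
+    //     @f.reloc_slot = global ptr null   ; patched at image load time
+    // and the use is rewritten to go through a load of that slot, so whichever clone
+    // matches the running CPU is the one that actually gets called.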
+ if (specs.size() > 1) { for (unsigned i = 0; i < orig_funcs.size(); i++) { auto &F = *orig_funcs[i]; if (subtarget_cloned[i] && !ConstantUses<Instruction>(orig_funcs[i], M).done()) { From 83b196758b9fabbe64b36750f56d50e083f87020 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Fri, 27 Jan 2023 03:27:27 -0500 Subject: [PATCH 18/34] Debug macos linker --- src/aotcompile.cpp | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 6d0509ac05bbc..f3c45241c4a0a 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -610,11 +610,11 @@ static size_t getFunctionWeight(const Function &F) return weight; } -#ifndef NDEBUG static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M) { - StringMap<uint32_t> GVNames; bool bad = false; +#ifdef JL_DEBUG_BUILD + StringMap<uint32_t> GVNames; for (uint32_t i = 0; i < partitions.size(); i++) { for (auto &name : partitions[i].globals) { if (GVNames.count(name.getKey())) { @@ -642,11 +642,10 @@ static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partiti } } } +#endif return !bad; } -#endif - // Chop a module up as equally as possible into threads partitions static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) { //Start by stripping fvars and gvars, which helpfully removes their uses as well @@ -772,7 +771,9 @@ static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) { } } - assert(verify_partitioning(partitions, M) && "Partitioning failed to partition globals correctly"); + bool verified = verify_partitioning(partitions, M); + assert(verified && "Partitioning failed to partition globals correctly"); + (void) verified; return partitions; } @@ -1135,10 +1136,14 @@ unsigned compute_image_thread_count(Module &M) { // crude estimate, available / (weight * fudge factor) = max threads size_t fudge = 10; unsigned max_threads = std::max(available / (weight * fudge), (size_t)1); - if (max_threads < threads) { - dbgs() << "Memory limiting threads to " << max_threads << "\n"; - threads = max_threads; - } + dbgs() << "Available memory: " << available << " bytes\n"; + dbgs() << "Max threads: " << max_threads << "\n"; + dbgs() << "Temporarily disabling memory limiting threads\n"; + //TODO reenable + // if (max_threads < threads) { + // dbgs() << "Memory limiting threads to " << max_threads << "\n"; + // threads = max_threads; + // } max_threads = globals / 100; if (max_threads < threads) { @@ -1420,7 +1425,11 @@ void jl_dump_native_impl(void *native_code, "data.o", "data.s" }; + dbgs() << "Dumping sysimage data module\n"; + dbgs() << *sysimageM << "\n"; compile(*sysimageM, data_names, 1); + dbgs() << "Post-optimization sysimageM\n"; + dbgs() << *sysimageM << "\n"; end = jl_hrtime(); From c98ff304ab697f0eb492a150833c12baa499f29e Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Fri, 27 Jan 2023 17:41:11 -0500 Subject: [PATCH 19/34] Respect JULIA_CPU_THREADS --- src/aotcompile.cpp | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index f3c45241c4a0a..88c54d228c307 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -1153,14 +1153,33 @@ unsigned compute_image_thread_count(Module &M) { // environment variable override const char *env_threads = getenv("JULIA_IMAGE_THREADS"); + bool env_threads_set = false; if 
(env_threads) { char *endptr; unsigned long requested = strtoul(env_threads, &endptr, 10); if (*endptr || !requested) { jl_safe_printf("WARNING: invalid value '%s' for JULIA_IMAGE_THREADS\n", env_threads); } else { - dbgs() << "Overriding threads to " << requested << "\n"; + dbgs() << "Overriding threads to " << requested << " due to JULIA_IMAGE_THREADS\n"; threads = requested; + env_threads_set = true; + } + } + + // more defaults + if (!env_threads_set && threads > 1) { + if (jl_options.nthreads && jl_options.nthreads < threads) { + dbgs() << "Overriding threads to " << jl_options.nthreads << " due to -t option\n"; + threads = jl_options.nthreads; + } else if (auto fallbackenv = getenv(NUM_THREADS_NAME)) { + char *endptr; + unsigned long requested = strtoul(fallbackenv, &endptr, 10); + if (*endptr || !requested) { + jl_safe_printf("WARNING: invalid value '%s' for %s\m", fallbackenv, NUM_THREADS_NAME); + } else if (requested < threads) { + dbgs() << "Overriding threads to " << requested << " due to " << NUM_THREADS_NAME << "\n"; + threads = requested; + } } } @@ -1426,10 +1445,15 @@ void jl_dump_native_impl(void *native_code, "data.s" }; dbgs() << "Dumping sysimage data module\n"; + for (auto &F : *sysimageM) { + dbgs() << F << "\n"; + } dbgs() << *sysimageM << "\n"; compile(*sysimageM, data_names, 1); dbgs() << "Post-optimization sysimageM\n"; - dbgs() << *sysimageM << "\n"; + for (auto &F : *sysimageM) { + dbgs() << F << "\n"; + } end = jl_hrtime(); From 8cf48f2369a4197c46858e0bc6c166ad69d3e8d4 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Fri, 27 Jan 2023 18:23:03 -0500 Subject: [PATCH 20/34] Don't inject CRT aliases on macos --- src/aotcompile.cpp | 54 +++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 88c54d228c307..c40868af11c58 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -1168,16 +1168,18 @@ unsigned compute_image_thread_count(Module &M) { // more defaults if (!env_threads_set && threads > 1) { - if (jl_options.nthreads && jl_options.nthreads < threads) { - dbgs() << "Overriding threads to " << jl_options.nthreads << " due to -t option\n"; - threads = jl_options.nthreads; - } else if (auto fallbackenv = getenv(NUM_THREADS_NAME)) { + if (jl_options.nthreads) { + if (static_cast<unsigned>(jl_options.nthreads) < threads) { + dbgs() << "Overriding threads to " << jl_options.nthreads << " due to -t option\n"; + threads = jl_options.nthreads; + } + } else if (auto fallbackenv = getenv("JULIA_CPU_THREADS")) { char *endptr; unsigned long requested = strtoul(fallbackenv, &endptr, 10); if (*endptr || !requested) { - jl_safe_printf("WARNING: invalid value '%s' for %s\m", fallbackenv, NUM_THREADS_NAME); + jl_safe_printf("WARNING: invalid value '%s' for JULIA_CPU_THREADS\n", fallbackenv); } else if (requested < threads) { - dbgs() << "Overriding threads to " << requested << " due to " << NUM_THREADS_NAME << "\n"; + dbgs() << "Overriding threads to " << requested << " due to JULIA_CPU_THREADS\n"; threads = requested; } } @@ -1355,20 +1357,23 @@ void jl_dump_native_impl(void *native_code, sysimageM->setStackProtectorGuard(dataM->getStackProtectorGuard()); sysimageM->setOverrideStackAlignment(dataM->getOverrideStackAlignment()); #endif - // We would like to emit an alias or an weakref alias to redirect these symbols - // but LLVM doesn't let us emit a GlobalAlias to a declaration... 
- // So for now we inject a definition of these functions that calls our runtime - // functions. We do so after optimization to avoid cloning these functions. - injectCRTAlias(*sysimageM, "__gnu_h2f_ieee", "julia__gnu_h2f_ieee", - FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false)); - injectCRTAlias(*sysimageM, "__extendhfsf2", "julia__gnu_h2f_ieee", - FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false)); - injectCRTAlias(*sysimageM, "__gnu_f2h_ieee", "julia__gnu_f2h_ieee", - FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false)); - injectCRTAlias(*sysimageM, "__truncsfhf2", "julia__gnu_f2h_ieee", - FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false)); - injectCRTAlias(*sysimageM, "__truncdfhf2", "julia__truncdfhf2", - FunctionType::get(Type::getHalfTy(Context), { Type::getDoubleTy(Context) }, false)); + + if (!TheTriple.isOSDarwin()) { + // We would like to emit an alias or an weakref alias to redirect these symbols + // but LLVM doesn't let us emit a GlobalAlias to a declaration... + // So for now we inject a definition of these functions that calls our runtime + // functions. We do so after optimization to avoid cloning these functions. + injectCRTAlias(*sysimageM, "__gnu_h2f_ieee", "julia__gnu_h2f_ieee", + FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false)); + injectCRTAlias(*sysimageM, "__extendhfsf2", "julia__gnu_h2f_ieee", + FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false)); + injectCRTAlias(*sysimageM, "__gnu_f2h_ieee", "julia__gnu_f2h_ieee", + FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false)); + injectCRTAlias(*sysimageM, "__truncsfhf2", "julia__gnu_f2h_ieee", + FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false)); + injectCRTAlias(*sysimageM, "__truncdfhf2", "julia__truncdfhf2", + FunctionType::get(Type::getHalfTy(Context), { Type::getDoubleTy(Context) }, false)); + } if (TheTriple.isOSWindows()) { // Windows expect that the function `_DllMainStartup` is present in an dll. 
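The `_DllMainStartup` comment above refers to a trivial entry-point stub; a sketch of emitting such a stub with the IRBuilder (the symbol name and signature here are assumptions for illustration, not code taken from this patch):

    // Hypothetical DLL entry stub: report successful load and do nothing else.
    auto T_entry_ret = Type::getInt32Ty(Context);
    auto entryFT = FunctionType::get(T_entry_ret, false);
    auto entry = Function::Create(entryFT, Function::ExternalLinkage,
                                  "_DllMainCRTStartup", *sysimageM);
    IRBuilder<> entry_builder(BasicBlock::Create(Context, "top", entry));
    entry_builder.CreateRet(ConstantInt::get(T_entry_ret, 1));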
@@ -1444,16 +1449,7 @@ void jl_dump_native_impl(void *native_code, "data.o", "data.s" }; - dbgs() << "Dumping sysimage data module\n"; - for (auto &F : *sysimageM) { - dbgs() << F << "\n"; - } - dbgs() << *sysimageM << "\n"; compile(*sysimageM, data_names, 1); - dbgs() << "Post-optimization sysimageM\n"; - for (auto &F : *sysimageM) { - dbgs() << F << "\n"; - } end = jl_hrtime(); From 4e35f416bb71f0a33ad3605742bfa8f6bc82a85b Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Wed, 1 Feb 2023 03:02:47 -0500 Subject: [PATCH 21/34] Clean up timers and prints, link to JULIA_IMAGE_TIMINGS --- src/aotcompile.cpp | 272 +++++++++++++++++++++-------------- src/llvm-multiversioning.cpp | 4 +- 2 files changed, 168 insertions(+), 108 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index c40868af11c58..79e9ea07eb592 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -57,6 +57,7 @@ #include <llvm/IR/LegacyPassManagers.h> #include <llvm/Transforms/Utils/Cloning.h> +#include <llvm/Support/FormatAdapters.h> #include <llvm/Linker/Linker.h> @@ -269,8 +270,6 @@ void replaceUsesWithLoad(Function &F, function_ref<GlobalVariable *(Instruction extern "C" JL_DLLEXPORT void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvmmod, const jl_cgparams_t *cgparams, int _policy, int _imaging_mode, int _external_linkage, size_t _world) { - uint64_t start = jl_hrtime(); - uint64_t end = 0; ++CreateNativeCalls; CreateNativeMax.updateMax(jl_array_len(methods)); if (cgparams == NULL) @@ -463,8 +462,6 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm if (ctx.getContext()) { jl_ExecutionEngine->releaseContext(std::move(ctx)); } - end = jl_hrtime(); - dbgs() << "jl_create_native: " << (end - start) / 1e9 << "s\n"; return (void*)data; } @@ -589,7 +586,6 @@ static void get_fvars_gvars(Module &M, DenseMap<GlobalValue *, unsigned> &fvars, gvars_gv->eraseFromParent(); fvars_idxs->eraseFromParent(); gvars_idxs->eraseFromParent(); - dbgs() << "Finished getting fvars/gvars\n"; } static size_t getFunctionWeight(const Function &F) @@ -778,9 +774,74 @@ static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) { return partitions; } +struct ImageTimer { + uint64_t elapsed = 0; + std::string name; + std::string desc; + + void startTimer() { + elapsed = jl_hrtime(); + } + + void stopTimer() { + elapsed = jl_hrtime() - elapsed; + } + + void init(const Twine &name, const Twine &desc) { + this->name = name.str(); + this->desc = desc.str(); + } + + operator bool() const { + return elapsed != 0; + } + + void print(raw_ostream &out, bool clear=false) { + if (!*this) + return; + out << llvm::formatv("{0:F3} ", elapsed / 1e9) << name << " " << desc << "\n"; + if (clear) + elapsed = 0; + } +}; + +struct ShardTimers { + ImageTimer deserialize; + ImageTimer materialize; + ImageTimer construct; + ImageTimer deletion; + // impl timers + ImageTimer unopt; + ImageTimer optimize; + ImageTimer opt; + ImageTimer obj; + ImageTimer asm_; + + std::string name; + std::string desc; + + void print(raw_ostream &out, bool clear=false) { + StringRef sep = "===-------------------------------------------------------------------------==="; + out << formatv("{0}\n{1}\n{0}\n", sep, fmt_align(name + " : " + desc, AlignStyle::Center, sep.size())); + auto total = deserialize.elapsed + materialize.elapsed + construct.elapsed + deletion.elapsed + + unopt.elapsed + optimize.elapsed + opt.elapsed + obj.elapsed + asm_.elapsed; + out << "Time (s) 
Name Description\n"; + deserialize.print(out, clear); + materialize.print(out, clear); + construct.print(out, clear); + deletion.print(out, clear); + unopt.print(out, clear); + optimize.print(out, clear); + opt.print(out, clear); + obj.print(out, clear); + asm_.print(out, clear); + out << llvm::formatv("{0:F3} total Total time taken\n", total / 1e9); + } +}; + static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *outputs, ArrayRef<StringRef> names, NewArchiveMember *unopt, NewArchiveMember *opt, NewArchiveMember *obj, NewArchiveMember *asm_, - std::stringstream &stream, unsigned i) { + ShardTimers &timers, unsigned shardidx) { assert(names.size() == 4); auto TM = std::unique_ptr<TargetMachine>( SourceTM.getTarget().createTargetMachine( @@ -793,6 +854,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out SourceTM.getOptLevel())); if (unopt) { + timers.unopt.startTimer(); raw_string_ostream OS(*outputs); PassBuilder PB; AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; @@ -800,14 +862,14 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out MPM.addPass(BitcodeWriterPass(OS)); *unopt = NewArchiveMember(MemoryBufferRef(*outputs, names[0])); outputs++; + timers.unopt.stopTimer(); } if (!opt && !obj && !asm_) { return; } assert(!verifyModule(M, &errs())); - uint64_t start = jl_hrtime(); - uint64_t end = 0; + timers.optimize.startTimer(); #ifndef JL_USE_NEW_PM legacy::PassManager optimizer; @@ -829,12 +891,11 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out #endif optimizer.run(M); assert(!verifyModule(M, &errs())); - - end = jl_hrtime(); - - stream << "optimize time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; + + timers.optimize.stopTimer(); if (opt) { + timers.opt.startTimer(); raw_string_ostream OS(*outputs); PassBuilder PB; AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; @@ -842,11 +903,11 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out MPM.addPass(BitcodeWriterPass(OS)); *opt = NewArchiveMember(MemoryBufferRef(*outputs, names[1])); outputs++; + timers.opt.stopTimer(); } - start = jl_hrtime(); - if (obj) { + timers.obj.startTimer(); SmallVector<char, 0> Buffer; raw_svector_ostream OS(Buffer); legacy::PassManager emitter; @@ -857,13 +918,11 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out *outputs = { Buffer.data(), Buffer.size() }; *obj = NewArchiveMember(MemoryBufferRef(*outputs, names[2])); outputs++; + timers.obj.stopTimer(); } - end = jl_hrtime(); - - stream << "codegen time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; - if (asm_) { + timers.asm_.startTimer(); SmallVector<char, 0> Buffer; raw_svector_ostream OS(Buffer); legacy::PassManager emitter; @@ -874,6 +933,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out *outputs = { Buffer.data(), Buffer.size() }; *asm_ = NewArchiveMember(MemoryBufferRef(*outputs, names[3])); outputs++; + timers.asm_.stopTimer(); } } @@ -1004,7 +1064,6 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o std::vector<NewArchiveMember> &obj, std::vector<NewArchiveMember> &asm_, bool unopt_out, bool opt_out, bool obj_out, bool asm_out, unsigned threads) { - uint64_t start = 0, end = 0; unsigned outcount = unopt_out + opt_out + obj_out + asm_out; assert(outcount); outputs.resize(outputs.size() + outcount * threads); @@ -1012,22 +1071,64 @@ static void add_output(Module &M, 
TargetMachine &TM, std::vector<std::string> &o opt.resize(opt.size() + opt_out * threads); obj.resize(obj.size() + obj_out * threads); asm_.resize(asm_.size() + asm_out * threads); + auto name = names[2]; + name.consume_back(".o"); + TimerGroup timer_group("add_output", ("Time to optimize and emit LLVM module " + name).str()); + SmallVector<ShardTimers, 1> timers(threads); + for (unsigned i = 0; i < threads; ++i) { + auto idx = std::to_string(i); + timers[i].name = "shard_" + idx; + timers[i].desc = ("Timings for " + name + " module shard " + idx).str(); + timers[i].deserialize.init("deserialize_" + idx, "Deserialize module"); + timers[i].materialize.init("materialize_" + idx, "Materialize declarations"); + timers[i].construct.init("construct_" + idx, "Construct partitioned definitions"); + timers[i].deletion.init("deletion_" + idx, "Delete dead declarations"); + timers[i].unopt.init("unopt_" + idx, "Emit unoptimized bitcode"); + timers[i].optimize.init("optimize_" + idx, "Optimize shard"); + timers[i].opt.init("opt_" + idx, "Emit optimized bitcode"); + timers[i].obj.init("obj_" + idx, "Emit object file"); + timers[i].asm_.init("asm_" + idx, "Emit assembly file"); + } + Timer partition_timer("partition", "Partition module", timer_group); + Timer serialize_timer("serialize", "Serialize module", timer_group); + Timer output_timer("output", "Add outputs", timer_group); + bool report_timings = false; + if (auto env = getenv("JULIA_IMAGE_TIMINGS")) { + char *endptr; + unsigned long val = strtoul(env, &endptr, 10); + if (endptr != env && !*endptr && val <= 1) { + report_timings = val; + } else { + if (StringRef("true").compare_insensitive(env) == 0) + report_timings = true; + else if (StringRef("false").compare_insensitive(env) == 0) + report_timings = false; + else + errs() << "WARNING: Invalid value for JULIA_IMAGE_TIMINGS: " << env << "\n"; + } + } if (threads == 1) { - start = jl_hrtime(); - std::stringstream stream; + output_timer.startTimer(); add_output_impl(M, TM, outputs.data() + outputs.size() - outcount, names, unopt_out ? unopt.data() + unopt.size() - 1 : nullptr, opt_out ? opt.data() + opt.size() - 1 : nullptr, obj_out ? obj.data() + obj.size() - 1 : nullptr, asm_out ? asm_.data() + asm_.size() - 1 : nullptr, - stream, 0); - dbgs() << stream.str(); - end = jl_hrtime(); - dbgs() << "Time to add output: " << (end - start) / 1e9 << "s\n"; + timers[0], 0); + output_timer.stopTimer(); + + if (!report_timings) { + timer_group.clear(); + } else { + timer_group.print(dbgs(), true); + for (auto &t : timers) { + t.print(dbgs(), true); + } + } return; } - start = jl_hrtime(); + partition_timer.startTimer(); uint64_t counter = 0; for (auto &G : M.global_values()) { if (!G.isDeclaration() && !G.hasName()) { @@ -1035,12 +1136,12 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o } } auto partitions = partitionModule(M, threads); - end = jl_hrtime(); - dbgs() << "Time to partition module: " << (end - start) / 1e9 << "s\n"; - start = jl_hrtime(); + partition_timer.stopTimer(); + serialize_timer.startTimer(); auto serialized = serializeModule(M); - end = jl_hrtime(); - dbgs() << "Time to serialize module: " << (end - start) / 1e9 << "s\n"; + serialize_timer.stopTimer(); + + output_timer.startTimer(); auto outstart = outputs.data() + outputs.size() - outcount * threads; auto unoptstart = unopt_out ? 
unopt.data() + unopt.size() - threads : nullptr; @@ -1049,64 +1150,56 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o auto asmstart = asm_out ? asm_.data() + asm_.size() - threads : nullptr; std::vector<std::thread> workers(threads); - std::vector<std::stringstream> stderrs(threads); for (unsigned i = 0; i < threads; i++) { workers[i] = std::thread([&, i](){ LLVMContext ctx; - uint64_t start = 0; - uint64_t end = 0; - start = jl_hrtime(); + timers[i].deserialize.startTimer(); auto M = cantFail(getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx), "Error loading module"); - end = jl_hrtime(); - stderrs[i] << "Deserialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; + timers[i].deserialize.stopTimer(); - stderrs[i] << "Starting shard " << i << " with weight=" << partitions[i].weight << "\n"; + // dbgs() << "Starting shard " << i << " with weight=" << partitions[i].weight << "\n"; - start = jl_hrtime(); + timers[i].materialize.startTimer(); materializePreserved(*M, partitions[i]); - end = jl_hrtime(); - stderrs[i] << "Materialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; + timers[i].materialize.stopTimer(); - start = jl_hrtime(); + timers[i].construct.startTimer(); construct_vars(*M, partitions[i]); M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), "_" + std::to_string(i))); - end = jl_hrtime(); - - stderrs[i] << "Construction time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; + timers[i].construct.stopTimer(); - start = jl_hrtime(); + timers[i].deletion.startTimer(); dropUnusedDeclarations(*M); - end = jl_hrtime(); - - stderrs[i] << "Declaration deletion time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; + timers[i].deletion.stopTimer(); - start = jl_hrtime(); add_output_impl(*M, TM, outstart + i * outcount, names, unoptstart ? unoptstart + i : nullptr, optstart ? optstart + i : nullptr, objstart ? objstart + i : nullptr, asmstart ? 
asmstart + i : nullptr, - stderrs[i], i); - end = jl_hrtime(); - - stderrs[i] << "Output time for shard " << i << ": " << (end - start) / 1e9 << "s\n"; + timers[i], i); }); } - start = jl_hrtime(); for (auto &w : workers) w.join(); - for (auto &str : stderrs) - dbgs() << str.str(); - end = jl_hrtime(); - dbgs() << "Total time for parallel output: " << (end - start) / 1e9 << "s\n"; + output_timer.stopTimer(); + + if (!report_timings) { + timer_group.clear(); + } else { + timer_group.print(dbgs(), true); + for (auto &t : timers) { + t.print(dbgs(), true); + } + } } unsigned compute_image_thread_count(Module &M) { // 32-bit systems are very memory-constrained #ifdef _P32 - dbgs() << "Threads: 1\n"; + // dbgs() << "Threads: 1\n"; return 1; #endif size_t weight = 0; @@ -1121,10 +1214,10 @@ unsigned compute_image_thread_count(Module &M) { weight += 1; } } - dbgs() << "Module weight: " << weight << "\n"; + // dbgs() << "Module weight: " << weight << "\n"; if (weight < 1000) { - dbgs() << "Low module complexity bailout\n"; - dbgs() << "Threads: 1\n"; + // dbgs() << "Low module complexity bailout\n"; + // dbgs() << "Threads: 1\n"; return 1; } @@ -1136,9 +1229,9 @@ unsigned compute_image_thread_count(Module &M) { // crude estimate, available / (weight * fudge factor) = max threads size_t fudge = 10; unsigned max_threads = std::max(available / (weight * fudge), (size_t)1); - dbgs() << "Available memory: " << available << " bytes\n"; - dbgs() << "Max threads: " << max_threads << "\n"; - dbgs() << "Temporarily disabling memory limiting threads\n"; + // dbgs() << "Available memory: " << available << " bytes\n"; + // dbgs() << "Max threads: " << max_threads << "\n"; + // dbgs() << "Temporarily disabling memory limiting threads\n"; //TODO reenable // if (max_threads < threads) { // dbgs() << "Memory limiting threads to " << max_threads << "\n"; @@ -1147,7 +1240,7 @@ unsigned compute_image_thread_count(Module &M) { max_threads = globals / 100; if (max_threads < threads) { - dbgs() << "Low global count limiting threads to " << max_threads << " (" << globals << "globals)\n"; + // dbgs() << "Low global count limiting threads to " << max_threads << " (" << globals << "globals)\n"; threads = max_threads; } @@ -1160,7 +1253,7 @@ unsigned compute_image_thread_count(Module &M) { if (*endptr || !requested) { jl_safe_printf("WARNING: invalid value '%s' for JULIA_IMAGE_THREADS\n", env_threads); } else { - dbgs() << "Overriding threads to " << requested << " due to JULIA_IMAGE_THREADS\n"; + // dbgs() << "Overriding threads to " << requested << " due to JULIA_IMAGE_THREADS\n"; threads = requested; env_threads_set = true; } @@ -1168,18 +1261,13 @@ unsigned compute_image_thread_count(Module &M) { // more defaults if (!env_threads_set && threads > 1) { - if (jl_options.nthreads) { - if (static_cast<unsigned>(jl_options.nthreads) < threads) { - dbgs() << "Overriding threads to " << jl_options.nthreads << " due to -t option\n"; - threads = jl_options.nthreads; - } - } else if (auto fallbackenv = getenv("JULIA_CPU_THREADS")) { + if (auto fallbackenv = getenv("JULIA_CPU_THREADS")) { char *endptr; unsigned long requested = strtoul(fallbackenv, &endptr, 10); if (*endptr || !requested) { jl_safe_printf("WARNING: invalid value '%s' for JULIA_CPU_THREADS\n", fallbackenv); } else if (requested < threads) { - dbgs() << "Overriding threads to " << requested << " due to JULIA_CPU_THREADS\n"; + // dbgs() << "Overriding threads to " << requested << " due to JULIA_CPU_THREADS\n"; threads = requested; } } @@ -1187,7 +1275,7 @@ unsigned 
compute_image_thread_count(Module &M) { threads = std::max(threads, 1u); - dbgs() << "Threads: " << threads << "\n"; + // dbgs() << "Threads: " << threads << "\n"; return threads; } @@ -1200,12 +1288,10 @@ void jl_dump_native_impl(void *native_code, const char *asm_fname, const char *sysimg_data, size_t sysimg_len, ios_t *s) { - uint64_t start = jl_hrtime(); - uint64_t end = 0; JL_TIMING(NATIVE_DUMP); jl_native_code_desc_t *data = (jl_native_code_desc_t*)native_code; if (!bc_fname && !unopt_bc_fname && !obj_fname && !asm_fname) { - dbgs() << "No output requested, skipping native code dump?\n"; + // dbgs() << "No output requested, skipping native code dump?\n"; delete data; return; } @@ -1265,12 +1351,6 @@ void jl_dump_native_impl(void *native_code, bool imaging_mode = imaging_default() || jl_options.outputo; - end = jl_hrtime(); - - dbgs() << "setup time: " << (end - start) / 1e9 << "s\n"; - - start = jl_hrtime(); - unsigned threads = 1; unsigned nfvars = 0; unsigned ngvars = 0; @@ -1322,12 +1402,6 @@ void jl_dump_native_impl(void *native_code, "jl_RTLD_DEFAULT_handle_pointer"), TheTriple); } - end = jl_hrtime(); - - dbgs() << "metadata time: " << (end - start) / 1e9 << "s\n"; - - start = jl_hrtime(); - auto compile = [&](Module &M, ArrayRef<StringRef> names, unsigned threads) { add_output( M, *SourceTM, outputs, names, unopt_bc_Archive, bc_Archive, obj_Archive, asm_Archive, @@ -1344,12 +1418,6 @@ void jl_dump_native_impl(void *native_code, compile(*dataM, text_names, threads); - end = jl_hrtime(); - - dbgs() << "text output time: " << (end - start) / 1e9 << "s\n"; - - start = jl_hrtime(); - auto sysimageM = std::make_unique<Module>("sysimage", Context); sysimageM->setTargetTriple(dataM->getTargetTriple()); sysimageM->setDataLayout(dataM->getDataLayout()); @@ -1451,12 +1519,6 @@ void jl_dump_native_impl(void *native_code, }; compile(*sysimageM, data_names, 1); - end = jl_hrtime(); - - dbgs() << "data module time: " << (end - start) / 1e9 << "s\n"; - - start = jl_hrtime(); - object::Archive::Kind Kind = getDefaultForHost(TheTriple); if (unopt_bc_fname) handleAllErrors(writeArchive(unopt_bc_fname, unopt_bc_Archive, true, @@ -1471,10 +1533,6 @@ void jl_dump_native_impl(void *native_code, handleAllErrors(writeArchive(asm_fname, asm_Archive, true, Kind, true, false), reportWriterError); - end = jl_hrtime(); - - dbgs() << "archive time: " << (end - start) / 1e9 << "s\n"; - delete data; } diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index cd90699e05aad..42aa34d3bdb4f 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -915,8 +915,9 @@ static bool runMultiVersioning(Module &M, bool allow_bad_fvars) // * Cloned function -> Original function (add as we clone functions) // * Original function -> Base function (target specific and updated by LLVM) // * ID -> relocation slots (const). 
- if (M.getName() == "sysimage") + if (!M.getModuleFlag("julia.mv.enable")) { return false; + } GlobalVariable *fvars = M.getGlobalVariable("jl_fvars"); GlobalVariable *gvars = M.getGlobalVariable("jl_gvars"); @@ -986,6 +987,7 @@ static RegisterPass<MultiVersioningLegacy> X("JuliaMultiVersioning", "JuliaMulti void multiversioning_preannotate(Module &M) { annotate_module_clones(M); + M.addModuleFlag(Module::ModFlagBehavior::Error, "julia.mv.enable", 1); } void replaceUsesWithLoad(Function &F, function_ref<GlobalVariable *(Instruction &I)> should_replace, MDNode *tbaa_const) { From a723211c3106d6eebfbbbb680615269995cab0ec Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Wed, 1 Feb 2023 03:16:18 -0500 Subject: [PATCH 22/34] Fix whitespace --- src/aotcompile.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 79e9ea07eb592..428f397c35aed 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -891,7 +891,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out #endif optimizer.run(M); assert(!verifyModule(M, &errs())); - + timers.optimize.stopTimer(); if (opt) { @@ -1185,7 +1185,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o w.join(); output_timer.stopTimer(); - + if (!report_timings) { timer_group.clear(); } else { From 7cf839aaeb636e27b75c9857fe52913477ac1734 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Wed, 1 Feb 2023 06:30:22 -0500 Subject: [PATCH 23/34] Don't leave aliases to extern global objects --- src/aotcompile.cpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 428f397c35aed..fffc7839d74c9 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -938,6 +938,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out } static auto serializeModule(const Module &M) { + assert(!verifyModule(M, &errs()) && "Serializing invalid module!"); SmallVector<char, 0> ClonedModuleBuffer; BitcodeWriter BCWriter(ClonedModuleBuffer); BCWriter.writeModule(M); @@ -976,9 +977,16 @@ static void materializePreserved(Module &M, Partition &partition) { if (!GA.isDeclaration()) { if (!Preserve.contains(&GA)) { if (GA.getValueType()->isFunctionTy()) { - DeletedAliases.push_back({ &GA, Function::Create(cast<FunctionType>(GA.getValueType()), GlobalValue::ExternalLinkage, "", &M) }); + auto F = Function::Create(cast<FunctionType>(GA.getValueType()), GlobalValue::ExternalLinkage, "", &M); + // This is an extremely sad hack to make sure the global alias never points to an extern function + auto BB = BasicBlock::Create(M.getContext(), "", F); + new UnreachableInst(M.getContext(), BB); + GA.setAliasee(F); + + DeletedAliases.push_back({ &GA, F }); } else { - DeletedAliases.push_back({ &GA, new GlobalVariable(M, GA.getValueType(), false, GlobalValue::ExternalLinkage, nullptr) }); + auto GV = new GlobalVariable(M, GA.getValueType(), false, GlobalValue::ExternalLinkage, Constant::getNullValue(GA.getValueType())); + DeletedAliases.push_back({ &GA, GV }); } } } @@ -988,6 +996,12 @@ static void materializePreserved(Module &M, Partition &partition) { Deleted.second->takeName(Deleted.first); Deleted.first->replaceAllUsesWith(Deleted.second); Deleted.first->eraseFromParent(); + // undo our previous sad hack + if (auto F = dyn_cast<Function>(Deleted.second)) { + F->deleteBody(); + } else { + 
cast<GlobalVariable>(Deleted.second)->setInitializer(nullptr); + } } } From fa208d43a95e1336c6e793795aad8134cb72883b Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Wed, 1 Feb 2023 10:41:27 -0500 Subject: [PATCH 24/34] Break multiversioning's dependency on jl_get_llvm_clone_targets --- src/llvm-multiversioning.cpp | 110 ++++++++++++++++++++++++++++++----- 1 file changed, 96 insertions(+), 14 deletions(-) diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index 42aa34d3bdb4f..b4f67ebe22c7d 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -140,6 +140,64 @@ static uint32_t collect_func_info(Function &F, bool &has_veccall) return flag; } +struct TargetSpec { + std::string cpu_name; + std::string cpu_features; + uint32_t base; + uint32_t flags; + + TargetSpec() = default; + + static TargetSpec fromSpec(jl_target_spec_t &spec) { + TargetSpec out; + out.cpu_name = spec.cpu_name; + out.cpu_features = spec.cpu_features; + out.base = spec.base; + out.flags = spec.flags; + return out; + } + + static TargetSpec fromMD(MDTuple *tup) { + TargetSpec out; + assert(tup->getNumOperands() == 4); + out.cpu_name = cast<MDString>(tup->getOperand(0))->getString().str(); + out.cpu_features = cast<MDString>(tup->getOperand(1))->getString().str(); + out.base = cast<ConstantInt>(cast<ConstantAsMetadata>(tup->getOperand(2))->getValue())->getZExtValue(); + out.flags = cast<ConstantInt>(cast<ConstantAsMetadata>(tup->getOperand(3))->getValue())->getZExtValue(); + return out; + } + + MDNode *toMD(LLVMContext &ctx) const { + return MDTuple::get(ctx, { + MDString::get(ctx, cpu_name), + MDString::get(ctx, cpu_features), + ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(ctx), base)), + ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(ctx), flags)) + }); + } +}; + +static Optional<std::vector<TargetSpec>> get_target_specs(Module &M) { + auto md = M.getModuleFlag("julia.mv.specs"); + if (!md) + return None; + auto tup = cast<MDTuple>(md); + std::vector<TargetSpec> out(tup->getNumOperands()); + for (unsigned i = 0; i < tup->getNumOperands(); i++) { + out[i] = TargetSpec::fromMD(cast<MDTuple>(tup->getOperand(i).get())); + } + return out; +} + +static void set_target_specs(Module &M, ArrayRef<TargetSpec> specs) { + std::vector<Metadata *> md; + md.reserve(specs.size()); + for (auto &spec: specs) { + md.push_back(spec.toMD(M.getContext())); + } + M.addModuleFlag(Module::Error, "julia.mv.specs", MDTuple::get(M.getContext(), md)); +} + static void annotate_module_clones(Module &M) { CallGraph CG(M); std::vector<Function *> orig_funcs; @@ -149,7 +207,17 @@ static void annotate_module_clones(Module &M) { orig_funcs.push_back(&F); } bool has_veccall = false; - auto specs = jl_get_llvm_clone_targets(); + std::vector<TargetSpec> specs; + if (auto maybe_specs = get_target_specs(M)) { + specs = std::move(*maybe_specs); + } else { + auto full_specs = jl_get_llvm_clone_targets(); + specs.reserve(full_specs.size()); + for (auto &spec: full_specs) { + specs.push_back(TargetSpec::fromSpec(spec)); + } + set_target_specs(M, specs); + } std::vector<APInt> clones(orig_funcs.size(), APInt(specs.size(), 0)); BitVector subtarget_cloned(orig_funcs.size()); @@ -255,6 +323,7 @@ static void annotate_module_clones(Module &M) { if (has_veccall) { M.addModuleFlag(Module::Max, "julia.mv.veccall", 1); } + M.addModuleFlag(Module::Error, "julia.mv.annotated", 1); } struct CloneCtx { @@ -305,7 +374,7 @@ struct CloneCtx { void rewrite_alias(GlobalAlias 
*alias, Function* F); MDNode *tbaa_const; - std::vector<jl_target_spec_t> specs; + std::vector<TargetSpec> specs; std::vector<Group> groups{}; std::vector<Target *> linearized; std::vector<Function*> fvars; @@ -362,7 +431,7 @@ static inline std::vector<T*> consume_gv(Module &M, const char *name, bool allow // Collect basic information about targets and functions. CloneCtx::CloneCtx(Module &M, bool allow_bad_fvars) : tbaa_const(tbaa_make_child_with_context(M.getContext(), "jtbaa_const", nullptr, true).first), - specs(jl_get_llvm_clone_targets()), + specs(*get_target_specs(M)), fvars(consume_gv<Function>(M, "jl_fvars", allow_bad_fvars)), gvars(consume_gv<Constant>(M, "jl_gvars", false)), M(M), @@ -473,24 +542,24 @@ static void clone_function(Function *F, Function *new_f, ValueToValueMapTy &vmap #endif } -static void add_features(Function *F, StringRef name, StringRef features, uint32_t flags) +static void add_features(Function *F, TargetSpec &spec) { auto attr = F->getFnAttribute("target-features"); if (attr.isStringAttribute()) { std::string new_features(attr.getValueAsString()); new_features += ","; - new_features += features; + new_features += spec.cpu_features; F->addFnAttr("target-features", new_features); } else { - F->addFnAttr("target-features", features); + F->addFnAttr("target-features", spec.cpu_features); } - F->addFnAttr("target-cpu", name); + F->addFnAttr("target-cpu", spec.cpu_name); if (!F->hasFnAttribute(Attribute::OptimizeNone)) { - if (flags & JL_TARGET_OPTSIZE) { + if (spec.flags & JL_TARGET_OPTSIZE) { F->addFnAttr(Attribute::OptimizeForSize); } - else if (flags & JL_TARGET_MINSIZE) { + else if (spec.flags & JL_TARGET_MINSIZE) { F->addFnAttr(Attribute::MinSize); } } @@ -514,18 +583,19 @@ void CloneCtx::clone_bodies() if (!F->isDeclaration()) { clone_function(group_F, target_F, *target.vmap); } - add_features(target_F, specs[target.idx].cpu_name, - specs[target.idx].cpu_features, specs[target.idx].flags); + add_features(target_F, specs[target.idx]); target_F->addFnAttr("julia.mv.clone", std::to_string(i)); } } + // don't set the original function's features yet, + // since we may clone it for later groups if (i != 0) { - //TODO should we also do this for target 0? 
- add_features(group_F, specs[groups[i].idx].cpu_name, - specs[groups[i].idx].cpu_features, specs[groups[i].idx].flags); + add_features(group_F, specs[groups[i].idx]); } group_F->addFnAttr("julia.mv.clone", std::to_string(i)); } + // Add features to the original function + add_features(F, specs[0]); } } @@ -919,6 +989,18 @@ static bool runMultiVersioning(Module &M, bool allow_bad_fvars) return false; } + // for opt testing purposes + bool annotated = !!M.getModuleFlag("julia.mv.annotated"); + if (!annotated) { + annotate_module_clones(M); + } + + // also for opt testing purposes + if (M.getModuleFlag("julia.mv.skipcloning")) { + assert(!annotated && "Multiversioning was enabled and annotations were added, but cloning was skipped!"); + return true; + } + GlobalVariable *fvars = M.getGlobalVariable("jl_fvars"); GlobalVariable *gvars = M.getGlobalVariable("jl_gvars"); if (allow_bad_fvars && (!fvars || !fvars->hasInitializer() || !isa<ConstantArray>(fvars->getInitializer()) || From 3dcd1a23ac16748daa6770d44bf0825fa3981767 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Wed, 1 Feb 2023 12:50:33 -0500 Subject: [PATCH 25/34] Add multiversioning annotation test --- .../multiversioning-annotate-only.ll | 217 ++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 test/llvmpasses/multiversioning-annotate-only.ll diff --git a/test/llvmpasses/multiversioning-annotate-only.ll b/test/llvmpasses/multiversioning-annotate-only.ll new file mode 100644 index 0000000000000..38af146c078f5 --- /dev/null +++ b/test/llvmpasses/multiversioning-annotate-only.ll @@ -0,0 +1,217 @@ +; RUN: opt -enable-new-pm=0 -load libjulia-codegen%shlibext -JuliaMultiVersioning -S %s | FileCheck %s +; RUN: opt -enable-new-pm=1 --load-pass-plugin=libjulia-codegen%shlibext -passes='JuliaMultiVersioning' -S %s | FileCheck %s + +; COM: This test checks that multiversioning correctly picks up on features that should trigger cloning +; COM: Note that for annotations alone, we don't need jl_fvars or jl_gvars + +; COM: Copied from src/processor.h +; COM: JL_TARGET_VEC_CALL = 1 << 0, +; COM: // Clone all functions +; COM: JL_TARGET_CLONE_ALL = 1 << 1, +; COM: // Clone when there's scalar math operations that can benefit from target-specific +; COM: // optimizations. This includes `muladd`, `fma`, `fast`/`contract` flags. 
+; COM: JL_TARGET_CLONE_MATH = 1 << 2, +; COM: // Clone when the function has a loop +; COM: JL_TARGET_CLONE_LOOP = 1 << 3, +; COM: // Clone when the function uses any vectors +; COM: // When this is specified, the cloning pass should also record if any of the cloned functions +; COM: // used this in any function call (including the signature of the function itself) +; COM: JL_TARGET_CLONE_SIMD = 1 << 4, +; COM: // The CPU name is unknown +; COM: JL_TARGET_UNKNOWN_NAME = 1 << 5, +; COM: // Optimize for size for this target +; COM: JL_TARGET_OPTSIZE = 1 << 6, +; COM: // Only optimize for size for this target +; COM: JL_TARGET_MINSIZE = 1 << 7, +; COM: // Clone when the function queries CPU features +; COM: JL_TARGET_CLONE_CPU = 1 << 8, +; COM: // Clone when the function uses fp16 +; COM: JL_TARGET_CLONE_FLOAT16 = 1 << 9, + +; COM: start with the basics, just one feature per function + +; COM: boring should only be cloned if clone_all is enabled on the target +; CHECK: @boring{{.*}}#[[BORING_ATTRS:[0-9]+]] +define noundef i32 @boring(i32 noundef %0) { + ret i32 %0 +} + +; CHECK: @fastmath_test{{.*}}#[[FASTMATH_TEST_ATTRS:[0-9]+]] +define noundef float @fastmath_test(float noundef %0, float noundef %1) { + %3 = fadd fast float %0, %1 + ret float %3 +} + +; CHECK: @loop_test{{.*}}#[[LOOP_TEST_ATTRS:[0-9]+]] +define noundef i32 @loop_test(i32 noundef %0) { + %2 = icmp sgt i32 %0, 0 + br i1 %2, label %5, label %3 + +3: ; preds = %5, %1 + %4 = phi i32 [ 0, %1 ], [ %9, %5 ] + ret i32 %4 + +5: ; preds = %1, %5 + %6 = phi i32 [ %10, %5 ], [ 0, %1 ] + %7 = phi i32 [ %9, %5 ], [ 0, %1 ] + %8 = lshr i32 %6, 1 + %9 = add nuw nsw i32 %8, %7 + %10 = add nuw nsw i32 %6, 1 + %11 = icmp eq i32 %10, %0 + br i1 %11, label %3, label %5, !llvm.loop !9 +} + +; CHECK: @simd_test{{.*}}#[[SIMD_TEST_ATTRS:[0-9]+]] +define noundef i32 @simd_test(<4 x i32> noundef %0) { + %2 = extractelement <4 x i32> %0, i64 0 + ret i32 %2 +} + +; COM: now check all the combinations + +; CHECK: @simd_fastmath_test{{.*}}#[[SIMD_FASTMATH_TEST_ATTRS:[0-9]+]] +define noundef float @simd_fastmath_test(<4 x float> noundef %0) { + %2 = extractelement <4 x float> %0, i64 0 + %3 = extractelement <4 x float> %0, i64 1 + %4 = fadd fast float %2, %3 + ret float %4 +} + +; CHECK: @loop_fastmath_test{{.*}}#[[LOOP_FASTMATH_TEST_ATTRS:[0-9]+]] +define noundef i32 @loop_fastmath_test(i32 noundef %0) { + %2 = icmp sgt i32 %0, 0 + br i1 %2, label %7, label %5 + +3: ; preds = %7 + %4 = fptosi float %12 to i32 + br label %5 + +5: ; preds = %3, %1 + %6 = phi i32 [ 0, %1 ], [ %4, %3 ] + ret i32 %6 + +7: ; preds = %1, %7 + %8 = phi i32 [ %13, %7 ], [ 0, %1 ] + %9 = phi float [ %12, %7 ], [ 0.000000e+00, %1 ] + %10 = lshr i32 %8, 1 + %11 = sitofp i32 %10 to float + %12 = fadd fast float %9, %11 + %13 = add nuw nsw i32 %8, 1 + %14 = icmp eq i32 %13, %0 + br i1 %14, label %3, label %7, !llvm.loop !9 +} + +; CHECK: @simd_loop_test{{.*}}#[[SIMD_LOOP_TEST_ATTRS:[0-9]+]] +define dso_local noundef i32 @simd_loop_test(<4 x i32> noundef %0) { + %2 = extractelement <4 x i32> %0, i64 0 + %3 = icmp sgt i32 %2, 0 + br i1 %3, label %6, label %4 + +4: ; preds = %6, %1 + %5 = phi i32 [ 0, %1 ], [ %10, %6 ] + ret i32 %5 + +6: ; preds = %1, %6 + %7 = phi i32 [ %11, %6 ], [ 0, %1 ] + %8 = phi i32 [ %10, %6 ], [ 0, %1 ] + %9 = lshr i32 %7, 1 + %10 = add nuw nsw i32 %9, %8 + %11 = add nuw nsw i32 %7, 1 + %12 = icmp eq i32 %11, %2 + br i1 %12, label %4, label %6, !llvm.loop !9 +} + +; CHECK: @simd_loop_fastmath_test{{.*}}#[[SIMD_LOOP_FASTMATH_TEST_ATTRS:[0-9]+]] +define noundef 
i32 @simd_loop_fastmath_test(<4 x i32> noundef %0) { + %2 = extractelement <4 x i32> %0, i64 0 + %3 = icmp sgt i32 %2, 0 + br i1 %3, label %8, label %6 + +4: ; preds = %8 + %5 = fptosi float %13 to i32 + br label %6 + +6: ; preds = %4, %1 + %7 = phi i32 [ 0, %1 ], [ %5, %4 ] + ret i32 %7 + +8: ; preds = %1, %8 + %9 = phi i32 [ %14, %8 ], [ 0, %1 ] + %10 = phi float [ %13, %8 ], [ 0.000000e+00, %1 ] + %11 = lshr i32 %9, 1 + %12 = sitofp i32 %11 to float + %13 = fadd fast float %10, %12 + %14 = add nuw nsw i32 %9, 1 + %15 = icmp eq i32 %14, %2 + br i1 %15, label %4, label %8, !llvm.loop !9 +} + +; COM: check for fvar and reloc annotations on functions used by other globals + +@func_gv = global i32 (i32)* @func_in_gv, align 8 + +; CHECK: @func_in_gv{{.*}}#[[FUNC_IN_GV_ATTRS:[0-9]+]] +define noundef i32 @func_in_gv(i32 noundef returned %0) { + ret i32 %0 +} + +@aliaser = alias i32 (i32)*, bitcast (i32 (i32)* @aliasee to i32 (i32)**) + +; CHECK: @aliasee{{.*}}#[[ALIASEE_ATTRS:[0-9]+]] +define i32 @aliasee(i32 noundef returned %0) { + ret i32 %0 +} + +; COM: check for reloc annotations on functions used by other functions +; CHECK: @cloned{{.*}}#[[CLONED_RELOC_ATTRS:[0-9]+]] +define noundef float @cloned(float noundef %0, float noundef %1) { + %3 = fadd fast float %0, %1 + ret float %3 +} + +define noundef i32 @uncloned(i32 noundef %0) { + %2 = sitofp i32 %0 to float + %3 = call noundef float @cloned(float noundef %2, float noundef %2) + %4 = fptosi float %3 to i32 + ret i32 %4 +} + +; COM: Note that these strings are hex-encoded bits of the target indices that will be cloned +; CHECK-DAG: attributes #[[BORING_ATTRS]] = { "julia.mv.clones"="2" } +; CHECK-DAG: attributes #[[FASTMATH_TEST_ATTRS]] = { "julia.mv.clones"="6" } +; CHECK-DAG: attributes #[[LOOP_TEST_ATTRS]] = { "julia.mv.clones"="A" } +; CHECK-DAG: attributes #[[SIMD_TEST_ATTRS]] = { "julia.mv.clones"="12" } +; CHECK-DAG: attributes #[[SIMD_FASTMATH_TEST_ATTRS]] = { "julia.mv.clones"="16" } +; CHECK-DAG: attributes #[[LOOP_FASTMATH_TEST_ATTRS]] = { "julia.mv.clones"="E" } +; CHECK-DAG: attributes #[[SIMD_LOOP_TEST_ATTRS]] = { "julia.mv.clones"="1A" } +; CHECK-DAG: attributes #[[SIMD_LOOP_FASTMATH_TEST_ATTRS]] = { "julia.mv.clones"="1E" } +; CHECK-DAG: attributes #[[FUNC_IN_GV_ATTRS]] +; CHECK-SAME: "julia.mv.clones"="2" +; CHECK-SAME: "julia.mv.fvar" +; CHECK-DAG: attributes #[[ALIASEE_ATTRS]] +; CHECK-SAME: "julia.mv.clones"="2" +; CHECK-SAME: "julia.mv.reloc" +; CHECK-DAG: attributes #[[CLONED_RELOC_ATTRS]] +; CHECK-SAME: "julia.mv.clones"="6" +; CHECK-SAME: "julia.mv.reloc" + +; CHECK-LABEL: !llvm.module.flags + +!llvm.module.flags = !{!0, !1, !2} + +; CHECK-DAG: julia.mv.enable +; CHECK-DAG: julia.mv.skipcloning +; CHECK-DAG: julia.mv.specs +; CHECK-DAG: julia.mv.annotated +; CHECK-DAG: julia.mv.veccall + +!0 = !{i32 1, !"julia.mv.enable", i32 1} +!1 = !{i32 1, !"julia.mv.skipcloning", i32 1} +!2 = !{i32 1, !"julia.mv.specs", !3} +!3 = !{!4, !5, !6, !7, !8} +!4 = !{!"cpubase", !"nofeatures", i32 0, i32 2} +!5 = !{!"cpucloneall", !"cloneall", i32 0, i32 2} +!6 = !{!"cpufastmath", !"fastmathclone", i32 0, i32 4} +!7 = !{!"cpuloop", !"loopclone", i32 0, i32 8} +!8 = !{!"cpusimd", !"simdclone", i32 0, i32 16} +!9 = !{!9} From b3d3ffbc3384451819aa9d1886f9c7230969411e Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Thu, 2 Feb 2023 10:48:52 -0500 Subject: [PATCH 26/34] Couple more tests for multiversioning --- test/llvmpasses/multiversioning-clone-only.ll | 50 +++++++++++++++++++ 1 file changed, 50 
insertions(+) create mode 100644 test/llvmpasses/multiversioning-clone-only.ll diff --git a/test/llvmpasses/multiversioning-clone-only.ll b/test/llvmpasses/multiversioning-clone-only.ll new file mode 100644 index 0000000000000..61bcdb8613306 --- /dev/null +++ b/test/llvmpasses/multiversioning-clone-only.ll @@ -0,0 +1,50 @@ +; RUN: opt -enable-new-pm=0 -load libjulia-codegen%shlibext -JuliaMultiVersioning -S %s | FileCheck %s --allow-unused-prefixes=false +; RUN: opt -enable-new-pm=1 --load-pass-plugin=libjulia-codegen%shlibext -passes='JuliaMultiVersioning' -S %s | FileCheck %s --allow-unused-prefixes=false + +@jl_fvars = global [0 x i64] zeroinitializer, align 16 +@jl_gvars = global [0 x i64] zeroinitializer, align 16 +@jl_fvar_idxs = global [0 x i32] zeroinitializer, align 16 +@jl_gvar_idxs = global [0 x i32] zeroinitializer, align 16 + +; CHECK-DAG: define{{.*}}@boring({{.*}}#[[BORING_DEFAULT_ATTRS:[0-9]+]] +; CHECK-DAG-NEXT: ret i32 %0 +; CHECK-DAG: define{{.*}}@boring.1({{.*}}#[[BORING_CLONEALL_ATTRS:[0-9]+]] +; CHECK-DAG-NEXT: ret i32 %0 +define noundef i32 @boring(i32 noundef %0) #0 { + ret i32 %0 +} + +; CHECK-DAG: declare{{.*}}@declaration({{.*}}#[[DECLARATION_DEFAULT_ATTRS:[0-9]+]] +; CHECK-DAG: declare{{.*}}@declaration.1({{.*}}#[[DECLARATION_CLONEALL_ATTRS:[0-9]+]] +declare i32 @declaration(i32 %0) #1 + +; CHECK: } + +; CHECK-DAG: attributes #[[BORING_DEFAULT_ATTRS:[0-9]+]] +; CHECK-SAME: { +; CHECK-DAG: "julia.mv.clones"="2" +; CHECK-DAG: "julia.mv.clone"="0" +; CHECK-DAG: "target-cpu"="cpubase" +; CHECK-DAG: "target-features"="nofeatures" +; CHECK-SAME: } +; CHECK-DAG: attributes #[[BORING_CLONEALL_ATTRS:[0-9]+]] +; CHECK-SAME: { +; CHECK-DAG: "julia.mv.clones"="2" +; CHECK-DAG: "julia.mv.clone"="1" +; CHECK-DAG: "target-cpu"="cpucloneall" +; CHECK-DAG: "target-features"="cloneall" +; CHECK-SAME: } +attributes #0 = {"julia.mv.clones"="2"} +attributes #1 = {"julia.mv.clones"="2" "test.unique"="1"} + +!llvm.module.flags = !{!0, !1, !2} + +!0 = !{i32 1, !"julia.mv.enable", i32 1} +!1 = !{i32 1, !"julia.mv.annotated", i32 1} +!2 = !{i32 1, !"julia.mv.specs", !3} +!3 = !{!4, !5, !6, !7, !8} +!4 = !{!"cpubase", !"nofeatures", i32 0, i32 2} +!5 = !{!"cpucloneall", !"cloneall", i32 0, i32 2} +!6 = !{!"cpufastmath", !"fastmathclone", i32 0, i32 4} +!7 = !{!"cpuloop", !"loopclone", i32 0, i32 8} +!8 = !{!"cpusimd", !"simdclone", i32 0, i32 16} \ No newline at end of file From e75e362dc936b2bf98028e887ec075f50c928c6b Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Thu, 2 Feb 2023 11:01:51 -0500 Subject: [PATCH 27/34] Inject CRT aliases with internal linkage within every shard --- src/aotcompile.cpp | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index fffc7839d74c9..5e8618d637b3e 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -485,8 +485,7 @@ static void injectCRTAlias(Module &M, StringRef name, StringRef alias, FunctionT if (!target) { target = Function::Create(FT, Function::ExternalLinkage, alias, M); } - Function *interposer = Function::Create(FT, Function::ExternalLinkage, name, M); - interposer->setVisibility(GlobalValue::HiddenVisibility); + Function *interposer = Function::Create(FT, Function::InternalLinkage, name, M); appendToCompilerUsed(M, {interposer}); llvm::IRBuilder<> builder(BasicBlock::Create(M.getContext(), "top", interposer)); @@ -891,6 +890,30 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, 
std::string *out #endif optimizer.run(M); assert(!verifyModule(M, &errs())); + bool inject_aliases = false; + for (auto &F : M.functions()) { + if (!F.isDeclaration() && F.getName() != "_DllMainCRTStartup") { + inject_aliases = true; + break; + } + } + // no need to inject aliases if we have no functions + if (inject_aliases) { + // We would like to emit an alias or an weakref alias to redirect these symbols + // but LLVM doesn't let us emit a GlobalAlias to a declaration... + // So for now we inject a definition of these functions that calls our runtime + // functions. We do so after optimization to avoid cloning these functions. + injectCRTAlias(M, "__gnu_h2f_ieee", "julia__gnu_h2f_ieee", + FunctionType::get(Type::getFloatTy(M.getContext()), { Type::getHalfTy(M.getContext()) }, false)); + injectCRTAlias(M, "__extendhfsf2", "julia__gnu_h2f_ieee", + FunctionType::get(Type::getFloatTy(M.getContext()), { Type::getHalfTy(M.getContext()) }, false)); + injectCRTAlias(M, "__gnu_f2h_ieee", "julia__gnu_f2h_ieee", + FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getFloatTy(M.getContext()) }, false)); + injectCRTAlias(M, "__truncsfhf2", "julia__gnu_f2h_ieee", + FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getFloatTy(M.getContext()) }, false)); + injectCRTAlias(M, "__truncdfhf2", "julia__truncdfhf2", + FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getDoubleTy(M.getContext()) }, false)); + } timers.optimize.stopTimer(); @@ -1440,23 +1463,6 @@ void jl_dump_native_impl(void *native_code, sysimageM->setOverrideStackAlignment(dataM->getOverrideStackAlignment()); #endif - if (!TheTriple.isOSDarwin()) { - // We would like to emit an alias or an weakref alias to redirect these symbols - // but LLVM doesn't let us emit a GlobalAlias to a declaration... - // So for now we inject a definition of these functions that calls our runtime - // functions. We do so after optimization to avoid cloning these functions. - injectCRTAlias(*sysimageM, "__gnu_h2f_ieee", "julia__gnu_h2f_ieee", - FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false)); - injectCRTAlias(*sysimageM, "__extendhfsf2", "julia__gnu_h2f_ieee", - FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false)); - injectCRTAlias(*sysimageM, "__gnu_f2h_ieee", "julia__gnu_f2h_ieee", - FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false)); - injectCRTAlias(*sysimageM, "__truncsfhf2", "julia__gnu_f2h_ieee", - FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false)); - injectCRTAlias(*sysimageM, "__truncdfhf2", "julia__truncdfhf2", - FunctionType::get(Type::getHalfTy(Context), { Type::getDoubleTy(Context) }, false)); - } - if (TheTriple.isOSWindows()) { // Windows expect that the function `_DllMainStartup` is present in an dll. 
// Normal compilers use something like Zig's crtdll.c instead we provide a From 65e6de2a6265243cb750b3551d5ac86029e7ffad Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Fri, 3 Feb 2023 00:58:25 -0500 Subject: [PATCH 28/34] Expand on the multiversioning tests --- src/llvm-multiversioning.cpp | 42 +++- test/llvmpasses/multiversioning-clone-only.ll | 193 ++++++++++++++++-- 2 files changed, 216 insertions(+), 19 deletions(-) diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index b4f67ebe22c7d..6e9bbe85aa7f6 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -364,7 +364,9 @@ struct CloneCtx { void clone_decls(); void clone_bodies(); void fix_gv_uses(); + void finalize_orig_clone_attr(); void fix_inst_uses(); + void finalize_orig_features(); void emit_metadata(); private: void prepare_vmap(ValueToValueMapTy &vmap); @@ -399,6 +401,8 @@ static inline std::vector<T*> consume_gv(Module &M, const char *name, bool allow // Strip them from the Module so that it's easier to handle the uses. GlobalVariable *gv = M.getGlobalVariable(name); assert(gv && gv->hasInitializer()); + dbgs() << "Consume " << *gv << ":\n"; + dbgs() << *gv->getType() << "\n"; ArrayType *Ty = cast<ArrayType>(gv->getInitializer()->getType()); unsigned nele = Ty->getArrayNumElements(); std::vector<T*> res(nele); @@ -417,6 +421,7 @@ static inline std::vector<T*> consume_gv(Module &M, const char *name, bool allow nele--; continue; } + dbgs() << *val << ": " << *val->getType() << "\n"; res[i++] = cast<T>(val); } res.resize(nele); @@ -584,18 +589,20 @@ void CloneCtx::clone_bodies() clone_function(group_F, target_F, *target.vmap); } add_features(target_F, specs[target.idx]); - target_F->addFnAttr("julia.mv.clone", std::to_string(i)); + target_F->addFnAttr("julia.mv.clone", std::to_string(target.idx)); } } // don't set the original function's features yet, // since we may clone it for later groups if (i != 0) { add_features(group_F, specs[groups[i].idx]); + group_F->addFnAttr("julia.mv.clone", std::to_string(groups[i].idx)); } - group_F->addFnAttr("julia.mv.clone", std::to_string(i)); } - // Add features to the original function - add_features(F, specs[0]); + // still don't set the original function's features yet, + // since we'll copy function attributes if we need to rewrite + // the alias, and target specific attributes are illegal on + // alias trampolines unless the user explicitly specifies them } } @@ -658,6 +665,11 @@ void CloneCtx::rewrite_alias(GlobalAlias *alias, Function *F) Function::Create(F->getFunctionType(), alias->getLinkage(), "", &M); trampoline->copyAttributesFrom(F); trampoline->takeName(alias); + trampoline->setVisibility(alias->getVisibility()); + // drop multiversioning attributes, add alias attribute for testing purposes + trampoline->removeFnAttr("julia.mv.reloc"); + trampoline->removeFnAttr("julia.mv.clones"); + trampoline->addFnAttr("julia.mv.alias"); alias->eraseFromParent(); uint32_t id; @@ -727,6 +739,15 @@ void CloneCtx::fix_gv_uses() } } +void CloneCtx::finalize_orig_clone_attr() +{ + for (auto orig_f: orig_funcs) { + if (!orig_f->hasFnAttribute("julia.mv.clones")) + continue; + orig_f->addFnAttr("julia.mv.clone", "0"); + } +} + std::pair<uint32_t,GlobalVariable*> CloneCtx::get_reloc_slot(Function *F) const { if (F->isDeclaration()) { @@ -814,6 +835,12 @@ void CloneCtx::fix_inst_uses() } } +void CloneCtx::finalize_orig_features() { + for (auto F : orig_funcs) { + add_features(F, specs[0]); + } +} + static Constant 
*get_ptrdiff32(Constant *ptr, Constant *base) { if (ptr->getType()->isPointerTy()) @@ -1021,6 +1048,10 @@ static bool runMultiVersioning(Module &M, bool allow_bad_fvars) // These relocations must be initialized for **ALL** targets. clone.fix_gv_uses(); + // Now we have all the cloned functions, we can set the original functions' + // clone attribute to be 0 + clone.finalize_orig_clone_attr(); + // For each group, scan all functions cloned by **PARTIALLY** cloned targets for // instruction use. // A function needs a const relocation slot if it is cloned and is called by a @@ -1031,6 +1062,9 @@ static bool runMultiVersioning(Module &M, bool allow_bad_fvars) // A target needs a slot to be initialized iff at least one caller is not initialized. clone.fix_inst_uses(); + //Now set the original functions' target-specific attributes, since nobody will look at those again + clone.finalize_orig_features(); + // Store back sysimg information with the correct format. // At this point, we should have fixed up all the uses of the cloned functions // and collected all the shared/target-specific relocations. diff --git a/test/llvmpasses/multiversioning-clone-only.ll b/test/llvmpasses/multiversioning-clone-only.ll index 61bcdb8613306..a5c327548d702 100644 --- a/test/llvmpasses/multiversioning-clone-only.ll +++ b/test/llvmpasses/multiversioning-clone-only.ll @@ -1,41 +1,202 @@ ; RUN: opt -enable-new-pm=0 -load libjulia-codegen%shlibext -JuliaMultiVersioning -S %s | FileCheck %s --allow-unused-prefixes=false ; RUN: opt -enable-new-pm=1 --load-pass-plugin=libjulia-codegen%shlibext -passes='JuliaMultiVersioning' -S %s | FileCheck %s --allow-unused-prefixes=false -@jl_fvars = global [0 x i64] zeroinitializer, align 16 -@jl_gvars = global [0 x i64] zeroinitializer, align 16 -@jl_fvar_idxs = global [0 x i32] zeroinitializer, align 16 -@jl_gvar_idxs = global [0 x i32] zeroinitializer, align 16 - -; CHECK-DAG: define{{.*}}@boring({{.*}}#[[BORING_DEFAULT_ATTRS:[0-9]+]] -; CHECK-DAG-NEXT: ret i32 %0 -; CHECK-DAG: define{{.*}}@boring.1({{.*}}#[[BORING_CLONEALL_ATTRS:[0-9]+]] -; CHECK-DAG-NEXT: ret i32 %0 +; CHECK: @jl_fvar_idxs = hidden constant [1 x i32] zeroinitializer +; CHECK: @jl_gvar_idxs = hidden constant [0 x i32] zeroinitializer +; CHECK: @subtarget_cloned_gv = hidden global i64* null +; CHECK: @subtarget_cloned.reloc_slot = hidden global i32 (i32)* null +; CHECK: @jl_fvar_offsets = hidden constant [2 x i32] [i32 1, i32 0] +; CHECK: @jl_gvar_base = hidden constant i64 0 +; CHECK: @jl_gvar_offsets = hidden constant [1 x i32] zeroinitializer +; CHECK: @jl_clone_slots = hidden constant [5 x i32] +; CHECK-SAME: i32 2, i32 0, {{.*}} sub {{.*}}@subtarget_cloned.reloc_slot{{.*}}@jl_gvar_base +; CHECK: @jl_clone_idxs = hidden constant [13 x i32] +; COM: TODO actually check the clone idxs maybe? 
+; CHECK: @jl_clone_offsets = hidden constant [4 x i32] +; CHECK-SAME: sub +; CHECK-SAME: @subtarget_cloned.1 +; CHECK-SAME: @subtarget_cloned +; CHECK-SAME: sub +; CHECK-SAME: @subtarget_cloned.2 +; CHECK-SAME: @subtarget_cloned +; CHECK-SAME: sub + +@jl_fvars = global [1 x i64*] [i64* bitcast (i32 (i32)* @subtarget_cloned to i64*)], align 16 +@jl_gvars = global [0 x i64*] zeroinitializer, align 16 +@jl_fvar_idxs = hidden constant [1 x i32] [i32 0], align 16 +@jl_gvar_idxs = hidden constant [0 x i32] zeroinitializer, align 16 +@subtarget_cloned_gv = hidden global i64* bitcast (i32 (i32)* @subtarget_cloned to i64*), align 16 + +@subtarget_cloned_aliased = alias i32 (i32), i32 (i32)* @subtarget_cloned + +; CHECK: define{{.*}}@boring({{.*}}#[[BORING_DEFAULT_ATTRS:[0-9]+]] +; CHECK-NEXT: ret i32 %0 define noundef i32 @boring(i32 noundef %0) #0 { ret i32 %0 } -; CHECK-DAG: declare{{.*}}@declaration({{.*}}#[[DECLARATION_DEFAULT_ATTRS:[0-9]+]] -; CHECK-DAG: declare{{.*}}@declaration.1({{.*}}#[[DECLARATION_CLONEALL_ATTRS:[0-9]+]] +; CHECK: declare{{.*}}@declaration({{.*}}#[[DECLARATION_DEFAULT_ATTRS:[0-9]+]] declare i32 @declaration(i32 %0) #1 -; CHECK: } +; CHECK: define{{.*}}@call_boring({{.*}}#[[BORING_DEFAULT_ATTRS]] +; CHECK-NEXT: %2 = call noundef i32 @boring(i32 noundef %0) +define noundef i32 @call_boring(i32 noundef %0) #0 { + %2 = call noundef i32 @boring(i32 noundef %0) + ret i32 %2 +} + +; CHECK: define{{.*}}@call_declaration({{.*}}#[[DECLARATION_DEFAULT_ATTRS]] +; CHECK-NEXT: %2 = call noundef i32 @declaration(i32 noundef %0) +define noundef i32 @call_declaration(i32 noundef %0) #1 { + %2 = call noundef i32 @declaration(i32 noundef %0) + ret i32 %2 +} + +; CHECK: define{{.*}}@subtarget_cloned({{.*}}#[[SUBTARGET_CLONED_DEFAULT_ATTRS:[0-9]+]] +; CHECK-NEXT: ret i32 0 +define noundef i32 @subtarget_cloned(i32 noundef %0) #2 { + ret i32 0 +} + +; COM: should fixup this callsite since 2 is cloned for a subtarget +; CHECK: define{{.*}}@call_subtarget_cloned({{.*}}#[[CALL_SUBTARGET_CLONED_DEFAULT_ATTRS:[0-9]+]] +; CHECK-NEXT: [[FUNC_PTR:%[0-9]+]] = load{{.*}}@subtarget_cloned.reloc_slot{{.*}}!tbaa ![[TBAA_CONST_METADATA:[0-9]+]], !invariant.load +; CHECK-NEXT: call{{.*}}[[FUNC_PTR]] +; CHECK: ret i32 +define noundef i32 @call_subtarget_cloned(i32 noundef %0) #3 { + %2 = call noundef i32 @subtarget_cloned(i32 noundef %0) + ret i32 %2 +} + +; CHECK: define{{.*}}@call_subtarget_cloned_but_not_cloned({{.*}}#[[BORING_DEFAULT_ATTRS]] +; CHECK-NEXT: [[FUNC_PTR:%[0-9]+]] = load{{.*}}@subtarget_cloned.reloc_slot{{.*}}!tbaa ![[TBAA_CONST_METADATA]], !invariant.load +; CHECK-NEXT: call{{.*}}[[FUNC_PTR]] +; CHECK: ret i32 +define noundef i32 @call_subtarget_cloned_but_not_cloned(i32 noundef %0) #0 { + %2 = call noundef i32 @subtarget_cloned(i32 noundef %0) + ret i32 %2 +} + +; CHECK: define{{.*}}@boring.1({{.*}}#[[BORING_CLONEALL_ATTRS:[0-9]+]] +; CHECK-NEXT: ret i32 %0 + +; CHECK: declare{{.*}}@declaration.1({{.*}}#[[DECLARATION_CLONEALL_ATTRS:[0-9]+]] + +; COM: should not fixup this callsite since boring is not cloned for a subtarget +; COM: also should call boring.1 instead of boring +; CHECK: define{{.*}}@call_boring.1({{.*}}#[[BORING_CLONEALL_ATTRS]] +; CHECK-NEXT: %2 = call noundef i32 @boring.1(i32 noundef %0) + +; CHECK: define{{.*}}@call_declaration.1({{.*}}#[[DECLARATION_CLONEALL_ATTRS]] +; CHECK-NEXT: %2 = call noundef i32 @declaration.1(i32 noundef %0) -; CHECK-DAG: attributes #[[BORING_DEFAULT_ATTRS:[0-9]+]] +; CHECK: 
define{{.*}}@subtarget_cloned.1({{.*}}#[[SUBTARGET_CLONED_CLONEALL_ATTRS:[0-9]+]] +; CHECK-NEXT: ret i32 0 + +; CHECK: define{{.*}}@subtarget_cloned.2({{.*}}#[[SUBTARGET_CLONED_FASTMATH_ATTRS:[0-9]+]] +; CHECK-NEXT: ret i32 0 + +; COM: should *NOT* fixup this callsite since subtarget_cloned is not cloned for a subtarget of the cloneall +; CHECK: define{{.*}}@call_subtarget_cloned.1({{.*}}#[[CALL_SUBTARGET_CLONED_CLONEALL_ATTRS:[0-9]+]] +; CHECK-NEXT: %2 = call noundef i32 @subtarget_cloned.1(i32 noundef %0) + +; CHECK: define {{.*}}@call_subtarget_cloned.2({{.*}}#[[CALL_SUBTARGET_CLONED_FASTMATH_ATTRS:[0-9]+]] +; CHECK-NEXT: %2 = call noundef i32 @subtarget_cloned.2(i32 noundef %0) + +; CHECK: define{{.*}}@call_subtarget_cloned_but_not_cloned.1({{.*}}#[[BORING_CLONEALL_ATTRS]] +; CHECK-NEXT: %2 = call noundef i32 @subtarget_cloned.1(i32 noundef %0) + +; COM: should not have cloned for fastmath +; CHECK-NOT: @subtarget_cloned_but_not_cloned.2 + +; COM: check for alias being rewritten to a function trampoline +; CHECK: define{{.*}}@subtarget_cloned_aliased{{.*}}#[[SUBTARGET_ALIASED_ATTRS:[0-9]+]] +; CHECK-NOT: } +; CHECK: [[FUNC_PTR:%[0-9]+]] = load{{.*}}@subtarget_cloned.reloc_slot{{.*}}!tbaa ![[TBAA_CONST_METADATA]], !invariant.load +; CHECK-NEXT: call{{.*}}[[FUNC_PTR]] +; CHECK: ret i32 + +; CHECK: attributes #[[BORING_DEFAULT_ATTRS]] ; CHECK-SAME: { ; CHECK-DAG: "julia.mv.clones"="2" ; CHECK-DAG: "julia.mv.clone"="0" ; CHECK-DAG: "target-cpu"="cpubase" ; CHECK-DAG: "target-features"="nofeatures" ; CHECK-SAME: } -; CHECK-DAG: attributes #[[BORING_CLONEALL_ATTRS:[0-9]+]] +; CHECK: attributes #[[DECLARATION_DEFAULT_ATTRS]] ; CHECK-SAME: { ; CHECK-DAG: "julia.mv.clones"="2" +; CHECK-DAG: "julia.mv.clone"="0" +; CHECK-DAG: "target-cpu"="cpubase" +; CHECK-DAG: "target-features"="nofeatures" +; CHECK-SAME: } +; CHECK: attributes #[[SUBTARGET_CLONED_DEFAULT_ATTRS]] +; CHECK-SAME: { +; CHECK-DAG: "julia.mv.clones"="6" +; CHECK-DAG: "julia.mv.clone"="0" +; CHECK-DAG: "target-cpu"="cpubase" +; CHECK-DAG: "target-features"="nofeatures" +; CHECK-DAG: "julia.mv.reloc" +; CHECK-SAME: } +; CHECK: attributes #[[CALL_SUBTARGET_CLONED_DEFAULT_ATTRS]] +; CHECK-SAME: { +; CHECK-DAG: "julia.mv.clones"="6" +; CHECK-DAG: "julia.mv.clone"="0" +; CHECK-DAG: "target-cpu"="cpubase" +; CHECK-DAG: "target-features"="nofeatures" +; CHECK-SAME: } +; CHECK: attributes #[[BORING_CLONEALL_ATTRS]] +; CHECK-SAME: { +; CHECK-DAG: "julia.mv.clones"="2" +; CHECK-DAG: "julia.mv.clone"="1" +; CHECK-DAG: "target-cpu"="cpucloneall" +; CHECK-DAG: "target-features"="cloneall" +; CHECK-SAME: } +; CHECK: attributes #[[DECLARATION_CLONEALL_ATTRS]] +; CHECK-SAME: { +; CHECK-DAG: "julia.mv.clones"="2" +; CHECK-DAG: "julia.mv.clone"="1" +; CHECK-DAG: "target-cpu"="cpucloneall" +; CHECK-DAG: "target-features"="cloneall" +; CHECK-SAME: } +; CHECK: attributes #[[SUBTARGET_CLONED_CLONEALL_ATTRS]] +; CHECK-SAME: { +; CHECK-DAG: "julia.mv.clones"="6" ; CHECK-DAG: "julia.mv.clone"="1" ; CHECK-DAG: "target-cpu"="cpucloneall" ; CHECK-DAG: "target-features"="cloneall" +; CHECK-DAG: "julia.mv.reloc" +; CHECK-SAME: } +; CHECK: attributes #[[SUBTARGET_CLONED_FASTMATH_ATTRS]] +; CHECK-SAME: { +; CHECK-DAG: "julia.mv.clones"="6" +; CHECK-DAG: "julia.mv.clone"="2" +; CHECK-DAG: "target-cpu"="cpufastmath" +; CHECK-DAG: "target-features"="fastmathclone" +; CHECK-DAG: "julia.mv.reloc" +; CHECK-SAME: } +; CHECK: attributes #[[CALL_SUBTARGET_CLONED_CLONEALL_ATTRS]] +; CHECK-SAME: { +; CHECK-DAG: "julia.mv.clones"="6" +; CHECK-DAG: "julia.mv.clone"="1" +; 
CHECK-DAG: "target-cpu"="cpucloneall" +; CHECK-DAG: "target-features"="cloneall" +; CHECK-SAME: } +; CHECK: attributes #[[CALL_SUBTARGET_CLONED_FASTMATH_ATTRS]] +; CHECK-SAME: { +; CHECK-DAG: "julia.mv.clones"="6" +; CHECK-DAG: "julia.mv.clone"="2" +; CHECK-DAG: "target-cpu"="cpufastmath" +; CHECK-DAG: "target-features"="fastmathclone" +; CHECK-SAME: } +; CHECK: attributes #[[SUBTARGET_ALIASED_ATTRS]] +; CHECK-SAME: { +; CHECK-SAME: "julia.mv.alias" ; CHECK-SAME: } attributes #0 = {"julia.mv.clones"="2"} attributes #1 = {"julia.mv.clones"="2" "test.unique"="1"} +attributes #2 = {"julia.mv.clones"="6" "julia.mv.reloc"} +attributes #3 = {"julia.mv.clones"="6"} !llvm.module.flags = !{!0, !1, !2} @@ -47,4 +208,6 @@ attributes #1 = {"julia.mv.clones"="2" "test.unique"="1"} !5 = !{!"cpucloneall", !"cloneall", i32 0, i32 2} !6 = !{!"cpufastmath", !"fastmathclone", i32 0, i32 4} !7 = !{!"cpuloop", !"loopclone", i32 0, i32 8} -!8 = !{!"cpusimd", !"simdclone", i32 0, i32 16} \ No newline at end of file +!8 = !{!"cpusimd", !"simdclone", i32 0, i32 16} +; CHECK-DAG: ![[TBAA_CONST_METADATA]] = !{![[JTBAA_CONST_METADATA:[0-9]+]], ![[JTBAA_CONST_METADATA]] +; CHECK-DAG: ![[JTBAA_CONST_METADATA]] = !{!"jtbaa_const" From 556122393ab3762f6d19fc3f19c83739065b8c28 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Fri, 3 Feb 2023 03:42:59 -0500 Subject: [PATCH 29/34] Remove stray debug prints --- src/llvm-multiversioning.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index 6e9bbe85aa7f6..cbce76d702119 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -401,8 +401,6 @@ static inline std::vector<T*> consume_gv(Module &M, const char *name, bool allow // Strip them from the Module so that it's easier to handle the uses. 
GlobalVariable *gv = M.getGlobalVariable(name); assert(gv && gv->hasInitializer()); - dbgs() << "Consume " << *gv << ":\n"; - dbgs() << *gv->getType() << "\n"; ArrayType *Ty = cast<ArrayType>(gv->getInitializer()->getType()); unsigned nele = Ty->getArrayNumElements(); std::vector<T*> res(nele); @@ -421,7 +419,6 @@ static inline std::vector<T*> consume_gv(Module &M, const char *name, bool allow nele--; continue; } - dbgs() << *val << ": " << *val->getType() << "\n"; res[i++] = cast<T>(val); } res.resize(nele); From fef319cf11394caf3526460758d4e57196bd2322 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Mon, 13 Feb 2023 13:42:27 -0500 Subject: [PATCH 30/34] Track gvar count --- src/processor.cpp | 1 + src/processor.h | 1 + src/staticdata.c | 1 + 3 files changed, 3 insertions(+) diff --git a/src/processor.cpp b/src/processor.cpp index 851cbec62560a..fec2b77102f55 100644 --- a/src/processor.cpp +++ b/src/processor.cpp @@ -773,6 +773,7 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback) offsets[i] = gvars[i] - (const char *)res.gvars_base; } res.gvars_offsets = offsets; + res.ngvars = gvars.size(); } if (!clones.empty()) { diff --git a/src/processor.h b/src/processor.h index 73271290eff76..6445f221882ba 100644 --- a/src/processor.h +++ b/src/processor.h @@ -159,6 +159,7 @@ typedef struct { uint64_t base; uintptr_t *gvars_base; const int32_t *gvars_offsets; + uint32_t ngvars; jl_image_fptrs_t fptrs; } jl_image_t; diff --git a/src/staticdata.c b/src/staticdata.c index 94e93f4198b4c..d832cda995a94 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -1901,6 +1901,7 @@ static void jl_update_all_gvars(jl_serializer_state *s, jl_image_t *image, uint3 reloc_t *gvars = (reloc_t*)&s->gvar_record->buf[0]; int gvar_link_index = 0; int external_fns_link_index = 0; + assert(l == image->ngvars); for (i = 0; i < l; i++) { uintptr_t offset = gvars[i]; uintptr_t v = 0; From acc54d9e93567de0077ba248206f1a02f4d7b0cd Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Fri, 17 Feb 2023 15:40:21 -0500 Subject: [PATCH 31/34] Add more assertions --- src/aotcompile.cpp | 51 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 5e8618d637b3e..f74c9f92d3093 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -354,10 +354,14 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm // process the globals array, before jl_merge_module destroys them std::vector<std::string> gvars(params.globals.size()); data->jl_value_to_llvm.resize(params.globals.size()); + StringSet<> gvars_names; + DenseSet<GlobalValue *> gvars_set; size_t idx = 0; for (auto &global : params.globals) { gvars[idx] = global.second->getName().str(); + assert(gvars_set.insert(global.second).second && "Duplicate gvar in params!"); + assert(gvars_names.insert(gvars[idx]).second && "Duplicate gvar name in params!"); data->jl_value_to_llvm[idx] = global.first; idx++; } @@ -374,7 +378,10 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm GlobalVariable *F = extern_fn.second; size_t idx = gvars.size() - offset; assert(idx >= 0); - data->jl_external_to_llvm.at(idx) = this_code; + assert(idx < data->jl_external_to_llvm.size()); + data->jl_external_to_llvm[idx] = this_code; + assert(gvars_set.insert(F).second && "Duplicate gvar in params!"); + assert(gvars_names.insert(F->getName()).second && "Duplicate gvar name in 
params!"); gvars.push_back(std::string(F->getName())); } @@ -575,12 +582,18 @@ static void get_fvars_gvars(Module &M, DenseMap<GlobalValue *, unsigned> &fvars, auto gvars_init = cast<ConstantArray>(gvars_gv->getInitializer()); for (unsigned i = 0; i < fvars_init->getNumOperands(); ++i) { auto gv = cast<GlobalValue>(fvars_init->getOperand(i)->stripPointerCasts()); + assert(gv && gv->hasName() && "fvar must be a named global"); + assert(!fvars.count(gv) && "Duplicate fvar"); fvars[gv] = i; } + assert(fvars.size() == fvars_init->getNumOperands()); for (unsigned i = 0; i < gvars_init->getNumOperands(); ++i) { auto gv = cast<GlobalValue>(gvars_init->getOperand(i)->stripPointerCasts()); + assert(gv && gv->hasName() && "gvar must be a named global"); + assert(!gvars.count(gv) && "Duplicate gvar"); gvars[gv] = i; } + assert(gvars.size() == gvars_init->getNumOperands()); fvars_gv->eraseFromParent(); gvars_gv->eraseFromParent(); fvars_idxs->eraseFromParent(); @@ -606,9 +619,11 @@ static size_t getFunctionWeight(const Function &F) } -static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M) { +static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M, size_t fvars_size, size_t gvars_size) { bool bad = false; -#ifdef JL_DEBUG_BUILD +#ifndef JL_NDEBUG + SmallVector<uint32_t> fvars(fvars_size); + SmallVector<uint32_t> gvars(gvars_size); StringMap<uint32_t> GVNames; for (uint32_t i = 0; i < partitions.size(); i++) { for (auto &name : partitions[i].globals) { @@ -618,7 +633,21 @@ static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partiti } GVNames[name.getKey()] = i; } - dbgs() << "partition: " << i << " fvars: " << partitions[i].fvars.size() << " gvars: " << partitions[i].gvars.size() << "\n"; + for (auto &fvar : partitions[i].fvars) { + if (fvars[fvar.second] != 0) { + bad = true; + dbgs() << "Duplicate fvar " << fvar.first() << " in partitions " << i << " and " << fvars[fvar.second] - 1 << "\n"; + } + fvars[fvar.second] = i+1; + } + for (auto &gvar : partitions[i].gvars) { + if (gvars[gvar.second] != 0) { + bad = true; + dbgs() << "Duplicate gvar " << gvar.first() << " in partitions " << i << " and " << gvars[gvar.second] - 1 << "\n"; + } + gvars[gvar.second] = i+1; + } + // dbgs() << "partition: " << i << " fvars: " << partitions[i].fvars.size() << " gvars: " << partitions[i].gvars.size() << "\n"; } for (auto &GV : M.globals()) { if (GV.isDeclaration()) { @@ -637,6 +666,18 @@ static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partiti } } } + for (uint32_t i = 0; i < fvars_size; i++) { + if (fvars[i] == 0) { + bad = true; + dbgs() << "fvar " << i << " not in any partition\n"; + } + } + for (uint32_t i = 0; i < gvars_size; i++) { + if (gvars[i] == 0) { + bad = true; + dbgs() << "gvar " << i << " not in any partition\n"; + } + } #endif return !bad; } @@ -766,7 +807,7 @@ static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) { } } - bool verified = verify_partitioning(partitions, M); + bool verified = verify_partitioning(partitions, M, fvars.size(), gvars.size()); assert(verified && "Partitioning failed to partition globals correctly"); (void) verified; From 27f1ccdf55078670d4f4f0a7407e53df580134d5 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Sun, 5 Mar 2023 23:14:01 -0500 Subject: [PATCH 32/34] Move dbgs under LLVM_DEBUG --- src/aotcompile.cpp | 152 ++++++++++++++++++++++++++++----------------- 1 file changed, 95 
insertions(+), 57 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index f74c9f92d3093..d512ad586a680 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -600,24 +600,72 @@ static void get_fvars_gvars(Module &M, DenseMap<GlobalValue *, unsigned> &fvars, gvars_idxs->eraseFromParent(); } -static size_t getFunctionWeight(const Function &F) +struct FunctionInfo { + size_t weight; + size_t bbs; + size_t insts; + size_t clones; +}; + +static FunctionInfo getFunctionWeight(const Function &F) { - size_t weight = 1; + FunctionInfo info; + info.weight = 1; + info.bbs = F.size(); + info.insts = 0; + info.clones = 1; for (const BasicBlock &BB : F) { - weight += BB.size(); + info.insts += BB.size(); } - // more basic blocks = more complex than just sum of insts, - // add some weight to it - weight += F.size(); if (F.hasFnAttribute("julia.mv.clones")) { auto val = F.getFnAttribute("julia.mv.clones").getValueAsString(); // base16, so must be at most 4 * length bits long // popcount gives number of clones - weight *= APInt(val.size() * 4, val, 16).countPopulation() + 1; + info.clones = APInt(val.size() * 4, val, 16).countPopulation() + 1; } - return weight; + info.weight += info.insts; + // more basic blocks = more complex than just sum of insts, + // add some weight to it + info.weight += info.bbs; + info.weight *= info.clones; + return info; } +struct ModuleInfo { + size_t globals; + size_t funcs; + size_t bbs; + size_t insts; + size_t clones; + size_t weight; +}; + +ModuleInfo compute_module_info(Module &M) { + ModuleInfo info; + info.globals = 0; + info.funcs = 0; + info.bbs = 0; + info.insts = 0; + info.clones = 0; + info.weight = 0; + for (auto &G : M.global_values()) { + if (G.isDeclaration()) { + continue; + } + info.globals++; + if (auto F = dyn_cast<Function>(&G)) { + info.funcs++; + auto func_info = getFunctionWeight(*F); + info.bbs += func_info.bbs; + info.insts += func_info.insts; + info.clones += func_info.clones; + info.weight += func_info.weight; + } else { + info.weight += 1; + } + } + return info; +} static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M, size_t fvars_size, size_t gvars_size) { bool bad = false; @@ -647,7 +695,6 @@ static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partiti } gvars[gvar.second] = i+1; } - // dbgs() << "partition: " << i << " fvars: " << partitions[i].fvars.size() << " gvars: " << partitions[i].gvars.size() << "\n"; } for (auto &GV : M.globals()) { if (GV.isDeclaration()) { @@ -736,7 +783,7 @@ static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) { if (G.isDeclaration()) continue; if (isa<Function>(G)) { - partitioner.make(&G, getFunctionWeight(cast<Function>(G))); + partitioner.make(&G, getFunctionWeight(cast<Function>(G)).weight); } else { partitioner.make(&G, 1); } @@ -1141,7 +1188,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o std::vector<NewArchiveMember> &unopt, std::vector<NewArchiveMember> &opt, std::vector<NewArchiveMember> &obj, std::vector<NewArchiveMember> &asm_, bool unopt_out, bool opt_out, bool obj_out, bool asm_out, - unsigned threads) { + unsigned threads, ModuleInfo module_info) { unsigned outcount = unopt_out + opt_out + obj_out + asm_out; assert(outcount); outputs.resize(outputs.size() + outcount * threads); @@ -1235,8 +1282,6 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o auto M = 
cantFail(getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx), "Error loading module"); timers[i].deserialize.stopTimer(); - // dbgs() << "Starting shard " << i << " with weight=" << partitions[i].weight << "\n"; - timers[i].materialize.startTimer(); materializePreserved(*M, partitions[i]); timers[i].materialize.stopTimer(); @@ -1271,54 +1316,37 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o for (auto &t : timers) { t.print(dbgs(), true); } + dbgs() << "Partition weights: ["; + bool comma = false; + for (auto &p : partitions) { + if (comma) + dbgs() << ", "; + else + comma = true; + dbgs() << p.weight; + } + dbgs() << "]\n"; } } -unsigned compute_image_thread_count(Module &M) { +static unsigned compute_image_thread_count(const ModuleInfo &info) { // 32-bit systems are very memory-constrained #ifdef _P32 - // dbgs() << "Threads: 1\n"; + LLVM_DEBUG(dbgs() << "32-bit systems are restricted to a single thread\n"); return 1; #endif - size_t weight = 0; - size_t globals = 0; - for (auto &GV : M.global_values()) { - if (GV.isDeclaration()) - continue; - globals++; - if (isa<Function>(GV)) { - weight += getFunctionWeight(cast<Function>(GV)); - } else { - weight += 1; - } - } - // dbgs() << "Module weight: " << weight << "\n"; - if (weight < 1000) { - // dbgs() << "Low module complexity bailout\n"; - // dbgs() << "Threads: 1\n"; + // This is not overridable because empty modules do occasionally appear, but they'll be very small and thus exit early to + // known easy behavior. Plus they really don't warrant multiple threads + if (info.weight < 1000) { + LLVM_DEBUG(dbgs() << "Small module, using a single thread\n"); return 1; } - unsigned threads = std::max(llvm::hardware_concurrency().compute_thread_count() / 2, 1u); - - // memory limit check - // many threads use a lot of memory, so limit on constrained memory systems - size_t available = uv_get_available_memory(); - // crude estimate, available / (weight * fudge factor) = max threads - size_t fudge = 10; - unsigned max_threads = std::max(available / (weight * fudge), (size_t)1); - // dbgs() << "Available memory: " << available << " bytes\n"; - // dbgs() << "Max threads: " << max_threads << "\n"; - // dbgs() << "Temporarily disabling memory limiting threads\n"; - //TODO reenable - // if (max_threads < threads) { - // dbgs() << "Memory limiting threads to " << max_threads << "\n"; - // threads = max_threads; - // } - - max_threads = globals / 100; + unsigned threads = std::max(jl_cpu_threads() / 2, 1); + + auto max_threads = info.globals / 100; if (max_threads < threads) { - // dbgs() << "Low global count limiting threads to " << max_threads << " (" << globals << "globals)\n"; + LLVM_DEBUG(dbgs() << "Low global count limiting threads to " << max_threads << " (" << info.globals << "globals)\n"); threads = max_threads; } @@ -1331,7 +1359,7 @@ unsigned compute_image_thread_count(Module &M) { if (*endptr || !requested) { jl_safe_printf("WARNING: invalid value '%s' for JULIA_IMAGE_THREADS\n", env_threads); } else { - // dbgs() << "Overriding threads to " << requested << " due to JULIA_IMAGE_THREADS\n"; + LLVM_DEBUG(dbgs() << "Overriding threads to " << requested << " due to JULIA_IMAGE_THREADS\n"); threads = requested; env_threads_set = true; } @@ -1345,7 +1373,7 @@ unsigned compute_image_thread_count(Module &M) { if (*endptr || !requested) { jl_safe_printf("WARNING: invalid value '%s' for JULIA_CPU_THREADS\n", fallbackenv); } else if (requested < threads) { - // 
dbgs() << "Overriding threads to " << requested << " due to JULIA_CPU_THREADS\n"; + LLVM_DEBUG(dbgs() << "Overriding threads to " << requested << " due to JULIA_CPU_THREADS\n"); threads = requested; } } @@ -1353,8 +1381,6 @@ unsigned compute_image_thread_count(Module &M) { threads = std::max(threads, 1u); - // dbgs() << "Threads: " << threads << "\n"; - return threads; } @@ -1369,7 +1395,7 @@ void jl_dump_native_impl(void *native_code, JL_TIMING(NATIVE_DUMP); jl_native_code_desc_t *data = (jl_native_code_desc_t*)native_code; if (!bc_fname && !unopt_bc_fname && !obj_fname && !asm_fname) { - // dbgs() << "No output requested, skipping native code dump?\n"; + LLVM_DEBUG(dbgs() << "No output requested, skipping native code dump?\n"); delete data; return; } @@ -1433,6 +1459,17 @@ void jl_dump_native_impl(void *native_code, unsigned nfvars = 0; unsigned ngvars = 0; + ModuleInfo module_info = compute_module_info(*dataM); + LLVM_DEBUG(dbgs() + << "Dumping module with stats:\n" + << " globals: " << module_info.globals << "\n" + << " functions: " << module_info.funcs << "\n" + << " basic blocks: " << module_info.bbs << "\n" + << " instructions: " << module_info.insts << "\n" + << " clones: " << module_info.clones << "\n" + << " weight: " << module_info.weight << "\n" + ); + // add metadata information if (imaging_mode) { multiversioning_preannotate(*dataM); @@ -1446,7 +1483,8 @@ void jl_dump_native_impl(void *native_code, } } } - threads = compute_image_thread_count(*dataM); + threads = compute_image_thread_count(module_info); + LLVM_DEBUG(dbgs() << "Using " << threads << " to emit aot image\n"); nfvars = data->jl_sysimg_fvars.size(); ngvars = data->jl_sysimg_gvars.size(); emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_gvars", T_psize); @@ -1484,7 +1522,7 @@ void jl_dump_native_impl(void *native_code, M, *SourceTM, outputs, names, unopt_bc_Archive, bc_Archive, obj_Archive, asm_Archive, !!unopt_bc_fname, !!bc_fname, !!obj_fname, !!asm_fname, - threads + threads, module_info ); }; std::array<StringRef, 4> text_names = { From 6b8ec27dbc582ba67f717e400b1bcff8f886c6d3 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Sun, 5 Mar 2023 23:57:51 -0500 Subject: [PATCH 33/34] Add some documentation --- src/aotcompile.cpp | 52 +++++++++-- src/llvm-multiversioning.cpp | 2 + src/processor.h | 164 +++++++++++++++++++---------------- 3 files changed, 133 insertions(+), 85 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index d512ad586a680..0337602cde27e 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -505,6 +505,7 @@ static void injectCRTAlias(Module &M, StringRef name, StringRef alias, FunctionT void multiversioning_preannotate(Module &M); +// See src/processor.h for documentation about this table. Corresponds to jl_image_shard_t. static GlobalVariable *emit_shard_table(Module &M, Type *T_size, Type *T_psize, unsigned threads) { SmallVector<Constant *, 0> tables(sizeof(jl_image_shard_t) / sizeof(void *) * threads); for (unsigned i = 0; i < threads; i++) { @@ -533,6 +534,7 @@ static GlobalVariable *emit_shard_table(Module &M, Type *T_size, Type *T_psize, return tables_gv; } +// See src/processor.h for documentation about this table. Corresponds to jl_image_ptls_t. 
static GlobalVariable *emit_ptls_table(Module &M, Type *T_size, Type *T_psize) { std::array<Constant *, 3> ptls_table{ new GlobalVariable(M, T_size, false, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), "jl_pgcstack_func_slot"), @@ -548,6 +550,7 @@ static GlobalVariable *emit_ptls_table(Module &M, Type *T_size, Type *T_psize) { return ptls_table_gv; } +// See src/processor.h for documentation about this table. Corresponds to jl_image_header_t. static GlobalVariable *emit_image_header(Module &M, unsigned threads, unsigned nfvars, unsigned ngvars) { constexpr uint32_t version = 1; std::array<uint32_t, 4> header{ @@ -562,13 +565,7 @@ static GlobalVariable *emit_image_header(Module &M, unsigned threads, unsigned n return header_gv; } -struct Partition { - StringSet<> globals; - StringMap<unsigned> fvars; - StringMap<unsigned> gvars; - size_t weight; -}; - +// Grab fvars and gvars data from the module static void get_fvars_gvars(Module &M, DenseMap<GlobalValue *, unsigned> &fvars, DenseMap<GlobalValue *, unsigned> &gvars) { auto fvars_gv = M.getGlobalVariable("jl_fvars"); auto gvars_gv = M.getGlobalVariable("jl_gvars"); @@ -600,6 +597,11 @@ static void get_fvars_gvars(Module &M, DenseMap<GlobalValue *, unsigned> &fvars, gvars_idxs->eraseFromParent(); } +// Weight computation +// It is important for multithreaded image building to be able to split work up +// among the threads equally. The weight calculated here is an estimation of +// how expensive a particular function is going to be to compile. + struct FunctionInfo { size_t weight; size_t bbs; @@ -667,6 +669,13 @@ ModuleInfo compute_module_info(Module &M) { return info; } +struct Partition { + StringSet<> globals; + StringMap<unsigned> fvars; + StringMap<unsigned> gvars; + size_t weight; +}; + static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M, size_t fvars_size, size_t gvars_size) { bool bad = false; #ifndef JL_NDEBUG @@ -729,7 +738,7 @@ static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partiti return !bad; } -// Chop a module up as equally as possible into threads partitions +// Chop a module up as equally as possible by weight into threads partitions static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) { //Start by stripping fvars and gvars, which helpfully removes their uses as well DenseMap<GlobalValue *, unsigned> fvars, gvars; @@ -926,6 +935,7 @@ struct ShardTimers { } }; +// Perform the actual optimization and emission of the output files static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *outputs, ArrayRef<StringRef> names, NewArchiveMember *unopt, NewArchiveMember *opt, NewArchiveMember *obj, NewArchiveMember *asm_, ShardTimers &timers, unsigned shardidx) { @@ -1048,6 +1058,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out } } +// serialize module to bitcode static auto serializeModule(const Module &M) { assert(!verifyModule(M, &errs()) && "Serializing invalid module!"); SmallVector<char, 0> ClonedModuleBuffer; @@ -1058,6 +1069,12 @@ static auto serializeModule(const Module &M) { return ClonedModuleBuffer; } +// Modules are deserialized lazily by LLVM, to avoid deserializing +// unnecessary functions. We take advantage of this by serializing +// the entire module once, then deleting the bodies of functions +// that are not in this partition. Once unnecessary functions are +// deleted, we then materialize the entire module to make use-lists +// consistent.
static void materializePreserved(Module &M, Partition &partition) { DenseSet<GlobalValue *> Preserve; for (auto &GV : M.global_values()) { @@ -1083,6 +1100,12 @@ static void materializePreserved(Module &M, Partition &partition) { } } } + // Global aliases are a pain to deal with. It is illegal to have an alias to a declaration, + // so we need to replace them with either a function or a global variable declaration. However, + // we can't just delete the alias, because that would break the users of the alias. Therefore, + // we do a dance where we point each global alias to a dummy function or global variable, + // then materialize the module to access use-lists, then replace all the uses, and finally commit + // to deleting the old alias. SmallVector<std::pair<GlobalAlias *, GlobalValue *>> DeletedAliases; for (auto &GA : M.aliases()) { if (!GA.isDeclaration()) { @@ -1116,6 +1139,7 @@ static void materializePreserved(Module &M, Partition &partition) { } } +// Reconstruct jl_fvars, jl_gvars, jl_fvars_idxs, and jl_gvars_idxs from the partition static void construct_vars(Module &M, Partition &partition) { std::vector<std::pair<uint32_t, GlobalValue *>> fvar_pairs; fvar_pairs.reserve(partition.fvars.size()); @@ -1168,6 +1192,8 @@ static void construct_vars(Module &M, Partition &partition) { gidxs_var->setVisibility(GlobalValue::HiddenVisibility); } +// Materialization will leave many unused declarations, which multiversioning would otherwise clone. +// This function removes them to avoid unnecessary cloning of declarations. static void dropUnusedDeclarations(Module &M) { SmallVector<GlobalValue *> unused; for (auto &G : M.global_values()) { @@ -1184,6 +1210,8 @@ static void dropUnusedDeclarations(Module &M) { G->eraseFromParent(); } +// Entrypoint to optionally-multithreaded image compilation. This handles global coordination of the threading, +// as well as partitioning, serialization, and deserialization. static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &outputs, ArrayRef<StringRef> names, std::vector<NewArchiveMember> &unopt, std::vector<NewArchiveMember> &opt, std::vector<NewArchiveMember> &obj, std::vector<NewArchiveMember> &asm_, @@ -1198,6 +1226,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o asm_.resize(asm_.size() + asm_out * threads); auto name = names[2]; name.consume_back(".o"); + // Timers for timing purposes TimerGroup timer_group("add_output", ("Time to optimize and emit LLVM module " + name).str()); SmallVector<ShardTimers, 1> timers(threads); for (unsigned i = 0; i < threads; ++i) { @@ -1232,6 +1261,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o errs() << "WARNING: Invalid value for JULIA_IMAGE_TIMINGS: " << env << "\n"; } } + // Single-threaded case if (threads == 1) { output_timer.startTimer(); add_output_impl(M, TM, outputs.data() + outputs.size() - outcount, names, @@ -1255,6 +1285,8 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o partition_timer.startTimer(); uint64_t counter = 0; + // Partitioning requires all globals to have names. + // We use a prefix to avoid name conflicts with user code. 
for (auto &G : M.global_values()) { if (!G.isDeclaration() && !G.hasName()) { G.setName("jl_ext_" + Twine(counter++)); @@ -1262,6 +1294,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o } auto partitions = partitionModule(M, threads); partition_timer.stopTimer(); + serialize_timer.startTimer(); auto serialized = serializeModule(M); serialize_timer.stopTimer(); @@ -1274,10 +1307,12 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o auto objstart = obj_out ? obj.data() + obj.size() - threads : nullptr; auto asmstart = asm_out ? asm_.data() + asm_.size() - threads : nullptr; + // Start all of the worker threads std::vector<std::thread> workers(threads); for (unsigned i = 0; i < threads; i++) { workers[i] = std::thread([&, i](){ LLVMContext ctx; + // Lazily deserialize the entire module timers[i].deserialize.startTimer(); auto M = cantFail(getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx), "Error loading module"); timers[i].deserialize.stopTimer(); @@ -1304,6 +1339,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o }); } + // Wait for all of the worker threads to finish for (auto &w : workers) w.join(); diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index cbce76d702119..0474cb0c7add7 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -3,6 +3,8 @@ // Function multi-versioning // LLVM pass to clone function for different archs +//see src/processor.h for documentation of the relevant globals inserted here + #include "llvm-version.h" #include "passes.h" diff --git a/src/processor.h b/src/processor.h index 6445f221882ba..497a93d40e11f 100644 --- a/src/processor.h +++ b/src/processor.h @@ -14,82 +14,9 @@ extern "C" { #endif -/** - * Related sysimg exported symbols - * - * In the following text, function refers to an abstract entity. - * It corresponds to a `Function` that we emit in the codegen, and there might be multiple copies - * of it in the system image. Only one of those copies will be used in a given session. - * Function pointers refer to a real piece of code in the system image. - * Each function might have multiple function pointers in the system image - * and each function pointer will correspond to only one function. - * - * # Global function and base pointers - * `jl_sysimg_gvars_base`: - * The address of this symbol is the base data pointer - * (all other data pointers are stored as offsets to this address) - * `jl_sysimg_fvars_base`: - * The address of this symbol is the base function pointer - * (all other function pointers are stored as offsets to this address) - * `jl_sysimg_fvars_offsets`: [static data] - * The array of function pointer offsets (`int32_t`) from the base pointer. - * This includes all julia functions in sysimg as well as all other functions that are cloned. - * The default function pointer is used if the function is cloned. - * The first element is the size of the array, which should **NOT** be used as the number - * of julia functions in the sysimg. - * Each entry in this array uniquely identifies a function we are interested in - * (the function may have multiple function pointers corresponding to different versions). - * In other sysimg info, all references to functions are stored as their `uint32_t` index - * in this array. 
- * - * # Target data and dispatch slots (Only needed by runtime during loading) - * `jl_dispatch_target_ids`: [static data] serialize target data. - * This contains the number of targets which is needed to decode `jl_dispatch_fvars_idxs` - * in addition to the name and feature set of each target. - * `jl_dispatch_reloc_slots`: [static data] location and index of relocation slots. - * Stored as pairs of function indices and `int32_t` offsets from `jl_sysimg_gvars_base`. - * The first element is an `uint32_t` giving the number of relocations. - * This is needed for functions whose address is used in a way that requires dispatch. - * We currently only support one type of relocation (i.e. absolute pointer) which is enough - * for all use in functions as well as GOT slot (for "PLT" callback). - * Note that not all functions being cloned are assigned a slot. - * This array is sorted by the function indices. - * There can be more than one slot per-function, - * i.e. there can be duplicated function indices. - * - * # Target functions - * `jl_dispatch_fvars_idxs`: [static data] Target-specific function indices. - * For each target, this includes a tagged `uint32_t` length, an optional `uint32_t` index - * of the base target followed by an array of tagged function indices. - * The base target index is required to be smaller than the index of the current target - * and must be the default (`0`) or a `clone_all` target. - * If it's not `0`, the function pointer array for the `clone_all` target will be used as - * the base function pointer offsets instead. - * The tag bits for both the length and the indices are the top bit. - * A tagged length indicates that all of the functions are cloned and the indices follows - * are the ones that requires relocation. The base target index is omitted in this case. - * Otherwise, the length is the total number of functions that we are interested in - * for this target, which includes all cloned julia functions and - * all other cloned functions that requires relocation. - * A tagged index means that the function pointer should be filled into the GOT slots - * identified by `jl_dispatch_reloc_slots`. There could be more than one slot per function. - * (Note that a tagged index could corresponds to a functions pointer that's the same as - * the base one since this is the only way we currently represent relocations.) - * A tagged length implicitly tags all the indices and the indices will not have the tag bit - * set. The lengths in this variable is needed to decode `jl_dispatch_fvars_offsets`. - * `jl_dispatch_fvars_offsets`: [static data] Target-specific function pointer offsets. - * This contains all the cloned functions that we are interested in and it needs to be decoded - * and used along with `jl_dispatch_fvars_idxs`. - * For the default target, there's no entries in this variable, if there's any relocations - * needed for the default target, the function pointers are taken from the global offset - * arrays directly. - * For a `clone_all` target (i.e. with the length in `jl_dispatch_fvars_idxs` tagged), this - * variable contains an offset array of the same length as the global one. Only the indices - * appearing in `jl_dispatch_fvars_idxs` need relocation and the dispatch code should return - * this array as the original/base function offsets. - * For other targets, this variable contains an offset array with the length defined in - * `jl_dispatch_fvars_idxs`. Tagged indices need relocations. 
- */ +// Image metadata +// Every image exports a `jl_image_pointers_t` as a global symbol `jl_image_pointers`. +// This symbol acts as a root for all other code-related symbols in the image. enum { JL_TARGET_VEC_CALL = 1 << 0, @@ -163,35 +90,118 @@ typedef struct { jl_image_fptrs_t fptrs; } jl_image_t; +// The header for each image +// Details important counts about the image typedef struct { + // The version of the image format + // Most up-to-date version is 1 uint32_t version; + // The number of shards in this image uint32_t nshards; + // The total number of fvars in this image among all shards uint32_t nfvars; + // The total number of gvars in this image among all shards uint32_t ngvars; } jl_image_header_t; +// Per-shard data for image shards. Each image contains header->nshards of these. typedef struct { + + // This is the base function pointer + // (all other function pointers are stored as offsets to this address) const char *fvar_base; + + // The array of function pointer offsets (`int32_t`) from the base pointer. + // This includes all julia functions in sysimg as well as all other functions that are cloned. + // The default function pointer is used if the function is cloned. + // The first element is the size of the array, which should **NOT** be used as the number + // of julia functions in the sysimg. + // Each entry in this array uniquely identifies a function we are interested in + // (the function may have multiple function pointers corresponding to different versions). + // In other sysimg info, all references to functions are stored as their `uint32_t` index + // in this array. const int32_t *fvar_offsets; + // This is the mapping of shard function index -> global function index + // staticdata.c relies on the same order of functions in the global function array being + // the same as what it saw when serializing the global function array. However, partitioning + // into multiple shards will cause functions to be reordered. This array is used to map + // back to the original function array for loading. const uint32_t *fvar_idxs; + // This is the base data pointer + // (all other data pointers in this shard are stored as offsets to this address) uintptr_t *gvar_base; + // This array of global variable offsets (`int32_t`) from the base pointer. + // Similar to fvar_offsets, but for gvars const int32_t *gvar_offsets; + // This is the mapping of shard global variable index -> global global variable index + // Similar to fvar_idxs, but for gvars const uint32_t *gvar_idxs; + + // location and index of relocation slots. + // Stored as pairs of function indices and `int32_t` offsets from `jl_sysimg_gvars_base`. + // The first element is an `uint32_t` giving the number of relocations. + // This is needed for functions whose address is used in a way that requires dispatch. + // We currently only support one type of relocation (i.e. absolute pointer) which is enough + // for all use in functions as well as GOT slot (for "PLT" callback). + // Note that not all functions being cloned are assigned a slot. + // This array is sorted by the function indices. + // There can be more than one slot per-function, + // i.e. there can be duplicated function indices. const int32_t *clone_slots; + // Target-specific function pointer offsets. + // This contains all the cloned functions that we are interested in and it needs to be decoded + // and used along with `jl_dispatch_fvars_idxs`. 
+ // For the default target, there are no entries in this variable; if any relocations are + // needed for the default target, the function pointers are taken from the global offset + // arrays directly. + // For a `clone_all` target (i.e. with the length in `jl_dispatch_fvars_idxs` tagged), this + // variable contains an offset array of the same length as the global one. Only the indices + // appearing in `jl_dispatch_fvars_idxs` need relocation and the dispatch code should return + // this array as the original/base function offsets. + // For other targets, this variable contains an offset array with the length defined in + // `jl_dispatch_fvars_idxs`. Tagged indices need relocations. const int32_t *clone_offsets; + // Target-specific function indices. + // For each target, this includes a tagged `uint32_t` length, an optional `uint32_t` index + // of the base target followed by an array of tagged function indices. + // The base target index is required to be smaller than the index of the current target + // and must be the default (`0`) or a `clone_all` target. + // If it's not `0`, the function pointer array for the `clone_all` target will be used as + // the base function pointer offsets instead. + // The tag bits for both the length and the indices are the top bit. + // A tagged length indicates that all of the functions are cloned and the indices that follow + // are the ones that require relocation. The base target index is omitted in this case. + // Otherwise, the length is the total number of functions that we are interested in + // for this target, which includes all cloned julia functions and + // all other cloned functions that require relocation. + // A tagged index means that the function pointer should be filled into the GOT slots + // identified by `jl_dispatch_reloc_slots`. There could be more than one slot per function. + // (Note that a tagged index could correspond to a function pointer that's the same as + // the base one since this is the only way we currently represent relocations.) + // A tagged length implicitly tags all the indices and the indices will not have the tag bit + // set. The lengths in this variable are needed to decode `jl_dispatch_fvars_offsets`. const uint32_t *clone_idxs; } jl_image_shard_t; +// The TLS data for each image typedef struct { void *pgcstack_func_slot; void *pgcstack_key_slot; size_t *tls_offset; } jl_image_ptls_t; +// The root struct for images, points to all the other globals typedef struct { + // The image header, contains numerical global data const jl_image_header_t *header; - const jl_image_shard_t *shards; // nshards-length array + // The shard table, contains per-shard data + const jl_image_shard_t *shards; // points to header->nshards length array + // The TLS data const jl_image_ptls_t *ptls; + + // serialized target data + // This contains the number of targets + // in addition to the name and feature set of each target.
const void *target_data; } jl_image_pointers_t; From 5108b4036d7610fff1ef6e56c80f760a05d2c4d0 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi <prem.chintalapudi@gmail.com> Date: Mon, 6 Mar 2023 00:38:41 -0500 Subject: [PATCH 34/34] Update documentation --- doc/src/devdocs/sysimg.md | 3 +++ doc/src/manual/environment-variables.md | 15 ++++++++++++++- src/aotcompile.cpp | 6 +++--- src/processor.h | 6 ++---- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/doc/src/devdocs/sysimg.md b/doc/src/devdocs/sysimg.md index 3058834e927d0..6706e30ce97b1 100644 --- a/doc/src/devdocs/sysimg.md +++ b/doc/src/devdocs/sysimg.md @@ -8,6 +8,9 @@ as many platforms as possible, so as to give vastly improved startup times. On not ship with a precompiled system image file, one can be generated from the source files shipped in Julia's `DATAROOTDIR/julia/base` folder. +Julia will by default generate its system image on half of the available system threads. This +may be controlled by the [`JULIA_IMAGE_THREADS`](@ref env-image-threads) environment variable. + This operation is useful for multiple reasons. A user may: * Build a precompiled shared library system image on a platform that did not ship with one, thereby diff --git a/doc/src/manual/environment-variables.md b/doc/src/manual/environment-variables.md index a199112e934dd..a5f4efc28e965 100644 --- a/doc/src/manual/environment-variables.md +++ b/doc/src/manual/environment-variables.md @@ -277,7 +277,7 @@ To use Visual Studio Code on Windows, set `$JULIA_EDITOR` to `code.cmd`. ## Parallelization -### `JULIA_CPU_THREADS` +### [`JULIA_CPU_THREADS`](@id env-cpu-threads) Overrides the global variable [`Base.Sys.CPU_THREADS`](@ref), the number of logical CPU cores available. @@ -316,6 +316,19 @@ then spinning threads never sleep. Otherwise, `$JULIA_THREAD_SLEEP_THRESHOLD` is interpreted as an unsigned 64-bit integer (`uint64_t`) and gives, in nanoseconds, the amount of time after which spinning threads should sleep. +### [`JULIA_IMAGE_THREADS`](@id env-image-threads) + +An unsigned 32-bit integer that sets the number of threads used by image +compilation in this Julia process. The value of this variable may be +ignored if the module is a small module. If left unspecified, the smaller +of the value of [`JULIA_CPU_THREADS`](@ref env-cpu-threads) or half the +number of logical CPU cores is used in its place. + +### `JULIA_IMAGE_TIMINGS` + +A boolean value that determines if detailed timing information is printed +during image compilation. Defaults to 0. + ### `JULIA_EXCLUSIVE` If set to anything besides `0`, then Julia's thread policy is consistent with diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 0337602cde27e..dd49e6b466474 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -600,7 +600,7 @@ static void get_fvars_gvars(Module &M, DenseMap<GlobalValue *, unsigned> &fvars, // Weight computation // It is important for multithreaded image building to be able to split work up // among the threads equally. The weight calculated here is an estimation of -// how expensive a particular function is going to be to compile. +// how expensive a particular function is going to be to compile. struct FunctionInfo { size_t weight; @@ -1193,7 +1193,7 @@ static void construct_vars(Module &M, Partition &partition) { } // Materialization will leave many unused declarations, which multiversioning would otherwise clone. -// This function removes them to avoid unnecessary cloning of declarations.
+// This function removes them to avoid unnecessary cloning of declarations. static void dropUnusedDeclarations(Module &M) { SmallVector<GlobalValue *> unused; for (auto &G : M.global_values()) { @@ -1211,7 +1211,7 @@ static void dropUnusedDeclarations(Module &M) { } // Entrypoint to optionally-multithreaded image compilation. This handles global coordination of the threading, -// as well as partitioning, serialization, and deserialization. +// as well as partitioning, serialization, and deserialization. static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &outputs, ArrayRef<StringRef> names, std::vector<NewArchiveMember> &unopt, std::vector<NewArchiveMember> &opt, std::vector<NewArchiveMember> &obj, std::vector<NewArchiveMember> &asm_, diff --git a/src/processor.h b/src/processor.h index 497a93d40e11f..d2280068fb67d 100644 --- a/src/processor.h +++ b/src/processor.h @@ -106,11 +106,10 @@ typedef struct { // Per-shard data for image shards. Each image contains header->nshards of these. typedef struct { - + // This is the base function pointer // (all other function pointers are stored as offsets to this address) const char *fvar_base; - // The array of function pointer offsets (`int32_t`) from the base pointer. // This includes all julia functions in sysimg as well as all other functions that are cloned. // The default function pointer is used if the function is cloned. @@ -125,7 +124,7 @@ typedef struct { // staticdata.c relies on the same order of functions in the global function array being // the same as what it saw when serializing the global function array. However, partitioning // into multiple shards will cause functions to be reordered. This array is used to map - // back to the original function array for loading. + // back to the original function array for loading. const uint32_t *fvar_idxs; // This is the base data pointer // (all other data pointers in this shard are stored as offsets to this address) @@ -136,7 +135,6 @@ typedef struct { // This is the mapping of shard global variable index -> global global variable index // Similar to fvar_idxs, but for gvars const uint32_t *gvar_idxs; - // location and index of relocation slots. // Stored as pairs of function indices and `int32_t` offsets from `jl_sysimg_gvars_base`. // The first element is an `uint32_t` giving the number of relocations.