From 615b142c88a074399bac08a0e8fd8f48b491c1fd Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Tue, 3 Jan 2023 16:03:18 -0500
Subject: [PATCH 01/34] Simplify multiversioning

---
 src/llvm-multiversioning.cpp | 28 +++-------------------------
 1 file changed, 3 insertions(+), 25 deletions(-)

diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index 242b0c454ad0a..68042700bb1d0 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -222,8 +222,6 @@ struct CloneCtx {
         int idx;
         uint32_t flags;
         std::unique_ptr<ValueToValueMapTy> vmap; // ValueToValueMapTy is not movable....
-        // function ids that needs relocation to be initialized
-        std::set<uint32_t> relocs{};
         Target(int idx, const jl_target_spec_t &spec) :
             idx(idx),
             flags(spec.flags),
@@ -290,8 +288,6 @@ struct CloneCtx {
     std::vector<std::pair<Constant*,uint32_t>> gv_relocs{};
     // Mapping from function id (i.e. 0-based index in `fvars`) to GVs to be initialized.
     std::map<uint32_t,GlobalVariable*> const_relocs;
-    // Functions that were referred to by a global alias, and might not have other uses.
-    std::set<uint32_t> alias_relocs;
     bool has_veccall{false};
     bool has_cloneall{false};
     bool allow_bad_fvars{false};
@@ -734,13 +730,6 @@ void CloneCtx::rewrite_alias(GlobalAlias *alias, Function *F)
     uint32_t id;
     GlobalVariable *slot;
     std::tie(id, slot) = get_reloc_slot(F);
-    for (auto &grp: groups) {
-        grp.relocs.insert(id);
-        for (auto &tgt: grp.clones) {
-            tgt.relocs.insert(id);
-        }
-    }
-    alias_relocs.insert(id);
 
     auto BB = BasicBlock::Create(F->getContext(), "top", trampoline);
     IRBuilder<> irbuilder(BB);
@@ -884,15 +873,6 @@ void CloneCtx::fix_inst_uses()
                 if (!use_f->getName().endswith(suffix))
                     return nullptr;
                 std::tie(id, slot) = get_reloc_slot(orig_f);
-
-                grp.relocs.insert(id);
-                for (auto &tgt: grp.clones) {
-                    // The enclosing function of the use is cloned,
-                    // no need to deal with this use on this target.
-                    if (map_get(*tgt.vmap, use_f))
-                        continue;
-                    tgt.relocs.insert(id);
-                }
                 return slot;
             }, tbaa_const);
         }
@@ -1018,12 +998,10 @@ void CloneCtx::emit_metadata()
             }
             auto it = const_relocs.find(id);
             if (it != const_relocs.end()) {
+                shared_relocs.insert(id);
                 values.push_back(id_v);
                 values.push_back(get_ptrdiff32(it->second, gbase));
             }
-            if (alias_relocs.find(id) != alias_relocs.end()) {
-                shared_relocs.insert(id);
-            }
         }
         values[0] = ConstantInt::get(T_int32, values.size() / 2);
         ArrayType *vars_type = ArrayType::get(T_int32, values.size());
@@ -1046,7 +1024,7 @@ void CloneCtx::emit_metadata()
                 auto grp = static_cast<Group*>(tgt);
                 count = jl_sysimg_tag_mask;
                 for (uint32_t j = 0; j < nfvars; j++) {
-                    if (shared_relocs.count(j) || tgt->relocs.count(j)) {
+                    if (shared_relocs.count(j)) {
                         count++;
                         idxs.push_back(j);
                     }
@@ -1061,7 +1039,7 @@ void CloneCtx::emit_metadata()
                 idxs.push_back(baseidx);
                 for (uint32_t j = 0; j < nfvars; j++) {
                     auto base_f = grp->base_func(fvars[j]);
-                    if (shared_relocs.count(j) || tgt->relocs.count(j)) {
+                    if (shared_relocs.count(j)) {
                         count++;
                         idxs.push_back(jl_sysimg_tag_mask | j);
                         auto f = map_get(*tgt->vmap, base_f, base_f);

From 27808e136757c19c9d7accfafae3958f3f48b7f1 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Tue, 3 Jan 2023 18:17:46 -0500
Subject: [PATCH 02/34] Refactor aotcompile

---
 src/aotcompile.cpp           | 231 ++++++++++++++++++++---------------
 src/llvm-multiversioning.cpp |  31 +----
 2 files changed, 140 insertions(+), 122 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 907735dfa0128..d3d4529d32c30 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -496,7 +496,8 @@ static void injectCRTAlias(Module &M, StringRef name, StringRef alias, FunctionT
     if (!target) {
         target = Function::Create(FT, Function::ExternalLinkage, alias, M);
     }
-    Function *interposer = Function::Create(FT, Function::InternalLinkage, name, M);
+    Function *interposer = Function::Create(FT, Function::ExternalLinkage, name, M);
+    interposer->setVisibility(GlobalValue::HiddenVisibility);
     appendToCompilerUsed(M, {interposer});
 
     llvm::IRBuilder<> builder(BasicBlock::Create(M.getContext(), "top", interposer));
@@ -532,7 +533,7 @@ void jl_dump_native_impl(void *native_code,
     TheTriple.setObjectFormat(Triple::MachO);
     TheTriple.setOS(llvm::Triple::MacOSX);
 #endif
-    std::unique_ptr<TargetMachine> TM(
+    std::unique_ptr<TargetMachine> SourceTM(
         jl_ExecutionEngine->getTarget().createTargetMachine(
             TheTriple.getTriple(),
             jl_ExecutionEngine->getTargetCPU(),
@@ -554,53 +555,16 @@ void jl_dump_native_impl(void *native_code,
             ));
 
 
-    // set up optimization passes
-    SmallVector<char, 0> bc_Buffer;
-    SmallVector<char, 0> obj_Buffer;
-    SmallVector<char, 0> asm_Buffer;
-    SmallVector<char, 0> unopt_bc_Buffer;
-    raw_svector_ostream bc_OS(bc_Buffer);
-    raw_svector_ostream obj_OS(obj_Buffer);
-    raw_svector_ostream asm_OS(asm_Buffer);
-    raw_svector_ostream unopt_bc_OS(unopt_bc_Buffer);
     std::vector<NewArchiveMember> bc_Archive;
     std::vector<NewArchiveMember> obj_Archive;
     std::vector<NewArchiveMember> asm_Archive;
     std::vector<NewArchiveMember> unopt_bc_Archive;
     std::vector<std::string> outputs;
 
-    PassBuilder emptyPB;
-    AnalysisManagers empty(emptyPB);
-    ModulePassManager preopt, postopt;
-    legacy::PassManager emitter; // MC emission is only supported on legacy PM
-
-    if (unopt_bc_fname)
-        preopt.addPass(BitcodeWriterPass(unopt_bc_OS));
-
-    if (bc_fname)
-        postopt.addPass(BitcodeWriterPass(bc_OS));
-    //Is this necessary for TM?
-    addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis());
-    if (obj_fname)
-        if (TM->addPassesToEmitFile(emitter, obj_OS, nullptr, CGFT_ObjectFile, false))
-            jl_safe_printf("ERROR: target does not support generation of object files\n");
-    if (asm_fname)
-        if (TM->addPassesToEmitFile(emitter, asm_OS, nullptr, CGFT_AssemblyFile, false))
-            jl_safe_printf("ERROR: target does not support generation of object files\n");
-
     // Reset the target triple to make sure it matches the new target machine
     auto dataM = data->M.getModuleUnlocked();
-    dataM->setTargetTriple(TM->getTargetTriple().str());
-    dataM->setDataLayout(jl_create_datalayout(*TM));
-
-#ifndef JL_USE_NEW_PM
-    legacy::PassManager optimizer;
-    addTargetPasses(&optimizer, TM->getTargetTriple(), TM->getTargetIRAnalysis());
-    addOptimizationPasses(&optimizer, jl_options.opt_level, true, true);
-    addMachinePasses(&optimizer, jl_options.opt_level);
-#else
-    NewPM optimizer{std::move(TM), getOptLevel(jl_options.opt_level), OptimizationOptions::defaults(true, true)};
-#endif
+    dataM->setTargetTriple(SourceTM->getTargetTriple().str());
+    dataM->setDataLayout(jl_create_datalayout(*SourceTM));
 
     Type *T_size;
     if (sizeof(size_t) == 8)
@@ -609,8 +573,10 @@ void jl_dump_native_impl(void *native_code,
         T_size = Type::getInt32Ty(Context);
     Type *T_psize = T_size->getPointerTo();
 
+    bool imaging_mode = imaging_default() || jl_options.outputo;
+
     // add metadata information
-    if (imaging_default() || jl_options.outputo) {
+    if (imaging_mode) {
         emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_sysimg_gvars", T_psize);
         emit_offset_table(*dataM, data->jl_sysimg_fvars, "jl_sysimg_fvars", T_psize);
 
@@ -626,70 +592,87 @@ void jl_dump_native_impl(void *native_code,
     }
 
     // do the actual work
-    auto add_output = [&] (Module &M, StringRef unopt_bc_Name, StringRef bc_Name, StringRef obj_Name, StringRef asm_Name, bool inject_crt) {
-        preopt.run(M, empty.MAM);
-        if (bc_fname || obj_fname || asm_fname) {
-            assert(!verifyModule(M, &errs()));
-            optimizer.run(M);
-            assert(!verifyModule(M, &errs()));
+    auto add_output = [&] (Module &M, StringRef unopt_bc_Name, StringRef bc_Name, StringRef obj_Name, StringRef asm_Name) {
+
+        auto TM = std::unique_ptr<TargetMachine>(
+            SourceTM->getTarget().createTargetMachine(
+                SourceTM->getTargetTriple().str(),
+                SourceTM->getTargetCPU(),
+                SourceTM->getTargetFeatureString(),
+                SourceTM->Options,
+                SourceTM->getRelocationModel(),
+                SourceTM->getCodeModel(),
+                SourceTM->getOptLevel()));
+
+        if (unopt_bc_fname) {
+            SmallVector<char, 0> Buffer;
+            raw_svector_ostream OS(Buffer);
+            PassBuilder PB;
+            AnalysisManagers AM{*TM, PB, OptimizationLevel::O0};
+            ModulePassManager MPM;
+            MPM.addPass(BitcodeWriterPass(OS));
+            emit_result(unopt_bc_Archive, Buffer, unopt_bc_Name, outputs);
         }
+        if (!bc_fname && !obj_fname && !asm_fname) {
+            return;
+        }
+        assert(!verifyModule(M, &errs()));
 
-        if (inject_crt) {
-            // We would like to emit an alias or an weakref alias to redirect these symbols
-            // but LLVM doesn't let us emit a GlobalAlias to a declaration...
-            // So for now we inject a definition of these functions that calls our runtime
-            // functions. We do so after optimization to avoid cloning these functions.
-            injectCRTAlias(M, "__gnu_h2f_ieee", "julia__gnu_h2f_ieee",
-                    FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false));
-            injectCRTAlias(M, "__extendhfsf2", "julia__gnu_h2f_ieee",
-                    FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false));
-            injectCRTAlias(M, "__gnu_f2h_ieee", "julia__gnu_f2h_ieee",
-                    FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false));
-            injectCRTAlias(M, "__truncsfhf2", "julia__gnu_f2h_ieee",
-                    FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false));
-            injectCRTAlias(M, "__truncdfhf2", "julia__truncdfhf2",
-                    FunctionType::get(Type::getHalfTy(Context), { Type::getDoubleTy(Context) }, false));
+#ifndef JL_USE_NEW_PM
+        legacy::PassManager optimizer;
+        addTargetPasses(&optimizer, TM->getTargetTriple(), TM->getTargetIRAnalysis());
+        addOptimizationPasses(&optimizer, jl_options.opt_level, true, true);
+        addMachinePasses(&optimizer, jl_options.opt_level);
+#else
 
-#if defined(_OS_WINDOWS_)
-            // Windows expect that the function `_DllMainStartup` is present in an dll.
-            // Normal compilers use something like Zig's crtdll.c instead we provide a
-            // a stub implementation.
-            auto T_pvoid = Type::getInt8Ty(Context)->getPointerTo();
-            auto T_int32 = Type::getInt32Ty(Context);
-            auto FT = FunctionType::get(T_int32, {T_pvoid, T_int32, T_pvoid}, false);
-            auto F = Function::Create(FT, Function::ExternalLinkage, "_DllMainCRTStartup", M);
-            F->setCallingConv(CallingConv::X86_StdCall);
-
-            llvm::IRBuilder<> builder(BasicBlock::Create(M.getContext(), "top", F));
-            builder.CreateRet(ConstantInt::get(T_int32, 1));
+        auto PMTM = std::unique_ptr<TargetMachine>(
+            SourceTM->getTarget().createTargetMachine(
+                SourceTM->getTargetTriple().str(),
+                SourceTM->getTargetCPU(),
+                SourceTM->getTargetFeatureString(),
+                SourceTM->Options,
+                SourceTM->getRelocationModel(),
+                SourceTM->getCodeModel(),
+                SourceTM->getOptLevel()));
+        NewPM optimizer{std::move(PMTM), getOptLevel(jl_options.opt_level), OptimizationOptions::defaults(true, true)};
 #endif
+        optimizer.run(M);
+        assert(!verifyModule(M, &errs()));
+
+        if (bc_fname) {
+            SmallVector<char, 0> Buffer;
+            raw_svector_ostream OS(Buffer);
+            PassBuilder PB;
+            AnalysisManagers AM{*TM, PB, OptimizationLevel::O0};
+            ModulePassManager MPM;
+            MPM.addPass(BitcodeWriterPass(OS));
+            emit_result(bc_Archive, Buffer, bc_Name, outputs);
         }
 
-        postopt.run(M, empty.MAM);
-
-        // Get target by snooping on multiversioning
-        GlobalVariable *target_ids = M.getNamedGlobal("jl_dispatch_target_ids");
-        if (s && target_ids) {
-            if(auto targets = dyn_cast<ConstantDataArray>(target_ids->getInitializer())) {
-                auto rawTargets = targets->getRawDataValues();
-                write_int32(s, rawTargets.size());
-                ios_write(s, rawTargets.data(), rawTargets.size());
-            };
+        if (obj_fname) {
+            SmallVector<char, 0> Buffer;
+            raw_svector_ostream OS(Buffer);
+            legacy::PassManager emitter;
+            addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis());
+            if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_ObjectFile, false))
+                jl_safe_printf("ERROR: target does not support generation of object files\n");
+            emitter.run(M);
+            emit_result(obj_Archive, Buffer, obj_Name, outputs);
         }
 
-        emitter.run(M);
-
-        if (unopt_bc_fname)
-            emit_result(unopt_bc_Archive, unopt_bc_Buffer, unopt_bc_Name, outputs);
-        if (bc_fname)
-            emit_result(bc_Archive, bc_Buffer, bc_Name, outputs);
-        if (obj_fname)
-            emit_result(obj_Archive, obj_Buffer, obj_Name, outputs);
-        if (asm_fname)
-            emit_result(asm_Archive, asm_Buffer, asm_Name, outputs);
+        if (asm_fname) {
+            SmallVector<char, 0> Buffer;
+            raw_svector_ostream OS(Buffer);
+            legacy::PassManager emitter;
+            addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis());
+            if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_AssemblyFile, false))
+                jl_safe_printf("ERROR: target does not support generation of assembly files\n");
+            emitter.run(M);
+            emit_result(asm_Archive, Buffer, asm_Name, outputs);
+        }
     };
 
-    add_output(*dataM, "unopt.bc", "text.bc", "text.o", "text.s", true);
+    add_output(*dataM, "unopt.bc", "text.bc", "text.o", "text.s");
 
     orc::ThreadSafeModule sysimage(std::make_unique<Module>("sysimage", Context), TSCtx);
     auto sysimageM = sysimage.getModuleUnlocked();
@@ -699,6 +682,35 @@ void jl_dump_native_impl(void *native_code,
     sysimageM->setStackProtectorGuard(dataM->getStackProtectorGuard());
     sysimageM->setOverrideStackAlignment(dataM->getOverrideStackAlignment());
 #endif
+    // We would like to emit an alias or a weakref alias to redirect these symbols
+    // but LLVM doesn't let us emit a GlobalAlias to a declaration...
+    // So for now we inject a definition of these functions that calls our runtime
+    // functions. We do so after optimization to avoid cloning these functions.
+    injectCRTAlias(*sysimageM, "__gnu_h2f_ieee", "julia__gnu_h2f_ieee",
+            FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false));
+    injectCRTAlias(*sysimageM, "__extendhfsf2", "julia__gnu_h2f_ieee",
+            FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false));
+    injectCRTAlias(*sysimageM, "__gnu_f2h_ieee", "julia__gnu_f2h_ieee",
+            FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false));
+    injectCRTAlias(*sysimageM, "__truncsfhf2", "julia__gnu_f2h_ieee",
+            FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false));
+    injectCRTAlias(*sysimageM, "__truncdfhf2", "julia__truncdfhf2",
+            FunctionType::get(Type::getHalfTy(Context), { Type::getDoubleTy(Context) }, false));
+
+    if (TheTriple.isOSWindows()) {
+        // Windows expects that the function `_DllMainCRTStartup` is present in a dll.
+        // Normal compilers use something like Zig's crtdll.c; instead we provide
+        // a stub implementation.
+        auto T_pvoid = Type::getInt8Ty(Context)->getPointerTo();
+        auto T_int32 = Type::getInt32Ty(Context);
+        auto FT = FunctionType::get(T_int32, {T_pvoid, T_int32, T_pvoid}, false);
+        auto F = Function::Create(FT, Function::ExternalLinkage, "_DllMainCRTStartup", *sysimageM);
+        F->setCallingConv(CallingConv::X86_StdCall);
+
+        llvm::IRBuilder<> builder(BasicBlock::Create(Context, "top", F));
+        builder.CreateRet(ConstantInt::get(T_int32, 1));
+    }
+    bool has_veccall = dataM->getModuleFlag("julia.mv.veccall");
     data->M = orc::ThreadSafeModule(); // free memory for data->M
 
     if (sysimg_data) {
@@ -712,7 +724,32 @@ void jl_dump_native_impl(void *native_code,
                                      GlobalVariable::ExternalLinkage,
                                      len, "jl_system_image_size"));
     }
-    add_output(*sysimageM, "data.bc", "data.bc", "data.o", "data.s", false);
+    if (imaging_mode) {
+        auto specs = jl_get_llvm_clone_targets();
+        const uint32_t base_flags = has_veccall ? JL_TARGET_VEC_CALL : 0;
+        std::vector<uint8_t> data;
+        auto push_i32 = [&] (uint32_t v) {
+            uint8_t buff[4];
+            memcpy(buff, &v, 4);
+            data.insert(data.end(), buff, buff + 4);
+        };
+        push_i32(specs.size());
+        for (uint32_t i = 0; i < specs.size(); i++) {
+            push_i32(base_flags | (specs[i].flags & JL_TARGET_UNKNOWN_NAME));
+            auto &specdata = specs[i].data;
+            data.insert(data.end(), specdata.begin(), specdata.end());
+        }
+        auto value = ConstantDataArray::get(Context, data);
+        addComdat(new GlobalVariable(*sysimageM, value->getType(), true,
+                                      GlobalVariable::ExternalLinkage,
+                                      value, "jl_dispatch_target_ids"));
+
+        if (s) {
+            write_int32(s, data.size());
+            ios_write(s, (const char *)data.data(), data.size());
+        }
+    }
+    add_output(*sysimageM, "data.bc", "data.bc", "data.o", "data.s");
 
     object::Archive::Kind Kind = getDefaultForHost(TheTriple);
     if (unopt_bc_fname)
diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index 68042700bb1d0..c94aee9927540 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -289,7 +289,6 @@ struct CloneCtx {
     // Mapping from function id (i.e. 0-based index in `fvars`) to GVs to be initialized.
     std::map<uint32_t,GlobalVariable*> const_relocs;
     bool has_veccall{false};
-    bool has_cloneall{false};
     bool allow_bad_fvars{false};
 };
 
@@ -345,7 +344,6 @@ CloneCtx::CloneCtx(Module &M, function_ref<LoopInfo&(Function&)> GetLI, function
     for (uint32_t i = 1; i < ntargets; i++) {
         auto &spec = specs[i];
         if (spec.flags & JL_TARGET_CLONE_ALL) {
-            has_cloneall = true;
             groups.emplace_back(i, spec);
         }
         else {
@@ -404,7 +402,7 @@ void CloneCtx::clone_function(Function *F, Function *new_f, ValueToValueMapTy &v
 // Clone all clone_all targets. Makes sure that the base targets are all available.
 void CloneCtx::clone_bases()
 {
-    if (!has_cloneall)
+    if (groups.size() == 1)
         return;
     uint32_t ngrps = groups.size();
     for (uint32_t gid = 1; gid < ngrps; gid++) {
@@ -553,7 +551,7 @@ void CloneCtx::check_partial(Group &grp, Target &tgt)
                                            F->getName() + suffix, &M);
         new_f->copyAttributesFrom(F);
         vmap[F] = new_f;
-        if (!has_cloneall)
+        if (groups.size() == 1)
             cloned.insert(orig_f);
         grp.clone_fs.insert(i);
         all_origs.insert(orig_f);
@@ -607,7 +605,7 @@ void CloneCtx::check_partial(Group &grp, Target &tgt)
             continue;
         auto orig_f = orig_funcs[i];
         if (all_origs.count(orig_f)) {
-            if (!has_cloneall)
+            if (groups.size() == 1)
                 cloned.insert(orig_f);
             grp.clone_fs.insert(i);
         }
@@ -787,7 +785,7 @@ void CloneCtx::fix_gv_uses()
         return changed;
     };
     for (auto orig_f: orig_funcs) {
-        if (!has_cloneall && !cloned.count(orig_f))
+        if (groups.size() == 1 && !cloned.count(orig_f))
             continue;
         while (single_pass(orig_f)) {
         }
@@ -952,25 +950,8 @@ void CloneCtx::emit_metadata()
         }
     }
 
-    // Generate `jl_dispatch_target_ids`
-    {
-        const uint32_t base_flags = has_veccall ? JL_TARGET_VEC_CALL : 0;
-        std::vector<uint8_t> data;
-        auto push_i32 = [&] (uint32_t v) {
-            uint8_t buff[4];
-            memcpy(buff, &v, 4);
-            data.insert(data.end(), buff, buff + 4);
-        };
-        push_i32(ntargets);
-        for (uint32_t i = 0; i < ntargets; i++) {
-            push_i32(base_flags | (specs[i].flags & JL_TARGET_UNKNOWN_NAME));
-            auto &specdata = specs[i].data;
-            data.insert(data.end(), specdata.begin(), specdata.end());
-        }
-        auto value = ConstantDataArray::get(M.getContext(), data);
-        add_comdat(new GlobalVariable(M, value->getType(), true,
-                                      GlobalVariable::ExternalLinkage,
-                                      value, "jl_dispatch_target_ids"));
+    if (has_veccall) {
+        M.addModuleFlag(Module::Max, "julia.mv.veccall", 1);
     }
 
     // Generate `jl_dispatch_reloc_slots`

From 4524987a384f444e02c0e21afacfc3c4f4d68a4e Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Tue, 3 Jan 2023 18:41:07 -0500
Subject: [PATCH 03/34] Timing print statements

---
 src/aotcompile.cpp | 47 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index d3d4529d32c30..2c9edecae7df7 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -273,6 +273,8 @@ void replaceUsesWithLoad(Function &F, function_ref<GlobalVariable *(Instruction
 extern "C" JL_DLLEXPORT
 void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvmmod, const jl_cgparams_t *cgparams, int _policy, int _imaging_mode, int _external_linkage, size_t _world)
 {
+    uint64_t start = jl_hrtime();
+    uint64_t end = 0;
     ++CreateNativeCalls;
     CreateNativeMax.updateMax(jl_array_len(methods));
     if (cgparams == NULL)
@@ -464,6 +466,8 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm
     if (ctx.getContext()) {
         jl_ExecutionEngine->releaseContext(std::move(ctx));
     }
+    end = jl_hrtime();
+    dbgs() << "jl_create_native: " << (end - start) / 1e9 << "s\n";
     return (void*)data;
 }
 
@@ -517,6 +521,8 @@ void jl_dump_native_impl(void *native_code,
         const char *asm_fname,
         const char *sysimg_data, size_t sysimg_len, ios_t *s)
 {
+    uint64_t start = jl_hrtime();
+    uint64_t end = 0;
     JL_TIMING(NATIVE_DUMP);
     jl_native_code_desc_t *data = (jl_native_code_desc_t*)native_code;
     auto TSCtx = data->M.getContext();
@@ -575,6 +581,12 @@ void jl_dump_native_impl(void *native_code,
 
     bool imaging_mode = imaging_default() || jl_options.outputo;
 
+    end = jl_hrtime();
+
+    dbgs() << "setup time: " << (end - start) / 1e9 << "s\n";
+
+    start = jl_hrtime();
+
     // add metadata information
     if (imaging_mode) {
         emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_sysimg_gvars", T_psize);
@@ -591,6 +603,12 @@ void jl_dump_native_impl(void *native_code,
                                      "jl_RTLD_DEFAULT_handle_pointer"));
     }
 
+    end = jl_hrtime();
+
+    dbgs() << "metadata time: " << (end - start) / 1e9 << "s\n";
+
+    start = jl_hrtime();
+
     // do the actual work
     auto add_output = [&] (Module &M, StringRef unopt_bc_Name, StringRef bc_Name, StringRef obj_Name, StringRef asm_Name) {
 
@@ -618,6 +636,9 @@ void jl_dump_native_impl(void *native_code,
         }
         assert(!verifyModule(M, &errs()));
 
+        uint64_t start = jl_hrtime();
+        end = 0;
+
 #ifndef JL_USE_NEW_PM
         legacy::PassManager optimizer;
         addTargetPasses(&optimizer, TM->getTargetTriple(), TM->getTargetIRAnalysis());
@@ -639,6 +660,10 @@ void jl_dump_native_impl(void *native_code,
         optimizer.run(M);
         assert(!verifyModule(M, &errs()));
 
+        end = jl_hrtime();
+
+        dbgs() << "optimize time: " << (end - start) / 1e9 << "s\n";
+
         if (bc_fname) {
             SmallVector<char, 0> Buffer;
             raw_svector_ostream OS(Buffer);
@@ -649,6 +674,8 @@ void jl_dump_native_impl(void *native_code,
             emit_result(bc_Archive, Buffer, bc_Name, outputs);
         }
 
+        start = jl_hrtime();
+
         if (obj_fname) {
             SmallVector<char, 0> Buffer;
             raw_svector_ostream OS(Buffer);
@@ -660,6 +687,10 @@ void jl_dump_native_impl(void *native_code,
             emit_result(obj_Archive, Buffer, obj_Name, outputs);
         }
 
+        end = jl_hrtime();
+
+        dbgs() << "codegen time: " << (end - start) / 1e9 << "s\n";
+
         if (asm_fname) {
             SmallVector<char, 0> Buffer;
             raw_svector_ostream OS(Buffer);
@@ -674,6 +705,12 @@ void jl_dump_native_impl(void *native_code,
 
     add_output(*dataM, "unopt.bc", "text.bc", "text.o", "text.s");
 
+    end = jl_hrtime();
+
+    dbgs() << "text output time: " << (end - start) / 1e9 << "s\n";
+
+    start = jl_hrtime();
+
     orc::ThreadSafeModule sysimage(std::make_unique<Module>("sysimage", Context), TSCtx);
     auto sysimageM = sysimage.getModuleUnlocked();
     sysimageM->setTargetTriple(dataM->getTargetTriple());
@@ -751,6 +788,12 @@ void jl_dump_native_impl(void *native_code,
     }
     add_output(*sysimageM, "data.bc", "data.bc", "data.o", "data.s");
 
+    end = jl_hrtime();
+
+    dbgs() << "data module time: " << (end - start) / 1e9 << "s\n";
+
+    start = jl_hrtime();
+
     object::Archive::Kind Kind = getDefaultForHost(TheTriple);
     if (unopt_bc_fname)
         handleAllErrors(writeArchive(unopt_bc_fname, unopt_bc_Archive, true,
@@ -764,6 +807,10 @@ void jl_dump_native_impl(void *native_code,
     if (asm_fname)
         handleAllErrors(writeArchive(asm_fname, asm_Archive, true,
                     Kind, true, false), reportWriterError);
+
+    end = jl_hrtime();
+
+    dbgs() << "archive time: " << (end - start) / 1e9 << "s\n";
 
     delete data;
 }

From 094269c8c1e506e36f0b4bd7ddc6ec38f279bb3c Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Thu, 5 Jan 2023 14:36:16 -0500
Subject: [PATCH 04/34] Move image init to processor.cpp

---
 src/llvm-multiversioning.cpp | 67 ++++++++++++++----------------
 src/processor.cpp            | 79 +++++++++++++++++++++++++-----------
 src/processor.h              | 11 ++++-
 src/processor_arm.cpp        |  4 +-
 src/processor_fallback.cpp   |  4 +-
 src/processor_x86.cpp        |  4 +-
 src/staticdata.c             | 69 ++-----------------------------
 7 files changed, 103 insertions(+), 135 deletions(-)

diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index c94aee9927540..3325cb47147a6 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -253,21 +253,14 @@ struct CloneCtx {
     void emit_metadata();
 private:
     void prepare_vmap(ValueToValueMapTy &vmap);
-    bool is_vector(FunctionType *ty) const;
     void clone_function(Function *F, Function *new_f, ValueToValueMapTy &vmap);
     uint32_t collect_func_info(Function &F);
     void check_partial(Group &grp, Target &tgt);
     void clone_partial(Group &grp, Target &tgt);
-    void add_features(Function *F, StringRef name, StringRef features, uint32_t flags) const;
-    template<typename T>
-    T *add_comdat(T *G) const;
     uint32_t get_func_id(Function *F);
     template<typename Stack>
     Constant *rewrite_gv_init(const Stack& stack);
     std::pair<uint32_t,GlobalVariable*> get_reloc_slot(Function *F);
-    Constant *get_ptrdiff32(Constant *ptr, Constant *base) const;
-    template<typename T>
-    Constant *emit_offset_table(const std::vector<T*> &vars, StringRef name) const;
     void rewrite_alias(GlobalAlias *alias, Function* F);
 
     MDNode *tbaa_const;
@@ -424,7 +417,7 @@ void CloneCtx::clone_bases()
     }
 }
 
-bool CloneCtx::is_vector(FunctionType *ty) const
+static bool is_vector(FunctionType *ty)
 {
     if (ty->getReturnType()->isVectorTy())
         return true;
@@ -507,6 +500,29 @@ void CloneCtx::collect_func_infos()
     }
 }
 
+static void add_features(Function *F, StringRef name, StringRef features, uint32_t flags)
+{
+    auto attr = F->getFnAttribute("target-features");
+    if (attr.isStringAttribute()) {
+        std::string new_features(attr.getValueAsString());
+        new_features += ",";
+        new_features += features;
+        F->addFnAttr("target-features", new_features);
+    }
+    else {
+        F->addFnAttr("target-features", features);
+    }
+    F->addFnAttr("target-cpu", name);
+    if (!F->hasFnAttribute(Attribute::OptimizeNone)) {
+        if (flags & JL_TARGET_OPTSIZE) {
+            F->addFnAttr(Attribute::OptimizeForSize);
+        }
+        else if (flags & JL_TARGET_MINSIZE) {
+            F->addFnAttr(Attribute::MinSize);
+        }
+    }
+}
+
 void CloneCtx::clone_all_partials()
 {
     // First decide what to clone
@@ -632,29 +648,6 @@ void CloneCtx::clone_partial(Group &grp, Target &tgt)
     }
 }
 
-void CloneCtx::add_features(Function *F, StringRef name, StringRef features, uint32_t flags) const
-{
-    auto attr = F->getFnAttribute("target-features");
-    if (attr.isStringAttribute()) {
-        std::string new_features(attr.getValueAsString());
-        new_features += ",";
-        new_features += features;
-        F->addFnAttr("target-features", new_features);
-    }
-    else {
-        F->addFnAttr("target-features", features);
-    }
-    F->addFnAttr("target-cpu", name);
-    if (!F->hasFnAttribute(Attribute::OptimizeNone)) {
-        if (flags & JL_TARGET_OPTSIZE) {
-            F->addFnAttr(Attribute::OptimizeForSize);
-        }
-        else if (flags & JL_TARGET_MINSIZE) {
-            F->addFnAttr(Attribute::MinSize);
-        }
-    }
-}
-
 uint32_t CloneCtx::get_func_id(Function *F)
 {
     auto &ref = func_ids[F];
@@ -878,7 +871,7 @@ void CloneCtx::fix_inst_uses()
 }
 
 template<typename T>
-inline T *CloneCtx::add_comdat(T *G) const
+static inline T *add_comdat(T *G)
 {
 #if defined(_OS_WINDOWS_)
     // add __declspec(dllexport) to everything marked for export
@@ -890,7 +883,7 @@ inline T *CloneCtx::add_comdat(T *G) const
     return G;
 }
 
-Constant *CloneCtx::get_ptrdiff32(Constant *ptr, Constant *base) const
+static Constant *get_ptrdiff32(Constant *ptr, Constant *base)
 {
     if (ptr->getType()->isPointerTy())
         ptr = ConstantExpr::getPtrToInt(ptr, getSizeTy(ptr->getContext()));
@@ -899,7 +892,7 @@ Constant *CloneCtx::get_ptrdiff32(Constant *ptr, Constant *base) const
 }
 
 template<typename T>
-Constant *CloneCtx::emit_offset_table(const std::vector<T*> &vars, StringRef name) const
+static Constant *emit_offset_table(Module &M, const std::vector<T*> &vars, StringRef name)
 {
     auto T_int32 = Type::getInt32Ty(M.getContext());
     auto T_size = getSizeTy(M.getContext());
@@ -911,7 +904,7 @@ Constant *CloneCtx::emit_offset_table(const std::vector<T*> &vars, StringRef nam
                                        name + "_base",
                                        base, &M));
     } else {
-        base = ConstantExpr::getNullValue(T_size->getPointerTo());
+        base = add_comdat(new GlobalVariable(M, T_size, true, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), name + "_base"));
     }
     auto vbase = ConstantExpr::getPtrToInt(base, T_size);
     std::vector<Constant*> offsets(nvars + 1);
@@ -938,8 +931,8 @@ void CloneCtx::emit_metadata()
     }
 
     // Store back the information about exported functions.
-    auto fbase = emit_offset_table(fvars, "jl_sysimg_fvars");
-    auto gbase = emit_offset_table(gvars, "jl_sysimg_gvars");
+    auto fbase = emit_offset_table(M, fvars, "jl_sysimg_fvars");
+    auto gbase = emit_offset_table(M, gvars, "jl_sysimg_gvars");
 
     uint32_t ntargets = specs.size();
     SmallVector<Target*, 8> targets(ntargets);
diff --git a/src/processor.cpp b/src/processor.cpp
index 13b40ec4f7363..a8aca2a64ab19 100644
--- a/src/processor.cpp
+++ b/src/processor.cpp
@@ -17,6 +17,10 @@
 
 #include "julia_assert.h"
 
+#ifndef _OS_WINDOWS_
+#include <dlfcn.h>
+#endif
+
 // CPU target string is a list of strings separated by `;` each string starts with a CPU
 // or architecture name and followed by an optional list of features separated by `,`.
 // A "generic" or empty CPU name means the basic required feature set of the target ISA
@@ -621,44 +625,53 @@ static inline std::vector<TargetData<n>> &get_cmdline_targets(F &&feature_cb)
 // Load sysimg, use the `callback` for dispatch and perform all relocations
 // for the selected target.
 template<typename F>
-static inline jl_image_fptrs_t parse_sysimg(void *hdl, F &&callback)
+static inline jl_image_t parse_sysimg(void *hdl, F &&callback)
 {
-    jl_image_fptrs_t res = {nullptr, 0, nullptr, 0, nullptr, nullptr};
+    jl_image_t res{};
 
     // .data base
     char *data_base;
-    if (!jl_dlsym(hdl, "jl_sysimg_gvars_base", (void**)&data_base, 0)) {
-        data_base = NULL;
+    jl_dlsym(hdl, "jl_sysimg_gvars_base", (void**)&data_base, 1);
+
+    {
+        void *pgcstack_func_slot;
+        if (jl_dlsym(hdl, "jl_pgcstack_func_slot", &pgcstack_func_slot, 0)) {
+            void *pgcstack_key_slot;
+            jl_dlsym(hdl, "jl_pgcstack_key_slot", &pgcstack_key_slot, 1);
+            jl_pgcstack_getkey((jl_get_pgcstack_func**)pgcstack_func_slot, (jl_pgcstack_key_t*)pgcstack_key_slot);
+
+            size_t *tls_offset_idx;
+            jl_dlsym(hdl, "jl_tls_offset", (void **)&tls_offset_idx, 1);
+            *tls_offset_idx = (uintptr_t)(jl_tls_offset == -1 ? 0 : jl_tls_offset);
+        }
     }
+
     // .text base
     char *text_base;
-    if (!jl_dlsym(hdl, "jl_sysimg_fvars_base", (void**)&text_base, 0)) {
-        text_base = NULL;
-    }
-    res.base = text_base;
+    jl_dlsym(hdl, "jl_sysimg_fvars_base", (void**)&text_base, 1);
 
-    int32_t *offsets;
+    const int32_t *offsets;
     jl_dlsym(hdl, "jl_sysimg_fvars_offsets", (void**)&offsets, 1);
     uint32_t nfunc = offsets[0];
-    res.offsets = offsets + 1;
+    offsets++;
 
-    void *ids;
-    jl_dlsym(hdl, "jl_dispatch_target_ids", &ids, 1);
+    const void *ids;
+    jl_dlsym(hdl, "jl_dispatch_target_ids", (void**)&ids, 1);
     uint32_t target_idx = callback(ids);
 
-    int32_t *reloc_slots;
+    const int32_t *reloc_slots;
     jl_dlsym(hdl, "jl_dispatch_reloc_slots", (void **)&reloc_slots, 1);
     const uint32_t nreloc = reloc_slots[0];
     reloc_slots += 1;
-    uint32_t *clone_idxs;
-    int32_t *clone_offsets;
+    const uint32_t *clone_idxs;
+    const int32_t *clone_offsets;
     jl_dlsym(hdl, "jl_dispatch_fvars_idxs", (void**)&clone_idxs, 1);
     jl_dlsym(hdl, "jl_dispatch_fvars_offsets", (void**)&clone_offsets, 1);
     uint32_t tag_len = clone_idxs[0];
     clone_idxs += 1;
 
     assert(tag_len & jl_sysimg_tag_mask);
-    std::vector<const int32_t*> base_offsets = {res.offsets};
+    std::vector<const int32_t*> base_offsets = {offsets};
     // Find target
     for (uint32_t i = 0;i < target_idx;i++) {
         uint32_t len = jl_sysimg_val_mask & tag_len;
@@ -680,20 +693,20 @@ static inline jl_image_fptrs_t parse_sysimg(void *hdl, F &&callback)
     if (clone_all) {
         // clone_all
         if (target_idx != 0) {
-            res.offsets = clone_offsets;
+            offsets = clone_offsets;
         }
     }
     else {
         uint32_t base_idx = clone_idxs[0];
         assert(base_idx < target_idx);
         if (target_idx != 0) {
-            res.offsets = base_offsets[base_idx];
-            assert(res.offsets);
+            offsets = base_offsets[base_idx];
+            assert(offsets);
         }
         clone_idxs++;
-        res.nclones = tag_len;
-        res.clone_offsets = clone_offsets;
-        res.clone_idxs = clone_idxs;
+        res.fptrs.nclones = tag_len;
+        res.fptrs.clone_offsets = clone_offsets;
+        res.fptrs.clone_idxs = clone_idxs;
     }
     // Do relocation
     uint32_t reloc_i = 0;
@@ -702,7 +715,7 @@ static inline jl_image_fptrs_t parse_sysimg(void *hdl, F &&callback)
         uint32_t idx = clone_idxs[i];
         int32_t offset;
         if (clone_all) {
-            offset = res.offsets[idx];
+            offset = offsets[idx];
         }
         else if (idx & jl_sysimg_tag_mask) {
             idx = idx & jl_sysimg_val_mask;
@@ -718,7 +731,7 @@ static inline jl_image_fptrs_t parse_sysimg(void *hdl, F &&callback)
                 found = true;
                 auto slot = (const void**)(data_base + reloc_slots[reloc_i * 2 + 1]);
                 assert(slot);
-                *slot = offset + res.base;
+                *slot = offset + text_base;
             }
             else if (reloc_idx > idx) {
                 break;
@@ -728,6 +741,24 @@ static inline jl_image_fptrs_t parse_sysimg(void *hdl, F &&callback)
         (void)found;
     }
 
+    res.fptrs.base = text_base;
+    res.fptrs.offsets = offsets;
+    res.gvars_base = (uintptr_t *)data_base;
+    jl_dlsym(hdl, "jl_sysimg_gvars_offsets", (void **)&res.gvars_offsets, 1);
+    res.gvars_offsets += 1;
+
+#ifdef _OS_WINDOWS_
+    res.base = (intptr_t)hdl;
+#else
+    Dl_info dlinfo;
+    if (dladdr((void*)res.gvars_base, &dlinfo) != 0) {
+        res.base = (intptr_t)dlinfo.dli_fbase;
+    }
+    else {
+        res.base = 0;
+    }
+#endif
+
     return res;
 }
 
diff --git a/src/processor.h b/src/processor.h
index e3f3bd512c910..f76722e885a1d 100644
--- a/src/processor.h
+++ b/src/processor.h
@@ -155,6 +155,13 @@ typedef struct _jl_image_fptrs_t {
     const uint32_t *clone_idxs;
 } jl_image_fptrs_t;
 
+typedef struct {
+    uint64_t base;
+    uintptr_t *gvars_base;
+    const int32_t *gvars_offsets;
+    jl_image_fptrs_t fptrs;
+} jl_image_t;
+
 /**
  * Initialize the processor dispatch system with sysimg `hdl` (also initialize the sysimg itself).
  * The dispatch system will find the best implementation to be used in this session.
@@ -165,8 +172,8 @@ typedef struct _jl_image_fptrs_t {
  *
  * Return the data about the function pointers selected.
  */
-jl_image_fptrs_t jl_init_processor_sysimg(void *hdl);
-jl_image_fptrs_t jl_init_processor_pkgimg(void *hdl);
+jl_image_t jl_init_processor_sysimg(void *hdl);
+jl_image_t jl_init_processor_pkgimg(void *hdl);
 
 // Return the name of the host CPU as a julia string.
 JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void);
diff --git a/src/processor_arm.cpp b/src/processor_arm.cpp
index 3e7b22caf00d4..0797fa4381f9d 100644
--- a/src/processor_arm.cpp
+++ b/src/processor_arm.cpp
@@ -1802,14 +1802,14 @@ JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void)
     return jl_cstr_to_string(host_cpu_name().c_str());
 }
 
-jl_image_fptrs_t jl_init_processor_sysimg(void *hdl)
+jl_image_t jl_init_processor_sysimg(void *hdl)
 {
     if (!jit_targets.empty())
         jl_error("JIT targets already initialized");
     return parse_sysimg(hdl, sysimg_init_cb);
 }
 
-jl_image_fptrs_t jl_init_processor_pkgimg(void *hdl)
+jl_image_t jl_init_processor_pkgimg(void *hdl)
 {
     if (jit_targets.empty())
         jl_error("JIT targets not initialized");
diff --git a/src/processor_fallback.cpp b/src/processor_fallback.cpp
index c1353e1bb43b0..1aebde6dab90a 100644
--- a/src/processor_fallback.cpp
+++ b/src/processor_fallback.cpp
@@ -112,14 +112,14 @@ get_llvm_target_str(const TargetData<1> &data)
 
 using namespace Fallback;
 
-jl_image_fptrs_t jl_init_processor_sysimg(void *hdl)
+jl_image_t jl_init_processor_sysimg(void *hdl)
 {
     if (!jit_targets.empty())
         jl_error("JIT targets already initialized");
     return parse_sysimg(hdl, sysimg_init_cb);
 }
 
-jl_image_fptrs_t jl_init_processor_pkgimg(void *hdl)
+jl_image_t jl_init_processor_pkgimg(void *hdl)
 {
     if (jit_targets.empty())
         jl_error("JIT targets not initialized");
diff --git a/src/processor_x86.cpp b/src/processor_x86.cpp
index 6b3e7d5b63678..30a6ff9b3dede 100644
--- a/src/processor_x86.cpp
+++ b/src/processor_x86.cpp
@@ -1039,14 +1039,14 @@ JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void)
     return jl_cstr_to_string(host_cpu_name().c_str());
 }
 
-jl_image_fptrs_t jl_init_processor_sysimg(void *hdl)
+jl_image_t jl_init_processor_sysimg(void *hdl)
 {
     if (!jit_targets.empty())
         jl_error("JIT targets already initialized");
     return parse_sysimg(hdl, sysimg_init_cb);
 }
 
-jl_image_fptrs_t jl_init_processor_pkgimg(void *hdl)
+jl_image_t jl_init_processor_pkgimg(void *hdl)
 {
     if (jit_targets.empty())
         jl_error("JIT targets not initialized");
diff --git a/src/staticdata.c b/src/staticdata.c
index cd9ed8b0db088..94e93f4198b4c 100644
--- a/src/staticdata.c
+++ b/src/staticdata.c
@@ -315,13 +315,6 @@ void *native_functions;   // opaque jl_native_code_desc_t blob used for fetching
 // table of struct field addresses to rewrite during saving
 static htable_t field_replace;
 
-typedef struct {
-    uint64_t base;
-    uintptr_t *gvars_base;
-    int32_t *gvars_offsets;
-    jl_image_fptrs_t fptrs;
-} jl_image_t;
-
 // array of definitions for the predefined function pointers
 // (reverse of fptr_to_id)
 // This is a manually constructed dual of the fvars array, which would be produced by codegen for Julia code, for C.
@@ -446,7 +439,7 @@ typedef struct {
 static void *jl_sysimg_handle = NULL;
 static jl_image_t sysimage;
 
-static inline uintptr_t *sysimg_gvars(uintptr_t *base, int32_t *offsets, size_t idx)
+static inline uintptr_t *sysimg_gvars(uintptr_t *base, const int32_t *offsets, size_t idx)
 {
     return base + offsets[idx] / sizeof(base[0]);
 }
@@ -461,32 +454,7 @@ static void jl_load_sysimg_so(void)
     int imaging_mode = jl_generating_output() && !jl_options.incremental;
     // in --build mode only use sysimg data, not precompiled native code
     if (!imaging_mode && jl_options.use_sysimage_native_code==JL_OPTIONS_USE_SYSIMAGE_NATIVE_CODE_YES) {
-        jl_dlsym(jl_sysimg_handle, "jl_sysimg_gvars_base", (void **)&sysimage.gvars_base, 1);
-        jl_dlsym(jl_sysimg_handle, "jl_sysimg_gvars_offsets", (void **)&sysimage.gvars_offsets, 1);
-        sysimage.gvars_offsets += 1;
         assert(sysimage.fptrs.base);
-
-        void *pgcstack_func_slot;
-        jl_dlsym(jl_sysimg_handle, "jl_pgcstack_func_slot", &pgcstack_func_slot, 1);
-        void *pgcstack_key_slot;
-        jl_dlsym(jl_sysimg_handle, "jl_pgcstack_key_slot", &pgcstack_key_slot, 1);
-        jl_pgcstack_getkey((jl_get_pgcstack_func**)pgcstack_func_slot, (jl_pgcstack_key_t*)pgcstack_key_slot);
-
-        size_t *tls_offset_idx;
-        jl_dlsym(jl_sysimg_handle, "jl_tls_offset", (void **)&tls_offset_idx, 1);
-        *tls_offset_idx = (uintptr_t)(jl_tls_offset == -1 ? 0 : jl_tls_offset);
-
-#ifdef _OS_WINDOWS_
-        sysimage.base = (intptr_t)jl_sysimg_handle;
-#else
-        Dl_info dlinfo;
-        if (dladdr((void*)sysimage.gvars_base, &dlinfo) != 0) {
-            sysimage.base = (intptr_t)dlinfo.dli_fbase;
-        }
-        else {
-            sysimage.base = 0;
-        }
-#endif
     }
     else {
         memset(&sysimage.fptrs, 0, sizeof(sysimage.fptrs));
@@ -2693,7 +2661,7 @@ JL_DLLEXPORT void jl_set_sysimg_so(void *handle)
     if (jl_options.cpu_target == NULL)
         jl_options.cpu_target = "native";
     jl_sysimg_handle = handle;
-    sysimage.fptrs = jl_init_processor_sysimg(handle);
+    sysimage = jl_init_processor_sysimg(handle);
 }
 
 #ifndef JL_NDEBUG
@@ -3391,38 +3359,7 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j
     size_t *plen;
     jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1);
 
-    jl_image_t pkgimage;
-    pkgimage.fptrs = jl_init_processor_pkgimg(pkgimg_handle);
-    if (!jl_dlsym(pkgimg_handle, "jl_sysimg_gvars_base", (void **)&pkgimage.gvars_base, 0)) {
-        pkgimage.gvars_base = NULL;
-    }
-
-    jl_dlsym(pkgimg_handle, "jl_sysimg_gvars_offsets", (void **)&pkgimage.gvars_offsets, 1);
-    pkgimage.gvars_offsets += 1;
-
-    void *pgcstack_func_slot;
-    jl_dlsym(pkgimg_handle, "jl_pgcstack_func_slot", &pgcstack_func_slot, 0);
-    if (pgcstack_func_slot) { // Empty package images might miss these
-        void *pgcstack_key_slot;
-        jl_dlsym(pkgimg_handle, "jl_pgcstack_key_slot", &pgcstack_key_slot, 1);
-        jl_pgcstack_getkey((jl_get_pgcstack_func**)pgcstack_func_slot, (jl_pgcstack_key_t*)pgcstack_key_slot);
-
-        size_t *tls_offset_idx;
-        jl_dlsym(pkgimg_handle, "jl_tls_offset", (void **)&tls_offset_idx, 1);
-        *tls_offset_idx = (uintptr_t)(jl_tls_offset == -1 ? 0 : jl_tls_offset);
-    }
-
-    #ifdef _OS_WINDOWS_
-        pkgimage.base = (intptr_t)pkgimg_handle;
-    #else
-        Dl_info dlinfo;
-        if (dladdr((void*)pkgimage.gvars_base, &dlinfo) != 0) {
-            pkgimage.base = (intptr_t)dlinfo.dli_fbase;
-        }
-        else {
-            pkgimage.base = 0;
-        }
-    #endif
+    jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle);
 
     jl_value_t* mod = jl_restore_incremental_from_buf(pkgimg_data, &pkgimage, *plen, depmods, completeinfo);
 

From 2c7375cbb0c5ab7d331829d7a55d97881cd33255 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Thu, 5 Jan 2023 19:33:30 -0500
Subject: [PATCH 05/34] Annotate information before running optimization

---
 src/aotcompile.cpp           |  12 +
 src/llvm-multiversioning.cpp | 667 +++++++++++++++++------------------
 2 files changed, 344 insertions(+), 335 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 2c9edecae7df7..527b793f142c8 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -512,6 +512,7 @@ static void injectCRTAlias(Module &M, StringRef name, StringRef alias, FunctionT
     builder.CreateRet(val);
 }
 
+void multiversioning_preannotate(Module &M);
 
 // takes the running content that has collected in the shadow module and dump it to disk
 // this builds the object file portion of the sysimage files for fast startup
@@ -589,6 +590,17 @@ void jl_dump_native_impl(void *native_code,
 
     // add metadata information
     if (imaging_mode) {
+        multiversioning_preannotate(*dataM);
+        {
+            DenseSet<GlobalValue *> fvars(data->jl_sysimg_fvars.begin(), data->jl_sysimg_fvars.end());
+            for (auto &F : *dataM) {
+                if (F.hasFnAttribute("julia.mv.reloc") || F.hasFnAttribute("julia.mv.fvar")) {
+                    if (fvars.insert(&F).second) {
+                        data->jl_sysimg_fvars.push_back(&F);
+                    }
+                }
+            }
+        }
         emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_sysimg_gvars", T_psize);
         emit_offset_table(*dataM, data->jl_sysimg_fvars, "jl_sysimg_fvars", T_psize);
 
diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index 3325cb47147a6..1a1dc297b2702 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -10,6 +10,7 @@
 #include <llvm-c/Types.h>
 
 #include <llvm/Pass.h>
+#include <llvm/ADT/BitVector.h>
 #include <llvm/ADT/Statistic.h>
 #include <llvm/IR/Module.h>
 #include <llvm/IR/LegacyPassManager.h>
@@ -217,25 +218,211 @@ void ConstantUses<U>::forward()
     }
 }
 
+static bool is_vector(FunctionType *ty)
+{
+    if (ty->getReturnType()->isVectorTy())
+        return true;
+    for (auto arg: ty->params()) {
+        if (arg->isVectorTy()) {
+            return true;
+        }
+    }
+    return false;
+}
+
+static uint32_t collect_func_info(Function &F, bool &has_veccall)
+{
+    DominatorTree DT(F);
+    LoopInfo LI(DT);
+    uint32_t flag = 0;
+    if (!LI.empty())
+        flag |= JL_TARGET_CLONE_LOOP;
+    if (is_vector(F.getFunctionType())) {
+        flag |= JL_TARGET_CLONE_SIMD;
+        has_veccall = true;
+    }
+    for (auto &bb: F) {
+        for (auto &I: bb) {
+            if (auto call = dyn_cast<CallInst>(&I)) {
+                if (is_vector(call->getFunctionType())) {
+                    has_veccall = true;
+                    flag |= JL_TARGET_CLONE_SIMD;
+                }
+                if (auto callee = call->getCalledFunction()) {
+                    auto name = callee->getName();
+                    if (name.startswith("llvm.muladd.") || name.startswith("llvm.fma.")) {
+                        flag |= JL_TARGET_CLONE_MATH;
+                    }
+                    else if (name.startswith("julia.cpu.")) {
+                        if (name.startswith("julia.cpu.have_fma.")) {
+                            // for some platforms we know they always do (or don't) support
+                            // FMA. in those cases we don't need to clone the function.
+                            if (!always_have_fma(*callee).hasValue())
+                                flag |= JL_TARGET_CLONE_CPU;
+                        } else {
+                            flag |= JL_TARGET_CLONE_CPU;
+                        }
+                    }
+                }
+            }
+            else if (auto store = dyn_cast<StoreInst>(&I)) {
+                if (store->getValueOperand()->getType()->isVectorTy()) {
+                    flag |= JL_TARGET_CLONE_SIMD;
+                }
+            }
+            else if (I.getType()->isVectorTy()) {
+                flag |= JL_TARGET_CLONE_SIMD;
+            }
+            if (auto mathOp = dyn_cast<FPMathOperator>(&I)) {
+                if (mathOp->getFastMathFlags().any()) {
+                    flag |= JL_TARGET_CLONE_MATH;
+                }
+            }
+
+            for (size_t i = 0; i < I.getNumOperands(); i++) {
+                if(I.getOperand(i)->getType()->isHalfTy()){
+                    flag |= JL_TARGET_CLONE_FLOAT16;
+                }
+                // Check for BFloat16 when they are added to julia can be done here
+            }
+            if (has_veccall && (flag & JL_TARGET_CLONE_SIMD) && (flag & JL_TARGET_CLONE_MATH)) {
+                return flag;
+            }
+        }
+    }
+    return flag;
+}
+
+static void annotate_module_clones(Module &M) {
+    CallGraph CG(M);
+    std::vector<Function *> orig_funcs;
+    for (auto &F: M) {
+        if (F.isDeclaration())
+            continue;
+        orig_funcs.push_back(&F);
+    }
+    bool has_veccall = false;
+    auto specs = jl_get_llvm_clone_targets();
+    std::vector<APInt> clones(orig_funcs.size(), APInt(specs.size(), 0));
+    BitVector subtarget_cloned(orig_funcs.size());
+    bool check_relocs = false;
+
+    std::vector<unsigned> func_infos(orig_funcs.size());
+    for (unsigned i = 0; i < orig_funcs.size(); i++) {
+        func_infos[i] = collect_func_info(*orig_funcs[i], has_veccall);
+    }
+    for (unsigned i = 1; i < specs.size(); i++) {
+        if (specs[i].flags & JL_TARGET_CLONE_ALL) {
+            for (unsigned j = 0; j < orig_funcs.size(); j++) {
+                clones[j].setBit(i);
+            }
+            check_relocs = true;
+        } else {
+            unsigned flag = specs[i].flags & clone_mask;
+            std::set<Function*> sets[2];
+            for (unsigned j = 0; j < orig_funcs.size(); j++) {
+                if (!(func_infos[j] & flag)) {
+                    continue;
+                }
+                sets[0].insert(orig_funcs[j]);
+            }
+            std::set<Function*> all_origs(sets[0]);
+            auto *cur_set = &sets[0];
+            auto *next_set = &sets[1];
+            // Reduce dispatch by expand the cloning set to functions that are directly called by
+            // and calling cloned functions.
+            while (!cur_set->empty()) {
+                for (auto orig_f: *cur_set) {
+                    // Use the uncloned function since it's already in the call graph
+                    auto node = CG[orig_f];
+                    for (const auto &I: *node) {
+                        auto child_node = I.second;
+                        auto orig_child_f = child_node->getFunction();
+                        if (!orig_child_f)
+                            continue;
+                        // Already cloned
+                        if (all_origs.count(orig_child_f))
+                            continue;
+                        bool calling_clone = false;
+                        for (const auto &I2: *child_node) {
+                            auto orig_child_f2 = I2.second->getFunction();
+                            if (!orig_child_f2)
+                                continue;
+                            if (all_origs.count(orig_child_f2)) {
+                                calling_clone = true;
+                                break;
+                            }
+                        }
+                        if (!calling_clone)
+                            continue;
+                        next_set->insert(orig_child_f);
+                        all_origs.insert(orig_child_f);
+                    }
+                }
+                std::swap(cur_set, next_set);
+                next_set->clear();
+            }
+            for (unsigned j = 0; j < orig_funcs.size(); j++) {
+                if (all_origs.count(orig_funcs[j])) {
+                    clones[j].setBit(i);
+                    subtarget_cloned.set(j);
+                }
+            }
+        }
+    }
+    if (check_relocs) {
+        for (unsigned i = 0; i < orig_funcs.size(); i++) {
+            auto &F = *orig_funcs[i];
+            if (subtarget_cloned[i] && !ConstantUses<Instruction>(orig_funcs[i], M).done()) {
+                F.addFnAttr("julia.mv.reloc", "");
+            } else {
+                auto uses = ConstantUses<GlobalValue>(orig_funcs[i], M);
+                if (!uses.done()) {
+                    bool slot = false;
+                    for (; !uses.done(); uses.next()) {
+                        if (isa<GlobalAlias>(uses.get_info().val)) {
+                            slot = true;
+                            break;
+                        }
+                    }
+                    if (slot) {
+                        F.addFnAttr("julia.mv.reloc", "");
+                    } else {
+                        F.addFnAttr("julia.mv.fvar", "");
+                    }
+                }
+            }
+        }
+    }
+    SmallString<128> cloneset;
+    for (unsigned i = 0; i < orig_funcs.size(); i++) {
+        if (!clones[i].isZero()) {
+            auto &F = *orig_funcs[i];
+            cloneset.clear();
+            clones[i].toStringUnsigned(cloneset, 16);
+            F.addFnAttr("julia.mv.clones", cloneset);
+        }
+    }
+    if (has_veccall) {
+        M.addModuleFlag(Module::Max, "julia.mv.veccall", 1);
+    }
+}
+
 struct CloneCtx {
     struct Target {
         int idx;
-        uint32_t flags;
         std::unique_ptr<ValueToValueMapTy> vmap; // ValueToValueMapTy is not movable....
-        Target(int idx, const jl_target_spec_t &spec) :
+        explicit Target(int idx) :
             idx(idx),
-            flags(spec.flags),
             vmap(new ValueToValueMapTy)
         {
         }
     };
     struct Group : Target {
         std::vector<Target> clones;
-        std::set<uint32_t> clone_fs;
-        Group(int base, const jl_target_spec_t &spec) :
-            Target(base, spec),
-            clones{},
-            clone_fs{}
+        explicit Group(int base) :
+            Target(base),
+            clones{}
         {}
         Function *base_func(Function *orig_f) const
         {
@@ -243,34 +430,38 @@ struct CloneCtx {
                 return orig_f;
             return cast<Function>(vmap->lookup(orig_f));
         }
+
+        bool has_subtarget_clone(Function *orig_f) const
+        {
+            auto base = base_func(orig_f);
+            for (auto &clone: clones) {
+                if (map_get(*clone.vmap, base))
+                    return true;
+            }
+            return false;
+        }
     };
-    CloneCtx(Module &M, function_ref<LoopInfo&(Function&)> GetLI, function_ref<CallGraph&()> GetCG, bool allow_bad_fvars);
-    void clone_bases();
-    void collect_func_infos();
-    void clone_all_partials();
+    CloneCtx(Module &M, bool allow_bad_fvars);
+    void prepare_slots();
+    void clone_decls();
+    void clone_bodies();
     void fix_gv_uses();
     void fix_inst_uses();
     void emit_metadata();
 private:
     void prepare_vmap(ValueToValueMapTy &vmap);
-    void clone_function(Function *F, Function *new_f, ValueToValueMapTy &vmap);
-    uint32_t collect_func_info(Function &F);
-    void check_partial(Group &grp, Target &tgt);
     void clone_partial(Group &grp, Target &tgt);
-    uint32_t get_func_id(Function *F);
-    template<typename Stack>
-    Constant *rewrite_gv_init(const Stack& stack);
-    std::pair<uint32_t,GlobalVariable*> get_reloc_slot(Function *F);
+    uint32_t get_func_id(Function *F) const;
+    std::pair<uint32_t,GlobalVariable*> get_reloc_slot(Function *F) const;
     void rewrite_alias(GlobalAlias *alias, Function* F);
 
     MDNode *tbaa_const;
     std::vector<jl_target_spec_t> specs;
     std::vector<Group> groups{};
+    std::vector<Target *> linearized;
     std::vector<Function*> fvars;
     std::vector<Constant*> gvars;
     Module &M;
-    function_ref<LoopInfo&(Function&)> GetLI;
-    function_ref<CallGraph&()> GetCG;
 
     // Map from original function to one based index in `fvars`
     std::map<const Function*,uint32_t> func_ids{};
@@ -281,7 +472,7 @@ struct CloneCtx {
     std::vector<std::pair<Constant*,uint32_t>> gv_relocs{};
     // Mapping from function id (i.e. 0-based index in `fvars`) to GVs to be initialized.
     std::map<uint32_t,GlobalVariable*> const_relocs;
-    bool has_veccall{false};
+    std::map<Function *, GlobalVariable*> extern_relocs;
     bool allow_bad_fvars{false};
 };
 
@@ -322,36 +513,36 @@ static inline std::vector<T*> consume_gv(Module &M, const char *name, bool allow
 }
 
 // Collect basic information about targets and functions.
-CloneCtx::CloneCtx(Module &M, function_ref<LoopInfo&(Function&)> GetLI, function_ref<CallGraph&()> GetCG, bool allow_bad_fvars)
+CloneCtx::CloneCtx(Module &M, bool allow_bad_fvars)
     : tbaa_const(tbaa_make_child_with_context(M.getContext(), "jtbaa_const", nullptr, true).first),
       specs(jl_get_llvm_clone_targets()),
       fvars(consume_gv<Function>(M, "jl_sysimg_fvars", allow_bad_fvars)),
       gvars(consume_gv<Constant>(M, "jl_sysimg_gvars", false)),
       M(M),
-      GetLI(GetLI),
-      GetCG(GetCG),
       allow_bad_fvars(allow_bad_fvars)
 {
-    groups.emplace_back(0, specs[0]);
+    groups.emplace_back(0);
+    linearized.resize(specs.size());
+    linearized[0] = &groups[0];
+    std::vector<unsigned> group_ids(specs.size(), 0);
     uint32_t ntargets = specs.size();
     for (uint32_t i = 1; i < ntargets; i++) {
         auto &spec = specs[i];
         if (spec.flags & JL_TARGET_CLONE_ALL) {
-            groups.emplace_back(i, spec);
+            group_ids[i] = groups.size();
+            groups.emplace_back(i);
         }
         else {
-            auto base = spec.base;
-            bool found = false;
-            for (auto &grp: groups) {
-                if (grp.idx == base) {
-                    found = true;
-                    grp.clones.emplace_back(i, spec);
-                    break;
-                }
-            }
-            (void)found;
+            assert(0 <= spec.base && (unsigned) spec.base < i);
+            group_ids[i] = group_ids[spec.base];
+            groups[group_ids[i]].clones.emplace_back(i);
         }
     }
+    for (auto &grp: groups) {
+        for (auto &tgt: grp.clones)
+            linearized[tgt.idx] = &tgt;
+        linearized[grp.idx] = &grp;
+    }
     uint32_t nfvars = fvars.size();
     for (uint32_t i = 0; i < nfvars; i++)
         func_ids[fvars[i]] = i + 1;
@@ -376,128 +567,64 @@ void CloneCtx::prepare_vmap(ValueToValueMapTy &vmap)
     }
 }
 
-void CloneCtx::clone_function(Function *F, Function *new_f, ValueToValueMapTy &vmap)
-{
-    Function::arg_iterator DestI = new_f->arg_begin();
-    for (Function::const_arg_iterator J = F->arg_begin(); J != F->arg_end(); ++J) {
-        DestI->setName(J->getName());
-        vmap[&*J] = &*DestI++;
-    }
-    SmallVector<ReturnInst*,8> Returns;
-#if JL_LLVM_VERSION >= 130000
-    // We are cloning into the same module
-    CloneFunctionInto(new_f, F, vmap, CloneFunctionChangeType::GlobalChanges, Returns);
-#else
-    CloneFunctionInto(new_f, F, vmap, true, Returns);
-#endif
-}
-
-// Clone all clone_all targets. Makes sure that the base targets are all available.
-void CloneCtx::clone_bases()
+void CloneCtx::prepare_slots()
 {
-    if (groups.size() == 1)
-        return;
-    uint32_t ngrps = groups.size();
-    for (uint32_t gid = 1; gid < ngrps; gid++) {
-        auto &grp = groups[gid];
-        auto suffix = ".clone_" + std::to_string(grp.idx);
-        auto &vmap = *grp.vmap;
-        // Fill in old->new mapping. We need to do this before cloning the function so that
-        // the intra target calls are automatically fixed up on cloning.
-        for (auto F: orig_funcs) {
-            Function *new_f = Function::Create(F->getFunctionType(), F->getLinkage(),
-                                               F->getName() + suffix, &M);
-            new_f->copyAttributesFrom(F);
-            vmap[F] = new_f;
-        }
-        prepare_vmap(vmap);
-        for (auto F: orig_funcs) {
-            clone_function(F, cast<Function>(vmap.lookup(F)), vmap);
-        }
-    }
-}
-
-static bool is_vector(FunctionType *ty)
-{
-    if (ty->getReturnType()->isVectorTy())
-        return true;
-    for (auto arg: ty->params()) {
-        if (arg->isVectorTy()) {
-            return true;
+    for (auto &F : orig_funcs) {
+        if (F->hasFnAttribute("julia.mv.reloc")) {
+            assert(F->hasFnAttribute("julia.mv.clones"));
+            if (F->isDeclaration()) {
+                auto GV = new GlobalVariable(M, F->getType(), false, GlobalValue::ExternalLinkage, nullptr, F->getName() + ".reloc_slot");
+                GV->setVisibility(GlobalValue::HiddenVisibility);
+                extern_relocs[F] = GV;
+            } else {
+                auto id = get_func_id(F);
+                auto GV = new GlobalVariable(M, F->getType(), false, GlobalValue::InternalLinkage, Constant::getNullValue(F->getType()), F->getName() + ".reloc_slot");
+                GV->setVisibility(GlobalValue::HiddenVisibility);
+                const_relocs[id] = GV;
+            }
         }
     }
-    return false;
 }
 
-uint32_t CloneCtx::collect_func_info(Function &F)
+void CloneCtx::clone_decls()
 {
-    uint32_t flag = 0;
-    if (!GetLI(F).empty())
-        flag |= JL_TARGET_CLONE_LOOP;
-    if (is_vector(F.getFunctionType())) {
-        flag |= JL_TARGET_CLONE_SIMD;
-        has_veccall = true;
+    std::vector<std::string> suffixes(specs.size());
+    for (unsigned i = 1; i < specs.size(); i++) {
+        suffixes[i] = "." + std::to_string(i);
     }
-    for (auto &bb: F) {
-        for (auto &I: bb) {
-            if (auto call = dyn_cast<CallInst>(&I)) {
-                if (is_vector(call->getFunctionType())) {
-                    has_veccall = true;
-                    flag |= JL_TARGET_CLONE_SIMD;
-                }
-                if (auto callee = call->getCalledFunction()) {
-                    auto name = callee->getName();
-                    if (name.startswith("llvm.muladd.") || name.startswith("llvm.fma.")) {
-                        flag |= JL_TARGET_CLONE_MATH;
-                    }
-                    else if (name.startswith("julia.cpu.")) {
-                        if (name.startswith("julia.cpu.have_fma.")) {
-                            // for some platforms we know they always do (or don't) support
-                            // FMA. in those cases we don't need to clone the function.
-                            if (!always_have_fma(*callee).hasValue())
-                                flag |= JL_TARGET_CLONE_CPU;
-                        } else {
-                            flag |= JL_TARGET_CLONE_CPU;
-                        }
-                    }
-                }
-            }
-            else if (auto store = dyn_cast<StoreInst>(&I)) {
-                if (store->getValueOperand()->getType()->isVectorTy()) {
-                    flag |= JL_TARGET_CLONE_SIMD;
-                }
-            }
-            else if (I.getType()->isVectorTy()) {
-                flag |= JL_TARGET_CLONE_SIMD;
-            }
-            if (auto mathOp = dyn_cast<FPMathOperator>(&I)) {
-                if (mathOp->getFastMathFlags().any()) {
-                    flag |= JL_TARGET_CLONE_MATH;
-                }
-            }
-
-            for (size_t i = 0; i < I.getNumOperands(); i++) {
-                if(I.getOperand(i)->getType()->isHalfTy()){
-                    flag |= JL_TARGET_CLONE_FLOAT16;
-                }
-                // Check for BFloat16 when they are added to julia can be done here
-            }
-            if (has_veccall && (flag & JL_TARGET_CLONE_SIMD) && (flag & JL_TARGET_CLONE_MATH) &&
-               (flag & JL_TARGET_CLONE_CPU) && (flag & JL_TARGET_CLONE_FLOAT16)) {
-                return flag;
+    for (auto &F : orig_funcs) {
+        if (!F->hasFnAttribute("julia.mv.clones"))
+            continue;
+        APInt clones(specs.size(), F->getFnAttribute("julia.mv.clones").getValueAsString(), 16);
+        for (unsigned i = 1; i < specs.size(); i++) {
+            if (!clones[i]) {
+                continue;
             }
+            auto new_F = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName() + suffixes[i], &M);
+            new_F->copyAttributesFrom(F);
+            new_F->setVisibility(F->getVisibility());
+            auto base_func = F;
+            if (specs[i].flags & JL_TARGET_CLONE_ALL)
+                base_func = static_cast<Group*>(linearized[specs[i].base])->base_func(F);
+            (*linearized[i]->vmap)[base_func] = new_F;
         }
     }
-    return flag;
 }
 
-void CloneCtx::collect_func_infos()
+static void clone_function(Function *F, Function *new_f, ValueToValueMapTy &vmap)
 {
-    uint32_t nfuncs = orig_funcs.size();
-    func_infos.resize(nfuncs);
-    for (uint32_t i = 0; i < nfuncs; i++) {
-        func_infos[i] = collect_func_info(*orig_funcs[i]);
+    Function::arg_iterator DestI = new_f->arg_begin();
+    for (Function::const_arg_iterator J = F->arg_begin(); J != F->arg_end(); ++J) {
+        DestI->setName(J->getName());
+        vmap[&*J] = &*DestI++;
     }
+    SmallVector<ReturnInst*,8> Returns;
+#if JL_LLVM_VERSION >= 130000
+    // We are cloning into the same module
+    CloneFunctionInto(new_f, F, vmap, CloneFunctionChangeType::GlobalChanges, Returns);
+#else
+    CloneFunctionInto(new_f, F, vmap, true, Returns);
+#endif
 }
 
 static void add_features(Function *F, StringRef name, StringRef features, uint32_t flags)
@@ -523,149 +650,48 @@ static void add_features(Function *F, StringRef name, StringRef features, uint32
     }
 }
 
-void CloneCtx::clone_all_partials()
-{
-    // First decide what to clone
-    // Do this before actually cloning the functions
-    // so that the call graph is easier to understand
-    for (auto &grp: groups) {
-        for (auto &tgt: grp.clones) {
-            check_partial(grp, tgt);
-        }
-    }
-    for (auto &grp: groups) {
-        for (auto &tgt: grp.clones)
-            clone_partial(grp, tgt);
-        // Also set feature strings for base target functions
-        // now that all the actual cloning is done.
-        auto &base_spec = specs[grp.idx];
-        for (auto orig_f: orig_funcs) {
-            add_features(grp.base_func(orig_f), base_spec.cpu_name,
-                         base_spec.cpu_features, base_spec.flags);
-        }
-    }
-    func_infos.clear(); // We don't need this anymore
-}
-
-void CloneCtx::check_partial(Group &grp, Target &tgt)
+void CloneCtx::clone_bodies()
 {
-    auto flag = specs[tgt.idx].flags & clone_mask;
-    auto suffix = ".clone_" + std::to_string(tgt.idx);
-    auto &vmap = *tgt.vmap;
-    uint32_t nfuncs = func_infos.size();
-
-    std::set<Function*> all_origs;
-    // Use a simple heuristic to decide which function we need to clone.
-    for (uint32_t i = 0; i < nfuncs; i++) {
-        if (!(func_infos[i] & flag))
-            continue;
-        auto orig_f = orig_funcs[i];
-        // Fill in old->new mapping. We need to do this before cloning the function so that
-        // the intra target calls are automatically fixed up on cloning.
-        auto F = grp.base_func(orig_f);
-        Function *new_f = Function::Create(F->getFunctionType(), F->getLinkage(),
-                                           F->getName() + suffix, &M);
-        new_f->copyAttributesFrom(F);
-        vmap[F] = new_f;
-        if (groups.size() == 1)
-            cloned.insert(orig_f);
-        grp.clone_fs.insert(i);
-        all_origs.insert(orig_f);
-    }
-    std::set<Function*> sets[2]{all_origs, std::set<Function*>{}};
-    auto *cur_set = &sets[0];
-    auto *next_set = &sets[1];
-    // Reduce dispatch by expand the cloning set to functions that are directly called by
-    // and calling cloned functions.
-    auto &graph = GetCG();
-    while (!cur_set->empty()) {
-        for (auto orig_f: *cur_set) {
-            // Use the uncloned function since it's already in the call graph
-            auto node = graph[orig_f];
-            for (const auto &I: *node) {
-                auto child_node = I.second;
-                auto orig_child_f = child_node->getFunction();
-                if (!orig_child_f)
-                    continue;
-                // Already cloned
-                if (all_origs.count(orig_child_f))
-                    continue;
-                bool calling_clone = false;
-                for (const auto &I2: *child_node) {
-                    auto orig_child_f2 = I2.second->getFunction();
-                    if (!orig_child_f2)
-                        continue;
-                    if (all_origs.count(orig_child_f2)) {
-                        calling_clone = true;
-                        break;
+    for (auto F : orig_funcs) {
+        for (unsigned i = 0; i < groups.size(); i++) {
+            Function *group_F = F;
+            if (i != 0) {
+                group_F = groups[i].base_func(F);
+                if (!F->isDeclaration()) {
+                    clone_function(F, group_F, *groups[i].vmap);
+                }
+            }
+            for (auto &target : groups[i].clones) {
+                prepare_vmap(*target.vmap);
+                auto target_F = cast_or_null<Function>(map_get(*target.vmap, F));
+                if (target_F) {
+                    if (!F->isDeclaration()) {
+                        clone_function(group_F, target_F, *target.vmap);
                     }
+                    add_features(target_F, specs[target.idx].cpu_name,
+                                specs[target.idx].cpu_features, specs[target.idx].flags);
+                    target_F->addFnAttr("julia.mv.clone", std::to_string(i));
                 }
-                if (!calling_clone)
-                    continue;
-                next_set->insert(orig_child_f);
-                all_origs.insert(orig_child_f);
-                auto child_f = grp.base_func(orig_child_f);
-                Function *new_f = Function::Create(child_f->getFunctionType(),
-                                                   child_f->getLinkage(),
-                                                   child_f->getName() + suffix, &M);
-                new_f->copyAttributesFrom(child_f);
-                vmap[child_f] = new_f;
             }
-        }
-        std::swap(cur_set, next_set);
-        next_set->clear();
-    }
-    for (uint32_t i = 0; i < nfuncs; i++) {
-        // Only need to handle expanded functions
-        if (func_infos[i] & flag)
-            continue;
-        auto orig_f = orig_funcs[i];
-        if (all_origs.count(orig_f)) {
-            if (groups.size() == 1)
-                cloned.insert(orig_f);
-            grp.clone_fs.insert(i);
-        }
-    }
-}
-
-void CloneCtx::clone_partial(Group &grp, Target &tgt)
-{
-    auto &spec = specs[tgt.idx];
-    auto &vmap = *tgt.vmap;
-    uint32_t nfuncs = orig_funcs.size();
-    prepare_vmap(vmap);
-    for (uint32_t i = 0; i < nfuncs; i++) {
-        auto orig_f = orig_funcs[i];
-        auto F = grp.base_func(orig_f);
-        if (auto new_v = map_get(vmap, F)) {
-            auto new_f = cast<Function>(new_v);
-            assert(new_f != F);
-            clone_function(F, new_f, vmap);
-            // We can set the feature strings now since no one is going to
-            // clone these functions again.
-            add_features(new_f, spec.cpu_name, spec.cpu_features, spec.flags);
+            if (i != 0) {
+                //TODO should we also do this for target 0?
+                add_features(group_F, specs[groups[i].idx].cpu_name,
+                            specs[groups[i].idx].cpu_features, specs[groups[i].idx].flags);
+            }
+            group_F->addFnAttr("julia.mv.clone", std::to_string(i));
         }
     }
 }
 
-uint32_t CloneCtx::get_func_id(Function *F)
+uint32_t CloneCtx::get_func_id(Function *F) const
 {
-    auto &ref = func_ids[F];
-    if (!ref) {
-        if (allow_bad_fvars && F->isDeclaration()) {
-            // This should never happen in regular use, but can happen if
-            // bugpoint deletes the function. Just do something here to
-            // allow bugpoint to proceed.
-            return (uint32_t)-1;
-        }
-        fvars.push_back(F);
-        ref = fvars.size();
-    }
-    return ref - 1;
+    auto ref = func_ids.find(F);
+    assert(ref != func_ids.end() && "Requesting id of non-fvar!");
+    return ref->second - 1;
 }
 
 template<typename Stack>
-Constant *CloneCtx::rewrite_gv_init(const Stack& stack)
+static Constant *rewrite_gv_init(const Stack& stack)
 {
     // Null initialize so that LLVM put it in the correct section.
     SmallVector<Constant*, 8> args;
@@ -785,16 +811,18 @@ void CloneCtx::fix_gv_uses()
     }
 }
 
-std::pair<uint32_t,GlobalVariable*> CloneCtx::get_reloc_slot(Function *F)
+std::pair<uint32_t,GlobalVariable*> CloneCtx::get_reloc_slot(Function *F) const
 {
-    // Null initialize so that LLVM put it in the correct section.
-    auto id = get_func_id(F);
-    auto &slot = const_relocs[id];
-    if (!slot)
-        slot = new GlobalVariable(M, F->getType(), false, GlobalVariable::InternalLinkage,
-                                  ConstantPointerNull::get(F->getType()),
-                                  F->getName() + ".reloc_slot");
-    return std::make_pair(id, slot);
+    if (F->isDeclaration()) {
+        auto extern_decl = extern_relocs.find(F);
+        assert(extern_decl != extern_relocs.end() && "Missing extern relocation slot!");
+        return {(uint32_t)-1, extern_decl->second};
+    } else {
+        auto id = get_func_id(F);
+        auto slot = const_relocs.find(id);
+        assert(slot != const_relocs.end() && "Missing relocation slot!");
+        return {id, slot->second};
+    }
 }
 
 template<typename Stack>
@@ -851,17 +879,17 @@ void CloneCtx::fix_inst_uses()
 {
     uint32_t nfuncs = orig_funcs.size();
     for (auto &grp: groups) {
-        auto suffix = ".clone_" + std::to_string(grp.idx);
         for (uint32_t i = 0; i < nfuncs; i++) {
-            if (!grp.clone_fs.count(i))
-                continue;
             auto orig_f = orig_funcs[i];
+            if (!grp.has_subtarget_clone(orig_f))
+                continue;
             auto F = grp.base_func(orig_f);
+            auto grpidx = std::to_string(grp.idx);
             replaceUsesWithLoad(*F, [&](Instruction &I) -> GlobalVariable * {
                 uint32_t id;
                 GlobalVariable *slot;
                 auto use_f = I.getFunction();
-                if (!use_f->getName().endswith(suffix))
+                if (!use_f->hasFnAttribute("julia.mv.clone") || use_f->getFnAttribute("julia.mv.clone").getValueAsString() != grpidx)
                     return nullptr;
                 std::tie(id, slot) = get_reloc_slot(orig_f);
                 return slot;
@@ -935,17 +963,6 @@ void CloneCtx::emit_metadata()
     auto gbase = emit_offset_table(M, gvars, "jl_sysimg_gvars");
 
     uint32_t ntargets = specs.size();
-    SmallVector<Target*, 8> targets(ntargets);
-    for (auto &grp: groups) {
-        targets[grp.idx] = &grp;
-        for (auto &tgt: grp.clones) {
-            targets[tgt.idx] = &tgt;
-        }
-    }
-
-    if (has_veccall) {
-        M.addModuleFlag(Module::Max, "julia.mv.veccall", 1);
-    }
 
     // Generate `jl_dispatch_reloc_slots`
     std::set<uint32_t> shared_relocs;
@@ -989,7 +1006,7 @@ void CloneCtx::emit_metadata()
         std::vector<uint32_t> idxs;
         std::vector<Constant*> offsets;
         for (uint32_t i = 0; i < ntargets; i++) {
-            auto tgt = targets[i];
+            auto tgt = linearized[i];
             auto &spec = specs[i];
             uint32_t len_idx = idxs.size();
             idxs.push_back(0); // We will fill in the real value later.
@@ -1009,7 +1026,7 @@ void CloneCtx::emit_metadata()
             }
             else {
                 auto baseidx = spec.base;
-                auto grp = static_cast<Group*>(targets[baseidx]);
+                auto grp = static_cast<Group*>(linearized[baseidx]);
                 idxs.push_back(baseidx);
                 for (uint32_t j = 0; j < nfvars; j++) {
                     auto base_f = grp->base_func(fvars[j]);
@@ -1040,7 +1057,7 @@ void CloneCtx::emit_metadata()
     }
 }
 
-static bool runMultiVersioning(Module &M, function_ref<LoopInfo&(Function&)> GetLI, function_ref<CallGraph&()> GetCG, bool allow_bad_fvars)
+static bool runMultiVersioning(Module &M, bool allow_bad_fvars)
 {
     // Group targets and identify cloning bases.
     // Also initialize function info maps (we'll update these maps as we go)
@@ -1059,19 +1076,13 @@ static bool runMultiVersioning(Module &M, function_ref<LoopInfo&(Function&)> Get
                             !gvars || !gvars->hasInitializer() || !isa<ConstantArray>(gvars->getInitializer())))
         return false;
 
-    CloneCtx clone(M, GetLI, GetCG, allow_bad_fvars);
+    CloneCtx clone(M, allow_bad_fvars);
+
+    clone.prepare_slots();
 
-    // Collect a list of original functions and clone base functions
-    clone.clone_bases();
+    clone.clone_decls();
 
-    // Collect function info (type of instruction used)
-    clone.collect_func_infos();
-
-    // If any partially cloned target exist decide which functions to clone for these targets.
-    // Clone functions for each group and collect a list of them.
-    // We can also add feature strings for cloned functions
-    // now that no additional cloning needs to be done.
-    clone.clone_all_partials();
+    clone.clone_bodies();
 
     // Scan **ALL** cloned functions (including full cloning for base target)
     // for global variables initialization use.
@@ -1108,24 +1119,12 @@ struct MultiVersioningLegacy: public ModulePass {
 
 private:
     bool runOnModule(Module &M) override;
-    void getAnalysisUsage(AnalysisUsage &AU) const override
-    {
-        AU.addRequired<LoopInfoWrapperPass>();
-        AU.addRequired<CallGraphWrapperPass>();
-        AU.addPreserved<LoopInfoWrapperPass>();
-    }
     bool allow_bad_fvars;
 };
 
 bool MultiVersioningLegacy::runOnModule(Module &M)
 {
-    auto GetLI = [this](Function &F) -> LoopInfo & {
-        return getAnalysis<LoopInfoWrapperPass>(F).getLoopInfo();
-    };
-    auto GetCG = [this]() -> CallGraph & {
-        return getAnalysis<CallGraphWrapperPass>().getCallGraph();
-    };
-    return runMultiVersioning(M, GetLI, GetCG, allow_bad_fvars);
+    return runMultiVersioning(M, allow_bad_fvars);
 }
 
 
@@ -1136,6 +1135,11 @@ static RegisterPass<MultiVersioningLegacy> X("JuliaMultiVersioning", "JuliaMulti
 
 } // anonymous namespace
 
+void multiversioning_preannotate(Module &M)
+{
+    annotate_module_clones(M);
+}
+
 void replaceUsesWithLoad(Function &F, function_ref<GlobalVariable *(Instruction &I)> should_replace, MDNode *tbaa_const) {
     bool changed;
     do {
@@ -1162,14 +1166,7 @@ void replaceUsesWithLoad(Function &F, function_ref<GlobalVariable *(Instruction
 
 PreservedAnalyses MultiVersioning::run(Module &M, ModuleAnalysisManager &AM)
 {
-    auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-    auto GetLI = [&](Function &F) -> LoopInfo & {
-        return FAM.getResult<LoopAnalysis>(F);
-    };
-    auto GetCG = [&]() -> CallGraph & {
-        return AM.getResult<CallGraphAnalysis>(M);
-    };
-    if (runMultiVersioning(M, GetLI, GetCG, external_use)) {
+    if (runMultiVersioning(M, external_use)) {
         auto preserved = PreservedAnalyses::allInSet<CFGAnalyses>();
         preserved.preserve<LoopAnalysis>();
         return preserved;

From 6ab1862106bc7f48afa54bac792cb7909df35cd7 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Thu, 5 Jan 2023 22:17:52 -0500
Subject: [PATCH 06/34] Table-based dlsym

---
 src/aotcompile.cpp           | 112 ++++++++++++++++++++++++++++++++---
 src/llvm-multiversioning.cpp |  68 ++++++++++-----------
 src/llvm-ptls.cpp            |  19 +-----
 src/processor.cpp            |  72 ++++++++++++----------
 src/processor.h              |  32 ++++++++++
 5 files changed, 214 insertions(+), 89 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 527b793f142c8..5873c1ca56477 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -424,7 +424,8 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm
         //Safe b/c context is locked by params
         GlobalVariable *G = cast<GlobalVariable>(clone.getModuleUnlocked()->getNamedValue(global));
         G->setInitializer(ConstantPointerNull::get(cast<PointerType>(G->getValueType())));
-        G->setLinkage(GlobalVariable::InternalLinkage);
+        G->setLinkage(GlobalValue::ExternalLinkage);
+        G->setVisibility(GlobalValue::HiddenVisibility);
         data->jl_sysimg_gvars.push_back(G);
     }
     CreateNativeGlobals += gvars.size();
@@ -446,9 +447,9 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm
         //Safe b/c context is locked by params
         for (GlobalObject &G : clone.getModuleUnlocked()->global_objects()) {
             if (!G.isDeclaration()) {
-                G.setLinkage(Function::InternalLinkage);
+                G.setLinkage(GlobalValue::ExternalLinkage);
+                G.setVisibility(GlobalValue::HiddenVisibility);
                 makeSafeName(G);
-                addComdat(&G);
 #if defined(_OS_WINDOWS_) && defined(_CPU_X86_64_)
                 // Add unwind exception personalities to functions to handle async exceptions
                 if (Function *F = dyn_cast<Function>(&G))
@@ -514,6 +515,63 @@ static void injectCRTAlias(Module &M, StringRef name, StringRef alias, FunctionT
 
 void multiversioning_preannotate(Module &M);
 
+static GlobalVariable *emit_shard_table(Module &M, Type *T_size, Type *T_psize, unsigned threads) {
+    SmallVector<Constant *, 0> tables(sizeof(jl_image_shard_t) / sizeof(void *) * threads);
+    for (unsigned i = 0; i < threads; i++) {
+        auto suffix = "_" + std::to_string(i);
+        auto create_gv = [&](StringRef name, bool constant) {
+            auto gv = new GlobalVariable(M, T_size, constant,
+                                         GlobalValue::ExternalLinkage, nullptr, name + suffix);
+            gv->setVisibility(GlobalValue::HiddenVisibility);
+            return gv;
+        };
+        auto table = tables.data() + i * sizeof(jl_image_shard_t) / sizeof(void *);
+        table[offsetof(jl_image_shard_t, fvar_base) / sizeof(void*)] = create_gv("jl_fvar_base", false);
+        table[offsetof(jl_image_shard_t, fvar_offsets) / sizeof(void*)] = create_gv("jl_fvar_offsets", true);
+        table[offsetof(jl_image_shard_t, fvar_idxs) / sizeof(void*)] = create_gv("jl_fvar_idxs", true);
+        table[offsetof(jl_image_shard_t, gvar_base) / sizeof(void*)] = create_gv("jl_gvar_base", false);
+        table[offsetof(jl_image_shard_t, gvar_offsets) / sizeof(void*)] = create_gv("jl_gvar_offsets", true);
+        table[offsetof(jl_image_shard_t, gvar_idxs) / sizeof(void*)] = create_gv("jl_gvar_idxs", true);
+        table[offsetof(jl_image_shard_t, clone_slots) / sizeof(void*)] = create_gv("jl_clone_slots", true);
+        table[offsetof(jl_image_shard_t, clone_offsets) / sizeof(void*)] = create_gv("jl_clone_offsets", true);
+        table[offsetof(jl_image_shard_t, clone_idxs) / sizeof(void*)] = create_gv("jl_clone_idxs", true);
+    }
+    auto tables_arr = ConstantArray::get(ArrayType::get(T_psize, tables.size()), tables);
+    auto tables_gv = new GlobalVariable(M, tables_arr->getType(), false,
+                                        GlobalValue::ExternalLinkage, tables_arr, "jl_shard_tables");
+    tables_gv->setVisibility(GlobalValue::HiddenVisibility);
+    return tables_gv;
+}
+
+static GlobalVariable *emit_ptls_table(Module &M, Type *T_size, Type *T_psize) {
+    std::array<Constant *, 3> ptls_table{
+        new GlobalVariable(M, T_size, false, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), "jl_pgcstack_func_slot"),
+        new GlobalVariable(M, T_size, false, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), "jl_pgcstack_key_slot"),
+        new GlobalVariable(M, T_size, false, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), "jl_tls_offset"),
+    };
+    for (auto &gv : ptls_table)
+        cast<GlobalVariable>(gv)->setVisibility(GlobalValue::HiddenVisibility);
+    auto ptls_table_arr = ConstantArray::get(ArrayType::get(T_psize, ptls_table.size()), ptls_table);
+    auto ptls_table_gv = new GlobalVariable(M, ptls_table_arr->getType(), false,
+                                            GlobalValue::ExternalLinkage, ptls_table_arr, "jl_ptls_table");
+    ptls_table_gv->setVisibility(GlobalValue::HiddenVisibility);
+    return ptls_table_gv;
+}
+
+static GlobalVariable *emit_image_header(Module &M, unsigned threads, unsigned nfvars, unsigned ngvars) {
+    constexpr uint32_t version = 1;
+    std::array<uint32_t, 4> header{
+        version,
+        threads,
+        nfvars,
+        ngvars,
+    };
+    auto header_arr = ConstantDataArray::get(M.getContext(), header);
+    auto header_gv = new GlobalVariable(M, header_arr->getType(), false,
+                                        GlobalValue::InternalLinkage, header_arr, "jl_image_header");
+    return header_gv;
+}
+
 // takes the running content that has collected in the shadow module and dump it to disk
 // this builds the object file portion of the sysimage files for fast startup
 extern "C" JL_DLLEXPORT
@@ -588,6 +646,10 @@ void jl_dump_native_impl(void *native_code,
 
     start = jl_hrtime();
 
+    unsigned threads = 1;
+    unsigned nfvars = 0;
+    unsigned ngvars = 0;
+
     // add metadata information
     if (imaging_mode) {
         multiversioning_preannotate(*dataM);
@@ -601,8 +663,27 @@ void jl_dump_native_impl(void *native_code,
                 }
             }
         }
-        emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_sysimg_gvars", T_psize);
-        emit_offset_table(*dataM, data->jl_sysimg_fvars, "jl_sysimg_fvars", T_psize);
+        nfvars = data->jl_sysimg_fvars.size();
+        ngvars = data->jl_sysimg_gvars.size();
+        emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_gvars", T_psize);
+        emit_offset_table(*dataM, data->jl_sysimg_fvars, "jl_fvars", T_psize);
+        std::vector<uint32_t> idxs;
+        idxs.resize(data->jl_sysimg_gvars.size());
+        std::iota(idxs.begin(), idxs.end(), 0);
+        auto gidxs = ConstantDataArray::get(Context, idxs);
+        auto gidxs_var = new GlobalVariable(*dataM, gidxs->getType(), true,
+                                            GlobalVariable::ExternalLinkage,
+                                            gidxs, "jl_gvar_idxs");
+        gidxs_var->setVisibility(GlobalValue::HiddenVisibility);
+        idxs.clear();
+        idxs.resize(data->jl_sysimg_fvars.size());
+        std::iota(idxs.begin(), idxs.end(), 0);
+        auto fidxs = ConstantDataArray::get(Context, idxs);
+        auto fidxs_var = new GlobalVariable(*dataM, fidxs->getType(), true,
+                                            GlobalVariable::ExternalLinkage,
+                                            fidxs, "jl_fvar_idxs");
+        fidxs_var->setVisibility(GlobalValue::HiddenVisibility);
+        dataM->addModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(Context, "_0"));
 
         // reflect the address of the jl_RTLD_DEFAULT_handle variable
         // back to the caller, so that we can check for consistency issues
@@ -789,10 +870,23 @@ void jl_dump_native_impl(void *native_code,
             data.insert(data.end(), specdata.begin(), specdata.end());
         }
         auto value = ConstantDataArray::get(Context, data);
-        addComdat(new GlobalVariable(*sysimageM, value->getType(), true,
-                                      GlobalVariable::ExternalLinkage,
-                                      value, "jl_dispatch_target_ids"));
-
+        auto target_ids = new GlobalVariable(*sysimageM, value->getType(), true,
+                                      GlobalVariable::InternalLinkage,
+                                      value, "jl_dispatch_target_ids");
+        auto shards = emit_shard_table(*sysimageM, T_size, T_psize, threads);
+        auto ptls = emit_ptls_table(*sysimageM, T_size, T_psize);
+        auto header = emit_image_header(*sysimageM, threads, nfvars, ngvars);
+        auto AT = ArrayType::get(T_psize, 4);
+        auto pointers = new GlobalVariable(*sysimageM, AT, false,
+                                           GlobalVariable::ExternalLinkage,
+                                           ConstantArray::get(AT, {
+                                                ConstantExpr::getBitCast(header, T_psize),
+                                                ConstantExpr::getBitCast(shards, T_psize),
+                                                ConstantExpr::getBitCast(ptls, T_psize),
+                                                ConstantExpr::getBitCast(target_ids, T_psize)
+                                           }),
+                                           "jl_image_pointers");
+        addComdat(pointers);
         if (s) {
             write_int32(s, data.size());
             ios_write(s, (const char *)data.data(), data.size());
diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index 1a1dc297b2702..44c83502e0537 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -516,8 +516,8 @@ static inline std::vector<T*> consume_gv(Module &M, const char *name, bool allow
 CloneCtx::CloneCtx(Module &M, bool allow_bad_fvars)
     : tbaa_const(tbaa_make_child_with_context(M.getContext(), "jtbaa_const", nullptr, true).first),
       specs(jl_get_llvm_clone_targets()),
-      fvars(consume_gv<Function>(M, "jl_sysimg_fvars", allow_bad_fvars)),
-      gvars(consume_gv<Constant>(M, "jl_sysimg_gvars", false)),
+      fvars(consume_gv<Function>(M, "jl_fvars", allow_bad_fvars)),
+      gvars(consume_gv<Constant>(M, "jl_gvars", false)),
       M(M),
       allow_bad_fvars(allow_bad_fvars)
 {
@@ -547,7 +547,7 @@ CloneCtx::CloneCtx(Module &M, bool allow_bad_fvars)
     for (uint32_t i = 0; i < nfvars; i++)
         func_ids[fvars[i]] = i + 1;
     for (auto &F: M) {
-        if (F.empty())
+        if (F.empty() && !F.hasFnAttribute("julia.mv.clones"))
             continue;
         orig_funcs.push_back(&F);
     }
@@ -898,19 +898,6 @@ void CloneCtx::fix_inst_uses()
     }
 }
 
-template<typename T>
-static inline T *add_comdat(T *G)
-{
-#if defined(_OS_WINDOWS_)
-    // add __declspec(dllexport) to everything marked for export
-    if (G->getLinkage() == GlobalValue::ExternalLinkage)
-        G->setDLLStorageClass(GlobalValue::DLLExportStorageClass);
-    else
-        G->setDLLStorageClass(GlobalValue::DefaultStorageClass);
-#endif
-    return G;
-}
-
 static Constant *get_ptrdiff32(Constant *ptr, Constant *base)
 {
     if (ptr->getType()->isPointerTy())
@@ -920,7 +907,7 @@ static Constant *get_ptrdiff32(Constant *ptr, Constant *base)
 }
 
 template<typename T>
-static Constant *emit_offset_table(Module &M, const std::vector<T*> &vars, StringRef name)
+static Constant *emit_offset_table(Module &M, const std::vector<T*> &vars, StringRef name, StringRef suffix)
 {
     auto T_int32 = Type::getInt32Ty(M.getContext());
     auto T_size = getSizeTy(M.getContext());
@@ -928,11 +915,14 @@ static Constant *emit_offset_table(Module &M, const std::vector<T*> &vars, Strin
     Constant *base = nullptr;
     if (nvars > 0) {
         base = ConstantExpr::getBitCast(vars[0], T_size->getPointerTo());
-        add_comdat(GlobalAlias::create(T_size, 0, GlobalVariable::ExternalLinkage,
-                                       name + "_base",
-                                       base, &M));
+        auto ga = GlobalAlias::create(T_size, 0, GlobalVariable::ExternalLinkage,
+                                       name + "_base" + suffix,
+                                       base, &M);
+        ga->setVisibility(GlobalValue::HiddenVisibility);
     } else {
-        base = add_comdat(new GlobalVariable(M, T_size, true, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), name + "_base"));
+        auto gv = new GlobalVariable(M, T_size, true, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), name + "_base" + suffix);
+        gv->setVisibility(GlobalValue::HiddenVisibility);
+        base = gv;
     }
     auto vbase = ConstantExpr::getPtrToInt(base, T_size);
     std::vector<Constant*> offsets(nvars + 1);
@@ -943,10 +933,11 @@ static Constant *emit_offset_table(Module &M, const std::vector<T*> &vars, Strin
             offsets[i + 1] = get_ptrdiff32(vars[i], vbase);
     }
     ArrayType *vars_type = ArrayType::get(T_int32, nvars + 1);
-    add_comdat(new GlobalVariable(M, vars_type, true,
+    auto gv = new GlobalVariable(M, vars_type, true,
                                   GlobalVariable::ExternalLinkage,
                                   ConstantArray::get(vars_type, offsets),
-                                  name + "_offsets"));
+                                  name + "_offsets" + suffix);
+    gv->setVisibility(GlobalValue::HiddenVisibility);
     return vbase;
 }
 
@@ -958,9 +949,17 @@ void CloneCtx::emit_metadata()
         return;
     }
 
+    StringRef suffix;
+    if (auto suffix_md = M.getModuleFlag("julia.mv.suffix")) {
+        suffix = cast<MDString>(suffix_md)->getString();
+    }
+
     // Store back the information about exported functions.
-    auto fbase = emit_offset_table(M, fvars, "jl_sysimg_fvars");
-    auto gbase = emit_offset_table(M, gvars, "jl_sysimg_gvars");
+    auto fbase = emit_offset_table(M, fvars, "jl_fvar", suffix);
+    auto gbase = emit_offset_table(M, gvars, "jl_gvar", suffix);
+
+    M.getGlobalVariable("jl_fvar_idxs")->setName("jl_fvar_idxs" + suffix);
+    M.getGlobalVariable("jl_gvar_idxs")->setName("jl_gvar_idxs" + suffix);
 
     uint32_t ntargets = specs.size();
 
@@ -996,9 +995,10 @@ void CloneCtx::emit_metadata()
         }
         values[0] = ConstantInt::get(T_int32, values.size() / 2);
         ArrayType *vars_type = ArrayType::get(T_int32, values.size());
-        add_comdat(new GlobalVariable(M, vars_type, true, GlobalVariable::ExternalLinkage,
+        auto gv = new GlobalVariable(M, vars_type, true, GlobalVariable::ExternalLinkage,
                                       ConstantArray::get(vars_type, values),
-                                      "jl_dispatch_reloc_slots"));
+                                      "jl_clone_slots" + suffix);
+        gv->setVisibility(GlobalValue::HiddenVisibility);
     }
 
     // Generate `jl_dispatch_fvars_idxs` and `jl_dispatch_fvars_offsets`
@@ -1046,14 +1046,16 @@ void CloneCtx::emit_metadata()
             idxs[len_idx] = count;
         }
         auto idxval = ConstantDataArray::get(M.getContext(), idxs);
-        add_comdat(new GlobalVariable(M, idxval->getType(), true,
+        auto gv1 = new GlobalVariable(M, idxval->getType(), true,
                                       GlobalVariable::ExternalLinkage,
-                                      idxval, "jl_dispatch_fvars_idxs"));
+                                      idxval, "jl_clone_idxs" + suffix);
+        gv1->setVisibility(GlobalValue::HiddenVisibility);
         ArrayType *offsets_type = ArrayType::get(Type::getInt32Ty(M.getContext()), offsets.size());
-        add_comdat(new GlobalVariable(M, offsets_type, true,
+        auto gv2 = new GlobalVariable(M, offsets_type, true,
                                       GlobalVariable::ExternalLinkage,
                                       ConstantArray::get(offsets_type, offsets),
-                                      "jl_dispatch_fvars_offsets"));
+                                      "jl_clone_offsets" + suffix);
+        gv2->setVisibility(GlobalValue::HiddenVisibility);
     }
 }
 
@@ -1070,8 +1072,8 @@ static bool runMultiVersioning(Module &M, bool allow_bad_fvars)
     if (M.getName() == "sysimage")
         return false;
 
-    GlobalVariable *fvars = M.getGlobalVariable("jl_sysimg_fvars");
-    GlobalVariable *gvars = M.getGlobalVariable("jl_sysimg_gvars");
+    GlobalVariable *fvars = M.getGlobalVariable("jl_fvars");
+    GlobalVariable *gvars = M.getGlobalVariable("jl_gvars");
     if (allow_bad_fvars && (!fvars || !fvars->hasInitializer() || !isa<ConstantArray>(fvars->getInitializer()) ||
                             !gvars || !gvars->hasInitializer() || !isa<ConstantArray>(gvars->getInitializer())))
         return false;
diff --git a/src/llvm-ptls.cpp b/src/llvm-ptls.cpp
index ea92e1709c597..e49b992ded50f 100644
--- a/src/llvm-ptls.cpp
+++ b/src/llvm-ptls.cpp
@@ -140,26 +140,11 @@ GlobalVariable *LowerPTLS::create_aliased_global(Type *T, StringRef name) const
     // the address is visible externally but LLVM can still assume that the
     // address of this variable doesn't need dynamic relocation
     // (can be accessed with a single PC-rel load).
-    auto GV = new GlobalVariable(*M, T, false, GlobalVariable::InternalLinkage,
-                                 Constant::getNullValue(T), name + ".real");
-    add_comdat(GlobalAlias::create(T, 0, GlobalVariable::ExternalLinkage,
-                                   name, GV, M));
+    auto GV = new GlobalVariable(*M, T, false, GlobalVariable::ExternalLinkage,
+                                 nullptr, name);
     return GV;
 }
 
-template<typename T>
-inline T *LowerPTLS::add_comdat(T *G) const
-{
-#if defined(_OS_WINDOWS_)
-    // add __declspec(dllexport) to everything marked for export
-    if (G->getLinkage() == GlobalValue::ExternalLinkage)
-        G->setDLLStorageClass(GlobalValue::DLLExportStorageClass);
-    else
-        G->setDLLStorageClass(GlobalValue::DefaultStorageClass);
-#endif
-    return G;
-}
-
 void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter, bool or_new, bool *CFGModified)
 {
     if (pgcstack->use_empty()) {
diff --git a/src/processor.cpp b/src/processor.cpp
index a8aca2a64ab19..ea8e4101e6c33 100644
--- a/src/processor.cpp
+++ b/src/processor.cpp
@@ -21,6 +21,8 @@
 #include <dlfcn.h>
 #endif
 
+#include <iostream>
+
 // CPU target string is a list of strings separated by `;` each string starts with a CPU
 // or architecture name and followed by an optional list of features separated by `,`.
 // A "generic" or empty CPU name means the basic required feature set of the target ISA
@@ -629,47 +631,42 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback)
 {
     jl_image_t res{};
 
-    // .data base
-    char *data_base;
-    jl_dlsym(hdl, "jl_sysimg_gvars_base", (void**)&data_base, 1);
+    const jl_image_pointers_t *pointers;
+    jl_dlsym(hdl, "jl_image_pointers", (void**)&pointers, 1);
 
-    {
-        void *pgcstack_func_slot;
-        if (jl_dlsym(hdl, "jl_pgcstack_func_slot", &pgcstack_func_slot, 0)) {
-            void *pgcstack_key_slot;
-            jl_dlsym(hdl, "jl_pgcstack_key_slot", &pgcstack_key_slot, 1);
-            jl_pgcstack_getkey((jl_get_pgcstack_func**)pgcstack_func_slot, (jl_pgcstack_key_t*)pgcstack_key_slot);
-
-            size_t *tls_offset_idx;
-            jl_dlsym(hdl, "jl_tls_offset", (void **)&tls_offset_idx, 1);
-            *tls_offset_idx = (uintptr_t)(jl_tls_offset == -1 ? 0 : jl_tls_offset);
-        }
-    }
+    const void *ids = pointers->target_data;
+    uint32_t target_idx = callback(ids);
+
+    std::cout << "Finished callback\n";
+
+    auto shard = pointers->shards[0];
+
+    std::cout << "Shard access is ok\n";
+
+    // .data base
+    char *data_base = (char *)shard.gvar_base;
 
     // .text base
-    char *text_base;
-    jl_dlsym(hdl, "jl_sysimg_fvars_base", (void**)&text_base, 1);
+    const char *text_base = shard.fvar_base;
 
-    const int32_t *offsets;
-    jl_dlsym(hdl, "jl_sysimg_fvars_offsets", (void**)&offsets, 1);
+    const int32_t *offsets = shard.fvar_offsets;
     uint32_t nfunc = offsets[0];
     offsets++;
 
-    const void *ids;
-    jl_dlsym(hdl, "jl_dispatch_target_ids", (void**)&ids, 1);
-    uint32_t target_idx = callback(ids);
+    std::cout << "Initial offsets\n";
 
-    const int32_t *reloc_slots;
-    jl_dlsym(hdl, "jl_dispatch_reloc_slots", (void **)&reloc_slots, 1);
+    const int32_t *reloc_slots = shard.clone_slots;
+    std::cout << reloc_slots << "\n";
     const uint32_t nreloc = reloc_slots[0];
     reloc_slots += 1;
-    const uint32_t *clone_idxs;
-    const int32_t *clone_offsets;
-    jl_dlsym(hdl, "jl_dispatch_fvars_idxs", (void**)&clone_idxs, 1);
-    jl_dlsym(hdl, "jl_dispatch_fvars_offsets", (void**)&clone_offsets, 1);
+    std::cout << "Set reloc_slots\n";
+    const uint32_t *clone_idxs = shard.clone_idxs;
+    const int32_t *clone_offsets = shard.clone_offsets;
     uint32_t tag_len = clone_idxs[0];
     clone_idxs += 1;
 
+    std::cout << "Set clone_idxs\n";
+
     assert(tag_len & jl_sysimg_tag_mask);
     std::vector<const int32_t*> base_offsets = {offsets};
     // Find target
@@ -688,6 +685,8 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback)
         base_offsets.push_back(tag_len & jl_sysimg_tag_mask ? clone_offsets : nullptr);
     }
 
+    std::cout << "Set offsets\n";
+
     bool clone_all = (tag_len & jl_sysimg_tag_mask) != 0;
     // Fill in return value
     if (clone_all) {
@@ -741,17 +740,19 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback)
         (void)found;
     }
 
+    std::cout << "Finished relocation\n";
+
     res.fptrs.base = text_base;
     res.fptrs.offsets = offsets;
     res.gvars_base = (uintptr_t *)data_base;
-    jl_dlsym(hdl, "jl_sysimg_gvars_offsets", (void **)&res.gvars_offsets, 1);
+    res.gvars_offsets = shard.gvar_offsets;
     res.gvars_offsets += 1;
 
 #ifdef _OS_WINDOWS_
     res.base = (intptr_t)hdl;
 #else
     Dl_info dlinfo;
-    if (dladdr((void*)res.gvars_base, &dlinfo) != 0) {
+    if (dladdr((void*)pointers, &dlinfo) != 0) {
         res.base = (intptr_t)dlinfo.dli_fbase;
     }
     else {
@@ -759,6 +760,17 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback)
     }
 #endif
 
+    std::cout << "Starting ptls\n";
+
+    {
+        void *pgcstack_func_slot = pointers->ptls->pgcstack_func_slot;
+        void *pgcstack_key_slot = pointers->ptls->pgcstack_key_slot;
+        jl_pgcstack_getkey((jl_get_pgcstack_func**)pgcstack_func_slot, (jl_pgcstack_key_t*)pgcstack_key_slot);
+
+        size_t *tls_offset_idx = pointers->ptls->tls_offset;
+        *tls_offset_idx = (uintptr_t)(jl_tls_offset == -1 ? 0 : jl_tls_offset);
+    }
+
     return res;
 }
 
diff --git a/src/processor.h b/src/processor.h
index f76722e885a1d..73271290eff76 100644
--- a/src/processor.h
+++ b/src/processor.h
@@ -162,6 +162,38 @@ typedef struct {
     jl_image_fptrs_t fptrs;
 } jl_image_t;
 
+typedef struct {
+    uint32_t version;
+    uint32_t nshards;
+    uint32_t nfvars;
+    uint32_t ngvars;
+} jl_image_header_t;
+
+typedef struct {
+    const char *fvar_base;
+    const int32_t *fvar_offsets;
+    const uint32_t *fvar_idxs;
+    uintptr_t *gvar_base;
+    const int32_t *gvar_offsets;
+    const uint32_t *gvar_idxs;
+    const int32_t *clone_slots;
+    const int32_t *clone_offsets;
+    const uint32_t *clone_idxs;
+} jl_image_shard_t;
+
+typedef struct {
+    void *pgcstack_func_slot;
+    void *pgcstack_key_slot;
+    size_t *tls_offset;
+} jl_image_ptls_t;
+
+typedef struct {
+    const jl_image_header_t *header;
+    const jl_image_shard_t *shards; // nshards-length array
+    const jl_image_ptls_t *ptls;
+    const void *target_data;
+} jl_image_pointers_t;
+
 /**
  * Initialize the processor dispatch system with sysimg `hdl` (also initialize the sysimg itself).
  * The dispatch system will find the best implementation to be used in this session.

From 798ee2245b6aae597a99d25f27aa3ed96cf3c2aa Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Thu, 5 Jan 2023 23:54:39 -0500
Subject: [PATCH 07/34] Allow loader to deal with multiple shards

---
 src/processor.cpp | 232 ++++++++++++++++++++++++++--------------------
 1 file changed, 133 insertions(+), 99 deletions(-)

diff --git a/src/processor.cpp b/src/processor.cpp
index ea8e4101e6c33..55b2cd2b4ab55 100644
--- a/src/processor.cpp
+++ b/src/processor.cpp
@@ -636,117 +636,153 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback)
 
     const void *ids = pointers->target_data;
     uint32_t target_idx = callback(ids);
+
+    if (pointers->header->version != 1) {
+        jl_error("Image file is not compatible with this version of Julia");
+    }
 
-    std::cout << "Finished callback\n";
-
-    auto shard = pointers->shards[0];
-
-    std::cout << "Shard access is ok\n";
-
-    // .data base
-    char *data_base = (char *)shard.gvar_base;
-
-    // .text base
-    const char *text_base = shard.fvar_base;
-
-    const int32_t *offsets = shard.fvar_offsets;
-    uint32_t nfunc = offsets[0];
-    offsets++;
-
-    std::cout << "Initial offsets\n";
-
-    const int32_t *reloc_slots = shard.clone_slots;
-    std::cout << reloc_slots << "\n";
-    const uint32_t nreloc = reloc_slots[0];
-    reloc_slots += 1;
-    std::cout << "Set reloc_slots\n";
-    const uint32_t *clone_idxs = shard.clone_idxs;
-    const int32_t *clone_offsets = shard.clone_offsets;
-    uint32_t tag_len = clone_idxs[0];
-    clone_idxs += 1;
-
-    std::cout << "Set clone_idxs\n";
+    std::vector<const char *> fvars(pointers->header->nfvars);
+    std::vector<const char *> gvars(pointers->header->ngvars);
+
+    std::vector<std::pair<uint32_t, const char *>> clones;
+
+    for (unsigned i = 0; i < pointers->header->nshards; i++) {
+        auto shard = pointers->shards[i];
+
+        // .data base
+        char *data_base = (char *)shard.gvar_base;
+
+        // .text base
+        const char *text_base = shard.fvar_base;
+
+        const int32_t *offsets = shard.fvar_offsets;
+        uint32_t nfunc = offsets[0];
+        offsets++;
+        const int32_t *reloc_slots = shard.clone_slots;
+        const uint32_t nreloc = reloc_slots[0];
+        reloc_slots += 1;
+        const uint32_t *clone_idxs = shard.clone_idxs;
+        const int32_t *clone_offsets = shard.clone_offsets;
+        uint32_t tag_len = clone_idxs[0];
+        clone_idxs += 1;
+
+        assert(tag_len & jl_sysimg_tag_mask);
+        std::vector<const int32_t*> base_offsets = {offsets};
+        // Find target
+        for (uint32_t i = 0;i < target_idx;i++) {
+            uint32_t len = jl_sysimg_val_mask & tag_len;
+            if (jl_sysimg_tag_mask & tag_len) {
+                if (i != 0)
+                    clone_offsets += nfunc;
+                clone_idxs += len + 1;
+            }
+            else {
+                clone_offsets += len;
+                clone_idxs += len + 2;
+            }
+            tag_len = clone_idxs[-1];
+            base_offsets.push_back(tag_len & jl_sysimg_tag_mask ? clone_offsets : nullptr);
+        }
 
-    assert(tag_len & jl_sysimg_tag_mask);
-    std::vector<const int32_t*> base_offsets = {offsets};
-    // Find target
-    for (uint32_t i = 0;i < target_idx;i++) {
-        uint32_t len = jl_sysimg_val_mask & tag_len;
-        if (jl_sysimg_tag_mask & tag_len) {
-            if (i != 0)
-                clone_offsets += nfunc;
-            clone_idxs += len + 1;
+        bool clone_all = (tag_len & jl_sysimg_tag_mask) != 0;
+        // Fill in return value
+        if (clone_all) {
+            // clone_all
+            if (target_idx != 0) {
+                offsets = clone_offsets;
+            }
         }
         else {
-            clone_offsets += len;
-            clone_idxs += len + 2;
+            uint32_t base_idx = clone_idxs[0];
+            assert(base_idx < target_idx);
+            if (target_idx != 0) {
+                offsets = base_offsets[base_idx];
+                assert(offsets);
+            }
+            clone_idxs++;
+            unsigned start = clones.size();
+            clones.resize(start + tag_len);
+            auto idxs = shard.fvar_idxs;
+            for (unsigned i = 0; i < tag_len; i++) {
+                clones[start + i] = {(clone_idxs[i] & ~jl_sysimg_val_mask) | idxs[clone_idxs[i] & jl_sysimg_val_mask], clone_offsets[i] + text_base};
+            }
+        }
+        // Do relocation
+        uint32_t reloc_i = 0;
+        uint32_t len = jl_sysimg_val_mask & tag_len;
+        for (uint32_t i = 0; i < len; i++) {
+            uint32_t idx = clone_idxs[i];
+            int32_t offset;
+            if (clone_all) {
+                offset = offsets[idx];
+            }
+            else if (idx & jl_sysimg_tag_mask) {
+                idx = idx & jl_sysimg_val_mask;
+                offset = clone_offsets[i];
+            }
+            else {
+                continue;
+            }
+            bool found = false;
+            for (; reloc_i < nreloc; reloc_i++) {
+                auto reloc_idx = ((const uint32_t*)reloc_slots)[reloc_i * 2];
+                if (reloc_idx == idx) {
+                    found = true;
+                    auto slot = (const void**)(data_base + reloc_slots[reloc_i * 2 + 1]);
+                    assert(slot);
+                    *slot = offset + text_base;
+                }
+                else if (reloc_idx > idx) {
+                    break;
+                }
+            }
+            assert(found && "Cannot find GOT entry for cloned function.");
+            (void)found;
         }
-        tag_len = clone_idxs[-1];
-        base_offsets.push_back(tag_len & jl_sysimg_tag_mask ? clone_offsets : nullptr);
-    }
 
-    std::cout << "Set offsets\n";
+        auto fidxs = shard.fvar_idxs;
+        for (uint32_t i = 0; i < nfunc; i++) {
+            fvars[fidxs[i]] = text_base + offsets[i];
+        }
 
-    bool clone_all = (tag_len & jl_sysimg_tag_mask) != 0;
-    // Fill in return value
-    if (clone_all) {
-        // clone_all
-        if (target_idx != 0) {
-            offsets = clone_offsets;
+        auto gidxs = shard.gvar_idxs;
+        unsigned ngvars = shard.gvar_offsets[0];
+        for (uint32_t i = 0; i < ngvars; i++) {
+            gvars[gidxs[i]] = data_base + shard.gvar_offsets[i+1];
         }
     }
-    else {
-        uint32_t base_idx = clone_idxs[0];
-        assert(base_idx < target_idx);
-        if (target_idx != 0) {
-            offsets = base_offsets[base_idx];
-            assert(offsets);
+
+    if (!fvars.empty()) {
+        auto offsets = (int32_t *) malloc(sizeof(int32_t) * fvars.size());
+        res.fptrs.base = fvars[0];
+        for (size_t i = 0; i < fvars.size(); i++) {
+            offsets[i] = fvars[i] - res.fptrs.base;
         }
-        clone_idxs++;
-        res.fptrs.nclones = tag_len;
-        res.fptrs.clone_offsets = clone_offsets;
-        res.fptrs.clone_idxs = clone_idxs;
+        res.fptrs.offsets = offsets;
+        res.fptrs.noffsets = fvars.size();
     }
-    // Do relocation
-    uint32_t reloc_i = 0;
-    uint32_t len = jl_sysimg_val_mask & tag_len;
-    for (uint32_t i = 0; i < len; i++) {
-        uint32_t idx = clone_idxs[i];
-        int32_t offset;
-        if (clone_all) {
-            offset = offsets[idx];
-        }
-        else if (idx & jl_sysimg_tag_mask) {
-            idx = idx & jl_sysimg_val_mask;
-            offset = clone_offsets[i];
-        }
-        else {
-            continue;
-        }
-        bool found = false;
-        for (; reloc_i < nreloc; reloc_i++) {
-            auto reloc_idx = ((const uint32_t*)reloc_slots)[reloc_i * 2];
-            if (reloc_idx == idx) {
-                found = true;
-                auto slot = (const void**)(data_base + reloc_slots[reloc_i * 2 + 1]);
-                assert(slot);
-                *slot = offset + text_base;
-            }
-            else if (reloc_idx > idx) {
-                break;
-            }
+
+    if (!gvars.empty()) {
+        auto offsets = (int32_t *) malloc(sizeof(int32_t) * gvars.size());
+        res.gvars_base = (uintptr_t *)gvars[0];
+        for (size_t i = 0; i < gvars.size(); i++) {
+            offsets[i] = gvars[i] - (const char *)res.gvars_base;
         }
-        assert(found && "Cannot find GOT entry for cloned function.");
-        (void)found;
+        res.gvars_offsets = offsets;
     }
 
-    std::cout << "Finished relocation\n";
-
-    res.fptrs.base = text_base;
-    res.fptrs.offsets = offsets;
-    res.gvars_base = (uintptr_t *)data_base;
-    res.gvars_offsets = shard.gvar_offsets;
-    res.gvars_offsets += 1;
+    if (!clones.empty()) {
+        std::sort(clones.begin(), clones.end());
+        auto clone_offsets = (int32_t *) malloc(sizeof(int32_t) * clones.size());
+        auto clone_idxs = (uint32_t *) malloc(sizeof(uint32_t) * clones.size());
+        for (size_t i = 0; i < clones.size(); i++) {
+            clone_idxs[i] = clones[i].first;
+            clone_offsets[i] = clones[i].second - res.fptrs.base;
+        }
+        res.fptrs.clone_idxs = clone_idxs;
+        res.fptrs.clone_offsets = clone_offsets;
+        res.fptrs.nclones = clones.size();
+    }
 
 #ifdef _OS_WINDOWS_
     res.base = (intptr_t)hdl;
@@ -760,8 +796,6 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback)
     }
 #endif
 
-    std::cout << "Starting ptls\n";
-
     {
         void *pgcstack_func_slot = pointers->ptls->pgcstack_func_slot;
         void *pgcstack_key_slot = pointers->ptls->pgcstack_key_slot;

From 3915101dc65d3d0844cf8e0f5d5a1e39ddf97407 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Fri, 6 Jan 2023 19:21:47 -0500
Subject: [PATCH 08/34] Multithreaded image builder

---
 src/aotcompile.cpp           | 729 +++++++++++++++++++++++++++++------
 src/llvm-codegen-shared.h    | 152 ++++++++
 src/llvm-multiversioning.cpp | 155 --------
 src/processor.cpp            |   7 +-
 4 files changed, 764 insertions(+), 279 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 5873c1ca56477..8ef715235fb04 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -51,6 +51,7 @@
 // for outputting code
 #include <llvm/Bitcode/BitcodeWriter.h>
 #include <llvm/Bitcode/BitcodeWriterPass.h>
+#include <llvm/Bitcode/BitcodeReader.h>
 #include "llvm/Object/ArchiveWriter.h"
 #include <llvm/IR/IRPrintingPasses.h>
 
@@ -74,19 +75,13 @@ STATISTIC(CreateNativeMethods, "Number of methods compiled for jl_create_native"
 STATISTIC(CreateNativeMax, "Max number of methods compiled at once for jl_create_native");
 STATISTIC(CreateNativeGlobals, "Number of globals compiled for jl_create_native");
 
-template<class T> // for GlobalObject's
-static T *addComdat(T *G)
+static void addComdat(GlobalValue *G, Triple &T)
 {
-#if defined(_OS_WINDOWS_)
-    if (!G->isDeclaration()) {
+    if (T.isOSBinFormatCOFF() && !G->isDeclaration()) {
         // add __declspec(dllexport) to everything marked for export
-        if (G->getLinkage() == GlobalValue::ExternalLinkage)
-            G->setDLLStorageClass(GlobalValue::DLLExportStorageClass);
-        else
-            G->setDLLStorageClass(GlobalValue::DefaultStorageClass);
+        assert(G->hasExternalLinkage() && "Cannot set DLLExport on non-external linkage!");
+        G->setDLLStorageClass(GlobalValue::DLLExportStorageClass);
     }
-#endif
-    return G;
 }
 
 
@@ -472,15 +467,6 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm
     return (void*)data;
 }
 
-
-static void emit_result(std::vector<NewArchiveMember> &Archive, SmallVectorImpl<char> &OS,
-        StringRef Name, std::vector<std::string> &outputs)
-{
-    outputs.push_back({ OS.data(), OS.size() });
-    Archive.push_back(NewArchiveMember(MemoryBufferRef(outputs.back(), Name)));
-    OS.clear();
-}
-
 static object::Archive::Kind getDefaultForHost(Triple &triple)
 {
       if (triple.isOSDarwin())
@@ -572,6 +558,584 @@ static GlobalVariable *emit_image_header(Module &M, unsigned threads, unsigned n
     return header_gv;
 }
 
+struct Partition {
+    StringSet<> globals;
+    StringMap<unsigned> fvars;
+    StringMap<unsigned> gvars;
+    size_t weight;
+};
+
+static void get_fvars_gvars(Module &M, DenseMap<GlobalValue *, unsigned> &fvars, DenseMap<GlobalValue *, unsigned> &gvars) {
+    auto fvars_gv = M.getGlobalVariable("jl_fvars");
+    auto gvars_gv = M.getGlobalVariable("jl_gvars");
+    assert(fvars_gv);
+    assert(gvars_gv);
+    auto fvars_init = cast<ConstantArray>(fvars_gv->getInitializer());
+    auto gvars_init = cast<ConstantArray>(gvars_gv->getInitializer());
+    std::string suffix;
+    if (auto md = M.getModuleFlag("julia.mv.suffix")) {
+        suffix = cast<MDString>(md)->getString().str();
+    }
+    auto fvars_idxs = M.getGlobalVariable("jl_fvar_idxs");
+    auto gvars_idxs = M.getGlobalVariable("jl_gvar_idxs");
+    assert(fvars_idxs);
+    assert(gvars_idxs);
+    auto fvars_idxs_init = cast<ConstantDataArray>(fvars_idxs->getInitializer());
+    auto gvars_idxs_init = cast<ConstantDataArray>(gvars_idxs->getInitializer());
+    for (unsigned i = 0; i < fvars_init->getNumOperands(); ++i) {
+        auto gv = cast<GlobalValue>(fvars_init->getOperand(i)->stripPointerCasts());
+        auto idx = fvars_idxs_init->getElementAsInteger(i);
+        fvars[gv] = idx;
+    }
+    for (unsigned i = 0; i < gvars_init->getNumOperands(); ++i) {
+        auto gv = cast<GlobalValue>(gvars_init->getOperand(i)->stripPointerCasts());
+        auto idx = gvars_idxs_init->getElementAsInteger(i);
+        gvars[gv] = idx;
+    }
+    fvars_gv->eraseFromParent();
+    gvars_gv->eraseFromParent();
+    fvars_idxs->eraseFromParent();
+    gvars_idxs->eraseFromParent();
+}
+
+static size_t getFunctionWeight(const Function &F)
+{
+    size_t weight = 1;
+    for (const BasicBlock &BB : F) {
+        weight += BB.size();
+    }
+    // more basic blocks = more complex than just sum of insts,
+    // add some weight to it
+    weight += F.size();
+    if (F.hasFnAttribute("julia.mv.clones")) {
+        weight *= F.getFnAttribute("julia.mv.clones").getValueAsString().count(',') + 1;
+    }
+    return weight;
+}
+
+
+static bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M) {
+    StringMap<uint32_t> GVNames;
+    bool bad = false;
+    for (uint32_t i = 0; i < partitions.size(); i++) {
+        for (auto &name : partitions[i].globals) {
+            if (GVNames.count(name.getKey())) {
+                bad = true;
+                dbgs() << "Duplicate global name " << name.getKey() << " in partitions " << i << " and " << GVNames[name.getKey()] << "\n";
+            }
+            GVNames[name.getKey()] = i;
+        }
+        dbgs() << "partition: " << i << " fvars: " << partitions[i].fvars.size() << " gvars: " << partitions[i].gvars.size() << "\n";
+    }
+    for (auto &GV : M.globals()) {
+        if (GV.isDeclaration()) {
+            if (GVNames.count(GV.getName())) {
+                bad = true;
+                dbgs() << "Global " << GV.getName() << " is a declaration but is in partition " << GVNames[GV.getName()] << "\n";
+            }
+        } else {
+            if (!GVNames.count(GV.getName())) {
+                bad = true;
+                dbgs() << "Global " << GV << " not in any partition\n";
+            }
+            if (!GV.hasExternalLinkage()) {
+                bad = true;
+                dbgs() << "Global " << GV << " has non-external linkage " << GV.getLinkage() << " but is in partition " << GVNames[GV.getName()] << "\n";
+            }
+        }
+    }
+    return !bad;
+}
+
+// Chop a module up as equally as possible into threads partitions
+static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) {
+    //Start by stripping fvars and gvars, which helpfully removes their uses as well
+    DenseMap<GlobalValue *, unsigned> fvars, gvars;
+    get_fvars_gvars(M, fvars, gvars);
+
+    // Partition by union-find, since we only have def->use traversal right now
+    struct Partitioner {
+        struct Node {
+            GlobalValue *GV;
+            unsigned parent;
+            unsigned size;
+            size_t weight;
+        };
+        std::vector<Node> nodes;
+        DenseMap<GlobalValue *, unsigned> node_map;
+        unsigned merged;
+
+        unsigned make(GlobalValue *GV, size_t weight) {
+            unsigned idx = nodes.size();
+            nodes.push_back({GV, idx, 1, weight});
+            node_map[GV] = idx;
+            return idx;
+        }
+
+        unsigned find(unsigned idx) {
+            while (nodes[idx].parent != idx) {
+                nodes[idx].parent = nodes[nodes[idx].parent].parent;
+                idx = nodes[idx].parent;
+            }
+            return idx;
+        }
+
+        unsigned merge(unsigned x, unsigned y) {
+            x = find(x);
+            y = find(y);
+            if (x == y)
+                return x;
+            if (nodes[x].size < nodes[y].size)
+                std::swap(x, y);
+            nodes[y].parent = x;
+            nodes[x].size += nodes[y].size;
+            nodes[x].weight += nodes[y].weight;
+            merged++;
+            return x;
+        }
+    };
+
+    Partitioner partitioner;
+
+    for (auto &G : M.global_values()) {
+        if (G.isDeclaration())
+            continue;
+        if (isa<Function>(G)) {
+            partitioner.make(&G, getFunctionWeight(cast<Function>(G)));
+        } else {
+            partitioner.make(&G, 1);
+        }
+    }
+
+    // Merge all uses to go together into the same partition
+    for (unsigned i = 0; i < partitioner.nodes.size(); ++i) {
+        for (ConstantUses<GlobalValue> uses(partitioner.nodes[i].GV, M); !uses.done(); uses.next()) {
+            auto val = uses.get_info().val;
+            auto idx = partitioner.node_map.find(val);
+            assert(idx != partitioner.node_map.end());
+            partitioner.merge(i, idx->second);
+        }
+    }
+
+    SmallVector<Partition, 32> partitions(threads);
+    // always get the smallest partition first
+    auto pcomp = [](const Partition *p1, const Partition *p2) {
+        return p1->weight > p2->weight;
+    };
+    std::priority_queue<Partition *, std::vector<Partition *>, decltype(pcomp)> pq(pcomp);
+    for (unsigned i = 0; i < threads; ++i) {
+        pq.push(&partitions[i]);
+    }
+
+    // Assign the root of each partition to a partition, then assign its children to the same one
+    for (unsigned i = 0; i < partitioner.nodes.size(); ++i) {
+        auto root = partitioner.find(i);
+        if (partitioner.nodes[root].GV) {
+            auto &node = partitioner.nodes[root];
+            auto &P = *pq.top();
+            pq.pop();
+            auto name = node.GV->getName();
+            P.globals.insert(name);
+            if (fvars.count(node.GV))
+                P.fvars[name] = fvars[node.GV];
+            if (gvars.count(node.GV))
+                P.gvars[name] = gvars[node.GV];
+            P.weight += node.weight;
+            node.GV = nullptr;
+            node.size = &P - partitions.data();
+            pq.push(&P);
+        }
+        if (root != i) {
+            auto &node = partitioner.nodes[i];
+            assert(node.GV != nullptr);
+            // we assigned its root already, so just add it to the root's partition
+            // don't touch the priority queue, since we're not changing the weight
+            auto &P = partitions[partitioner.nodes[root].size];
+            auto name = node.GV->getName();
+            P.globals.insert(name);
+            if (fvars.count(node.GV))
+                P.fvars[name] = fvars[node.GV];
+            if (gvars.count(node.GV))
+                P.gvars[name] = gvars[node.GV];
+            node.GV = nullptr;
+            node.size = partitioner.nodes[root].size;
+        }
+    }
+
+    assert(verify_partitioning(partitions, M) && "Partitioning failed to partition globals correctly");
+
+    return partitions;
+}
+
+// Compile one module and emit whichever artifacts were requested (any of:
+// unoptimized bitcode, optimized bitcode, object file, assembly).
+// `outputs` points into a pre-sized vector of std::string slots; each artifact
+// consumes two consecutive slots (payload bytes, then archive-member name) so
+// the MemoryBufferRefs handed to NewArchiveMember stay backed by stable storage.
+static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *outputs, StringRef name,
+                    NewArchiveMember *unopt, NewArchiveMember *opt, NewArchiveMember *obj, NewArchiveMember *asm_) {
+    // TargetMachine is not safe to share between threads; clone one per call.
+    auto TM = std::unique_ptr<TargetMachine>(
+        SourceTM.getTarget().createTargetMachine(
+            SourceTM.getTargetTriple().str(),
+            SourceTM.getTargetCPU(),
+            SourceTM.getTargetFeatureString(),
+            SourceTM.Options,
+            SourceTM.getRelocationModel(),
+            SourceTM.getCodeModel(),
+            SourceTM.getOptLevel()));
+
+    if (unopt) {
+        raw_string_ostream OS(*outputs);
+        PassBuilder PB;
+        AnalysisManagers AM{*TM, PB, OptimizationLevel::O0};
+        ModulePassManager MPM;
+        MPM.addPass(BitcodeWriterPass(OS));
+        // BUGFIX: the pass manager was constructed but never run, so the
+        // "_unopt.bc" payload was always empty.
+        MPM.run(M, AM.MAM);
+        outputs++;
+        *outputs = (name + "_unopt.bc").str();
+        *unopt = NewArchiveMember(MemoryBufferRef(OS.str(), *outputs));
+        outputs++;
+    }
+    if (!opt && !obj && !asm_) {
+        return;
+    }
+    assert(!verifyModule(M, &errs()));
+
+    uint64_t start = jl_hrtime();
+    uint64_t end = 0;
+
+#ifndef JL_USE_NEW_PM
+    legacy::PassManager optimizer;
+    addTargetPasses(&optimizer, TM->getTargetTriple(), TM->getTargetIRAnalysis());
+    addOptimizationPasses(&optimizer, jl_options.opt_level, true, true);
+    addMachinePasses(&optimizer, jl_options.opt_level);
+#else
+
+    // NewPM takes ownership of its TargetMachine, so give it its own clone
+    // rather than the one used for codegen below.
+    auto PMTM = std::unique_ptr<TargetMachine>(
+        SourceTM.getTarget().createTargetMachine(
+            SourceTM.getTargetTriple().str(),
+            SourceTM.getTargetCPU(),
+            SourceTM.getTargetFeatureString(),
+            SourceTM.Options,
+            SourceTM.getRelocationModel(),
+            SourceTM.getCodeModel(),
+            SourceTM.getOptLevel()));
+    NewPM optimizer{std::move(PMTM), getOptLevel(jl_options.opt_level), OptimizationOptions::defaults(true, true)};
+#endif
+    optimizer.run(M);
+    assert(!verifyModule(M, &errs()));
+
+    end = jl_hrtime();
+
+    dbgs() << "optimize time: " << (end - start) / 1e9 << "s\n";
+
+    if (opt) {
+        raw_string_ostream OS(*outputs);
+        PassBuilder PB;
+        AnalysisManagers AM{*TM, PB, OptimizationLevel::O0};
+        ModulePassManager MPM;
+        MPM.addPass(BitcodeWriterPass(OS));
+        // BUGFIX: run the writer pass (same omission as the unopt branch above).
+        MPM.run(M, AM.MAM);
+        outputs++;
+        *outputs = (name + "_opt.bc").str();
+        *opt = NewArchiveMember(MemoryBufferRef(OS.str(), *outputs));
+        outputs++;
+    }
+
+    start = jl_hrtime();
+
+    if (obj) {
+        SmallVector<char, 0> Buffer;
+        raw_svector_ostream OS(Buffer);
+        legacy::PassManager emitter;
+        addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis());
+        if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_ObjectFile, false))
+            jl_safe_printf("ERROR: target does not support generation of object files\n");
+        emitter.run(M);
+        // Copy the object bytes into the payload slot, then record the member
+        // name; outputs[-1] refers back to the payload just stored.
+        *outputs = { Buffer.data(), Buffer.size() };
+        outputs++;
+        *outputs = (name + ".o").str();
+        *obj = NewArchiveMember(MemoryBufferRef(outputs[-1], *outputs));
+        outputs++;
+    }
+
+    end = jl_hrtime();
+
+    dbgs() << "codegen time: " << (end - start) / 1e9 << "s\n";
+
+    if (asm_) {
+        SmallVector<char, 0> Buffer;
+        raw_svector_ostream OS(Buffer);
+        legacy::PassManager emitter;
+        addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis());
+        if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_AssemblyFile, false))
+            jl_safe_printf("ERROR: target does not support generation of assembly files\n");
+        emitter.run(M);
+        *outputs = { Buffer.data(), Buffer.size() };
+        outputs++;
+        *outputs = (name + ".s").str();
+        *asm_ = NewArchiveMember(MemoryBufferRef(outputs[-1], *outputs));
+        outputs++;
+    }
+}
+
+// Serialize M to an in-memory bitcode blob (module + symbol table + string
+// table) so each worker thread can lazily re-parse its own private copy of the
+// module inside a fresh LLVMContext.
+static auto serializeModule(const Module &M) {
+    SmallVector<char, 0> ClonedModuleBuffer;
+    BitcodeWriter BCWriter(ClonedModuleBuffer);
+    BCWriter.writeModule(M);
+    BCWriter.writeSymtab();
+    BCWriter.writeStrtab();
+    return ClonedModuleBuffer;
+}
+
+// Strip a lazily-loaded clone of the full module down to just the globals owned
+// by `partition`: preserved definitions are materialized, everything else is
+// turned into an external declaration so some other shard can define it.
+static void materializePreserved(Module &M, Partition &partition) {
+    // Partitions identify their globals by name (names were assigned to all
+    // anonymous definitions before partitioning).
+    DenseSet<GlobalValue *> Preserve;
+    for (auto &GV : M.global_values()) {
+        if (!GV.isDeclaration()) {
+            if (partition.globals.count(GV.getName())) {
+                Preserve.insert(&GV);
+            }
+        }
+    }
+    // Functions not in this partition lose their bodies and become extern.
+    for (auto &F : M.functions()) {
+        if (!F.isDeclaration()) {
+            if (!Preserve.contains(&F)) {
+                F.deleteBody();
+                F.setLinkage(GlobalValue::ExternalLinkage);
+            }
+        }
+    }
+    // Likewise, non-preserved globals lose their initializers.
+    for (auto &GV : M.globals()) {
+        if (!GV.isDeclaration()) {
+            if (!Preserve.contains(&GV)) {
+                GV.setInitializer(nullptr);
+                GV.setLinkage(GlobalValue::ExternalLinkage);
+            }
+        }
+    }
+    // An alias must always have an aliasee (it cannot be a bare declaration),
+    // so replace each non-preserved alias with an extern declaration of the
+    // same value type, spliced in below after materialization.
+    SmallVector<std::pair<GlobalAlias *, GlobalValue *>> DeletedAliases;
+    for (auto &GA : M.aliases()) {
+        if (!GA.isDeclaration()) {
+            if (!Preserve.contains(&GA)) {
+                if (GA.getValueType()->isFunctionTy()) {
+                    DeletedAliases.push_back({ &GA, Function::Create(cast<FunctionType>(GA.getValueType()), GlobalValue::ExternalLinkage, "", &M) });
+                } else {
+                    DeletedAliases.push_back({ &GA, new GlobalVariable(M, GA.getValueType(), false, GlobalValue::ExternalLinkage, nullptr) });
+                }
+            }
+        }
+    }
+    cantFail(M.materializeAll());
+    // Now that everything is materialized, swap the replacement declarations in
+    // for the deleted aliases (taking over their names and uses).
+    for (auto &Deleted : DeletedAliases) {
+        Deleted.second->takeName(Deleted.first);
+        Deleted.first->replaceAllUsesWith(Deleted.second);
+        Deleted.first->eraseFromParent();
+    }
+}
+
+// Emit this shard's jl_fvars/jl_gvars offset tables plus the index arrays
+// (jl_fvar_idxs / jl_gvar_idxs) that map the shard's entries back to their
+// positions in the image-wide fvar/gvar tables.
+static void construct_vars(Module &M, Partition &partition) {
+    // Collect (original index, function) pairs; every fvar recorded for this
+    // partition must have a definition here.
+    std::vector<std::pair<uint32_t, GlobalValue *>> fvar_pairs;
+    fvar_pairs.reserve(partition.fvars.size());
+    for (auto &fvar : partition.fvars) {
+        auto F = M.getFunction(fvar.first());
+        assert(F);
+        assert(!F->isDeclaration());
+        fvar_pairs.push_back({ fvar.second, F });
+    }
+    std::vector<GlobalValue *> fvars;
+    std::vector<uint32_t> fvar_idxs;
+    fvars.reserve(fvar_pairs.size());
+    fvar_idxs.reserve(fvar_pairs.size());
+    // Sort by original index so jl_fvar_idxs comes out monotonically increasing.
+    std::sort(fvar_pairs.begin(), fvar_pairs.end());
+    for (auto &fvar : fvar_pairs) {
+        fvars.push_back(fvar.second);
+        fvar_idxs.push_back(fvar.first);
+    }
+    // Same treatment for the global variables.
+    std::vector<std::pair<uint32_t, GlobalValue *>> gvar_pairs;
+    gvar_pairs.reserve(partition.gvars.size());
+    for (auto &gvar : partition.gvars) {
+        auto GV = M.getGlobalVariable(gvar.first());
+        assert(GV);
+        assert(!GV->isDeclaration());
+        gvar_pairs.push_back({ gvar.second, GV });
+    }
+    std::vector<GlobalValue *> gvars;
+    std::vector<uint32_t> gvar_idxs;
+    gvars.reserve(gvar_pairs.size());
+    gvar_idxs.reserve(gvar_pairs.size());
+    std::sort(gvar_pairs.begin(), gvar_pairs.end());
+    for (auto &gvar : gvar_pairs) {
+        gvars.push_back(gvar.second);
+        gvar_idxs.push_back(gvar.first);
+    }
+
+    // Now commit the fvars, gvars, and idxs
+    auto T_psize = M.getDataLayout().getIntPtrType(M.getContext())->getPointerTo();
+    emit_offset_table(M, fvars, "jl_fvars", T_psize);
+    emit_offset_table(M, gvars, "jl_gvars", T_psize);
+    auto fidxs = ConstantDataArray::get(M.getContext(), fvar_idxs);
+    auto fidxs_var = new GlobalVariable(M, fidxs->getType(), true,
+                                        GlobalVariable::ExternalLinkage,
+                                        fidxs, "jl_fvar_idxs");
+    fidxs_var->setVisibility(GlobalValue::HiddenVisibility);
+    auto gidxs = ConstantDataArray::get(M.getContext(), gvar_idxs);
+    auto gidxs_var = new GlobalVariable(M, gidxs->getType(), true,
+                                        GlobalVariable::ExternalLinkage,
+                                        gidxs, "jl_gvar_idxs");
+    gidxs_var->setVisibility(GlobalValue::HiddenVisibility);
+}
+
+// Erase declarations that no longer have any uses in this shard; declarations
+// that are still used will be resolved against other shards at link time, so
+// they must not remain dso_local or hidden.
+static void dropUnusedDeclarations(Module &M) {
+    // Collect first, erase after: erasing while iterating global_values()
+    // would invalidate the iterator.
+    SmallVector<GlobalValue *> unused;
+    for (auto &G : M.global_values()) {
+        if (G.isDeclaration()) {
+            if (G.use_empty()) {
+                unused.push_back(&G);
+            } else {
+                G.setDSOLocal(false); // These are never going to be seen in the same module again
+                G.setVisibility(GlobalValue::DefaultVisibility);
+            }
+        }
+    }
+    for (auto &G : unused)
+        G->eraseFromParent();
+}
+
+// Emit the requested artifacts for module M, optionally sharded across
+// `threads` worker threads. Output slots in `outputs`/`unopt`/`opt`/`obj`/
+// `asm_` are resized up-front so raw pointers into them stay valid while the
+// workers run.
+static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &outputs, StringRef name,
+                std::vector<NewArchiveMember> &unopt, std::vector<NewArchiveMember> &opt,
+                std::vector<NewArchiveMember> &obj, std::vector<NewArchiveMember> &asm_,
+                bool unopt_out, bool opt_out, bool obj_out, bool asm_out,
+                unsigned threads) {
+    uint64_t start = 0, end = 0;
+    // Each requested artifact consumes two string slots per shard (payload + name).
+    unsigned outcount = unopt_out + opt_out + obj_out + asm_out;
+    assert(outcount);
+    outputs.resize(outputs.size() + outcount * threads * 2);
+    unopt.resize(unopt.size() + unopt_out * threads);
+    opt.resize(opt.size() + opt_out * threads);
+    obj.resize(obj.size() + obj_out * threads);
+    asm_.resize(asm_.size() + asm_out * threads);
+    // Single-threaded fast path: no partitioning or serialization needed.
+    if (threads == 1) {
+        start = jl_hrtime();
+        add_output_impl(M, TM, outputs.data() + outputs.size() - outcount * 2, name,
+                        unopt_out ? unopt.data() + unopt.size() - 1 : nullptr,
+                        opt_out ? opt.data() + opt.size() - 1 : nullptr,
+                        obj_out ? obj.data() + obj.size() - 1 : nullptr,
+                        asm_out ? asm_.data() + asm_.size() - 1 : nullptr);
+        end = jl_hrtime();
+        dbgs() << "Time to add output: " << (end - start) / 1e9 << "s\n";
+        return;
+    }
+    
+    start = jl_hrtime();
+    // Partitions track globals by name, so give every anonymous definition a
+    // unique name before partitioning.
+    uint64_t counter = 0;
+    for (auto &G : M.global_values()) {
+        if (!G.isDeclaration() && !G.hasName()) {
+            G.setName("jl_ext_" + Twine(counter++));
+        }
+    }
+    auto partitions = partitionModule(M, threads);
+    end = jl_hrtime();
+    dbgs() << "Time to partition module: " << (end - start) / 1e9 << "s\n";
+    start = jl_hrtime();
+    // Serialize once; each worker deserializes its own copy in its own context.
+    auto serialized = serializeModule(M);
+    end = jl_hrtime();
+    dbgs() << "Time to serialize module: " << (end - start) / 1e9 << "s\n";
+
+    // Base pointers for each shard's output slots; the vectors were resized
+    // above, so these stay valid for the lifetime of the workers.
+    auto outstart = outputs.data() + outputs.size() - outcount * threads * 2;
+    auto unoptstart = unopt_out ? unopt.data() + unopt.size() - threads : nullptr;
+    auto optstart = opt_out ? opt.data() + opt.size() - threads : nullptr;
+    auto objstart = obj_out ? obj.data() + obj.size() - threads : nullptr;
+    auto asmstart = asm_out ? asm_.data() + asm_.size() - threads : nullptr;
+
+    std::vector<std::thread> workers(threads);
+    for (unsigned i = 0; i < threads; i++) {
+        // Capture i by value (it changes each iteration); everything else by ref.
+        workers[i] = std::thread([&, i](){
+            LLVMContext ctx;
+            uint64_t start = 0;
+            uint64_t end = 0;
+            start = jl_hrtime();
+            auto M = cantFail(getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx), "Error loading module");
+            end = jl_hrtime();
+            dbgs() << "Deserialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
+
+            dbgs() << "Starting shard " << i << " with weight=" << partitions[i].weight << "\n";
+
+            start = jl_hrtime();
+            materializePreserved(*M, partitions[i]);
+            end = jl_hrtime();
+            dbgs() << "Materialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
+            
+            start = jl_hrtime();
+            construct_vars(*M, partitions[i]);
+            // Per-shard suffix so downstream passes emit distinct symbol names.
+            M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), "_" + std::to_string(i)));
+            end = jl_hrtime();
+
+            dbgs() << "Construction time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
+
+            start = jl_hrtime();
+            dropUnusedDeclarations(*M);
+            end = jl_hrtime();
+
+            dbgs() << "Declaration deletion time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
+
+            start = jl_hrtime();
+            add_output_impl(*M, TM, outstart + i * outcount * 2, name,
+                            unoptstart ? unoptstart + i : nullptr,
+                            optstart ? optstart + i : nullptr,
+                            objstart ? objstart + i : nullptr,
+                            asmstart ? asmstart + i : nullptr);
+            end = jl_hrtime();
+
+            dbgs() << "Output time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
+        });
+    }
+
+    start = jl_hrtime();
+    for (auto &w : workers)
+        w.join();
+    end = jl_hrtime();
+
+    dbgs() << "Total time for parallel output: " << (end - start) / 1e9 << "s\n";
+}
+
+// Decide how many threads to use for native image output, based on hardware
+// concurrency, an estimate of module weight vs available memory, and the
+// JULIA_IMAGE_THREADS environment variable override.
+unsigned compute_image_thread_count(Module &M) {
+    // 32-bit systems are very memory-constrained
+#ifdef _P32
+    return 1;
+#endif
+    // Default: half the hardware threads, at least one.
+    unsigned threads = std::max(llvm::hardware_concurrency().compute_thread_count() / 2, 1u);
+
+    // memory limit check
+    // many threads use a lot of memory, so limit on constrained memory systems
+    size_t available = uv_get_available_memory();
+    size_t weight = 0;
+    for (auto &GV : M.global_values()) {
+        if (GV.isDeclaration())
+            continue;
+        if (isa<Function>(GV)) {
+            weight += getFunctionWeight(cast<Function>(GV));
+        } else {
+            // Non-function globals count a flat weight of 1.
+            weight += 1;
+        }
+    }
+    if (weight == 0) {
+        dbgs() << "No globals in module, using 1 thread\n";
+        return 1;
+    }
+    // crude estimate, available / (weight * fudge factor) = max threads
+    size_t fudge = 10;
+    unsigned max_threads = std::max(available / (weight * fudge), (size_t)1);
+    dbgs() << "Weight: " << weight << ", available: " << available << ", wanted: " << threads << ", max threads: " << max_threads << "\n";
+    threads = std::min(threads, max_threads);
+
+    // environment variable override
+    // NOTE(review): the override is taken as-is, with no upper cap — confirm
+    // that exceeding hardware/memory limits here is intentional.
+    const char *env_threads = getenv("JULIA_IMAGE_THREADS");
+    if (env_threads) {
+        char *endptr;
+        unsigned long requested = strtoul(env_threads, &endptr, 10);
+        if (*endptr || !requested) {
+            jl_safe_printf("WARNING: invalid value '%s' for JULIA_IMAGE_THREADS\n", env_threads);
+        } else {
+            threads = requested;
+        }
+    }
+
+    return threads;
+}
+
 // takes the running content that has collected in the shadow module and dump it to disk
 // this builds the object file portion of the sysimage files for fast startup
 extern "C" JL_DLLEXPORT
@@ -584,6 +1148,11 @@ void jl_dump_native_impl(void *native_code,
     uint64_t end = 0;
     JL_TIMING(NATIVE_DUMP);
     jl_native_code_desc_t *data = (jl_native_code_desc_t*)native_code;
+    if (!bc_fname && !unopt_bc_fname && !obj_fname && !asm_fname) {
+        dbgs() << "No output requested, skipping native code dump?\n";
+        delete data;
+        return;
+    }
     auto TSCtx = data->M.getContext();
     auto lock = TSCtx.getLock();
     LLVMContext &Context = *TSCtx.getContext();
@@ -646,7 +1215,7 @@ void jl_dump_native_impl(void *native_code,
 
     start = jl_hrtime();
 
-    unsigned threads = 1;
+    unsigned threads = compute_image_thread_count(*dataM);
     unsigned nfvars = 0;
     unsigned ngvars = 0;
 
@@ -693,7 +1262,7 @@ void jl_dump_native_impl(void *native_code,
                                      true,
                                      GlobalVariable::ExternalLinkage,
                                      jlRTLD_DEFAULT_var,
-                                     "jl_RTLD_DEFAULT_handle_pointer"));
+                                     "jl_RTLD_DEFAULT_handle_pointer"), TheTriple);
     }
 
     end = jl_hrtime();
@@ -702,101 +1271,14 @@ void jl_dump_native_impl(void *native_code,
 
     start = jl_hrtime();
 
-    // do the actual work
-    auto add_output = [&] (Module &M, StringRef unopt_bc_Name, StringRef bc_Name, StringRef obj_Name, StringRef asm_Name) {
-
-        auto TM = std::unique_ptr<TargetMachine>(
-            SourceTM->getTarget().createTargetMachine(
-                SourceTM->getTargetTriple().str(),
-                SourceTM->getTargetCPU(),
-                SourceTM->getTargetFeatureString(),
-                SourceTM->Options,
-                SourceTM->getRelocationModel(),
-                SourceTM->getCodeModel(),
-                SourceTM->getOptLevel()));
-
-        if (unopt_bc_fname) {
-            SmallVector<char, 0> Buffer;
-            raw_svector_ostream OS(Buffer);
-            PassBuilder PB;
-            AnalysisManagers AM{*TM, PB, OptimizationLevel::O0};
-            ModulePassManager MPM;
-            MPM.addPass(BitcodeWriterPass(OS));
-            emit_result(unopt_bc_Archive, Buffer, unopt_bc_Name, outputs);
-        }
-        if (!bc_fname && !obj_fname && !asm_fname) {
-            return;
-        }
-        assert(!verifyModule(M, &errs()));
-
-        uint64_t start = jl_hrtime();
-        end = 0;
-
-#ifndef JL_USE_NEW_PM
-        legacy::PassManager optimizer;
-        addTargetPasses(&optimizer, TM->getTargetTriple(), TM->getTargetIRAnalysis());
-        addOptimizationPasses(&optimizer, jl_options.opt_level, true, true);
-        addMachinePasses(&optimizer, jl_options.opt_level);
-#else
-
-        auto PMTM = std::unique_ptr<TargetMachine>(
-            SourceTM->getTarget().createTargetMachine(
-                SourceTM->getTargetTriple().str(),
-                SourceTM->getTargetCPU(),
-                SourceTM->getTargetFeatureString(),
-                SourceTM->Options,
-                SourceTM->getRelocationModel(),
-                SourceTM->getCodeModel(),
-                SourceTM->getOptLevel()));
-        NewPM optimizer{std::move(PMTM), getOptLevel(jl_options.opt_level), OptimizationOptions::defaults(true, true)};
-#endif
-        optimizer.run(M);
-        assert(!verifyModule(M, &errs()));
-
-        end = jl_hrtime();
-
-        dbgs() << "optimize time: " << (end - start) / 1e9 << "s\n";
-
-        if (bc_fname) {
-            SmallVector<char, 0> Buffer;
-            raw_svector_ostream OS(Buffer);
-            PassBuilder PB;
-            AnalysisManagers AM{*TM, PB, OptimizationLevel::O0};
-            ModulePassManager MPM;
-            MPM.addPass(BitcodeWriterPass(OS));
-            emit_result(bc_Archive, Buffer, bc_Name, outputs);
-        }
-
-        start = jl_hrtime();
-
-        if (obj_fname) {
-            SmallVector<char, 0> Buffer;
-            raw_svector_ostream OS(Buffer);
-            legacy::PassManager emitter;
-            addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis());
-            if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_ObjectFile, false))
-                jl_safe_printf("ERROR: target does not support generation of object files\n");
-            emitter.run(M);
-            emit_result(obj_Archive, Buffer, obj_Name, outputs);
-        }
-
-        end = jl_hrtime();
-
-        dbgs() << "codegen time: " << (end - start) / 1e9 << "s\n";
-
-        if (asm_fname) {
-            SmallVector<char, 0> Buffer;
-            raw_svector_ostream OS(Buffer);
-            legacy::PassManager emitter;
-            addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis());
-            if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_AssemblyFile, false))
-                jl_safe_printf("ERROR: target does not support generation of assembly files\n");
-            emitter.run(M);
-            emit_result(asm_Archive, Buffer, asm_Name, outputs);
-        }
-    };
-
-    add_output(*dataM, "unopt.bc", "text.bc", "text.o", "text.s");
+    auto compile = [&](Module &M, StringRef name, unsigned threads) { add_output(
+            M, *SourceTM, outputs, name,
+            unopt_bc_Archive, bc_Archive, obj_Archive, asm_Archive,
+            !!unopt_bc_fname, !!bc_fname, !!obj_fname, !!asm_fname,
+            threads
+    ); };
+    
+    compile(*dataM, "text", threads);
 
     end = jl_hrtime();
 
@@ -804,8 +1286,7 @@ void jl_dump_native_impl(void *native_code,
 
     start = jl_hrtime();
 
-    orc::ThreadSafeModule sysimage(std::make_unique<Module>("sysimage", Context), TSCtx);
-    auto sysimageM = sysimage.getModuleUnlocked();
+    auto sysimageM = std::make_unique<Module>("sysimage", Context);
     sysimageM->setTargetTriple(dataM->getTargetTriple());
     sysimageM->setDataLayout(dataM->getDataLayout());
 #if JL_LLVM_VERSION >= 130000
@@ -846,13 +1327,15 @@ void jl_dump_native_impl(void *native_code,
     if (sysimg_data) {
         Constant *data = ConstantDataArray::get(Context,
             ArrayRef<uint8_t>((const unsigned char*)sysimg_data, sysimg_len));
-        addComdat(new GlobalVariable(*sysimageM, data->getType(), false,
+        auto sysdata = new GlobalVariable(*sysimageM, data->getType(), false,
                                      GlobalVariable::ExternalLinkage,
-                                     data, "jl_system_image_data"))->setAlignment(Align(64));
+                                     data, "jl_system_image_data");
+        sysdata->setAlignment(Align(64));
+        addComdat(sysdata, TheTriple);
         Constant *len = ConstantInt::get(T_size, sysimg_len);
         addComdat(new GlobalVariable(*sysimageM, len->getType(), true,
                                      GlobalVariable::ExternalLinkage,
-                                     len, "jl_system_image_size"));
+                                     len, "jl_system_image_size"), TheTriple);
     }
     if (imaging_mode) {
         auto specs = jl_get_llvm_clone_targets();
@@ -886,13 +1369,13 @@ void jl_dump_native_impl(void *native_code,
                                                 ConstantExpr::getBitCast(target_ids, T_psize)
                                            }),
                                            "jl_image_pointers");
-        addComdat(pointers);
+        addComdat(pointers, TheTriple);
         if (s) {
             write_int32(s, data.size());
             ios_write(s, (const char *)data.data(), data.size());
         }
     }
-    add_output(*sysimageM, "data.bc", "data.bc", "data.o", "data.s");
+    compile(*sysimageM, "data", 1);
 
     end = jl_hrtime();
 
diff --git a/src/llvm-codegen-shared.h b/src/llvm-codegen-shared.h
index e0edb792d7645..732871b12ff23 100644
--- a/src/llvm-codegen-shared.h
+++ b/src/llvm-codegen-shared.h
@@ -449,4 +449,156 @@ inline Attribute getAttributeAtIndex(const AttributeList &L, unsigned Index, Att
     return L.getAttribute(Index, Kind);
 #endif
 }
+
+// Iterate through uses of a particular type.
+// Recursively scan through `ConstantExpr` and `ConstantAggregate` use.
+template<typename U>
+struct ConstantUses {
+    template<typename T>
+    struct Info {
+        llvm::Use *use;
+        T *val;
+        // If `samebits == true`, the offset the original value appears in the constant.
+        size_t offset;
+        // This specify whether the original value appears in the current value in exactly
+        // the same bit pattern (with possibly an offset determined by `offset`).
+        bool samebits;
+        Info(llvm::Use *use, T *val, size_t offset, bool samebits) :
+            use(use),
+            val(val),
+            offset(offset),
+            samebits(samebits)
+        {
+        }
+        Info(llvm::Use *use, size_t offset, bool samebits) :
+            use(use),
+            val(cast<T>(use->getUser())),
+            offset(offset),
+            samebits(samebits)
+        {
+        }
+    };
+    using UseInfo = Info<U>;
+    // One level of the recursive scan: a constant plus the position in its
+    // use list. `_next` caches the following use up front — NOTE(review):
+    // presumably so the current use may be replaced/erased by the caller
+    // while iterating; confirm against callers.
+    struct Frame : Info<llvm::Constant> {
+        template<typename... Args>
+        Frame(Args &&... args) :
+            Info<llvm::Constant>(std::forward<Args>(args)...),
+            cur(this->val->use_empty() ? nullptr : &*this->val->use_begin()),
+            _next(cur ? cur->getNext() : nullptr)
+        {
+        }
+    private:
+        void next()
+        {
+            cur = _next;
+            if (!cur)
+                return;
+            _next = cur->getNext();
+        }
+        llvm::Use *cur;
+        llvm::Use *_next;
+        friend struct ConstantUses;
+    };
+    // Seed the stack with a frame for `c` itself, then advance to the first
+    // use whose user is of type `U`.
+    ConstantUses(llvm::Constant *c, llvm::Module &M)
+        : stack{Frame(nullptr, c, 0u, true)},
+          M(M)
+    {
+        forward();
+    }
+    // Current use (user is guaranteed to be a `U` unless done()).
+    UseInfo get_info() const
+    {
+        auto &top = stack.back();
+        return UseInfo(top.cur, top.offset, top.samebits);
+    }
+    const auto &get_stack() const
+    {
+        return stack;
+    }
+    // Advance to the next use whose user is of type `U`.
+    void next()
+    {
+        stack.back().next();
+        forward();
+    }
+    // True once every (transitive) use has been visited.
+    bool done()
+    {
+        return stack.empty();
+    }
+private:
+    void forward();
+    llvm::SmallVector<Frame, 4> stack;
+    llvm::Module &M;
+};
+
+// Advance until the top-of-stack use's user is of type `U`, or the stack
+// empties. Constant aggregates and constant expressions encountered along the
+// way are pushed as new frames so their users are scanned recursively, while
+// tracking whether (and at what byte offset) the original bit pattern survives.
+template<typename U>
+void ConstantUses<U>::forward()
+{
+    assert(!stack.empty());
+    auto frame = &stack.back();
+    const auto &DL = M.getDataLayout();
+    // Drop the exhausted top frame; returns false when the whole scan is done.
+    auto pop = [&] {
+        stack.pop_back();
+        if (stack.empty()) {
+            return false;
+        }
+        frame = &stack.back();
+        return true;
+    };
+    auto push = [&] (llvm::Use *use, llvm::Constant *c, size_t offset, bool samebits) {
+        stack.emplace_back(use, c, offset, samebits);
+        frame = &stack.back();
+    };
+    // Aggregates keep `samebits` and accumulate the element's byte offset:
+    // struct layout offset, or element index * alloc size for arrays/vectors.
+    auto handle_constaggr = [&] (llvm::Use *use, llvm::ConstantAggregate *aggr) {
+        if (!frame->samebits) {
+            push(use, aggr, 0, false);
+            return;
+        }
+        if (auto strct = dyn_cast<llvm::ConstantStruct>(aggr)) {
+            auto layout = DL.getStructLayout(strct->getType());
+            push(use, strct, frame->offset + layout->getElementOffset(use->getOperandNo()), true);
+        }
+        else if (auto ary = dyn_cast<llvm::ConstantArray>(aggr)) {
+            auto elty = ary->getType()->getElementType();
+            push(use, ary, frame->offset + DL.getTypeAllocSize(elty) * use->getOperandNo(), true);
+        }
+        else if (auto vec = dyn_cast<llvm::ConstantVector>(aggr)) {
+            auto elty = vec->getType()->getElementType();
+            push(use, vec, frame->offset + DL.getTypeAllocSize(elty) * use->getOperandNo(), true);
+        }
+        else {
+            abort();
+        }
+    };
+    // Only pure bit-preserving casts keep `samebits`; any other expression
+    // (gep, arithmetic, ...) loses the known bit pattern.
+    auto handle_constexpr = [&] (llvm::Use *use, llvm::ConstantExpr *expr) {
+        if (!frame->samebits) {
+            push(use, expr, 0, false);
+            return;
+        }
+        auto opcode = expr->getOpcode();
+        if (opcode == llvm::Instruction::PtrToInt || opcode == llvm::Instruction::IntToPtr ||
+            opcode == llvm::Instruction::AddrSpaceCast || opcode == llvm::Instruction::BitCast) {
+            push(use, expr, frame->offset, true);
+        }
+        else {
+            push(use, expr, 0, false);
+        }
+    };
+    while (true) {
+        auto use = frame->cur;
+        if (!use) {
+            if (!pop())
+                return;
+            continue;
+        }
+        auto user = use->getUser();
+        // Stop with the frame positioned on a use whose user is a `U`.
+        if (isa<U>(user))
+            return;
+        frame->next();
+        if (auto aggr = dyn_cast<llvm::ConstantAggregate>(user)) {
+            handle_constaggr(use, aggr);
+        }
+        else if (auto expr = dyn_cast<llvm::ConstantExpr>(user)) {
+            handle_constexpr(use, expr);
+        }
+    }
+}
 }
diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index 44c83502e0537..a172579f8ae4b 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -64,160 +64,6 @@ Value *map_get(T &&vmap, Value *key, Value *def=nullptr)
     return val;
 }
 
-// Iterate through uses of a particular type.
-// Recursively scan through `ConstantExpr` and `ConstantAggregate` use.
-template<typename U>
-struct ConstantUses {
-    template<typename T>
-    struct Info {
-        Use *use;
-        T *val;
-        // If `samebits == true`, the offset the original value appears in the constant.
-        size_t offset;
-        // This specify whether the original value appears in the current value in exactly
-        // the same bit pattern (with possibly an offset determined by `offset`).
-        bool samebits;
-        Info(Use *use, T *val, size_t offset, bool samebits) :
-            use(use),
-            val(val),
-            offset(offset),
-            samebits(samebits)
-        {
-        }
-        Info(Use *use, size_t offset, bool samebits) :
-            use(use),
-            val(cast<T>(use->getUser())),
-            offset(offset),
-            samebits(samebits)
-        {
-        }
-    };
-    using UseInfo = Info<U>;
-    struct Frame : Info<Constant> {
-        template<typename... Args>
-        Frame(Args &&... args) :
-            Info<Constant>(std::forward<Args>(args)...),
-            cur(this->val->use_empty() ? nullptr : &*this->val->use_begin()),
-            _next(cur ? cur->getNext() : nullptr)
-        {
-        }
-    private:
-        void next()
-        {
-            cur = _next;
-            if (!cur)
-                return;
-            _next = cur->getNext();
-        }
-        Use *cur;
-        Use *_next;
-        friend struct ConstantUses;
-    };
-    ConstantUses(Constant *c, Module &M)
-        : stack{Frame(nullptr, c, 0u, true)},
-          M(M)
-    {
-        forward();
-    }
-    UseInfo get_info() const
-    {
-        auto &top = stack.back();
-        return UseInfo(top.cur, top.offset, top.samebits);
-    }
-    const SmallVector<Frame, 4> &get_stack() const
-    {
-        return stack;
-    }
-    void next()
-    {
-        stack.back().next();
-        forward();
-    }
-    bool done()
-    {
-        return stack.empty();
-    }
-private:
-    void forward();
-    SmallVector<Frame, 4> stack;
-    Module &M;
-};
-
-template<typename U>
-void ConstantUses<U>::forward()
-{
-    assert(!stack.empty());
-    auto frame = &stack.back();
-    const DataLayout &DL = M.getDataLayout();
-    auto pop = [&] {
-        stack.pop_back();
-        if (stack.empty()) {
-            return false;
-        }
-        frame = &stack.back();
-        return true;
-    };
-    auto push = [&] (Use *use, Constant *c, size_t offset, bool samebits) {
-        stack.emplace_back(use, c, offset, samebits);
-        frame = &stack.back();
-    };
-    auto handle_constaggr = [&] (Use *use, ConstantAggregate *aggr) {
-        if (!frame->samebits) {
-            push(use, aggr, 0, false);
-            return;
-        }
-        if (auto strct = dyn_cast<ConstantStruct>(aggr)) {
-            auto layout = DL.getStructLayout(strct->getType());
-            push(use, strct, frame->offset + layout->getElementOffset(use->getOperandNo()), true);
-        }
-        else if (auto ary = dyn_cast<ConstantArray>(aggr)) {
-            auto elty = ary->getType()->getElementType();
-            push(use, ary, frame->offset + DL.getTypeAllocSize(elty) * use->getOperandNo(), true);
-        }
-        else if (auto vec = dyn_cast<ConstantVector>(aggr)) {
-            auto elty = vec->getType()->getElementType();
-            push(use, vec, frame->offset + DL.getTypeAllocSize(elty) * use->getOperandNo(), true);
-        }
-        else {
-            jl_safe_printf("Unknown ConstantAggregate:\n");
-            llvm_dump(aggr);
-            abort();
-        }
-    };
-    auto handle_constexpr = [&] (Use *use, ConstantExpr *expr) {
-        if (!frame->samebits) {
-            push(use, expr, 0, false);
-            return;
-        }
-        auto opcode = expr->getOpcode();
-        if (opcode == Instruction::PtrToInt || opcode == Instruction::IntToPtr ||
-            opcode == Instruction::AddrSpaceCast || opcode == Instruction::BitCast) {
-            push(use, expr, frame->offset, true);
-        }
-        else {
-            push(use, expr, 0, false);
-        }
-    };
-    while (true) {
-        auto use = frame->cur;
-        if (!use) {
-            if (!pop())
-                return;
-            continue;
-        }
-        auto user = use->getUser();
-        if (isa<U>(user))
-            return;
-        frame->next();
-        if (auto aggr = dyn_cast<ConstantAggregate>(user)) {
-            handle_constaggr(use, aggr);
-        }
-        else if (auto expr = dyn_cast<ConstantExpr>(user)) {
-            handle_constexpr(use, expr);
-        }
-    }
-}
-
 static bool is_vector(FunctionType *ty)
 {
     if (ty->getReturnType()->isVectorTy())
@@ -574,7 +420,6 @@ void CloneCtx::prepare_slots()
             assert(F->hasFnAttribute("julia.mv.clones"));
             if (F->isDeclaration()) {
                 auto GV = new GlobalVariable(M, F->getType(), false, GlobalValue::ExternalLinkage, nullptr, F->getName() + ".reloc_slot");
-                GV->setVisibility(GlobalValue::HiddenVisibility);
                 extern_relocs[F] = GV;
             } else {
                 auto id = get_func_id(F);
diff --git a/src/processor.cpp b/src/processor.cpp
index 55b2cd2b4ab55..3a791778a3b21 100644
--- a/src/processor.cpp
+++ b/src/processor.cpp
@@ -647,7 +647,7 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback)
     std::vector<std::pair<uint32_t, const char *>> clones;
 
     for (unsigned i = 0; i < pointers->header->nshards; i++) {
-        auto shard = pointers->shards[0];
+        auto shard = pointers->shards[i];
 
         // .data base
         char *data_base = (char *)shard.gvar_base;
@@ -657,6 +657,7 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback)
 
         const int32_t *offsets = shard.fvar_offsets;
         uint32_t nfunc = offsets[0];
+        assert(nfunc <= pointers->header->nfvars);
         offsets++;
         const int32_t *reloc_slots = shard.clone_slots;
         const uint32_t nreloc = reloc_slots[0];
@@ -747,6 +748,7 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback)
 
         auto gidxs = shard.gvar_idxs;
         unsigned ngvars = shard.gvar_offsets[0];
+        assert(ngvars <= pointers->header->ngvars);
         for (uint32_t i = 0; i < ngvars; i++) {
             gvars[gidxs[i]] = data_base + shard.gvar_offsets[i+1];
         }
@@ -756,6 +758,7 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback)
         auto offsets = (int32_t *) malloc(sizeof(int32_t) * fvars.size());
         res.fptrs.base = fvars[0];
         for (size_t i = 0; i < fvars.size(); i++) {
+            assert(fvars[i] && "Missing function pointer!");
             offsets[i] = fvars[i] - res.fptrs.base;
         }
         res.fptrs.offsets = offsets;
@@ -766,12 +769,14 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback)
         auto offsets = (int32_t *) malloc(sizeof(int32_t) * gvars.size());
         res.gvars_base = (uintptr_t *)gvars[0];
         for (size_t i = 0; i < gvars.size(); i++) {
+            assert(gvars[i] && "Missing global variable pointer!");
             offsets[i] = gvars[i] - (const char *)res.gvars_base;
         }
         res.gvars_offsets = offsets;
     }
 
     if (!clones.empty()) {
+        assert(!fvars.empty());
         std::sort(clones.begin(), clones.end());
         auto clone_offsets = (int32_t *) malloc(sizeof(int32_t) * clones.size());
         auto clone_idxs = (uint32_t *) malloc(sizeof(uint32_t) * clones.size());

From 4ad943da621f5d696ebfdf853ac03aa742edec65 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Fri, 6 Jan 2023 20:19:12 -0500
Subject: [PATCH 09/34] Don't try to extract indexes during partitioning

---
 src/aotcompile.cpp | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 8ef715235fb04..85e7481b21722 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -568,34 +568,27 @@ struct Partition {
 static void get_fvars_gvars(Module &M, DenseMap<GlobalValue *, unsigned> &fvars, DenseMap<GlobalValue *, unsigned> &gvars) {
     auto fvars_gv = M.getGlobalVariable("jl_fvars");
     auto gvars_gv = M.getGlobalVariable("jl_gvars");
-    assert(fvars_gv);
-    assert(gvars_gv);
-    auto fvars_init = cast<ConstantArray>(fvars_gv->getInitializer());
-    auto gvars_init = cast<ConstantArray>(gvars_gv->getInitializer());
-    std::string suffix;
-    if (auto md = M.getModuleFlag("julia.mv.suffix")) {
-        suffix = cast<MDString>(md)->getString().str();
-    }
     auto fvars_idxs = M.getGlobalVariable("jl_fvar_idxs");
     auto gvars_idxs = M.getGlobalVariable("jl_gvar_idxs");
+    assert(fvars_gv);
+    assert(gvars_gv);
     assert(fvars_idxs);
     assert(gvars_idxs);
-    auto fvars_idxs_init = cast<ConstantDataArray>(fvars_idxs->getInitializer());
-    auto gvars_idxs_init = cast<ConstantDataArray>(gvars_idxs->getInitializer());
+    auto fvars_init = cast<ConstantArray>(fvars_gv->getInitializer());
+    auto gvars_init = cast<ConstantArray>(gvars_gv->getInitializer());
     for (unsigned i = 0; i < fvars_init->getNumOperands(); ++i) {
         auto gv = cast<GlobalValue>(fvars_init->getOperand(i)->stripPointerCasts());
-        auto idx = fvars_idxs_init->getElementAsInteger(i);
-        fvars[gv] = idx;
+        fvars[gv] = i;
     }
     for (unsigned i = 0; i < gvars_init->getNumOperands(); ++i) {
         auto gv = cast<GlobalValue>(gvars_init->getOperand(i)->stripPointerCasts());
-        auto idx = gvars_idxs_init->getElementAsInteger(i);
-        gvars[gv] = idx;
+        gvars[gv] = i;
     }
     fvars_gv->eraseFromParent();
     gvars_gv->eraseFromParent();
     fvars_idxs->eraseFromParent();
     gvars_idxs->eraseFromParent();
+    dbgs() << "Finished getting fvars/gvars\n";
 }
 
 static size_t getFunctionWeight(const Function &F)

From d717fa7023ab104290a136b955511c38e1416a4c Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Fri, 6 Jan 2023 20:28:00 -0500
Subject: [PATCH 10/34] Fix whitespace

---
 src/aotcompile.cpp           | 10 +++++-----
 src/llvm-multiversioning.cpp |  2 +-
 src/processor.cpp            |  2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 85e7481b21722..233e94bf13346 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -719,7 +719,7 @@ static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) {
     for (unsigned i = 0; i < threads; ++i) {
         pq.push(&partitions[i]);
     }
-    
+
     // Assign the root of each partition to a partition, then assign its children to the same one
     for (unsigned i = 0; i < partitioner.nodes.size(); ++i) {
         auto root = partitioner.find(i);
@@ -1011,7 +1011,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
         dbgs() << "Time to add output: " << (end - start) / 1e9 << "s\n";
         return;
     }
-    
+
     start = jl_hrtime();
     uint64_t counter = 0;
     for (auto &G : M.global_values()) {
@@ -1050,7 +1050,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
             materializePreserved(*M, partitions[i]);
             end = jl_hrtime();
             dbgs() << "Materialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
-            
+
             start = jl_hrtime();
             construct_vars(*M, partitions[i]);
             M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), "_" + std::to_string(i)));
@@ -1270,7 +1270,7 @@ void jl_dump_native_impl(void *native_code,
             !!unopt_bc_fname, !!bc_fname, !!obj_fname, !!asm_fname,
             threads
     ); };
-    
+
     compile(*dataM, "text", threads);
 
     end = jl_hrtime();
@@ -1389,7 +1389,7 @@ void jl_dump_native_impl(void *native_code,
     if (asm_fname)
         handleAllErrors(writeArchive(asm_fname, asm_Archive, true,
                     Kind, true, false), reportWriterError);
-    
+
     end = jl_hrtime();
 
     dbgs() << "archive time: " << (end - start) / 1e9 << "s\n";
diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index a172579f8ae4b..418201f0825a1 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -924,7 +924,7 @@ static bool runMultiVersioning(Module &M, bool allow_bad_fvars)
         return false;
 
     CloneCtx clone(M, allow_bad_fvars);
-    
+
     clone.prepare_slots();
 
     clone.clone_decls();
diff --git a/src/processor.cpp b/src/processor.cpp
index 3a791778a3b21..851cbec62560a 100644
--- a/src/processor.cpp
+++ b/src/processor.cpp
@@ -636,7 +636,7 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback)
 
     const void *ids = pointers->target_data;
     uint32_t target_idx = callback(ids);
-    
+
     if (pointers->header->version != 1) {
         jl_error("Image file is not compatible with this version of Julia");
     }

From fe0600d2dc74e1381ac3018fe12835fafc25d529 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Fri, 6 Jan 2023 20:38:39 -0500
Subject: [PATCH 11/34] Fix warnings

---
 src/aotcompile.cpp           | 4 ++--
 src/llvm-multiversioning.cpp | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 233e94bf13346..323577c693b51 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -606,8 +606,8 @@ static size_t getFunctionWeight(const Function &F)
     return weight;
 }
 
-
-static bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M) {
+//Inline to fool gcc into not complaining about unused function when asserts are disabled
+static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M) {
     StringMap<uint32_t> GVNames;
     bool bad = false;
     for (uint32_t i = 0; i < partitions.size(); i++) {
diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index 418201f0825a1..971b0f338bdf8 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -131,7 +131,8 @@ static uint32_t collect_func_info(Function &F, bool &has_veccall)
                 }
                 // Check for BFloat16 when they are added to julia can be done here
             }
-            if (has_veccall && (flag & JL_TARGET_CLONE_SIMD) && (flag & JL_TARGET_CLONE_MATH)) {
+            uint32_t veccall_flags = JL_TARGET_CLONE_SIMD | JL_TARGET_CLONE_MATH | JL_TARGET_CLONE_CPU | JL_TARGET_CLONE_FLOAT16;
+            if (has_veccall && (flag & veccall_flags) == veccall_flags) {
                 return flag;
             }
         }

From bdf65f4b4e8a1c8f1a058fe8fc69a6b4c56acf80 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Fri, 6 Jan 2023 21:01:56 -0500
Subject: [PATCH 12/34] Set reloc slot to be external linkage

---
 src/llvm-multiversioning.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index 971b0f338bdf8..1a38511f34ffb 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -424,7 +424,7 @@ void CloneCtx::prepare_slots()
                 extern_relocs[F] = GV;
             } else {
                 auto id = get_func_id(F);
-                auto GV = new GlobalVariable(M, F->getType(), false, GlobalValue::InternalLinkage, Constant::getNullValue(F->getType()), F->getName() + ".reloc_slot");
+                auto GV = new GlobalVariable(M, F->getType(), false, GlobalValue::ExternalLinkage, Constant::getNullValue(F->getType()), F->getName() + ".reloc_slot");
                 GV->setVisibility(GlobalValue::HiddenVisibility);
                 const_relocs[id] = GV;
             }

From 4fc5bed6b5fa58f056ee5ee87730bea2ac17fa8c Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Tue, 10 Jan 2023 00:41:53 -0500
Subject: [PATCH 13/34] Formalize printing more, correct module weight
 estimation with multiversioning

---
 src/aotcompile.cpp           | 75 +++++++++++++++++++++++++-----------
 src/llvm-multiversioning.cpp |  6 +--
 2 files changed, 55 insertions(+), 26 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 323577c693b51..701ecdfc925e8 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -601,7 +601,10 @@ static size_t getFunctionWeight(const Function &F)
     // add some weight to it
     weight += F.size();
     if (F.hasFnAttribute("julia.mv.clones")) {
-        weight *= F.getFnAttribute("julia.mv.clones").getValueAsString().count(',') + 1;
+        auto val = F.getFnAttribute("julia.mv.clones").getValueAsString();
+        // base16, so must be at most 4 * length bits long
+        // popcount gives number of clones
+        weight *= APInt(val.size() * 4, val, 16).countPopulation() + 1;
     }
     return weight;
 }
@@ -761,7 +764,8 @@ static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) {
 }
 
 static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *outputs, StringRef name,
-                    NewArchiveMember *unopt, NewArchiveMember *opt, NewArchiveMember *obj, NewArchiveMember *asm_) {
+                    NewArchiveMember *unopt, NewArchiveMember *opt, NewArchiveMember *obj, NewArchiveMember *asm_,
+                    std::stringstream &stream, unsigned i) {
     auto TM = std::unique_ptr<TargetMachine>(
         SourceTM.getTarget().createTargetMachine(
             SourceTM.getTargetTriple().str(),
@@ -814,7 +818,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out
 
     end = jl_hrtime();
 
-    dbgs() << "optimize time: " << (end - start) / 1e9 << "s\n";
+    stream << "optimize time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
 
     if (opt) {
         raw_string_ostream OS(*outputs);
@@ -847,7 +851,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out
 
     end = jl_hrtime();
 
-    dbgs() << "codegen time: " << (end - start) / 1e9 << "s\n";
+    stream << "codegen time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
 
     if (asm_) {
         SmallVector<char, 0> Buffer;
@@ -1002,11 +1006,14 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
     asm_.resize(asm_.size() + asm_out * threads);
     if (threads == 1) {
         start = jl_hrtime();
+        std::stringstream stream;
         add_output_impl(M, TM, outputs.data() + outputs.size() - outcount * 2, name,
                         unopt_out ? unopt.data() + unopt.size() - 1 : nullptr,
                         opt_out ? opt.data() + opt.size() - 1 : nullptr,
                         obj_out ? obj.data() + obj.size() - 1 : nullptr,
-                        asm_out ? asm_.data() + asm_.size() - 1 : nullptr);
+                        asm_out ? asm_.data() + asm_.size() - 1 : nullptr,
+                        stream, 0);
+        dbgs() << stream.str();
         end = jl_hrtime();
         dbgs() << "Time to add output: " << (end - start) / 1e9 << "s\n";
         return;
@@ -1034,6 +1041,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
     auto asmstart = asm_out ? asm_.data() + asm_.size() - threads : nullptr;
 
     std::vector<std::thread> workers(threads);
+    std::vector<std::stringstream> stderrs(threads);
     for (unsigned i = 0; i < threads; i++) {
         workers[i] = std::thread([&, i](){
             LLVMContext ctx;
@@ -1042,43 +1050,46 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
             start = jl_hrtime();
             auto M = cantFail(getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx), "Error loading module");
             end = jl_hrtime();
-            dbgs() << "Deserialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
+            stderrs[i] << "Deserialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
 
-            dbgs() << "Starting shard " << i << " with weight=" << partitions[i].weight << "\n";
+            stderrs[i] << "Starting shard " << i << " with weight=" << partitions[i].weight << "\n";
 
             start = jl_hrtime();
             materializePreserved(*M, partitions[i]);
             end = jl_hrtime();
-            dbgs() << "Materialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
+            stderrs[i] << "Materialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
 
             start = jl_hrtime();
             construct_vars(*M, partitions[i]);
             M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), "_" + std::to_string(i)));
             end = jl_hrtime();
 
-            dbgs() << "Construction time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
+            stderrs[i] << "Construction time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
 
             start = jl_hrtime();
             dropUnusedDeclarations(*M);
             end = jl_hrtime();
 
-            dbgs() << "Declaration deletion time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
+            stderrs[i] << "Declaration deletion time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
 
             start = jl_hrtime();
             add_output_impl(*M, TM, outstart + i * outcount * 2, name,
                             unoptstart ? unoptstart + i : nullptr,
                             optstart ? optstart + i : nullptr,
                             objstart ? objstart + i : nullptr,
-                            asmstart ? asmstart + i : nullptr);
+                            asmstart ? asmstart + i : nullptr,
+                            stderrs[i], i);
             end = jl_hrtime();
 
-            dbgs() << "Output time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
+            stderrs[i] << "Output time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
         });
     }
 
     start = jl_hrtime();
     for (auto &w : workers)
         w.join();
+    for (auto &str : stderrs)
+        dbgs() << str.str();
     end = jl_hrtime();
 
     dbgs() << "Total time for parallel output: " << (end - start) / 1e9 << "s\n";
@@ -1087,32 +1098,46 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
 unsigned compute_image_thread_count(Module &M) {
     // 32-bit systems are very memory-constrained
 #ifdef _P32
+    dbgs() << "Threads: 1\n";
     return 1;
 #endif
-    unsigned threads = std::max(llvm::hardware_concurrency().compute_thread_count() / 2, 1u);
-
-    // memory limit check
-    // many threads use a lot of memory, so limit on constrained memory systems
-    size_t available = uv_get_available_memory();
     size_t weight = 0;
+    size_t globals = 0;
     for (auto &GV : M.global_values()) {
         if (GV.isDeclaration())
             continue;
+        globals++;
         if (isa<Function>(GV)) {
             weight += getFunctionWeight(cast<Function>(GV));
         } else {
             weight += 1;
         }
     }
-    if (weight == 0) {
-        dbgs() << "No globals in module, using 1 thread\n";
+    dbgs() << "Module weight: " << weight << "\n";
+    if (weight < 1000) {
+        dbgs() << "Low module complexity bailout\n";
+        dbgs() << "Threads: 1\n";
         return 1;
     }
+
+    unsigned threads = std::max(llvm::hardware_concurrency().compute_thread_count() / 2, 1u);
+
+    // memory limit check
+    // many threads use a lot of memory, so limit on constrained memory systems
+    size_t available = uv_get_available_memory();
     // crude estimate, available / (weight * fudge factor) = max threads
     size_t fudge = 10;
     unsigned max_threads = std::max(available / (weight * fudge), (size_t)1);
-    dbgs() << "Weight: " << weight << ", available: " << available << ", wanted: " << threads << ", max threads: " << max_threads << "\n";
-    threads = std::min(threads, max_threads);
+    if (max_threads < threads) {
+        dbgs() << "Memory limiting threads to " << max_threads << "\n";
+        threads = max_threads;
+    }
+
+    max_threads = globals / 100;
+    if (max_threads < threads) {
+        dbgs() << "Low global count limiting threads to " << max_threads << " (" << globals << " globals)\n";
+        threads = max_threads;
+    }
 
     // environment variable override
     const char *env_threads = getenv("JULIA_IMAGE_THREADS");
@@ -1122,10 +1147,15 @@ unsigned compute_image_thread_count(Module &M) {
         if (*endptr || !requested) {
             jl_safe_printf("WARNING: invalid value '%s' for JULIA_IMAGE_THREADS\n", env_threads);
         } else {
+            dbgs() << "Overriding threads to " << requested << "\n";
             threads = requested;
         }
     }
 
+    threads = std::max(threads, 1u);
+
+    dbgs() << "Threads: " << threads << "\n";
+
     return threads;
 }
 
@@ -1208,7 +1238,7 @@ void jl_dump_native_impl(void *native_code,
 
     start = jl_hrtime();
 
-    unsigned threads = compute_image_thread_count(*dataM);
+    unsigned threads = 1;
     unsigned nfvars = 0;
     unsigned ngvars = 0;
 
@@ -1225,6 +1255,7 @@ void jl_dump_native_impl(void *native_code,
                 }
             }
         }
+        threads = compute_image_thread_count(*dataM);
         nfvars = data->jl_sysimg_fvars.size();
         ngvars = data->jl_sysimg_gvars.size();
         emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_gvars", T_psize);
diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index 1a38511f34ffb..527c17e826ce9 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -313,8 +313,6 @@ struct CloneCtx {
     // Map from original function to one based index in `fvars`
     std::map<const Function*,uint32_t> func_ids{};
     std::vector<Function*> orig_funcs{};
-    std::vector<uint32_t> func_infos{};
-    std::set<Function*> cloned{};
     // GV addresses and their corresponding function id (i.e. 0-based index in `fvars`)
     std::vector<std::pair<Constant*,uint32_t>> gv_relocs{};
     // Mapping from function id (i.e. 0-based index in `fvars`) to GVs to be initialized.
@@ -650,7 +648,7 @@ void CloneCtx::fix_gv_uses()
         return changed;
     };
     for (auto orig_f: orig_funcs) {
-        if (groups.size() == 1 && !cloned.count(orig_f))
+        if (!orig_f->hasFnAttribute("julia.mv.clones"))
             continue;
         while (single_pass(orig_f)) {
         }
@@ -813,7 +811,7 @@ void CloneCtx::emit_metadata()
     std::set<uint32_t> shared_relocs;
     {
         auto T_int32 = Type::getInt32Ty(M.getContext());
-        std::stable_sort(gv_relocs.begin(), gv_relocs.end(),
+        std::sort(gv_relocs.begin(), gv_relocs.end(),
                          [] (const std::pair<Constant*,uint32_t> &lhs,
                              const std::pair<Constant*,uint32_t> &rhs) {
                              return lhs.second < rhs.second;

From 0c68e4af4d80e4dccfb682274b3130a699be2309 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Thu, 26 Jan 2023 16:14:42 -0500
Subject: [PATCH 14/34] Alter naming, sort partitions

---
 src/Makefile       |  2 +-
 src/aotcompile.cpp | 64 +++++++++++++++++++++++++++++-----------------
 2 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index bb98f6766f470..dea033c0661d9 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -287,7 +287,7 @@ $(BUILDDIR)/julia_flisp.boot: $(addprefix $(SRCDIR)/,jlfrontend.scm flisp/aliase
 
 # additional dependency links
 $(BUILDDIR)/codegen-stubs.o $(BUILDDIR)/codegen-stubs.dbg.obj: $(SRCDIR)/intrinsics.h
-$(BUILDDIR)/aotcompile.o $(BUILDDIR)/aotcompile.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/llvm-codegen-shared.h
+$(BUILDDIR)/aotcompile.o $(BUILDDIR)/aotcompile.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/llvm-codegen-shared.h $(SRCDIR)/processor.h
 $(BUILDDIR)/ast.o $(BUILDDIR)/ast.dbg.obj: $(BUILDDIR)/julia_flisp.boot.inc $(SRCDIR)/flisp/*.h
 $(BUILDDIR)/builtins.o $(BUILDDIR)/builtins.dbg.obj: $(SRCDIR)/iddict.c $(SRCDIR)/builtin_proto.h
 $(BUILDDIR)/codegen.o $(BUILDDIR)/codegen.dbg.obj: $(addprefix $(SRCDIR)/,\
diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 701ecdfc925e8..7eeaeb94cf2da 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -66,6 +66,7 @@ using namespace llvm;
 #include "serialize.h"
 #include "julia_assert.h"
 #include "llvm-codegen-shared.h"
+#include "processor.h"
 
 #define DEBUG_TYPE "julia_aotcompile"
 
@@ -723,9 +724,19 @@ static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) {
         pq.push(&partitions[i]);
     }
 
+    std::vector<unsigned> idxs(partitioner.nodes.size());
+    std::iota(idxs.begin(), idxs.end(), 0);
+    std::sort(idxs.begin(), idxs.end(), [&](unsigned a, unsigned b) {
+        //because roots have more weight than their children,
+        //we can sort by weight and get the roots first
+        return partitioner.nodes[a].weight > partitioner.nodes[b].weight;
+    });
+
     // Assign the root of each partition to a partition, then assign its children to the same one
-    for (unsigned i = 0; i < partitioner.nodes.size(); ++i) {
+    for (unsigned idx = 0; idx < idxs.size(); ++idx) {
+        auto i = idxs[idx];
         auto root = partitioner.find(i);
+        assert(root == i || partitioner.nodes[root].GV == nullptr);
         if (partitioner.nodes[root].GV) {
             auto &node = partitioner.nodes[root];
             auto &P = *pq.top();
@@ -763,9 +774,10 @@ static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) {
     return partitions;
 }
 
-static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *outputs, StringRef name,
+static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *outputs, ArrayRef<StringRef> names,
                     NewArchiveMember *unopt, NewArchiveMember *opt, NewArchiveMember *obj, NewArchiveMember *asm_,
                     std::stringstream &stream, unsigned i) {
+    assert(names.size() == 4);
     auto TM = std::unique_ptr<TargetMachine>(
         SourceTM.getTarget().createTargetMachine(
             SourceTM.getTargetTriple().str(),
@@ -782,9 +794,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out
         AnalysisManagers AM{*TM, PB, OptimizationLevel::O0};
         ModulePassManager MPM;
         MPM.addPass(BitcodeWriterPass(OS));
-        outputs++;
-        *outputs = (name + "_unopt.bc").str();
-        *unopt = NewArchiveMember(MemoryBufferRef(OS.str(), *outputs));
+        *unopt = NewArchiveMember(MemoryBufferRef(*outputs, names[0]));
         outputs++;
     }
     if (!opt && !obj && !asm_) {
@@ -826,9 +836,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out
         AnalysisManagers AM{*TM, PB, OptimizationLevel::O0};
         ModulePassManager MPM;
         MPM.addPass(BitcodeWriterPass(OS));
-        outputs++;
-        *outputs = (name + "_opt.bc").str();
-        *opt = NewArchiveMember(MemoryBufferRef(OS.str(), *outputs));
+        *opt = NewArchiveMember(MemoryBufferRef(*outputs, names[1]));
         outputs++;
     }
 
@@ -843,9 +851,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out
             jl_safe_printf("ERROR: target does not support generation of object files\n");
         emitter.run(M);
         *outputs = { Buffer.data(), Buffer.size() };
-        outputs++;
-        *outputs = (name + ".o").str();
-        *obj = NewArchiveMember(MemoryBufferRef(outputs[-1], *outputs));
+        *obj = NewArchiveMember(MemoryBufferRef(*outputs, names[2]));
         outputs++;
     }
 
@@ -862,9 +868,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out
             jl_safe_printf("ERROR: target does not support generation of assembly files\n");
         emitter.run(M);
         *outputs = { Buffer.data(), Buffer.size() };
-        outputs++;
-        *outputs = (name + ".s").str();
-        *asm_ = NewArchiveMember(MemoryBufferRef(outputs[-1], *outputs));
+        *asm_ = NewArchiveMember(MemoryBufferRef(*outputs, names[3]));
         outputs++;
     }
 }
@@ -991,7 +995,7 @@ static void dropUnusedDeclarations(Module &M) {
         G->eraseFromParent();
 }
 
-static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &outputs, StringRef name,
+static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &outputs, ArrayRef<StringRef> names,
                 std::vector<NewArchiveMember> &unopt, std::vector<NewArchiveMember> &opt,
                 std::vector<NewArchiveMember> &obj, std::vector<NewArchiveMember> &asm_,
                 bool unopt_out, bool opt_out, bool obj_out, bool asm_out,
@@ -999,7 +1003,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
     uint64_t start = 0, end = 0;
     unsigned outcount = unopt_out + opt_out + obj_out + asm_out;
     assert(outcount);
-    outputs.resize(outputs.size() + outcount * threads * 2);
+    outputs.resize(outputs.size() + outcount * threads);
     unopt.resize(unopt.size() + unopt_out * threads);
     opt.resize(opt.size() + opt_out * threads);
     obj.resize(obj.size() + obj_out * threads);
@@ -1007,7 +1011,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
     if (threads == 1) {
         start = jl_hrtime();
         std::stringstream stream;
-        add_output_impl(M, TM, outputs.data() + outputs.size() - outcount * 2, name,
+        add_output_impl(M, TM, outputs.data() + outputs.size() - outcount, names,
                         unopt_out ? unopt.data() + unopt.size() - 1 : nullptr,
                         opt_out ? opt.data() + opt.size() - 1 : nullptr,
                         obj_out ? obj.data() + obj.size() - 1 : nullptr,
@@ -1034,7 +1038,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
     end = jl_hrtime();
     dbgs() << "Time to serialize module: " << (end - start) / 1e9 << "s\n";
 
-    auto outstart = outputs.data() + outputs.size() - outcount * threads * 2;
+    auto outstart = outputs.data() + outputs.size() - outcount * threads;
     auto unoptstart = unopt_out ? unopt.data() + unopt.size() - threads : nullptr;
     auto optstart = opt_out ? opt.data() + opt.size() - threads : nullptr;
     auto objstart = obj_out ? obj.data() + obj.size() - threads : nullptr;
@@ -1073,7 +1077,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
             stderrs[i] << "Declaration deletion time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
 
             start = jl_hrtime();
-            add_output_impl(*M, TM, outstart + i * outcount * 2, name,
+            add_output_impl(*M, TM, outstart + i * outcount, names,
                             unoptstart ? unoptstart + i : nullptr,
                             optstart ? optstart + i : nullptr,
                             objstart ? objstart + i : nullptr,
@@ -1295,14 +1299,21 @@ void jl_dump_native_impl(void *native_code,
 
     start = jl_hrtime();
 
-    auto compile = [&](Module &M, StringRef name, unsigned threads) { add_output(
-            M, *SourceTM, outputs, name,
+    auto compile = [&](Module &M, ArrayRef<StringRef> names, unsigned threads) { add_output(
+            M, *SourceTM, outputs, names,
             unopt_bc_Archive, bc_Archive, obj_Archive, asm_Archive,
             !!unopt_bc_fname, !!bc_fname, !!obj_fname, !!asm_fname,
             threads
     ); };
+    
+    std::array<StringRef, 4> text_names = {
+        "text_unopt.bc",
+        "text_opt.bc",
+        "text.o",
+        "text.s"
+    };
 
-    compile(*dataM, "text", threads);
+    compile(*dataM, text_names, threads);
 
     end = jl_hrtime();
 
@@ -1399,7 +1410,14 @@ void jl_dump_native_impl(void *native_code,
             ios_write(s, (const char *)data.data(), data.size());
         }
     }
-    compile(*sysimageM, "data", 1);
+
+    std::array<StringRef, 4> data_names = {
+        "data_unopt.bc",
+        "data_opt.bc",
+        "data.o",
+        "data.s"
+    };
+    compile(*sysimageM, data_names, 1);
 
     end = jl_hrtime();
 

From f9da0e261abf505af4a5841d7114efd7f499c755 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Thu, 26 Jan 2023 16:21:58 -0500
Subject: [PATCH 15/34] Fix whitespace

---
 src/aotcompile.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 7eeaeb94cf2da..28b13445c8e2a 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -1305,7 +1305,7 @@ void jl_dump_native_impl(void *native_code,
             !!unopt_bc_fname, !!bc_fname, !!obj_fname, !!asm_fname,
             threads
     ); };
-    
+
     std::array<StringRef, 4> text_names = {
         "text_unopt.bc",
         "text_opt.bc",

From 9a72be669b5cac14cd9a2228563e12494d6d7717 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Thu, 26 Jan 2023 23:33:41 -0500
Subject: [PATCH 16/34] Avoid unused function warning

---
 src/aotcompile.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 28b13445c8e2a..6d0509ac05bbc 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -610,7 +610,8 @@ static size_t getFunctionWeight(const Function &F)
     return weight;
 }
 
-//Inline to fool gcc into not complaining about unused function when asserts are disabled
+#ifndef NDEBUG
+
 static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M) {
     StringMap<uint32_t> GVNames;
     bool bad = false;
@@ -644,6 +645,8 @@ static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partiti
     return !bad;
 }
 
+#endif
+
 // Chop a module up as equally as possible into threads partitions
 static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) {
     //Start by stripping fvars and gvars, which helpfully removes their uses as well

From 1f07ea51faecfdcad80aeafa99e93bb65249f369 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Fri, 27 Jan 2023 02:48:12 -0500
Subject: [PATCH 17/34] Check relocations for generic target as well

---
 src/llvm-multiversioning.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index 527c17e826ce9..cd90699e05aad 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -152,7 +152,6 @@ static void annotate_module_clones(Module &M) {
     auto specs = jl_get_llvm_clone_targets();
     std::vector<APInt> clones(orig_funcs.size(), APInt(specs.size(), 0));
     BitVector subtarget_cloned(orig_funcs.size());
-    bool check_relocs = false;
 
     std::vector<unsigned> func_infos(orig_funcs.size());
     for (unsigned i = 0; i < orig_funcs.size(); i++) {
@@ -163,7 +162,6 @@ static void annotate_module_clones(Module &M) {
             for (unsigned j = 0; j < orig_funcs.size(); j++) {
                 clones[j].setBit(i);
             }
-            check_relocs = true;
         } else {
             unsigned flag = specs[i].flags & clone_mask;
             std::set<Function*> sets[2];
@@ -217,7 +215,11 @@ static void annotate_module_clones(Module &M) {
             }
         }
     }
-    if (check_relocs) {
+    // if there's only one target, we won't need any relocation slots
+    // but even if there is one clone_all and one non-clone_all, we still need
+    // to check for relocation slots because we must fixup instruction uses to
+    // point at the right function.
+    if (specs.size() > 1) {
         for (unsigned i = 0; i < orig_funcs.size(); i++) {
             auto &F = *orig_funcs[i];
             if (subtarget_cloned[i] && !ConstantUses<Instruction>(orig_funcs[i], M).done()) {

From 83b196758b9fabbe64b36750f56d50e083f87020 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Fri, 27 Jan 2023 03:27:27 -0500
Subject: [PATCH 18/34] Debug macos linker

---
 src/aotcompile.cpp | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 6d0509ac05bbc..f3c45241c4a0a 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -610,11 +610,11 @@ static size_t getFunctionWeight(const Function &F)
     return weight;
 }
 
-#ifndef NDEBUG
 
 static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M) {
-    StringMap<uint32_t> GVNames;
     bool bad = false;
+#ifdef JL_DEBUG_BUILD
+    StringMap<uint32_t> GVNames;
     for (uint32_t i = 0; i < partitions.size(); i++) {
         for (auto &name : partitions[i].globals) {
             if (GVNames.count(name.getKey())) {
@@ -642,11 +642,10 @@ static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partiti
             }
         }
     }
+#endif
     return !bad;
 }
 
-#endif
-
 // Chop a module up as equally as possible into threads partitions
 static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) {
     //Start by stripping fvars and gvars, which helpfully removes their uses as well
@@ -772,7 +771,9 @@ static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) {
         }
     }
 
-    assert(verify_partitioning(partitions, M) && "Partitioning failed to partition globals correctly");
+    bool verified = verify_partitioning(partitions, M);
+    assert(verified && "Partitioning failed to partition globals correctly");
+    (void) verified;
 
     return partitions;
 }
@@ -1135,10 +1136,14 @@ unsigned compute_image_thread_count(Module &M) {
     // crude estimate, available / (weight * fudge factor) = max threads
     size_t fudge = 10;
     unsigned max_threads = std::max(available / (weight * fudge), (size_t)1);
-    if (max_threads < threads) {
-        dbgs() << "Memory limiting threads to " << max_threads << "\n";
-        threads = max_threads;
-    }
+    dbgs() << "Available memory: " << available << " bytes\n";
+    dbgs() << "Max threads: " << max_threads << "\n";
+    dbgs() << "Temporarily disabling memory limiting threads\n";
+    //TODO reenable
+    // if (max_threads < threads) {
+    //     dbgs() << "Memory limiting threads to " << max_threads << "\n";
+    //     threads = max_threads;
+    // }
 
     max_threads = globals / 100;
     if (max_threads < threads) {
@@ -1420,7 +1425,11 @@ void jl_dump_native_impl(void *native_code,
         "data.o",
         "data.s"
     };
+    dbgs() << "Dumping sysimage data module\n";
+    dbgs() << *sysimageM << "\n";
     compile(*sysimageM, data_names, 1);
+    dbgs() << "Post-optimization sysimageM\n";
+    dbgs() << *sysimageM << "\n";
 
     end = jl_hrtime();
 

From c98ff304ab697f0eb492a150833c12baa499f29e Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Fri, 27 Jan 2023 17:41:11 -0500
Subject: [PATCH 19/34] Respect JULIA_CPU_THREADS

---
 src/aotcompile.cpp | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index f3c45241c4a0a..88c54d228c307 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -1153,14 +1153,33 @@ unsigned compute_image_thread_count(Module &M) {
 
     // environment variable override
     const char *env_threads = getenv("JULIA_IMAGE_THREADS");
+    bool env_threads_set = false;
     if (env_threads) {
         char *endptr;
         unsigned long requested = strtoul(env_threads, &endptr, 10);
         if (*endptr || !requested) {
             jl_safe_printf("WARNING: invalid value '%s' for JULIA_IMAGE_THREADS\n", env_threads);
         } else {
-            dbgs() << "Overriding threads to " << requested << "\n";
+            dbgs() << "Overriding threads to " << requested << " due to JULIA_IMAGE_THREADS\n";
             threads = requested;
+            env_threads_set = true;
+        }
+    }
+
+    // more defaults
+    if (!env_threads_set && threads > 1) {
+        if (jl_options.nthreads && jl_options.nthreads < threads) {
+            dbgs() << "Overriding threads to " << jl_options.nthreads << " due to -t option\n";
+            threads = jl_options.nthreads;
+        } else if (auto fallbackenv = getenv(NUM_THREADS_NAME)) {
+            char *endptr;
+            unsigned long requested = strtoul(fallbackenv, &endptr, 10);
+            if (*endptr || !requested) {
+                jl_safe_printf("WARNING: invalid value '%s' for %s\m", fallbackenv, NUM_THREADS_NAME);
+            } else if (requested < threads) {
+                dbgs() << "Overriding threads to " << requested << " due to " << NUM_THREADS_NAME << "\n";
+                threads = requested;
+            }
         }
     }
 
@@ -1426,10 +1445,15 @@ void jl_dump_native_impl(void *native_code,
         "data.s"
     };
     dbgs() << "Dumping sysimage data module\n";
+    for (auto &F : *sysimageM) {
+        dbgs() << F << "\n";
+    }
     dbgs() << *sysimageM << "\n";
     compile(*sysimageM, data_names, 1);
     dbgs() << "Post-optimization sysimageM\n";
-    dbgs() << *sysimageM << "\n";
+    for (auto &F : *sysimageM) {
+        dbgs() << F << "\n";
+    }
 
     end = jl_hrtime();
 

From 8cf48f2369a4197c46858e0bc6c166ad69d3e8d4 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Fri, 27 Jan 2023 18:23:03 -0500
Subject: [PATCH 20/34] Don't inject CRT aliases on macos

---
 src/aotcompile.cpp | 54 +++++++++++++++++++++-------------------------
 1 file changed, 25 insertions(+), 29 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 88c54d228c307..c40868af11c58 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -1168,16 +1168,18 @@ unsigned compute_image_thread_count(Module &M) {
 
     // more defaults
     if (!env_threads_set && threads > 1) {
-        if (jl_options.nthreads && jl_options.nthreads < threads) {
-            dbgs() << "Overriding threads to " << jl_options.nthreads << " due to -t option\n";
-            threads = jl_options.nthreads;
-        } else if (auto fallbackenv = getenv(NUM_THREADS_NAME)) {
+        if (jl_options.nthreads) {
+            if (static_cast<unsigned>(jl_options.nthreads) < threads) {
+                dbgs() << "Overriding threads to " << jl_options.nthreads << " due to -t option\n";
+                threads = jl_options.nthreads;
+            }
+        } else if (auto fallbackenv = getenv("JULIA_CPU_THREADS")) {
             char *endptr;
             unsigned long requested = strtoul(fallbackenv, &endptr, 10);
             if (*endptr || !requested) {
-                jl_safe_printf("WARNING: invalid value '%s' for %s\m", fallbackenv, NUM_THREADS_NAME);
+                jl_safe_printf("WARNING: invalid value '%s' for JULIA_CPU_THREADS\n", fallbackenv);
             } else if (requested < threads) {
-                dbgs() << "Overriding threads to " << requested << " due to " << NUM_THREADS_NAME << "\n";
+                dbgs() << "Overriding threads to " << requested << " due to JULIA_CPU_THREADS\n";
                 threads = requested;
             }
         }
@@ -1355,20 +1357,23 @@ void jl_dump_native_impl(void *native_code,
     sysimageM->setStackProtectorGuard(dataM->getStackProtectorGuard());
     sysimageM->setOverrideStackAlignment(dataM->getOverrideStackAlignment());
 #endif
-    // We would like to emit an alias or an weakref alias to redirect these symbols
-    // but LLVM doesn't let us emit a GlobalAlias to a declaration...
-    // So for now we inject a definition of these functions that calls our runtime
-    // functions. We do so after optimization to avoid cloning these functions.
-    injectCRTAlias(*sysimageM, "__gnu_h2f_ieee", "julia__gnu_h2f_ieee",
-            FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false));
-    injectCRTAlias(*sysimageM, "__extendhfsf2", "julia__gnu_h2f_ieee",
-            FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false));
-    injectCRTAlias(*sysimageM, "__gnu_f2h_ieee", "julia__gnu_f2h_ieee",
-            FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false));
-    injectCRTAlias(*sysimageM, "__truncsfhf2", "julia__gnu_f2h_ieee",
-            FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false));
-    injectCRTAlias(*sysimageM, "__truncdfhf2", "julia__truncdfhf2",
-            FunctionType::get(Type::getHalfTy(Context), { Type::getDoubleTy(Context) }, false));
+
+    if (!TheTriple.isOSDarwin()) {
+        // We would like to emit an alias or an weakref alias to redirect these symbols
+        // but LLVM doesn't let us emit a GlobalAlias to a declaration...
+        // So for now we inject a definition of these functions that calls our runtime
+        // functions. We do so after optimization to avoid cloning these functions.
+        injectCRTAlias(*sysimageM, "__gnu_h2f_ieee", "julia__gnu_h2f_ieee",
+                FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false));
+        injectCRTAlias(*sysimageM, "__extendhfsf2", "julia__gnu_h2f_ieee",
+                FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false));
+        injectCRTAlias(*sysimageM, "__gnu_f2h_ieee", "julia__gnu_f2h_ieee",
+                FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false));
+        injectCRTAlias(*sysimageM, "__truncsfhf2", "julia__gnu_f2h_ieee",
+                FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false));
+        injectCRTAlias(*sysimageM, "__truncdfhf2", "julia__truncdfhf2",
+                FunctionType::get(Type::getHalfTy(Context), { Type::getDoubleTy(Context) }, false));
+    }
 
     if (TheTriple.isOSWindows()) {
         // Windows expect that the function `_DllMainStartup` is present in an dll.
@@ -1444,16 +1449,7 @@ void jl_dump_native_impl(void *native_code,
         "data.o",
         "data.s"
     };
-    dbgs() << "Dumping sysimage data module\n";
-    for (auto &F : *sysimageM) {
-        dbgs() << F << "\n";
-    }
-    dbgs() << *sysimageM << "\n";
     compile(*sysimageM, data_names, 1);
-    dbgs() << "Post-optimization sysimageM\n";
-    for (auto &F : *sysimageM) {
-        dbgs() << F << "\n";
-    }
 
     end = jl_hrtime();
 

From 4e35f416bb71f0a33ad3605742bfa8f6bc82a85b Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Wed, 1 Feb 2023 03:02:47 -0500
Subject: [PATCH 21/34] Clean up timers and prints, link to JULIA_IMAGE_TIMINGS

---
 src/aotcompile.cpp           | 272 +++++++++++++++++++++--------------
 src/llvm-multiversioning.cpp |   4 +-
 2 files changed, 168 insertions(+), 108 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index c40868af11c58..79e9ea07eb592 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -57,6 +57,7 @@
 
 #include <llvm/IR/LegacyPassManagers.h>
 #include <llvm/Transforms/Utils/Cloning.h>
+#include <llvm/Support/FormatAdapters.h>
 #include <llvm/Linker/Linker.h>
 
 
@@ -269,8 +270,6 @@ void replaceUsesWithLoad(Function &F, function_ref<GlobalVariable *(Instruction
 extern "C" JL_DLLEXPORT
 void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvmmod, const jl_cgparams_t *cgparams, int _policy, int _imaging_mode, int _external_linkage, size_t _world)
 {
-    uint64_t start = jl_hrtime();
-    uint64_t end = 0;
     ++CreateNativeCalls;
     CreateNativeMax.updateMax(jl_array_len(methods));
     if (cgparams == NULL)
@@ -463,8 +462,6 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm
     if (ctx.getContext()) {
         jl_ExecutionEngine->releaseContext(std::move(ctx));
     }
-    end = jl_hrtime();
-    dbgs() << "jl_create_native: " << (end - start) / 1e9 << "s\n";
     return (void*)data;
 }
 
@@ -589,7 +586,6 @@ static void get_fvars_gvars(Module &M, DenseMap<GlobalValue *, unsigned> &fvars,
     gvars_gv->eraseFromParent();
     fvars_idxs->eraseFromParent();
     gvars_idxs->eraseFromParent();
-    dbgs() << "Finished getting fvars/gvars\n";
 }
 
 static size_t getFunctionWeight(const Function &F)
@@ -778,9 +774,74 @@ static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) {
     return partitions;
 }
 
+struct ImageTimer {
+    uint64_t elapsed = 0;
+    std::string name;
+    std::string desc;
+
+    void startTimer() {
+        elapsed = jl_hrtime();
+    }
+
+    void stopTimer() {
+        elapsed = jl_hrtime() - elapsed;
+    }
+
+    void init(const Twine &name, const Twine &desc) {
+        this->name = name.str();
+        this->desc = desc.str();
+    }
+
+    operator bool() const {
+        return elapsed != 0;
+    }
+
+    void print(raw_ostream &out, bool clear=false) {
+        if (!*this)
+            return;
+        out << llvm::formatv("{0:F3}  ", elapsed / 1e9) << name << "  " << desc << "\n";
+        if (clear)
+            elapsed = 0;
+    }
+};
+
+struct ShardTimers {
+    ImageTimer deserialize;
+    ImageTimer materialize;
+    ImageTimer construct;
+    ImageTimer deletion;
+    // impl timers
+    ImageTimer unopt;
+    ImageTimer optimize;
+    ImageTimer opt;
+    ImageTimer obj;
+    ImageTimer asm_;
+
+    std::string name;
+    std::string desc;
+
+    void print(raw_ostream &out, bool clear=false) {
+        StringRef sep = "===-------------------------------------------------------------------------===";
+        out << formatv("{0}\n{1}\n{0}\n", sep, fmt_align(name + " : " + desc, AlignStyle::Center, sep.size()));
+        auto total = deserialize.elapsed + materialize.elapsed + construct.elapsed + deletion.elapsed +
+            unopt.elapsed + optimize.elapsed + opt.elapsed + obj.elapsed + asm_.elapsed;
+        out << "Time (s)  Name  Description\n";
+        deserialize.print(out, clear);
+        materialize.print(out, clear);
+        construct.print(out, clear);
+        deletion.print(out, clear);
+        unopt.print(out, clear);
+        optimize.print(out, clear);
+        opt.print(out, clear);
+        obj.print(out, clear);
+        asm_.print(out, clear);
+        out << llvm::formatv("{0:F3}  total  Total time taken\n", total / 1e9);
+    }
+};
+
 static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *outputs, ArrayRef<StringRef> names,
                     NewArchiveMember *unopt, NewArchiveMember *opt, NewArchiveMember *obj, NewArchiveMember *asm_,
-                    std::stringstream &stream, unsigned i) {
+                    ShardTimers &timers, unsigned shardidx) {
     assert(names.size() == 4);
     auto TM = std::unique_ptr<TargetMachine>(
         SourceTM.getTarget().createTargetMachine(
@@ -793,6 +854,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out
             SourceTM.getOptLevel()));
 
     if (unopt) {
+        timers.unopt.startTimer();
         raw_string_ostream OS(*outputs);
         PassBuilder PB;
         AnalysisManagers AM{*TM, PB, OptimizationLevel::O0};
@@ -800,14 +862,14 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out
         MPM.addPass(BitcodeWriterPass(OS));
         *unopt = NewArchiveMember(MemoryBufferRef(*outputs, names[0]));
         outputs++;
+        timers.unopt.stopTimer();
     }
     if (!opt && !obj && !asm_) {
         return;
     }
     assert(!verifyModule(M, &errs()));
 
-    uint64_t start = jl_hrtime();
-    uint64_t end = 0;
+    timers.optimize.startTimer();
 
 #ifndef JL_USE_NEW_PM
     legacy::PassManager optimizer;
@@ -829,12 +891,11 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out
 #endif
     optimizer.run(M);
     assert(!verifyModule(M, &errs()));
-
-    end = jl_hrtime();
-
-    stream << "optimize time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
+
+    timers.optimize.stopTimer();
 
     if (opt) {
+        timers.opt.startTimer();
         raw_string_ostream OS(*outputs);
         PassBuilder PB;
         AnalysisManagers AM{*TM, PB, OptimizationLevel::O0};
@@ -842,11 +903,11 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out
         MPM.addPass(BitcodeWriterPass(OS));
         *opt = NewArchiveMember(MemoryBufferRef(*outputs, names[1]));
         outputs++;
+        timers.opt.stopTimer();
     }
 
-    start = jl_hrtime();
-
     if (obj) {
+        timers.obj.startTimer();
         SmallVector<char, 0> Buffer;
         raw_svector_ostream OS(Buffer);
         legacy::PassManager emitter;
@@ -857,13 +918,11 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out
         *outputs = { Buffer.data(), Buffer.size() };
         *obj = NewArchiveMember(MemoryBufferRef(*outputs, names[2]));
         outputs++;
+        timers.obj.stopTimer();
     }
 
-    end = jl_hrtime();
-
-    stream << "codegen time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
-
     if (asm_) {
+        timers.asm_.startTimer();
         SmallVector<char, 0> Buffer;
         raw_svector_ostream OS(Buffer);
         legacy::PassManager emitter;
@@ -874,6 +933,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out
         *outputs = { Buffer.data(), Buffer.size() };
         *asm_ = NewArchiveMember(MemoryBufferRef(*outputs, names[3]));
         outputs++;
+        timers.asm_.stopTimer();
     }
 }
 
@@ -1004,7 +1064,6 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
                 std::vector<NewArchiveMember> &obj, std::vector<NewArchiveMember> &asm_,
                 bool unopt_out, bool opt_out, bool obj_out, bool asm_out,
                 unsigned threads) {
-    uint64_t start = 0, end = 0;
     unsigned outcount = unopt_out + opt_out + obj_out + asm_out;
     assert(outcount);
     outputs.resize(outputs.size() + outcount * threads);
@@ -1012,22 +1071,64 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
     opt.resize(opt.size() + opt_out * threads);
     obj.resize(obj.size() + obj_out * threads);
     asm_.resize(asm_.size() + asm_out * threads);
+    auto name = names[2];
+    name.consume_back(".o");
+    TimerGroup timer_group("add_output", ("Time to optimize and emit LLVM module " + name).str());
+    SmallVector<ShardTimers, 1> timers(threads);
+    for (unsigned i = 0; i < threads; ++i) {
+        auto idx = std::to_string(i);
+        timers[i].name = "shard_" + idx;
+        timers[i].desc = ("Timings for " + name + " module shard " + idx).str();
+        timers[i].deserialize.init("deserialize_" + idx, "Deserialize module");
+        timers[i].materialize.init("materialize_" + idx, "Materialize declarations");
+        timers[i].construct.init("construct_" + idx, "Construct partitioned definitions");
+        timers[i].deletion.init("deletion_" + idx, "Delete dead declarations");
+        timers[i].unopt.init("unopt_" + idx, "Emit unoptimized bitcode");
+        timers[i].optimize.init("optimize_" + idx, "Optimize shard");
+        timers[i].opt.init("opt_" + idx, "Emit optimized bitcode");
+        timers[i].obj.init("obj_" + idx, "Emit object file");
+        timers[i].asm_.init("asm_" + idx, "Emit assembly file");
+    }
+    Timer partition_timer("partition", "Partition module", timer_group);
+    Timer serialize_timer("serialize", "Serialize module", timer_group);
+    Timer output_timer("output", "Add outputs", timer_group);
+    bool report_timings = false;
+    if (auto env = getenv("JULIA_IMAGE_TIMINGS")) {
+        char *endptr;
+        unsigned long val = strtoul(env, &endptr, 10);
+        if (endptr != env && !*endptr && val <= 1) {
+            report_timings = val;
+        } else {
+            if (StringRef("true").compare_insensitive(env) == 0)
+                report_timings = true;
+            else if (StringRef("false").compare_insensitive(env) == 0)
+                report_timings = false;
+            else
+                errs() << "WARNING: Invalid value for JULIA_IMAGE_TIMINGS: " << env << "\n";
+        }
+    }
     if (threads == 1) {
-        start = jl_hrtime();
-        std::stringstream stream;
+        output_timer.startTimer();
         add_output_impl(M, TM, outputs.data() + outputs.size() - outcount, names,
                         unopt_out ? unopt.data() + unopt.size() - 1 : nullptr,
                         opt_out ? opt.data() + opt.size() - 1 : nullptr,
                         obj_out ? obj.data() + obj.size() - 1 : nullptr,
                         asm_out ? asm_.data() + asm_.size() - 1 : nullptr,
-                        stream, 0);
-        dbgs() << stream.str();
-        end = jl_hrtime();
-        dbgs() << "Time to add output: " << (end - start) / 1e9 << "s\n";
+                        timers[0], 0);
+        output_timer.stopTimer();
+
+        if (!report_timings) {
+            timer_group.clear();
+        } else {
+            timer_group.print(dbgs(), true);
+            for (auto &t : timers) {
+                t.print(dbgs(), true);
+            }
+        }
         return;
     }
 
-    start = jl_hrtime();
+    partition_timer.startTimer();
     uint64_t counter = 0;
     for (auto &G : M.global_values()) {
         if (!G.isDeclaration() && !G.hasName()) {
@@ -1035,12 +1136,12 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
         }
     }
     auto partitions = partitionModule(M, threads);
-    end = jl_hrtime();
-    dbgs() << "Time to partition module: " << (end - start) / 1e9 << "s\n";
-    start = jl_hrtime();
+    partition_timer.stopTimer();
+    serialize_timer.startTimer();
     auto serialized = serializeModule(M);
-    end = jl_hrtime();
-    dbgs() << "Time to serialize module: " << (end - start) / 1e9 << "s\n";
+    serialize_timer.stopTimer();
+
+    output_timer.startTimer();
 
     auto outstart = outputs.data() + outputs.size() - outcount * threads;
     auto unoptstart = unopt_out ? unopt.data() + unopt.size() - threads : nullptr;
@@ -1049,64 +1150,56 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
     auto asmstart = asm_out ? asm_.data() + asm_.size() - threads : nullptr;
 
     std::vector<std::thread> workers(threads);
-    std::vector<std::stringstream> stderrs(threads);
     for (unsigned i = 0; i < threads; i++) {
         workers[i] = std::thread([&, i](){
             LLVMContext ctx;
-            uint64_t start = 0;
-            uint64_t end = 0;
-            start = jl_hrtime();
+            timers[i].deserialize.startTimer();
             auto M = cantFail(getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx), "Error loading module");
-            end = jl_hrtime();
-            stderrs[i] << "Deserialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
+            timers[i].deserialize.stopTimer();
 
-            stderrs[i] << "Starting shard " << i << " with weight=" << partitions[i].weight << "\n";
+            // dbgs() << "Starting shard " << i << " with weight=" << partitions[i].weight << "\n";
 
-            start = jl_hrtime();
+            timers[i].materialize.startTimer();
             materializePreserved(*M, partitions[i]);
-            end = jl_hrtime();
-            stderrs[i] << "Materialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
+            timers[i].materialize.stopTimer();
 
-            start = jl_hrtime();
+            timers[i].construct.startTimer();
             construct_vars(*M, partitions[i]);
             M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), "_" + std::to_string(i)));
-            end = jl_hrtime();
-
-            stderrs[i] << "Construction time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
+            timers[i].construct.stopTimer();
 
-            start = jl_hrtime();
+            timers[i].deletion.startTimer();
             dropUnusedDeclarations(*M);
-            end = jl_hrtime();
-
-            stderrs[i] << "Declaration deletion time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
+            timers[i].deletion.stopTimer();
 
-            start = jl_hrtime();
             add_output_impl(*M, TM, outstart + i * outcount, names,
                             unoptstart ? unoptstart + i : nullptr,
                             optstart ? optstart + i : nullptr,
                             objstart ? objstart + i : nullptr,
                             asmstart ? asmstart + i : nullptr,
-                            stderrs[i], i);
-            end = jl_hrtime();
-
-            stderrs[i] << "Output time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
+                            timers[i], i);
         });
     }
 
-    start = jl_hrtime();
     for (auto &w : workers)
         w.join();
-    for (auto &str : stderrs)
-        dbgs() << str.str();
-    end = jl_hrtime();
 
-    dbgs() << "Total time for parallel output: " << (end - start) / 1e9 << "s\n";
+    output_timer.stopTimer();
+
+    if (!report_timings) {
+        timer_group.clear();
+    } else {
+        timer_group.print(dbgs(), true);
+        for (auto &t : timers) {
+            t.print(dbgs(), true);
+        }
+    }
 }
 
 unsigned compute_image_thread_count(Module &M) {
     // 32-bit systems are very memory-constrained
 #ifdef _P32
-    dbgs() << "Threads: 1\n";
+    // dbgs() << "Threads: 1\n";
     return 1;
 #endif
     size_t weight = 0;
@@ -1121,10 +1214,10 @@ unsigned compute_image_thread_count(Module &M) {
             weight += 1;
         }
     }
-    dbgs() << "Module weight: " << weight << "\n";
+    // dbgs() << "Module weight: " << weight << "\n";
     if (weight < 1000) {
-        dbgs() << "Low module complexity bailout\n";
-        dbgs() << "Threads: 1\n";
+        // dbgs() << "Low module complexity bailout\n";
+        // dbgs() << "Threads: 1\n";
         return 1;
     }
 
@@ -1136,9 +1229,9 @@ unsigned compute_image_thread_count(Module &M) {
     // crude estimate, available / (weight * fudge factor) = max threads
     size_t fudge = 10;
     unsigned max_threads = std::max(available / (weight * fudge), (size_t)1);
-    dbgs() << "Available memory: " << available << " bytes\n";
-    dbgs() << "Max threads: " << max_threads << "\n";
-    dbgs() << "Temporarily disabling memory limiting threads\n";
+    // dbgs() << "Available memory: " << available << " bytes\n";
+    // dbgs() << "Max threads: " << max_threads << "\n";
+    // dbgs() << "Temporarily disabling memory limiting threads\n";
     //TODO reenable
     // if (max_threads < threads) {
     //     dbgs() << "Memory limiting threads to " << max_threads << "\n";
@@ -1147,7 +1240,7 @@ unsigned compute_image_thread_count(Module &M) {
 
     max_threads = globals / 100;
     if (max_threads < threads) {
-        dbgs() << "Low global count limiting threads to " << max_threads << " (" << globals << "globals)\n";
+        // dbgs() << "Low global count limiting threads to " << max_threads << " (" << globals << "globals)\n";
         threads = max_threads;
     }
 
@@ -1160,7 +1253,7 @@ unsigned compute_image_thread_count(Module &M) {
         if (*endptr || !requested) {
             jl_safe_printf("WARNING: invalid value '%s' for JULIA_IMAGE_THREADS\n", env_threads);
         } else {
-            dbgs() << "Overriding threads to " << requested << " due to JULIA_IMAGE_THREADS\n";
+            // dbgs() << "Overriding threads to " << requested << " due to JULIA_IMAGE_THREADS\n";
             threads = requested;
             env_threads_set = true;
         }
@@ -1168,18 +1261,13 @@ unsigned compute_image_thread_count(Module &M) {
 
     // more defaults
     if (!env_threads_set && threads > 1) {
-        if (jl_options.nthreads) {
-            if (static_cast<unsigned>(jl_options.nthreads) < threads) {
-                dbgs() << "Overriding threads to " << jl_options.nthreads << " due to -t option\n";
-                threads = jl_options.nthreads;
-            }
-        } else if (auto fallbackenv = getenv("JULIA_CPU_THREADS")) {
+        if (auto fallbackenv = getenv("JULIA_CPU_THREADS")) {
             char *endptr;
             unsigned long requested = strtoul(fallbackenv, &endptr, 10);
             if (*endptr || !requested) {
                 jl_safe_printf("WARNING: invalid value '%s' for JULIA_CPU_THREADS\n", fallbackenv);
             } else if (requested < threads) {
-                dbgs() << "Overriding threads to " << requested << " due to JULIA_CPU_THREADS\n";
+                // dbgs() << "Overriding threads to " << requested << " due to JULIA_CPU_THREADS\n";
                 threads = requested;
             }
         }
@@ -1187,7 +1275,7 @@ unsigned compute_image_thread_count(Module &M) {
 
     threads = std::max(threads, 1u);
 
-    dbgs() << "Threads: " << threads << "\n";
+    // dbgs() << "Threads: " << threads << "\n";
 
     return threads;
 }
@@ -1200,12 +1288,10 @@ void jl_dump_native_impl(void *native_code,
         const char *asm_fname,
         const char *sysimg_data, size_t sysimg_len, ios_t *s)
 {
-    uint64_t start = jl_hrtime();
-    uint64_t end = 0;
     JL_TIMING(NATIVE_DUMP);
     jl_native_code_desc_t *data = (jl_native_code_desc_t*)native_code;
     if (!bc_fname && !unopt_bc_fname && !obj_fname && !asm_fname) {
-        dbgs() << "No output requested, skipping native code dump?\n";
+        // dbgs() << "No output requested, skipping native code dump?\n";
         delete data;
         return;
     }
@@ -1265,12 +1351,6 @@ void jl_dump_native_impl(void *native_code,
 
     bool imaging_mode = imaging_default() || jl_options.outputo;
 
-    end = jl_hrtime();
-
-    dbgs() << "setup time: " << (end - start) / 1e9 << "s\n";
-
-    start = jl_hrtime();
-
     unsigned threads = 1;
     unsigned nfvars = 0;
     unsigned ngvars = 0;
@@ -1322,12 +1402,6 @@ void jl_dump_native_impl(void *native_code,
                                      "jl_RTLD_DEFAULT_handle_pointer"), TheTriple);
     }
 
-    end = jl_hrtime();
-
-    dbgs() << "metadata time: " << (end - start) / 1e9 << "s\n";
-
-    start = jl_hrtime();
-
     auto compile = [&](Module &M, ArrayRef<StringRef> names, unsigned threads) { add_output(
             M, *SourceTM, outputs, names,
             unopt_bc_Archive, bc_Archive, obj_Archive, asm_Archive,
@@ -1344,12 +1418,6 @@ void jl_dump_native_impl(void *native_code,
 
     compile(*dataM, text_names, threads);
 
-    end = jl_hrtime();
-
-    dbgs() << "text output time: " << (end - start) / 1e9 << "s\n";
-
-    start = jl_hrtime();
-
     auto sysimageM = std::make_unique<Module>("sysimage", Context);
     sysimageM->setTargetTriple(dataM->getTargetTriple());
     sysimageM->setDataLayout(dataM->getDataLayout());
@@ -1451,12 +1519,6 @@ void jl_dump_native_impl(void *native_code,
     };
     compile(*sysimageM, data_names, 1);
 
-    end = jl_hrtime();
-
-    dbgs() << "data module time: " << (end - start) / 1e9 << "s\n";
-
-    start = jl_hrtime();
-
     object::Archive::Kind Kind = getDefaultForHost(TheTriple);
     if (unopt_bc_fname)
         handleAllErrors(writeArchive(unopt_bc_fname, unopt_bc_Archive, true,
@@ -1471,10 +1533,6 @@ void jl_dump_native_impl(void *native_code,
         handleAllErrors(writeArchive(asm_fname, asm_Archive, true,
                     Kind, true, false), reportWriterError);
 
-    end = jl_hrtime();
-
-    dbgs() << "archive time: " << (end - start) / 1e9 << "s\n";
-
     delete data;
 }
 
diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index cd90699e05aad..42aa34d3bdb4f 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -915,8 +915,9 @@ static bool runMultiVersioning(Module &M, bool allow_bad_fvars)
     //     * Cloned function -> Original function (add as we clone functions)
     //     * Original function -> Base function (target specific and updated by LLVM)
     //     * ID -> relocation slots (const).
-    if (M.getName() == "sysimage")
+    if (!M.getModuleFlag("julia.mv.enable")) {
         return false;
+    }
 
     GlobalVariable *fvars = M.getGlobalVariable("jl_fvars");
     GlobalVariable *gvars = M.getGlobalVariable("jl_gvars");
@@ -986,6 +987,7 @@ static RegisterPass<MultiVersioningLegacy> X("JuliaMultiVersioning", "JuliaMulti
 void multiversioning_preannotate(Module &M)
 {
     annotate_module_clones(M);
+    M.addModuleFlag(Module::ModFlagBehavior::Error, "julia.mv.enable", 1);
 }
 
 void replaceUsesWithLoad(Function &F, function_ref<GlobalVariable *(Instruction &I)> should_replace, MDNode *tbaa_const) {

From a723211c3106d6eebfbbbb680615269995cab0ec Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Wed, 1 Feb 2023 03:16:18 -0500
Subject: [PATCH 22/34] Fix whitespace

---
 src/aotcompile.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 79e9ea07eb592..428f397c35aed 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -891,7 +891,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out
 #endif
     optimizer.run(M);
     assert(!verifyModule(M, &errs()));
-    
+
     timers.optimize.stopTimer();
 
     if (opt) {
@@ -1185,7 +1185,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
         w.join();
 
     output_timer.stopTimer();
-    
+
     if (!report_timings) {
         timer_group.clear();
     } else {

From 7cf839aaeb636e27b75c9857fe52913477ac1734 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Wed, 1 Feb 2023 06:30:22 -0500
Subject: [PATCH 23/34] Don't leave aliases to extern global objects

---
 src/aotcompile.cpp | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 428f397c35aed..fffc7839d74c9 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -938,6 +938,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out
 }
 
 static auto serializeModule(const Module &M) {
+    assert(!verifyModule(M, &errs()) && "Serializing invalid module!");
     SmallVector<char, 0> ClonedModuleBuffer;
     BitcodeWriter BCWriter(ClonedModuleBuffer);
     BCWriter.writeModule(M);
@@ -976,9 +977,16 @@ static void materializePreserved(Module &M, Partition &partition) {
         if (!GA.isDeclaration()) {
             if (!Preserve.contains(&GA)) {
                 if (GA.getValueType()->isFunctionTy()) {
-                    DeletedAliases.push_back({ &GA, Function::Create(cast<FunctionType>(GA.getValueType()), GlobalValue::ExternalLinkage, "", &M) });
+                    auto F = Function::Create(cast<FunctionType>(GA.getValueType()), GlobalValue::ExternalLinkage, "", &M);
+                    // This is an extremely sad hack to make sure the global alias never points to an extern function
+                    auto BB = BasicBlock::Create(M.getContext(), "", F);
+                    new UnreachableInst(M.getContext(), BB);
+                    GA.setAliasee(F);
+
+                    DeletedAliases.push_back({ &GA, F });
                 } else {
-                    DeletedAliases.push_back({ &GA, new GlobalVariable(M, GA.getValueType(), false, GlobalValue::ExternalLinkage, nullptr) });
+                    auto GV = new GlobalVariable(M, GA.getValueType(), false, GlobalValue::ExternalLinkage, Constant::getNullValue(GA.getValueType()));
+                    DeletedAliases.push_back({ &GA, GV });
                 }
             }
         }
@@ -988,6 +996,12 @@ static void materializePreserved(Module &M, Partition &partition) {
         Deleted.second->takeName(Deleted.first);
         Deleted.first->replaceAllUsesWith(Deleted.second);
         Deleted.first->eraseFromParent();
+        // undo our previous sad hack
+        if (auto F = dyn_cast<Function>(Deleted.second)) {
+            F->deleteBody();
+        } else {
+            cast<GlobalVariable>(Deleted.second)->setInitializer(nullptr);
+        }
     }
 }
 

From fa208d43a95e1336c6e793795aad8134cb72883b Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Wed, 1 Feb 2023 10:41:27 -0500
Subject: [PATCH 24/34] Break multiversioning's dependency on
 jl_get_llvm_clone_targets

---
 src/llvm-multiversioning.cpp | 110 ++++++++++++++++++++++++++++++-----
 1 file changed, 96 insertions(+), 14 deletions(-)

diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index 42aa34d3bdb4f..b4f67ebe22c7d 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -140,6 +140,64 @@ static uint32_t collect_func_info(Function &F, bool &has_veccall)
     return flag;
 }
 
+struct TargetSpec {
+    std::string cpu_name;
+    std::string cpu_features;
+    uint32_t base;
+    uint32_t flags;
+
+    TargetSpec() = default;
+
+    static TargetSpec fromSpec(jl_target_spec_t &spec) {
+        TargetSpec out;
+        out.cpu_name = spec.cpu_name;
+        out.cpu_features = spec.cpu_features;
+        out.base = spec.base;
+        out.flags = spec.flags;
+        return out;
+    }
+
+    static TargetSpec fromMD(MDTuple *tup) {
+        TargetSpec out;
+        assert(tup->getNumOperands() == 4);
+        out.cpu_name = cast<MDString>(tup->getOperand(0))->getString().str();
+        out.cpu_features = cast<MDString>(tup->getOperand(1))->getString().str();
+        out.base = cast<ConstantInt>(cast<ConstantAsMetadata>(tup->getOperand(2))->getValue())->getZExtValue();
+        out.flags = cast<ConstantInt>(cast<ConstantAsMetadata>(tup->getOperand(3))->getValue())->getZExtValue();
+        return out;
+    }
+
+    MDNode *toMD(LLVMContext &ctx) const {
+        return MDTuple::get(ctx, {
+            MDString::get(ctx, cpu_name),
+            MDString::get(ctx, cpu_features),
+            ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(ctx), base)),
+            ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(ctx), flags))
+        });
+    }
+};
+
+static Optional<std::vector<TargetSpec>> get_target_specs(Module &M) {
+    auto md = M.getModuleFlag("julia.mv.specs");
+    if (!md)
+        return None;
+    auto tup = cast<MDTuple>(md);
+    std::vector<TargetSpec> out(tup->getNumOperands());
+    for (unsigned i = 0; i < tup->getNumOperands(); i++) {
+        out[i] = TargetSpec::fromMD(cast<MDTuple>(tup->getOperand(i).get()));
+    }
+    return out;
+}
+
+static void set_target_specs(Module &M, ArrayRef<TargetSpec> specs) {
+    std::vector<Metadata *> md;
+    md.reserve(specs.size());
+    for (auto &spec: specs) {
+        md.push_back(spec.toMD(M.getContext()));
+    }
+    M.addModuleFlag(Module::Error, "julia.mv.specs", MDTuple::get(M.getContext(), md));
+}
+
 static void annotate_module_clones(Module &M) {
     CallGraph CG(M);
     std::vector<Function *> orig_funcs;
@@ -149,7 +207,17 @@ static void annotate_module_clones(Module &M) {
         orig_funcs.push_back(&F);
     }
     bool has_veccall = false;
-    auto specs = jl_get_llvm_clone_targets();
+    std::vector<TargetSpec> specs;
+    if (auto maybe_specs = get_target_specs(M)) {
+        specs = std::move(*maybe_specs);
+    } else {
+        auto full_specs = jl_get_llvm_clone_targets();
+        specs.reserve(full_specs.size());
+        for (auto &spec: full_specs) {
+            specs.push_back(TargetSpec::fromSpec(spec));
+        }
+        set_target_specs(M, specs);
+    }
     std::vector<APInt> clones(orig_funcs.size(), APInt(specs.size(), 0));
     BitVector subtarget_cloned(orig_funcs.size());
 
@@ -255,6 +323,7 @@ static void annotate_module_clones(Module &M) {
     if (has_veccall) {
         M.addModuleFlag(Module::Max, "julia.mv.veccall", 1);
     }
+    M.addModuleFlag(Module::Error, "julia.mv.annotated", 1);
 }
 
 struct CloneCtx {
@@ -305,7 +374,7 @@ struct CloneCtx {
     void rewrite_alias(GlobalAlias *alias, Function* F);
 
     MDNode *tbaa_const;
-    std::vector<jl_target_spec_t> specs;
+    std::vector<TargetSpec> specs;
     std::vector<Group> groups{};
     std::vector<Target *> linearized;
     std::vector<Function*> fvars;
@@ -362,7 +431,7 @@ static inline std::vector<T*> consume_gv(Module &M, const char *name, bool allow
 // Collect basic information about targets and functions.
 CloneCtx::CloneCtx(Module &M, bool allow_bad_fvars)
     : tbaa_const(tbaa_make_child_with_context(M.getContext(), "jtbaa_const", nullptr, true).first),
-      specs(jl_get_llvm_clone_targets()),
+      specs(*get_target_specs(M)),
       fvars(consume_gv<Function>(M, "jl_fvars", allow_bad_fvars)),
       gvars(consume_gv<Constant>(M, "jl_gvars", false)),
       M(M),
@@ -473,24 +542,24 @@ static void clone_function(Function *F, Function *new_f, ValueToValueMapTy &vmap
 #endif
 }
 
-static void add_features(Function *F, StringRef name, StringRef features, uint32_t flags)
+static void add_features(Function *F, TargetSpec &spec)
 {
     auto attr = F->getFnAttribute("target-features");
     if (attr.isStringAttribute()) {
         std::string new_features(attr.getValueAsString());
         new_features += ",";
-        new_features += features;
+        new_features += spec.cpu_features;
         F->addFnAttr("target-features", new_features);
     }
     else {
-        F->addFnAttr("target-features", features);
+        F->addFnAttr("target-features", spec.cpu_features);
     }
-    F->addFnAttr("target-cpu", name);
+    F->addFnAttr("target-cpu", spec.cpu_name);
     if (!F->hasFnAttribute(Attribute::OptimizeNone)) {
-        if (flags & JL_TARGET_OPTSIZE) {
+        if (spec.flags & JL_TARGET_OPTSIZE) {
             F->addFnAttr(Attribute::OptimizeForSize);
         }
-        else if (flags & JL_TARGET_MINSIZE) {
+        else if (spec.flags & JL_TARGET_MINSIZE) {
             F->addFnAttr(Attribute::MinSize);
         }
     }
@@ -514,18 +583,19 @@ void CloneCtx::clone_bodies()
                     if (!F->isDeclaration()) {
                         clone_function(group_F, target_F, *target.vmap);
                     }
-                    add_features(target_F, specs[target.idx].cpu_name,
-                                specs[target.idx].cpu_features, specs[target.idx].flags);
+                    add_features(target_F, specs[target.idx]);
                     target_F->addFnAttr("julia.mv.clone", std::to_string(i));
                 }
             }
+            // don't set the original function's features yet,
+            // since we may clone it for later groups
             if (i != 0) {
-                //TODO should we also do this for target 0?
-                add_features(group_F, specs[groups[i].idx].cpu_name,
-                            specs[groups[i].idx].cpu_features, specs[groups[i].idx].flags);
+                add_features(group_F, specs[groups[i].idx]);
             }
             group_F->addFnAttr("julia.mv.clone", std::to_string(i));
         }
+        // Add features to the original function
+        add_features(F, specs[0]);
     }
 }
 
@@ -919,6 +989,18 @@ static bool runMultiVersioning(Module &M, bool allow_bad_fvars)
         return false;
     }
 
+    // for opt testing purposes
+    bool annotated = !!M.getModuleFlag("julia.mv.annotated");
+    if (!annotated) {
+        annotate_module_clones(M);
+    }
+
+    // also for opt testing purposes
+    if (M.getModuleFlag("julia.mv.skipcloning")) {
+        assert(!annotated && "Multiversioning was enabled and annotations were added, but cloning was skipped!");
+        return true;
+    }
+
     GlobalVariable *fvars = M.getGlobalVariable("jl_fvars");
     GlobalVariable *gvars = M.getGlobalVariable("jl_gvars");
     if (allow_bad_fvars && (!fvars || !fvars->hasInitializer() || !isa<ConstantArray>(fvars->getInitializer()) ||

From 3dcd1a23ac16748daa6770d44bf0825fa3981767 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Wed, 1 Feb 2023 12:50:33 -0500
Subject: [PATCH 25/34] Add multiversioning annotation test

---
 .../multiversioning-annotate-only.ll          | 217 ++++++++++++++++++
 1 file changed, 217 insertions(+)
 create mode 100644 test/llvmpasses/multiversioning-annotate-only.ll

diff --git a/test/llvmpasses/multiversioning-annotate-only.ll b/test/llvmpasses/multiversioning-annotate-only.ll
new file mode 100644
index 0000000000000..38af146c078f5
--- /dev/null
+++ b/test/llvmpasses/multiversioning-annotate-only.ll
@@ -0,0 +1,217 @@
+; RUN: opt -enable-new-pm=0 -load libjulia-codegen%shlibext -JuliaMultiVersioning -S %s | FileCheck %s
+; RUN: opt -enable-new-pm=1 --load-pass-plugin=libjulia-codegen%shlibext -passes='JuliaMultiVersioning' -S %s | FileCheck %s
+
+; COM: This test checks that multiversioning correctly picks up on features that should trigger cloning
+; COM: Note that for annotations alone, we don't need jl_fvars or jl_gvars
+
+; COM: Copied from src/processor.h
+; COM:    JL_TARGET_VEC_CALL = 1 << 0,
+; COM:    // Clone all functions
+; COM:    JL_TARGET_CLONE_ALL = 1 << 1,
+; COM:    // Clone when there's scalar math operations that can benefit from target-specific
+; COM:    // optimizations. This includes `muladd`, `fma`, `fast`/`contract` flags.
+; COM:    JL_TARGET_CLONE_MATH = 1 << 2,
+; COM:    // Clone when the function has a loop
+; COM:    JL_TARGET_CLONE_LOOP = 1 << 3,
+; COM:    // Clone when the function uses any vectors
+; COM:    // When this is specified, the cloning pass should also record if any of the cloned functions
+; COM:    // used this in any function call (including the signature of the function itself)
+; COM:    JL_TARGET_CLONE_SIMD = 1 << 4,
+; COM:    // The CPU name is unknown
+; COM:    JL_TARGET_UNKNOWN_NAME = 1 << 5,
+; COM:    // Optimize for size for this target
+; COM:    JL_TARGET_OPTSIZE = 1 << 6,
+; COM:    // Only optimize for size for this target
+; COM:    JL_TARGET_MINSIZE = 1 << 7,
+; COM:    // Clone when the function queries CPU features
+; COM:    JL_TARGET_CLONE_CPU = 1 << 8,
+; COM:    // Clone when the function uses fp16
+; COM:    JL_TARGET_CLONE_FLOAT16 = 1 << 9,
+
+; COM: start with the basics, just one feature per function
+
+; COM: boring should only be cloned if clone_all is enabled on the target
+; CHECK: @boring{{.*}}#[[BORING_ATTRS:[0-9]+]]
+define noundef i32 @boring(i32 noundef %0) {
+  ret i32 %0
+}
+
+; CHECK: @fastmath_test{{.*}}#[[FASTMATH_TEST_ATTRS:[0-9]+]]
+define noundef float @fastmath_test(float noundef %0, float noundef %1) {
+  %3 = fadd fast float %0, %1
+  ret float %3
+}
+
+; CHECK: @loop_test{{.*}}#[[LOOP_TEST_ATTRS:[0-9]+]]
+define noundef i32 @loop_test(i32 noundef %0) {
+  %2 = icmp sgt i32 %0, 0
+  br i1 %2, label %5, label %3
+
+3:                                                ; preds = %5, %1
+  %4 = phi i32 [ 0, %1 ], [ %9, %5 ]
+  ret i32 %4
+
+5:                                                ; preds = %1, %5
+  %6 = phi i32 [ %10, %5 ], [ 0, %1 ]
+  %7 = phi i32 [ %9, %5 ], [ 0, %1 ]
+  %8 = lshr i32 %6, 1
+  %9 = add nuw nsw i32 %8, %7
+  %10 = add nuw nsw i32 %6, 1
+  %11 = icmp eq i32 %10, %0
+  br i1 %11, label %3, label %5, !llvm.loop !9
+}
+
+; CHECK: @simd_test{{.*}}#[[SIMD_TEST_ATTRS:[0-9]+]]
+define noundef i32 @simd_test(<4 x i32> noundef %0) {
+  %2 = extractelement <4 x i32> %0, i64 0
+  ret i32 %2
+}
+
+; COM: now check all the combinations
+
+; CHECK: @simd_fastmath_test{{.*}}#[[SIMD_FASTMATH_TEST_ATTRS:[0-9]+]]
+define noundef float @simd_fastmath_test(<4 x float> noundef %0) {
+  %2 = extractelement <4 x float> %0, i64 0
+  %3 = extractelement <4 x float> %0, i64 1
+  %4 = fadd fast float %2, %3
+  ret float %4
+}
+
+; CHECK: @loop_fastmath_test{{.*}}#[[LOOP_FASTMATH_TEST_ATTRS:[0-9]+]]
+define noundef i32 @loop_fastmath_test(i32 noundef %0) {
+  %2 = icmp sgt i32 %0, 0
+  br i1 %2, label %7, label %5
+
+3:                                                ; preds = %7
+  %4 = fptosi float %12 to i32
+  br label %5
+
+5:                                                ; preds = %3, %1
+  %6 = phi i32 [ 0, %1 ], [ %4, %3 ]
+  ret i32 %6
+
+7:                                                ; preds = %1, %7
+  %8 = phi i32 [ %13, %7 ], [ 0, %1 ]
+  %9 = phi float [ %12, %7 ], [ 0.000000e+00, %1 ]
+  %10 = lshr i32 %8, 1
+  %11 = sitofp i32 %10 to float
+  %12 = fadd fast float %9, %11
+  %13 = add nuw nsw i32 %8, 1
+  %14 = icmp eq i32 %13, %0
+  br i1 %14, label %3, label %7, !llvm.loop !9
+}
+
+; CHECK: @simd_loop_test{{.*}}#[[SIMD_LOOP_TEST_ATTRS:[0-9]+]]
+define dso_local noundef i32 @simd_loop_test(<4 x i32> noundef %0) {
+  %2 = extractelement <4 x i32> %0, i64 0
+  %3 = icmp sgt i32 %2, 0
+  br i1 %3, label %6, label %4
+
+4:                                                ; preds = %6, %1
+  %5 = phi i32 [ 0, %1 ], [ %10, %6 ]
+  ret i32 %5
+
+6:                                                ; preds = %1, %6
+  %7 = phi i32 [ %11, %6 ], [ 0, %1 ]
+  %8 = phi i32 [ %10, %6 ], [ 0, %1 ]
+  %9 = lshr i32 %7, 1
+  %10 = add nuw nsw i32 %9, %8
+  %11 = add nuw nsw i32 %7, 1
+  %12 = icmp eq i32 %11, %2
+  br i1 %12, label %4, label %6, !llvm.loop !9
+}
+
+; CHECK: @simd_loop_fastmath_test{{.*}}#[[SIMD_LOOP_FASTMATH_TEST_ATTRS:[0-9]+]]
+define noundef i32 @simd_loop_fastmath_test(<4 x i32> noundef %0) {
+  %2 = extractelement <4 x i32> %0, i64 0
+  %3 = icmp sgt i32 %2, 0
+  br i1 %3, label %8, label %6
+
+4:                                                ; preds = %8
+  %5 = fptosi float %13 to i32
+  br label %6
+
+6:                                                ; preds = %4, %1
+  %7 = phi i32 [ 0, %1 ], [ %5, %4 ]
+  ret i32 %7
+
+8:                                                ; preds = %1, %8
+  %9 = phi i32 [ %14, %8 ], [ 0, %1 ]
+  %10 = phi float [ %13, %8 ], [ 0.000000e+00, %1 ]
+  %11 = lshr i32 %9, 1
+  %12 = sitofp i32 %11 to float
+  %13 = fadd fast float %10, %12
+  %14 = add nuw nsw i32 %9, 1
+  %15 = icmp eq i32 %14, %2
+  br i1 %15, label %4, label %8, !llvm.loop !9
+}
+
+; COM: check for fvar and reloc annotations on functions used by other globals
+
+@func_gv = global i32 (i32)* @func_in_gv, align 8
+
+; CHECK: @func_in_gv{{.*}}#[[FUNC_IN_GV_ATTRS:[0-9]+]]
+define noundef i32 @func_in_gv(i32 noundef returned %0) {
+  ret i32 %0
+}
+
+@aliaser = alias i32 (i32)*, bitcast (i32 (i32)* @aliasee to i32 (i32)**)
+
+; CHECK: @aliasee{{.*}}#[[ALIASEE_ATTRS:[0-9]+]]
+define i32 @aliasee(i32 noundef returned %0) {
+  ret i32 %0
+}
+
+; COM: check for reloc annotations on functions used by other functions
+; CHECK: @cloned{{.*}}#[[CLONED_RELOC_ATTRS:[0-9]+]]
+define noundef float @cloned(float noundef %0, float noundef %1) {
+  %3 = fadd fast float %0, %1
+  ret float %3
+}
+
+define noundef i32 @uncloned(i32 noundef %0) {
+  %2 = sitofp i32 %0 to float
+  %3 = call noundef float @cloned(float noundef %2, float noundef %2)
+  %4 = fptosi float %3 to i32
+  ret i32 %4
+}
+
+; COM: Note that these strings are hex-encoded bits of the target indices that will be cloned
+; CHECK-DAG: attributes #[[BORING_ATTRS]] = { "julia.mv.clones"="2" }
+; CHECK-DAG: attributes #[[FASTMATH_TEST_ATTRS]] = { "julia.mv.clones"="6" }
+; CHECK-DAG: attributes #[[LOOP_TEST_ATTRS]] = { "julia.mv.clones"="A" }
+; CHECK-DAG: attributes #[[SIMD_TEST_ATTRS]] = { "julia.mv.clones"="12" }
+; CHECK-DAG: attributes #[[SIMD_FASTMATH_TEST_ATTRS]] = { "julia.mv.clones"="16" }
+; CHECK-DAG: attributes #[[LOOP_FASTMATH_TEST_ATTRS]] = { "julia.mv.clones"="E" }
+; CHECK-DAG: attributes #[[SIMD_LOOP_TEST_ATTRS]] = { "julia.mv.clones"="1A" }
+; CHECK-DAG: attributes #[[SIMD_LOOP_FASTMATH_TEST_ATTRS]] = { "julia.mv.clones"="1E" }
+; CHECK-DAG: attributes #[[FUNC_IN_GV_ATTRS]]
+; CHECK-SAME: "julia.mv.clones"="2"
+; CHECK-SAME: "julia.mv.fvar"
+; CHECK-DAG: attributes #[[ALIASEE_ATTRS]]
+; CHECK-SAME: "julia.mv.clones"="2"
+; CHECK-SAME: "julia.mv.reloc"
+; CHECK-DAG: attributes #[[CLONED_RELOC_ATTRS]]
+; CHECK-SAME: "julia.mv.clones"="6"
+; CHECK-SAME: "julia.mv.reloc"
+
+; CHECK-LABEL: !llvm.module.flags
+
+!llvm.module.flags = !{!0, !1, !2}
+
+; CHECK-DAG: julia.mv.enable
+; CHECK-DAG: julia.mv.skipcloning
+; CHECK-DAG: julia.mv.specs
+; CHECK-DAG: julia.mv.annotated
+; CHECK-DAG: julia.mv.veccall
+
+!0 = !{i32 1, !"julia.mv.enable", i32 1}
+!1 = !{i32 1, !"julia.mv.skipcloning", i32 1}
+!2 = !{i32 1, !"julia.mv.specs", !3}
+!3 = !{!4, !5, !6, !7, !8}
+!4 = !{!"cpubase", !"nofeatures", i32 0, i32 2}
+!5 = !{!"cpucloneall", !"cloneall", i32 0, i32 2}
+!6 = !{!"cpufastmath", !"fastmathclone", i32 0, i32 4}
+!7 = !{!"cpuloop", !"loopclone", i32 0, i32 8}
+!8 = !{!"cpusimd", !"simdclone", i32 0, i32 16}
+!9 = !{!9}

From b3d3ffbc3384451819aa9d1886f9c7230969411e Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Thu, 2 Feb 2023 10:48:52 -0500
Subject: [PATCH 26/34] Couple more tests for multiversioning

---
 test/llvmpasses/multiversioning-clone-only.ll | 50 +++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 test/llvmpasses/multiversioning-clone-only.ll

diff --git a/test/llvmpasses/multiversioning-clone-only.ll b/test/llvmpasses/multiversioning-clone-only.ll
new file mode 100644
index 0000000000000..61bcdb8613306
--- /dev/null
+++ b/test/llvmpasses/multiversioning-clone-only.ll
@@ -0,0 +1,50 @@
+; RUN: opt -enable-new-pm=0 -load libjulia-codegen%shlibext -JuliaMultiVersioning -S %s | FileCheck %s --allow-unused-prefixes=false
+; RUN: opt -enable-new-pm=1 --load-pass-plugin=libjulia-codegen%shlibext -passes='JuliaMultiVersioning' -S %s | FileCheck %s --allow-unused-prefixes=false
+
+@jl_fvars = global [0 x i64] zeroinitializer, align 16
+@jl_gvars = global [0 x i64] zeroinitializer, align 16
+@jl_fvar_idxs = global [0 x i32] zeroinitializer, align 16
+@jl_gvar_idxs = global [0 x i32] zeroinitializer, align 16
+
+; CHECK-DAG: define{{.*}}@boring({{.*}}#[[BORING_DEFAULT_ATTRS:[0-9]+]]
+; CHECK-NEXT: ret i32 %0
+; CHECK-DAG: define{{.*}}@boring.1({{.*}}#[[BORING_CLONEALL_ATTRS:[0-9]+]]
+; CHECK-NEXT: ret i32 %0
+define noundef i32 @boring(i32 noundef %0) #0 {
+    ret i32 %0
+}
+
+; CHECK-DAG: declare{{.*}}@declaration({{.*}}#[[DECLARATION_DEFAULT_ATTRS:[0-9]+]]
+; CHECK-DAG: declare{{.*}}@declaration.1({{.*}}#[[DECLARATION_CLONEALL_ATTRS:[0-9]+]]
+declare i32 @declaration(i32 %0) #1
+
+; CHECK: }
+
+; CHECK-DAG: attributes #[[BORING_DEFAULT_ATTRS]]
+; CHECK-SAME: {
+; CHECK-DAG: "julia.mv.clones"="2"
+; CHECK-DAG: "julia.mv.clone"="0"
+; CHECK-DAG: "target-cpu"="cpubase"
+; CHECK-DAG: "target-features"="nofeatures"
+; CHECK-SAME: }
+; CHECK-DAG: attributes #[[BORING_CLONEALL_ATTRS]]
+; CHECK-SAME: {
+; CHECK-DAG: "julia.mv.clones"="2"
+; CHECK-DAG: "julia.mv.clone"="1"
+; CHECK-DAG: "target-cpu"="cpucloneall"
+; CHECK-DAG: "target-features"="cloneall"
+; CHECK-SAME: }
+attributes #0 = {"julia.mv.clones"="2"}
+attributes #1 = {"julia.mv.clones"="2" "test.unique"="1"}
+
+!llvm.module.flags = !{!0, !1, !2}
+
+!0 = !{i32 1, !"julia.mv.enable", i32 1}
+!1 = !{i32 1, !"julia.mv.annotated", i32 1}
+!2 = !{i32 1, !"julia.mv.specs", !3}
+!3 = !{!4, !5, !6, !7, !8}
+!4 = !{!"cpubase", !"nofeatures", i32 0, i32 2}
+!5 = !{!"cpucloneall", !"cloneall", i32 0, i32 2}
+!6 = !{!"cpufastmath", !"fastmathclone", i32 0, i32 4}
+!7 = !{!"cpuloop", !"loopclone", i32 0, i32 8}
+!8 = !{!"cpusimd", !"simdclone", i32 0, i32 16}
\ No newline at end of file

From e75e362dc936b2bf98028e887ec075f50c928c6b Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Thu, 2 Feb 2023 11:01:51 -0500
Subject: [PATCH 27/34] Inject CRT aliases with internal linkage within every
 shard

---
 src/aotcompile.cpp | 44 +++++++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index fffc7839d74c9..5e8618d637b3e 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -485,8 +485,7 @@ static void injectCRTAlias(Module &M, StringRef name, StringRef alias, FunctionT
     if (!target) {
         target = Function::Create(FT, Function::ExternalLinkage, alias, M);
     }
-    Function *interposer = Function::Create(FT, Function::ExternalLinkage, name, M);
-    interposer->setVisibility(GlobalValue::HiddenVisibility);
+    Function *interposer = Function::Create(FT, Function::InternalLinkage, name, M);
     appendToCompilerUsed(M, {interposer});
 
     llvm::IRBuilder<> builder(BasicBlock::Create(M.getContext(), "top", interposer));
@@ -891,6 +890,30 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out
 #endif
     optimizer.run(M);
     assert(!verifyModule(M, &errs()));
+    bool inject_aliases = false;
+    for (auto &F : M.functions()) {
+        if (!F.isDeclaration() && F.getName() != "_DllMainCRTStartup") {
+            inject_aliases = true;
+            break;
+        }
+    }
+    // no need to inject aliases if we have no functions
+    if (inject_aliases) {
+        // We would like to emit an alias or a weakref alias to redirect these symbols
+        // but LLVM doesn't let us emit a GlobalAlias to a declaration...
+        // So for now we inject a definition of these functions that calls our runtime
+        // functions. We do so after optimization to avoid cloning these functions.
+        injectCRTAlias(M, "__gnu_h2f_ieee", "julia__gnu_h2f_ieee",
+                FunctionType::get(Type::getFloatTy(M.getContext()), { Type::getHalfTy(M.getContext()) }, false));
+        injectCRTAlias(M, "__extendhfsf2", "julia__gnu_h2f_ieee",
+                FunctionType::get(Type::getFloatTy(M.getContext()), { Type::getHalfTy(M.getContext()) }, false));
+        injectCRTAlias(M, "__gnu_f2h_ieee", "julia__gnu_f2h_ieee",
+                FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getFloatTy(M.getContext()) }, false));
+        injectCRTAlias(M, "__truncsfhf2", "julia__gnu_f2h_ieee",
+                FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getFloatTy(M.getContext()) }, false));
+        injectCRTAlias(M, "__truncdfhf2", "julia__truncdfhf2",
+                FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getDoubleTy(M.getContext()) }, false));
+    }
 
     timers.optimize.stopTimer();
 
@@ -1440,23 +1463,6 @@ void jl_dump_native_impl(void *native_code,
     sysimageM->setOverrideStackAlignment(dataM->getOverrideStackAlignment());
 #endif
 
-    if (!TheTriple.isOSDarwin()) {
-        // We would like to emit an alias or an weakref alias to redirect these symbols
-        // but LLVM doesn't let us emit a GlobalAlias to a declaration...
-        // So for now we inject a definition of these functions that calls our runtime
-        // functions. We do so after optimization to avoid cloning these functions.
-        injectCRTAlias(*sysimageM, "__gnu_h2f_ieee", "julia__gnu_h2f_ieee",
-                FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false));
-        injectCRTAlias(*sysimageM, "__extendhfsf2", "julia__gnu_h2f_ieee",
-                FunctionType::get(Type::getFloatTy(Context), { Type::getHalfTy(Context) }, false));
-        injectCRTAlias(*sysimageM, "__gnu_f2h_ieee", "julia__gnu_f2h_ieee",
-                FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false));
-        injectCRTAlias(*sysimageM, "__truncsfhf2", "julia__gnu_f2h_ieee",
-                FunctionType::get(Type::getHalfTy(Context), { Type::getFloatTy(Context) }, false));
-        injectCRTAlias(*sysimageM, "__truncdfhf2", "julia__truncdfhf2",
-                FunctionType::get(Type::getHalfTy(Context), { Type::getDoubleTy(Context) }, false));
-    }
-
     if (TheTriple.isOSWindows()) {
         // Windows expect that the function `_DllMainStartup` is present in an dll.
         // Normal compilers use something like Zig's crtdll.c instead we provide a

From 65e6de2a6265243cb750b3551d5ac86029e7ffad Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Fri, 3 Feb 2023 00:58:25 -0500
Subject: [PATCH 28/34] Expand on the multiversioning tests

---
 src/llvm-multiversioning.cpp                  |  42 +++-
 test/llvmpasses/multiversioning-clone-only.ll | 193 ++++++++++++++++--
 2 files changed, 216 insertions(+), 19 deletions(-)

diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index b4f67ebe22c7d..6e9bbe85aa7f6 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -364,7 +364,9 @@ struct CloneCtx {
     void clone_decls();
     void clone_bodies();
     void fix_gv_uses();
+    void finalize_orig_clone_attr();
     void fix_inst_uses();
+    void finalize_orig_features();
     void emit_metadata();
 private:
     void prepare_vmap(ValueToValueMapTy &vmap);
@@ -399,6 +401,8 @@ static inline std::vector<T*> consume_gv(Module &M, const char *name, bool allow
     // Strip them from the Module so that it's easier to handle the uses.
     GlobalVariable *gv = M.getGlobalVariable(name);
     assert(gv && gv->hasInitializer());
+    dbgs() << "Consume " << *gv << ":\n";
+    dbgs() << *gv->getType() << "\n";
     ArrayType *Ty = cast<ArrayType>(gv->getInitializer()->getType());
     unsigned nele = Ty->getArrayNumElements();
     std::vector<T*> res(nele);
@@ -417,6 +421,7 @@ static inline std::vector<T*> consume_gv(Module &M, const char *name, bool allow
                 nele--;
                 continue;
             }
+            dbgs() << *val << ": " << *val->getType() << "\n";
             res[i++] = cast<T>(val);
         }
         res.resize(nele);
@@ -584,18 +589,20 @@ void CloneCtx::clone_bodies()
                         clone_function(group_F, target_F, *target.vmap);
                     }
                     add_features(target_F, specs[target.idx]);
-                    target_F->addFnAttr("julia.mv.clone", std::to_string(i));
+                    target_F->addFnAttr("julia.mv.clone", std::to_string(target.idx));
                 }
             }
             // don't set the original function's features yet,
             // since we may clone it for later groups
             if (i != 0) {
                 add_features(group_F, specs[groups[i].idx]);
+                group_F->addFnAttr("julia.mv.clone", std::to_string(groups[i].idx));
             }
-            group_F->addFnAttr("julia.mv.clone", std::to_string(i));
         }
-        // Add features to the original function
-        add_features(F, specs[0]);
+        // still don't set the original function's features yet,
+        // since we'll copy function attributes if we need to rewrite
+        // the alias, and target specific attributes are illegal on
+        // alias trampolines unless the user explicitly specifies them
     }
 }
 
@@ -658,6 +665,11 @@ void CloneCtx::rewrite_alias(GlobalAlias *alias, Function *F)
         Function::Create(F->getFunctionType(), alias->getLinkage(), "", &M);
     trampoline->copyAttributesFrom(F);
     trampoline->takeName(alias);
+    trampoline->setVisibility(alias->getVisibility());
+    // drop multiversioning attributes, add alias attribute for testing purposes
+    trampoline->removeFnAttr("julia.mv.reloc");
+    trampoline->removeFnAttr("julia.mv.clones");
+    trampoline->addFnAttr("julia.mv.alias");
     alias->eraseFromParent();
 
     uint32_t id;
@@ -727,6 +739,15 @@ void CloneCtx::fix_gv_uses()
     }
 }
 
+void CloneCtx::finalize_orig_clone_attr()
+{
+    for (auto orig_f: orig_funcs) {
+        if (!orig_f->hasFnAttribute("julia.mv.clones"))
+            continue;
+        orig_f->addFnAttr("julia.mv.clone", "0");
+    }
+}
+
 std::pair<uint32_t,GlobalVariable*> CloneCtx::get_reloc_slot(Function *F) const
 {
     if (F->isDeclaration()) {
@@ -814,6 +835,12 @@ void CloneCtx::fix_inst_uses()
     }
 }
 
+void CloneCtx::finalize_orig_features() {
+    for (auto F : orig_funcs) {
+        add_features(F, specs[0]);
+    }
+}
+
 static Constant *get_ptrdiff32(Constant *ptr, Constant *base)
 {
     if (ptr->getType()->isPointerTy())
@@ -1021,6 +1048,10 @@ static bool runMultiVersioning(Module &M, bool allow_bad_fvars)
     // These relocations must be initialized for **ALL** targets.
     clone.fix_gv_uses();
 
+    // Now we have all the cloned functions, we can set the original functions'
+    // clone attribute to be 0
+    clone.finalize_orig_clone_attr();
+
     // For each group, scan all functions cloned by **PARTIALLY** cloned targets for
     // instruction use.
     // A function needs a const relocation slot if it is cloned and is called by a
@@ -1031,6 +1062,9 @@ static bool runMultiVersioning(Module &M, bool allow_bad_fvars)
     // A target needs a slot to be initialized iff at least one caller is not initialized.
     clone.fix_inst_uses();
 
+    // Now set the original functions' target-specific attributes, since nobody will look at those again.
+    clone.finalize_orig_features();
+
     // Store back sysimg information with the correct format.
     // At this point, we should have fixed up all the uses of the cloned functions
     // and collected all the shared/target-specific relocations.
diff --git a/test/llvmpasses/multiversioning-clone-only.ll b/test/llvmpasses/multiversioning-clone-only.ll
index 61bcdb8613306..a5c327548d702 100644
--- a/test/llvmpasses/multiversioning-clone-only.ll
+++ b/test/llvmpasses/multiversioning-clone-only.ll
@@ -1,41 +1,202 @@
 ; RUN: opt -enable-new-pm=0 -load libjulia-codegen%shlibext -JuliaMultiVersioning -S %s | FileCheck %s --allow-unused-prefixes=false
 ; RUN: opt -enable-new-pm=1 --load-pass-plugin=libjulia-codegen%shlibext -passes='JuliaMultiVersioning' -S %s | FileCheck %s --allow-unused-prefixes=false
 
-@jl_fvars = global [0 x i64] zeroinitializer, align 16
-@jl_gvars = global [0 x i64] zeroinitializer, align 16
-@jl_fvar_idxs = global [0 x i32] zeroinitializer, align 16
-@jl_gvar_idxs = global [0 x i32] zeroinitializer, align 16
-
-; CHECK-DAG: define{{.*}}@boring({{.*}}#[[BORING_DEFAULT_ATTRS:[0-9]+]]
-; CHECK-DAG-NEXT: ret i32 %0
-; CHECK-DAG: define{{.*}}@boring.1({{.*}}#[[BORING_CLONEALL_ATTRS:[0-9]+]]
-; CHECK-DAG-NEXT: ret i32 %0
+; CHECK: @jl_fvar_idxs = hidden constant [1 x i32] zeroinitializer
+; CHECK: @jl_gvar_idxs = hidden constant [0 x i32] zeroinitializer
+; CHECK: @subtarget_cloned_gv = hidden global i64* null
+; CHECK: @subtarget_cloned.reloc_slot = hidden global i32 (i32)* null
+; CHECK: @jl_fvar_offsets = hidden constant [2 x i32] [i32 1, i32 0]
+; CHECK: @jl_gvar_base = hidden constant i64 0
+; CHECK: @jl_gvar_offsets = hidden constant [1 x i32] zeroinitializer
+; CHECK: @jl_clone_slots = hidden constant [5 x i32]
+; CHECK-SAME: i32 2, i32 0, {{.*}} sub {{.*}}@subtarget_cloned.reloc_slot{{.*}}@jl_gvar_base
+; CHECK: @jl_clone_idxs = hidden constant [13 x i32]
+; COM: TODO actually check the clone idxs maybe?
+; CHECK: @jl_clone_offsets = hidden constant [4 x i32]
+; CHECK-SAME: sub
+; CHECK-SAME: @subtarget_cloned.1
+; CHECK-SAME: @subtarget_cloned
+; CHECK-SAME: sub
+; CHECK-SAME: @subtarget_cloned.2
+; CHECK-SAME: @subtarget_cloned
+; CHECK-SAME: sub
+
+@jl_fvars = global [1 x i64*] [i64* bitcast (i32 (i32)* @subtarget_cloned to i64*)], align 16
+@jl_gvars = global [0 x i64*] zeroinitializer, align 16
+@jl_fvar_idxs = hidden constant [1 x i32] [i32 0], align 16
+@jl_gvar_idxs = hidden constant [0 x i32] zeroinitializer, align 16
+@subtarget_cloned_gv = hidden global i64* bitcast (i32 (i32)* @subtarget_cloned to i64*), align 16
+
+@subtarget_cloned_aliased = alias i32 (i32), i32 (i32)* @subtarget_cloned
+
+; CHECK: define{{.*}}@boring({{.*}}#[[BORING_DEFAULT_ATTRS:[0-9]+]]
+; CHECK-NEXT: ret i32 %0
 define noundef i32 @boring(i32 noundef %0) #0 {
     ret i32 %0
 }
 
-; CHECK-DAG: declare{{.*}}@declaration({{.*}}#[[DECLARATION_DEFAULT_ATTRS:[0-9]+]]
-; CHECK-DAG: declare{{.*}}@declaration.1({{.*}}#[[DECLARATION_CLONEALL_ATTRS:[0-9]+]]
+; CHECK: declare{{.*}}@declaration({{.*}}#[[DECLARATION_DEFAULT_ATTRS:[0-9]+]]
 declare i32 @declaration(i32 %0) #1
 
-; CHECK: }
+; CHECK: define{{.*}}@call_boring({{.*}}#[[BORING_DEFAULT_ATTRS]]
+; CHECK-NEXT: %2 = call noundef i32 @boring(i32 noundef %0)
+define noundef i32 @call_boring(i32 noundef %0) #0 {
+    %2 = call noundef i32 @boring(i32 noundef %0)
+    ret i32 %2
+}
+
+; CHECK: define{{.*}}@call_declaration({{.*}}#[[DECLARATION_DEFAULT_ATTRS]]
+; CHECK-NEXT: %2 = call noundef i32 @declaration(i32 noundef %0)
+define noundef i32 @call_declaration(i32 noundef %0) #1 {
+    %2 = call noundef i32 @declaration(i32 noundef %0)
+    ret i32 %2
+}
+
+; CHECK: define{{.*}}@subtarget_cloned({{.*}}#[[SUBTARGET_CLONED_DEFAULT_ATTRS:[0-9]+]]
+; CHECK-NEXT: ret i32 0
+define noundef i32 @subtarget_cloned(i32 noundef %0) #2 {
+    ret i32 0
+}
+
+; COM: should fixup this callsite since subtarget_cloned is cloned for a subtarget
+; CHECK: define{{.*}}@call_subtarget_cloned({{.*}}#[[CALL_SUBTARGET_CLONED_DEFAULT_ATTRS:[0-9]+]]
+; CHECK-NEXT: [[FUNC_PTR:%[0-9]+]] = load{{.*}}@subtarget_cloned.reloc_slot{{.*}}!tbaa ![[TBAA_CONST_METADATA:[0-9]+]], !invariant.load
+; CHECK-NEXT: call{{.*}}[[FUNC_PTR]]
+; CHECK: ret i32
+define noundef i32 @call_subtarget_cloned(i32 noundef %0) #3 {
+    %2 = call noundef i32 @subtarget_cloned(i32 noundef %0)
+    ret i32 %2
+}
+
+; CHECK: define{{.*}}@call_subtarget_cloned_but_not_cloned({{.*}}#[[BORING_DEFAULT_ATTRS]]
+; CHECK-NEXT: [[FUNC_PTR:%[0-9]+]] = load{{.*}}@subtarget_cloned.reloc_slot{{.*}}!tbaa ![[TBAA_CONST_METADATA]], !invariant.load
+; CHECK-NEXT: call{{.*}}[[FUNC_PTR]]
+; CHECK: ret i32
+define noundef i32 @call_subtarget_cloned_but_not_cloned(i32 noundef %0) #0 {
+    %2 = call noundef i32 @subtarget_cloned(i32 noundef %0)
+    ret i32 %2
+}
+
+; CHECK: define{{.*}}@boring.1({{.*}}#[[BORING_CLONEALL_ATTRS:[0-9]+]]
+; CHECK-NEXT: ret i32 %0
+
+; CHECK: declare{{.*}}@declaration.1({{.*}}#[[DECLARATION_CLONEALL_ATTRS:[0-9]+]]
+
+; COM: should not fixup this callsite since boring is not cloned for a subtarget
+; COM: also should call boring.1 instead of boring
+; CHECK: define{{.*}}@call_boring.1({{.*}}#[[BORING_CLONEALL_ATTRS]]
+; CHECK-NEXT: %2 = call noundef i32 @boring.1(i32 noundef %0)
+
+; CHECK: define{{.*}}@call_declaration.1({{.*}}#[[DECLARATION_CLONEALL_ATTRS]]
+; CHECK-NEXT: %2 = call noundef i32 @declaration.1(i32 noundef %0)
 
-; CHECK-DAG: attributes #[[BORING_DEFAULT_ATTRS:[0-9]+]]
+; CHECK: define{{.*}}@subtarget_cloned.1({{.*}}#[[SUBTARGET_CLONED_CLONEALL_ATTRS:[0-9]+]]
+; CHECK-NEXT: ret i32 0
+
+; CHECK: define{{.*}}@subtarget_cloned.2({{.*}}#[[SUBTARGET_CLONED_FASTMATH_ATTRS:[0-9]+]]
+; CHECK-NEXT: ret i32 0
+
+; COM: should *NOT* fixup this callsite since subtarget_cloned is not cloned for a subtarget of the cloneall
+; CHECK: define{{.*}}@call_subtarget_cloned.1({{.*}}#[[CALL_SUBTARGET_CLONED_CLONEALL_ATTRS:[0-9]+]]
+; CHECK-NEXT: %2 = call noundef i32 @subtarget_cloned.1(i32 noundef %0)
+
+; CHECK: define {{.*}}@call_subtarget_cloned.2({{.*}}#[[CALL_SUBTARGET_CLONED_FASTMATH_ATTRS:[0-9]+]]
+; CHECK-NEXT: %2 = call noundef i32 @subtarget_cloned.2(i32 noundef %0)
+
+; CHECK: define{{.*}}@call_subtarget_cloned_but_not_cloned.1({{.*}}#[[BORING_CLONEALL_ATTRS]]
+; CHECK-NEXT: %2 = call noundef i32 @subtarget_cloned.1(i32 noundef %0)
+
+; COM: should not have cloned for fastmath
+; CHECK-NOT: @subtarget_cloned_but_not_cloned.2
+
+; COM: check for alias being rewritten to a function trampoline
+; CHECK: define{{.*}}@subtarget_cloned_aliased{{.*}}#[[SUBTARGET_ALIASED_ATTRS:[0-9]+]]
+; CHECK-NOT: }
+; CHECK: [[FUNC_PTR:%[0-9]+]] = load{{.*}}@subtarget_cloned.reloc_slot{{.*}}!tbaa ![[TBAA_CONST_METADATA]], !invariant.load
+; CHECK-NEXT: call{{.*}}[[FUNC_PTR]]
+; CHECK: ret i32
+
+; CHECK: attributes #[[BORING_DEFAULT_ATTRS]]
 ; CHECK-SAME: {
 ; CHECK-DAG: "julia.mv.clones"="2"
 ; CHECK-DAG: "julia.mv.clone"="0"
 ; CHECK-DAG: "target-cpu"="cpubase"
 ; CHECK-DAG: "target-features"="nofeatures"
 ; CHECK-SAME: }
-; CHECK-DAG: attributes #[[BORING_CLONEALL_ATTRS:[0-9]+]]
+; CHECK: attributes #[[DECLARATION_DEFAULT_ATTRS]]
 ; CHECK-SAME: {
 ; CHECK-DAG: "julia.mv.clones"="2"
+; CHECK-DAG: "julia.mv.clone"="0"
+; CHECK-DAG: "target-cpu"="cpubase"
+; CHECK-DAG: "target-features"="nofeatures"
+; CHECK-SAME: }
+; CHECK: attributes #[[SUBTARGET_CLONED_DEFAULT_ATTRS]]
+; CHECK-SAME: {
+; CHECK-DAG: "julia.mv.clones"="6"
+; CHECK-DAG: "julia.mv.clone"="0"
+; CHECK-DAG: "target-cpu"="cpubase"
+; CHECK-DAG: "target-features"="nofeatures"
+; CHECK-DAG: "julia.mv.reloc"
+; CHECK-SAME: }
+; CHECK: attributes #[[CALL_SUBTARGET_CLONED_DEFAULT_ATTRS]]
+; CHECK-SAME: {
+; CHECK-DAG: "julia.mv.clones"="6"
+; CHECK-DAG: "julia.mv.clone"="0"
+; CHECK-DAG: "target-cpu"="cpubase"
+; CHECK-DAG: "target-features"="nofeatures"
+; CHECK-SAME: }
+; CHECK: attributes #[[BORING_CLONEALL_ATTRS]]
+; CHECK-SAME: {
+; CHECK-DAG: "julia.mv.clones"="2"
+; CHECK-DAG: "julia.mv.clone"="1"
+; CHECK-DAG: "target-cpu"="cpucloneall"
+; CHECK-DAG: "target-features"="cloneall"
+; CHECK-SAME: }
+; CHECK: attributes #[[DECLARATION_CLONEALL_ATTRS]]
+; CHECK-SAME: {
+; CHECK-DAG: "julia.mv.clones"="2"
+; CHECK-DAG: "julia.mv.clone"="1"
+; CHECK-DAG: "target-cpu"="cpucloneall"
+; CHECK-DAG: "target-features"="cloneall"
+; CHECK-SAME: }
+; CHECK: attributes #[[SUBTARGET_CLONED_CLONEALL_ATTRS]]
+; CHECK-SAME: {
+; CHECK-DAG: "julia.mv.clones"="6"
 ; CHECK-DAG: "julia.mv.clone"="1"
 ; CHECK-DAG: "target-cpu"="cpucloneall"
 ; CHECK-DAG: "target-features"="cloneall"
+; CHECK-DAG: "julia.mv.reloc"
+; CHECK-SAME: }
+; CHECK: attributes #[[SUBTARGET_CLONED_FASTMATH_ATTRS]]
+; CHECK-SAME: {
+; CHECK-DAG: "julia.mv.clones"="6"
+; CHECK-DAG: "julia.mv.clone"="2"
+; CHECK-DAG: "target-cpu"="cpufastmath"
+; CHECK-DAG: "target-features"="fastmathclone"
+; CHECK-DAG: "julia.mv.reloc"
+; CHECK-SAME: }
+; CHECK: attributes #[[CALL_SUBTARGET_CLONED_CLONEALL_ATTRS]]
+; CHECK-SAME: {
+; CHECK-DAG: "julia.mv.clones"="6"
+; CHECK-DAG: "julia.mv.clone"="1"
+; CHECK-DAG: "target-cpu"="cpucloneall"
+; CHECK-DAG: "target-features"="cloneall"
+; CHECK-SAME: }
+; CHECK: attributes #[[CALL_SUBTARGET_CLONED_FASTMATH_ATTRS]]
+; CHECK-SAME: {
+; CHECK-DAG: "julia.mv.clones"="6"
+; CHECK-DAG: "julia.mv.clone"="2"
+; CHECK-DAG: "target-cpu"="cpufastmath"
+; CHECK-DAG: "target-features"="fastmathclone"
+; CHECK-SAME: }
+; CHECK: attributes #[[SUBTARGET_ALIASED_ATTRS]]
+; CHECK-SAME: {
+; CHECK-SAME: "julia.mv.alias"
 ; CHECK-SAME: }
 attributes #0 = {"julia.mv.clones"="2"}
 attributes #1 = {"julia.mv.clones"="2" "test.unique"="1"}
+attributes #2 = {"julia.mv.clones"="6" "julia.mv.reloc"}
+attributes #3 = {"julia.mv.clones"="6"}
 
 !llvm.module.flags = !{!0, !1, !2}
 
@@ -47,4 +208,6 @@ attributes #1 = {"julia.mv.clones"="2" "test.unique"="1"}
 !5 = !{!"cpucloneall", !"cloneall", i32 0, i32 2}
 !6 = !{!"cpufastmath", !"fastmathclone", i32 0, i32 4}
 !7 = !{!"cpuloop", !"loopclone", i32 0, i32 8}
-!8 = !{!"cpusimd", !"simdclone", i32 0, i32 16}
\ No newline at end of file
+!8 = !{!"cpusimd", !"simdclone", i32 0, i32 16}
+; CHECK-DAG: ![[TBAA_CONST_METADATA]] = !{![[JTBAA_CONST_METADATA:[0-9]+]], ![[JTBAA_CONST_METADATA]]
+; CHECK-DAG: ![[JTBAA_CONST_METADATA]] = !{!"jtbaa_const"

From 556122393ab3762f6d19fc3f19c83739065b8c28 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Fri, 3 Feb 2023 03:42:59 -0500
Subject: [PATCH 29/34] Remove stray debug prints

---
 src/llvm-multiversioning.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index 6e9bbe85aa7f6..cbce76d702119 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -401,8 +401,6 @@ static inline std::vector<T*> consume_gv(Module &M, const char *name, bool allow
     // Strip them from the Module so that it's easier to handle the uses.
     GlobalVariable *gv = M.getGlobalVariable(name);
     assert(gv && gv->hasInitializer());
-    dbgs() << "Consume " << *gv << ":\n";
-    dbgs() << *gv->getType() << "\n";
     ArrayType *Ty = cast<ArrayType>(gv->getInitializer()->getType());
     unsigned nele = Ty->getArrayNumElements();
     std::vector<T*> res(nele);
@@ -421,7 +419,6 @@ static inline std::vector<T*> consume_gv(Module &M, const char *name, bool allow
                 nele--;
                 continue;
             }
-            dbgs() << *val << ": " << *val->getType() << "\n";
             res[i++] = cast<T>(val);
         }
         res.resize(nele);

From fef319cf11394caf3526460758d4e57196bd2322 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Mon, 13 Feb 2023 13:42:27 -0500
Subject: [PATCH 30/34] Track gvar count

---
 src/processor.cpp | 1 +
 src/processor.h   | 1 +
 src/staticdata.c  | 1 +
 3 files changed, 3 insertions(+)

diff --git a/src/processor.cpp b/src/processor.cpp
index 851cbec62560a..fec2b77102f55 100644
--- a/src/processor.cpp
+++ b/src/processor.cpp
@@ -773,6 +773,7 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback)
             offsets[i] = gvars[i] - (const char *)res.gvars_base;
         }
         res.gvars_offsets = offsets;
+        res.ngvars = gvars.size();
     }
 
     if (!clones.empty()) {
diff --git a/src/processor.h b/src/processor.h
index 73271290eff76..6445f221882ba 100644
--- a/src/processor.h
+++ b/src/processor.h
@@ -159,6 +159,7 @@ typedef struct {
     uint64_t base;
     uintptr_t *gvars_base;
     const int32_t *gvars_offsets;
+    uint32_t ngvars;
     jl_image_fptrs_t fptrs;
 } jl_image_t;
 
diff --git a/src/staticdata.c b/src/staticdata.c
index 94e93f4198b4c..d832cda995a94 100644
--- a/src/staticdata.c
+++ b/src/staticdata.c
@@ -1901,6 +1901,7 @@ static void jl_update_all_gvars(jl_serializer_state *s, jl_image_t *image, uint3
     reloc_t *gvars = (reloc_t*)&s->gvar_record->buf[0];
     int gvar_link_index = 0;
     int external_fns_link_index = 0;
+    assert(l == image->ngvars);
     for (i = 0; i < l; i++) {
         uintptr_t offset = gvars[i];
         uintptr_t v = 0;

From acc54d9e93567de0077ba248206f1a02f4d7b0cd Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Fri, 17 Feb 2023 15:40:21 -0500
Subject: [PATCH 31/34] Add more assertions

---
 src/aotcompile.cpp | 51 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 5e8618d637b3e..f74c9f92d3093 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -354,10 +354,14 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm
     // process the globals array, before jl_merge_module destroys them
     std::vector<std::string> gvars(params.globals.size());
     data->jl_value_to_llvm.resize(params.globals.size());
+    StringSet<> gvars_names;
+    DenseSet<GlobalValue *> gvars_set;
 
     size_t idx = 0;
     for (auto &global : params.globals) {
         gvars[idx] = global.second->getName().str();
+        assert(gvars_set.insert(global.second).second && "Duplicate gvar in params!");
+        assert(gvars_names.insert(gvars[idx]).second && "Duplicate gvar name in params!");
         data->jl_value_to_llvm[idx] = global.first;
         idx++;
     }
@@ -374,7 +378,10 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm
         GlobalVariable *F = extern_fn.second;
         size_t idx = gvars.size() - offset;
         assert(idx >= 0);
-        data->jl_external_to_llvm.at(idx) = this_code;
+        assert(idx < data->jl_external_to_llvm.size());
+        data->jl_external_to_llvm[idx] = this_code;
+        assert(gvars_set.insert(F).second && "Duplicate gvar in params!");
+        assert(gvars_names.insert(F->getName()).second && "Duplicate gvar name in params!");
         gvars.push_back(std::string(F->getName()));
     }
 
@@ -575,12 +582,18 @@ static void get_fvars_gvars(Module &M, DenseMap<GlobalValue *, unsigned> &fvars,
     auto gvars_init = cast<ConstantArray>(gvars_gv->getInitializer());
     for (unsigned i = 0; i < fvars_init->getNumOperands(); ++i) {
         auto gv = cast<GlobalValue>(fvars_init->getOperand(i)->stripPointerCasts());
+        assert(gv && gv->hasName() && "fvar must be a named global");
+        assert(!fvars.count(gv) && "Duplicate fvar");
         fvars[gv] = i;
     }
+    assert(fvars.size() == fvars_init->getNumOperands());
     for (unsigned i = 0; i < gvars_init->getNumOperands(); ++i) {
         auto gv = cast<GlobalValue>(gvars_init->getOperand(i)->stripPointerCasts());
+        assert(gv && gv->hasName() && "gvar must be a named global");
+        assert(!gvars.count(gv) && "Duplicate gvar");
         gvars[gv] = i;
     }
+    assert(gvars.size() == gvars_init->getNumOperands());
     fvars_gv->eraseFromParent();
     gvars_gv->eraseFromParent();
     fvars_idxs->eraseFromParent();
@@ -606,9 +619,11 @@ static size_t getFunctionWeight(const Function &F)
 }
 
 
-static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M) {
+static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M, size_t fvars_size, size_t gvars_size) {
     bool bad = false;
-#ifdef JL_DEBUG_BUILD
+#ifndef JL_NDEBUG
+    SmallVector<uint32_t> fvars(fvars_size);
+    SmallVector<uint32_t> gvars(gvars_size);
     StringMap<uint32_t> GVNames;
     for (uint32_t i = 0; i < partitions.size(); i++) {
         for (auto &name : partitions[i].globals) {
@@ -618,7 +633,21 @@ static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partiti
             }
             GVNames[name.getKey()] = i;
         }
-        dbgs() << "partition: " << i << " fvars: " << partitions[i].fvars.size() << " gvars: " << partitions[i].gvars.size() << "\n";
+        for (auto &fvar : partitions[i].fvars) {
+            if (fvars[fvar.second] != 0) {
+                bad = true;
+                dbgs() << "Duplicate fvar " << fvar.first() << " in partitions " << i << " and " << fvars[fvar.second] - 1 << "\n";
+            }
+            fvars[fvar.second] = i+1;
+        }
+        for (auto &gvar : partitions[i].gvars) {
+            if (gvars[gvar.second] != 0) {
+                bad = true;
+                dbgs() << "Duplicate gvar " << gvar.first() << " in partitions " << i << " and " << gvars[gvar.second] - 1 << "\n";
+            }
+            gvars[gvar.second] = i+1;
+        }
+        // dbgs() << "partition: " << i << " fvars: " << partitions[i].fvars.size() << " gvars: " << partitions[i].gvars.size() << "\n";
     }
     for (auto &GV : M.globals()) {
         if (GV.isDeclaration()) {
@@ -637,6 +666,18 @@ static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partiti
             }
         }
     }
+    for (uint32_t i = 0; i < fvars_size; i++) {
+        if (fvars[i] == 0) {
+            bad = true;
+            dbgs() << "fvar " << i << " not in any partition\n";
+        }
+    }
+    for (uint32_t i = 0; i < gvars_size; i++) {
+        if (gvars[i] == 0) {
+            bad = true;
+            dbgs() << "gvar " << i << " not in any partition\n";
+        }
+    }
 #endif
     return !bad;
 }
@@ -766,7 +807,7 @@ static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) {
         }
     }
 
-    bool verified = verify_partitioning(partitions, M);
+    bool verified = verify_partitioning(partitions, M, fvars.size(), gvars.size());
     assert(verified && "Partitioning failed to partition globals correctly");
     (void) verified;
 

From 27f1ccdf55078670d4f4f0a7407e53df580134d5 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Sun, 5 Mar 2023 23:14:01 -0500
Subject: [PATCH 32/34] Move dbgs under LLVM_DEBUG

---
 src/aotcompile.cpp | 152 ++++++++++++++++++++++++++++-----------------
 1 file changed, 95 insertions(+), 57 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index f74c9f92d3093..d512ad586a680 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -600,24 +600,72 @@ static void get_fvars_gvars(Module &M, DenseMap<GlobalValue *, unsigned> &fvars,
     gvars_idxs->eraseFromParent();
 }
 
-static size_t getFunctionWeight(const Function &F)
+struct FunctionInfo {
+    size_t weight;
+    size_t bbs;
+    size_t insts;
+    size_t clones;
+};
+
+static FunctionInfo getFunctionWeight(const Function &F)
 {
-    size_t weight = 1;
+    FunctionInfo info;
+    info.weight = 1;
+    info.bbs = F.size();
+    info.insts = 0;
+    info.clones = 1;
     for (const BasicBlock &BB : F) {
-        weight += BB.size();
+        info.insts += BB.size();
     }
-    // more basic blocks = more complex than just sum of insts,
-    // add some weight to it
-    weight += F.size();
     if (F.hasFnAttribute("julia.mv.clones")) {
         auto val = F.getFnAttribute("julia.mv.clones").getValueAsString();
         // base16, so must be at most 4 * length bits long
         // popcount gives number of clones
-        weight *= APInt(val.size() * 4, val, 16).countPopulation() + 1;
+        info.clones = APInt(val.size() * 4, val, 16).countPopulation() + 1;
     }
-    return weight;
+    info.weight += info.insts;
+    // more basic blocks = more complex than just sum of insts,
+    // add some weight to it
+    info.weight += info.bbs;
+    info.weight *= info.clones;
+    return info;
 }
 
+struct ModuleInfo {
+    size_t globals;
+    size_t funcs;
+    size_t bbs;
+    size_t insts;
+    size_t clones;
+    size_t weight;
+};
+
+ModuleInfo compute_module_info(Module &M) {
+    ModuleInfo info;
+    info.globals = 0;
+    info.funcs = 0;
+    info.bbs = 0;
+    info.insts = 0;
+    info.clones = 0;
+    info.weight = 0;
+    for (auto &G : M.global_values()) {
+        if (G.isDeclaration()) {
+            continue;
+        }
+        info.globals++;
+        if (auto F = dyn_cast<Function>(&G)) {
+            info.funcs++;
+            auto func_info = getFunctionWeight(*F);
+            info.bbs += func_info.bbs;
+            info.insts += func_info.insts;
+            info.clones += func_info.clones;
+            info.weight += func_info.weight;
+        } else {
+            info.weight += 1;
+        }
+    }
+    return info;
+}
 
 static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M, size_t fvars_size, size_t gvars_size) {
     bool bad = false;
@@ -647,7 +695,6 @@ static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partiti
             }
             gvars[gvar.second] = i+1;
         }
-        // dbgs() << "partition: " << i << " fvars: " << partitions[i].fvars.size() << " gvars: " << partitions[i].gvars.size() << "\n";
     }
     for (auto &GV : M.globals()) {
         if (GV.isDeclaration()) {
@@ -736,7 +783,7 @@ static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) {
         if (G.isDeclaration())
             continue;
         if (isa<Function>(G)) {
-            partitioner.make(&G, getFunctionWeight(cast<Function>(G)));
+            partitioner.make(&G, getFunctionWeight(cast<Function>(G)).weight);
         } else {
             partitioner.make(&G, 1);
         }
@@ -1141,7 +1188,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
                 std::vector<NewArchiveMember> &unopt, std::vector<NewArchiveMember> &opt,
                 std::vector<NewArchiveMember> &obj, std::vector<NewArchiveMember> &asm_,
                 bool unopt_out, bool opt_out, bool obj_out, bool asm_out,
-                unsigned threads) {
+                unsigned threads, ModuleInfo module_info) {
     unsigned outcount = unopt_out + opt_out + obj_out + asm_out;
     assert(outcount);
     outputs.resize(outputs.size() + outcount * threads);
@@ -1235,8 +1282,6 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
             auto M = cantFail(getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx), "Error loading module");
             timers[i].deserialize.stopTimer();
 
-            // dbgs() << "Starting shard " << i << " with weight=" << partitions[i].weight << "\n";
-
             timers[i].materialize.startTimer();
             materializePreserved(*M, partitions[i]);
             timers[i].materialize.stopTimer();
@@ -1271,54 +1316,37 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
         for (auto &t : timers) {
             t.print(dbgs(), true);
         }
+        dbgs() << "Partition weights: [";
+        bool comma = false;
+        for (auto &p : partitions) {
+            if (comma)
+                dbgs() << ", ";
+            else
+                comma = true;
+            dbgs() << p.weight;
+        }
+        dbgs() << "]\n";
     }
 }
 
-unsigned compute_image_thread_count(Module &M) {
+static unsigned compute_image_thread_count(const ModuleInfo &info) {
     // 32-bit systems are very memory-constrained
 #ifdef _P32
-    // dbgs() << "Threads: 1\n";
+    LLVM_DEBUG(dbgs() << "32-bit systems are restricted to a single thread\n");
     return 1;
 #endif
-    size_t weight = 0;
-    size_t globals = 0;
-    for (auto &GV : M.global_values()) {
-        if (GV.isDeclaration())
-            continue;
-        globals++;
-        if (isa<Function>(GV)) {
-            weight += getFunctionWeight(cast<Function>(GV));
-        } else {
-            weight += 1;
-        }
-    }
-    // dbgs() << "Module weight: " << weight << "\n";
-    if (weight < 1000) {
-        // dbgs() << "Low module complexity bailout\n";
-        // dbgs() << "Threads: 1\n";
+    // This is not overridable because empty modules do occasionally appear, but they'll be very small and thus exit early to
+    // known easy behavior. Plus they really don't warrant multiple threads
+    if (info.weight < 1000) {
+        LLVM_DEBUG(dbgs() << "Small module, using a single thread\n");
         return 1;
     }
 
-    unsigned threads = std::max(llvm::hardware_concurrency().compute_thread_count() / 2, 1u);
-
-    // memory limit check
-    // many threads use a lot of memory, so limit on constrained memory systems
-    size_t available = uv_get_available_memory();
-    // crude estimate, available / (weight * fudge factor) = max threads
-    size_t fudge = 10;
-    unsigned max_threads = std::max(available / (weight * fudge), (size_t)1);
-    // dbgs() << "Available memory: " << available << " bytes\n";
-    // dbgs() << "Max threads: " << max_threads << "\n";
-    // dbgs() << "Temporarily disabling memory limiting threads\n";
-    //TODO reenable
-    // if (max_threads < threads) {
-    //     dbgs() << "Memory limiting threads to " << max_threads << "\n";
-    //     threads = max_threads;
-    // }
-
-    max_threads = globals / 100;
+    unsigned threads = std::max(jl_cpu_threads() / 2, 1);
+
+    auto max_threads = info.globals / 100;
     if (max_threads < threads) {
-        // dbgs() << "Low global count limiting threads to " << max_threads << " (" << globals << "globals)\n";
+        LLVM_DEBUG(dbgs() << "Low global count limiting threads to " << max_threads << " (" << info.globals << "globals)\n");
         threads = max_threads;
     }
 
@@ -1331,7 +1359,7 @@ unsigned compute_image_thread_count(Module &M) {
         if (*endptr || !requested) {
             jl_safe_printf("WARNING: invalid value '%s' for JULIA_IMAGE_THREADS\n", env_threads);
         } else {
-            // dbgs() << "Overriding threads to " << requested << " due to JULIA_IMAGE_THREADS\n";
+            LLVM_DEBUG(dbgs() << "Overriding threads to " << requested << " due to JULIA_IMAGE_THREADS\n");
             threads = requested;
             env_threads_set = true;
         }
@@ -1345,7 +1373,7 @@ unsigned compute_image_thread_count(Module &M) {
             if (*endptr || !requested) {
                 jl_safe_printf("WARNING: invalid value '%s' for JULIA_CPU_THREADS\n", fallbackenv);
             } else if (requested < threads) {
-                // dbgs() << "Overriding threads to " << requested << " due to JULIA_CPU_THREADS\n";
+                LLVM_DEBUG(dbgs() << "Overriding threads to " << requested << " due to JULIA_CPU_THREADS\n");
                 threads = requested;
             }
         }
@@ -1353,8 +1381,6 @@ unsigned compute_image_thread_count(Module &M) {
 
     threads = std::max(threads, 1u);
 
-    // dbgs() << "Threads: " << threads << "\n";
-
     return threads;
 }
 
@@ -1369,7 +1395,7 @@ void jl_dump_native_impl(void *native_code,
     JL_TIMING(NATIVE_DUMP);
     jl_native_code_desc_t *data = (jl_native_code_desc_t*)native_code;
     if (!bc_fname && !unopt_bc_fname && !obj_fname && !asm_fname) {
-        // dbgs() << "No output requested, skipping native code dump?\n";
+        LLVM_DEBUG(dbgs() << "No output requested, skipping native code dump?\n");
         delete data;
         return;
     }
@@ -1433,6 +1459,17 @@ void jl_dump_native_impl(void *native_code,
     unsigned nfvars = 0;
     unsigned ngvars = 0;
 
+    ModuleInfo module_info = compute_module_info(*dataM);
+    LLVM_DEBUG(dbgs()
+        << "Dumping module with stats:\n"
+        << "    globals: " << module_info.globals << "\n"
+        << "    functions: " << module_info.funcs << "\n"
+        << "    basic blocks: " << module_info.bbs << "\n"
+        << "    instructions: " << module_info.insts << "\n"
+        << "    clones: " << module_info.clones << "\n"
+        << "    weight: " << module_info.weight << "\n"
+    );
+
     // add metadata information
     if (imaging_mode) {
         multiversioning_preannotate(*dataM);
@@ -1446,7 +1483,8 @@ void jl_dump_native_impl(void *native_code,
                 }
             }
         }
-        threads = compute_image_thread_count(*dataM);
+        threads = compute_image_thread_count(module_info);
+        LLVM_DEBUG(dbgs() << "Using " << threads << " threads to emit aot image\n");
         nfvars = data->jl_sysimg_fvars.size();
         ngvars = data->jl_sysimg_gvars.size();
         emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_gvars", T_psize);
@@ -1484,7 +1522,7 @@ void jl_dump_native_impl(void *native_code,
             M, *SourceTM, outputs, names,
             unopt_bc_Archive, bc_Archive, obj_Archive, asm_Archive,
             !!unopt_bc_fname, !!bc_fname, !!obj_fname, !!asm_fname,
-            threads
+            threads, module_info
     ); };
 
     std::array<StringRef, 4> text_names = {

From 6b8ec27dbc582ba67f717e400b1bcff8f886c6d3 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Sun, 5 Mar 2023 23:57:51 -0500
Subject: [PATCH 33/34] Add some documentation

---
 src/aotcompile.cpp           |  52 +++++++++--
 src/llvm-multiversioning.cpp |   2 +
 src/processor.h              | 164 +++++++++++++++++++----------------
 3 files changed, 133 insertions(+), 85 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index d512ad586a680..0337602cde27e 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -505,6 +505,7 @@ static void injectCRTAlias(Module &M, StringRef name, StringRef alias, FunctionT
 
 void multiversioning_preannotate(Module &M);
 
+// See src/processor.h for documentation about this table. Corresponds to jl_image_shard_t.
 static GlobalVariable *emit_shard_table(Module &M, Type *T_size, Type *T_psize, unsigned threads) {
     SmallVector<Constant *, 0> tables(sizeof(jl_image_shard_t) / sizeof(void *) * threads);
     for (unsigned i = 0; i < threads; i++) {
@@ -533,6 +534,7 @@ static GlobalVariable *emit_shard_table(Module &M, Type *T_size, Type *T_psize,
     return tables_gv;
 }
 
+// See src/processor.h for documentation about this table. Corresponds to jl_image_ptls_t.
 static GlobalVariable *emit_ptls_table(Module &M, Type *T_size, Type *T_psize) {
     std::array<Constant *, 3> ptls_table{
         new GlobalVariable(M, T_size, false, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), "jl_pgcstack_func_slot"),
@@ -548,6 +550,7 @@ static GlobalVariable *emit_ptls_table(Module &M, Type *T_size, Type *T_psize) {
     return ptls_table_gv;
 }
 
+// See src/processor.h for documentation about this table. Corresponds to jl_image_header_t.
 static GlobalVariable *emit_image_header(Module &M, unsigned threads, unsigned nfvars, unsigned ngvars) {
     constexpr uint32_t version = 1;
     std::array<uint32_t, 4> header{
@@ -562,13 +565,7 @@ static GlobalVariable *emit_image_header(Module &M, unsigned threads, unsigned n
     return header_gv;
 }
 
-struct Partition {
-    StringSet<> globals;
-    StringMap<unsigned> fvars;
-    StringMap<unsigned> gvars;
-    size_t weight;
-};
-
+// Grab fvars and gvars data from the module
 static void get_fvars_gvars(Module &M, DenseMap<GlobalValue *, unsigned> &fvars, DenseMap<GlobalValue *, unsigned> &gvars) {
     auto fvars_gv = M.getGlobalVariable("jl_fvars");
     auto gvars_gv = M.getGlobalVariable("jl_gvars");
@@ -600,6 +597,11 @@ static void get_fvars_gvars(Module &M, DenseMap<GlobalValue *, unsigned> &fvars,
     gvars_idxs->eraseFromParent();
 }
 
+// Weight computation
+// It is important for multithreaded image building to be able to split work up
+// among the threads equally. The weight calculated here is an estimation of
+// how expensive a particular function is going to be to compile. 
+
 struct FunctionInfo {
     size_t weight;
     size_t bbs;
@@ -667,6 +669,13 @@ ModuleInfo compute_module_info(Module &M) {
     return info;
 }
 
+struct Partition {
+    StringSet<> globals;
+    StringMap<unsigned> fvars;
+    StringMap<unsigned> gvars;
+    size_t weight;
+};
+
 static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partitions, const Module &M, size_t fvars_size, size_t gvars_size) {
     bool bad = false;
 #ifndef JL_NDEBUG
@@ -729,7 +738,7 @@ static inline bool verify_partitioning(const SmallVectorImpl<Partition> &partiti
     return !bad;
 }
 
-// Chop a module up as equally as possible into threads partitions
+// Chop a module up as equally as possible by weight into threads partitions
 static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) {
     //Start by stripping fvars and gvars, which helpfully removes their uses as well
     DenseMap<GlobalValue *, unsigned> fvars, gvars;
@@ -926,6 +935,7 @@ struct ShardTimers {
     }
 };
 
+// Perform the actual optimization and emission of the output files
 static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *outputs, ArrayRef<StringRef> names,
                     NewArchiveMember *unopt, NewArchiveMember *opt, NewArchiveMember *obj, NewArchiveMember *asm_,
                     ShardTimers &timers, unsigned shardidx) {
@@ -1048,6 +1058,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out
     }
 }
 
+// serialize module to bitcode
 static auto serializeModule(const Module &M) {
     assert(!verifyModule(M, &errs()) && "Serializing invalid module!");
     SmallVector<char, 0> ClonedModuleBuffer;
@@ -1058,6 +1069,12 @@ static auto serializeModule(const Module &M) {
     return ClonedModuleBuffer;
 }
 
+// Modules are deserialized lazily by LLVM, to avoid deserializing
+// unnecessary functions. We take advantage of this by serializing
+// the entire module once, then deleting the bodies of functions
+// that are not in this partition. Once unnecessary functions are
+// deleted, we then materialize the entire module to make use-lists
+// consistent.
 static void materializePreserved(Module &M, Partition &partition) {
     DenseSet<GlobalValue *> Preserve;
     for (auto &GV : M.global_values()) {
@@ -1083,6 +1100,12 @@ static void materializePreserved(Module &M, Partition &partition) {
             }
         }
     }
+    // Global aliases are a pain to deal with. It is illegal to have an alias to a declaration,
+    // so we need to replace them with either a function or a global variable declaration. However,
+    // we can't just delete the alias, because that would break the users of the alias. Therefore,
+    // we do a dance where we point each global alias to a dummy function or global variable,
+    // then materialize the module to access use-lists, then replace all the uses, and finally commit
+    // to deleting the old alias.
     SmallVector<std::pair<GlobalAlias *, GlobalValue *>> DeletedAliases;
     for (auto &GA : M.aliases()) {
         if (!GA.isDeclaration()) {
@@ -1116,6 +1139,7 @@ static void materializePreserved(Module &M, Partition &partition) {
     }
 }
 
+// Reconstruct jl_fvars, jl_gvars, jl_fvars_idxs, and jl_gvars_idxs from the partition
 static void construct_vars(Module &M, Partition &partition) {
     std::vector<std::pair<uint32_t, GlobalValue *>> fvar_pairs;
     fvar_pairs.reserve(partition.fvars.size());
@@ -1168,6 +1192,8 @@ static void construct_vars(Module &M, Partition &partition) {
     gidxs_var->setVisibility(GlobalValue::HiddenVisibility);
 }
 
+// Materialization will leave many unused declarations, which multiversioning would otherwise clone.
+// This function removes them to avoid unnecessary cloning of declarations. 
 static void dropUnusedDeclarations(Module &M) {
     SmallVector<GlobalValue *> unused;
     for (auto &G : M.global_values()) {
@@ -1184,6 +1210,8 @@ static void dropUnusedDeclarations(Module &M) {
         G->eraseFromParent();
 }
 
+// Entrypoint to optionally-multithreaded image compilation. This handles global coordination of the threading,
+// as well as partitioning, serialization, and deserialization. 
 static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &outputs, ArrayRef<StringRef> names,
                 std::vector<NewArchiveMember> &unopt, std::vector<NewArchiveMember> &opt,
                 std::vector<NewArchiveMember> &obj, std::vector<NewArchiveMember> &asm_,
@@ -1198,6 +1226,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
     asm_.resize(asm_.size() + asm_out * threads);
     auto name = names[2];
     name.consume_back(".o");
+    // Timers for timing purposes
     TimerGroup timer_group("add_output", ("Time to optimize and emit LLVM module " + name).str());
     SmallVector<ShardTimers, 1> timers(threads);
     for (unsigned i = 0; i < threads; ++i) {
@@ -1232,6 +1261,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
                 errs() << "WARNING: Invalid value for JULIA_IMAGE_TIMINGS: " << env << "\n";
         }
     }
+    // Single-threaded case
     if (threads == 1) {
         output_timer.startTimer();
         add_output_impl(M, TM, outputs.data() + outputs.size() - outcount, names,
@@ -1255,6 +1285,8 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
 
     partition_timer.startTimer();
     uint64_t counter = 0;
+    // Partitioning requires all globals to have names.
+    // We use a prefix to avoid name conflicts with user code.
     for (auto &G : M.global_values()) {
         if (!G.isDeclaration() && !G.hasName()) {
             G.setName("jl_ext_" + Twine(counter++));
@@ -1262,6 +1294,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
     }
     auto partitions = partitionModule(M, threads);
     partition_timer.stopTimer();
+
     serialize_timer.startTimer();
     auto serialized = serializeModule(M);
     serialize_timer.stopTimer();
@@ -1274,10 +1307,12 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
     auto objstart = obj_out ? obj.data() + obj.size() - threads : nullptr;
     auto asmstart = asm_out ? asm_.data() + asm_.size() - threads : nullptr;
 
+    // Start all of the worker threads
     std::vector<std::thread> workers(threads);
     for (unsigned i = 0; i < threads; i++) {
         workers[i] = std::thread([&, i](){
             LLVMContext ctx;
+            // Lazily deserialize the entire module
             timers[i].deserialize.startTimer();
             auto M = cantFail(getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx), "Error loading module");
             timers[i].deserialize.stopTimer();
@@ -1304,6 +1339,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
         });
     }
 
+    // Wait for all of the worker threads to finish
     for (auto &w : workers)
         w.join();
 
diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index cbce76d702119..0474cb0c7add7 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -3,6 +3,8 @@
 // Function multi-versioning
 // LLVM pass to clone function for different archs
 
+//see src/processor.h for documentation of the relevant globals inserted here
+
 #include "llvm-version.h"
 #include "passes.h"
 
diff --git a/src/processor.h b/src/processor.h
index 6445f221882ba..497a93d40e11f 100644
--- a/src/processor.h
+++ b/src/processor.h
@@ -14,82 +14,9 @@
 extern "C" {
 #endif
 
-/**
- * Related sysimg exported symbols
- *
- * In the following text, function refers to an abstract entity.
- * It corresponds to a `Function` that we emit in the codegen, and there might be multiple copies
- * of it in the system image. Only one of those copies will be used in a given session.
- * Function pointers refer to a real piece of code in the system image.
- * Each function might have multiple function pointers in the system image
- * and each function pointer will correspond to only one function.
- *
- * # Global function and base pointers
- * `jl_sysimg_gvars_base`:
- *     The address of this symbol is the base data pointer
- *     (all other data pointers are stored as offsets to this address)
- * `jl_sysimg_fvars_base`:
- *     The address of this symbol is the base function pointer
- *     (all other function pointers are stored as offsets to this address)
- * `jl_sysimg_fvars_offsets`: [static data]
- *     The array of function pointer offsets (`int32_t`) from the base pointer.
- *     This includes all julia functions in sysimg as well as all other functions that are cloned.
- *     The default function pointer is used if the function is cloned.
- *     The first element is the size of the array, which should **NOT** be used as the number
- *     of julia functions in the sysimg.
- *     Each entry in this array uniquely identifies a function we are interested in
- *     (the function may have multiple function pointers corresponding to different versions).
- *     In other sysimg info, all references to functions are stored as their `uint32_t` index
- *     in this array.
- *
- * # Target data and dispatch slots (Only needed by runtime during loading)
- * `jl_dispatch_target_ids`: [static data] serialize target data.
- *     This contains the number of targets which is needed to decode `jl_dispatch_fvars_idxs`
- *     in addition to the name and feature set of each target.
- * `jl_dispatch_reloc_slots`: [static data] location and index of relocation slots.
- *     Stored as pairs of function indices and `int32_t` offsets from `jl_sysimg_gvars_base`.
- *     The first element is an `uint32_t` giving the number of relocations.
- *     This is needed for functions whose address is used in a way that requires dispatch.
- *     We currently only support one type of relocation (i.e. absolute pointer) which is enough
- *     for all use in functions as well as GOT slot (for "PLT" callback).
- *     Note that not all functions being cloned are assigned a slot.
- *     This array is sorted by the function indices.
- *     There can be more than one slot per-function,
- *     i.e. there can be duplicated function indices.
- *
- * # Target functions
- * `jl_dispatch_fvars_idxs`: [static data] Target-specific function indices.
- *     For each target, this includes a tagged `uint32_t` length, an optional `uint32_t` index
- *     of the base target followed by an array of tagged function indices.
- *     The base target index is required to be smaller than the index of the current target
- *     and must be the default (`0`) or a `clone_all` target.
- *     If it's not `0`, the function pointer array for the `clone_all` target will be used as
- *     the base function pointer offsets instead.
- *     The tag bits for both the length and the indices are the top bit.
- *     A tagged length indicates that all of the functions are cloned and the indices follows
- *     are the ones that requires relocation. The base target index is omitted in this case.
- *     Otherwise, the length is the total number of functions that we are interested in
- *     for this target, which includes all cloned julia functions and
- *     all other cloned functions that requires relocation.
- *     A tagged index means that the function pointer should be filled into the GOT slots
- *     identified by `jl_dispatch_reloc_slots`. There could be more than one slot per function.
- *     (Note that a tagged index could corresponds to a functions pointer that's the same as
- *     the base one since this is the only way we currently represent relocations.)
- *     A tagged length implicitly tags all the indices and the indices will not have the tag bit
- *     set. The lengths in this variable is needed to decode `jl_dispatch_fvars_offsets`.
- * `jl_dispatch_fvars_offsets`: [static data] Target-specific function pointer offsets.
- *     This contains all the cloned functions that we are interested in and it needs to be decoded
- *     and used along with `jl_dispatch_fvars_idxs`.
- *     For the default target, there's no entries in this variable, if there's any relocations
- *     needed for the default target, the function pointers are taken from the global offset
- *     arrays directly.
- *     For a `clone_all` target (i.e. with the length in `jl_dispatch_fvars_idxs` tagged), this
- *     variable contains an offset array of the same length as the global one. Only the indices
- *     appearing in `jl_dispatch_fvars_idxs` need relocation and the dispatch code should return
- *     this array as the original/base function offsets.
- *     For other targets, this variable contains an offset array with the length defined in
- *     `jl_dispatch_fvars_idxs`. Tagged indices need relocations.
- */
+// Image metadata
+// Every image exports a `jl_image_pointers_t` as a global symbol `jl_image_pointers`.
+// This symbol acts as a root for all other code-related symbols in the image.
 
 enum {
     JL_TARGET_VEC_CALL = 1 << 0,
@@ -163,35 +90,118 @@ typedef struct {
     jl_image_fptrs_t fptrs;
 } jl_image_t;
 
+// The header for each image
+// Details important counts about the image
 typedef struct {
+    // The version of the image format
+    // Most up-to-date version is 1
     uint32_t version;
+    // The number of shards in this image
     uint32_t nshards;
+    // The total number of fvars in this image among all shards
     uint32_t nfvars;
+    // The total number of gvars in this image among all shards
     uint32_t ngvars;
 } jl_image_header_t;
 
+// Per-shard data for image shards. Each image contains header->nshards of these.
 typedef struct {
+    
+    // This is the base function pointer
+    // (all other function pointers are stored as offsets to this address)
     const char *fvar_base;
+    
+    // The array of function pointer offsets (`int32_t`) from the base pointer.
+    // This includes all julia functions in sysimg as well as all other functions that are cloned.
+    // The default function pointer is used if the function is cloned.
+    // The first element is the size of the array, which should **NOT** be used as the number
+    // of julia functions in the sysimg.
+    // Each entry in this array uniquely identifies a function we are interested in
+    // (the function may have multiple function pointers corresponding to different versions).
+    // In other sysimg info, all references to functions are stored as their `uint32_t` index
+    // in this array.
     const int32_t *fvar_offsets;
+    // This is the mapping of shard function index -> global function index
+    // staticdata.c relies on the same order of functions in the global function array being
+    // the same as what it saw when serializing the global function array. However, partitioning
+    // into multiple shards will cause functions to be reordered. This array is used to map
+    // back to the original function array for loading. 
     const uint32_t *fvar_idxs;
+    // This is the base data pointer
+    // (all other data pointers in this shard are stored as offsets to this address)
     uintptr_t *gvar_base;
+    // This array of global variable offsets (`int32_t`) from the base pointer.
+    // Similar to fvar_offsets, but for gvars
     const int32_t *gvar_offsets;
+    // This is the mapping of shard global variable index -> global global variable index
+    // Similar to fvar_idxs, but for gvars
     const uint32_t *gvar_idxs;
+
+    // location and index of relocation slots.
+    // Stored as pairs of function indices and `int32_t` offsets from `jl_sysimg_gvars_base`.
+    // The first element is an `uint32_t` giving the number of relocations.
+    // This is needed for functions whose address is used in a way that requires dispatch.
+    // We currently only support one type of relocation (i.e. absolute pointer) which is enough
+    // for all use in functions as well as GOT slot (for "PLT" callback).
+    // Note that not all functions being cloned are assigned a slot.
+    // This array is sorted by the function indices.
+    // There can be more than one slot per-function,
+    // i.e. there can be duplicated function indices.
     const int32_t *clone_slots;
+    //  Target-specific function pointer offsets.
+    //  This contains all the cloned functions that we are interested in and it needs to be decoded
+    //  and used along with `clone_idxs`.
+    //  For the default target, there are no entries in this variable; if there are any relocations
+    //  needed for the default target, the function pointers are taken from the global offset
+    //  arrays directly.
+    //  For a `clone_all` target (i.e. with the length in `clone_idxs` tagged), this
+    //  variable contains an offset array of the same length as the global one. Only the indices
+    //  appearing in `clone_idxs` need relocation and the dispatch code should return
+    //  this array as the original/base function offsets.
+    //  For other targets, this variable contains an offset array with the length defined in
+    //  `clone_idxs`. Tagged indices need relocations.
     const int32_t *clone_offsets;
+    //  Target-specific function indices.
+    //  For each target, this includes a tagged `uint32_t` length, an optional `uint32_t` index
+    //  of the base target followed by an array of tagged function indices.
+    //  The base target index is required to be smaller than the index of the current target
+    //  and must be the default (`0`) or a `clone_all` target.
+    //  If it's not `0`, the function pointer array for the `clone_all` target will be used as
+    //  the base function pointer offsets instead.
+    //  The tag bits for both the length and the indices are the top bit.
+    //  A tagged length indicates that all of the functions are cloned and the indices that follow
+    //  are the ones that require relocation. The base target index is omitted in this case.
+    //  Otherwise, the length is the total number of functions that we are interested in
+    //  for this target, which includes all cloned julia functions and
+    //  all other cloned functions that require relocation.
+    //  A tagged index means that the function pointer should be filled into the GOT slots
+    //  identified by `clone_slots`. There could be more than one slot per function.
+    //  (Note that a tagged index could correspond to a function pointer that's the same as
+    //  the base one since this is the only way we currently represent relocations.)
+    //  A tagged length implicitly tags all the indices and the indices will not have the tag bit
+    //  set. The lengths in this variable are needed to decode `clone_offsets`.
     const uint32_t *clone_idxs;
 } jl_image_shard_t;
 
+// The TLS data for each image
 typedef struct {
     void *pgcstack_func_slot;
     void *pgcstack_key_slot;
     size_t *tls_offset;
 } jl_image_ptls_t;
 
+//The root struct for images, points to all the other globals
 typedef struct {
+    // The image header, contains numerical global data
     const jl_image_header_t *header;
-    const jl_image_shard_t *shards; // nshards-length array
+    // The shard table, contains per-shard data
+    const jl_image_shard_t *shards; // points to header->nshards length array
+    // The TLS data
     const jl_image_ptls_t *ptls;
+
+    //  serialized target data
+    //  This contains the number of targets
+    //  in addition to the name and feature set of each target.
     const void *target_data;
 } jl_image_pointers_t;
 

From 5108b4036d7610fff1ef6e56c80f760a05d2c4d0 Mon Sep 17 00:00:00 2001
From: Prem Chintalapudi <prem.chintalapudi@gmail.com>
Date: Mon, 6 Mar 2023 00:38:41 -0500
Subject: [PATCH 34/34] Update documentation

---
 doc/src/devdocs/sysimg.md               |  3 +++
 doc/src/manual/environment-variables.md | 15 ++++++++++++++-
 src/aotcompile.cpp                      |  6 +++---
 src/processor.h                         |  6 ++----
 4 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/doc/src/devdocs/sysimg.md b/doc/src/devdocs/sysimg.md
index 3058834e927d0..6706e30ce97b1 100644
--- a/doc/src/devdocs/sysimg.md
+++ b/doc/src/devdocs/sysimg.md
@@ -8,6 +8,9 @@ as many platforms as possible, so as to give vastly improved startup times.  On
 not ship with a precompiled system image file, one can be generated from the source files shipped
 in Julia's `DATAROOTDIR/julia/base` folder.
 
+Julia will by default generate its system image on half of the available system threads. This
+may be controlled by the [`JULIA_IMAGE_THREADS`](@ref env-image-threads) environment variable.
+
 This operation is useful for multiple reasons.  A user may:
 
   * Build a precompiled shared library system image on a platform that did not ship with one, thereby
diff --git a/doc/src/manual/environment-variables.md b/doc/src/manual/environment-variables.md
index a199112e934dd..a5f4efc28e965 100644
--- a/doc/src/manual/environment-variables.md
+++ b/doc/src/manual/environment-variables.md
@@ -277,7 +277,7 @@ To use Visual Studio Code on Windows, set `$JULIA_EDITOR` to `code.cmd`.
 
 ## Parallelization
 
-### `JULIA_CPU_THREADS`
+### [`JULIA_CPU_THREADS`](@id env-cpu-threads)
 
 Overrides the global variable [`Base.Sys.CPU_THREADS`](@ref), the number of
 logical CPU cores available.
@@ -316,6 +316,19 @@ then spinning threads never sleep. Otherwise, `$JULIA_THREAD_SLEEP_THRESHOLD` is
 interpreted as an unsigned 64-bit integer (`uint64_t`) and gives, in
 nanoseconds, the amount of time after which spinning threads should sleep.
 
+### [`JULIA_IMAGE_THREADS`](@id env-image-threads)
+
+An unsigned 32-bit integer that sets the number of threads used by image
+compilation in this Julia process. The value of this variable may be
+ignored if the module is a small module. If left unspecified, the smaller
+of the value of [`JULIA_CPU_THREADS`](@ref env-cpu-threads) or half the
+number of logical CPU cores is used in its place.
+
+### `JULIA_IMAGE_TIMINGS`
+
+A boolean value that determines if detailed timing information is printed during
+image compilation. Defaults to 0.
+
 ### `JULIA_EXCLUSIVE`
 
 If set to anything besides `0`, then Julia's thread policy is consistent with
diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 0337602cde27e..dd49e6b466474 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -600,7 +600,7 @@ static void get_fvars_gvars(Module &M, DenseMap<GlobalValue *, unsigned> &fvars,
 // Weight computation
 // It is important for multithreaded image building to be able to split work up
 // among the threads equally. The weight calculated here is an estimation of
-// how expensive a particular function is going to be to compile. 
+// how expensive a particular function is going to be to compile.
 
 struct FunctionInfo {
     size_t weight;
@@ -1193,7 +1193,7 @@ static void construct_vars(Module &M, Partition &partition) {
 }
 
 // Materialization will leave many unused declarations, which multiversioning would otherwise clone.
-// This function removes them to avoid unnecessary cloning of declarations. 
+// This function removes them to avoid unnecessary cloning of declarations.
 static void dropUnusedDeclarations(Module &M) {
     SmallVector<GlobalValue *> unused;
     for (auto &G : M.global_values()) {
@@ -1211,7 +1211,7 @@ static void dropUnusedDeclarations(Module &M) {
 }
 
 // Entrypoint to optionally-multithreaded image compilation. This handles global coordination of the threading,
-// as well as partitioning, serialization, and deserialization. 
+// as well as partitioning, serialization, and deserialization.
 static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &outputs, ArrayRef<StringRef> names,
                 std::vector<NewArchiveMember> &unopt, std::vector<NewArchiveMember> &opt,
                 std::vector<NewArchiveMember> &obj, std::vector<NewArchiveMember> &asm_,
diff --git a/src/processor.h b/src/processor.h
index 497a93d40e11f..d2280068fb67d 100644
--- a/src/processor.h
+++ b/src/processor.h
@@ -106,11 +106,10 @@ typedef struct {
 
 // Per-shard data for image shards. Each image contains header->nshards of these.
 typedef struct {
-    
+
     // This is the base function pointer
     // (all other function pointers are stored as offsets to this address)
     const char *fvar_base;
-    
     // The array of function pointer offsets (`int32_t`) from the base pointer.
     // This includes all julia functions in sysimg as well as all other functions that are cloned.
     // The default function pointer is used if the function is cloned.
@@ -125,7 +124,7 @@ typedef struct {
     // staticdata.c relies on the same order of functions in the global function array being
     // the same as what it saw when serializing the global function array. However, partitioning
     // into multiple shards will cause functions to be reordered. This array is used to map
-    // back to the original function array for loading. 
+    // back to the original function array for loading.
     const uint32_t *fvar_idxs;
     // This is the base data pointer
     // (all other data pointers in this shard are stored as offsets to this address)
@@ -136,7 +135,6 @@ typedef struct {
     // This is the mapping of shard global variable index -> global global variable index
     // Similar to fvar_idxs, but for gvars
     const uint32_t *gvar_idxs;
-
     // location and index of relocation slots.
     // Stored as pairs of function indices and `int32_t` offsets from `jl_sysimg_gvars_base`.
     // The first element is an `uint32_t` giving the number of relocations.