From 7d68c8457722a091b740cf4a939c92148c82892e Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sun, 20 Feb 2022 12:54:50 -0500 Subject: [PATCH 1/7] WIP: Start new pipeline --- src/aotcompile.cpp | 284 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 284 insertions(+) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 7a19b34bd6824..131a57bde8d25 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -45,6 +45,12 @@ #endif #endif +// NewPM needs to manually include all the pass headers +#include +#include "llvm/Transforms/Scalar/InstSimplifyPass.h" +#include "llvm/Transforms/Scalar/SimplifyCFG.h" + + // for outputting code #include #include @@ -937,6 +943,284 @@ void jl_add_optimization_passes_impl(LLVMPassManagerRef PM, int opt_level, int l addOptimizationPasses(unwrap(PM), opt_level, lower_intrinsics); } +// new pass manager + +// FunctionAnalysisManager registerFunctionAnalysis(PassBuilder &PB, TargetMachine *TM) +// { +// llvm::FunctionAnalysisManager FAM; +// FAM.registerPass([&] { return llvm::TargetIRAnalysis(TM->getTargetIRAnalysis()); }); +// FAM.registerPass([&] { return llvm::TargetLibraryAnalysis(llvm::TargetLibraryInfoImpl(TM->getTargetTriple())); }); + +// return FAM; +// } + + + +// LoopAnalysisManager registerLoopAnalysis() +// { +// llvm::LoopAnalysisManager LAM; +// llvm::PassBuilder& PB; +// PB.registerLoopAnalyses(LAM); +// return LAM; +// } + + +void constructPipeline(TargetMachine *TM, int opt_level, bool lower_intrinsics, bool dump_native) { + // llvm::PassBuilder pb(targetMachine->LLVM, llvm::PipelineTuningOptions(), llvm::None, &passInstrumentationCallbacks); + PassBuilder PB; + // Create the analysis managers. + LoopAnalysisManager LAM; + PB.registerLoopAnalyses(LAM); + + // do we need to do this manually? 
+ // https://www.duskborn.com/posts/llvm-new-pass-manager/#porting-our-custom-pass-pipeline does + FunctionAnalysisManager FAM; + FAM.registerPass([&] { return llvm::TargetIRAnalysis(TM->getTargetIRAnalysis()); }); + FAM.registerPass([&] { return llvm::TargetLibraryAnalysis(llvm::TargetLibraryInfoImpl(TM->getTargetTriple())); }); + + PB.registerFunctionAnalyses(FAM); + + CGSCCAnalysisManager CGAM; + PB.registerCGSCCAnalyses(CGAM); + + ModuleAnalysisManager MAM; + PB.registerModuleAnalyses(MAM); + + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + + ModulePassManager MPM; + + // Construct a pipeline. + // TODO: CommonInstruction hoisting/sinking enables AllocOpt + // to merge allocations and sometimes eliminate them, + // since AllocOpt does not handle PhiNodes. + // Enable this instruction hoisting because of this and Union benchmarks. + auto simplifyCFGOptions = SimplifyCFGOptions().hoistCommonInsts(true); + +// #ifdef JL_DEBUG_BUILD + { + FunctionPassManager FPM; + FPM.addPass(GCInvariantVerifierPass()); + FPM.addPass(VerifierPass()); + + MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); + } +// #endif + + MPM.addPass(ConstantMergePass()); + if (opt_level < 2) { + if (!dump_native) { + // we won't be multiversioning, so lower CPU feature checks early on + // so that we can avoid an additional CFG simplification pass at the end. + MPM.addPass(CPUFeatures()); + if (opt_level == 1) + MPM.addPass(InstSimplifyPass()); + } + MPM.addPass(SimplifyCFGPass(simplifyCFGOptions)); + if (opt_level == 1) { +// PM->add(createSROAPass()); +// PM->add(createInstructionCombiningPass()); +// PM->add(createEarlyCSEPass()); +// // maybe add GVN? 
+// // also try GVNHoist and GVNSink + } +// PM->add(createMemCpyOptPass()); +// PM->add(createAlwaysInlinerLegacyPass()); // Respect always_inline +// PM->add(createLowerSimdLoopPass()); // Annotate loop marked with "loopinfo" as LLVM parallel loop + if (lower_intrinsics) { +// PM->add(createBarrierNoopPass()); +// PM->add(createLowerExcHandlersPass()); +// PM->add(createGCInvariantVerifierPass(false)); +// PM->add(createRemoveNIPass()); +// PM->add(createLateLowerGCFramePass()); +// PM->add(createFinalLowerGCPass()); +// PM->add(createLowerPTLSPass(dump_native)); + } + else { +// PM->add(createRemoveNIPass()); + } +// PM->add(createLowerSimdLoopPass()); // Annotate loop marked with "loopinfo" as LLVM parallel loop + if (dump_native) { +// PM->add(createMultiVersioningPass()); +// PM->add(createCPUFeaturesPass()); +// // minimal clean-up to get rid of CPU feature checks + if (opt_level == 1) { +// PM->add(createInstSimplifyLegacyPass()); +// PM->add(createCFGSimplificationPass(simplifyCFGOptions)); + } + } +// #if defined(_COMPILER_ASAN_ENABLED_) +// PM->add(createAddressSanitizerFunctionPass()); +// #endif +// #if defined(_COMPILER_MSAN_ENABLED_) +// PM->add(createMemorySanitizerPass(true)); +// #endif +// #if defined(_COMPILER_TSAN_ENABLED_) +// PM->add(createThreadSanitizerLegacyPassPass()); +// #endif + return; + } +// PM->add(createPropagateJuliaAddrspaces()); +// PM->add(createScopedNoAliasAAWrapperPass()); +// PM->add(createTypeBasedAAWrapperPass()); + if (opt_level >= 3) { +// PM->add(createBasicAAWrapperPass()); + } + +// PM->add(createCFGSimplificationPass(simplifyCFGOptions)); +// PM->add(createDeadCodeEliminationPass()); +// PM->add(createSROAPass()); + +// //PM->add(createMemCpyOptPass()); + +// PM->add(createAlwaysInlinerLegacyPass()); // Respect always_inline + +// // Running `memcpyopt` between this and `sroa` seems to give `sroa` a hard time +// // merging the `alloca` for the unboxed data and the `alloca` created by the `alloc_opt` +// // pass. 
+// PM->add(createAllocOptPass()); +// // consider AggressiveInstCombinePass at optlevel > 2 +// PM->add(createInstructionCombiningPass()); +// PM->add(createCFGSimplificationPass(simplifyCFGOptions)); +// if (dump_native) +// PM->add(createMultiVersioningPass()); +// PM->add(createCPUFeaturesPass()); +// PM->add(createSROAPass()); +// PM->add(createInstSimplifyLegacyPass()); +// PM->add(createJumpThreadingPass()); +// PM->add(createCorrelatedValuePropagationPass()); + +// PM->add(createReassociatePass()); + +// PM->add(createEarlyCSEPass()); + +// // Load forwarding above can expose allocations that aren't actually used +// // remove those before optimizing loops. +// PM->add(createAllocOptPass()); +// PM->add(createLoopRotatePass()); +// // moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1) +// #ifdef USE_POLLY +// // LCSSA (which has already run at this point due to the dependencies of the +// // above passes) introduces redundant phis that hinder Polly. Therefore we +// // run InstCombine here to remove them. 
+// PM->add(createInstructionCombiningPass()); +// PM->add(polly::createCodePreparationPass()); +// polly::registerPollyPasses(*PM); +// PM->add(polly::createCodegenCleanupPass()); +// #endif +// // LoopRotate strips metadata from terminator, so run LowerSIMD afterwards +// PM->add(createLowerSimdLoopPass()); // Annotate loop marked with "loopinfo" as LLVM parallel loop +// PM->add(createLICMPass()); +// PM->add(createJuliaLICMPass()); +// PM->add(createLoopUnswitchPass()); +// PM->add(createLICMPass()); +// PM->add(createJuliaLICMPass()); +// PM->add(createInductiveRangeCheckEliminationPass()); // Must come before indvars +// // Subsequent passes not stripping metadata from terminator +// PM->add(createInstSimplifyLegacyPass()); +// PM->add(createLoopIdiomPass()); +// PM->add(createIndVarSimplifyPass()); +// PM->add(createLoopDeletionPass()); +// PM->add(createSimpleLoopUnrollPass()); + +// // Run our own SROA on heap objects before LLVM's +// PM->add(createAllocOptPass()); +// // Re-run SROA after loop-unrolling (useful for small loops that operate, +// // over the structure of an aggregate) +// PM->add(createSROAPass()); +// // might not be necessary: +// PM->add(createInstSimplifyLegacyPass()); + +// PM->add(createGVNPass()); +// PM->add(createMemCpyOptPass()); +// PM->add(createSCCPPass()); + +// //These next two passes must come before IRCE to eliminate the bounds check in #43308 +// PM->add(createCorrelatedValuePropagationPass()); +// PM->add(createDeadCodeEliminationPass()); + +// PM->add(createInductiveRangeCheckEliminationPass()); // Must come between the two GVN passes + +// // Run instcombine after redundancy elimination to exploit opportunities +// // opened up by them. +// // This needs to be InstCombine instead of InstSimplify to allow +// // loops over Union-typed arrays to vectorize. 
+// PM->add(createInstructionCombiningPass()); +// PM->add(createJumpThreadingPass()); +// if (opt_level >= 3) { +// PM->add(createGVNPass()); // Must come after JumpThreading and before LoopVectorize +// } +// PM->add(createDeadStoreEliminationPass()); + +// // More dead allocation (store) deletion before loop optimization +// // consider removing this: +// PM->add(createAllocOptPass()); +// // see if all of the constant folding has exposed more loops +// // to simplification and deletion +// // this helps significantly with cleaning up iteration +// PM->add(createCFGSimplificationPass()); // See note above, don't hoist instructions before LV +// PM->add(createLoopDeletionPass()); +// PM->add(createInstructionCombiningPass()); +// PM->add(createLoopVectorizePass()); +// PM->add(createLoopLoadEliminationPass()); +// // Cleanup after LV pass +// PM->add(createInstructionCombiningPass()); +// PM->add(createCFGSimplificationPass( // Aggressive CFG simplification +// SimplifyCFGOptions() +// .forwardSwitchCondToPhi(true) +// .convertSwitchToLookupTable(true) +// .needCanonicalLoops(false) +// .hoistCommonInsts(true) +// // .sinkCommonInsts(true) // FIXME: Causes assertion in llvm-late-lowering +// )); +// PM->add(createSLPVectorizerPass()); +// // might need this after LLVM 11: +// //PM->add(createVectorCombinePass()); + +// PM->add(createAggressiveDCEPass()); + +// if (lower_intrinsics) { +// // LowerPTLS removes an indirect call. As a result, it is likely to trigger +// // LLVM's devirtualization heuristics, which would result in the entire +// // pass pipeline being re-exectuted. Prevent this by inserting a barrier. +// PM->add(createBarrierNoopPass()); +// PM->add(createLowerExcHandlersPass()); +// PM->add(createGCInvariantVerifierPass(false)); +// // Needed **before** LateLowerGCFrame on LLVM < 12 +// // due to bug in `CreateAlignmentAssumption`. 
+// PM->add(createRemoveNIPass()); +// PM->add(createLateLowerGCFramePass()); +// PM->add(createFinalLowerGCPass()); +// // We need these two passes and the instcombine below +// // after GC lowering to let LLVM do some constant propagation on the tags. +// // and remove some unnecessary write barrier checks. +// PM->add(createGVNPass()); +// PM->add(createSCCPPass()); +// // Remove dead use of ptls +// PM->add(createDeadCodeEliminationPass()); +// PM->add(createLowerPTLSPass(dump_native)); +// PM->add(createInstructionCombiningPass()); +// // Clean up write barrier and ptls lowering +// PM->add(createCFGSimplificationPass()); +// } +// else { +// PM->add(createRemoveNIPass()); +// } +// PM->add(createCombineMulAddPass()); +// PM->add(createDivRemPairsPass()); +// #if defined(_COMPILER_ASAN_ENABLED_) +// PM->add(createAddressSanitizerFunctionPass()); +// #endif +// #if defined(_COMPILER_MSAN_ENABLED_) +// PM->add(createMemorySanitizerPass(true)); +// #endif +// #if defined(_COMPILER_TSAN_ENABLED_) +// PM->add(createThreadSanitizerLegacyPassPass()); +// #endif +// } + +} + // new pass manager plugin // NOTE: Instead of exporting all the constructors in passes.h we could From 3b18637d7ffe6e1299c34958a3bf4795fa1fe90a Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sat, 26 Feb 2022 23:30:57 -0500 Subject: [PATCH 2/7] finish translating pipeline --- src/aotcompile.cpp | 559 ++++++++++++++++++++++++++------------------- 1 file changed, 321 insertions(+), 238 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 131a57bde8d25..458f83692df6e 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -46,9 +46,41 @@ #endif // NewPM needs to manually include all the pass headers +#include "llvm/Transforms/IPO/AlwaysInliner.h" #include +#include "llvm/Transforms/InstCombine/InstCombine.h" +#include "llvm/Transforms/Instrumentation/AddressSanitizer.h" +#include "llvm/Transforms/Instrumentation/MemorySanitizer.h" +#include 
"llvm/Transforms/Instrumentation/ThreadSanitizer.h" +#include "llvm/Transforms/Scalar/ADCE.h" +#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" +#include "llvm/Transforms/Scalar/DCE.h" +#include "llvm/Transforms/Scalar/DeadStoreElimination.h" +#include "llvm/Transforms/Scalar/DivRemPairs.h" +#include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Scalar/IndVarSimplify.h" +#include "llvm/Transforms/Scalar/InductiveRangeCheckElimination.h" #include "llvm/Transforms/Scalar/InstSimplifyPass.h" +#include "llvm/Transforms/Scalar/JumpThreading.h" +#include "llvm/Transforms/Scalar/LICM.h" +#include "llvm/Transforms/Scalar/LoopDeletion.h" +#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" +#include "llvm/Transforms/Scalar/LoopInstSimplify.h" +#include "llvm/Transforms/Scalar/LoopLoadElimination.h" +#include "llvm/Transforms/Scalar/LoopRotation.h" +#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h" +#include "llvm/Transforms/Scalar/LoopUnrollPass.h" +#include "llvm/Transforms/Scalar/MemCpyOptimizer.h" +#include "llvm/Transforms/Scalar/Reassociate.h" +#include "llvm/Transforms/Scalar/SCCP.h" +#include "llvm/Transforms/Scalar/SROA.h" +#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" +#include "llvm/Transforms/Vectorize/LoopVectorize.h" +#include "llvm/Transforms/Vectorize/SLPVectorizer.h" +#include "llvm/Transforms/Vectorize/VectorCombine.h" + // for outputting code @@ -944,53 +976,8 @@ void jl_add_optimization_passes_impl(LLVMPassManagerRef PM, int opt_level, int l } // new pass manager - -// FunctionAnalysisManager registerFunctionAnalysis(PassBuilder &PB, TargetMachine *TM) -// { -// llvm::FunctionAnalysisManager FAM; -// FAM.registerPass([&] { return llvm::TargetIRAnalysis(TM->getTargetIRAnalysis()); }); -// FAM.registerPass([&] { return llvm::TargetLibraryAnalysis(llvm::TargetLibraryInfoImpl(TM->getTargetTriple())); }); - -// return FAM; -// } 
- - - -// LoopAnalysisManager registerLoopAnalysis() -// { -// llvm::LoopAnalysisManager LAM; -// llvm::PassBuilder& PB; -// PB.registerLoopAnalyses(LAM); -// return LAM; -// } - - -void constructPipeline(TargetMachine *TM, int opt_level, bool lower_intrinsics, bool dump_native) { - // llvm::PassBuilder pb(targetMachine->LLVM, llvm::PipelineTuningOptions(), llvm::None, &passInstrumentationCallbacks); - PassBuilder PB; - // Create the analysis managers. - LoopAnalysisManager LAM; - PB.registerLoopAnalyses(LAM); - - // do we need to do this manually? - // https://www.duskborn.com/posts/llvm-new-pass-manager/#porting-our-custom-pass-pipeline does - FunctionAnalysisManager FAM; - FAM.registerPass([&] { return llvm::TargetIRAnalysis(TM->getTargetIRAnalysis()); }); - FAM.registerPass([&] { return llvm::TargetLibraryAnalysis(llvm::TargetLibraryInfoImpl(TM->getTargetTriple())); }); - - PB.registerFunctionAnalyses(FAM); - - CGSCCAnalysisManager CGAM; - PB.registerCGSCCAnalyses(CGAM); - - ModuleAnalysisManager MAM; - PB.registerModuleAnalyses(MAM); - - PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); - - ModulePassManager MPM; - - // Construct a pipeline. +void addPipeline(ModulePassManager &MPM, int opt_level, bool lower_intrinsics, bool dump_native) +{ // TODO: CommonInstruction hoisting/sinking enables AllocOpt // to merge allocations and sometimes eliminate them, // since AllocOpt does not handle PhiNodes. @@ -1002,7 +989,6 @@ void constructPipeline(TargetMachine *TM, int opt_level, bool lower_intrinsics, FunctionPassManager FPM; FPM.addPass(GCInvariantVerifierPass()); FPM.addPass(VerifierPass()); - MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); } // #endif @@ -1013,212 +999,309 @@ void constructPipeline(TargetMachine *TM, int opt_level, bool lower_intrinsics, // we won't be multiversioning, so lower CPU feature checks early on // so that we can avoid an additional CFG simplification pass at the end. 
MPM.addPass(CPUFeatures()); - if (opt_level == 1) - MPM.addPass(InstSimplifyPass()); + if (opt_level == 1) { + FunctionPassManager FPM; + FPM.addPass(InstSimplifyPass()); + MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); + } } - MPM.addPass(SimplifyCFGPass(simplifyCFGOptions)); - if (opt_level == 1) { -// PM->add(createSROAPass()); -// PM->add(createInstructionCombiningPass()); -// PM->add(createEarlyCSEPass()); -// // maybe add GVN? -// // also try GVNHoist and GVNSink + { + FunctionPassManager FPM; + FPM.addPass(SimplifyCFGPass(simplifyCFGOptions)); + if (opt_level == 1) { + FPM.addPass(SROA()); + FPM.addPass(InstCombinePass()); + FPM.addPass(EarlyCSEPass()); + // maybe add GVN? + // also try GVNHoist and GVNSink + } + FPM.addPass(MemCpyOptPass()); + MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); } -// PM->add(createMemCpyOptPass()); -// PM->add(createAlwaysInlinerLegacyPass()); // Respect always_inline -// PM->add(createLowerSimdLoopPass()); // Annotate loop marked with "loopinfo" as LLVM parallel loop + MPM.addPass(AlwaysInlinerPass()); + MPM.addPass(LowerSIMDLoop()); // Annotate loop marked with "loopinfo" as LLVM parallel loop if (lower_intrinsics) { -// PM->add(createBarrierNoopPass()); -// PM->add(createLowerExcHandlersPass()); -// PM->add(createGCInvariantVerifierPass(false)); -// PM->add(createRemoveNIPass()); -// PM->add(createLateLowerGCFramePass()); -// PM->add(createFinalLowerGCPass()); -// PM->add(createLowerPTLSPass(dump_native)); + // TODO: Barrier Pass? 
+ { + FunctionPassManager FPM; + FPM.addPass(LowerExcHandlers()); + FPM.addPass(GCInvariantVerifierPass(false)); + MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); + } + MPM.addPass(RemoveNI()); + { + FunctionPassManager FPM; + FPM.addPass(LateLowerGC()); + MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); + } + MPM.addPass(FinalLowerGCPass()); + MPM.addPass(LowerPTLSPass(dump_native)); } else { -// PM->add(createRemoveNIPass()); + MPM.addPass(RemoveNI()); } -// PM->add(createLowerSimdLoopPass()); // Annotate loop marked with "loopinfo" as LLVM parallel loop + MPM.addPass(LowerSIMDLoop()); // Annotate loop marked with "loopinfo" as LLVM parallel loop if (dump_native) { -// PM->add(createMultiVersioningPass()); -// PM->add(createCPUFeaturesPass()); -// // minimal clean-up to get rid of CPU feature checks + MPM.addPass(MultiVersioning()); + MPM.addPass(CPUFeatures()); + // minimal clean-up to get rid of CPU feature checks if (opt_level == 1) { -// PM->add(createInstSimplifyLegacyPass()); -// PM->add(createCFGSimplificationPass(simplifyCFGOptions)); + FunctionPassManager FPM; + FPM.addPass(InstSimplifyPass()); + FPM.addPass(SimplifyCFGPass(simplifyCFGOptions)); + MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); } } -// #if defined(_COMPILER_ASAN_ENABLED_) -// PM->add(createAddressSanitizerFunctionPass()); -// #endif -// #if defined(_COMPILER_MSAN_ENABLED_) -// PM->add(createMemorySanitizerPass(true)); -// #endif -// #if defined(_COMPILER_TSAN_ENABLED_) -// PM->add(createThreadSanitizerLegacyPassPass()); -// #endif + { + FunctionPassManager FPM; +#if defined(_COMPILER_ASAN_ENABLED_) + FPM.addPass(AddressSanitizerPass()); +#endif +#if defined(_COMPILER_MSAN_ENABLED_) + FPM.addPass(MemorySanitizerPass()); +#endif +#if defined(_COMPILER_TSAN_ENABLED_) + FPM.addPass(ThreadSanitizerPass()); +#endif + MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); + } return; } -// 
PM->add(createPropagateJuliaAddrspaces()); -// PM->add(createScopedNoAliasAAWrapperPass()); -// PM->add(createTypeBasedAAWrapperPass()); + { + FunctionPassManager FPM; + FPM.addPass(PropagateJuliaAddrspacesPass()); + FPM.addPass(SimplifyCFGPass(simplifyCFGOptions)); + FPM.addPass(DCEPass()); + FPM.addPass(SROA()); + // FPM.addPass(MemCpyOptPass()); + MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); + } + MPM.addPass(AlwaysInlinerPass()); // Respect always_inline + // Running `memcpyopt` between this and `sroa` seems to give `sroa` a hard time + // merging the `alloca` for the unboxed data and the `alloca` created by the `alloc_opt` + // pass. + { + FunctionPassManager FPM; + FPM.addPass(AllocOptPass()); + // consider AggressiveInstCombinePass at optlevel > 2 + FPM.addPass(InstCombinePass()); + FPM.addPass(SimplifyCFGPass(simplifyCFGOptions)); + // Schedule the passes above; an FPM that is not adapted into MPM never runs. + MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); + } + + if (dump_native) + MPM.addPass(MultiVersioning()); + MPM.addPass(CPUFeatures()); + { + FunctionPassManager FPM; + FPM.addPass(SROA()); + FPM.addPass(InstSimplifyPass()); + FPM.addPass(JumpThreadingPass()); + FPM.addPass(CorrelatedValuePropagationPass()); + FPM.addPass(ReassociatePass()); + FPM.addPass(EarlyCSEPass()); + // Load forwarding above can expose allocations that aren't actually used + // remove those before optimizing loops. + FPM.addPass(AllocOptPass()); + { + LoopPassManager LPM; + LPM.addPass(LoopRotatePass()); + // TODO: Can we use MemorySSA? 
+ FPM.addPass(llvm::createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/false)); + } + // moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1) + // LoopRotate strips metadata from terminator, so run LowerSIMD afterwards + MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); + } + MPM.addPass(LowerSIMDLoop()); // Annotate loop marked with "loopinfo" as LLVM parallel loop + { + FunctionPassManager FPM; + { + LoopPassManager LPM; + LPM.addPass(LICMPass()); + LPM.addPass(JuliaLICMPass()); + LPM.addPass(SimpleLoopUnswitchPass()); + LPM.addPass(LICMPass()); + LPM.addPass(JuliaLICMPass()); + // TODO: Can we use MemorySSA? + FPM.addPass(llvm::createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/false)); + } + FPM.addPass(IRCEPass()); // must come before indvars + { + LoopPassManager LPM; + // Subsequent passes not stripping metadata from terminator + LPM.addPass(LoopInstSimplifyPass()); // NOTE: OldPM we used InstSimplifyPass + LPM.addPass(LoopIdiomRecognizePass()); + LPM.addPass(IndVarSimplifyPass()); + LPM.addPass(LoopDeletionPass()); + + // TODO: Can we use MemorySSA? 
+ FPM.addPass(llvm::createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/false)); + } + FPM.addPass(LoopUnrollPass()); // Note: OldPM SimpleLoopUnrollPass + FPM.addPass(InstSimplifyPass()); // NOTE: Additionally for NewPM + // Run our own SROA on heap objects before LLVM's + FPM.addPass(AllocOptPass()); + // Re-run SROA after loop-unrolling (useful for small loops that operate, + // over the structure of an aggregate) + FPM.addPass(SROA()); + FPM.addPass(InstSimplifyPass()); + FPM.addPass(GVN()); + FPM.addPass(MemCpyOptPass()); + FPM.addPass(SCCPPass()); + // These next two passes must come before IRCE to eliminate the bounds check in #43308 + FPM.addPass(CorrelatedValuePropagationPass()); + FPM.addPass(DCEPass()); + FPM.addPass(IRCEPass()); // Must come between the two GVN passes + // Run instcombine after redundancy elimination to exploit opportunities + // opened up by them. + // This needs to be InstCombine instead of InstSimplify to allow + // loops over Union-typed arrays to vectorize. 
+ FPM.addPass(InstCombinePass()); + FPM.addPass(JumpThreadingPass()); + if (opt_level >= 3) { + FPM.addPass(GVN()); // Must come after JumpThreading and before LoopVectorize + } + FPM.addPass(DSEPass()); + // More dead allocation (store) deletion before loop optimization + // consider removing this: + FPM.addPass(AllocOptPass()); + // see if all of the constant folding has exposed more loops + // to simplification and deletion + // this helps significantly with cleaning up iteration + { + LoopPassManager LPM; + LPM.addPass(LoopSimplifyCFGPass()); // NOTE: OldPM we used SimplifyCFGPass + LPM.addPass(LoopDeletionPass()); + LPM.addPass(LoopInstSimplifyPass()); // NOTE: OldPM we used InstCombinePass + FPM.addPass(llvm::createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/false)); + } + FPM.addPass(LoopVectorizePass()); + FPM.addPass(LoopLoadEliminationPass()); + // Cleanup after LV pass + FPM.addPass(InstCombinePass()); + FPM.addPass(SimplifyCFGPass( // Aggressive CFG simplification + SimplifyCFGOptions() + .forwardSwitchCondToPhi(true) + .convertSwitchToLookupTable(true) + .needCanonicalLoops(false) + .hoistCommonInsts(true) + // .sinkCommonInsts(true) // FIXME: Causes assertion in llvm-late-lowering + )); + FPM.addPass(SLPVectorizerPass()); + // might need this after LLVM 11: + // FPM.addPass(VectorCombinePass()); + FPM.addPass(ADCEPass()); + MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); + } + + if (lower_intrinsics) { + // LowerPTLS removes an indirect call. As a result, it is likely to trigger + // LLVM's devirtualization heuristics, which would result in the entire + // pass pipeline being re-executed. Prevent this by inserting a barrier. + // TODO: Barrier Pass? + // Maybe by creating a new pass manager and nesting it inside MPM? 
+ // ModulePassManager NestedMPM; + // MPM.addPass(std::move(NestedMPM)); + { + FunctionPassManager FPM; + FPM.addPass(LowerExcHandlers()); + FPM.addPass(GCInvariantVerifierPass(false)); + MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); + } + // Needed **before** LateLowerGCFrame on LLVM < 12 + // due to bug in `CreateAlignmentAssumption`. + MPM.addPass(RemoveNI()); + { + FunctionPassManager FPM; + FPM.addPass(LateLowerGC()); + MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); + } + MPM.addPass(FinalLowerGCPass()); + // We need these two passes and the instcombine below + // after GC lowering to let LLVM do some constant propagation on the tags. + // and remove some unnecessary write barrier checks. + { + FunctionPassManager FPM; + FPM.addPass(GVN()); + FPM.addPass(SCCPPass()); + // Remove dead use of ptls + FPM.addPass(DCEPass()); + MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); + } + MPM.addPass(LowerPTLSPass(dump_native)); + // Clean up write barrier and ptls lowering + { + FunctionPassManager FPM; + FPM.addPass(InstCombinePass()); + FPM.addPass(SimplifyCFGPass()); + // Schedule the cleanup; an FPM that is not adapted into MPM never runs. + MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); + } + } + else { + MPM.addPass(RemoveNI()); + } + { + FunctionPassManager FPM; + FPM.addPass(CombineMulAdd()); + FPM.addPass(DivRemPairsPass()); +#if defined(_COMPILER_ASAN_ENABLED_) + FPM.addPass(AddressSanitizerPass()); +#endif +#if defined(_COMPILER_MSAN_ENABLED_) + FPM.addPass(MemorySanitizerPass()); +#endif +#if defined(_COMPILER_TSAN_ENABLED_) + FPM.addPass(ThreadSanitizerPass()); +#endif + MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); + } +} + +// TODO(vchuravy/maleadt): +// Since we are not using the PassBuilder fully and instead rolling our own, we are missing out on +// TargetMachine::registerPassBuilderCallbacks. We need to find a solution either in working with upstream +// or adapting PassBuilder (or subclassing it) to suit our needs. This is in particular important for +// BPF, NVPTX, and AMDGPU. 
+ +void optimizeModule(Module &M, TargetMachine *TM, int opt_level, bool lower_intrinsics, bool dump_native) +{ + // llvm::PassBuilder pb(targetMachine->LLVM, llvm::PipelineTuningOptions(), llvm::None, &passInstrumentationCallbacks); + PassBuilder PB; + // Create the analysis managers. + LoopAnalysisManager LAM; + PB.registerLoopAnalyses(LAM); + + // Alias analyses mirror the legacy pipeline: BasicAA at -O3, ScopedNoAlias and TBAA from -O2. + AAManager AA; + // TODO: Why are we only doing this for -O3? if (opt_level >= 3) { -// PM->add(createBasicAAWrapperPass()); + AA.registerFunctionAnalysis<BasicAA>(); } + if (opt_level >= 2) { + AA.registerFunctionAnalysis<ScopedNoAliasAA>(); + AA.registerFunctionAnalysis<TypeBasedAA>(); + } + // TM->registerDefaultAliasAnalyses(AA); -// PM->add(createCFGSimplificationPass(simplifyCFGOptions)); -// PM->add(createDeadCodeEliminationPass()); -// PM->add(createSROAPass()); - -// //PM->add(createMemCpyOptPass()); - -// PM->add(createAlwaysInlinerLegacyPass()); // Respect always_inline - -// // Running `memcpyopt` between this and `sroa` seems to give `sroa` a hard time -// // merging the `alloca` for the unboxed data and the `alloca` created by the `alloc_opt` -// // pass. -// PM->add(createAllocOptPass()); -// // consider AggressiveInstCombinePass at optlevel > 2 -// PM->add(createInstructionCombiningPass()); -// PM->add(createCFGSimplificationPass(simplifyCFGOptions)); -// if (dump_native) -// PM->add(createMultiVersioningPass()); -// PM->add(createCPUFeaturesPass()); -// PM->add(createSROAPass()); -// PM->add(createInstSimplifyLegacyPass()); -// PM->add(createJumpThreadingPass()); -// PM->add(createCorrelatedValuePropagationPass()); - -// PM->add(createReassociatePass()); - -// PM->add(createEarlyCSEPass()); - -// // Load forwarding above can expose allocations that aren't actually used -// // remove those before optimizing loops. 
-// PM->add(createAllocOptPass()); -// PM->add(createLoopRotatePass()); -// // moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1) -// #ifdef USE_POLLY -// // LCSSA (which has already run at this point due to the dependencies of the -// // above passes) introduces redundant phis that hinder Polly. Therefore we -// // run InstCombine here to remove them. -// PM->add(createInstructionCombiningPass()); -// PM->add(polly::createCodePreparationPass()); -// polly::registerPollyPasses(*PM); -// PM->add(polly::createCodegenCleanupPass()); -// #endif -// // LoopRotate strips metadata from terminator, so run LowerSIMD afterwards -// PM->add(createLowerSimdLoopPass()); // Annotate loop marked with "loopinfo" as LLVM parallel loop -// PM->add(createLICMPass()); -// PM->add(createJuliaLICMPass()); -// PM->add(createLoopUnswitchPass()); -// PM->add(createLICMPass()); -// PM->add(createJuliaLICMPass()); -// PM->add(createInductiveRangeCheckEliminationPass()); // Must come before indvars -// // Subsequent passes not stripping metadata from terminator -// PM->add(createInstSimplifyLegacyPass()); -// PM->add(createLoopIdiomPass()); -// PM->add(createIndVarSimplifyPass()); -// PM->add(createLoopDeletionPass()); -// PM->add(createSimpleLoopUnrollPass()); - -// // Run our own SROA on heap objects before LLVM's -// PM->add(createAllocOptPass()); -// // Re-run SROA after loop-unrolling (useful for small loops that operate, -// // over the structure of an aggregate) -// PM->add(createSROAPass()); -// // might not be necessary: -// PM->add(createInstSimplifyLegacyPass()); - -// PM->add(createGVNPass()); -// PM->add(createMemCpyOptPass()); -// PM->add(createSCCPPass()); - -// //These next two passes must come before IRCE to eliminate the bounds check in #43308 -// PM->add(createCorrelatedValuePropagationPass()); -// PM->add(createDeadCodeEliminationPass()); - -// PM->add(createInductiveRangeCheckEliminationPass()); // Must come between the two GVN passes - 
-// // Run instcombine after redundancy elimination to exploit opportunities -// // opened up by them. -// // This needs to be InstCombine instead of InstSimplify to allow -// // loops over Union-typed arrays to vectorize. -// PM->add(createInstructionCombiningPass()); -// PM->add(createJumpThreadingPass()); -// if (opt_level >= 3) { -// PM->add(createGVNPass()); // Must come after JumpThreading and before LoopVectorize -// } -// PM->add(createDeadStoreEliminationPass()); - -// // More dead allocation (store) deletion before loop optimization -// // consider removing this: -// PM->add(createAllocOptPass()); -// // see if all of the constant folding has exposed more loops -// // to simplification and deletion -// // this helps significantly with cleaning up iteration -// PM->add(createCFGSimplificationPass()); // See note above, don't hoist instructions before LV -// PM->add(createLoopDeletionPass()); -// PM->add(createInstructionCombiningPass()); -// PM->add(createLoopVectorizePass()); -// PM->add(createLoopLoadEliminationPass()); -// // Cleanup after LV pass -// PM->add(createInstructionCombiningPass()); -// PM->add(createCFGSimplificationPass( // Aggressive CFG simplification -// SimplifyCFGOptions() -// .forwardSwitchCondToPhi(true) -// .convertSwitchToLookupTable(true) -// .needCanonicalLoops(false) -// .hoistCommonInsts(true) -// // .sinkCommonInsts(true) // FIXME: Causes assertion in llvm-late-lowering -// )); -// PM->add(createSLPVectorizerPass()); -// // might need this after LLVM 11: -// //PM->add(createVectorCombinePass()); - -// PM->add(createAggressiveDCEPass()); - -// if (lower_intrinsics) { -// // LowerPTLS removes an indirect call. As a result, it is likely to trigger -// // LLVM's devirtualization heuristics, which would result in the entire -// // pass pipeline being re-exectuted. Prevent this by inserting a barrier. 
-// PM->add(createBarrierNoopPass()); -// PM->add(createLowerExcHandlersPass()); -// PM->add(createGCInvariantVerifierPass(false)); -// // Needed **before** LateLowerGCFrame on LLVM < 12 -// // due to bug in `CreateAlignmentAssumption`. -// PM->add(createRemoveNIPass()); -// PM->add(createLateLowerGCFramePass()); -// PM->add(createFinalLowerGCPass()); -// // We need these two passes and the instcombine below -// // after GC lowering to let LLVM do some constant propagation on the tags. -// // and remove some unnecessary write barrier checks. -// PM->add(createGVNPass()); -// PM->add(createSCCPPass()); -// // Remove dead use of ptls -// PM->add(createDeadCodeEliminationPass()); -// PM->add(createLowerPTLSPass(dump_native)); -// PM->add(createInstructionCombiningPass()); -// // Clean up write barrier and ptls lowering -// PM->add(createCFGSimplificationPass()); -// } -// else { -// PM->add(createRemoveNIPass()); -// } -// PM->add(createCombineMulAddPass()); -// PM->add(createDivRemPairsPass()); -// #if defined(_COMPILER_ASAN_ENABLED_) -// PM->add(createAddressSanitizerFunctionPass()); -// #endif -// #if defined(_COMPILER_MSAN_ENABLED_) -// PM->add(createMemorySanitizerPass(true)); -// #endif -// #if defined(_COMPILER_TSAN_ENABLED_) -// PM->add(createThreadSanitizerLegacyPassPass()); -// #endif -// } + FunctionAnalysisManager FAM; + // Register the AA manager first so that our version is the one used. + FAM.registerPass([&] { return std::move(AA); }); + // Register our TargetLibraryInfoImpl. 
+ FAM.registerPass([&] { return llvm::TargetIRAnalysis(TM->getTargetIRAnalysis()); }); + FAM.registerPass([&] { return llvm::TargetLibraryAnalysis(llvm::TargetLibraryInfoImpl(TM->getTargetTriple())); }); + + PB.registerFunctionAnalyses(FAM); + + CGSCCAnalysisManager CGAM; + PB.registerCGSCCAnalyses(CGAM); + + ModuleAnalysisManager MAM; + PB.registerModuleAnalyses(MAM); + + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + + ModulePassManager MPM; + addPipeline(MPM, opt_level, lower_intrinsics, dump_native); + MPM.run(M, MAM); } // new pass manager plugin From d64d14e6ca6ead230f1f8c26fc946fe0ececc06b Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi Date: Mon, 23 May 2022 13:00:36 -0400 Subject: [PATCH 3/7] Add list of julia passes --- src/llvm-julia-passes.inc | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 src/llvm-julia-passes.inc diff --git a/src/llvm-julia-passes.inc b/src/llvm-julia-passes.inc new file mode 100644 index 0000000000000..9063780c7a66e --- /dev/null +++ b/src/llvm-julia-passes.inc @@ -0,0 +1,34 @@ +//TODO clobber files when this changes in Makefile + +//Module passes + +#ifndef MODULE_PASS +#define MODULE_PASS(NAME, CREATE_PASS) +#endif + +MODULE_PASS("CPUFeatures", CPUFeatures()) +MODULE_PASS("RemoveNI", RemoveNI()) +MODULE_PASS("LowerSIMDLoop", LowerSIMDLoop()) +MODULE_PASS("FinalLowerGC", FinalLowerGCPass()) +MODULE_PASS("JuliaMultiVersioning", MultiVersioning()) +MODULE_PASS("RemoveJuliaAddrspaces", RemoveJuliaAddrspacesPass()) +MODULE_PASS("RemoveAddrspaces", RemoveAddrspacesPass()) +MODULE_PASS("LowerPTLSPass", LowerPTLSPass()) + +#ifndef FUNCTION_PASS +#define FUNCTION_PASS(NAME, CREATE_PASS) +#endif +//Function passes +FUNCTION_PASS("DemoteFloat16", DemoteFloat16()) +FUNCTION_PASS("CombineMulAdd", CombineMulAdd()) +FUNCTION_PASS("LateLowerGCFrame", LateLowerGC()) +FUNCTION_PASS("AllocOpt", AllocOptPass()) +FUNCTION_PASS("PropagateJuliaAddrspaces", PropagateJuliaAddrspacesPass()) 
+FUNCTION_PASS("LowerExcHandlers", LowerExcHandlers()) +FUNCTION_PASS("GCInvariantVerifier", GCInvariantVerifierPass()) + +#ifndef LOOP_PASS +#define LOOP_PASS(NAME, CREATE_PASS) +#endif +//Loop passes +LOOP_PASS("JuliaLICM", JuliaLICMPass()) \ No newline at end of file From 017a474388b9aae2c68fde63f8d00eac2fd27d03 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi Date: Mon, 23 May 2022 13:37:04 -0400 Subject: [PATCH 4/7] Clean up pipeline parsing --- src/aotcompile.cpp | 76 +++++++++++----------------------------------- 1 file changed, 18 insertions(+), 58 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 458f83692df6e..79f0bd6188f02 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -1313,79 +1313,39 @@ static void registerCallbacks(PassBuilder &PB) { PB.registerPipelineParsingCallback( [](StringRef Name, FunctionPassManager &PM, ArrayRef InnerPipeline) { - if (Name == "DemoteFloat16") { - PM.addPass(DemoteFloat16()); - return true; - } - if (Name == "CombineMulAdd") { - PM.addPass(CombineMulAdd()); - return true; - } - if (Name == "LateLowerGCFrame") { - PM.addPass(LateLowerGC()); - return true; - } - if (Name == "AllocOpt") { - PM.addPass(AllocOptPass()); - return true; - } - if (Name == "PropagateJuliaAddrspaces") { - PM.addPass(PropagateJuliaAddrspacesPass()); - return true; - } - if (Name == "LowerExcHandlers") { - PM.addPass(LowerExcHandlers()); - return true; - } - if (Name == "GCInvariantVerifier") { - // TODO: Parse option and allow users to set `Strong` - PM.addPass(GCInvariantVerifierPass()); - return true; +#define FUNCTION_PASS(NAME, CREATE_PASS) \ + if (Name == NAME) { \ + PM.addPass(CREATE_PASS); \ + return true; \ } +#include "llvm-julia-passes.inc" +#undef FUNCTION_PASS return false; }); PB.registerPipelineParsingCallback( [](StringRef Name, ModulePassManager &PM, ArrayRef InnerPipeline) { - if (Name == "CPUFeatures") { - PM.addPass(CPUFeatures()); - return true; - } - if (Name == "RemoveNI") { - 
PM.addPass(RemoveNI()); - return true; - } - if (Name == "LowerSIMDLoop") { - PM.addPass(LowerSIMDLoop()); - return true; - } - if (Name == "FinalLowerGC") { - PM.addPass(FinalLowerGCPass()); - return true; - } - if (Name == "RemoveJuliaAddrspaces") { - PM.addPass(RemoveJuliaAddrspacesPass()); - return true; - } - if (Name == "MultiVersioning") { - PM.addPass(MultiVersioning()); - return true; - } - if (Name == "LowerPTLS") { - PM.addPass(LowerPTLSPass()); - return true; +#define MODULE_PASS(NAME, CREATE_PASS) \ + if (Name == NAME) { \ + PM.addPass(CREATE_PASS); \ + return true; \ } +#include "llvm-julia-passes.inc" +#undef MODULE_PASS return false; }); PB.registerPipelineParsingCallback( [](StringRef Name, LoopPassManager &PM, ArrayRef InnerPipeline) { - if (Name == "JuliaLICM") { - PM.addPass(JuliaLICMPass()); - return true; +#define LOOP_PASS(NAME, CREATE_PASS) \ + if (Name == NAME) { \ + PM.addPass(CREATE_PASS); \ + return true; \ } +#include "llvm-julia-passes.inc" +#undef LOOP_PASS return false; }); } From 4119135ba8b5b352c082ce9c4dd35106c249ca15 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi Date: Wed, 20 Jul 2022 03:25:46 -0400 Subject: [PATCH 5/7] Move NewPM pipeline to pipeline.cpp --- src/Makefile | 2 +- src/aotcompile.cpp | 330 +-------------------- src/jitlayers.h | 27 ++ src/llvm-julia-passes.inc | 19 +- src/pipeline.cpp | 584 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 621 insertions(+), 341 deletions(-) create mode 100644 src/pipeline.cpp diff --git a/src/Makefile b/src/Makefile index 90455d51e9345..e10c6f23808c2 100644 --- a/src/Makefile +++ b/src/Makefile @@ -56,7 +56,7 @@ CODEGEN_SRCS := codegen jitlayers aotcompile debuginfo disasm llvm-simdloop llvm llvm-final-gc-lowering llvm-pass-helpers llvm-late-gc-lowering llvm-ptls \ llvm-lower-handlers llvm-gc-invariant-verifier llvm-propagate-addrspaces \ llvm-multiversioning llvm-alloc-opt llvm-alloc-helpers cgmemmgr llvm-remove-addrspaces \ - llvm-remove-ni llvm-julia-licm 
llvm-demote-float16 llvm-cpufeatures + llvm-remove-ni llvm-julia-licm llvm-demote-float16 llvm-cpufeatures pipeline FLAGS += -I$(shell $(LLVM_CONFIG_HOST) --includedir) CG_LLVM_LIBS := all ifeq ($(USE_POLLY),1) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 79f0bd6188f02..a2c2d90672f0a 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -975,334 +975,8 @@ void jl_add_optimization_passes_impl(LLVMPassManagerRef PM, int opt_level, int l addOptimizationPasses(unwrap(PM), opt_level, lower_intrinsics); } -// new pass manager -void addPipeline(ModulePassManager &MPM, int opt_level, bool lower_intrinsics, bool dump_native) -{ - // TODO: CommonInstruction hoisting/sinking enables AllocOpt - // to merge allocations and sometimes eliminate them, - // since AllocOpt does not handle PhiNodes. - // Enable this instruction hoisting because of this and Union benchmarks. - auto simplifyCFGOptions = SimplifyCFGOptions().hoistCommonInsts(true); - -// #ifdef JL_DEBUG_BUILD - { - FunctionPassManager FPM; - FPM.addPass(GCInvariantVerifierPass()); - FPM.addPass(VerifierPass()); - MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); - } -// #endif - - MPM.addPass(ConstantMergePass()); - if (opt_level < 2) { - if (!dump_native) { - // we won't be multiversioning, so lower CPU feature checks early on - // so that we can avoid an additional CFG simplification pass at the end. - MPM.addPass(CPUFeatures()); - if (opt_level == 1) { - FunctionPassManager FPM; - FPM.addPass(InstSimplifyPass()); - MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); - } - } - { - FunctionPassManager FPM; - FPM.addPass(SimplifyCFGPass(simplifyCFGOptions)); - if (opt_level == 1) { - FPM.addPass(SROA()); - FPM.addPass(InstCombinePass()); - FPM.addPass(EarlyCSEPass()); - // maybe add GVN? 
- // also try GVNHoist and GVNSink - } - FPM.addPass(MemCpyOptPass()); - MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); - } - MPM.addPass(AlwaysInlinerPass()); - MPM.addPass(LowerSIMDLoop()); // Annotate loop marked with "loopinfo" as LLVM parallel loop - if (lower_intrinsics) { - // TODO: Barrier Pass? - { - FunctionPassManager FPM; - FPM.addPass(LowerExcHandlers()); - FPM.addPass(GCInvariantVerifierPass(false)); - MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); - } - MPM.addPass(RemoveNI()); - { - FunctionPassManager FPM; - FPM.addPass(LateLowerGC()); - MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); - } - MPM.addPass(FinalLowerGCPass()); - MPM.addPass(LowerPTLSPass(dump_native)); - } - else { - MPM.addPass(RemoveNI()); - } - MPM.addPass(LowerSIMDLoop()); // Annotate loop marked with "loopinfo" as LLVM parallel loop - if (dump_native) { - MPM.addPass(MultiVersioning()); - MPM.addPass(CPUFeatures()); - // minimal clean-up to get rid of CPU feature checks - if (opt_level == 1) { - FunctionPassManager FPM; - FPM.addPass(InstSimplifyPass()); - FPM.addPass(SimplifyCFGPass(simplifyCFGOptions)); - MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); - } - } - { - FunctionPassManager FPM; -#if defined(_COMPILER_ASAN_ENABLED_) - FPM.addPass(AddressSanitizerPass()); -#endif -#if defined(_COMPILER_MSAN_ENABLED_) - FPM.addPass(MemorySanitizerPass()); -#endif -#if defined(_COMPILER_TSAN_ENABLED_) - FPM.addPass(ThreadSanitizerPass()); -#endif - MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); - } - return; - } - { - FunctionPassManager FPM; - FPM.addPass(PropagateJuliaAddrspacesPass()); - FPM.addPass(SimplifyCFGPass(simplifyCFGOptions)); - FPM.addPass(DCEPass()); - FPM.addPass(SROA()); - // FPM.addPass(MemCpyOptPass()); - MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); - } - MPM.addPass(AlwaysInlinerPass()); // Respect always_inline - // Running 
`memcpyopt` between this and `sroa` seems to give `sroa` a hard time - // merging the `alloca` for the unboxed data and the `alloca` created by the `alloc_opt` - // pass. - { - FunctionPassManager FPM; - FPM.addPass(AllocOptPass()); - // consider AggressiveInstCombinePass at optlevel > 2 - FPM.addPass(InstCombinePass()); - FPM.addPass(SimplifyCFGPass(simplifyCFGOptions)); - } - - if (dump_native) - MPM.addPass(MultiVersioning()); - MPM.addPass(CPUFeatures()); - { - FunctionPassManager FPM; - FPM.addPass(SROA()); - FPM.addPass(InstSimplifyPass()); - FPM.addPass(JumpThreadingPass()); - FPM.addPass(CorrelatedValuePropagationPass()); - FPM.addPass(ReassociatePass()); - FPM.addPass(EarlyCSEPass()); - // Load forwarding above can expose allocations that aren't actually used - // remove those before optimizing loops. - FPM.addPass(AllocOptPass()); - { - LoopPassManager LPM; - LPM.addPass(LoopRotatePass()); - // TODO: Can we use MemorySSA? - FPM.addPass(llvm::createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/false)); - } - // moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1) - // LoopRotate strips metadata from terminator, so run LowerSIMD afterwards - MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); - } - MPM.addPass(LowerSIMDLoop()); // Annotate loop marked with "loopinfo" as LLVM parallel loop - { - FunctionPassManager FPM; - { - LoopPassManager LPM; - LPM.addPass(LICMPass()); - LPM.addPass(JuliaLICMPass()); - LPM.addPass(SimpleLoopUnswitchPass()); - LPM.addPass(LICMPass()); - LPM.addPass(JuliaLICMPass()); - // TODO: Can we use MemorySSA? 
- FPM.addPass(llvm::createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/false)); - } - FPM.addPass(IRCEPass()); // must combe before indvars - { - LoopPassManager LPM; - // Subsequent passes not stripping metadata from terminator - LPM.addPass(LoopInstSimplifyPass()); // NOTE: OldPM we used InstSimplifyPass - LPM.addPass(LoopIdiomRecognizePass()); - LPM.addPass(IndVarSimplifyPass()); - LPM.addPass(LoopDeletionPass()); - - // TODO: Can we use MemorySSA? - FPM.addPass(llvm::createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/false)); - } - FPM.addPass(LoopUnrollPass()); // Note: OldPM SimpleLoopUnrollPass - FPM.addPass(InstSimplifyPass()); // NOTE: Additionally for NewPM - // Run our own SROA on heap objects before LLVM's - FPM.addPass(AllocOptPass()); - // Re-run SROA after loop-unrolling (useful for small loops that operate, - // over the structure of an aggregate) - FPM.addPass(SROA()); - FPM.addPass(InstSimplifyPass()); - FPM.addPass(GVN()); - FPM.addPass(MemCpyOptPass()); - FPM.addPass(SCCPPass()); - // These next two passes must come before IRCE to eliminate the bounds check in #43308 - FPM.addPass(CorrelatedValuePropagationPass()); - FPM.addPass(DCEPass()); - FPM.addPass(IRCEPass()); // Must come between the two GVN passes - // Run instcombine after redundancy elimination to exploit opportunities - // opened up by them. - // This needs to be InstCombine instead of InstSimplify to allow - // loops over Union-typed arrays to vectorize. 
- FPM.addPass(InstCombinePass()); - FPM.addPass(JumpThreadingPass()); - if (opt_level >= 3) { - FPM.addPass(GVN()); // Must come after JumpThreading and before LoopVectorize - } - FPM.addPass(DSEPass()); - // More dead allocation (store) deletion before loop optimization - // consider removing this: - FPM.addPass(AllocOptPass()); - // see if all of the constant folding has exposed more loops - // to simplification and deletion - // this helps significantly with cleaning up iteration - { - LoopPassManager LPM; - LPM.addPass(LoopSimplifyCFGPass()); // NOTE: OldPM we used SimplifyCFGPass - LPM.addPass(LoopDeletionPass()); - LPM.addPass(LoopInstSimplifyPass()); // NOTE: OldPM we used InstCombinePass - FPM.addPass(llvm::createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/false)); - } - FPM.addPass(LoopVectorizePass()); - FPM.addPass(LoopLoadEliminationPass()); - // Cleanup after LV pass - FPM.addPass(InstCombinePass()); - FPM.addPass(SimplifyCFGPass( // Aggressive CFG simplification - SimplifyCFGOptions() - .forwardSwitchCondToPhi(true) - .convertSwitchToLookupTable(true) - .needCanonicalLoops(false) - .hoistCommonInsts(true) - // .sinkCommonInsts(true) // FIXME: Causes assertion in llvm-late-lowering - )); - FPM.addPass(SLPVectorizerPass()); - // might need this after LLVM 11: - // FPM->add(VectorCombinePass()); - FPM.addPass(ADCEPass()); - MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); - } - - if (lower_intrinsics) { - // LowerPTLS removes an indirect call. As a result, it is likely to trigger - // LLVM's devirtualization heuristics, which would result in the entire - // pass pipeline being re-exectuted. Prevent this by inserting a barrier. - // TODO: Barrier Pass? - // Maybe by creating a new pass manager and nesting it inside MPM? 
- // ModulePassManager NestedMPM; - // MPM.addPass(std::move(NestedMPM)); - { - FunctionPassManager FPM; - FPM.addPass(LowerExcHandlers()); - FPM.addPass(GCInvariantVerifierPass(false)); - MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); - } - // Needed **before** LateLowerGCFrame on LLVM < 12 - // due to bug in `CreateAlignmentAssumption`. - MPM.addPass(RemoveNI()); - { - FunctionPassManager FPM; - FPM.addPass(LateLowerGC()); - MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); - } - MPM.addPass(FinalLowerGCPass()); - // We need these two passes and the instcombine below - // after GC lowering to let LLVM do some constant propagation on the tags. - // and remove some unnecessary write barrier checks. - { - FunctionPassManager FPM; - FPM.addPass(GVN()); - FPM.addPass(SCCPPass()); - // Remove dead use of ptls - FPM.addPass(DCEPass()); - MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); - } - MPM.addPass(LowerPTLSPass(dump_native)); - // Clean up write barrier and ptls lowering - { - FunctionPassManager FPM; - FPM.addPass(InstCombinePass()); - FPM.addPass(SimplifyCFGPass()); - } - } - else { - MPM.addPass(RemoveNI()); - } - { - FunctionPassManager FPM; - FPM.addPass(CombineMulAdd()); - FPM.addPass(DivRemPairsPass()); -#if defined(_COMPILER_ASAN_ENABLED_) - FPM.addPass(AddressSanitizerPass()); -#endif -#if defined(_COMPILER_MSAN_ENABLED_) - FPM.addPass(MemorySanitizerPass()); -#endif -#if defined(_COMPILER_TSAN_ENABLED_) - FPM.addPass(ThreadSanitizerPass()); -#endif - MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); - } -} - -// TODO(vchuravy/maleadt): -// Since we are not using the PassBuilder fully and instead rolling our own, we are missing out on -// TargetMachine::registerPassBuilderCallbacks. We need to find a solution either in working with upstream -// or adapting PassBuilder (or subclassing it) to suite our needs. This is in particular important for -// BPF, NVPTX, and AMDGPU. 
- -void optimizeModule(Module &M, TargetMachine *TM, int opt_level, bool lower_intrinsics, bool dump_native) -{ - // llvm::PassBuilder pb(targetMachine->LLVM, llvm::PipelineTuningOptions(), llvm::None, &passInstrumentationCallbacks); - PassBuilder PB; - // Create the analysis managers. - LoopAnalysisManager LAM; - PB.registerLoopAnalyses(LAM); - - AAManager AA; - // TODO: Why are we only doing this for -O3? - if (opt_level >= 3) { - AA.registerFunctionAnalysis(); - } - if (opt_level >= 2) { - AA.registerFunctionAnalysis(); - AA.registerFunctionAnalysis(); - } - // TM->registerDefaultAliasAnalyses(AA); - - FunctionAnalysisManager FAM; - // Register the AA manager first so that our version is the one used. - FAM.registerPass([&] { return std::move(AA); }); - // Register our TargetLibraryInfoImpl. - FAM.registerPass([&] { return llvm::TargetIRAnalysis(TM->getTargetIRAnalysis()); }); - FAM.registerPass([&] { return llvm::TargetLibraryAnalysis(llvm::TargetLibraryInfoImpl(TM->getTargetTriple())); }); - - PB.registerFunctionAnalyses(FAM); - - CGSCCAnalysisManager CGAM; - PB.registerCGSCCAnalyses(CGAM); - - ModuleAnalysisManager MAM; - PB.registerModuleAnalyses(MAM); - - PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); - - ModulePassManager MPM; - addPipeline(MPM, opt_level, lower_intrinsics, dump_native); - - MPM.run(M, MAM); -} +void buildBasicPipeline(ModulePassManager &MPM, PassBuilder &PB, OptimizationLevel O, OptimizationOptions options); +void buildFullPipeline(ModulePassManager &MPM, PassBuilder &PB, OptimizationLevel O, OptimizationOptions options); // new pass manager plugin diff --git a/src/jitlayers.h b/src/jitlayers.h index c4a89f882beaa..e5c595968b5d9 100644 --- a/src/jitlayers.h +++ b/src/jitlayers.h @@ -13,6 +13,10 @@ #include #include +#include +#include +#include + #include #include "julia_assert.h" #include "debug-registry.h" @@ -97,6 +101,29 @@ struct jl_locked_stream { } }; +struct OptimizationOptions { + bool lower_intrinsics; + bool dump_native; + bool 
external_use; + + static constexpr OptimizationOptions defaults() { + return {true, false, false}; + } +}; + +struct NewPM { + std::unique_ptr TM; + StandardInstrumentations SI; + std::unique_ptr PIC; + PassBuilder PB; + ModulePassManager MPM; + OptimizationLevel O; + + NewPM(std::unique_ptr TM, OptimizationLevel O, OptimizationOptions options = OptimizationOptions::defaults()); + + void run(Module &M); +}; + typedef struct _jl_llvm_functions_t { std::string functionObject; // jlcall llvm Function name std::string specFunctionObject; // specialized llvm Function name diff --git a/src/llvm-julia-passes.inc b/src/llvm-julia-passes.inc index 9063780c7a66e..d50ac131e3ff3 100644 --- a/src/llvm-julia-passes.inc +++ b/src/llvm-julia-passes.inc @@ -1,11 +1,7 @@ //TODO clobber files when this changes in Makefile //Module passes - -#ifndef MODULE_PASS -#define MODULE_PASS(NAME, CREATE_PASS) -#endif - +#ifdef MODULE_PASS MODULE_PASS("CPUFeatures", CPUFeatures()) MODULE_PASS("RemoveNI", RemoveNI()) MODULE_PASS("LowerSIMDLoop", LowerSIMDLoop()) @@ -14,11 +10,10 @@ MODULE_PASS("JuliaMultiVersioning", MultiVersioning()) MODULE_PASS("RemoveJuliaAddrspaces", RemoveJuliaAddrspacesPass()) MODULE_PASS("RemoveAddrspaces", RemoveAddrspacesPass()) MODULE_PASS("LowerPTLSPass", LowerPTLSPass()) - -#ifndef FUNCTION_PASS -#define FUNCTION_PASS(NAME, CREATE_PASS) #endif + //Function passes +#ifdef FUNCTION_PASS FUNCTION_PASS("DemoteFloat16", DemoteFloat16()) FUNCTION_PASS("CombineMulAdd", CombineMulAdd()) FUNCTION_PASS("LateLowerGCFrame", LateLowerGC()) @@ -26,9 +21,9 @@ FUNCTION_PASS("AllocOpt", AllocOptPass()) FUNCTION_PASS("PropagateJuliaAddrspaces", PropagateJuliaAddrspacesPass()) FUNCTION_PASS("LowerExcHandlers", LowerExcHandlers()) FUNCTION_PASS("GCInvariantVerifier", GCInvariantVerifierPass()) - -#ifndef LOOP_PASS -#define LOOP_PASS(NAME, CREATE_PASS) #endif + //Loop passes -LOOP_PASS("JuliaLICM", JuliaLICMPass()) \ No newline at end of file +#ifdef LOOP_PASS +LOOP_PASS("JuliaLICM", 
JuliaLICMPass()) +#endif \ No newline at end of file diff --git a/src/pipeline.cpp b/src/pipeline.cpp new file mode 100644 index 0000000000000..a66ecbe785279 --- /dev/null +++ b/src/pipeline.cpp @@ -0,0 +1,584 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#include +#include "platform.h" + +//We don't care about uninitialized variables in LLVM; that's LLVM's problem +#ifdef _COMPILER_GCC_ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + +// analysis passes +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// NewPM needs to manually include all the pass headers +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _COMPILER_GCC_ +#pragma GCC diagnostic pop +#endif + +#include "passes.h" + +#include + +#include "julia.h" +#include "julia_internal.h" +#include "jitlayers.h" +#include "julia_assert.h" + +using namespace llvm; + +namespace { + //Shamelessly stolen from Clang's approach to sanitizers + //TODO do we want to enable other sanitizers? 
+ static void addSanitizerPasses(ModulePassManager &MPM, OptimizationLevel O) { + // Coverage sanitizer + // if (CodeGenOpts.hasSanitizeCoverage()) { + // auto SancovOpts = getSancovOptsFromCGOpts(CodeGenOpts); + // MPM.addPass(ModuleSanitizerCoveragePass( + // SancovOpts, CodeGenOpts.SanitizeCoverageAllowlistFiles, + // CodeGenOpts.SanitizeCoverageIgnorelistFiles)); + // } + + #ifdef _COMPILER_MSAN_ENABLED_ + auto MSanPass = [&](/*SanitizerMask Mask, */bool CompileKernel) { + // if (LangOpts.Sanitize.has(Mask)) { + // int TrackOrigins = CodeGenOpts.SanitizeMemoryTrackOrigins; + // bool Recover = CodeGenOpts.SanitizeRecover.has(Mask); + + // MemorySanitizerOptions options(TrackOrigins, Recover, CompileKernel, + // CodeGenOpts.SanitizeMemoryParamRetval); + MemorySanitizerOptions options{}; + #if JL_LLVM_VERSION >= 140000 + MPM.addPass(ModuleMemorySanitizerPass(options)); + #endif + FunctionPassManager FPM; + FPM.addPass(MemorySanitizerPass(options)); + if (O.getSpeedupLevel() != 0) { + // MemorySanitizer inserts complex instrumentation that mostly + // follows the logic of the original code, but operates on + // "shadow" values. It can benefit from re-running some + // general purpose optimization passes. + FPM.addPass(EarlyCSEPass()); + // TODO: Consider adding more passes like in + // addGeneralOptsForMemorySanitizer. EarlyCSEPass makes visible + // difference on size. It's not clear if the rest is still + // useful. InstCombinePass breaks + // compiler-rt/test/msan/select_origin.cpp.
+ } + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + // } + }; + MSanPass(/*SanitizerKind::Memory, */false); + // MSanPass(SanitizerKind::KernelMemory, true); + #endif + + #ifdef _COMPILER_TSAN_ENABLED_ + // if (LangOpts.Sanitize.has(SanitizerKind::Thread)) { + #if JL_LLVM_VERSION >= 140000 + MPM.addPass(ModuleThreadSanitizerPass()); + #endif + MPM.addPass(createModuleToFunctionPassAdaptor(ThreadSanitizerPass())); + // } + #endif + + + #ifdef _COMPILER_ASAN_ENABLED_ + auto ASanPass = [&](/*SanitizerMask Mask, */bool CompileKernel) { + // if (LangOpts.Sanitize.has(Mask)) { + // bool UseGlobalGC = asanUseGlobalsGC(TargetTriple, CodeGenOpts); + // bool UseOdrIndicator = CodeGenOpts.SanitizeAddressUseOdrIndicator; + // llvm::AsanDtorKind DestructorKind = + // CodeGenOpts.getSanitizeAddressDtor(); + // AddressSanitizerOptions Opts; + // Opts.CompileKernel = CompileKernel; + // Opts.Recover = CodeGenOpts.SanitizeRecover.has(Mask); + // Opts.UseAfterScope = CodeGenOpts.SanitizeAddressUseAfterScope; + // Opts.UseAfterReturn = CodeGenOpts.getSanitizeAddressUseAfterReturn(); + MPM.addPass(RequireAnalysisPass()); + // MPM.addPass(ModuleAddressSanitizerPass( + // Opts, UseGlobalGC, UseOdrIndicator, DestructorKind)); + MPM.addPass(ModuleAddressSanitizerPass()); + MPM.addPass(createModuleToFunctionPassAdaptor(AddressSanitizerPass())); + // } + }; + ASanPass(/*SanitizerKind::Address, */false); + // ASanPass(SanitizerKind::KernelAddress, true); + #endif + + // auto HWASanPass = [&](SanitizerMask Mask, bool CompileKernel) { + // if (LangOpts.Sanitize.has(Mask)) { + // bool Recover = CodeGenOpts.SanitizeRecover.has(Mask); + // MPM.addPass(HWAddressSanitizerPass( + // {CompileKernel, Recover, + // /*DisableOptimization=*/CodeGenOpts.OptimizationLevel == 0})); + // } + // }; + // HWASanPass(/*SanitizerKind::HWAddress, */false); + // // HWASanPass(SanitizerKind::KernelHWAddress, true); + + // if (LangOpts.Sanitize.has(SanitizerKind::DataFlow)) { + // 
MPM.addPass(DataFlowSanitizerPass(LangOpts.NoSanitizeFiles)); + // } + } + + void addVerificationPasses(ModulePassManager &MPM) { + FunctionPassManager FPM; + FPM.addPass(GCInvariantVerifierPass()); + FPM.addPass(VerifierPass()); + MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); + } + + auto basicSimplifyCFGOptions() { + return SimplifyCFGOptions() + .convertSwitchRangeToICmp(true) + .convertSwitchToLookupTable(true) + .forwardSwitchCondToPhi(true); + } + + auto aggressiveSimplifyCFGOptions() { + return SimplifyCFGOptions() + .convertSwitchRangeToICmp(true) + .convertSwitchToLookupTable(true) + .forwardSwitchCondToPhi(true) + //These mess with loop rotation, so only do them after that + .hoistCommonInsts(true) + // Causes an SRET assertion error in late-gc-lowering + // .sinkCommonInsts(true) + ; + } + + // TODO(vchuravy/maleadt): + // Since we are not using the PassBuilder fully and instead rolling our own, we are missing out on + // TargetMachine::registerPassBuilderCallbacks. We need to find a solution either in working with upstream + // or adapting PassBuilder (or subclassing it) to suit our needs. This is in particular important for + // BPF, NVPTX, and AMDGPU.
+ //TODO implement these once LLVM exposes + //the PassBuilder extension point callbacks + void invokePipelineStartCallbacks(ModulePassManager &MPM, PassBuilder &PB, OptimizationLevel O) {} + void invokePeepholeEPCallbacks(FunctionPassManager &MPM, PassBuilder &PB, OptimizationLevel O) {} + void invokeEarlySimplificationCallbacks(ModulePassManager &MPM, PassBuilder &PB, OptimizationLevel O) {} + void invokeCGSCCCallbacks(CGSCCPassManager &MPM, PassBuilder &PB, OptimizationLevel O) {} + void invokeOptimizerEarlyCallbacks(ModulePassManager &MPM, PassBuilder &PB, OptimizationLevel O) {} + void invokeLateLoopOptimizationCallbacks(LoopPassManager &MPM, PassBuilder &PB, OptimizationLevel O) {} + void invokeLoopOptimizerEndCallbacks(LoopPassManager &MPM, PassBuilder &PB, OptimizationLevel O) {} + void invokeScalarOptimizerCallbacks(FunctionPassManager &MPM, PassBuilder &PB, OptimizationLevel O) {} + void invokeVectorizerCallbacks(FunctionPassManager &MPM, PassBuilder &PB, OptimizationLevel O) {} + void invokeOptimizerLastCallbacks(ModulePassManager &MPM, PassBuilder &PB, OptimizationLevel O) {} +} + +//The actual pipelines +//TODO Things we might want to consider: +//? annotation2metadata pass +//? force function attributes pass +//? annotation remarks pass +//? infer function attributes pass +//? lower expect intrinsic pass +//? warn missed transformations pass +//* For vectorization +//? loop unroll/jam after loop vectorization +//? optimization remarks pass +//? cse/cvp/instcombine/bdce/sccp/licm/unswitch after loop vectorization ( +// cleanup as much as possible before trying to slp vectorize) +//? vectorcombine pass +//* For optimization +//? float2int pass +//? lower constant intrinsics pass +//? loop sink pass +//? 
hot-cold splitting pass + +//Use for O1 and below +void buildBasicPipeline(ModulePassManager &MPM, PassBuilder &PB, OptimizationLevel O, OptimizationOptions options) { +// #ifdef JL_DEBUG_BUILD + addVerificationPasses(MPM); +// #endif + invokePipelineStartCallbacks(MPM, PB, O); + MPM.addPass(ConstantMergePass()); + if (!options.dump_native) { + MPM.addPass(CPUFeatures()); + if (O.getSpeedupLevel() > 0) { + MPM.addPass(createModuleToFunctionPassAdaptor(InstSimplifyPass())); + } + } + { + FunctionPassManager FPM; + FPM.addPass(SimplifyCFGPass(basicSimplifyCFGOptions())); + if (O.getSpeedupLevel() > 0) { + FPM.addPass(SROAPass()); + FPM.addPass(InstCombinePass()); + FPM.addPass(EarlyCSEPass()); + } + FPM.addPass(MemCpyOptPass()); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + invokeEarlySimplificationCallbacks(MPM, PB, O); + MPM.addPass(AlwaysInlinerPass()); + { + CGSCCPassManager CGPM; + invokeCGSCCCallbacks(CGPM, PB, O); + MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); + } + invokeOptimizerEarlyCallbacks(MPM, PB, O); + MPM.addPass(LowerSIMDLoop()); + { + FunctionPassManager FPM; + { + LoopPassManager LPM; + invokeLateLoopOptimizationCallbacks(LPM, PB, O); + invokeLoopOptimizerEndCallbacks(LPM, PB, O); + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM))); + } + invokeScalarOptimizerCallbacks(FPM, PB, O); + invokeVectorizerCallbacks(FPM, PB, O); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + if (options.lower_intrinsics) { + //TODO no barrier pass? 
+ { + FunctionPassManager FPM; + FPM.addPass(LowerExcHandlers()); + FPM.addPass(GCInvariantVerifierPass(false)); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + MPM.addPass(RemoveNI()); + MPM.addPass(createModuleToFunctionPassAdaptor(LateLowerGC())); + MPM.addPass(FinalLowerGCPass()); + MPM.addPass(LowerPTLSPass(options.dump_native)); + } else { + MPM.addPass(RemoveNI()); + } + MPM.addPass(LowerSIMDLoop()); // TODO why do we do this twice + if (options.dump_native) { + MPM.addPass(MultiVersioning(options.external_use)); + MPM.addPass(CPUFeatures()); + if (O.getSpeedupLevel() > 0) { + FunctionPassManager FPM; + FPM.addPass(InstSimplifyPass()); + FPM.addPass(SimplifyCFGPass(basicSimplifyCFGOptions())); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + } + invokeOptimizerLastCallbacks(MPM, PB, O); + addSanitizerPasses(MPM, O); + MPM.addPass(createModuleToFunctionPassAdaptor(DemoteFloat16())); +} + +//Use for O2 and above +void buildFullPipeline(ModulePassManager &MPM, PassBuilder &PB, OptimizationLevel O, OptimizationOptions options) { +// #ifdef JL_DEBUG_BUILD + addVerificationPasses(MPM); +// #endif + invokePipelineStartCallbacks(MPM, PB, O); + MPM.addPass(ConstantMergePass()); + { + FunctionPassManager FPM; + FPM.addPass(PropagateJuliaAddrspacesPass()); + //TODO consider not using even basic simplification + //options here, and adding a run of CVP to take advantage + //of the unsimplified codegen information (e.g. 
known + //zeros or ones) + FPM.addPass(SimplifyCFGPass(basicSimplifyCFGOptions())); + FPM.addPass(DCEPass()); + FPM.addPass(SROAPass()); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + invokeEarlySimplificationCallbacks(MPM, PB, O); + MPM.addPass(AlwaysInlinerPass()); + invokeOptimizerEarlyCallbacks(MPM, PB, O); + { + CGSCCPassManager CGPM; + invokeCGSCCCallbacks(CGPM, PB, O); + { + FunctionPassManager FPM; + FPM.addPass(AllocOptPass()); + FPM.addPass(InstCombinePass()); + FPM.addPass(SimplifyCFGPass(basicSimplifyCFGOptions())); + CGPM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM))); + } + MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); + } + if (options.dump_native) { + MPM.addPass(MultiVersioning(options.external_use)); + } + MPM.addPass(CPUFeatures()); + { + FunctionPassManager FPM; + FPM.addPass(SROAPass()); + FPM.addPass(InstSimplifyPass()); + FPM.addPass(JumpThreadingPass()); + FPM.addPass(CorrelatedValuePropagationPass()); + FPM.addPass(ReassociatePass()); + FPM.addPass(EarlyCSEPass()); + FPM.addPass(AllocOptPass()); + invokePeepholeEPCallbacks(FPM, PB, O); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + MPM.addPass(LowerSIMDLoop()); + { + FunctionPassManager FPM; + { + LoopPassManager LPM1, LPM2; + LPM1.addPass(LoopRotatePass()); + invokeLateLoopOptimizationCallbacks(LPM1, PB, O); + //We don't know if the loop callbacks support MSSA + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), /*UseMemorySSA = */false)); + LPM2.addPass(LICMPass()); + LPM2.addPass(JuliaLICMPass()); + LPM2.addPass(SimpleLoopUnswitchPass()); + LPM2.addPass(LICMPass()); + LPM2.addPass(JuliaLICMPass()); + //LICM needs MemorySSA now, so we must use it + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), /*UseMemorySSA = */true)); + } + FPM.addPass(IRCEPass()); + { + LoopPassManager LPM; + LPM.addPass(LoopInstSimplifyPass()); + LPM.addPass(LoopIdiomRecognizePass()); + 
LPM.addPass(IndVarSimplifyPass()); + LPM.addPass(LoopDeletionPass()); + invokeLoopOptimizerEndCallbacks(LPM, PB, O); + //We don't know if the loop end callbacks support MSSA + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA = */false)); + } + FPM.addPass(LoopUnrollPass()); + FPM.addPass(AllocOptPass()); + FPM.addPass(SROAPass()); + FPM.addPass(InstSimplifyPass()); + FPM.addPass(GVNPass()); + FPM.addPass(MemCpyOptPass()); + FPM.addPass(SCCPPass()); + FPM.addPass(CorrelatedValuePropagationPass()); + FPM.addPass(DCEPass()); + FPM.addPass(IRCEPass()); + FPM.addPass(InstCombinePass()); + FPM.addPass(JumpThreadingPass()); + if (O.getSpeedupLevel() >= 3) { + FPM.addPass(GVNPass()); + } + FPM.addPass(DSEPass()); + invokePeepholeEPCallbacks(FPM, PB, O); + FPM.addPass(SimplifyCFGPass(aggressiveSimplifyCFGOptions())); + FPM.addPass(AllocOptPass()); + { + LoopPassManager LPM; + LPM.addPass(LoopDeletionPass()); + LPM.addPass(LoopInstSimplifyPass()); + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM))); + } + invokeScalarOptimizerCallbacks(FPM, PB, O); + //TODO look into loop vectorize options + FPM.addPass(LoopVectorizePass()); + FPM.addPass(LoopLoadEliminationPass()); + FPM.addPass(InstCombinePass()); + FPM.addPass(SimplifyCFGPass(aggressiveSimplifyCFGOptions())); + FPM.addPass(SLPVectorizerPass()); + invokeVectorizerCallbacks(FPM, PB, O); + FPM.addPass(ADCEPass()); + //TODO add BDCEPass here? + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + if (options.lower_intrinsics) { + //TODO barrier pass? + { + FunctionPassManager FPM; + FPM.addPass(LowerExcHandlers()); + FPM.addPass(GCInvariantVerifierPass(false)); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + // Needed **before** LateLowerGCFrame on LLVM < 12 + // due to bug in `CreateAlignmentAssumption`. 
+ MPM.addPass(RemoveNI()); + MPM.addPass(createModuleToFunctionPassAdaptor(LateLowerGC())); + MPM.addPass(FinalLowerGCPass()); + { + FunctionPassManager FPM; + FPM.addPass(GVNPass()); + FPM.addPass(SCCPPass()); + FPM.addPass(DCEPass()); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + MPM.addPass(LowerPTLSPass(options.dump_native)); + { + FunctionPassManager FPM; + FPM.addPass(InstCombinePass()); + FPM.addPass(SimplifyCFGPass(aggressiveSimplifyCFGOptions())); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + } else { + MPM.addPass(RemoveNI()); + } + { + FunctionPassManager FPM; + FPM.addPass(CombineMulAdd()); + FPM.addPass(DivRemPairsPass()); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + invokeOptimizerLastCallbacks(MPM, PB, O); + addSanitizerPasses(MPM, O); + { + FunctionPassManager FPM; + FPM.addPass(DemoteFloat16()); + FPM.addPass(GVNPass()); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } +} + +namespace { + auto createPIC(StandardInstrumentations &SI) { + auto PIC = std::make_unique(); +//Borrowed from LLVM PassBuilder.cpp:386 +#define MODULE_PASS(NAME, CREATE_PASS) \ +PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME); +#define MODULE_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \ +PIC->addClassToPassName(CLASS, NAME); +#define MODULE_ANALYSIS(NAME, CREATE_PASS) \ +PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME); +#define FUNCTION_PASS(NAME, CREATE_PASS) \ +PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME); +#define FUNCTION_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \ +PIC->addClassToPassName(CLASS, NAME); +#define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \ +PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME); +#define LOOPNEST_PASS(NAME, CREATE_PASS) \ +PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME); +#define LOOP_PASS(NAME, CREATE_PASS) \ 
+PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME); +#define LOOP_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \ +PIC->addClassToPassName(CLASS, NAME); +#define LOOP_ANALYSIS(NAME, CREATE_PASS) \ +PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME); +#define CGSCC_PASS(NAME, CREATE_PASS) \ +PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME); +#define CGSCC_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \ +PIC->addClassToPassName(CLASS, NAME); +#define CGSCC_ANALYSIS(NAME, CREATE_PASS) \ +PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME); + +#include + +#undef MODULE_PASS +#undef MODULE_PASS_WITH_PARAMS +#undef MODULE_ANALYSIS +#undef FUNCTION_PASS +#undef FUNCTION_PASS_WITH_PARAMS +#undef FUNCTION_ANALYSIS +#undef LOOPNEST_PASS +#undef LOOP_PASS +#undef LOOP_PASS_WITH_PARAMS +#undef LOOP_ANALYSIS +#undef CGSCC_PASS +#undef CGSCC_PASS_WITH_PARAMS +#undef CGSCC_ANALYSIS + + SI.registerCallbacks(*PIC); + return PIC; + } + + FunctionAnalysisManager createFAM(OptimizationLevel O, TargetIRAnalysis analysis, const Triple &triple) { + + FunctionAnalysisManager FAM; + // Register the AA manager first so that our version is the one used. + FAM.registerPass([&] { + AAManager AA; + // TODO: Why are we only doing this for -O3? + if (O.getSpeedupLevel() >= 3) { + AA.registerFunctionAnalysis(); + } + if (O.getSpeedupLevel() >= 2) { + AA.registerFunctionAnalysis(); + AA.registerFunctionAnalysis(); + } + // TM->registerDefaultAliasAnalyses(AA); + return AA; + }); + // Register our TargetLibraryInfoImpl. 
+ FAM.registerPass([&] { return llvm::TargetIRAnalysis(analysis); }); + FAM.registerPass([&] { return llvm::TargetLibraryAnalysis(llvm::TargetLibraryInfoImpl(triple)); }); + return FAM; + } + + ModulePassManager createMPM(PassBuilder &PB, OptimizationLevel O, OptimizationOptions options) { + ModulePassManager MPM; + if (O.getSpeedupLevel() < 2) + buildBasicPipeline(MPM, PB, O, options); + else + buildFullPipeline(MPM, PB, O, options); + return MPM; + } +} + +NewPM::NewPM(std::unique_ptr TM, OptimizationLevel O, OptimizationOptions options) : + TM(std::move(TM)), SI(false), PIC(createPIC(SI)), + PB(this->TM.get(), PipelineTuningOptions(), None, PIC.get()), + MPM(createMPM(PB, O, options)), O(O) {} + +void NewPM::run(Module &M) { + //We must recreate the analysis managers every time + //so that analyses from previous runs of the pass manager + //do not hang around for the next run + LoopAnalysisManager LAM; + FunctionAnalysisManager FAM(createFAM(O, TM->getTargetIRAnalysis(), TM->getTargetTriple())); + CGSCCAnalysisManager CGAM; + ModuleAnalysisManager MAM; + PB.registerLoopAnalyses(LAM); + PB.registerFunctionAnalyses(FAM); + PB.registerCGSCCAnalyses(CGAM); + PB.registerModuleAnalyses(MAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + MPM.run(M, MAM); +} \ No newline at end of file From 9ebd5ce467e4e45870c29dc1a92ce19bc5c42f8d Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi Date: Thu, 21 Jul 2022 01:33:17 -0400 Subject: [PATCH 6/7] Use NewPM almost everywhere --- src/Makefile | 1 + src/aotcompile.cpp | 60 +++++++++++++++++++++++++++++---------- src/disasm.cpp | 8 ++++++ src/jitlayers.cpp | 29 +++++++++++++++---- src/jitlayers.h | 14 +++++++++ src/llvm-julia-passes.inc | 2 +- src/pipeline.cpp | 42 ++++++++++++++++++++------- 7 files changed, 125 insertions(+), 31 deletions(-) diff --git a/src/Makefile b/src/Makefile index e10c6f23808c2..4f8b496cf84a0 100644 --- a/src/Makefile +++ b/src/Makefile @@ -312,6 +312,7 @@ $(BUILDDIR)/llvm-pass-helpers.o 
$(BUILDDIR)/llvm-pass-helpers.dbg.obj: $(SRCDIR) $(BUILDDIR)/llvm-propagate-addrspaces.o $(BUILDDIR)/llvm-propagate-addrspaces.dbg.obj: $(SRCDIR)/codegen_shared.h $(BUILDDIR)/llvm-remove-addrspaces.o $(BUILDDIR)/llvm-remove-addrspaces.dbg.obj: $(SRCDIR)/codegen_shared.h $(BUILDDIR)/llvm-ptls.o $(BUILDDIR)/llvm-ptls.dbg.obj: $(SRCDIR)/codegen_shared.h +$(BUILDDIR)/pipeline.o $(BUILDDIR)/pipeline.dbg.obj: $(SRCDIR)/jitlayers.h $(BUILDDIR)/processor.o $(BUILDDIR)/processor.dbg.obj: $(addprefix $(SRCDIR)/,processor_*.cpp processor.h features_*.h) $(BUILDDIR)/signal-handling.o $(BUILDDIR)/signal-handling.dbg.obj: $(addprefix $(SRCDIR)/,signals-*.c) $(BUILDDIR)/staticdata.o $(BUILDDIR)/staticdata.dbg.obj: $(SRCDIR)/processor.h $(SRCDIR)/builtin_proto.h diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index a2c2d90672f0a..3dfdeea1e6fd9 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -553,29 +553,39 @@ void jl_dump_native_impl(void *native_code, std::vector unopt_bc_Archive; std::vector outputs; +#ifndef JL_USE_NEW_PM legacy::PassManager preopt, postopt; +#else + PassBuilder PB; + AnalysisManagers AM{*TM, PB, getOptLevel(jl_options.opt_level)}; + ModulePassManager preopt, postopt; +#endif + legacy::PassManager emitter; - if (unopt_bc_fname) + if (unopt_bc_fname) { +#ifndef JL_USE_NEW_PM preopt.add(createBitcodeWriterPass(unopt_bc_OS)); +#else + preopt.addPass(BitcodeWriterPass(unopt_bc_OS)); +#endif + } - //Is this necessary for TM? - // addTargetPasses(&postopt, TM->getTargetTriple(), TM->getTargetIRAnalysis()); - if (bc_fname) + if (bc_fname) { +#ifndef JL_USE_NEW_PM postopt.add(createBitcodeWriterPass(bc_OS)); +#else + postopt.addPass(BitcodeWriterPass(bc_OS)); +#endif + } + //Is this necessary for TM? 
+ addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis()); if (obj_fname) - if (TM->addPassesToEmitFile(postopt, obj_OS, nullptr, CGFT_ObjectFile, false)) + if (TM->addPassesToEmitFile(emitter, obj_OS, nullptr, CGFT_ObjectFile, false)) jl_safe_printf("ERROR: target does not support generation of object files\n"); if (asm_fname) - if (TM->addPassesToEmitFile(postopt, asm_OS, nullptr, CGFT_AssemblyFile, false)) + if (TM->addPassesToEmitFile(emitter, asm_OS, nullptr, CGFT_AssemblyFile, false)) jl_safe_printf("ERROR: target does not support generation of object files\n"); - legacy::PassManager optimizer; - if (bc_fname || obj_fname || asm_fname) { - addTargetPasses(&optimizer, TM->getTargetTriple(), TM->getTargetIRAnalysis()); - addOptimizationPasses(&optimizer, jl_options.opt_level, true, true); - addMachinePasses(&optimizer, jl_options.opt_level); - } - // Reset the target triple to make sure it matches the new target machine auto dataM = data->M.getModuleUnlocked(); dataM->setTargetTriple(TM->getTargetTriple().str()); @@ -587,6 +597,17 @@ void jl_dump_native_impl(void *native_code, T_size = Type::getInt32Ty(Context); Type *T_psize = T_size->getPointerTo(); +#ifndef JL_USE_NEW_PM + legacy::PassManager optimizer; + if (bc_fname || obj_fname || asm_fname) { + addTargetPasses(&optimizer, TM->getTargetTriple(), TM->getTargetIRAnalysis()); + addOptimizationPasses(&optimizer, jl_options.opt_level, true, true); + addMachinePasses(&optimizer, jl_options.opt_level); + } +#else + NewPM optimizer{std::move(TM), getOptLevel(jl_options.opt_level), {true, true}}; +#endif + // add metadata information if (imaging_default()) { emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_sysimg_gvars", T_psize); @@ -605,7 +626,11 @@ void jl_dump_native_impl(void *native_code, // do the actual work auto add_output = [&] (Module &M, StringRef unopt_bc_Name, StringRef bc_Name, StringRef obj_Name, StringRef asm_Name) { - preopt.run(M); + preopt.run(M +#ifdef JL_USE_NEW_PM + , 
AM.MAM +#endif + ); optimizer.run(M); // We would like to emit an alias or an weakref alias to redirect these symbols @@ -623,7 +648,12 @@ void jl_dump_native_impl(void *native_code, injectCRTAlias(M, "__truncdfhf2", "julia__truncdfhf2", FunctionType::get(Type::getHalfTy(Context), { Type::getDoubleTy(Context) }, false)); - postopt.run(M); + postopt.run(M +#ifdef JL_USE_NEW_PM + , AM.MAM +#endif + ); + emitter.run(M); if (unopt_bc_fname) emit_result(unopt_bc_Archive, unopt_bc_Buffer, unopt_bc_Name, outputs); diff --git a/src/disasm.cpp b/src/disasm.cpp index 838934a6c5893..64c88bda54b4a 100644 --- a/src/disasm.cpp +++ b/src/disasm.cpp @@ -482,9 +482,17 @@ void jl_strip_llvm_debug(Module *m) void jl_strip_llvm_addrspaces(Module *m) { +#ifndef JL_USE_NEW_PM legacy::PassManager PM; PM.add(createRemoveJuliaAddrspacesPass()); PM.run(*m); +#else + PassBuilder PB; + AnalysisManagers AM{PB}; + ModulePassManager PM; + PM.addPass(RemoveJuliaAddrspacesPass()); + PM.run(*m, AM.MAM); +#endif } // print an llvm IR acquired from jl_get_llvmf diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 009b969201164..bdd1f5c1f879c 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -874,7 +874,11 @@ namespace { namespace { +#ifndef JL_USE_NEW_PM typedef legacy::PassManager PassManager; +#else + typedef NewPM PassManager; +#endif orc::JITTargetMachineBuilder createJTMBFromTM(TargetMachine &TM, int optlevel) { return orc::JITTargetMachineBuilder(TM.getTargetTriple()) @@ -896,21 +900,24 @@ namespace { } }; +#ifndef JL_USE_NEW_PM struct PMCreator { std::unique_ptr TM; int optlevel; PMCreator(TargetMachine &TM, int optlevel) : TM(cantFail(createJTMBFromTM(TM, optlevel).createTargetMachine())), optlevel(optlevel) {} PMCreator(const PMCreator &other) : PMCreator(*other.TM, other.optlevel) {} - PMCreator(PMCreator &&other) : TM(std::move(other.TM)), optlevel(other.optlevel) {} + PMCreator &operator=(const PMCreator &other) { + TM = cantFail(createJTMBFromTM(*other.TM, 
other.optlevel).createTargetMachine()); + optlevel = other.optlevel; + return *this; + } + PMCreator(PMCreator &&other) = default; + PMCreator &operator=(PMCreator &&other) = default; friend void swap(PMCreator &self, PMCreator &other) { using std::swap; swap(self.TM, other.TM); swap(self.optlevel, other.optlevel); } - PMCreator &operator=(PMCreator other) { - swap(*this, other); - return *this; - } std::unique_ptr operator()() { auto PM = std::make_unique(); addTargetPasses(PM.get(), TM->getTargetTriple(), TM->getTargetIRAnalysis()); @@ -920,6 +927,18 @@ namespace { } }; +#else + + struct PMCreator { + orc::JITTargetMachineBuilder JTMB; + OptimizationLevel O; + PMCreator(TargetMachine &TM, int optlevel) : JTMB(createJTMBFromTM(TM, optlevel)), O(getOptLevel(optlevel)) {} + std::unique_ptr operator()() { + return std::make_unique(cantFail(JTMB.createTargetMachine()), O); + } + }; +#endif + struct OptimizerT { OptimizerT(TargetMachine &TM, int optlevel) : optlevel(optlevel), PMs(PMCreator(TM, optlevel)) {} diff --git a/src/jitlayers.h b/src/jitlayers.h index e5c595968b5d9..8ec9ccbb9f6f0 100644 --- a/src/jitlayers.h +++ b/src/jitlayers.h @@ -124,6 +124,20 @@ struct NewPM { void run(Module &M); }; +struct AnalysisManagers { + LoopAnalysisManager LAM; + FunctionAnalysisManager FAM; + CGSCCAnalysisManager CGAM; + ModuleAnalysisManager MAM; + + AnalysisManagers(PassBuilder &PB); + AnalysisManagers(TargetMachine &TM, PassBuilder &PB, OptimizationLevel O); +}; + +OptimizationLevel getOptLevel(int optlevel); + +#define JL_USE_NEW_PM + typedef struct _jl_llvm_functions_t { std::string functionObject; // jlcall llvm Function name std::string specFunctionObject; // specialized llvm Function name diff --git a/src/llvm-julia-passes.inc b/src/llvm-julia-passes.inc index d50ac131e3ff3..99dfc437d30e2 100644 --- a/src/llvm-julia-passes.inc +++ b/src/llvm-julia-passes.inc @@ -26,4 +26,4 @@ FUNCTION_PASS("GCInvariantVerifier", GCInvariantVerifierPass()) //Loop passes #ifdef LOOP_PASS 
LOOP_PASS("JuliaLICM", JuliaLICMPass()) -#endif \ No newline at end of file +#endif diff --git a/src/pipeline.cpp b/src/pipeline.cpp index a66ecbe785279..377773a57b8c5 100644 --- a/src/pipeline.cpp +++ b/src/pipeline.cpp @@ -567,18 +567,40 @@ NewPM::NewPM(std::unique_ptr TM, OptimizationLevel O, Optimizatio PB(this->TM.get(), PipelineTuningOptions(), None, PIC.get()), MPM(createMPM(PB, O, options)), O(O) {} -void NewPM::run(Module &M) { - //We must recreate the analysis managers every time - //so that analyses from previous runs of the pass manager - //do not hang around for the next run - LoopAnalysisManager LAM; - FunctionAnalysisManager FAM(createFAM(O, TM->getTargetIRAnalysis(), TM->getTargetTriple())); - CGSCCAnalysisManager CGAM; - ModuleAnalysisManager MAM; +AnalysisManagers::AnalysisManagers(TargetMachine &TM, PassBuilder &PB, OptimizationLevel O) : LAM(), FAM(createFAM(O, TM.getTargetIRAnalysis(), TM.getTargetTriple())), CGAM(), MAM() { + PB.registerLoopAnalyses(LAM); + PB.registerFunctionAnalyses(FAM); + PB.registerCGSCCAnalyses(CGAM); + PB.registerModuleAnalyses(MAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); +} + +AnalysisManagers::AnalysisManagers(PassBuilder &PB) : LAM(), FAM(), CGAM(), MAM() { PB.registerLoopAnalyses(LAM); PB.registerFunctionAnalyses(FAM); PB.registerCGSCCAnalyses(CGAM); PB.registerModuleAnalyses(MAM); PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); - MPM.run(M, MAM); -} \ No newline at end of file +} + +void NewPM::run(Module &M) { + //We must recreate the analysis managers every time + //so that analyses from previous runs of the pass manager + //do not hang around for the next run + AnalysisManagers AM{*TM, PB, O}; + MPM.run(M, AM.MAM); +} + +OptimizationLevel getOptLevel(int optlevel) { + switch (std::min(std::max(optlevel, 0), 3)) { + case 0: + return OptimizationLevel::O0; + case 1: + return OptimizationLevel::O1; + case 2: + return OptimizationLevel::O2; + case 3: + return OptimizationLevel::O3; + } + 
llvm_unreachable("cannot get here!"); +} From 06b2a73544dd120a572aa1af9aa7c0ae4be971bd Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi Date: Thu, 21 Jul 2022 02:00:28 -0400 Subject: [PATCH 7/7] Fix ASAN/MSAN builds --- src/Makefile | 2 +- src/codegen.cpp | 4 ++++ src/jitlayers.h | 7 +++++-- src/pipeline.cpp | 18 +++++++----------- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/Makefile b/src/Makefile index 4f8b496cf84a0..f13f7f2005b4c 100644 --- a/src/Makefile +++ b/src/Makefile @@ -289,7 +289,7 @@ $(BUILDDIR)/builtins.o $(BUILDDIR)/builtins.dbg.obj: $(SRCDIR)/iddict.c $(SRCDIR $(BUILDDIR)/codegen.o $(BUILDDIR)/codegen.dbg.obj: $(addprefix $(SRCDIR)/,\ intrinsics.cpp jitlayers.h debug-registry.h intrinsics.h codegen_shared.h cgutils.cpp ccall.cpp abi_*.cpp processor.h builtin_proto.h) $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(addprefix $(SRCDIR)/,debuginfo.h processor.h jitlayers.h debug-registry.h) -$(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h +$(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h $(SRCDIR)/jitlayers.h $(BUILDDIR)/dump.o $(BUILDDIR)/dump.dbg.obj: $(addprefix $(SRCDIR)/,common_symbols1.inc common_symbols2.inc builtin_proto.h serialize.h) $(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc.h $(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc.h diff --git a/src/codegen.cpp b/src/codegen.cpp index 8ac0cf6105601..2eb21d0de0bc4 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -8597,6 +8597,9 @@ extern "C" void jl_init_llvm(void) if (jl_using_gdb_jitevents) jl_ExecutionEngine->enableJITDebuggingSupport(); +#if defined(_COMPILER_ASAN_ENABLED_) && defined(JL_USE_NEW_PM) +#warning "JIT profiling support (JL_USE_*_JITEVENTS) not yet available for ASAN with NewPM (requires JITLink)" +#else #if defined(JL_USE_INTEL_JITEVENTS) || \ defined(JL_USE_OPROFILE_JITEVENTS) || \ defined(JL_USE_PERF_JITEVENTS) @@ 
-8638,6 +8641,7 @@ extern "C" void jl_init_llvm(void) jl_ExecutionEngine->RegisterJITEventListener(JITEventListener::createPerfJITEventListener()); #endif #endif +#endif #endif cl::PrintOptionValues(); diff --git a/src/jitlayers.h b/src/jitlayers.h index 8ec9ccbb9f6f0..5191eb99cb697 100644 --- a/src/jitlayers.h +++ b/src/jitlayers.h @@ -20,6 +20,7 @@ #include #include "julia_assert.h" #include "debug-registry.h" +#include "platform.h" #include #include @@ -40,7 +41,10 @@ // and feature support (e.g. Windows, JITEventListeners for various profilers, // etc.). Thus, we currently only use JITLink where absolutely required, that is, // for Mac/aarch64. -#if defined(_OS_DARWIN_) && defined(_CPU_AARCH64_) + +#define JL_USE_NEW_PM + +#if defined(_OS_DARWIN_) && defined(_CPU_AARCH64_) || defined(_COMPILER_ASAN_ENABLED_) && defined(JL_USE_NEW_PM) && defined(_OS_LINUX_) # if JL_LLVM_VERSION < 130000 # pragma message("On aarch64-darwin, LLVM version >= 13 is required for JITLink; fallback suffers from occasional segfaults") # endif @@ -136,7 +140,6 @@ struct AnalysisManagers { OptimizationLevel getOptLevel(int optlevel); -#define JL_USE_NEW_PM typedef struct _jl_llvm_functions_t { std::string functionObject; // jlcall llvm Function name diff --git a/src/pipeline.cpp b/src/pipeline.cpp index 377773a57b8c5..3960981bf1404 100644 --- a/src/pipeline.cpp +++ b/src/pipeline.cpp @@ -96,19 +96,17 @@ namespace { #ifdef _COMPILER_MSAN_ENABLED_ auto MSanPass = [&](/*SanitizerMask Mask, */bool CompileKernel) { - // if (LangOpts.Sanitize.has(Mask)) { + // if (LangOpts.Sanitize.has(Mask)) { // int TrackOrigins = CodeGenOpts.SanitizeMemoryTrackOrigins; // bool Recover = CodeGenOpts.SanitizeRecover.has(Mask); // MemorySanitizerOptions options(TrackOrigins, Recover, CompileKernel, - // CodeGenOpts.SanitizeMemoryParamRetval); - MemorySanitizerOptions options{}; - #if JL_LLVM_VERSION >= 140000 + // CodeGenOpts.SanitizeMemoryParamRetval); + MemorySanitizerOptions options; 
MPM.addPass(ModuleMemorySanitizerPass(options)); - #endif FunctionPassManager FPM; FPM.addPass(MemorySanitizerPass(options)); - if (O.getSpeedupLevel() != 0) { + if (O != OptimizationLevel::O0) { // MemorySanitizer inserts complex instrumentation that mostly // follows the logic of the original code, but operates on // "shadow" values. It can benefit from re-running some @@ -121,7 +119,7 @@ namespace { // compiler-rt/test/msan/select_origin.cpp. } MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); - // } + // } }; MSanPass(/*SanitizerKind::Memory, */false); // MSanPass(SanitizerKind::KernelMemory, true); @@ -129,9 +127,7 @@ namespace { #ifdef _COMPILER_TSAN_ENABLED_ // if (LangOpts.Sanitize.has(SanitizerKind::Thread)) { - #if JL_LLVM_VERSION >= 140000 MPM.addPass(ModuleThreadSanitizerPass()); - #endif MPM.addPass(createModuleToFunctionPassAdaptor(ThreadSanitizerPass())); // } #endif @@ -152,8 +148,8 @@ namespace { MPM.addPass(RequireAnalysisPass()); // MPM.addPass(ModuleAddressSanitizerPass( // Opts, UseGlobalGC, UseOdrIndicator, DestructorKind)); - MPM.addPass(ModuleAddressSanitizerPass()); - MPM.addPass(createModuleToFunctionPassAdaptor(AddressSanitizerPass())); + //Let's assume the defaults are actually fine for our purposes + MPM.addPass(ModuleAddressSanitizerPass(AddressSanitizerOptions())); // } }; ASanPass(/*SanitizerKind::Address, */false);