[CGData][ThinLTO] Global Outlining with Two-CodeGen Rounds #90933
Conversation
885409a to 0f0755b
@llvm/pr-subscribers-llvm-support @llvm/pr-subscribers-lto

Author: Kyungwoo Lee (kyulee-com)

Changes

This feature is enabled by `-codegen-data-thinlto-two-rounds`, which effectively runs the `-codegen-data-generate` and `-codegen-data-use` in two rounds to enable global outlining with ThinLTO.

Depends on #90934.

Patch is 26.07 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/90933.diff

10 Files Affected:
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 7fa69420298160..a1909d45b4d944 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1286,10 +1286,10 @@ static void runThinLTOBackend(
Conf.CGFileType = getCodeGenFileType(Action);
break;
}
- if (Error E =
- thinBackend(Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
- ModuleToDefinedGVSummaries[M->getModuleIdentifier()],
- /* ModuleMap */ nullptr, CGOpts.CmdArgs)) {
+ if (Error E = thinBackend(
+ Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
+ ModuleToDefinedGVSummaries[M->getModuleIdentifier()],
+ /* ModuleMap */ nullptr, Conf.CodeGenOnly, CGOpts.CmdArgs)) {
handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) {
errs() << "Error running ThinLTO backend: " << EIB.message() << '\n';
});
diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h
index 84133a433170fe..1e1afe99327650 100644
--- a/llvm/include/llvm/CGData/CodeGenData.h
+++ b/llvm/include/llvm/CGData/CodeGenData.h
@@ -164,6 +164,22 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree));
}
+/// Initialize the two-codegen rounds.
+void initializeTwoCodegenRounds();
+
+/// Save the current module before the first codegen round.
+void saveModuleForTwoRounds(const Module &TheModule, unsigned Task);
+
+/// Load the current module before the second codegen round.
+std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
+ unsigned Task,
+ LLVMContext &Context);
+
+/// Merge the codegen data from the input files in scratch vector in ThinLTO
+/// two-codegen rounds.
+Error mergeCodeGenData(
+ const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles);
+
void warn(Error E, StringRef Whence = "");
void warn(Twine Message, std::string Whence = "", std::string Hint = "");
diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h
index de89f4bb10dff2..8516398510d4b8 100644
--- a/llvm/include/llvm/LTO/LTOBackend.h
+++ b/llvm/include/llvm/LTO/LTOBackend.h
@@ -56,6 +56,7 @@ Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream,
const FunctionImporter::ImportMapTy &ImportList,
const GVSummaryMapTy &DefinedGlobals,
MapVector<StringRef, BitcodeModule> *ModuleMap,
+ bool CodeGenOnly,
const std::vector<uint8_t> &CmdArgs = std::vector<uint8_t>());
Error finalizeOptimizationRemarks(
diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index 55d2504231c744..e8fda7ad7454d7 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -17,6 +17,7 @@
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
#include "llvm/Support/WithColor.h"
#define DEBUG_TYPE "cg-data"
@@ -30,6 +31,14 @@ cl::opt<bool>
cl::opt<std::string>
CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden,
cl::desc("File path to where .cgdata file is read"));
+cl::opt<bool> CodeGenDataThinLTOTwoRounds(
+ "codegen-data-thinlto-two-rounds", cl::init(false), cl::Hidden,
+ cl::desc("Enable two-round ThinLTO code generation. The first round "
+ "generates code and emits CodeGen data, while the second round "
+ "uses the emitted data for further optimizations."));
+
+// Path to where the optimized bitcodes are saved and restored for ThinLTO.
+static SmallString<128> CodeGenDataThinLTOTwoRoundsPath;
static std::string getCGDataErrString(cgdata_error Err,
const std::string &ErrMsg = "") {
@@ -139,7 +148,7 @@ CodeGenData &CodeGenData::getInstance() {
std::call_once(CodeGenData::OnceFlag, []() {
Instance = std::unique_ptr<CodeGenData>(new CodeGenData());
- if (CodeGenDataGenerate)
+ if (CodeGenDataGenerate || CodeGenDataThinLTOTwoRounds)
Instance->EmitCGData = true;
else if (!CodeGenDataUsePath.empty()) {
// Initialize the global CGData if the input file name is given.
@@ -215,6 +224,76 @@ void warn(Error E, StringRef Whence) {
}
}
+static std::string getPath(StringRef Dir, unsigned Task) {
+ return (Dir + "/" + llvm::Twine(Task) + ".saved_copy.bc").str();
+}
+
+void initializeTwoCodegenRounds() {
+ assert(CodeGenDataThinLTOTwoRounds);
+ if (auto EC = llvm::sys::fs::createUniqueDirectory(
+ "cgdata", CodeGenDataThinLTOTwoRoundsPath))
+ report_fatal_error(Twine("Failed to create directory: ") + EC.message());
+}
+
+void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) {
+ assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath));
+ std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task);
+ std::error_code EC;
+ raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::OF_None);
+ if (EC)
+ report_fatal_error(Twine("Failed to open ") + Path +
+ " to save optimized bitcode: " + EC.message());
+ WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true);
+}
+
+std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
+ unsigned Task,
+ LLVMContext &Context) {
+ assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath));
+ std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task);
+ auto FileOrError = MemoryBuffer::getFile(Path);
+ if (auto EC = FileOrError.getError())
+ report_fatal_error(Twine("Failed to open ") + Path +
+ " to load optimized bitcode: " + EC.message());
+
+ std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError);
+ auto RestoredModule = llvm::parseBitcodeFile(*FileBuffer, Context);
+ if (!RestoredModule)
+ report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") +
+ Path + "\n");
+
+ // Restore the original module identifier.
+ (*RestoredModule)->setModuleIdentifier(OrigModule.getModuleIdentifier());
+ return std::move(*RestoredModule);
+}
+
+Error mergeCodeGenData(
+ const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles) {
+
+ OutlinedHashTreeRecord GlobalOutlineRecord;
+ for (auto &InputFile : *(InputFiles)) {
+ if (InputFile.empty())
+ continue;
+ StringRef File = StringRef(InputFile.data(), InputFile.size());
+ std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer(
+ File, "in-memory object file", /*RequiresNullTerminator=*/false);
+ Expected<std::unique_ptr<object::ObjectFile>> BinOrErr =
+ object::ObjectFile::createObjectFile(Buffer->getMemBufferRef());
+ if (!BinOrErr)
+ return BinOrErr.takeError();
+
+ std::unique_ptr<object::ObjectFile> &Obj = BinOrErr.get();
+ if (auto E = CodeGenDataReader::mergeFromObjectFile(Obj.get(),
+ GlobalOutlineRecord))
+ return E;
+ }
+
+ if (!GlobalOutlineRecord.empty())
+ cgdata::publishOutlinedHashTree(std::move(GlobalOutlineRecord.HashTree));
+
+ return Error::success();
+}
+
} // end namespace cgdata
} // end namespace llvm
diff --git a/llvm/lib/LTO/CMakeLists.txt b/llvm/lib/LTO/CMakeLists.txt
index 69ff08e1f374c4..057d73b6349cf1 100644
--- a/llvm/lib/LTO/CMakeLists.txt
+++ b/llvm/lib/LTO/CMakeLists.txt
@@ -21,6 +21,7 @@ add_llvm_component_library(LLVMLTO
BinaryFormat
BitReader
BitWriter
+ CGData
CodeGen
CodeGenTypes
Core
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index a88124dacfaefd..945f8c859365ea 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -21,6 +21,7 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CGData/CodeGenData.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AutoUpgrade.h"
@@ -70,6 +71,8 @@ static cl::opt<bool>
DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden,
cl::desc("Dump the SCCs in the ThinLTO index's callgraph"));
+extern cl::opt<bool> CodeGenDataThinLTOTwoRounds;
+
namespace llvm {
/// Enable global value internalization in LTO.
cl::opt<bool> EnableLTOInternalization(
@@ -1458,7 +1461,7 @@ class InProcessThinBackend : public ThinBackendProc {
GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));
}
- Error runThinLTOBackendThread(
+ virtual Error runThinLTOBackendThread(
AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
ModuleSummaryIndex &CombinedIndex,
const FunctionImporter::ImportMapTy &ImportList,
@@ -1473,7 +1476,8 @@ class InProcessThinBackend : public ThinBackendProc {
return MOrErr.takeError();
return thinBackend(Conf, Task, AddStream, **MOrErr, CombinedIndex,
- ImportList, DefinedGlobals, &ModuleMap);
+ ImportList, DefinedGlobals, &ModuleMap,
+ Conf.CodeGenOnly);
};
auto ModuleID = BM.getModuleIdentifier();
@@ -1558,6 +1562,60 @@ class InProcessThinBackend : public ThinBackendProc {
return BackendThreadPool.getMaxConcurrency();
}
};
+
+/// This Backend will run ThinBackend process but throw away all the output from
+/// the codegen. This class facilitates the first codegen round.
+class NoOutputThinBackend : public InProcessThinBackend {
+public:
+ NoOutputThinBackend(
+ const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+ ThreadPoolStrategy ThinLTOParallelism,
+ const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+ std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch)
+ : InProcessThinBackend(
+ Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries,
+ // Allocate a scratch buffer for each task to write output to.
+ [Allocation = &*Scratch](unsigned Task, const Twine &ModuleName) {
+ return std::make_unique<CachedFileStream>(
+ std::make_unique<raw_svector_ostream>((*Allocation)[Task]));
+ },
+ FileCache(), nullptr, false, false),
+ Scratch(std::move(Scratch)) {}
+
+ /// Scratch space for writing output during the codegen.
+ std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch;
+};
+
+/// This Backend performs codegen on bitcode that was previously saved after
+/// going through optimization. This class facilitates the second codegen round.
+class OptimizedBitcodeThinBackend : public InProcessThinBackend {
+public:
+ OptimizedBitcodeThinBackend(
+ const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+ ThreadPoolStrategy ThinLTOParallelism,
+ const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+ AddStreamFn AddStream)
+ : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism,
+ ModuleToDefinedGVSummaries, AddStream, FileCache(),
+ nullptr, false, false) {}
+
+ virtual Error runThinLTOBackendThread(
+ AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
+ ModuleSummaryIndex &CombinedIndex,
+ const FunctionImporter::ImportMapTy &ImportList,
+ const FunctionImporter::ExportSetTy &ExportList,
+ const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
+ const GVSummaryMapTy &DefinedGlobals,
+ MapVector<StringRef, BitcodeModule> &ModuleMap) override {
+ LTOLLVMContext BackendContext(Conf);
+ std::unique_ptr<Module> LoadedModule =
+ cgdata::loadModuleForTwoRounds(BM, Task, BackendContext);
+
+ return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex,
+ ImportList, DefinedGlobals, &ModuleMap,
+ /*CodeGenOnly=*/true);
+ }
+};
} // end anonymous namespace
ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism,
@@ -1839,45 +1897,85 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
TimeTraceScopeExit.release();
- std::unique_ptr<ThinBackendProc> BackendProc =
- ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
- AddStream, Cache);
-
auto &ModuleMap =
ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap;
- auto ProcessOneModule = [&](int I) -> Error {
- auto &Mod = *(ModuleMap.begin() + I);
- // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
- // combined module and parallel code generation partitions.
- return BackendProc->start(RegularLTO.ParallelCodeGenParallelismLevel + I,
- Mod.second, ImportLists[Mod.first],
- ExportLists[Mod.first], ResolvedODR[Mod.first],
- ThinLTO.ModuleMap);
+ auto RunBackends = [&](ThinBackendProc *BackendProcess) -> Error {
+ auto ProcessOneModule = [&](int I) -> Error {
+ auto &Mod = *(ModuleMap.begin() + I);
+ // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
+ // combined module and parallel code generation partitions.
+ return BackendProcess->start(
+ RegularLTO.ParallelCodeGenParallelismLevel + I, Mod.second,
+ ImportLists[Mod.first], ExportLists[Mod.first],
+ ResolvedODR[Mod.first], ThinLTO.ModuleMap);
+ };
+
+ if (BackendProcess->getThreadCount() == 1) {
+ // Process the modules in the order they were provided on the
+ // command-line. It is important for this codepath to be used for
+ // WriteIndexesThinBackend, to ensure the emitted LinkedObjectsFile lists
+ // ThinLTO objects in the same order as the inputs, which otherwise would
+ // affect the final link order.
+ for (int I = 0, E = ModuleMap.size(); I != E; ++I)
+ if (Error E = ProcessOneModule(I))
+ return E;
+ } else {
+ // When executing in parallel, process largest bitsize modules first to
+ // improve parallelism, and avoid starving the thread pool near the end.
+ // This saves about 15 sec on a 36-core machine while link `clang.exe`
+ // (out of 100 sec).
+ std::vector<BitcodeModule *> ModulesVec;
+ ModulesVec.reserve(ModuleMap.size());
+ for (auto &Mod : ModuleMap)
+ ModulesVec.push_back(&Mod.second);
+ for (int I : generateModulesOrdering(ModulesVec))
+ if (Error E = ProcessOneModule(I))
+ return E;
+ }
+ return BackendProcess->wait();
};
- if (BackendProc->getThreadCount() == 1) {
- // Process the modules in the order they were provided on the command-line.
- // It is important for this codepath to be used for WriteIndexesThinBackend,
- // to ensure the emitted LinkedObjectsFile lists ThinLTO objects in the same
- // order as the inputs, which otherwise would affect the final link order.
- for (int I = 0, E = ModuleMap.size(); I != E; ++I)
- if (Error E = ProcessOneModule(I))
- return E;
- } else {
- // When executing in parallel, process largest bitsize modules first to
- // improve parallelism, and avoid starving the thread pool near the end.
- // This saves about 15 sec on a 36-core machine while link `clang.exe` (out
- // of 100 sec).
- std::vector<BitcodeModule *> ModulesVec;
- ModulesVec.reserve(ModuleMap.size());
- for (auto &Mod : ModuleMap)
- ModulesVec.push_back(&Mod.second);
- for (int I : generateModulesOrdering(ModulesVec))
- if (Error E = ProcessOneModule(I))
- return E;
+ if (!CodeGenDataThinLTOTwoRounds) {
+ std::unique_ptr<ThinBackendProc> BackendProc =
+ ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
+ AddStream, Cache);
+ return RunBackends(BackendProc.get());
}
- return BackendProc->wait();
+
+ // Perform two rounds of code generation for ThinLTO:
+ // 1. First round: Run optimization and code generation with a scratch output.
+ // 2. Merge codegen data extracted from the scratch output.
+ // 3. Second round: Run code generation again using the merged data.
+ LLVM_DEBUG(dbgs() << "Running ThinLTO two-codegen rounds\n");
+
+ // Initialize a temporary path to store and retrieve optimized IRs for
+ // two-round code generation.
+ cgdata::initializeTwoCodegenRounds();
+
+ // Create a scratch output to hold intermediate results.
+ auto Outputs =
+ std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks());
+ auto FirstRoundLTO = std::make_unique<NoOutputThinBackend>(
+ Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
+ ModuleToDefinedGVSummaries, std::move(Outputs));
+ // First round: Run optimization and code generation with a scratch output.
+ // Before code generation, serialize modules.
+ if (Error E = RunBackends(FirstRoundLTO.get()))
+ return E;
+
+ // Merge codegen data extracted from the scratch output.
+ if (Error E = cgdata::mergeCodeGenData(std::move(FirstRoundLTO->Scratch)))
+ return E;
+
+ // Second round: Run code generation by reading IRs.
+ std::unique_ptr<ThinBackendProc> SecondRoundLTO =
+ std::make_unique<OptimizedBitcodeThinBackend>(
+ Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
+ ModuleToDefinedGVSummaries, AddStream);
+ Error E = RunBackends(SecondRoundLTO.get());
+
+ return E;
}
Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks(
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 4e58cd369c3ac9..d198e8e5102009 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CGData/CodeGenData.h"
#include "llvm/IR/LLVMRemarkStreamer.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
@@ -74,6 +75,8 @@ static cl::opt<bool> ThinLTOAssumeMerged(
cl::desc("Assume the input has already undergone ThinLTO function "
"importing and the other pre-optimization pipeline changes."));
+extern cl::opt<bool> CodeGenDataThinLTOTwoRounds;
+
namespace llvm {
extern cl::opt<bool> NoPGOWarnMismatch;
}
@@ -565,7 +568,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
const FunctionImporter::ImportMapTy &ImportList,
const GVSummaryMapTy &DefinedGlobals,
MapVector<StringRef, BitcodeModule> *ModuleMap,
- const std::vector<uint8_t> &CmdArgs) {
+ bool CodeGenOnly, const std::vector<uint8_t> &CmdArgs) {
Expected<const Target *> TOrErr = initAndLookupTarget(Conf, Mod);
if (!TOrErr)
return TOrErr.takeError();
@@ -586,7 +589,9 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
Mod.setPartialSampleProfileRatio(CombinedIndex);
LLVM_DEBUG(dbgs() << "Running ThinLTO\n");
- if (Conf.CodeGenOnly) {
+ if (CodeGenOnly) {
+ // If CodeGenOnly is set, we only perform code generation and skip
+ // optimization.
codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex);
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
}
@@ -597,11 +602,19 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
auto OptimizeAndCodegen =
[&](Module &Mod, TargetMachine *TM,
std::unique_ptr<ToolOutputFile> DiagnosticOutputFile) {
+ // Perform optimization and code generation for ThinLTO.
if (!opt(Conf, TM, Task, Mod, /*IsThinLTO=*/true,
/*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex,
CmdArgs))
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
+ // Save the current module before the first codege...
[truncated]
@@ -164,6 +164,22 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
  CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree));
}

/// Initialize the two-codegen rounds.
This comment isn't very useful (thanks to the function name 😄). Since the contents are also pretty simple, the comment probably isn't needed.
void initializeTwoCodegenRounds();

/// Save the current module before the first codegen round.
void saveModuleForTwoRounds(const Module &TheModule, unsigned Task);
What is `Task`? Some way to disambiguate modules?
> What is `Task`? Some way to disambiguate modules?

Added a comment to the function.
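For reference, one plausible shape of that documentation comment is sketched below. The exact wording added in the patch may differ; the description of `Task` is inferred from how the surrounding ThinLTO code uses it as a per-module backend task number.

```cpp
/// Save the current module before the first codegen round.
/// \p Task is the ThinLTO backend task number assigned to this module; it is
/// used to derive a unique file name so the saved bitcode can be located
/// again in the second codegen round.
void saveModuleForTwoRounds(const Module &TheModule, unsigned Task);
```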
/// Merge the codegen data from the input files in scratch vector in ThinLTO
/// two-codegen rounds.
Error mergeCodeGenData(
    const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles);
This is similar to a constant pointer. Can we use `ArrayRef` instead somehow?
> This is similar to a constant pointer. Can we use `ArrayRef` instead somehow?

The function `mergeCodeGenData` takes ownership of `InputFiles`. Once the function returns, the scratch buffer for the produced object files will be destroyed. I think this behavior is different from that of `ArrayRef`, which provides a read-only view into a vector.
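A small standalone sketch of the distinction being drawn here (illustrative function names only, not the patch's code): passing a `std::unique_ptr` by value hands the buffers' lifetime to the callee, whereas an `ArrayRef` is a non-owning view whose underlying storage the caller must keep alive.

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
#include <memory>
#include <vector>

// Takes ownership: the vector, and every scratch buffer in it, is destroyed
// when this function returns.
void consumeBuffers(std::unique_ptr<std::vector<llvm::SmallString<0>>> Buffers) {
  (void)Buffers; // ... merge codegen data from *Buffers ...
} // Buffers is freed here.

// Non-owning view: the caller keeps ownership and must keep the storage alive
// for as long as the view is in use.
void inspectBuffers(llvm::ArrayRef<llvm::SmallString<0>> Buffers) {
  (void)Buffers; // ... read-only walk over Buffers ...
}
```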
llvm/lib/CGData/CodeGenData.cpp (Outdated)
@@ -215,6 +224,76 @@ void warn(Error E, StringRef Whence) {
  }
}

static std::string getPath(StringRef Dir, unsigned Task) {
  return (Dir + "/" + llvm::Twine(Task) + ".saved_copy.bc").str();
Will this work on Windows? Can we use `llvm::sys::path`?
> Will this work on Windows? Can we use `llvm::sys::path`?

Thanks for the catch!
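A minimal sketch of the portable form the reviewer is suggesting, using `llvm::sys::path::append` instead of hard-coding `/` as the separator. The function name and file suffix are taken from the snippet above; this is not necessarily the exact code that landed.

```cpp
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Path.h"
#include <string>

static std::string getPath(llvm::StringRef Dir, unsigned Task) {
  llvm::SmallString<128> Path(Dir);
  // Appends with the platform-appropriate separator.
  llvm::sys::path::append(Path, llvm::Twine(Task) + ".saved_copy.bc");
  return Path.str().str();
}
```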
llvm/lib/LTO/LTO.cpp (Outdated)
          return std::make_unique<CachedFileStream>(
              std::make_unique<raw_svector_ostream>((*Allocation)[Task]));
        },
        FileCache(), nullptr, false, false),
Do you mind labelling these arguments with a comment including the name of the parameter, like you've done below with `/*CodeGenOnly=*/true`? Same with the `OptimizedBitcodeThinBackend` constructor.
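A tiny, self-contained illustration of the style being requested (the function and parameter names are hypothetical, not the LTO API): opaque literals at a call site get a `/*ParamName=*/` comment so readers don't have to look up the signature.

```cpp
// Hypothetical stand-in for a constructor with several opaque trailing arguments.
static void createBackend(bool ShouldEmitIndexFiles, bool ShouldEmitImportsFiles,
                          void *OnWrite) {
  (void)ShouldEmitIndexFiles;
  (void)ShouldEmitImportsFiles;
  (void)OnWrite;
}

void example() {
  // Labelled call site: each literal names the parameter it binds to.
  createBackend(/*ShouldEmitIndexFiles=*/false,
                /*ShouldEmitImportsFiles=*/false,
                /*OnWrite=*/nullptr);
}
```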
llvm/lib/LTO/LTO.cpp (Outdated)
/// This Backend will run ThinBackend process but throw away all the output from
/// the codegen. This class facilitates the first codegen round.
class NoOutputThinBackend : public InProcessThinBackend {
If I'm following correctly, it's not as if this backend really throws results away, but it writes the produced object files into the scratch buffer rather than to files. It also writes the optimized bitcode files to disk right?
I don't have a strong opinion, but maybe we could name the backend / adjust the comment to reflect this. I wouldn't even be opposed to naming the backends something specific to two-codegen rounds, as I don't see anyone using them for other purposes.
Lastly just an idea: We could just hold the optimized bitcode in a similar buffer in memory, rather than writing them to disk and reading them again between rounds. Might run a bit faster.
> Lastly just an idea: We could just hold the optimized bitcode in a similar buffer in memory, rather than writing them to disk and reading them again between rounds. Might run a bit faster.
That's a great point! I've been cautious about peak memory usage, especially with large app binaries. Since the linker already buffers the resulting object files, this isn't a new concern. It's worth noting that the buffer from the first round gets discarded right before the second round, so we effectively only hold a buffer for the resulting object files. As for the optimized bitcode files, which are usually much larger than object files, I chose to write them to disk instead of keeping everything in memory. For smaller app binaries, buffering the optimized bitcode could be beneficial, as you suggested. I think we can always revisit and potentially add this as an option if needed.
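For illustration, a rough sketch of the in-memory alternative discussed in this thread; it is not what the patch does (the patch writes the optimized bitcode to a temporary directory), and the container layout and function names here are assumptions.

```cpp
#include "llvm/ADT/SmallString.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBufferRef.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
#include <vector>

// One scratch buffer per ThinLTO task, sized by the caller (e.g. to getMaxTasks()).
static std::vector<llvm::SmallString<0>> SavedBitcode;

// First round: serialize the optimized module into its task's buffer.
static void saveModuleInMemory(const llvm::Module &M, unsigned Task) {
  llvm::raw_svector_ostream OS(SavedBitcode[Task]);
  llvm::WriteBitcodeToFile(M, OS, /*ShouldPreserveUseListOrder=*/true);
}

// Second round: parse the module back out of the buffer instead of a file.
static llvm::Expected<std::unique_ptr<llvm::Module>>
loadModuleFromMemory(unsigned Task, llvm::LLVMContext &Ctx) {
  llvm::MemoryBufferRef Buf(SavedBitcode[Task].str(), "in-memory bitcode");
  return llvm::parseBitcodeFile(Buf, Ctx);
}
```

The trade-off the author describes still applies: keeping every optimized module resident raises peak memory compared with spilling to disk, which is why the patch favors temporary files for large builds.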
Yeah, I should've created a PR directly on the remote branch instead of on my fork.

Could someone please take another look? Thanks!

@teresajohnson Do you have any concern or comment on this direction?

Just a quick reply to say that I am taking a look today (unfamiliar with this approach so need to read through the RFC etc) and will get back later today.
Sorry, I was obviously over-optimistic on when I could do this! A couple of high-level comments / questions:
It's interesting to know that Propeller wants to redo the codegen. I'm happy to align with this work. We've already started discussing this and have shared some details from our side. Here's the link for more info: https://discourse.llvm.org/t/rfc-enhanced-machine-outliner-part-2-thinlto-nolto/78753/11?u=kyulee-com.
I think technically it's doable, but initially, I believed we did not want to repeat the code generation with distributed ThinLTO unless we were willing to introduce additional synchronization and spawn distributed ThinLTO backends again with the merged codegen data. If we aim to leverage the saved optimized IR, I suppose we need to assign the same backend work to the same machine used in the first run.
That's a good catch! Assuming this is not a common scenario (as mentioned in the RFC linked above), my initial design intentionally disables LTO's caching for correctness and simplicity by setting an empty
@teresajohnson Here is the summary for the latest commit. Sorry about a few more dependent PRs whose commits also appear in this PR.
Great, I hadn't looked at the RFC again and didn't realize Rahman already responded there.
Agree it makes sense to do the in-process mode support first as that is more straightforward.
Not necessarily. The IR just needs to be saved in a path that the build system specifies and can therefore move around.
Yeah, there are a few caching aspects. Adding caching of the scratch files and optimized IR being reused here is one aspect that will enable more caching and better build performance for this optimization. I was more concerned about the existing LTO caching and whether that would be broken, but missed that you were previously disabling caching. We use distributed ThinLTO but my understanding is that LTO caching is frequently used for in-process ThinLTO. I'll look at your refactoring and updates as soon as I can.
Yes. Propeller's final post-link optimization can use the optimized cached bitcode from the profiled build. This can be an improvement for Propeller. @amharc did some experiments to measure the gain from such improvements. IIUC, we must use
This is NFC for #90933.
- Create a lambda function, `RunBackends`, to group the backend operations into a single function.
- Explicitly pass the `CodeGenOnly` argument to thinBackend, instead of depending on a configuration value.

Depends on #90304. This is a patch for https://discourse.llvm.org/t/rfc-enhanced-machine-outliner-part-2-thinlto-nolto/78753.
@rlavaee It is not strictly necessary to run both
This is a prep for #90933.
- Change `FileCache` from a function to a type.
- Store the cache directory in the type, which will be used when creating additional caches for two-codegen round runs that inherit this value.
This is a prep for #90933.
- Change `ThinBackend` from a function to a type.
- Store the parallelism level in the type, which will be used when creating two-codegen round backends that inherit this value.
- `ThinBackendProc` is hoisted to `LTO.h` from `LTO.cpp` to provide its body for `ThinBackend`. However, `emitFiles()` is still implemented separately in `LTO.cpp`, distinct from its parent class.
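Both prep changes follow the same pattern: wrap what used to be a bare callable in a small type that also carries one piece of configuration (the cache directory for `FileCache`, the parallelism level for `ThinBackend`), so instances derived for the two-round runs can inherit it. The sketch below is illustrative only; the member names and callback signature are assumptions, not the exact upstream definitions.

```cpp
#include <functional>
#include <string>
#include <utility>

class FileCacheSketch {
public:
  // Stand-in for the cache-lookup callable that the old FileCache typedef was.
  using LookupFn = std::function<bool(unsigned Task, const std::string &Key)>;

  FileCacheSketch() = default;
  FileCacheSketch(LookupFn Fn, std::string CacheDir)
      : Fn(std::move(Fn)), CacheDirectory(std::move(CacheDir)) {}

  // Keeps behaving like a plain function at existing call sites.
  bool operator()(unsigned Task, const std::string &Key) const {
    return Fn && Fn(Task, Key);
  }
  bool isValid() const { return static_cast<bool>(Fn); }

  // The extra state the refactoring adds: where the cache lives on disk, so a
  // second cache (e.g. for first-round outputs) can be rooted next to it.
  const std::string &getCacheDirPath() const { return CacheDirectory; }

private:
  LookupFn Fn;
  std::string CacheDirectory;
};
```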
llvm/lib/LTO/LTOBackend.cpp (Outdated)
@@ -74,6 +75,8 @@ static cl::opt<bool> ThinLTOAssumeMerged(
    cl::desc("Assume the input has already undergone ThinLTO function "
             "importing and the other pre-optimization pipeline changes."));

extern cl::opt<bool> CodeGenDataThinLTOTwoRounds;
This is unused in this file
Deleted.
llvm/lib/LTO/LTO.cpp (Outdated)
  AddStreamFn &CacheCGAddStream = *CacheCGAddStreamOrErr;

  // Get IRKey for caching (optimized) IR in IRCache with an extra ID.
  std::string IRKey = computeLTOCacheKey(
It's a bit unfortunate to have to compute the whole cache key again but just with one extra ID added. I suppose an alternative would be to just rehash the CGKey with "IR".
Defined `recomputeLTOCacheKey` to rehash the key with an additional string.
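A hedged sketch of the "rehash with an extra ID" idea, mirroring how `computeLTOCacheKey` ultimately produces a hex SHA-1 string. The helper name and exact signature of the function added in the patch may differ from what is shown here.

```cpp
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/SHA1.h"
#include <string>

// Fold an extra identifier into an already-computed cache key instead of
// recomputing the full key from all of its inputs.
static std::string rehashKeyWithExtraID(llvm::StringRef Key,
                                        llvm::StringRef ExtraID) {
  llvm::SHA1 Hasher;
  Hasher.update(Key);
  Hasher.update(ExtraID);
  return llvm::toHex(Hasher.result(), /*LowerCase=*/true);
}
```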
llvm/lib/LTO/LTO.cpp (Outdated)
    std::string Key = computeLTOCacheKey(
        Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR,
        DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls,
        /*ExtraID=*/std::to_string(CombinedCGDataHash));
It seems that if there is any change to the CG data affecting any module, that all will be cache misses, since the same CombinedCGDataHash is used for all modules. I don't know what is in the cg data exactly, but is there any way to know what part of it affects which modules? Otherwise, any change to any file affecting any cg data means all will be misses.
Great catch! That's actually one of the reasons I initially disabled caches, as the cache hit rate tends to be low, especially during the second round of code generation when any changes to the codegen data summary, which is global, occur. Currently, the codegen data summary includes outlining opportunities that have occurred locally within modules. We've found these to be quite stable since the outlining sequences are typically short but frequently used across modules.
In practice, rather than repeating codegen in place, we plan to utilize the prior codegen summary from the previous build to optimize subsequent builds. This approach may come at the cost of size efficiency due to the use of stale summaries, but it's guaranteed to be safe. Given that this global summary is fixed (for a certain period until we update that summary), incremental builds may not pose a problem in this setting.
Addressing your original question, the outlining summary data contrasts with the module summary index, which is primarily used for inlining. For the module summary index, we could use symbol, call-graph, and profile summaries to infer inlining potentials and import potential candidates at the summary level. Using this information, we could shard the module summary index for each module compilation, making it more cache-friendly.
On the other hand, the outlining summary we collect should be global but based on hash sequences. To match the actual code sequence that needs to be outlined, we need to compare them against the actual instructions (here MIR instead of IR). Partitioning this global data for each module at the summary level would be challenging. However, it would be an interesting problem to explore if we can come up with a summary-level outlining and partition such data effectively.
> In practice, rather than repeating codegen in place, we plan to utilize the prior codegen summary from the previous build to optimize subsequent builds.
Can you clarify - do you plan to use the two round support being added here?
I think most developer builds benefit from using previous codegen summaries in incremental builds, which assists with scaling. For final builds or getting the optimal binary, this two-round support is quite useful as it runs well on a single machine without requiring changes to the build system. I plan to use the two-round support, particularly in scenarios without dthin-lto for now, but I'm also open to explore this case for the dthin-lto scenario.
@@ -0,0 +1,94 @@
; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat)
This test seems like mostly an overlap of the caching test, with the exception of the regular LTO module interactions. Can you better describe that here?
Added more comments on the test.
lgtm
@teresajohnson Thank you for your review and valuable feedback!
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/72/builds/4173
Here is the relevant piece of the build log for the reference.

LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/159/builds/7851
Here is the relevant piece of the build log for the reference.

LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/168/builds/4263
Here is the relevant piece of the build log for the reference.

LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/186/builds/3073
Here is the relevant piece of the build log for the reference.

LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/145/builds/2381
Here is the relevant piece of the build log for the reference.

LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/64/builds/1190
Here is the relevant piece of the build log for the reference.
This feature is enabled by `-codegen-data-thinlto-two-rounds`, which effectively runs the `-codegen-data-generate` and `-codegen-data-use` in two rounds to enable global outlining with ThinLTO.
1. The first round: Run both optimization + codegen with a scratch output. Before running codegen, we serialize the optimized bitcode modules to a temporary path.
2. From the scratch object files, we merge them into the codegen data.
3. The second round: Read the optimized bitcode modules and start the codegen only this time. Using the codegen data, the machine outliner effectively performs the global outlining.

Depends on llvm#90934, llvm#110461 and llvm#110463. This is a patch for https://discourse.llvm.org/t/rfc-enhanced-machine-outliner-part-2-thinlto-nolto/78753.
This feature is enabled by `-codegen-data-thinlto-two-rounds`, which effectively runs the `-codegen-data-generate` and `-codegen-data-use` in two rounds to enable global outlining with ThinLTO.

Before running codegen, we serialize the optimized bitcode modules to a temporary path. Using the codegen data, the machine outliner effectively performs the global outlining.

Depends on #90934, #110461 and #110463.

This is a patch for https://discourse.llvm.org/t/rfc-enhanced-machine-outliner-part-2-thinlto-nolto/78753.