From 4ff02805162b9c6c29bdbbb5c44218701b6796c0 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 9 Jul 2025 16:39:25 -0400 Subject: [PATCH] Fix LLVM TaskDispatcher implementation issues Fixes #58229 (LLVM JITLink stack overflow issue). I tried submitting this promise/future implementation upstream (https://github.com/llvm/llvm-project/compare/main...vtjnash:llvm-project:jn/cowait-jit) so that I would not need to duplicate nearly as much code here to fix this bug, but upstream is currently opposed to fixing this bug and instead insists it is preferable for each downstream project to implement this fix themselves. This is inspired by std::promise/std::future, but redesigned to avoid many different memory-safety gotchas that I felt were unnecessarily complex about the stdlib design. --- src/Makefile | 2 +- src/jitlayers.cpp | 23 +- src/llvm-julia-task-dispatcher.h | 465 +++++++++++++++++++++++++++++++ 3 files changed, 474 insertions(+), 16 deletions(-) create mode 100644 src/llvm-julia-task-dispatcher.h diff --git a/src/Makefile b/src/Makefile index 717afa55c6207..a8926a46c9b00 100644 --- a/src/Makefile +++ b/src/Makefile @@ -374,7 +374,7 @@ $(BUILDDIR)/gc-alloc-profiler.o $(BUILDDIR)/gc-alloc-profiler.dbg.obj: $(SRCDIR) $(BUILDDIR)/gc-page-profiler.o $(BUILDDIR)/gc-page-profiler.dbg.obj: $(SRCDIR)/gc-page-profiler.h $(BUILDDIR)/init.o $(BUILDDIR)/init.dbg.obj: $(SRCDIR)/builtin_proto.h $(BUILDDIR)/interpreter.o $(BUILDDIR)/interpreter.dbg.obj: $(SRCDIR)/builtin_proto.h -$(BUILDDIR)/jitlayers.o $(BUILDDIR)/jitlayers.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/llvm-codegen-shared.h +$(BUILDDIR)/jitlayers.o $(BUILDDIR)/jitlayers.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/llvm-codegen-shared.h $(SRCDIR)/llvm-julia-task-dispatcher.h $(BUILDDIR)/jltypes.o $(BUILDDIR)/jltypes.dbg.obj: $(SRCDIR)/builtin_proto.h $(build_shlibdir)/libllvmcalltest.$(SHLIB_EXT): $(SRCDIR)/llvm-codegen-shared.h $(BUILDDIR)/julia_version.h $(BUILDDIR)/llvm-alloc-helpers.o $(BUILDDIR)/llvm-alloc-helpers.dbg.obj: $(SRCDIR)/llvm-codegen-shared.h $(SRCDIR)/llvm-pass-helpers.h $(SRCDIR)/llvm-alloc-helpers.h diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 9ea98fed68db3..c5588be794201 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -6,6 +6,7 @@ #include #include "llvm/IR/Mangler.h" +#include #include #include #include @@ -50,6 +51,7 @@ using namespace llvm; #include "jitlayers.h" #include "julia_assert.h" #include "processor.h" +#include "llvm-julia-task-dispatcher.h" #if JL_LLVM_VERSION >= 180000 # include @@ -723,17 +725,8 @@ static void jl_compile_codeinst_now(jl_code_instance_t *codeinst) if (!decls.specFunctionObject.empty()) NewDefs.push_back(decls.specFunctionObject); } - // Split batches to avoid stack overflow in the JIT linker. - // FIXME: Patch ORCJITs InPlaceTaskDispatcher to not recurse on task dispatches but - // push the tasks to a queue to be drained later. This avoids the stackoverflow caused by recursion - // in the linker when compiling a large number of functions at once. - SmallVector Addrs; - for (size_t i = 0; i < NewDefs.size(); i += 1000) { - auto end = std::min(i + 1000, NewDefs.size()); - SmallVector batch(NewDefs.begin() + i, NewDefs.begin() + end); - auto AddrsBatch = jl_ExecutionEngine->findSymbols(batch); - Addrs.append(AddrsBatch); - } + auto Addrs = jl_ExecutionEngine->findSymbols(NewDefs); + size_t nextaddr = 0; for (auto &this_code : linkready) { auto it = invokenames.find(this_code); @@ -1901,7 +1894,7 @@ llvm::DataLayout jl_create_datalayout(TargetMachine &TM) { JuliaOJIT::JuliaOJIT() : TM(createTargetMachine()), DL(jl_create_datalayout(*TM)), - ES(cantFail(orc::SelfExecutorProcessControl::Create())), + ES(cantFail(orc::SelfExecutorProcessControl::Create(nullptr, std::make_unique<::JuliaTaskDispatcher>()))), GlobalJD(ES.createBareJITDylib("JuliaGlobals")), JD(ES.createBareJITDylib("JuliaOJIT")), ExternalJD(ES.createBareJITDylib("JuliaExternal")), @@ -2159,7 +2152,7 @@ SmallVector JuliaOJIT::findSymbols(ArrayRef Names) Unmangled[NonOwningSymbolStringPtr(Mangled)] = Unmangled.size(); Exports.add(std::move(Mangled)); } - SymbolMap Syms = cantFail(ES.lookup(orc::makeJITDylibSearchOrder(ArrayRef(&JD)), std::move(Exports))); + SymbolMap Syms = cantFail(::safelookup(ES, orc::makeJITDylibSearchOrder(ArrayRef(&JD)), std::move(Exports))); SmallVector Addrs(Names.size()); for (auto it : Syms) { Addrs[Unmangled.at(orc::NonOwningSymbolStringPtr(it.first))] = it.second.getAddress().getValue(); @@ -2171,7 +2164,7 @@ Expected JuliaOJIT::findSymbol(StringRef Name, bool ExportedS { orc::JITDylib* SearchOrders[3] = {&JD, &GlobalJD, &ExternalJD}; ArrayRef SearchOrder = ArrayRef(&SearchOrders[0], ExportedSymbolsOnly ? 3 : 1); - auto Sym = ES.lookup(SearchOrder, Name); + auto Sym = ::safelookup(ES, SearchOrder, Name); return Sym; } @@ -2184,7 +2177,7 @@ Expected JuliaOJIT::findExternalJDSymbol(StringRef Name, bool { orc::JITDylib* SearchOrders[3] = {&ExternalJD, &GlobalJD, &JD}; ArrayRef SearchOrder = ArrayRef(&SearchOrders[0], ExternalJDOnly ? 1 : 3); - auto Sym = ES.lookup(SearchOrder, getMangledName(Name)); + auto Sym = ::safelookup(ES, SearchOrder, getMangledName(Name)); return Sym; } diff --git a/src/llvm-julia-task-dispatcher.h b/src/llvm-julia-task-dispatcher.h new file mode 100644 index 0000000000000..dd4037378b6b6 --- /dev/null +++ b/src/llvm-julia-task-dispatcher.h @@ -0,0 +1,465 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +namespace { + +using namespace llvm::orc; + +template struct future_value_storage { + // Union disables default construction/destruction semantics, allowing us to + // use placement new/delete for precise control over value lifetime + union { + U value_; + }; + + future_value_storage() {} + ~future_value_storage() {} +}; + +template <> struct future_value_storage { + // No value_ member for void +}; + +struct JuliaTaskDispatcher : public TaskDispatcher { + /// Forward declarations + class future_base; + void dispatch(std::unique_ptr T) override; + void shutdown() override; + void work_until(future_base &F); +private: + /// C++ does not support non-static thread_local variables, so this needs to + /// store both the task and the associated dispatcher queue so that shutdown + /// can wait for the correct tasks to finish. + thread_local static SmallVector, JuliaTaskDispatcher*>> TaskQueue; + std::mutex DispatchMutex; + std::condition_variable WorkFinishedCV; + SmallVector WaitingFutures; + +public: + +/// @name ORC Promise/Future Classes +/// +/// ORC-aware promise/future implementation that integrates with the +/// TaskDispatcher system to allow efficient cooperative multitasking while +/// waiting for results (with certain limitations on what can be awaited). +/// Together they provide building blocks for a full async/await-like runtime +/// for llvm that supports multiple threads. +/// +/// Unlike std::promise/std::future alone, these classes can help dispatch other +/// tasks while waiting, preventing deadlocks and improving overall system +/// throughput. They have a similar API, though with some important differences +/// and some features simply not currently implemented. +/// +/// @{ + +template class promise; +template class future; + +/// Status for future/promise state +enum class FutureStatus : uint8_t { NotReady = 0, Ready = 1 }; + +/// @} + +/// Type-erased base class for futures, generally for scheduler use to avoid +/// needing virtual dispatches +class future_base { +public: + /// Check if the future is now ready with a value (precondition: get_promise() + /// must have been called) + bool ready() const { + if (!valid()) + report_fatal_error("ready() called before get_promise()"); + return state_->status_.load(std::memory_order_acquire) == FutureStatus::Ready; + } + + /// Check if the future is in a valid state (not moved-from and get_promise() called) + bool valid() const { return state_ != nullptr; } + + /// Wait for the future to be ready, helping with task dispatch + void wait(JuliaTaskDispatcher &D) { + // Keep helping with task dispatch until our future is ready + if (!ready()) { + D.work_until(*this); + if (state_->status_.load(std::memory_order_relaxed) != FutureStatus::Ready) + report_fatal_error( + "work_until() returned without this future being ready"); + } + } + +protected: + struct state_base { + std::atomic status_{FutureStatus::NotReady}; + }; + + future_base(state_base *state) : state_(state) {} + future_base() = default; + + /// Only allow deleting the future once it is invalid + ~future_base() { + if (state_) + report_fatal_error("get() must be called before future destruction (ensuring promise::set_value memory is valid)"); + } + + // Move constructor and assignment + future_base(future_base &&other) noexcept : state_(other.state_) { + other.state_ = nullptr; + } + + future_base &operator=(future_base &&other) noexcept { + if (this != &other) { + this->~future_base(); + state_ = other.state_; + other.state_ = nullptr; + } + return *this; + } + + state_base *state_; +}; + +/// TaskDispatcher-aware future class for cooperative await. +/// +/// @tparam T The type of value this future will provide. Use void for futures +/// that +/// signal completion without providing a value. +/// +/// This future implementation is similar to `std::future`, so most code can +/// transition to it easily. However, it differs from `std::future` in a few +/// key ways to be aware of: +/// - No exception support (or the overhead for it). +/// - The future is created before the promise, then the promise is created +/// from the future. +/// - The future is in an invalid state until get_promise() has been called. +/// - Waiting operations (get(&D), wait(&D)) help dispatch other tasks while +/// blocked, requiring an additional argument of which TaskDispatcher object +/// of where all associated work will be scheduled. +/// - While `wait` may be called multiple times and on multiple threads, all of +/// them must have returned before calling `get` on exactly one thread. +/// - Must call get() exactly once before destruction (enforced with +/// `report_fatal_error`) after each call to `get_promise`. Internal state is +/// freed when `get` returns, and allocated when `get_promise` is called. +/// +/// Other notable features, in common with `std::future`: +/// - Supports both value types and void specialization through the same +/// interface. +/// - Thread-safe through atomic operations. +/// - Provides acquire-release ordering with `std::promise::set_value()`. +/// - Concurrent access to any method (including to `ready`) on multiple threads +/// is not allowed. +/// - Holding any locks while calling `get()` is likely to lead to deadlock. +/// +/// @warning Users should avoid borrowing references to futures. References may +/// go out of scope and break the uniqueness contract, which may break the +/// soundness of the types. Always use move semantics or pass by value. + +template class future : public future_base { +public: + future() : future_base(nullptr) {} + future(const future &) = delete; + future &operator=(const future &) = delete; + future(future &&) = default; + future &operator=(future &&) = default; + + /// Get the value, helping with task dispatch while waiting. + /// This will destroy the underlying value, so this must be called exactly + /// once, which returns the future to the initial state. + T get(JuliaTaskDispatcher &D) { + if (!valid()) + report_fatal_error("get() must only be called once, after get_promise()"); + wait(D); + auto state_ = static_cast(this->state_); + this->state_ = nullptr; + return take_value(state_); + } + + /// Get the associated promise (must only be called once) + promise get_promise() { + if (valid()) + report_fatal_error("get_promise() can only be called once"); + auto state_ = new state(); + this->state_ = state_; + return promise(state_); + } + +private: + friend class promise; + + // Template the state struct with EBCO so that future has no wasted + // overhead for the value. The declaration of future_value_storage is far + // above here since GCC doesn't implement it properly when nested. + struct state : future_base::state_base, future_value_storage {}; + + template + typename std::enable_if::value, U>::type take_value(state *state_) { + T result = std::move(state_->value_); + state_->value_.~T(); + delete state_; + return result; + } + + template + typename std::enable_if::value, U>::type take_value(state *state_) { + delete state_; + } +}; + +/// TaskDispatcher-aware promise class that provides values to associated +/// futures. +/// +/// @tparam T The type of value this promise will provide. Use void for promises +/// that +/// signal completion without providing a value. +/// +/// This promise implementation provides the value-setting side of the +/// promise/future pair and integrates with the ORC TaskDispatcher system. Key +/// characteristics: +/// - Created from a future via get_promise() rather than creating the future from the promise. +/// - Must call get_future() on the thread that created it (it can be passed to another thread, but do not borrow a reference and use that to mutate it from another thread). +/// - Must call set_value() exactly once per `get_promise()` call to provide the result. +/// - Thread-safe from set_value to get. +/// - Move-only semantics to prevent accidental copying. +/// +/// The `promise` can usually be passed to another thread in one of two ways: +/// - With move semantics: +/// * `[P = F.get_promise()] () { P.set_value(); }` +/// * `[P = std::move(P)] () { P.set_value(); }` +/// * Advantages: clearer where `P` is owned, automatic deadlock detection +/// on destruction, +/// easier memory management if the future is returned from the function. +/// - By reference: +/// * `[&P] () { P.set_value(); }` +/// * Advantages: simpler memory management if the future is consumed in the +/// same function. +/// * Disadvantages: more difficult memory management if the future is +/// returned from the function, no deadlock detection. +/// +/// @warning Users should avoid borrowing references to promises. References may +/// go out of scope and break the uniqueness contract, which may break the +/// soundness of the types. Always use move semantics or pass by value. +/// +/// @par Error Handling: +/// The promise/future system uses report_fatal_error() for misuse: +/// - Calling set_value() more than once. +/// - Destroying a future without calling get(). +/// - Calling get() more than once on a future. +/// +/// @par Thread Safety: +/// - Each promise/future must only be accessed by one thread, as concurrent +/// calls to the API functions may result in crashes. +/// - Multiple threads can safely access different promise/future pairs. +/// - set_value() and get() operations are atomic and thread-safe. +/// - Move operations should only be performed by a single thread. +template class promise { + friend class future; + +public: + promise() : state_(nullptr) {} + + ~promise() { + // Assert proper promise lifecycle: ensure set_value was called if promise was valid. + // This can catch deadlocks where a promise is created but set_value() is + // never called, though only if the promise is moved from instead of + // borrowed from the frame with the future. + // Empty promises (state_ == nullptr) are allowed to be destroyed without calling set_value. + } + + promise(const promise &) = delete; + promise &operator=(const promise &) = delete; + + promise(promise &&other) noexcept + : state_(other.state_) { + other.state_ = nullptr; + } + + promise &operator=(promise &&other) noexcept { + if (this != &other) { + this->~promise(); + state_ = other.state_; + other.state_ = nullptr; + } + return *this; + } + + + /// Set the value (must only be called once) + // In C++20, this std::conditional weirdness can probably be replaced just + // with requires. It ensures that we don't try to define a method for `void&`, + // but that if the user calls set_value(v) for any value v that they get a + // member function error, instead of no member named 'value_'. + template + void + set_value(const typename std::conditional::value, + std::nullopt_t, T>::type &value) const { + assert(state_ && "set_value() can only be called once"); + new (&state_->value_) T(value); + state_->status_.store(FutureStatus::Ready, std::memory_order_release); + state_ = nullptr; + } + + template + void set_value(typename std::conditional::value, + std::nullopt_t, T>::type &&value) const { + assert(state_ && "set_value() can only be called once"); + new (&state_->value_) T(std::move(value)); + state_->status_.store(FutureStatus::Ready, std::memory_order_release); + state_ = nullptr; + } + + template + typename std::enable_if::value, void>::type + set_value(const std::nullopt_t &value) = delete; + + template + typename std::enable_if::value, void>::type + set_value(std::nullopt_t &&value) = delete; + + template + typename std::enable_if::value, void>::type set_value() const { + assert(state_ && "set_value() can only be called once"); + state_->status_.store(FutureStatus::Ready, std::memory_order_release); + state_ = nullptr; + } + + /// Swap with another promise + void swap(promise &other) noexcept { + using std::swap; + swap(state_, other.state_); + } + +private: + explicit promise(typename future::state *state) + : state_(state) {} + + mutable typename future::state *state_; +}; + +}; // class JuliaTaskDispatcher + +thread_local SmallVector, JuliaTaskDispatcher *>> JuliaTaskDispatcher::TaskQueue; + +void JuliaTaskDispatcher::dispatch(std::unique_ptr T) { + TaskQueue.push_back(std::pair(std::move(T), this)); +} + +void JuliaTaskDispatcher::shutdown() { + // Keep processing until no tasks belonging to this dispatcher remain + while (true) { + // Check if any task belongs to this dispatcher + auto it = std::find_if( + TaskQueue.begin(), TaskQueue.end(), + [this](const auto &TaskPair) { return TaskPair.second == this; }); + + // If no tasks belonging to this dispatcher, we're done + if (it == TaskQueue.end()) + return; + + // Create a future/promise pair to wait for completion of this task + future taskFuture; + // Replace the task with a GenericNamedTask that wraps the original task + // with a notification of completion that this thread can work_until. + auto originalTask = std::move(it->first); + it->first = makeGenericNamedTask( + [originalTask = std::move(originalTask), + taskPromise = taskFuture.get_promise()]() { + originalTask->run(); + taskPromise.set_value(); + }, + "Shutdown task marker"); + + // Wait for the task to complete + taskFuture.get(*this); + } +} + +void JuliaTaskDispatcher::work_until(future_base &F) { + while (!F.ready()) { + // First, process any tasks in our local queue + // Process in LIFO order (most recently added first) to avoid deadlocks + // when tasks have dependencies on each other + while (!TaskQueue.empty()) { + { + auto TaskPair = std::move(TaskQueue.back()); + TaskQueue.pop_back(); + TaskPair.first->run(); + } + + // Notify any threads that might be waiting for work to complete + { + std::lock_guard Lock(DispatchMutex); + bool ShouldNotify = llvm::any_of( + WaitingFutures, [](future_base *F) { return F->ready(); }); + if (ShouldNotify) { + WaitingFutures.clear(); + WorkFinishedCV.notify_all(); + } + } + + // Check if our future is now ready + if (F.ready()) + return; + } + + // If we get here, our queue is empty but the future isn't ready + // We need to wait for other threads to finish work that should complete our + // future + { + std::unique_lock Lock(DispatchMutex); + WaitingFutures.push_back(&F); + WorkFinishedCV.wait(Lock, [&F]() { return F.ready(); }); + } + } +} + +} // End namespace + +namespace std { +template +void swap(::JuliaTaskDispatcher::promise &lhs, ::JuliaTaskDispatcher::promise &rhs) noexcept { + lhs.swap(rhs); +} +} // End namespace std + +// n.b. this actually is sometimes a safepoint +Expected +safelookup(ExecutionSession &ES, + const JITDylibSearchOrder &SearchOrder, + SymbolLookupSet Symbols, LookupKind K = LookupKind::Static, + SymbolState RequiredState = SymbolState::Ready, + RegisterDependenciesFunction RegisterDependencies = NoDependenciesToRegister) JL_NOTSAFEPOINT { + JuliaTaskDispatcher::future> PromisedFuture; + auto NotifyComplete = [PromisedResult = PromisedFuture.get_promise()](Expected R) { + PromisedResult.set_value(std::move(R)); + }; + ES.lookup(K, SearchOrder, std::move(Symbols), RequiredState, + std::move(NotifyComplete), RegisterDependencies); + return PromisedFuture.get(static_cast(ES.getExecutorProcessControl().getDispatcher())); +} + +Expected +safelookup(ExecutionSession &ES, + const JITDylibSearchOrder &SearchOrder, + SymbolStringPtr Name, + SymbolState RequiredState = SymbolState::Ready) JL_NOTSAFEPOINT { + SymbolLookupSet Names({Name}); + + if (auto ResultMap = safelookup(ES, SearchOrder, std::move(Names), LookupKind::Static, + RequiredState, NoDependenciesToRegister)) { + assert(ResultMap->size() == 1 && "Unexpected number of results"); + assert(ResultMap->count(Name) && "Missing result for symbol"); + return std::move(ResultMap->begin()->second); + } else + return ResultMap.takeError(); +} + +Expected +safelookup(ExecutionSession &ES, + ArrayRef SearchOrder, SymbolStringPtr Name, + SymbolState RequiredState = SymbolState::Ready) JL_NOTSAFEPOINT { + return safelookup(ES, makeJITDylibSearchOrder(SearchOrder), Name, RequiredState); +} + +Expected +safelookup(ExecutionSession &ES, + ArrayRef SearchOrder, StringRef Name, + SymbolState RequiredState = SymbolState::Ready) JL_NOTSAFEPOINT { + return safelookup(ES, SearchOrder, ES.intern(Name), RequiredState); +}