diff --git a/src/Makefile b/src/Makefile index 717afa55c6207..a8926a46c9b00 100644 --- a/src/Makefile +++ b/src/Makefile @@ -374,7 +374,7 @@ $(BUILDDIR)/gc-alloc-profiler.o $(BUILDDIR)/gc-alloc-profiler.dbg.obj: $(SRCDIR) $(BUILDDIR)/gc-page-profiler.o $(BUILDDIR)/gc-page-profiler.dbg.obj: $(SRCDIR)/gc-page-profiler.h $(BUILDDIR)/init.o $(BUILDDIR)/init.dbg.obj: $(SRCDIR)/builtin_proto.h $(BUILDDIR)/interpreter.o $(BUILDDIR)/interpreter.dbg.obj: $(SRCDIR)/builtin_proto.h -$(BUILDDIR)/jitlayers.o $(BUILDDIR)/jitlayers.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/llvm-codegen-shared.h +$(BUILDDIR)/jitlayers.o $(BUILDDIR)/jitlayers.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/llvm-codegen-shared.h $(SRCDIR)/llvm-julia-task-dispatcher.h $(BUILDDIR)/jltypes.o $(BUILDDIR)/jltypes.dbg.obj: $(SRCDIR)/builtin_proto.h $(build_shlibdir)/libllvmcalltest.$(SHLIB_EXT): $(SRCDIR)/llvm-codegen-shared.h $(BUILDDIR)/julia_version.h $(BUILDDIR)/llvm-alloc-helpers.o $(BUILDDIR)/llvm-alloc-helpers.dbg.obj: $(SRCDIR)/llvm-codegen-shared.h $(SRCDIR)/llvm-pass-helpers.h $(SRCDIR)/llvm-alloc-helpers.h diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 9ea98fed68db3..c5588be794201 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -6,6 +6,7 @@ #include #include "llvm/IR/Mangler.h" +#include #include #include #include @@ -50,6 +51,7 @@ using namespace llvm; #include "jitlayers.h" #include "julia_assert.h" #include "processor.h" +#include "llvm-julia-task-dispatcher.h" #if JL_LLVM_VERSION >= 180000 # include @@ -723,17 +725,8 @@ static void jl_compile_codeinst_now(jl_code_instance_t *codeinst) if (!decls.specFunctionObject.empty()) NewDefs.push_back(decls.specFunctionObject); } - // Split batches to avoid stack overflow in the JIT linker. - // FIXME: Patch ORCJITs InPlaceTaskDispatcher to not recurse on task dispatches but - // push the tasks to a queue to be drained later. This avoids the stackoverflow caused by recursion - // in the linker when compiling a large number of functions at once. - SmallVector Addrs; - for (size_t i = 0; i < NewDefs.size(); i += 1000) { - auto end = std::min(i + 1000, NewDefs.size()); - SmallVector batch(NewDefs.begin() + i, NewDefs.begin() + end); - auto AddrsBatch = jl_ExecutionEngine->findSymbols(batch); - Addrs.append(AddrsBatch); - } + auto Addrs = jl_ExecutionEngine->findSymbols(NewDefs); + size_t nextaddr = 0; for (auto &this_code : linkready) { auto it = invokenames.find(this_code); @@ -1901,7 +1894,7 @@ llvm::DataLayout jl_create_datalayout(TargetMachine &TM) { JuliaOJIT::JuliaOJIT() : TM(createTargetMachine()), DL(jl_create_datalayout(*TM)), - ES(cantFail(orc::SelfExecutorProcessControl::Create())), + ES(cantFail(orc::SelfExecutorProcessControl::Create(nullptr, std::make_unique<::JuliaTaskDispatcher>()))), GlobalJD(ES.createBareJITDylib("JuliaGlobals")), JD(ES.createBareJITDylib("JuliaOJIT")), ExternalJD(ES.createBareJITDylib("JuliaExternal")), @@ -2159,7 +2152,7 @@ SmallVector JuliaOJIT::findSymbols(ArrayRef Names) Unmangled[NonOwningSymbolStringPtr(Mangled)] = Unmangled.size(); Exports.add(std::move(Mangled)); } - SymbolMap Syms = cantFail(ES.lookup(orc::makeJITDylibSearchOrder(ArrayRef(&JD)), std::move(Exports))); + SymbolMap Syms = cantFail(::safelookup(ES, orc::makeJITDylibSearchOrder(ArrayRef(&JD)), std::move(Exports))); SmallVector Addrs(Names.size()); for (auto it : Syms) { Addrs[Unmangled.at(orc::NonOwningSymbolStringPtr(it.first))] = it.second.getAddress().getValue(); @@ -2171,7 +2164,7 @@ Expected JuliaOJIT::findSymbol(StringRef Name, bool ExportedS { orc::JITDylib* SearchOrders[3] = {&JD, &GlobalJD, &ExternalJD}; ArrayRef SearchOrder = ArrayRef(&SearchOrders[0], ExportedSymbolsOnly ? 3 : 1); - auto Sym = ES.lookup(SearchOrder, Name); + auto Sym = ::safelookup(ES, SearchOrder, Name); return Sym; } @@ -2184,7 +2177,7 @@ Expected JuliaOJIT::findExternalJDSymbol(StringRef Name, bool { orc::JITDylib* SearchOrders[3] = {&ExternalJD, &GlobalJD, &JD}; ArrayRef SearchOrder = ArrayRef(&SearchOrders[0], ExternalJDOnly ? 1 : 3); - auto Sym = ES.lookup(SearchOrder, getMangledName(Name)); + auto Sym = ::safelookup(ES, SearchOrder, getMangledName(Name)); return Sym; } diff --git a/src/llvm-julia-task-dispatcher.h b/src/llvm-julia-task-dispatcher.h new file mode 100644 index 0000000000000..dd4037378b6b6 --- /dev/null +++ b/src/llvm-julia-task-dispatcher.h @@ -0,0 +1,465 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +namespace { + +using namespace llvm::orc; + +template struct future_value_storage { + // Union disables default construction/destruction semantics, allowing us to + // use placement new/delete for precise control over value lifetime + union { + U value_; + }; + + future_value_storage() {} + ~future_value_storage() {} +}; + +template <> struct future_value_storage { + // No value_ member for void +}; + +struct JuliaTaskDispatcher : public TaskDispatcher { + /// Forward declarations + class future_base; + void dispatch(std::unique_ptr T) override; + void shutdown() override; + void work_until(future_base &F); +private: + /// C++ does not support non-static thread_local variables, so this needs to + /// store both the task and the associated dispatcher queue so that shutdown + /// can wait for the correct tasks to finish. + thread_local static SmallVector, JuliaTaskDispatcher*>> TaskQueue; + std::mutex DispatchMutex; + std::condition_variable WorkFinishedCV; + SmallVector WaitingFutures; + +public: + +/// @name ORC Promise/Future Classes +/// +/// ORC-aware promise/future implementation that integrates with the +/// TaskDispatcher system to allow efficient cooperative multitasking while +/// waiting for results (with certain limitations on what can be awaited). +/// Together they provide building blocks for a full async/await-like runtime +/// for llvm that supports multiple threads. +/// +/// Unlike std::promise/std::future alone, these classes can help dispatch other +/// tasks while waiting, preventing deadlocks and improving overall system +/// throughput. They have a similar API, though with some important differences +/// and some features simply not currently implemented. +/// +/// @{ + +template class promise; +template class future; + +/// Status for future/promise state +enum class FutureStatus : uint8_t { NotReady = 0, Ready = 1 }; + +/// @} + +/// Type-erased base class for futures, generally for scheduler use to avoid +/// needing virtual dispatches +class future_base { +public: + /// Check if the future is now ready with a value (precondition: get_promise() + /// must have been called) + bool ready() const { + if (!valid()) + report_fatal_error("ready() called before get_promise()"); + return state_->status_.load(std::memory_order_acquire) == FutureStatus::Ready; + } + + /// Check if the future is in a valid state (not moved-from and get_promise() called) + bool valid() const { return state_ != nullptr; } + + /// Wait for the future to be ready, helping with task dispatch + void wait(JuliaTaskDispatcher &D) { + // Keep helping with task dispatch until our future is ready + if (!ready()) { + D.work_until(*this); + if (state_->status_.load(std::memory_order_relaxed) != FutureStatus::Ready) + report_fatal_error( + "work_until() returned without this future being ready"); + } + } + +protected: + struct state_base { + std::atomic status_{FutureStatus::NotReady}; + }; + + future_base(state_base *state) : state_(state) {} + future_base() = default; + + /// Only allow deleting the future once it is invalid + ~future_base() { + if (state_) + report_fatal_error("get() must be called before future destruction (ensuring promise::set_value memory is valid)"); + } + + // Move constructor and assignment + future_base(future_base &&other) noexcept : state_(other.state_) { + other.state_ = nullptr; + } + + future_base &operator=(future_base &&other) noexcept { + if (this != &other) { + this->~future_base(); + state_ = other.state_; + other.state_ = nullptr; + } + return *this; + } + + state_base *state_; +}; + +/// TaskDispatcher-aware future class for cooperative await. +/// +/// @tparam T The type of value this future will provide. Use void for futures +/// that +/// signal completion without providing a value. +/// +/// This future implementation is similar to `std::future`, so most code can +/// transition to it easily. However, it differs from `std::future` in a few +/// key ways to be aware of: +/// - No exception support (or the overhead for it). +/// - The future is created before the promise, then the promise is created +/// from the future. +/// - The future is in an invalid state until get_promise() has been called. +/// - Waiting operations (get(&D), wait(&D)) help dispatch other tasks while +/// blocked, requiring an additional argument of which TaskDispatcher object +/// of where all associated work will be scheduled. +/// - While `wait` may be called multiple times and on multiple threads, all of +/// them must have returned before calling `get` on exactly one thread. +/// - Must call get() exactly once before destruction (enforced with +/// `report_fatal_error`) after each call to `get_promise`. Internal state is +/// freed when `get` returns, and allocated when `get_promise` is called. +/// +/// Other notable features, in common with `std::future`: +/// - Supports both value types and void specialization through the same +/// interface. +/// - Thread-safe through atomic operations. +/// - Provides acquire-release ordering with `std::promise::set_value()`. +/// - Concurrent access to any method (including to `ready`) on multiple threads +/// is not allowed. +/// - Holding any locks while calling `get()` is likely to lead to deadlock. +/// +/// @warning Users should avoid borrowing references to futures. References may +/// go out of scope and break the uniqueness contract, which may break the +/// soundness of the types. Always use move semantics or pass by value. + +template class future : public future_base { +public: + future() : future_base(nullptr) {} + future(const future &) = delete; + future &operator=(const future &) = delete; + future(future &&) = default; + future &operator=(future &&) = default; + + /// Get the value, helping with task dispatch while waiting. + /// This will destroy the underlying value, so this must be called exactly + /// once, which returns the future to the initial state. + T get(JuliaTaskDispatcher &D) { + if (!valid()) + report_fatal_error("get() must only be called once, after get_promise()"); + wait(D); + auto state_ = static_cast(this->state_); + this->state_ = nullptr; + return take_value(state_); + } + + /// Get the associated promise (must only be called once) + promise get_promise() { + if (valid()) + report_fatal_error("get_promise() can only be called once"); + auto state_ = new state(); + this->state_ = state_; + return promise(state_); + } + +private: + friend class promise; + + // Template the state struct with EBCO so that future has no wasted + // overhead for the value. The declaration of future_value_storage is far + // above here since GCC doesn't implement it properly when nested. + struct state : future_base::state_base, future_value_storage {}; + + template + typename std::enable_if::value, U>::type take_value(state *state_) { + T result = std::move(state_->value_); + state_->value_.~T(); + delete state_; + return result; + } + + template + typename std::enable_if::value, U>::type take_value(state *state_) { + delete state_; + } +}; + +/// TaskDispatcher-aware promise class that provides values to associated +/// futures. +/// +/// @tparam T The type of value this promise will provide. Use void for promises +/// that +/// signal completion without providing a value. +/// +/// This promise implementation provides the value-setting side of the +/// promise/future pair and integrates with the ORC TaskDispatcher system. Key +/// characteristics: +/// - Created from a future via get_promise() rather than creating the future from the promise. +/// - Must call get_future() on the thread that created it (it can be passed to another thread, but do not borrow a reference and use that to mutate it from another thread). +/// - Must call set_value() exactly once per `get_promise()` call to provide the result. +/// - Thread-safe from set_value to get. +/// - Move-only semantics to prevent accidental copying. +/// +/// The `promise` can usually be passed to another thread in one of two ways: +/// - With move semantics: +/// * `[P = F.get_promise()] () { P.set_value(); }` +/// * `[P = std::move(P)] () { P.set_value(); }` +/// * Advantages: clearer where `P` is owned, automatic deadlock detection +/// on destruction, +/// easier memory management if the future is returned from the function. +/// - By reference: +/// * `[&P] () { P.set_value(); }` +/// * Advantages: simpler memory management if the future is consumed in the +/// same function. +/// * Disadvantages: more difficult memory management if the future is +/// returned from the function, no deadlock detection. +/// +/// @warning Users should avoid borrowing references to promises. References may +/// go out of scope and break the uniqueness contract, which may break the +/// soundness of the types. Always use move semantics or pass by value. +/// +/// @par Error Handling: +/// The promise/future system uses report_fatal_error() for misuse: +/// - Calling set_value() more than once. +/// - Destroying a future without calling get(). +/// - Calling get() more than once on a future. +/// +/// @par Thread Safety: +/// - Each promise/future must only be accessed by one thread, as concurrent +/// calls to the API functions may result in crashes. +/// - Multiple threads can safely access different promise/future pairs. +/// - set_value() and get() operations are atomic and thread-safe. +/// - Move operations should only be performed by a single thread. +template class promise { + friend class future; + +public: + promise() : state_(nullptr) {} + + ~promise() { + // Assert proper promise lifecycle: ensure set_value was called if promise was valid. + // This can catch deadlocks where a promise is created but set_value() is + // never called, though only if the promise is moved from instead of + // borrowed from the frame with the future. + // Empty promises (state_ == nullptr) are allowed to be destroyed without calling set_value. + } + + promise(const promise &) = delete; + promise &operator=(const promise &) = delete; + + promise(promise &&other) noexcept + : state_(other.state_) { + other.state_ = nullptr; + } + + promise &operator=(promise &&other) noexcept { + if (this != &other) { + this->~promise(); + state_ = other.state_; + other.state_ = nullptr; + } + return *this; + } + + + /// Set the value (must only be called once) + // In C++20, this std::conditional weirdness can probably be replaced just + // with requires. It ensures that we don't try to define a method for `void&`, + // but that if the user calls set_value(v) for any value v that they get a + // member function error, instead of no member named 'value_'. + template + void + set_value(const typename std::conditional::value, + std::nullopt_t, T>::type &value) const { + assert(state_ && "set_value() can only be called once"); + new (&state_->value_) T(value); + state_->status_.store(FutureStatus::Ready, std::memory_order_release); + state_ = nullptr; + } + + template + void set_value(typename std::conditional::value, + std::nullopt_t, T>::type &&value) const { + assert(state_ && "set_value() can only be called once"); + new (&state_->value_) T(std::move(value)); + state_->status_.store(FutureStatus::Ready, std::memory_order_release); + state_ = nullptr; + } + + template + typename std::enable_if::value, void>::type + set_value(const std::nullopt_t &value) = delete; + + template + typename std::enable_if::value, void>::type + set_value(std::nullopt_t &&value) = delete; + + template + typename std::enable_if::value, void>::type set_value() const { + assert(state_ && "set_value() can only be called once"); + state_->status_.store(FutureStatus::Ready, std::memory_order_release); + state_ = nullptr; + } + + /// Swap with another promise + void swap(promise &other) noexcept { + using std::swap; + swap(state_, other.state_); + } + +private: + explicit promise(typename future::state *state) + : state_(state) {} + + mutable typename future::state *state_; +}; + +}; // class JuliaTaskDispatcher + +thread_local SmallVector, JuliaTaskDispatcher *>> JuliaTaskDispatcher::TaskQueue; + +void JuliaTaskDispatcher::dispatch(std::unique_ptr T) { + TaskQueue.push_back(std::pair(std::move(T), this)); +} + +void JuliaTaskDispatcher::shutdown() { + // Keep processing until no tasks belonging to this dispatcher remain + while (true) { + // Check if any task belongs to this dispatcher + auto it = std::find_if( + TaskQueue.begin(), TaskQueue.end(), + [this](const auto &TaskPair) { return TaskPair.second == this; }); + + // If no tasks belonging to this dispatcher, we're done + if (it == TaskQueue.end()) + return; + + // Create a future/promise pair to wait for completion of this task + future taskFuture; + // Replace the task with a GenericNamedTask that wraps the original task + // with a notification of completion that this thread can work_until. + auto originalTask = std::move(it->first); + it->first = makeGenericNamedTask( + [originalTask = std::move(originalTask), + taskPromise = taskFuture.get_promise()]() { + originalTask->run(); + taskPromise.set_value(); + }, + "Shutdown task marker"); + + // Wait for the task to complete + taskFuture.get(*this); + } +} + +void JuliaTaskDispatcher::work_until(future_base &F) { + while (!F.ready()) { + // First, process any tasks in our local queue + // Process in LIFO order (most recently added first) to avoid deadlocks + // when tasks have dependencies on each other + while (!TaskQueue.empty()) { + { + auto TaskPair = std::move(TaskQueue.back()); + TaskQueue.pop_back(); + TaskPair.first->run(); + } + + // Notify any threads that might be waiting for work to complete + { + std::lock_guard Lock(DispatchMutex); + bool ShouldNotify = llvm::any_of( + WaitingFutures, [](future_base *F) { return F->ready(); }); + if (ShouldNotify) { + WaitingFutures.clear(); + WorkFinishedCV.notify_all(); + } + } + + // Check if our future is now ready + if (F.ready()) + return; + } + + // If we get here, our queue is empty but the future isn't ready + // We need to wait for other threads to finish work that should complete our + // future + { + std::unique_lock Lock(DispatchMutex); + WaitingFutures.push_back(&F); + WorkFinishedCV.wait(Lock, [&F]() { return F.ready(); }); + } + } +} + +} // End namespace + +namespace std { +template +void swap(::JuliaTaskDispatcher::promise &lhs, ::JuliaTaskDispatcher::promise &rhs) noexcept { + lhs.swap(rhs); +} +} // End namespace std + +// n.b. this actually is sometimes a safepoint +Expected +safelookup(ExecutionSession &ES, + const JITDylibSearchOrder &SearchOrder, + SymbolLookupSet Symbols, LookupKind K = LookupKind::Static, + SymbolState RequiredState = SymbolState::Ready, + RegisterDependenciesFunction RegisterDependencies = NoDependenciesToRegister) JL_NOTSAFEPOINT { + JuliaTaskDispatcher::future> PromisedFuture; + auto NotifyComplete = [PromisedResult = PromisedFuture.get_promise()](Expected R) { + PromisedResult.set_value(std::move(R)); + }; + ES.lookup(K, SearchOrder, std::move(Symbols), RequiredState, + std::move(NotifyComplete), RegisterDependencies); + return PromisedFuture.get(static_cast(ES.getExecutorProcessControl().getDispatcher())); +} + +Expected +safelookup(ExecutionSession &ES, + const JITDylibSearchOrder &SearchOrder, + SymbolStringPtr Name, + SymbolState RequiredState = SymbolState::Ready) JL_NOTSAFEPOINT { + SymbolLookupSet Names({Name}); + + if (auto ResultMap = safelookup(ES, SearchOrder, std::move(Names), LookupKind::Static, + RequiredState, NoDependenciesToRegister)) { + assert(ResultMap->size() == 1 && "Unexpected number of results"); + assert(ResultMap->count(Name) && "Missing result for symbol"); + return std::move(ResultMap->begin()->second); + } else + return ResultMap.takeError(); +} + +Expected +safelookup(ExecutionSession &ES, + ArrayRef SearchOrder, SymbolStringPtr Name, + SymbolState RequiredState = SymbolState::Ready) JL_NOTSAFEPOINT { + return safelookup(ES, makeJITDylibSearchOrder(SearchOrder), Name, RequiredState); +} + +Expected +safelookup(ExecutionSession &ES, + ArrayRef SearchOrder, StringRef Name, + SymbolState RequiredState = SymbolState::Ready) JL_NOTSAFEPOINT { + return safelookup(ES, SearchOrder, ES.intern(Name), RequiredState); +}