diff --git a/docs/make.jl b/docs/make.jl index 175ca04..edeebdb 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -7,7 +7,7 @@ makedocs( format = Documenter.HTML(; prettyurls = get(ENV, "CI", nothing) == "true"), authors = "Krisztián Schäffer", sitename = "Catwalk.jl", - pages = Any["index.md", "usage.md"], + pages = Any["index.md", "usage.md", "howitworks.md", "tuning.md"], # strict = true, # clean = true, # checkdocs = :exports, diff --git a/docs/src/howitworks.md b/docs/src/howitworks.md new file mode 100644 index 0000000..ebe6b89 --- /dev/null +++ b/docs/src/howitworks.md @@ -0,0 +1,81 @@ +# How it works + +The `@jit` macro turns the outer function into a `@generated` one, +so that we can recompile with reoptimized source code at will. + +The optimized code looks like this: + +```julia + x = get_some_x(i) + if x isa FrequentType1 + calc_with_x(x) # Fast type-stable route + elseif x isa FrequentType2 + calc_with_x(x) # Fast type-stable route + . + . + . + else + calc_with_x(x) # Fallback to the dynamically dispatched call + end +``` + +The *type* of the `jitctx` argument drives the compilation process: it +encodes everything needed to generate the code, namely the list of +stabilized types and the type of the profiler that runs in the +current batch: + +```julia +struct CallCtx{TProfiler, TFixtypes} + profiler::TProfiler +end +``` + +Two profilers are implemented at the time of writing: + +```julia +struct NoProfiler <: Profiler end + +struct FullProfiler <: Profiler + typefreqs::DataTypeFrequencies +end +``` + +The `FullProfiler` collects statistics from every call. +It logs a call faster than a dynamic dispatch, but running +it in every batch would still eat most of the cake, so it +is sparsely used, with 1% probability by default (It is +always active during the first two batches). + +The last missing part is the explorer, which automatically +connects the JIT compiler with the `@jit`-ed functions that +run under its supervision. 
+ +Also, a single JIT compiler can handle multiple call sites, +so the `jitctx` in reality is not a single `CallCtx` as described +earlier, but a `NamedTuple` of them, plus an explorer: + +```julia +struct OptimizerCtx{TCallCtxs, TExplorer} + callctxs::TCallCtxs + explorer::TExplorer +end +``` + +The explorer holds its id in its type, because exploration happens +during compilation, when only its type is available. + +```julia +struct BasicExplorer{TOptimizerId} <: Explorer end +``` + +Here Catwalk - just like many other meta-heavy Julia packages - +violates the rule that a `@generated` function is not "allowed" +to access mutable global state. It logs the call site to a global +dict, keyed with its id, from where the JIT compiler can read it out. + +It seems impossible to push back information from the compilation process +without breaking this rule, and pushing the exploration to the tight loop +is not feasible. + +The alternative is to configure the compiler with the call sites +and `NoExplorer` manually. TODO: link to tuning, when documented. diff --git a/docs/src/index.md b/docs/src/index.md index 7efaafa..ab0eb67 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,7 +1,8 @@ # Catwalk.jl Intro Catwalk.jl can speed up long-running Julia processes by minimizing the -overhead of dynamic dispatch. +overhead of dynamic dispatch. It is a JIT compiler that continuously +re-optimizes dispatch code based on data collected at runtime. ![Speedup demo](assets/catwalk-speeddemo.gif) @@ -31,6 +32,7 @@ Catwalk.jl assumes the followings: ## Alternatives -- [FunctionWrappers.jl](https://github.com/yuyichao/FunctionWrappers.jl) will get you type stability for a fixed (?) cost. - [ManualDispatch.jl](https://github.com/jlapeyre/ManualDispatch.jl) can serve you better in less dynamic cases, when it is feasible to list the dynamically dispatched types in the source code. 
-- In even simpler cases using unions instead of a type hierarchy may allow the Julia compiler to "split the union". See for example [List performance improvent by Union-typed tail](https://github.com/JuliaCollections/DataStructures.jl/pull/682/commits/4742228d42ae441f9837e5825feedeb1c013bd99) in DataStructures.jl. \ No newline at end of file +- In even simpler cases using unions instead of a type hierarchy may allow the Julia compiler to "split the union". See for example [List performance improvement by Union-typed tail](https://github.com/JuliaCollections/DataStructures.jl/pull/682/commits/4742228d42ae441f9837e5825feedeb1c013bd99) in DataStructures.jl. +- [FunctionWrappers.jl](https://github.com/yuyichao/FunctionWrappers.jl) will give you type stability for a fixed (?) cost. Its use case is different, but if you are wrestling with type instabilities, take a look at it first. +- [FunctionWranglers.jl](https://github.com/tisztamo/FunctionWranglers.jl) allows fast, inlined execution of functions provided in an array - for that use case it is a better choice than Catwalk.jl. diff --git a/docs/src/tuning.md b/docs/src/tuning.md new file mode 100644 index 0000000..a76e1a4 --- /dev/null +++ b/docs/src/tuning.md @@ -0,0 +1,2 @@ +# Configuration & tuning + diff --git a/docs/src/usage.jl b/docs/src/usage.jl index a120d85..366c922 100644 --- a/docs/src/usage.jl +++ b/docs/src/usage.jl @@ -2,8 +2,10 @@ # Let's say you have a long-running calculation, organized into batches: +const NUM_BATCHES = 1000 + function runbatches() - for batchidx = 1:1000 + for batchidx = 1:NUM_BATCHES hotloop() ## Log progress, etc. end @@ -12,8 +14,10 @@ end # The hot loop calls the type-unstable function `get_some_x()` and # passes its result to a relatively cheap calculation `calc_with_x()`. 
+const NUM_ITERS_PER_BATCH = 1_000_000 + function hotloop() - for i = 1:1_000_000 + for i = 1:NUM_ITERS_PER_BATCH x = get_some_x(i) calc_with_x(x) end @@ -34,8 +38,7 @@ end # * Sometimes it is not feasible to type-stabilize `get_some_x`. * # # Catwalk.jl is here for those cases. -# You mark `hotloop`, the outer function (the one which has the -# dynamic call site in its body) +# You mark `hotloop`, the outer function # with the `@jit` macro and provide the name of the dynamically # dispatched function # and the argument to operate on (the API will hopefully @@ -45,7 +48,7 @@ end using Catwalk @jit calc_with_x x function hotloop_jit(jitctx) - for i = 1:1_000_000 + for i = 1:NUM_ITERS_PER_BATCH x = get_some_x(i) calc_with_x(x) end @@ -53,13 +56,13 @@ end # The Catwalk optimizer will provide you the `jitctx` context which you have to pass # to the jit-ed function manually. -# Also, every batch needs a bit more housekeeping to drive the Catwalk optimizer: +# Also, every batch needs a bit of housekeeping to drive the Catwalk optimizer: function runbatches_jit() - opt = Catwalk.RuntimeOptimizer() - for batch = 1:1000 - Catwalk.step!(opt) - hotloop_jit(Catwalk.ctx(opt)) + jit = Catwalk.JIT() + for batch = 1:NUM_BATCHES + Catwalk.step!(jit) + hotloop_jit(Catwalk.ctx(jit)) end end @@ -85,4 +88,4 @@ jit_result == result[] || error("JIT must be a no-op!") # Please note that the speedup depends on the portion of the runtime spent in dynamic dispatch, # which is most likely smaller in your case than in this contrived example. # -# You can find this example under [docs/src/usage.jl](https://github.com/tisztamo/Catwalk.jl/docs/src/usage.jl) in the repo. \ No newline at end of file +# You can find this example under [docs/src/usage.jl](https://github.com/tisztamo/Catwalk.jl/docs/src/usage.jl) in the repo. 
diff --git a/src/Catwalk.jl b/src/Catwalk.jl index a1043e1..d675e95 100644 --- a/src/Catwalk.jl +++ b/src/Catwalk.jl @@ -5,7 +5,7 @@ An optimizing Just In Time compiler written in Julia. """ module Catwalk -export @jit, RuntimeOptimizer +export @jit, JIT include("typelist.jl") include("frequencies.jl") @@ -45,34 +45,34 @@ end fixtypes(::Type{CallCtx{TProfiler, TFixtypes}}) where {TProfiler, TFixtypes} = TFixtypes profiler(ctx::CallCtx) = ctx.profiler -struct RuntimeOptimizer +struct JIT id::Int callboosts::Vector{CallBoost} explorer::Explorer end -function RuntimeOptimizer(callboosts...; explorertype=BasicExplorer) +function JIT(callboosts...; explorertype=BasicExplorer) id = rand(Int) - opt = RuntimeOptimizer(id, [], explorertype(id)) + opt = JIT(id, [], explorertype(id)) for boost in callboosts add_boost!(opt, boost) end return opt end -optimizerid(opt::RuntimeOptimizer) = opt.id +optimizerid(opt::JIT) = opt.id -function add_boost!(opt::RuntimeOptimizer, boost) +function add_boost!(opt::JIT, boost) push!(opt.callboosts, boost) register_callsite!(opt.id, boost.fnsym) end -function step!(opt::RuntimeOptimizer) +function step!(opt::JIT) update_callboosts(opt) step!.(opt.callboosts) step!(opt.explorer) end -function update_callboosts(opt::RuntimeOptimizer) +function update_callboosts(opt::JIT) currentsyms = Set(map(b -> b.fnsym, opt.callboosts)) for newsite in setdiff(get_freshcallsites!(opt.id), currentsyms) add_boost!(opt, CallBoost(newsite)) @@ -86,7 +86,7 @@ struct OptimizerCtx{TCallCtxs, TExplorer} OptimizerCtx(optimizerid, callctxs, explorer) = new{typeof(callctxs), typeof(explorer)}(callctxs, explorer) end -function ctx(opt::RuntimeOptimizer) +function ctx(opt::JIT) callctxs = (;map(boost -> (boost.fnsym, ctx(boost)), opt.callboosts)...) 
return OptimizerCtx(optimizerid(opt), callctxs, opt.explorer) end diff --git a/test/scheduling.jl b/test/scheduling.jl index 476035a..07b776d 100644 --- a/test/scheduling.jl +++ b/test/scheduling.jl @@ -120,7 +120,7 @@ push!(scheduler.msgqueue, Msg{Ping}(Addr(42), Ping())) @testset "ping-pong" begin #msgcallboost = CallBoost(:step_kern1!, profilestrategy = SparseProfile(1.0)) #actorcallboost = CallBoost(:step_kern!, profilestrategy = SparseProfile(1.0)) - optimizer = RuntimeOptimizer() + optimizer = JIT() #Catwalk.add_boost!(optimizer, msgcallboost) #Catwalk.add_boost!(optimizer, actorcallboost) normaltime = 0 diff --git a/test/typesweep.jl b/test/typesweep.jl index c29e48d..b2919d0 100644 --- a/test/typesweep.jl +++ b/test/typesweep.jl @@ -62,7 +62,7 @@ end @testset "Type sweep" begin println("Measuring performance in a type-sweep scenario") - optimizer = RuntimeOptimizer() + optimizer = JIT() Catwalk.add_boost!( optimizer, Catwalk.CallBoost(