Refactor and docs
tisztamo committed Mar 24, 2021
1 parent d4ca3c0 commit 3e575b8
Showing 8 changed files with 114 additions and 26 deletions.
2 changes: 1 addition & 1 deletion docs/make.jl
@@ -7,7 +7,7 @@ makedocs(
format = Documenter.HTML(; prettyurls = get(ENV, "CI", nothing) == "true"),
authors = "Krisztián Schäffer",
sitename = "Catwalk.jl",
pages = Any["index.md", "usage.md"],
pages = Any["index.md", "usage.md", "howitworks.md", "tuning.md"],
# strict = true,
# clean = true,
# checkdocs = :exports,
81 changes: 81 additions & 0 deletions docs/src/howitworks.md
@@ -0,0 +1,81 @@
# How it works

The `@jit` macro turns the outer function into a `@generated` one,
so that it can be recompiled with re-optimized source code at will.

The optimized code looks like this:

```julia
x = get_some_x(i)
if x isa FrequentType1
calc_with_x(x) # Fast type-stable route
elseif x isa FrequentType2
calc_with_x(x) # Fast type-stable route
    # ... more branches for other stabilized types ...
else
calc_with_x(x) # Fallback to the dynamically dispatched call
end
```
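The `@generated` mechanism behind this can be sketched as follows. This is a simplified illustration, not Catwalk's actual implementation: `Ctx`, `dispatch_sketch`, `fast_path` and `fallback` are names made up for the example. The generator receives the stabilized types through the context's type parameter and emits one `isa` branch per type:

```julia
# Simplified sketch (assumed names, not Catwalk's real API): the list of
# stabilized types travels in the *type* of the context argument, so the
# generator emits a fresh branch chain whenever that type changes.
struct Ctx{TFixtypes} end

fast_path(x) = x + 1   # stand-in for the fast, type-stable route
fallback(x) = x + 1    # stand-in for the dynamically dispatched call

@generated function dispatch_sketch(x, ::Ctx{TFixtypes}) where {TFixtypes}
    expr = :(fallback(x))                 # innermost: the dynamic fallback
    for T in reverse(collect(TFixtypes.parameters))
        expr = :(x isa $T ? fast_path(x::$T) : $expr)
    end
    return expr
end
```

Calling `dispatch_sketch(1, Ctx{Tuple{Int,Float64}}())` then runs the type-stable `Int` branch; a context with a different type parameter triggers regeneration with a different branch chain.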

The *type* of the `jitctx` argument drives the compilation process:
it encodes everything needed to generate the code, namely the list of
stabilized types and the type of the profiler that runs in the
current batch:

```julia
struct CallCtx{TProfiler, TFixtypes}
profiler::TProfiler
end
```

Two profilers are implemented at the time of writing:

```julia
struct NoProfiler <: Profiler end

struct FullProfiler <: Profiler
typefreqs::DataTypeFrequencies
end
```

The `FullProfiler` collects statistics from every call.
Logging a call is faster than a dynamic dispatch, but running
the profiler in every batch would still eat most of the cake, so it
is used sparsely, with 1% probability per batch by default (it is
always active during the first two batches).
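That batch-level decision can be sketched with a hypothetical helper (`should_profile` is not Catwalk's API, just an illustration of the sampling scheme):

```julia
# Hypothetical sketch: always profile the first two batches, then sample
# later batches with a small probability (1% by default).
should_profile(batch::Integer; p = 0.01) = batch <= 2 || rand() < p
```

With this scheme the profiling overhead is paid on roughly one batch in a hundred, while the warm-up batches still yield full type-frequency statistics.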

The last missing part is the explorer, which automatically
connects the JIT compiler with the `@jit`-ed functions that
run under its supervision.

Also, a single JIT compiler can handle multiple call sites,
so the `jitctx` in reality is not a single `CallCtx` as described
earlier, but a `NamedTuple` of them, plus an explorer:

```julia
struct OptimizerCtx{TCallCtxs, TExplorer}
callctxs::TCallCtxs
explorer::TExplorer
end
```

The explorer holds its id in its type, because exploration happens
during compilation, when only its type is available.

```julia
struct BasicExplorer{TOptimizerId} <: Explorer end
```

Here Catwalk, just like many other meta-heavy Julia packages,
violates the rule that a `@generated` function is not "allowed"
to access mutable global state. It logs the call site to a global
dict, keyed by its id, from which the JIT compiler can read it out.
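A minimal sketch of that registration pattern (names like `CALLSITES`, `BasicExplorerSketch` and `explore` are made up for the illustration; Catwalk's real bookkeeping is richer):

```julia
# Sketch: the generator body runs during compilation and records the call
# site in a global registry, which the JIT step can later read back.
const CALLSITES = Dict{Int, Vector{Symbol}}()

struct BasicExplorerSketch{TOptimizerId} end

@generated function explore(::BasicExplorerSketch{TOptimizerId}) where {TOptimizerId}
    push!(get!(CALLSITES, TOptimizerId, Symbol[]), :demo_call_site)
    return :(nothing)   # the emitted runtime code does nothing
end
```

The generator runs once per specialization, so the registry entry appears as a side effect of the first call, and the runtime cost afterwards is nil.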

It seems impossible to push information back from the compilation process
without breaking this rule, and pushing the exploration into the tight loop
is not feasible.

The alternative is to configure the compiler with the call sites
and `NoExplorer` manually. TODO: link to tuning, when documented.
8 changes: 5 additions & 3 deletions docs/src/index.md
@@ -1,7 +1,8 @@
# Catwalk.jl Intro

Catwalk.jl can speed up long-running Julia processes by minimizing the
overhead of dynamic dispatch.
overhead of dynamic dispatch. It is a JIT compiler that continuously
re-optimizes dispatch code based on data collected at runtime.

![Speedup demo](assets/catwalk-speeddemo.gif)

@@ -31,6 +32,7 @@ Catwalk.jl assumes the followings:

## Alternatives

- [FunctionWrappers.jl](https://github.com/yuyichao/FunctionWrappers.jl) will get you type stability for a fixed (?) cost.
- [ManualDispatch.jl](https://github.com/jlapeyre/ManualDispatch.jl) can serve you better in less dynamic cases, when it is feasible to list the dynamically dispatched types in the source code.
- In even simpler cases using unions instead of a type hierarchy may allow the Julia compiler to "split the union". See for example [List performance improvement by Union-typed tail](https://github.com/JuliaCollections/DataStructures.jl/pull/682/commits/4742228d42ae441f9837e5825feedeb1c013bd99) in DataStructures.jl.
- [FunctionWrappers.jl](https://github.com/yuyichao/FunctionWrappers.jl) will give you type stability for a fixed (?) cost. Its use case is different, but if you are wrestling with type instabilities, take a look at it first.
- [FunctionWranglers.jl](https://github.com/tisztamo/FunctionWranglers.jl) allows fast, inlined execution of functions provided in an array - for that use case it is a better choice than Catwalk.jl.
2 changes: 2 additions & 0 deletions docs/src/tuning.md
@@ -0,0 +1,2 @@
# Configuration & tuning

25 changes: 14 additions & 11 deletions docs/src/usage.jl
@@ -2,8 +2,10 @@

# Let's say you have a long-running calculation, organized into batches:

const NUM_BATCHES = 1000

function runbatches()
for batchidx = 1:1000
for batchidx = 1:NUM_BATCHES
hotloop()
## Log progress, etc.
end
@@ -12,8 +14,10 @@ end
# The hot loop calls the type-unstable function `get_some_x()` and
# passes its result to a relatively cheap calculation `calc_with_x()`.

const NUM_ITERS_PER_BATCH = 1_000_000

function hotloop()
for i = 1:1_000_000
for i = 1:NUM_ITERS_PER_BATCH
x = get_some_x(i)
calc_with_x(x)
end
@@ -34,8 +38,7 @@ end
# * Sometimes it is not feasible to type-stabilize `get_some_x`. *
#
# Catwalk.jl is here for those cases.
# You mark `hotloop`, the outer function (the one which has the
# dynamic call site in its body)
# You mark `hotloop`, the outer function
# with the `@jit` macro and provide the name of the dynamically
# dispatched function
# and the argument to operate on (the API will hopefully
@@ -45,21 +48,21 @@ end
using Catwalk

@jit calc_with_x x function hotloop_jit(jitctx)
for i = 1:1_000_000
for i = 1:NUM_ITERS_PER_BATCH
x = get_some_x(i)
calc_with_x(x)
end
end

# The Catwalk optimizer will provide you the `jitctx` context which you have to pass
# to the jit-ed function manually.
# Also, every batch needs a bit more housekeeping to drive the Catwalk optimizer:
# Also, every batch needs a bit of housekeeping to drive the Catwalk optimizer:

function runbatches_jit()
opt = Catwalk.RuntimeOptimizer()
for batch = 1:1000
Catwalk.step!(opt)
hotloop_jit(Catwalk.ctx(opt))
jit = Catwalk.JIT()
for batch = 1:NUM_BATCHES
Catwalk.step!(jit)
hotloop_jit(Catwalk.ctx(jit))
end
end

@@ -85,4 +88,4 @@ jit_result == result[] || error("JIT must be a no-op!")
# Please note that the speedup depends on the portion of the runtime spent in dynamic dispatch,
# which is most likely smaller in your case than in this contrived example.
#
# You can find this example under [docs/src/usage.jl](https://github.com/tisztamo/Catwalk.jl/docs/src/usage.jl) in the repo.
18 changes: 9 additions & 9 deletions src/Catwalk.jl
@@ -5,7 +5,7 @@ An optimizing Just In Time compiler written in Julia.
"""
module Catwalk

export @jit, RuntimeOptimizer
export @jit, JIT

include("typelist.jl")
include("frequencies.jl")
@@ -45,34 +45,34 @@ end
fixtypes(::Type{CallCtx{TProfiler, TFixtypes}}) where {TProfiler, TFixtypes} = TFixtypes
profiler(ctx::CallCtx) = ctx.profiler

struct RuntimeOptimizer
struct JIT
id::Int
callboosts::Vector{CallBoost}
explorer::Explorer
end
function RuntimeOptimizer(callboosts...; explorertype=BasicExplorer)
function JIT(callboosts...; explorertype=BasicExplorer)
id = rand(Int)
opt = RuntimeOptimizer(id, [], explorertype(id))
opt = JIT(id, [], explorertype(id))
for boost in callboosts
add_boost!(opt, boost)
end
return opt
end

optimizerid(opt::RuntimeOptimizer) = opt.id
optimizerid(opt::JIT) = opt.id

function add_boost!(opt::RuntimeOptimizer, boost)
function add_boost!(opt::JIT, boost)
push!(opt.callboosts, boost)
register_callsite!(opt.id, boost.fnsym)
end

function step!(opt::RuntimeOptimizer)
function step!(opt::JIT)
update_callboosts(opt)
step!.(opt.callboosts)
step!(opt.explorer)
end

function update_callboosts(opt::RuntimeOptimizer)
function update_callboosts(opt::JIT)
currentsyms = Set(map(b -> b.fnsym, opt.callboosts))
for newsite in setdiff(get_freshcallsites!(opt.id), currentsyms)
add_boost!(opt, CallBoost(newsite))
@@ -86,7 +86,7 @@ struct OptimizerCtx{TCallCtxs, TExplorer}
OptimizerCtx(optimizerid, callctxs, explorer) = new{typeof(callctxs), typeof(explorer)}(callctxs, explorer)
end

function ctx(opt::RuntimeOptimizer)
function ctx(opt::JIT)
callctxs = (;map(boost -> (boost.fnsym, ctx(boost)), opt.callboosts)...)
return OptimizerCtx(optimizerid(opt), callctxs, opt.explorer)
end
2 changes: 1 addition & 1 deletion test/scheduling.jl
@@ -120,7 +120,7 @@ push!(scheduler.msgqueue, Msg{Ping}(Addr(42), Ping()))
@testset "ping-pong" begin
#msgcallboost = CallBoost(:step_kern1!, profilestrategy = SparseProfile(1.0))
#actorcallboost = CallBoost(:step_kern!, profilestrategy = SparseProfile(1.0))
optimizer = RuntimeOptimizer()
optimizer = JIT()
#Catwalk.add_boost!(optimizer, msgcallboost)
#Catwalk.add_boost!(optimizer, actorcallboost)
normaltime = 0
2 changes: 1 addition & 1 deletion test/typesweep.jl
@@ -62,7 +62,7 @@

@testset "Type sweep" begin
println("Measuring performance in a type-sweep scenario")
optimizer = RuntimeOptimizer()
optimizer = JIT()
Catwalk.add_boost!(
optimizer,
Catwalk.CallBoost(
