Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add disk cache infrastructure for Julia 1.11 #557

Merged
merged 11 commits into from
Jun 28, 2024
8 changes: 7 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,23 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Scratch = "6c6a2e73-6563-6170-7368-637461726353"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

[compat]
ExprTools = "0.1"
InteractiveUtils = "1"
LLVM = "7.1"
Libdl = "1"
Logging = "1"
UUIDs = "1"
LLVM = "7.1"
Preferences = "1"
Scratch = "1"
Serialization = "1"
TOML = "1"
TimerOutputs = "0.5"
julia = "1.8"
2 changes: 2 additions & 0 deletions src/GPUCompiler.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ using ExprTools: splitdef, combinedef

using Libdl

using Serialization
using Scratch: @get_scratch!
using Preferences

const CC = Core.Compiler
using Core: MethodInstance, CodeInstance, CodeInfo
Expand Down
133 changes: 128 additions & 5 deletions src/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,54 @@ end

## cached compilation

### Disk cache notes
vchuravy marked this conversation as resolved.
Show resolved Hide resolved
# Julia uses package images (pkgimg) to cache both the result of inference,
# and the result of native code emissions. Up until Julia v1.11 neither the
# inferred nor the native code of foreign abstract interpreters was cached
# across sessions. Julia v1.11 allows for caching of inference results across
# sessions as long as those inference results are created during precompilation.
#
# Julia cache hierarchy is roughly as follows:
# Function (name of a thing)
# -> Method (particular piece of code to dispatch to with a signature)
# -> MethodInstance (A particular Method + particular signature)
# -> CodeInstance (A MethodInstance compiled for a world)
#
# In order to cache code across sessions we need to insert CodeInstance(owner=GPUCompilerCacheToken)
# into the internal cache. Once we have done so we know that a particular CodeInstance is unique in
# the system. (During pkgimg loading conflicts will be resolved).
#
# When a pkgimg is loaded we check its validity. This means checking that all dependencies are the same,
# that the pkgimg was created for the right set of compiler flags, and that all source code that was used
# to create this pkgimg is the same. When a CodeInstance is inside a pkgimg we can extend the chain of
# validity even to GPU code; we cannot verify a "runtime" CodeInstance in the same way.
#
# Therefore when we see a compilation request for a CodeInstance that is originating from a pkgimg
# we can use it as part of the hash for the on-disk cache. (see `cache_file`)

"""
    disk_cache()

Return `true` when caching to disk is enabled, i.e. when the `"disk_cache"`
preference parses to `true`. The preference defaults to `"false"` when unset.
"""
function disk_cache()
    # The preference is stored as a string (see the default), so it has to be parsed.
    setting = @load_preference("disk_cache", "false")
    return parse(Bool, setting)
end
vchuravy marked this conversation as resolved.
Show resolved Hide resolved

"""
    enable_cache!(state::Bool=true)

Enable (`state = true`, the default) or disable (`state = false`) the GPUCompiler
disk cache in the current environment by storing the `"disk_cache"` preference.
You will need to restart your Julia environment for it to take effect.

!!! note
    The cache functionality requires Julia 1.11
"""
function enable_cache!(state::Bool=true)
    # Stored as a string; `disk_cache()` parses it back into a `Bool`.
    preference = "disk_cache" => string(state)
    return @set_preferences!(preference)
end

# Directory holding the on-disk cache: the scratch space named "cache".
function cache_path()
    return @get_scratch!("cache")
end
# Remove the whole on-disk cache directory; `force=true` makes a missing directory a no-op.
function clear_disk_cache!()
    dir = cache_path()
    return rm(dir; recursive=true, force=true)
end

# Global reentrant lock.
# NOTE(review): its users are not visible in this excerpt; presumably it serializes
# access to the compilation cache in `cached_compilation` — confirm.
const cache_lock = ReentrantLock()

"""
Expand Down Expand Up @@ -108,6 +156,37 @@ function cached_compilation(cache::AbstractDict{<:Any,V},
return obj::V
end

"""
    cache_file(ci::CodeInstance, cfg::CompilerConfig)

Return the path of the on-disk cache entry for `ci` compiled with `cfg`, or `nothing`
when no disk cache entry should be used.

`nothing` is returned when `Base.object_build_id` is available and reports no build id
for `ci`, and when the upper 64 bits of GPUCompiler's own build id are all ones.

The file name is derived from a hash over the object id of `ci`, the lower 64 bits of
its build id (when available) and `cfg`. Entries live in a sub-directory of
`cache_path()` named after GPUCompiler's build id.
"""
@noinline function cache_file(ci::CodeInstance, cfg::CompilerConfig)
    key = hash(Base.objectid(ci))
    @static if isdefined(Base, :object_build_id)
        ci_buildid = Base.object_build_id(ci)
        # CI is from a runtime compilation, not worth caching on disk
        ci_buildid === nothing && return nothing
        # The upper 64bit are a checksum, unavailable during precompilation
        key = hash(ci_buildid % UInt64, key)
    end
    key = hash(cfg, key)

    own_buildid = Base.module_build_id(@__MODULE__)
    upper_bits = (own_buildid >> 64) % UInt64
    if upper_bits == 0xffffffffffffffff
        # Don't cache during precompilation of GPUCompiler
        return nothing
    end

    # bifurcate the cache by build id of GPUCompiler
    bucket = string(own_buildid)
    filename = string(key, ".jls")
    return joinpath(cache_path(), bucket, filename)
end

"""
    OnDiskCacheEntry

Record that is serialized to and deserialized from an on-disk cache file.

# Fields
- `src`: a `Type` identifying what was compiled. Originally this was the
  `MethodInstance`, but upon deserialization those were not uniqued.
- `cfg`: the `CompilerConfig` the entry was produced with.
- `asm`: the cached compiler output (untyped).
"""
struct OnDiskCacheEntry
    src::Type
    cfg::CompilerConfig
    asm::Any
end

@noinline function actual_compilation(cache::AbstractDict, src::MethodInstance, world::UInt,
cfg::CompilerConfig, compiler::Function, linker::Function)
job = CompilerJob(src, cfg, world)
Expand All @@ -117,20 +196,64 @@ end
ci = ci_cache_lookup(ci_cache(job), src, world, world)::Union{Nothing,CodeInstance}
if ci !== nothing
key = (ci, cfg)
if haskey(cache, key)
obj = cache[key]
end
obj = get(cache, key, nothing)
end

# slow path: compile and link
if obj === nothing || compile_hook[] !== nothing
# TODO: consider loading the assembly from an on-disk cache here
asm = compiler(job)
asm = nothing
path = nothing
ondisk_hit = false
@static if VERSION >= v"1.11.0-"
# Don't try to hit the disk cache if we are only here because of a *compile* hook
# TODO:
# - Should we hit the disk cache if Base.generating_output()?
# - Should we allow backends to opt out?
if ci !== nothing && obj === nothing && disk_cache()
path = cache_file(ci, cfg)
@debug "Looking for on-disk cache" job path
if path !== nothing && isfile(path)
ondisk_hit = true
try
@debug "Loading compiled kernel" job path
# The MI we deserialize here didn't get uniqued...
entry = deserialize(path)::OnDiskCacheEntry
if entry.src == src.specTypes && entry.cfg == cfg
asm = entry.asm
else
@show entry.src == src.specTypes
@show entry.cfg == cfg
@warn "Cache missmatch" src.specTypes cfg entry.src entry.cfg
end
catch ex
@warn "Failed to load compiled kernel" job path exception=(ex, catch_backtrace())
end
end
end
end

if asm === nothing || compile_hook[] !== nothing
# Run the compiler if nothing was loaded from disk, or if a compile hook needs to observe it.
asm = compiler(job)
end
if obj !== nothing
# we got here because of a *compile* hook; don't bother linking
return obj
end
vchuravy marked this conversation as resolved.
Show resolved Hide resolved

@static if VERSION >= v"1.11.0-"
if !ondisk_hit && path !== nothing && disk_cache()
@debug "Writing out on-disk cache" job path
tmppath, io = mktemp(;cleanup=false)
entry = OnDiskCacheEntry(src.specTypes, cfg, asm)
serialize(io, entry)
close(io)
# atomic move
mkpath(dirname(path))
Base.rename(tmppath, path, force=true)
end
end

obj = linker(job, asm)

if ci === nothing
Expand Down
21 changes: 21 additions & 0 deletions src/jlgen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,27 @@ macro in_world(world, ex)
end
end

"""
    precompile(job::CompilerJob)

Compile the GPUCompiler job. In particular this will run inference using the foreign
abstract interpreter, populating the interpreter's code cache for `job.source` in
world `job.world`.

Return `true` if the code cache contains an entry for the job afterwards (including
when one was already present), and `false` otherwise. Throw an error if `job.world`
lies outside the world range in which the job's method is valid.
"""
function Base.precompile(@nospecialize(job::CompilerJob))
    world = job.world
    in_range = job.source.def.primary_world <= world <= job.source.def.deleted_world
    in_range || error("Cannot compile $(job.source) for world $(job.world); method is only valid in worlds $(job.source.def.primary_world) to $(job.source.def.deleted_world)")

    interp = get_interpreter(job)
    code_cache = CC.code_cache(interp)

    # Already cached for this world: nothing to do.
    ci_cache_lookup(code_cache, job.source, world, world) === nothing || return true

    # populate the cache, then report whether an entry actually showed up
    ci_cache_populate(interp, code_cache, job.source, world, world)
    return ci_cache_lookup(code_cache, job.source, world, world) !== nothing
end

function compile_method_instance(@nospecialize(job::CompilerJob))
if job.source.def.primary_world > job.world || job.world > job.source.def.deleted_world
error("Cannot compile $(job.source) for world $(job.world); method is only valid in worlds $(job.source.def.primary_world) to $(job.source.def.deleted_world)")
Expand Down
31 changes: 21 additions & 10 deletions test/native_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -549,20 +549,21 @@ precompile_test_harness("Inference caching") do load_path
import GPUCompiler
using PrecompileTools

function kernel()
function kernel(A, x)
A[1] = x
return
end

let
job, _ = NativeCompiler.create_job(kernel, ())
GPUCompiler.code_typed(job)
job, _ = NativeCompiler.create_job(kernel, (Vector{Int}, Int))
precompile(job)
end

# identity is foreign
@setup_workload begin
job, _ = NativeCompiler.create_job(identity, (Int,))
@compile_workload begin
GPUCompiler.code_typed(job)
precompile(job)
end
end
end) |> string)
Expand All @@ -578,20 +579,30 @@ precompile_test_harness("Inference caching") do load_path
job, _ = NativeCompiler.create_job(identity, (Int,))
GPUCompiler.ci_cache_token(job)
end
ci = isdefined(identity_mi, :cache) ? identity_mi.cache : nothing
while ci !== nothing
@test ci.owner !== token
ci = isdefined(ci, :next) ? ci.next : nothing
end
@test !check_presence(identity_mi, token)

using InferenceCaching

# Check that kernel survived
kernel_mi = GPUCompiler.methodinstance(typeof(InferenceCaching.kernel), Tuple{})
kernel_mi = GPUCompiler.methodinstance(typeof(InferenceCaching.kernel), Tuple{Vector{Int}, Int})
@test check_presence(kernel_mi, token)

# check that identity survived
@test check_presence(identity_mi, token)

GPUCompiler.enable_cache!()
@test GPUCompiler.disk_cache() == true

job, _ = NativeCompiler.create_job(InferenceCaching.kernel, (Vector{Int}, Int))
@assert job.source == kernel_mi
ci = GPUCompiler.ci_cache_lookup(GPUCompiler.ci_cache(job), job.source, job.world, job.world)
@assert ci !== nothing
@assert ci.inferred !== nothing
path = GPUCompiler.cache_file(ci, job.config)
@test path !== nothing
@test !ispath(path)
NativeCompiler.cached_execution(InferenceCaching.kernel, (Vector{Int}, Int))
@test ispath(path)
end
end

Expand Down
18 changes: 18 additions & 0 deletions test/native_testsetup.jl
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,22 @@ function code_execution(@nospecialize(func), @nospecialize(types); kwargs...)
end
end

# Cache passed to `GPUCompiler.cached_compilation` by `cached_execution`.
const runtime_cache = Dict{Any, Any}()

# Compile `job` to assembly (`:asm`) with validation disabled, inside a `JuliaContext`.
function compiler(job)
    emit_asm = ctx -> GPUCompiler.compile(:asm, job, validate=false)
    return JuliaContext(emit_asm)
end

# Trivial linker for the tests: the "linked" object is the compiler output itself.
linker(job, asm) = asm

# simulates cached codegen: builds a job for `func`/`types` and runs it through
# `GPUCompiler.cached_compilation` using `runtime_cache`, `compiler` and `linker`.
function cached_execution(@nospecialize(func), @nospecialize(types); kwargs...)
    # `create_job` also returns a second value, which is not needed here.
    job, _ = create_job(func, types; kwargs...)
    return GPUCompiler.cached_compilation(
        runtime_cache, job.source, job.config, compiler, linker)
end

end
4 changes: 2 additions & 2 deletions test/ptx_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -339,14 +339,14 @@ precompile_test_harness("Inference caching") do load_path

let
job, _ = PTXCompiler.create_job(kernel, ())
GPUCompiler.code_typed(job)
precompile(job)
end

# identity is foreign
@setup_workload begin
job, _ = PTXCompiler.create_job(identity, (Int,))
@compile_workload begin
GPUCompiler.code_typed(job)
precompile(job)
end
end
end) |> string)
Expand Down