Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Perform synchronization on a worker thread #2025

Merged
merged 8 commits into from
Aug 12, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions lib/cudadrv/CUDAdrv.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,6 @@ include("graph.jl")

# global state (CUDA.jl's driver wrappers behave like CUDA's runtime library)
include("state.jl")

# support for concurrent programming
include("synchronization.jl")
16 changes: 2 additions & 14 deletions lib/cudadrv/context.jl
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ associated with the current task.
function synchronize(ctx::CuContext)
push!(CuContext, ctx)
try
nonblocking_synchronize()
device_synchronize()
finally
pop!(CuContext)
end
Expand All @@ -316,21 +316,9 @@ associated with the current task.
On the device, `device_synchronize` acts as a synchronization point for child grids in the
context of dynamic parallelism.
"""
# NOTE(review): diff artifact — the next two lines are the before/after sides of one
# change: the old one-line method forwarding to `nonblocking_synchronize`, and the new
# bare signature line that attaches the preceding docstring to a method presumably now
# defined in the newly-included synchronization.jl (see the CUDAdrv.jl hunk) — confirm.
device_synchronize() = nonblocking_synchronize()
device_synchronize()
# XXX: can we put the device docstring in dynamic_parallelism.jl?

# Synchronize the current context while staying friendly to the Julia scheduler:
# first drain the wait cooperatively via the legacy default stream, then issue the
# real blocking driver call. (Removed by this PR in favor of synchronization.jl.)
@inline function nonblocking_synchronize()
# perform as much of the sync as possible without blocking in CUDA.
# XXX: remove this using a yield callback, or by synchronizing on a dedicated thread?
nonblocking_synchronize(legacy_stream())

# even though the GPU should be idle now, CUDA hooks work to the actual API call,
# so still perform the blocking cuCtxSynchronize.
# see NVIDIA bug #3383169 for more details.
cuCtxSynchronize()

# surface any exceptions thrown by device code that completed during the sync
check_exceptions()
end


## cache config

Expand Down
31 changes: 1 addition & 30 deletions lib/cudadrv/events.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,36 +49,7 @@ record(e::CuEvent, stream::CuStream=stream()) =

Waits for an event to complete.
"""
# Wait for event `e` to complete, first spinning/yielding on the Julia side so other
# Julia tasks keep running, then issuing the blocking driver call.
function synchronize(e::CuEvent)
# perform as much of the sync as possible without blocking in CUDA.
# XXX: remove this using a yield callback, or by synchronizing on a dedicated thread?
nonblocking_synchronize(e)

# even though the GPU should be idle now, CUDA hooks work to the actual API call,
# so finish with the real cuEventSynchronize.
# see NVIDIA bug #3383169 for more details.
cuEventSynchronize(e)
end

# Cooperatively wait for event `e` by spinning a bounded number of times, checking
# completion after every iteration. The first iterations avoid the Julia scheduler
# entirely to keep the latency of short operations low; later ones yield so that
# other tasks can make progress. Returns without blocking in CUDA.
@inline function nonblocking_synchronize(e::CuEvent)
    # fast path: nothing to wait for
    isdone(e) && return

    for spin in 0:255
        if spin < 32
            # low-latency busy-wait, without entering the scheduler
            ccall(:jl_cpu_pause, Cvoid, ())
            # Temporary solution before we have gc transition support in codegen.
            ccall(:jl_gc_safepoint, Cvoid, ())
        else
            # after the initial burst, be polite to other tasks
            yield()
        end
        isdone(e) && return
    end

    return
end
synchronize(e::CuEvent)
maleadt marked this conversation as resolved.
Show resolved Hide resolved

"""
isdone(e::CuEvent)
Expand Down
2 changes: 1 addition & 1 deletion lib/cudadrv/state.jl
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ an array or a dictionary, use additional locks.
"""
# Per-device storage keyed by context; guarded by `lock` (see the docstring hunk above).
struct PerDevice{T}
lock::ReentrantLock
# NOTE(review): diff artifact — the next two lines are the old and new declarations of
# the same `values` field; the PR adds a second type parameter (`Nothing`) to
# `LazyInitialized`. Only one of the two exists on either side of the diff.
values::LazyInitialized{Vector{Union{Nothing,Tuple{CuContext,T}}}}
values::LazyInitialized{Vector{Union{Nothing,Tuple{CuContext,T}}},Nothing}
end

function PerDevice{T}() where {T}
Expand Down
70 changes: 1 addition & 69 deletions lib/cudadrv/stream.jl
Original file line number Diff line number Diff line change
Expand Up @@ -120,75 +120,7 @@ associated with the current Julia task.

See also: [`device_synchronize`](@ref)
"""
# Wait for `stream` to become idle: drain as much of the wait as possible on the
# Julia side without blocking in CUDA, then issue the real blocking driver call and
# re-throw any exceptions from device code. The `blocking` keyword is deprecated and
# only kept for backwards compatibility.
function synchronize(stream::CuStream=stream(); blocking=nothing)
    blocking === nothing ||
        Base.depwarn("the blocking keyword to synchronize() has been deprecated", :synchronize)

    # cooperative phase: spin/yield so other Julia tasks keep running
    # XXX: remove this using a yield callback, or by synchronizing on a dedicated stream?
    nonblocking_synchronize(stream)

    # even though the GPU should be idle now, CUDA hooks work to the actual API call
    # (see NVIDIA bug #3383169), so still perform the blocking cuStreamSynchronize
    cuStreamSynchronize(stream)

    check_exceptions()
end

# Wait for `stream` to become idle without blocking in CUDA: first busy-wait/yield
# from the calling task (cheap for short operations), then — for long-running work —
# sleep on a `Base.Event` notified by work launched onto the stream, with a polling
# watchdog in case that launched work never runs.
@inline function nonblocking_synchronize(stream::CuStream)
# fast path
isdone(stream) && return

# minimize latency of short operations by busy-waiting,
# initially without even yielding to other tasks
spins = 0
while spins < 256
if spins < 32
ccall(:jl_cpu_pause, Cvoid, ())
# Temporary solution before we have gc transition support in codegen.
ccall(:jl_gc_safepoint, Cvoid, ())
else
yield()
end
isdone(stream) && return
spins += 1
end

# minimize CPU usage of long-running kernels by waiting for an event signalled by CUDA
event = Base.Event()
# enqueue work on `stream` that notifies `event` once all preceding work is done
# (presumably a host call — confirm against `launch` in execution.jl)
launch(; stream) do
notify(event)
end
# if an error occurs, the callback may never fire, so use a timer to detect such cases
dev = device()
timer = Timer(0; interval=1)
Base.@sync begin
# watchdog task: poll the stream once per second; stops when the stream is ready
# (or reports an error), or when the companion task closes the timer
Threads.@spawn try
device!(dev)
while true
try
Base.wait(timer)
catch err
# a closed timer raises EOFError: the event fired and we are done
err isa EOFError && break
rethrow()
end
if unsafe_cuStreamQuery(stream) != ERROR_NOT_READY
break
end
end
finally
# always release the companion task, even if polling itself failed
notify(event)
end

# companion task: once the event fires, tear down the watchdog's timer
Threads.@spawn begin
Base.wait(event)
close(timer)
end
end

return
end
synchronize(stream::CuStream=stream())

"""
priority_range()
Expand Down
Loading