Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Perform synchronization on a worker thread #2025

Merged
merged 8 commits into from
Aug 12, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions lib/cudadrv/CUDAdrv.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,6 @@ include("graph.jl")

# global state (CUDA.jl's driver wrappers behave like CUDA's runtime library)
include("state.jl")

# support for concurrent programming
include("synchronization.jl")
16 changes: 2 additions & 14 deletions lib/cudadrv/context.jl
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ associated with the current task.
function synchronize(ctx::CuContext)
push!(CuContext, ctx)
try
nonblocking_synchronize()
device_synchronize()
finally
pop!(CuContext)
end
Expand All @@ -316,21 +316,9 @@ associated with the current task.
On the device, `device_synchronize` acts as a synchronization point for child grids in the
context of dynamic parallelism.
"""
# NOTE(review): diff artifact — the next two lines are the before/after sides of one
# change: the old one-line method forwarding to `nonblocking_synchronize`, and the new
# bare signature line that attaches the preceding docstring to a method presumably now
# defined in the newly-included synchronization.jl (see the CUDAdrv.jl hunk) — confirm.
device_synchronize() = nonblocking_synchronize()
device_synchronize()
# XXX: can we put the device docstring in dynamic_parallelism.jl?

# Synchronize the current context while staying friendly to the Julia scheduler:
# first drain the wait cooperatively via the legacy default stream, then issue the
# real blocking driver call. (Removed by this PR in favor of synchronization.jl.)
@inline function nonblocking_synchronize()
# perform as much of the sync as possible without blocking in CUDA.
# XXX: remove this using a yield callback, or by synchronizing on a dedicated thread?
nonblocking_synchronize(legacy_stream())

# even though the GPU should be idle now, CUDA hooks work to the actual API call,
# so still perform the blocking cuCtxSynchronize.
# see NVIDIA bug #3383169 for more details.
cuCtxSynchronize()

# surface any exceptions thrown by device code that completed during the sync
check_exceptions()
end


## cache config

Expand Down
31 changes: 1 addition & 30 deletions lib/cudadrv/events.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,36 +49,7 @@ record(e::CuEvent, stream::CuStream=stream()) =

Waits for an event to complete.
"""
# Wait for event `e` to complete, first spinning/yielding on the Julia side so other
# Julia tasks keep running, then issuing the blocking driver call.
function synchronize(e::CuEvent)
# perform as much of the sync as possible without blocking in CUDA.
# XXX: remove this using a yield callback, or by synchronizing on a dedicated thread?
nonblocking_synchronize(e)

# even though the GPU should be idle now, CUDA hooks work to the actual API call,
# so finish with the real cuEventSynchronize.
# see NVIDIA bug #3383169 for more details.
cuEventSynchronize(e)
end

# Cooperatively wait for event `e` by spinning a bounded number of times, checking
# completion after every iteration. The first iterations avoid the Julia scheduler
# entirely to keep the latency of short operations low; later ones yield so that
# other tasks can make progress. Returns without blocking in CUDA.
@inline function nonblocking_synchronize(e::CuEvent)
    # fast path: nothing to wait for
    isdone(e) && return

    for spin in 0:255
        if spin < 32
            # low-latency busy-wait, without entering the scheduler
            ccall(:jl_cpu_pause, Cvoid, ())
            # Temporary solution before we have gc transition support in codegen.
            ccall(:jl_gc_safepoint, Cvoid, ())
        else
            # after the initial burst, be polite to other tasks
            yield()
        end
        isdone(e) && return
    end

    return
end
synchronize(e::CuEvent)
maleadt marked this conversation as resolved.
Show resolved Hide resolved

"""
isdone(e::CuEvent)
Expand Down
2 changes: 1 addition & 1 deletion lib/cudadrv/state.jl
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ an array or a dictionary, use additional locks.
"""
# Per-device storage keyed by context; guarded by `lock` (see the docstring hunk above).
struct PerDevice{T}
lock::ReentrantLock
# NOTE(review): diff artifact — the next two lines are the old and new declarations of
# the same `values` field; the PR adds a second type parameter (`Nothing`) to
# `LazyInitialized`. Only one of the two exists on either side of the diff.
values::LazyInitialized{Vector{Union{Nothing,Tuple{CuContext,T}}}}
values::LazyInitialized{Vector{Union{Nothing,Tuple{CuContext,T}}},Nothing}
end

function PerDevice{T}() where {T}
Expand Down
70 changes: 1 addition & 69 deletions lib/cudadrv/stream.jl
Original file line number Diff line number Diff line change
Expand Up @@ -120,75 +120,7 @@ associated with the current Julia task.

See also: [`device_synchronize`](@ref)
"""
# Wait for `stream` to become idle: drain as much of the wait as possible on the
# Julia side without blocking in CUDA, then issue the real blocking driver call and
# re-throw any exceptions from device code. The `blocking` keyword is deprecated and
# only kept for backwards compatibility.
function synchronize(stream::CuStream=stream(); blocking=nothing)
    blocking === nothing ||
        Base.depwarn("the blocking keyword to synchronize() has been deprecated", :synchronize)

    # cooperative phase: spin/yield so other Julia tasks keep running
    # XXX: remove this using a yield callback, or by synchronizing on a dedicated stream?
    nonblocking_synchronize(stream)

    # even though the GPU should be idle now, CUDA hooks work to the actual API call
    # (see NVIDIA bug #3383169), so still perform the blocking cuStreamSynchronize
    cuStreamSynchronize(stream)

    check_exceptions()
end

# Wait for `stream` to become idle without blocking in CUDA: first busy-wait/yield
# from the calling task (cheap for short operations), then — for long-running work —
# sleep on a `Base.Event` notified by work launched onto the stream, with a polling
# watchdog in case that launched work never runs.
@inline function nonblocking_synchronize(stream::CuStream)
# fast path
isdone(stream) && return

# minimize latency of short operations by busy-waiting,
# initially without even yielding to other tasks
spins = 0
while spins < 256
if spins < 32
ccall(:jl_cpu_pause, Cvoid, ())
# Temporary solution before we have gc transition support in codegen.
ccall(:jl_gc_safepoint, Cvoid, ())
else
yield()
end
isdone(stream) && return
spins += 1
end

# minimize CPU usage of long-running kernels by waiting for an event signalled by CUDA
event = Base.Event()
# enqueue work on `stream` that notifies `event` once all preceding work is done
# (presumably a host call — confirm against `launch` in execution.jl)
launch(; stream) do
notify(event)
end
# if an error occurs, the callback may never fire, so use a timer to detect such cases
dev = device()
timer = Timer(0; interval=1)
Base.@sync begin
# watchdog task: poll the stream once per second; stops when the stream is ready
# (or reports an error), or when the companion task closes the timer
Threads.@spawn try
device!(dev)
while true
try
Base.wait(timer)
catch err
# a closed timer raises EOFError: the event fired and we are done
err isa EOFError && break
rethrow()
end
if unsafe_cuStreamQuery(stream) != ERROR_NOT_READY
break
end
end
finally
# always release the companion task, even if polling itself failed
notify(event)
end

# companion task: once the event fires, tear down the watchdog's timer
Threads.@spawn begin
Base.wait(event)
close(timer)
end
end

return
end
synchronize(stream::CuStream=stream())

"""
priority_range()
Expand Down
Loading