Skip to content

Commit

Permalink
partr: Fix deadlock (#34807)
Browse files Browse the repository at this point in the history
When there is no work to do, the first thread to be idle will attempt to
run the event loop once, waiting for any notifications (which will usually
create new work). However, there is an interesting corner case where a
notification arrives, but no work was scheduled. That doesn't usually happen,
but there are a few situations where it does:

1) Somebody used a libuv primitive outside of julia, so the callback
   doesn't schedule any julia work.
2) Another thread forbily interrupted the waiting thread because it wants to
   take over the event loop for various reasons
3) On Windows, we occasionally get spurious wake ups of the event loop.

The existing code in partr assumed that we were in situation 2, i.e. that
there was another thread waiting to take over the event loop, so it released
the event loop and simply put the current thread to sleep in the expectation
that another thread will pick it up. However, if we instead are in one of the
other two conditions, there may not be another thread there to pick up the event
loop. Thus, with no thread owning the event loop, julia will stop responding to
events and effectively deadlock. Since both 1 and 3 are rare, and we don't actually
enter the event loop until there was no work for 4 milliseconds (which is fairly rare),
this condition rarely happens, but is occasionally observable on Windows, where it
caused #34769. To test that this fix works, we manually create situation 1 in the
test by creating an idle callback, which will prevent the event loop from blocking,
but only schedules julia work after it's been called 100 times. This reproduces
the observed failure from the issue and is fixed by this PR.

Fixes #34769

Co-authored-by: Jeff Bezanson <jeff@juliacomputing.com>
Co-authored-by: Jameson Nash <jameson@juliacomputing.com>
  • Loading branch information
3 people committed Feb 19, 2020
1 parent 4188b2c commit f36edc2
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/partr.c
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,9 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q)
// TODO: this relinquishes responsibility for all event
// to the last thread to do an explicit operation,
// which may starve other threads of critical work
if (jl_atomic_load(&jl_uv_n_waiters) == 0) {
continue;
}
}
if (!_threadedregion && active && ptls->tid == 0) {
// thread 0 is the only thread permitted to run the event loop
Expand Down
117 changes: 117 additions & 0 deletions test/threads.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,120 @@ if Sys.islinux() && Sys.CPU_THREADS > 1 && Sys.which("taskset") !== nothing
@test endswith(run_with_affinity("1"), "2")
@test endswith(run_with_affinity("0,1"), "3")
end

# issue #34769
function idle_callback(handle)
idle = @Base.handle_as handle UvTestIdle
if idle.active
idle.count += 1
if idle.count == 1
# We want to hit the case where we're allowing
# the thread to go to sleep, which only happens
# after some default amount of time (DEFAULT_THREAD_SLEEP_THRESHOLD)
# so spend that amount of time here.
Libc.systemsleep(0.004)
elseif idle.count >= 10
lock(idle.cond)
try
notify(idle.cond, true)
finally
unlock(idle.cond)
end
idle.active = false
end
end
nothing
end

mutable struct UvTestIdle
handle::Ptr{Cvoid}
cond::Base.ThreadSynchronizer
isopen::Bool
active::Bool
count::Int

function UvTestIdle()
this = new(Libc.malloc(Base._sizeof_uv_idle), Base.ThreadSynchronizer(), true, false, 0)
Base.iolock_begin()
Base.associate_julia_struct(this.handle, this)
err = ccall(:uv_idle_init, Cint, (Ptr{Cvoid}, Ptr{Cvoid}),
Base.eventloop(), this.handle)
if err != 0
Libc.free(this.handle)
this.handle = C_NULL
throw(_UVError("uv_idle_init", err))
end
err = ccall(:uv_idle_start, Cint, (Ptr{Cvoid}, Ptr{Cvoid}),
this.handle, @cfunction(idle_callback, Cvoid, (Ptr{Cvoid},)))
if err != 0
Libc.free(this.handle)
this.handle = C_NULL
throw(_UVError("uv_idle_start", err))
end
finalizer(Base.uvfinalize, this)
Base.iolock_end()
return this
end
end
Base.unsafe_convert(::Type{Ptr{Cvoid}}, idle::UvTestIdle) = idle.handle

function Base.uvfinalize(t::UvTestIdle)
Base.iolock_begin()
Base.lock(t.cond)
try
if t.handle != C_NULL
Base.disassociate_julia_struct(t.handle) # not going to call the usual close hooks
if t.isopen
t.isopen = false
ccall(:jl_close_uv, Cvoid, (Ptr{Cvoid},), t)
end
t.handle = C_NULL
notify(t.cond, false)
end
finally
unlock(t.cond)
end
Base.iolock_end()
nothing
end

function Base.wait(idle::UvTestIdle)
Base.iolock_begin()
Base.preserve_handle(idle)
Base.lock(idle.cond)
try
idle.active = true
wait(idle.cond)
finally
Base.unlock(idle.cond)
Base.unpreserve_handle(idle)
Base.iolock_end()
end
end

# Spawn another process as a watchdog. If this test fails, it'll unrecoverably
# hang in the event loop. Another process needs to kill it
cmd = """
@async (Base.wait_readnb(stdin, 1); exit())
sleep(100)
isopen(stdin) || exit()
println(stderr, "ERROR: Killing threads test due to watchdog expiry")
ccall(:uv_kill, Cint, (Cint, Cint), $(getpid()), Base.SIGTERM)
"""
proc = open(pipeline(`$(Base.julia_cmd()) -e $cmd`; stderr=stderr); write=true)

let idle=UvTestIdle()
wait(idle)
end

using Base.Threads
@threads for i = 1:1
let idle=UvTestIdle()
wait(idle)
end
end

@test process_running(proc)

# We don't need the watchdog anymore
close(proc.in)

0 comments on commit f36edc2

Please sign in to comment.