Skip to content

Commit

Permalink
add some missing timeouts in Distributed (#34502)
Browse files Browse the repository at this point in the history
fixes #34486
  • Loading branch information
JeffBezanson authored and KristofferC committed Apr 11, 2020
1 parent c2defcf commit 0b91311
Showing 1 changed file with 12 additions and 0 deletions.
12 changes: 12 additions & 0 deletions stdlib/Distributed/src/cluster.jl
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,7 @@ end
function create_worker(manager, wconfig)
# only node 1 can add new nodes, since nobody else has the full list of address:port
@assert LPROC.id == 1
timeout = worker_timeout()

# initiate a connect. Does not wait for connection completion in case of TCP.
w = Worker()
Expand Down Expand Up @@ -633,9 +634,14 @@ function create_worker(manager, wconfig)
(notnothing(x.config.ident) in something(wconfig.connect_idents, []))

wlist = filter(filterfunc, PGRP.workers)
waittime = 0
while wconfig.connect_idents !== nothing &&
length(wlist) < length(wconfig.connect_idents)
if waittime >= timeout
error("peer workers did not connect within $timeout seconds")
end
sleep(1.0)
waittime += 1
wlist = filter(filterfunc, PGRP.workers)
end

Expand All @@ -655,7 +661,13 @@ function create_worker(manager, wconfig)
send_msg_now(w, MsgHeader(RRID(0,0), ntfy_oid), join_message)

@async manage(w.manager, w.id, w.config, :register)
# wait for rr_ntfy_join with timeout
timedout = false
@async (sleep($timeout); timedout = true; put!(rr_ntfy_join, 1))
wait(rr_ntfy_join)
if timedout
error("worker did not connect within $timeout seconds")
end
lock(client_refs) do
delete!(PGRP.refs, ntfy_oid)
end
Expand Down

0 comments on commit 0b91311

Please sign in to comment.