Flaky `test_multiple_workers_2` #6755

gjoseph92 · 2022-07-20T19:32:25Z

___________________________ test_multiple_workers_2 ____________________________
addr = 'tcp://127.0.0.1:49727', timeout = 5, deserialize = True
handshake_overrides = None
connection_args = {'extra_conn_args': {}, 'require_encryption': False, 'ssl_context': None}
scheme = 'tcp', loc = '127.0.0.1:49727'
backend = <distributed.comm.tcp.TCPBackend object at 0x7faf8a7a4ac0>
connector = <distributed.comm.tcp.TCPConnector object at 0x7faf93020c70>
comm = <TCP (closed)  local=tcp://127.0.0.1:49747 remote=tcp://127.0.0.1:49727>
time_left = <function connect.<locals>.time_left at 0x7faf93015550>
backoff_base = 0.01
async def connect(
        addr, timeout=None, deserialize=True, handshake_overrides=None, **connection_args
    ):
"""
    Connect to the given address (a URI such as ``tcp://127.0.0.1:1234``)
    and yield a ``Comm`` object.  If the connection attempt fails, it is
    retried until the *timeout* is expired.
    """
if timeout is None:
            timeout = dask.config.get("distributed.comm.timeouts.connect")
        timeout = parse_timedelta(timeout, default="seconds")
        scheme, loc = parse_address(addr)
        backend = registry.get_backend(scheme)
        connector = backend.get_connector()
        comm = None
        start = time()
def time_left():
            deadline = start + timeout
return max(0, deadline - time())
        backoff_base = 0.01
        attempt = 0
# Prefer multiple small attempts than one long attempt. This should protect
# primarily from DNS race conditions
# gh3104, gh4176, gh4167
        intermediate_cap = timeout / 5
        active_exception = None
while time_left() > 0:
try:
                comm = await asyncio.wait_for(
                    connector.connect(loc, deserialize=deserialize, **connection_args),
                    timeout=min(intermediate_cap, time_left()),
                )
break
except FatalCommClosedError:
raise
# Note: CommClosed inherits from OSError
except (asyncio.TimeoutError, OSError) as exc:
                active_exception = exc
# As descibed above, the intermediate timeout is used to distributed
# initial, bulk connect attempts homogeneously. In particular with
# the jitter upon retries we should not be worred about overloading
# any more DNS servers
                intermediate_cap = timeout
# FullJitter see https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/
                upper_cap = min(time_left(), backoff_base * (2**attempt))
                backoff = random.uniform(0, upper_cap)
                attempt += 1
                logger.debug(
"Could not connect to %s, waiting for %s before retrying", loc, backoff
                )
await asyncio.sleep(backoff)
else:
raise OSError(
f"Timed out trying to connect to {addr} after {timeout} s"
            ) from active_exception
        local_info = {
            **comm.handshake_info(),
            **(handshake_overrides or {}),
        }
try:
# This would be better, but connections leak if worker is closed quickly
# write, handshake = await asyncio.gather(comm.write(local_info), comm.read())
>           handshake = await asyncio.wait_for(comm.read(), time_left())
distributed/comm/core.py:328: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
fut = <Task cancelled name='Task-398' coro=<TCP.read() done, defined at /Users/runner/work/distributed/distributed/distributed/comm/tcp.py:214>>
timeout = 0.07304096221923828
async def wait_for(fut, timeout, *, loop=None):
"""Wait for the single Future or coroutine to complete, with timeout.
    Coroutine will be wrapped in Task.
    Returns result of the Future or coroutine.  When a timeout occurs,
    it cancels the task and raises TimeoutError.  To avoid the task
    cancellation, wrap it in shield().
    If the wait is cancelled, the task is also cancelled.
    This function is a coroutine.
    """
if loop is None:
            loop = events.get_running_loop()
else:
            warnings.warn("The loop argument is deprecated since Python 3.8, "
"and scheduled for removal in Python 3.10.",
DeprecationWarning, stacklevel=2)
if timeout is None:
return await fut
if timeout <= 0:
            fut = ensure_future(fut, loop=loop)
if fut.done():
return fut.result()
await _cancel_and_wait(fut, loop=loop)
try:
                fut.result()
except exceptions.CancelledError as exc:
raise exceptions.TimeoutError() from exc
else:
raise exceptions.TimeoutError()
        waiter = loop.create_future()
        timeout_handle = loop.call_later(timeout, _release_waiter, waiter)
        cb = functools.partial(_release_waiter, waiter)
        fut = ensure_future(fut, loop=loop)
        fut.add_done_callback(cb)
try:
# wait until the future completes or the timeout
try:
await waiter
except exceptions.CancelledError:
if fut.done():
return fut.result()
else:
                    fut.remove_done_callback(cb)
# We must ensure that the task is not running
# after wait_for() returns.
# See https://bugs.python.org/issue32751
await _cancel_and_wait(fut, loop=loop)
raise
if fut.done():
return fut.result()
else:
                fut.remove_done_callback(cb)
# We must ensure that the task is not running
# after wait_for() returns.
# See https://bugs.python.org/issue32751
await _cancel_and_wait(fut, loop=loop)
>               raise exceptions.TimeoutError()
E               asyncio.exceptions.TimeoutError
../../../miniconda3/envs/dask-distributed/lib/python3.8/asyncio/tasks.py:501: TimeoutError
The above exception was the direct cause of the following exception:
loop = <tornado.platform.asyncio.AsyncIOLoop object at 0x7faf931294c0>
def test_multiple_workers_2(loop):
        text = """
def dask_setup(worker):
    worker.foo = 'setup'
"""
        port = open_port()
with popen(
            ["dask-scheduler", "--no-dashboard", "--host", f"127.0.0.1:{port}"]
        ) as s:
with popen(
                [
"dask-worker",
f"localhost:{port}",
"--no-dashboard",
"--preload",
                    text,
"--preload-nanny",
                    text,
                ]
            ) as a:
>               with Client(f"127.0.0.1:{port}", loop=loop) as c:
distributed/cli/tests/test_dask_scheduler.py:511: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
distributed/client.py:940: in __init__
self.start(timeout=timeout)
distributed/client.py:1098: in start
    sync(self.loop, self._start, **kwargs)
distributed/utils.py:405: in sync
raise exc.with_traceback(tb)
distributed/utils.py:378: in f
    result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.8/site-packages/tornado/gen.py:762: in run
    value = future.result()
distributed/client.py:1178: in _start
await self._ensure_connected(timeout=timeout)
distributed/client.py:1241: in _ensure_connected
    comm = await connect(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
addr = 'tcp://127.0.0.1:49727', timeout = 5, deserialize = True
handshake_overrides = None
connection_args = {'extra_conn_args': {}, 'require_encryption': False, 'ssl_context': None}
scheme = 'tcp', loc = '127.0.0.1:49727'
backend = <distributed.comm.tcp.TCPBackend object at 0x7faf8a7a4ac0>
connector = <distributed.comm.tcp.TCPConnector object at 0x7faf93020c70>
comm = <TCP (closed)  local=tcp://127.0.0.1:49747 remote=tcp://127.0.0.1:49727>
time_left = <function connect.<locals>.time_left at 0x7faf93015550>
backoff_base = 0.01
async def connect(
        addr, timeout=None, deserialize=True, handshake_overrides=None, **connection_args
    ):
"""
    Connect to the given address (a URI such as ``tcp://127.0.0.1:1234``)
    and yield a ``Comm`` object.  If the connection attempt fails, it is
    retried until the *timeout* is expired.
    """
if timeout is None:
            timeout = dask.config.get("distributed.comm.timeouts.connect")
        timeout = parse_timedelta(timeout, default="seconds")
        scheme, loc = parse_address(addr)
        backend = registry.get_backend(scheme)
        connector = backend.get_connector()
        comm = None
        start = time()
def time_left():
            deadline = start + timeout
return max(0, deadline - time())
        backoff_base = 0.01
        attempt = 0
# Prefer multiple small attempts than one long attempt. This should protect
# primarily from DNS race conditions
# gh3104, gh4176, gh4167
        intermediate_cap = timeout / 5
        active_exception = None
while time_left() > 0:
try:
                comm = await asyncio.wait_for(
                    connector.connect(loc, deserialize=deserialize, **connection_args),
                    timeout=min(intermediate_cap, time_left()),
                )
break
except FatalCommClosedError:
raise
# Note: CommClosed inherits from OSError
except (asyncio.TimeoutError, OSError) as exc:
                active_exception = exc
# As descibed above, the intermediate timeout is used to distributed
# initial, bulk connect attempts homogeneously. In particular with
# the jitter upon retries we should not be worred about overloading
# any more DNS servers
                intermediate_cap = timeout
# FullJitter see https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/
                upper_cap = min(time_left(), backoff_base * (2**attempt))
                backoff = random.uniform(0, upper_cap)
                attempt += 1
                logger.debug(
"Could not connect to %s, waiting for %s before retrying", loc, backoff
                )
await asyncio.sleep(backoff)
else:
raise OSError(
f"Timed out trying to connect to {addr} after {timeout} s"
            ) from active_exception
        local_info = {
            **comm.handshake_info(),
            **(handshake_overrides or {}),
        }
try:
# This would be better, but connections leak if worker is closed quickly
# write, handshake = await asyncio.gather(comm.write(local_info), comm.read())
            handshake = await asyncio.wait_for(comm.read(), time_left())
await asyncio.wait_for(comm.write(local_info), time_left())
except Exception as exc:
with suppress(Exception):
await comm.close()
>           raise OSError(
f"Timed out during handshake while connecting to {addr} after {timeout} s"
            ) from exc
E           OSError: Timed out during handshake while connecting to tcp://127.0.0.1:49727 after 5 s
distributed/comm/core.py:333: OSError
----------------------------- Captured stderr call -----------------------------
2022-07-18 18:28:14,163 - distributed.scheduler - INFO - -----------------------------------------------
2022-07-18 18:28:14,167 - distributed.utils - INFO - Reload module tmpix1h276_ from .py file
2022-07-18 18:28:14,171 - distributed.preloading - INFO - Import preload module: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/tmpix1h276_.py
2022-07-18 18:28:14,176 - distributed.http.proxy - INFO - To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
2022-07-18 18:28:14,177 - distributed.preloading - INFO - Creating preload: 
def dask_setup(worker):
    worker.foo = 'setup'
2022-07-18 18:28:14,179 - distributed.utils - INFO - Reload module tmp5b_yct6b from .py file
2022-07-18 18:28:14,184 - distributed.scheduler - INFO - State start
2022-07-18 18:28:14,187 - distributed.preloading - INFO - Import preload module: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/tmp5b_yct6b.py
2022-07-18 18:28:14,196 - distributed.scheduler - INFO - -----------------------------------------------
2022-07-18 18:28:14,196 - distributed.scheduler - INFO - Clear task state
2022-07-18 18:28:14,197 - distributed.scheduler - INFO -   Scheduler at:     tcp://127.0.0.1:49727
2022-07-18 18:28:14,197 - distributed.scheduler - INFO -   dashboard at:            127.0.0.1:8787
2022-07-18 18:28:14,204 - distributed.preloading - INFO - Run preload setup: 
def dask_setup(worker):
    worker.foo = 'setup'
2022-07-18 18:28:14,211 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:49740'
2022-07-18 18:28:15,944 - distributed.preloading - INFO - Creating preload: 
def dask_setup(worker):
    worker.foo = 'setup'
2022-07-18 18:28:15,950 - distributed.utils - INFO - Reload module tmp81ty01p5 from .py file
2022-07-18 18:28:15,954 - distributed.preloading - INFO - Import preload module: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/tmp81ty01p5.py
2022-07-18 18:28:15,990 - distributed.preloading - INFO - Run preload setup: 
def dask_setup(worker):
    worker.foo = 'setup'
2022-07-18 18:28:15,990 - distributed.worker - INFO -       Start worker at:      tcp://127.0.0.1:49744
2022-07-18 18:28:15,991 - distributed.worker - INFO -          Listening to:      tcp://127.0.0.1:49744
2022-07-18 18:28:15,991 - distributed.worker - INFO -          dashboard at:            127.0.0.1:49745
2022-07-18 18:28:15,991 - distributed.worker - INFO - Waiting to connect to:      tcp://localhost:49727
2022-07-18 18:28:15,991 - distributed.worker - INFO - -------------------------------------------------
2022-07-18 18:28:15,991 - distributed.worker - INFO -               Threads:                          3
2022-07-18 18:28:15,991 - distributed.worker - INFO -                Memory:                  14.00 GiB
2022-07-18 18:28:15,991 - distributed.worker - INFO -       Local Directory: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/dask-worker-space/worker-n_jp7jms
2022-07-18 18:28:15,991 - distributed.worker - INFO - -------------------------------------------------
2022-07-18 18:28:16,621 - distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:49744', status: init, memory: 0, processing: 0>
2022-07-18 18:28:16,926 - distributed._signals - INFO - Received signal SIGINT (2)
2022-07-18 18:28:16,927 - distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:49740'.
2022-07-18 18:28:16,928 - distributed.nanny - INFO - Nanny asking worker to close
2022-07-18 18:28:16,929 - distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:49744
2022-07-18 18:28:16,930 - distributed.worker - INFO - Closed worker has not yet started: Status.init
2022-07-18 18:28:17,263 - distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:49744
2022-07-18 18:28:17,263 - distributed.core - INFO - Starting established connection
2022-07-18 18:28:17,264 - distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:49744', status: init, memory: 0, processing: 0>
2022-07-18 18:28:17,264 - distributed.core - INFO - Removing comms to tcp://127.0.0.1:49744
2022-07-18 18:28:17,264 - distributed.scheduler - INFO - Lost all workers
2022-07-18 18:28:17,267 - distributed.comm.tcp - INFO - Connection from tcp://127.0.0.1:49747 closed before handshake completed
2022-07-18 18:28:17,301 - distributed.dask_worker - INFO - End worker
2022-07-18 18:28:17,489 - distributed._signals - INFO - Received signal SIGINT (2)
2022-07-18 18:28:17,490 - distributed.scheduler - INFO - Scheduler closing...
2022-07-18 18:28:17,491 - distributed.scheduler - INFO - Scheduler closing all comms
2022-07-18 18:28:17,491 - distributed.scheduler - INFO - Stopped scheduler at 'tcp://127.0.0.1:49727'
2022-07-18 18:28:17,492 - distributed.scheduler - INFO - End scheduler

https://github.com/dask/distributed/runs/7395221485?check_suite_focus=true#step:11:2023

The text was updated successfully, but these errors were encountered:

gjoseph92 added the flaky test Intermittent failures on CI. label Jul 20, 2022

gjoseph92 mentioned this issue Jul 20, 2022

Flaky tests: OSError: Timed out trying to connect to tcp://127.0.0.1:8786 after 5 s #6731

Closed

gjoseph92 mentioned this issue Aug 3, 2022

Only set 5s connect timeout in gen_cluster tests #6822

Merged

1 task

fjetter closed this as completed in #6822 Aug 5, 2022

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Flaky `test_multiple_workers_2` #6755

Flaky `test_multiple_workers_2` #6755

gjoseph92 commented Jul 20, 2022 •

edited

Loading

Flaky test_multiple_workers_2 #6755

Flaky test_multiple_workers_2 #6755

Comments

gjoseph92 commented Jul 20, 2022 • edited Loading

Flaky `test_multiple_workers_2` #6755

Flaky `test_multiple_workers_2` #6755

gjoseph92 commented Jul 20, 2022 •

edited

Loading