Skip to content

Commit

Permalink
Reinstate --death-timeout CLI option (#1140)
Browse files Browse the repository at this point in the history
Add back the `--death-timeout` option that was removed in #563, along with tests that verify it works as expected.

Closes #1017

Authors:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)

URL: #1140
  • Loading branch information
charlesbluca authored Mar 13, 2023
1 parent cdd4c0b commit ec2de78
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 0 deletions.
6 changes: 6 additions & 0 deletions dask_cuda/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,12 @@ def cuda():
help="""Module that should be loaded by each worker process like ``"foo.bar"`` or
``"/path/to/foo.py"``.""",
)
@click.option(
"--death-timeout",
type=str,
default=None,
help="Seconds to wait for a scheduler before closing",
)
@click.option(
"--dashboard-prefix",
type=str,
Expand Down
21 changes: 21 additions & 0 deletions dask_cuda/tests/test_dask_cuda_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,3 +431,24 @@ def test_worker_fraction_limits(loop): # noqa: F811
ret["[plugin] RMMSetup"]["maximum_pool_size"]
== (device_total_memory * 0.3) // 256 * 256
)


@patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
def test_worker_timeout():
    """Check that ``dask cuda worker`` shuts down when ``--death-timeout`` expires.

    The worker is launched with a one second death timeout and pointed at the
    scheduler address ``192.168.1.100:7777`` (presumably unreachable in the
    test environment -- confirm). The captured stderr must report the nanny
    closing, and the process must exit with status 0.
    """
    command = [
        "dask",
        "cuda",
        "worker",
        "192.168.1.100:7777",
        "--death-timeout",
        "1",
    ]
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        encoding="utf8",
    )

    # Log lines are matched case-insensitively, so normalise stderr once.
    stderr_lower = completed.stderr.lower()
    for fragment in ("closing nanny at", "reason: nanny-close"):
        assert fragment in stderr_lower
    assert completed.returncode == 0
10 changes: 10 additions & 0 deletions dask_cuda/tests/test_local_cuda_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,3 +440,13 @@ def test_print_cluster_config(capsys):
assert "ucx" in captured.out
assert "1 B" in captured.out
assert "[plugin]" in captured.out


def test_death_timeout_raises():
    """Expect ``LocalCUDACluster`` to raise a timeout for a tiny ``death_timeout``.

    With ``death_timeout=1e-10`` the cluster context is expected to raise
    ``asyncio.exceptions.TimeoutError``.
    """
    cluster_kwargs = {
        "silence_logs": False,
        "death_timeout": 1e-10,
        "dashboard_address": ":0",
    }
    expect_timeout = pytest.raises(asyncio.exceptions.TimeoutError)
    # The cluster is constructed only after the ``raises`` context is entered,
    # so an error raised while creating or entering it is captured.
    with expect_timeout, LocalCUDACluster(**cluster_kwargs):
        pass

0 comments on commit ec2de78

Please sign in to comment.