From ec2de7868e191d092f7215ed4bca9f2453b78e9a Mon Sep 17 00:00:00 2001
From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com>
Date: Mon, 13 Mar 2023 17:08:02 -0400
Subject: [PATCH] Reinstate `--death-timeout` CLI option (#1140)

Add back in the `--death-timeout` option removed in #563, along with some tests to verify it's working as expected.

Closes #1017

Authors:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)

URL: https://github.com/rapidsai/dask-cuda/pull/1140
---
Notes (placed after the `---` marker, so not part of the commit message):
  * dask_cuda/cli.py: declares a `--death-timeout` click option (type str,
    default None) immediately before the existing `--dashboard-prefix` option.
    Only the option declaration is added in this hunk; how the value is
    consumed is not visible here.
  * dask_cuda/tests/test_dask_cuda_worker.py: adds `test_worker_timeout`, which
    runs `dask cuda worker 192.168.1.100:7777 --death-timeout 1` in a
    subprocess with CUDA_VISIBLE_DEVICES=0 and asserts that lower-cased stderr
    contains "closing nanny at" and "reason: nanny-close" and that the exit
    code is 0.
  * NOTE(review): line breaks and context-line indentation in this copy were
    reconstructed from a whitespace-collapsed paste -- confirm the context
    lines against the tree before applying.

 dask_cuda/cli.py                           |  6 ++++++
 dask_cuda/tests/test_dask_cuda_worker.py   | 21 +++++++++++++++++++++
 dask_cuda/tests/test_local_cuda_cluster.py | 10 ++++++++++
 3 files changed, 37 insertions(+)

diff --git a/dask_cuda/cli.py b/dask_cuda/cli.py
index 5a6e3db07..128da2078 100644
--- a/dask_cuda/cli.py
+++ b/dask_cuda/cli.py
@@ -243,6 +243,12 @@ def cuda():
     help="""Module that should be loaded by each worker process like ``"foo.bar"`` or
     ``"/path/to/foo.py"``.""",
 )
+@click.option(
+    "--death-timeout",
+    type=str,
+    default=None,
+    help="Seconds to wait for a scheduler before closing",
+)
 @click.option(
     "--dashboard-prefix",
     type=str,
diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py
index 9f5d82d9d..7a6207c06 100644
--- a/dask_cuda/tests/test_dask_cuda_worker.py
+++ b/dask_cuda/tests/test_dask_cuda_worker.py
@@ -431,3 +431,24 @@ def test_worker_fraction_limits(loop):  # noqa: F811
                     ret["[plugin] RMMSetup"]["maximum_pool_size"]
                     == (device_total_memory * 0.3) // 256 * 256
                 )
+
+
+@patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
+def test_worker_timeout():
+    ret = subprocess.run(
+        [
+            "dask",
+            "cuda",
+            "worker",
+            "192.168.1.100:7777",  # NOTE(review): presumably no scheduler here
+            "--death-timeout",
+            "1",  # value for --death-timeout
+        ],
+        text=True,
+        encoding="utf8",
+        capture_output=True,
+    )
+
+    assert "closing nanny at" in ret.stderr.lower()
+    assert "reason: nanny-close" in ret.stderr.lower()
+    assert ret.returncode == 0  # worker is expected to exit cleanly
diff
--git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py
index 987055636..a72ec3f2e 100644
--- a/dask_cuda/tests/test_local_cuda_cluster.py
+++ b/dask_cuda/tests/test_local_cuda_cluster.py
@@ -440,3 +440,13 @@ def test_print_cluster_config(capsys):
             assert "ucx" in captured.out
             assert "1 B" in captured.out
             assert "[plugin]" in captured.out
+
+
+def test_death_timeout_raises():
+    with pytest.raises(asyncio.exceptions.TimeoutError):
+        with LocalCUDACluster(
+            silence_logs=False,
+            death_timeout=1e-10,  # tiny timeout so TimeoutError is expected
+            dashboard_address=":0",
+        ):
+            pass  # nothing to run; the cluster context itself must raise