From d5cd4f87afc89c4f66b237d0e16e5916ce05702d Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 29 Oct 2024 23:03:46 -0700 Subject: [PATCH] Fix flaky test linux://python/ray/dag:tests/experimental/test_mocked_nccl_dag (#48433) Reduce the actors needed for test_mocked_nccl_dag test. Closes #48288. Signed-off-by: Stephanie Wang Signed-off-by: mohitjain2504 --- .../experimental/test_mocked_nccl_dag.py | 44 +++++-------------- 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/python/ray/dag/tests/experimental/test_mocked_nccl_dag.py b/python/ray/dag/tests/experimental/test_mocked_nccl_dag.py index 634d411272f2..53002896a79d 100644 --- a/python/ray/dag/tests/experimental/test_mocked_nccl_dag.py +++ b/python/ray/dag/tests/experimental/test_mocked_nccl_dag.py @@ -80,11 +80,8 @@ def test_p2p(ray_start_cluster): Test simple sender -> receiver pattern. Check that receiver receives correct results. """ - # Barrier name should be barrier-{sender rank}-{receiver rank}. - # Create a barrier in both directions because we don't know which rank will - # get assigned to sender and receiver. - barrier1 = Barrier.options(name="barrier-0-1").remote() # noqa - barrier2 = Barrier.options(name="barrier-1-0").remote() # noqa + # Barrier name should be barrier-{lower rank}-{higher rank}. + barrier = Barrier.options(name="barrier-0-1").remote() # noqa sender = MockedWorker.remote() receiver = MockedWorker.remote() @@ -115,8 +112,6 @@ def test_p2p(ray_start_cluster): ref = compiled_dag.execute(i, shape=shape, dtype=dtype, send_as_dict=True) assert ray.get(ref) == (i, shape, dtype) - ray.kill(barrier1) - ray.kill(barrier2) compiled_dag.teardown() @@ -138,11 +133,8 @@ def test_p2p_static_shape(ray_start_cluster, send_as_dict): _static_shape=True. If sender always sends tensors of the same shape, then it works. """ - # Barrier name should be barrier-{sender rank}-{receiver rank}. - # Create a barrier in both directions because we don't know which rank will - # get assigned to sender and receiver. - barrier1 = Barrier.options(name="barrier-0-1").remote() # noqa - barrier2 = Barrier.options(name="barrier-1-0").remote() # noqa + # Barrier name should be barrier-{lower rank}-{higher rank}. + barrier = Barrier.options(name="barrier-0-1").remote() # noqa sender = MockedWorker.remote() receiver = MockedWorker.remote() @@ -181,11 +173,8 @@ def test_p2p_static_shape_error(capsys, ray_start_cluster, send_as_dict): Test that when static_shape=True, an error is thrown when a tensor with a different shape or dtype is found. """ - # Barrier name should be barrier-{sender rank}-{receiver rank}. - # Create a barrier in both directions because we don't know which rank will - # get assigned to sender and receiver. - barrier1 = Barrier.options(name="barrier-0-1").remote() # noqa - barrier2 = Barrier.options(name="barrier-1-0").remote() # noqa + # Barrier name should be barrier-{lower rank}-{higher rank}. + barrier = Barrier.options(name="barrier-0-1").remote() # noqa sender = MockedWorker.remote() receiver = MockedWorker.remote() @@ -241,11 +230,8 @@ def test_p2p_direct_return(ray_start_cluster): """ Test simple sender -> receiver pattern with _direct_return=True """ - # Barrier name should be barrier-{sender rank}-{receiver rank}. - # Create a barrier in both directions because we don't know which rank will - # get assigned to sender and receiver. - barrier1 = Barrier.options(name="barrier-0-1").remote() # noqa - barrier2 = Barrier.options(name="barrier-1-0").remote() # noqa + # Barrier name should be barrier-{lower rank}-{higher rank}. + barrier = Barrier.options(name="barrier-0-1").remote() # noqa sender = MockedWorker.remote() receiver = MockedWorker.remote() @@ -286,11 +272,8 @@ def test_p2p_direct_return_error(capsys, ray_start_cluster): _direct_return=True. Test that error is thrown when actor task does not return a tensor directly. """ - # Barrier name should be barrier-{sender rank}-{receiver rank}. - # Create a barrier in both directions because we don't know which rank will - # get assigned to sender and receiver. - barrier1 = Barrier.options(name="barrier-0-1").remote() # noqa - barrier2 = Barrier.options(name="barrier-1-0").remote() # noqa + # Barrier name should be barrier-{lower rank}-{higher rank}. + barrier = Barrier.options(name="barrier-0-1").remote() # noqa sender = MockedWorker.remote() receiver = MockedWorker.remote() @@ -355,11 +338,8 @@ def test_p2p_static_shape_and_direct_return( are passed (check_static_shape=True) OR if non-tensor value is returned (check_static_shape=False). """ - # Barrier name should be barrier-{sender rank}-{receiver rank}. - # Create a barrier in both directions because we don't know which rank will - # get assigned to sender and receiver. - barrier1 = Barrier.options(name="barrier-0-1").remote() # noqa - barrier2 = Barrier.options(name="barrier-1-0").remote() # noqa + # Barrier name should be barrier-{lower rank}-{higher rank}. + barrier = Barrier.options(name="barrier-0-1").remote() # noqa sender = MockedWorker.remote() receiver = MockedWorker.remote()