From 68358e4ec90aa7142854490da2142ccfb6879370 Mon Sep 17 00:00:00 2001 From: Scott Lee Date: Tue, 20 Aug 2024 15:47:22 -0700 Subject: [PATCH 1/2] add access denied retry Signed-off-by: Scott Lee --- .../nightly_tests/dataset/multi_node_train_benchmark.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/release/nightly_tests/dataset/multi_node_train_benchmark.py b/release/nightly_tests/dataset/multi_node_train_benchmark.py index 679261931ec6..432d05509846 100644 --- a/release/nightly_tests/dataset/multi_node_train_benchmark.py +++ b/release/nightly_tests/dataset/multi_node_train_benchmark.py @@ -571,8 +571,13 @@ def __iter__(self): def benchmark_code( args, ): + ctx = ray.data.DataContext.get_current() + # This release test runs into ACCESS_DENIED errors fairly often. + # We add ACCESS_DENIED as a retryable exception type to avoid flakiness. + # See for more details: https://github.com/ray-project/ray/issues/47230 + ctx.retried_io_errors.append("AWS ACCESS_DENIED") + if args.target_max_block_size_mb is not None: - ctx = ray.data.DataContext.get_current() ctx.target_max_block_size = args.target_max_block_size_mb * 1024 * 1024 cache_input_ds = args.cache_input_ds From 4cedbb30345e61f16dd364e0ff361746f52c8dbf Mon Sep 17 00:00:00 2001 From: Scott Lee Date: Tue, 20 Aug 2024 17:03:04 -0700 Subject: [PATCH 2/2] fix msg format Signed-off-by: Scott Lee --- release/nightly_tests/dataset/multi_node_train_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/nightly_tests/dataset/multi_node_train_benchmark.py b/release/nightly_tests/dataset/multi_node_train_benchmark.py index 432d05509846..69828258fdea 100644 --- a/release/nightly_tests/dataset/multi_node_train_benchmark.py +++ b/release/nightly_tests/dataset/multi_node_train_benchmark.py @@ -575,7 +575,7 @@ def benchmark_code( # This release test runs into ACCESS_DENIED errors fairly often. # We add ACCESS_DENIED as a retryable exception type to avoid flakiness. # See for more details: https://github.com/ray-project/ray/issues/47230 - ctx.retried_io_errors.append("AWS ACCESS_DENIED") + ctx.retried_io_errors.append("AWS Error ACCESS_DENIED") if args.target_max_block_size_mb is not None: ctx.target_max_block_size = args.target_max_block_size_mb * 1024 * 1024