diff --git a/release/air_tests/air_benchmarks/compute_cpu_1_aws.yaml b/release/air_tests/air_benchmarks/compute_cpu_1_aws.yaml index e9ad74fc995c..52564851afe3 100644 --- a/release/air_tests/air_benchmarks/compute_cpu_1_aws.yaml +++ b/release/air_tests/air_benchmarks/compute_cpu_1_aws.yaml @@ -1,15 +1,10 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -max_workers: 1 +max_workers: 0 head_node_type: name: head_node instance_type: m5.2xlarge -worker_node_types: - - name: worker_node - instance_type: m5.2xlarge - max_workers: 1 - min_workers: 1 - use_spot: false +worker_node_types: [] diff --git a/release/air_tests/air_benchmarks/compute_cpu_1_gce.yaml b/release/air_tests/air_benchmarks/compute_cpu_1_gce.yaml index 9b42574aadc7..90de98eb18e6 100644 --- a/release/air_tests/air_benchmarks/compute_cpu_1_gce.yaml +++ b/release/air_tests/air_benchmarks/compute_cpu_1_gce.yaml @@ -3,15 +3,10 @@ region: us-west1 allowed_azs: - us-west1-b -max_workers: 1 +max_workers: 0 head_node_type: name: head_node instance_type: n1-standard-8 -worker_node_types: - - name: worker_node - instance_type: n1-standard-8 - max_workers: 1 - min_workers: 1 - use_spot: false +worker_node_types: [] diff --git a/release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml b/release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml index 395891382653..fb8dc9719e67 100644 --- a/release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml +++ b/release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml @@ -1,17 +1,15 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -max_workers: 4 +max_workers: 3 head_node_type: name: head_node instance_type: m5.2xlarge - resources: - cpu: 0 worker_node_types: - name: worker_node instance_type: m5.2xlarge - max_workers: 4 - min_workers: 4 + max_workers: 3 + min_workers: 3 use_spot: false diff --git a/release/air_tests/air_benchmarks/compute_cpu_4_gce.yaml b/release/air_tests/air_benchmarks/compute_cpu_4_gce.yaml index 0a9f868020aa..03f5772f88be 100644 --- a/release/air_tests/air_benchmarks/compute_cpu_4_gce.yaml +++ b/release/air_tests/air_benchmarks/compute_cpu_4_gce.yaml @@ -3,17 +3,15 @@ region: us-west1 allowed_azs: - us-west1-b -max_workers: 4 +max_workers: 3 head_node_type: name: head_node instance_type: n1-standard-8 - resources: - cpu: 0 worker_node_types: - name: worker_node instance_type: n1-standard-8 - max_workers: 4 - min_workers: 4 + max_workers: 3 + min_workers: 3 use_spot: false diff --git a/release/air_tests/air_benchmarks/compute_cpu_8_aws.yaml b/release/air_tests/air_benchmarks/compute_cpu_8_aws.yaml index c0d89e3c2773..afbedcb738c3 100644 --- a/release/air_tests/air_benchmarks/compute_cpu_8_aws.yaml +++ b/release/air_tests/air_benchmarks/compute_cpu_8_aws.yaml @@ -6,12 +6,10 @@ max_workers: 7 head_node_type: name: head_node instance_type: m5.2xlarge - resources: - cpu: 0 worker_node_types: - name: worker_node instance_type: m5.2xlarge - max_workers: 8 - min_workers: 8 + max_workers: 7 + min_workers: 7 use_spot: false diff --git a/release/air_tests/air_benchmarks/compute_cpu_8_gce.yaml b/release/air_tests/air_benchmarks/compute_cpu_8_gce.yaml index 653c278fc35e..b15168fcb260 100644 --- a/release/air_tests/air_benchmarks/compute_cpu_8_gce.yaml +++ b/release/air_tests/air_benchmarks/compute_cpu_8_gce.yaml @@ -3,17 +3,15 @@ region: us-west1 allowed_azs: - us-west1-b -max_workers: 8 +max_workers: 7 head_node_type: name: head_node instance_type: n1-standard-8 - resources: - cpu: 0 worker_node_types: - name: worker_node instance_type: n1-standard-8 - max_workers: 8 - min_workers: 8 + max_workers: 7 + min_workers: 7 use_spot: false diff --git a/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml b/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml index f9a7beda293a..df7c2a8958a0 100644 --- a/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml +++ b/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml @@ -1,18 +1,13 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -max_workers: 1 +max_workers: 0 head_node_type: name: head_node - instance_type: m5.2xlarge + instance_type: g3.8xlarge -worker_node_types: - - name: worker_node - instance_type: g3.8xlarge - max_workers: 1 - min_workers: 1 - use_spot: false +worker_node_types: [] aws: BlockDeviceMappings: diff --git a/release/air_tests/air_benchmarks/compute_gpu_1_gce.yaml b/release/air_tests/air_benchmarks/compute_gpu_1_gce.yaml index 38a82f997649..4776275bbc19 100644 --- a/release/air_tests/air_benchmarks/compute_gpu_1_gce.yaml +++ b/release/air_tests/air_benchmarks/compute_gpu_1_gce.yaml @@ -3,18 +3,13 @@ region: us-west1 allowed_azs: - us-west1-b -max_workers: 1 +max_workers: 0 head_node_type: name: head_node - instance_type: n1-standard-8 + instance_type: n1-standard-32-nvidia-tesla-t4-2 -worker_node_types: - - name: worker_node - instance_type: n1-standard-32-nvidia-tesla-t4-2 - max_workers: 1 - min_workers: 1 - use_spot: false +worker_node_types: [] gcp_advanced_configurations_json: instance_properties: diff --git a/release/air_tests/air_benchmarks/compute_gpu_2x2_aws.yaml b/release/air_tests/air_benchmarks/compute_gpu_2x2_aws.yaml index 7308d43dec49..20791f9e4d9d 100644 --- a/release/air_tests/air_benchmarks/compute_gpu_2x2_aws.yaml +++ b/release/air_tests/air_benchmarks/compute_gpu_2x2_aws.yaml @@ -1,17 +1,15 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -max_workers: 2 +max_workers: 1 head_node_type: name: head_node - instance_type: m5.2xlarge - resources: - cpu: 0 + instance_type: g3.8xlarge worker_node_types: - name: worker_node instance_type: g3.8xlarge - max_workers: 2 - min_workers: 2 + max_workers: 1 + min_workers: 1 use_spot: false diff --git a/release/air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml b/release/air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml index 3f4440b6e41a..3bf0b4eca9d0 100644 --- a/release/air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml +++ b/release/air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml @@ -3,17 +3,15 @@ region: us-west1 allowed_azs: - us-west1-b -max_workers: 2 +max_workers: 1 head_node_type: name: head_node - instance_type: n1-standard-8 - resources: - cpu: 0 + instance_type: n1-standard-32-nvidia-tesla-t4-2 worker_node_types: - name: worker_node instance_type: n1-standard-32-nvidia-tesla-t4-2 - max_workers: 2 - min_workers: 2 + max_workers: 1 + min_workers: 1 use_spot: false diff --git a/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml b/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml index 21f37172f3bf..ee7d1436e7cf 100644 --- a/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml +++ b/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml @@ -1,19 +1,17 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -max_workers: 4 +max_workers: 3 head_node_type: name: head_node - instance_type: m5.2xlarge - resources: - cpu: 0 + instance_type: g4dn.12xlarge worker_node_types: - name: worker_node instance_type: g4dn.12xlarge - max_workers: 4 - min_workers: 4 + max_workers: 3 + min_workers: 3 use_spot: false aws: diff --git a/release/air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml b/release/air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml index 492992822850..5702b44d240e 100644 --- a/release/air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml +++ b/release/air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml @@ -3,19 +3,17 @@ region: us-west1 allowed_azs: - us-west1-b -max_workers: 4 +max_workers: 3 head_node_type: name: head_node - instance_type: n1-standard-8 - resources: - cpu: 0 + instance_type: n1-standard-64-nvidia-tesla-t4-4 worker_node_types: - name: worker_node instance_type: n1-standard-64-nvidia-tesla-t4-4 - max_workers: 4 - min_workers: 4 + max_workers: 3 + min_workers: 3 use_spot: false gcp_advanced_configurations_json: diff --git a/release/air_tests/air_benchmarks/workloads/benchmark_util.py b/release/air_tests/air_benchmarks/workloads/benchmark_util.py index 854b4a7d33ec..5fbaaf8c285a 100644 --- a/release/air_tests/air_benchmarks/workloads/benchmark_util.py +++ b/release/air_tests/air_benchmarks/workloads/benchmark_util.py @@ -11,11 +11,8 @@ def schedule_remote_fn_on_all_nodes( - remote_fn, exclude_head: bool = True, *args, **kwargs + remote_fn, exclude_head: bool = False, *args, **kwargs ): - """Runs remote fn on all worker nodes. - Also schedules on the head node if `exclude_head` is False. - """ head_ip = ray.util.get_node_ip_address() futures = [] @@ -58,15 +55,13 @@ def upload_file_to_all_nodes(path: str): return ray.get(futures) -@ray.remote(num_cpus=0) +@ray.remote def _run_command(cmd: str): return subprocess.check_call(cmd) -def run_command_on_all_nodes(cmd: List[str], exclude_head: bool = True): - futures = schedule_remote_fn_on_all_nodes( - _run_command, cmd=cmd, exclude_head=exclude_head - ) +def run_command_on_all_nodes(cmd: List[str]): + futures = schedule_remote_fn_on_all_nodes(_run_command, cmd=cmd) return ray.get(futures) diff --git a/release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py b/release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py index 6ac84f13ac95..5f2c8f69881c 100644 --- a/release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py +++ b/release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py @@ -269,9 +269,7 @@ def run( path = str((Path(__file__).parent / "_tensorflow_prepare.py").absolute()) upload_file_to_all_nodes(path) - # NOTE: This includes the head node ƒor the release smoke test that only - # runs on a single node. - run_command_on_all_nodes(["python", path], exclude_head=False) + run_command_on_all_nodes(["python", path]) times_ray = [] times_local_ray = [] diff --git a/release/air_tests/air_benchmarks/workloads/torch_benchmark.py b/release/air_tests/air_benchmarks/workloads/torch_benchmark.py index e9d1b9008cf5..0ce327cb6e5d 100644 --- a/release/air_tests/air_benchmarks/workloads/torch_benchmark.py +++ b/release/air_tests/air_benchmarks/workloads/torch_benchmark.py @@ -418,9 +418,7 @@ def run( path = str((Path(__file__).parent / "_torch_prepare.py").absolute()) upload_file_to_all_nodes(path) - # NOTE: This includes the head node ƒor the release smoke test that only - # runs on a single node. - run_command_on_all_nodes(["python", path], exclude_head=False) + run_command_on_all_nodes(["python", path]) times_ray = [] times_local_ray = [] diff --git a/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py b/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py index c32328575880..0d7b594d1497 100644 --- a/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py +++ b/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py @@ -21,14 +21,14 @@ def prepare_mnist(): print("Preparing Torch benchmark: Downloading MNIST") - @ray.remote(num_cpus=0) + @ray.remote def _download_data(): import torchvision torchvision.datasets.FashionMNIST("/tmp/data_fashion_mnist", download=True) return True - ray.get(schedule_remote_fn_on_all_nodes(_download_data, exclude_head=False)) + ray.get(schedule_remote_fn_on_all_nodes(_download_data)) def get_trainer(