Revert "[ci][tune][train] Update release test compute configs to not schedule work on head node" (ray-project#48321)

Reverts ray-project#48103

Signed-off-by: JP-sDEV <jon.pablo80@gmail.com>
JP-sDEV · Nov 14, 2024 · b6898b4 · b6898b4
1 parent b2ede7d
commit b6898b4
Show file tree

Hide file tree

Showing 16 changed files with 45 additions and 90 deletions.
diff --git a/release/air_tests/air_benchmarks/compute_cpu_1_aws.yaml b/release/air_tests/air_benchmarks/compute_cpu_1_aws.yaml
@@ -1,15 +1,10 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 1
+max_workers: 0
 
 head_node_type:
     name: head_node
     instance_type: m5.2xlarge
 
-worker_node_types:
-    - name: worker_node
-      instance_type: m5.2xlarge
-      max_workers: 1
-      min_workers: 1
-      use_spot: false
+worker_node_types: []
diff --git a/release/air_tests/air_benchmarks/compute_cpu_1_gce.yaml b/release/air_tests/air_benchmarks/compute_cpu_1_gce.yaml
@@ -3,15 +3,10 @@ region: us-west1
 allowed_azs:
     - us-west1-b
 
-max_workers: 1
+max_workers: 0
 
 head_node_type:
     name: head_node
     instance_type: n1-standard-8
 
-worker_node_types:
-    - name: worker_node
-      instance_type: n1-standard-8
-      max_workers: 1
-      min_workers: 1
-      use_spot: false
+worker_node_types: []
diff --git a/release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml b/release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml
@@ -1,17 +1,15 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 4
+max_workers: 3
 
 head_node_type:
     name: head_node
     instance_type: m5.2xlarge
-    resources:
-      cpu: 0
 
 worker_node_types:
     - name: worker_node
       instance_type: m5.2xlarge
-      max_workers: 4
-      min_workers: 4
+      max_workers: 3
+      min_workers: 3
       use_spot: false
diff --git a/release/air_tests/air_benchmarks/compute_cpu_4_gce.yaml b/release/air_tests/air_benchmarks/compute_cpu_4_gce.yaml
@@ -3,17 +3,15 @@ region: us-west1
 allowed_azs:
     - us-west1-b
 
-max_workers: 4
+max_workers: 3
 
 head_node_type:
     name: head_node
     instance_type: n1-standard-8
-    resources:
-      cpu: 0
 
 worker_node_types:
     - name: worker_node
       instance_type: n1-standard-8
-      max_workers: 4
-      min_workers: 4
+      max_workers: 3
+      min_workers: 3
       use_spot: false
diff --git a/release/air_tests/air_benchmarks/compute_cpu_8_aws.yaml b/release/air_tests/air_benchmarks/compute_cpu_8_aws.yaml
@@ -6,12 +6,10 @@ max_workers: 7
 head_node_type:
     name: head_node
     instance_type: m5.2xlarge
-    resources:
-      cpu: 0
 
 worker_node_types:
     - name: worker_node
       instance_type: m5.2xlarge
-      max_workers: 8
-      min_workers: 8
+      max_workers: 7
+      min_workers: 7
       use_spot: false
diff --git a/release/air_tests/air_benchmarks/compute_cpu_8_gce.yaml b/release/air_tests/air_benchmarks/compute_cpu_8_gce.yaml
@@ -3,17 +3,15 @@ region: us-west1
 allowed_azs:
     - us-west1-b
 
-max_workers: 8
+max_workers: 7
 
 head_node_type:
     name: head_node
     instance_type: n1-standard-8
-    resources:
-      cpu: 0
 
 worker_node_types:
     - name: worker_node
       instance_type: n1-standard-8
-      max_workers: 8
-      min_workers: 8
+      max_workers: 7
+      min_workers: 7
       use_spot: false
diff --git a/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml b/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml
@@ -1,18 +1,13 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 1
+max_workers: 0
 
 head_node_type:
     name: head_node
-    instance_type: m5.2xlarge
+    instance_type: g3.8xlarge
 
-worker_node_types:
-    - name: worker_node
-      instance_type: g3.8xlarge
-      max_workers: 1
-      min_workers: 1
-      use_spot: false
+worker_node_types: []
 
 aws:
     BlockDeviceMappings:

diff --git a/release/air_tests/air_benchmarks/compute_gpu_1_gce.yaml b/release/air_tests/air_benchmarks/compute_gpu_1_gce.yaml
@@ -3,18 +3,13 @@ region: us-west1
 allowed_azs:
     - us-west1-b
 
-max_workers: 1
+max_workers: 0
 
 head_node_type:
     name: head_node
-    instance_type: n1-standard-8
+    instance_type: n1-standard-32-nvidia-tesla-t4-2
 
-worker_node_types:
-    - name: worker_node
-      instance_type: n1-standard-32-nvidia-tesla-t4-2
-      max_workers: 1
-      min_workers: 1
-      use_spot: false
+worker_node_types: []
 
 gcp_advanced_configurations_json:
   instance_properties:

diff --git a/release/air_tests/air_benchmarks/compute_gpu_2x2_aws.yaml b/release/air_tests/air_benchmarks/compute_gpu_2x2_aws.yaml
@@ -1,17 +1,15 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 2
+max_workers: 1
 
 head_node_type:
     name: head_node
-    instance_type: m5.2xlarge
-    resources:
-      cpu: 0
+    instance_type: g3.8xlarge
 
 worker_node_types:
     - name: worker_node
       instance_type: g3.8xlarge
-      max_workers: 2
-      min_workers: 2
+      max_workers: 1
+      min_workers: 1
       use_spot: false
diff --git a/release/air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml b/release/air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
@@ -3,17 +3,15 @@ region: us-west1
 allowed_azs:
     - us-west1-b
 
-max_workers: 2
+max_workers: 1
 
 head_node_type:
     name: head_node
-    instance_type: n1-standard-8
-    resources:
-      cpu: 0
+    instance_type: n1-standard-32-nvidia-tesla-t4-2
 
 worker_node_types:
     - name: worker_node
       instance_type: n1-standard-32-nvidia-tesla-t4-2
-      max_workers: 2
-      min_workers: 2
+      max_workers: 1
+      min_workers: 1
       use_spot: false
diff --git a/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml b/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml
@@ -1,19 +1,17 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 4
+max_workers: 3
 
 head_node_type:
     name: head_node
-    instance_type: m5.2xlarge
-    resources:
-      cpu: 0
+    instance_type: g4dn.12xlarge
 
 worker_node_types:
     - name: worker_node
       instance_type: g4dn.12xlarge
-      max_workers: 4
-      min_workers: 4
+      max_workers: 3
+      min_workers: 3
       use_spot: false
 
 aws:

diff --git a/release/air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml b/release/air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
@@ -3,19 +3,17 @@ region: us-west1
 allowed_azs:
     - us-west1-b
 
-max_workers: 4
+max_workers: 3
 
 head_node_type:
     name: head_node
-    instance_type: n1-standard-8
-    resources:
-      cpu: 0
+    instance_type: n1-standard-64-nvidia-tesla-t4-4
 
 worker_node_types:
     - name: worker_node
       instance_type: n1-standard-64-nvidia-tesla-t4-4
-      max_workers: 4
-      min_workers: 4
+      max_workers: 3
+      min_workers: 3
       use_spot: false
 
 gcp_advanced_configurations_json:

diff --git a/release/air_tests/air_benchmarks/workloads/benchmark_util.py b/release/air_tests/air_benchmarks/workloads/benchmark_util.py
@@ -11,11 +11,8 @@
 
 
 def schedule_remote_fn_on_all_nodes(
-    remote_fn, exclude_head: bool = True, *args, **kwargs
+    remote_fn, exclude_head: bool = False, *args, **kwargs
 ):
-    """Runs remote fn on all worker nodes.
-    Also schedules on the head node if `exclude_head` is False.
-    """
     head_ip = ray.util.get_node_ip_address()
 
     futures = []
@@ -58,15 +55,13 @@ def upload_file_to_all_nodes(path: str):
     return ray.get(futures)
 
 
-@ray.remote(num_cpus=0)
+@ray.remote
 def _run_command(cmd: str):
     return subprocess.check_call(cmd)
 
 
-def run_command_on_all_nodes(cmd: List[str], exclude_head: bool = True):
-    futures = schedule_remote_fn_on_all_nodes(
-        _run_command, cmd=cmd, exclude_head=exclude_head
-    )
+def run_command_on_all_nodes(cmd: List[str]):
+    futures = schedule_remote_fn_on_all_nodes(_run_command, cmd=cmd)
     return ray.get(futures)
 
 

diff --git a/release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py b/release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py
@@ -269,9 +269,7 @@ def run(
     path = str((Path(__file__).parent / "_tensorflow_prepare.py").absolute())
 
     upload_file_to_all_nodes(path)
-    # NOTE: This includes the head node ƒor the release smoke test that only
-    # runs on a single node.
-    run_command_on_all_nodes(["python", path], exclude_head=False)
+    run_command_on_all_nodes(["python", path])
 
     times_ray = []
     times_local_ray = []

diff --git a/release/air_tests/air_benchmarks/workloads/torch_benchmark.py b/release/air_tests/air_benchmarks/workloads/torch_benchmark.py
@@ -418,9 +418,7 @@ def run(
 
     path = str((Path(__file__).parent / "_torch_prepare.py").absolute())
     upload_file_to_all_nodes(path)
-    # NOTE: This includes the head node ƒor the release smoke test that only
-    # runs on a single node.
-    run_command_on_all_nodes(["python", path], exclude_head=False)
+    run_command_on_all_nodes(["python", path])
 
     times_ray = []
     times_local_ray = []

diff --git a/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py b/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py
@@ -21,14 +21,14 @@ def prepare_mnist():
 
     print("Preparing Torch benchmark: Downloading MNIST")
 
-    @ray.remote(num_cpus=0)
+    @ray.remote
     def _download_data():
         import torchvision
 
         torchvision.datasets.FashionMNIST("/tmp/data_fashion_mnist", download=True)
         return True
 
-    ray.get(schedule_remote_fn_on_all_nodes(_download_data, exclude_head=False))
+    ray.get(schedule_remote_fn_on_all_nodes(_download_data))
 
 
 def get_trainer(