Skip to content

Commit

Permalink
Revert "[ci][tune][train] Update release test compute configs to not …
Browse files Browse the repository at this point in the history
…schedule work on head node" (ray-project#48321)

Reverts ray-project#48103
Signed-off-by: JP-sDEV <jon.pablo80@gmail.com>
  • Loading branch information
can-anyscale authored and JP-sDEV committed Nov 14, 2024
1 parent b2ede7d commit b6898b4
Show file tree
Hide file tree
Showing 16 changed files with 45 additions and 90 deletions.
9 changes: 2 additions & 7 deletions release/air_tests/air_benchmarks/compute_cpu_1_aws.yaml
Original file line number Diff line number Diff line change
@@ -1,15 +1,10 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 1
max_workers: 0

head_node_type:
name: head_node
instance_type: m5.2xlarge

worker_node_types:
- name: worker_node
instance_type: m5.2xlarge
max_workers: 1
min_workers: 1
use_spot: false
worker_node_types: []
9 changes: 2 additions & 7 deletions release/air_tests/air_benchmarks/compute_cpu_1_gce.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,10 @@ region: us-west1
allowed_azs:
- us-west1-b

max_workers: 1
max_workers: 0

head_node_type:
name: head_node
instance_type: n1-standard-8

worker_node_types:
- name: worker_node
instance_type: n1-standard-8
max_workers: 1
min_workers: 1
use_spot: false
worker_node_types: []
8 changes: 3 additions & 5 deletions release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 4
max_workers: 3

head_node_type:
name: head_node
instance_type: m5.2xlarge
resources:
cpu: 0

worker_node_types:
- name: worker_node
instance_type: m5.2xlarge
max_workers: 4
min_workers: 4
max_workers: 3
min_workers: 3
use_spot: false
8 changes: 3 additions & 5 deletions release/air_tests/air_benchmarks/compute_cpu_4_gce.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,15 @@ region: us-west1
allowed_azs:
- us-west1-b

max_workers: 4
max_workers: 3

head_node_type:
name: head_node
instance_type: n1-standard-8
resources:
cpu: 0

worker_node_types:
- name: worker_node
instance_type: n1-standard-8
max_workers: 4
min_workers: 4
max_workers: 3
min_workers: 3
use_spot: false
6 changes: 2 additions & 4 deletions release/air_tests/air_benchmarks/compute_cpu_8_aws.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,10 @@ max_workers: 7
head_node_type:
name: head_node
instance_type: m5.2xlarge
resources:
cpu: 0

worker_node_types:
- name: worker_node
instance_type: m5.2xlarge
max_workers: 8
min_workers: 8
max_workers: 7
min_workers: 7
use_spot: false
8 changes: 3 additions & 5 deletions release/air_tests/air_benchmarks/compute_cpu_8_gce.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,15 @@ region: us-west1
allowed_azs:
- us-west1-b

max_workers: 8
max_workers: 7

head_node_type:
name: head_node
instance_type: n1-standard-8
resources:
cpu: 0

worker_node_types:
- name: worker_node
instance_type: n1-standard-8
max_workers: 8
min_workers: 8
max_workers: 7
min_workers: 7
use_spot: false
11 changes: 3 additions & 8 deletions release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,13 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 1
max_workers: 0

head_node_type:
name: head_node
instance_type: m5.2xlarge
instance_type: g3.8xlarge

worker_node_types:
- name: worker_node
instance_type: g3.8xlarge
max_workers: 1
min_workers: 1
use_spot: false
worker_node_types: []

aws:
BlockDeviceMappings:
Expand Down
11 changes: 3 additions & 8 deletions release/air_tests/air_benchmarks/compute_gpu_1_gce.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,13 @@ region: us-west1
allowed_azs:
- us-west1-b

max_workers: 1
max_workers: 0

head_node_type:
name: head_node
instance_type: n1-standard-8
instance_type: n1-standard-32-nvidia-tesla-t4-2

worker_node_types:
- name: worker_node
instance_type: n1-standard-32-nvidia-tesla-t4-2
max_workers: 1
min_workers: 1
use_spot: false
worker_node_types: []

gcp_advanced_configurations_json:
instance_properties:
Expand Down
10 changes: 4 additions & 6 deletions release/air_tests/air_benchmarks/compute_gpu_2x2_aws.yaml
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 2
max_workers: 1

head_node_type:
name: head_node
instance_type: m5.2xlarge
resources:
cpu: 0
instance_type: g3.8xlarge

worker_node_types:
- name: worker_node
instance_type: g3.8xlarge
max_workers: 2
min_workers: 2
max_workers: 1
min_workers: 1
use_spot: false
10 changes: 4 additions & 6 deletions release/air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,15 @@ region: us-west1
allowed_azs:
- us-west1-b

max_workers: 2
max_workers: 1

head_node_type:
name: head_node
instance_type: n1-standard-8
resources:
cpu: 0
instance_type: n1-standard-32-nvidia-tesla-t4-2

worker_node_types:
- name: worker_node
instance_type: n1-standard-32-nvidia-tesla-t4-2
max_workers: 2
min_workers: 2
max_workers: 1
min_workers: 1
use_spot: false
10 changes: 4 additions & 6 deletions release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 4
max_workers: 3

head_node_type:
name: head_node
instance_type: m5.2xlarge
resources:
cpu: 0
instance_type: g4dn.12xlarge

worker_node_types:
- name: worker_node
instance_type: g4dn.12xlarge
max_workers: 4
min_workers: 4
max_workers: 3
min_workers: 3
use_spot: false

aws:
Expand Down
10 changes: 4 additions & 6 deletions release/air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,17 @@ region: us-west1
allowed_azs:
- us-west1-b

max_workers: 4
max_workers: 3

head_node_type:
name: head_node
instance_type: n1-standard-8
resources:
cpu: 0
instance_type: n1-standard-64-nvidia-tesla-t4-4

worker_node_types:
- name: worker_node
instance_type: n1-standard-64-nvidia-tesla-t4-4
max_workers: 4
min_workers: 4
max_workers: 3
min_workers: 3
use_spot: false

gcp_advanced_configurations_json:
Expand Down
13 changes: 4 additions & 9 deletions release/air_tests/air_benchmarks/workloads/benchmark_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,8 @@


def schedule_remote_fn_on_all_nodes(
remote_fn, exclude_head: bool = True, *args, **kwargs
remote_fn, exclude_head: bool = False, *args, **kwargs
):
"""Runs remote fn on all worker nodes.
Also schedules on the head node if `exclude_head` is False.
"""
head_ip = ray.util.get_node_ip_address()

futures = []
Expand Down Expand Up @@ -58,15 +55,13 @@ def upload_file_to_all_nodes(path: str):
return ray.get(futures)


@ray.remote(num_cpus=0)
@ray.remote
def _run_command(cmd: str):
return subprocess.check_call(cmd)


def run_command_on_all_nodes(cmd: List[str], exclude_head: bool = True):
futures = schedule_remote_fn_on_all_nodes(
_run_command, cmd=cmd, exclude_head=exclude_head
)
def run_command_on_all_nodes(cmd: List[str]):
futures = schedule_remote_fn_on_all_nodes(_run_command, cmd=cmd)
return ray.get(futures)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -269,9 +269,7 @@ def run(
path = str((Path(__file__).parent / "_tensorflow_prepare.py").absolute())

upload_file_to_all_nodes(path)
# NOTE: This includes the head node for the release smoke test that only
# runs on a single node.
run_command_on_all_nodes(["python", path], exclude_head=False)
run_command_on_all_nodes(["python", path])

times_ray = []
times_local_ray = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -418,9 +418,7 @@ def run(

path = str((Path(__file__).parent / "_torch_prepare.py").absolute())
upload_file_to_all_nodes(path)
# NOTE: This includes the head node for the release smoke test that only
# runs on a single node.
run_command_on_all_nodes(["python", path], exclude_head=False)
run_command_on_all_nodes(["python", path])

times_ray = []
times_local_ray = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@ def prepare_mnist():

print("Preparing Torch benchmark: Downloading MNIST")

@ray.remote(num_cpus=0)
@ray.remote
def _download_data():
import torchvision

torchvision.datasets.FashionMNIST("/tmp/data_fashion_mnist", download=True)
return True

ray.get(schedule_remote_fn_on_all_nodes(_download_data, exclude_head=False))
ray.get(schedule_remote_fn_on_all_nodes(_download_data))


def get_trainer(
Expand Down

0 comments on commit b6898b4

Please sign in to comment.