Skip to content
17 changes: 17 additions & 0 deletions release/release_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1800,6 +1800,23 @@
timeout: 1200
script: RAY_TRAIN_V2_ENABLED=1 python train_benchmark.py --task=recsys --dataloader_type=ray_data --num_workers=8 --train_batch_size=8192 --validation_batch_size=16384 --num_epochs=1

# Weekly soak test for the Ray Train ingest benchmark: runs the
# image-classification task with the ray_data dataloader for 50 epochs
# on a 12-hour budget to surface slow memory growth over long runs.
- name: training_ingest_benchmark-soak_test
  group: Train tests
  working_dir: train_tests/benchmark

  frequency: weekly
  team: ml

  cluster:
    byod:
      type: gpu
    # Mixed GPU + CPU worker pool — see compute_gpu_4x4_cpu_4_aws.yaml.
    cluster_compute: compute_configs/compute_gpu_4x4_cpu_4_aws.yaml

  run:
    # 43200 s = 12 hours; long_running marks this as a soak test rather
    # than a normal timed benchmark.
    timeout: 43200
    long_running: true
    script: RAY_TRAIN_V2_ENABLED=1 python train_benchmark.py --task=image_classification --dataloader_type=ray_data --num_workers=16 --image_classification_data_format=parquet --num_epochs=50

- name: train_multinode_persistence
python: "3.10"
group: Train tests
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Anyscale cluster compute config: 4 GPU workers (g4dn.12xlarge) plus a
# separate pool of 4 CPU-only workers (m5.4xlarge).
# NOTE(review): cloud_id is an unquoted Jinja expression — the release-test
# harness renders it before YAML parsing; a generic YAML parser cannot read
# this file raw. Quote it if that ever becomes a requirement.
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

head_node_type:
  name: head_node
  instance_type: m5.4xlarge
  resources:
    # cpu: 0 prevents Ray from scheduling work on the head node.
    cpu: 0

worker_node_types:
  - name: worker_node_gpu
    instance_type: g4dn.12xlarge
    max_workers: 4
    min_workers: 4
    use_spot: false
    resources:
      # cpu: 0 here steers CPU-only tasks (e.g. data loading) onto the
      # CPU worker pool below — per the PR discussion this kept memory
      # growth low during testing.
      cpu: 0
  # Dedicated CPU pool; no resources override, so these advertise their
  # full native CPU count.
  - name: worker_node_cpu
    instance_type: m5.4xlarge
    max_workers: 4
    min_workers: 4
    use_spot: false
Comment on lines +18 to +22
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need to add cpu nodes for this test?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See the testing notes (#57120 (comment)) - doing this kept memory growth low. Lmk if this is fine.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

interesting 🤔