diff --git a/release/release_tests.yaml b/release/release_tests.yaml
index f5e0a4bd2f24..e9a36f3347d0 100644
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@@ -1800,6 +1800,23 @@
     timeout: 1200
     script: RAY_TRAIN_V2_ENABLED=1 python train_benchmark.py --task=recsys --dataloader_type=ray_data --num_workers=8 --train_batch_size=8192 --validation_batch_size=16384 --num_epochs=1
 
+- name: training_ingest_benchmark-soak_test
+  group: Train tests
+  working_dir: train_tests/benchmark
+
+  frequency: weekly
+  team: ml
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: compute_configs/compute_gpu_4x4_cpu_4_aws.yaml
+
+  run:
+    timeout: 43200
+    long_running: true
+    script: RAY_TRAIN_V2_ENABLED=1 python train_benchmark.py --task=image_classification --dataloader_type=ray_data --num_workers=16 --image_classification_data_format=parquet --num_epochs=50
+
 - name: train_multinode_persistence
   python: "3.10"
   group: Train tests
diff --git a/release/train_tests/benchmark/compute_configs/compute_gpu_4x4_cpu_4_aws.yaml b/release/train_tests/benchmark/compute_configs/compute_gpu_4x4_cpu_4_aws.yaml
new file mode 100644
index 000000000000..638a501bd2e0
--- /dev/null
+++ b/release/train_tests/benchmark/compute_configs/compute_gpu_4x4_cpu_4_aws.yaml
@@ -0,0 +1,22 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west-2
+
+head_node_type:
+  name: head_node
+  instance_type: m5.4xlarge
+  resources:
+    cpu: 0
+
+worker_node_types:
+  - name: worker_node_gpu
+    instance_type: g4dn.12xlarge
+    max_workers: 4
+    min_workers: 4
+    use_spot: false
+    resources:
+      cpu: 0
+  - name: worker_node_cpu
+    instance_type: m5.4xlarge
+    max_workers: 4
+    min_workers: 4
+    use_spot: false