Skip to content
17 changes: 17 additions & 0 deletions release/release_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1800,6 +1800,23 @@
timeout: 1200
script: RAY_TRAIN_V2_ENABLED=1 python train_benchmark.py --task=recsys --dataloader_type=ray_data --num_workers=8 --train_batch_size=8192 --validation_batch_size=16384 --num_epochs=1

# Weekly soak test for the Ray Train ingest benchmark: runs the
# image-classification task with the ray_data dataloader for 50 epochs
# on a 12-hour budget to surface slow memory growth over long runs.
- name: training_ingest_benchmark-soak_test
  group: Train tests
  working_dir: train_tests/benchmark

  frequency: weekly
  team: ml

  cluster:
    byod:
      type: gpu
    # Mixed GPU + CPU worker pool — see compute_gpu_4x4_cpu_4_aws.yaml.
    cluster_compute: compute_configs/compute_gpu_4x4_cpu_4_aws.yaml

  run:
    # 43200 s = 12 hours; long_running marks this as a soak test rather
    # than a normal timed benchmark.
    timeout: 43200
    long_running: true
    script: RAY_TRAIN_V2_ENABLED=1 python train_benchmark.py --task=image_classification --dataloader_type=ray_data --num_workers=16 --image_classification_data_format=parquet --num_epochs=50

- name: train_multinode_persistence
python: "3.10"
group: Train tests
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Anyscale cluster compute config: 4 GPU workers (g4dn.12xlarge) plus a
# separate pool of 4 CPU-only workers (m5.4xlarge).
# NOTE(review): cloud_id is an unquoted Jinja expression — the release-test
# harness renders it before YAML parsing; a generic YAML parser cannot read
# this file raw. Quote it if that ever becomes a requirement.
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

head_node_type:
  name: head_node
  instance_type: m5.4xlarge
  resources:
    # cpu: 0 prevents Ray from scheduling work on the head node.
    cpu: 0

worker_node_types:
  - name: worker_node_gpu
    instance_type: g4dn.12xlarge
    max_workers: 4
    min_workers: 4
    use_spot: false
    resources:
      # cpu: 0 here steers CPU-only tasks (e.g. data loading) onto the
      # CPU worker pool below — per the PR discussion this kept memory
      # growth low during testing.
      cpu: 0
  # Dedicated CPU pool; no resources override, so these advertise their
  # full native CPU count.
  - name: worker_node_cpu
    instance_type: m5.4xlarge
    max_workers: 4
    min_workers: 4
    use_spot: false
Comment on lines +18 to +22
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need to add cpu nodes for this test?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See the testing notes (#57120 (comment)) - doing this kept memory growth low. Lmk if this is fine.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

interesting 🤔