Skip to content

Commit 0c4f9d2

Browse files
authored
[train] Unblock get_all_reported_checkpoints if reporting only metrics (#58870)
When reporting a checkpoint to Ray Train, every worker needs to form a barrier with a `ray.train.report` call. If every worker reports an empty checkpoint, we should notify the condition to unblock `ray.train.get_all_reported_checkpoints` calls. Before this fix, reporting an empty checkpoint and calling `get_all_reported_checkpoints` would result in a hang. --------- Signed-off-by: Timothy Seah <tseah@anyscale.com>
1 parent 5e206d8 commit 0c4f9d2

File tree

4 files changed

+40
-25
lines changed

4 files changed

+40
-25
lines changed

python/ray/train/v2/_internal/execution/checkpoint/checkpoint_manager.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,7 @@ def after_report(
342342
):
343343
if not training_report.checkpoint:
344344
self._current_report_index += 1
345+
self._notify()
345346
return
346347

347348
self.register_checkpoint(

python/ray/train/v2/tests/test_async_checkpointing_validation.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,30 @@ def train_fn():
389389
trainer.fit()
390390

391391

392+
def test_report_get_all_reported_checkpoints():
393+
"""Check that get_all_reported_checkpoints returns checkpoints depending on # report calls."""
394+
395+
def train_fn():
396+
if ray.train.get_context().get_world_rank() == 0:
397+
ray.train.report(metrics={}, checkpoint=None)
398+
with create_dict_checkpoint({}) as checkpoint:
399+
ray.train.report(metrics={}, checkpoint=checkpoint)
400+
assert len(ray.train.get_all_reported_checkpoints()) == 1
401+
with create_dict_checkpoint({}) as checkpoint:
402+
ray.train.report(metrics={}, checkpoint=checkpoint)
403+
else:
404+
ray.train.report(metrics={}, checkpoint=None)
405+
ray.train.report(metrics={}, checkpoint=None)
406+
ray.train.report(metrics={}, checkpoint=None)
407+
assert len(ray.train.get_all_reported_checkpoints()) == 2
408+
409+
trainer = DataParallelTrainer(
410+
train_fn,
411+
scaling_config=ScalingConfig(num_workers=2),
412+
)
413+
trainer.fit()
414+
415+
392416
def test_get_all_reported_checkpoints_all_consistency_modes():
393417
signal_actor = create_remote_signal_actor(ray).remote()
394418

@@ -440,6 +464,18 @@ def validate_fn(checkpoint, config):
440464
trainer.fit()
441465

442466

467+
def test_get_all_reported_checkpoints_empty_reports():
468+
def train_fn():
469+
ray.train.report(metrics={}, checkpoint=None)
470+
assert len(ray.train.get_all_reported_checkpoints()) == 0
471+
472+
trainer = DataParallelTrainer(
473+
train_fn,
474+
scaling_config=ScalingConfig(num_workers=2),
475+
)
476+
trainer.fit()
477+
478+
443479
if __name__ == "__main__":
444480
import sys
445481

python/ray/train/v2/tests/test_data_parallel_trainer.py

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -141,30 +141,6 @@ def train_fn():
141141
assert tmp_path.joinpath("validate", str(rank)).exists()
142142

143143

144-
def test_report_get_all_reported_checkpoints():
145-
"""Check that get_all_reported_checkpoints returns checkpoints depending on # report calls."""
146-
147-
def train_fn():
148-
if ray.train.get_context().get_world_rank() == 0:
149-
ray.train.report(metrics={}, checkpoint=None)
150-
with create_dict_checkpoint({}) as checkpoint:
151-
ray.train.report(metrics={}, checkpoint=checkpoint)
152-
assert len(ray.train.get_all_reported_checkpoints()) == 1
153-
with create_dict_checkpoint({}) as checkpoint:
154-
ray.train.report(metrics={}, checkpoint=checkpoint)
155-
else:
156-
ray.train.report(metrics={}, checkpoint=None)
157-
ray.train.report(metrics={}, checkpoint=None)
158-
ray.train.report(metrics={}, checkpoint=None)
159-
assert len(ray.train.get_all_reported_checkpoints()) == 2
160-
161-
trainer = DataParallelTrainer(
162-
train_fn,
163-
scaling_config=ScalingConfig(num_workers=2),
164-
)
165-
trainer.fit()
166-
167-
168144
def test_error(tmp_path):
169145
def _error_func_rank_0():
170146
"""An example train_fun that raises an error on rank 0."""

python/ray/train/v2/tests/test_report_handler.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,9 @@ def generate_worker_group_poll_status(num_workers, num_ckpt, num_dummy, num_none
6262
(10, 1, 8, 1, 0), # one worker with checkpoint, one worker with None
6363
],
6464
)
65-
def test_report_handler(tmp_path, num_workers, num_ckpt, num_dummy, num_none, expected):
65+
async def test_report_handler(
66+
tmp_path, num_workers, num_ckpt, num_dummy, num_none, expected
67+
):
6668
"""`expected` is the number of times that the
6769
CheckpointManager.register_checkpoint is called.
6870
"""

0 commit comments

Comments (0)