Skip to content

Commit

Permalink
Participants APIs should check if quorum is started (#95)
Browse files (browse the repository at this point in the history)
  • Loading branch information
fegin authored Feb 4, 2025
1 parent 87290f5 commit 118d1a2
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 0 deletions.
6 changes: 6 additions & 0 deletions torchft/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,9 @@ def participating_rank(self) -> Optional[int]:
Returns:
the rank of the current quorum
"""
if self._quorum_future is None:
return None

self.wait_quorum()

return self._participating_rank
Expand All @@ -679,6 +682,9 @@ def num_participants(self) -> int:
Returns:
the number of participants in the current quorum
"""
if self._quorum_future is None:
return 0

self.wait_quorum()

assert self._participating_world_size >= 0, "internal error"
Expand Down
3 changes: 3 additions & 0 deletions torchft/manager_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,9 @@ def test_quorum_heal_sync(self, client_mock: MagicMock) -> None:
self.assertEqual(manager._quorum_id, -1)
self.assertEqual(manager.current_step(), 0)

self.assertEqual(manager.num_participants(), 0)
self.assertEqual(manager.participating_rank(), None)

manager.start_quorum()
manager.allreduce(torch.tensor([1.0])).wait()
self.assertFalse(manager._healing)
Expand Down

0 comments on commit 118d1a2

Please sign in to comment.