Skip to content

Commit

Permalink
add debug maybe reinit
Browse files: browse the repository at this point in the history
  • Loading branch information
samsja committed Oct 4, 2024
1 parent be1f8ac commit 6655614
Showing 1 changed file with 2 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/zeroband/comms.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,7 @@ def maybe_reinit_global_pg(self):
self.global_pg = dist.ProcessGroupGloo(
prefix_store, self.world_info.global_rank, self.world_info.global_world_size, TCPSTORE_TIMEOUT
)
self._logger.debug("Successfully recreated process group")

if self._global_leader:
self._clear_joiners()
Expand All @@ -330,6 +331,7 @@ def maybe_reinit_global_pg(self):
self.global_store.set(f"rank_{self.world_info.global_unique_id}", str(self.world_info.global_rank))
# Without this barrier, a node might queue leave before the leaving queue is cleared
dist.barrier(self.global_pg)
self._logger.debug("Reinitialized global_pg done in %s seconds", time.perf_counter() - time_start)

def get_global_pg(self, maybe_reinit: bool = False) -> dist.ProcessGroup:
"""Get the global process group. If maybe_reinit is True, reinitialize the global process group if needed."""
Expand Down

0 comments on commit 6655614

Please sign in to comment.