Merge branch 'main' into profiling

hexinw-nvidia · web-flow · commit 4dcb8fe3b5ab · 2025-09-17T11:34:48.000-07:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -37,7 +37,6 @@ packaging = "*"
 python = ">=3.10"
 psutil = ">=6.0.0"
 pyyaml = "*"
-pynvml = ">=12.0.0"
 nvidia-ml-py = ">=12.570.86"
 defusedxml = "*"
 
diff --git a/src/nvidia_resiliency_ext/inprocess/rank_assignment.py b/src/nvidia_resiliency_ext/inprocess/rank_assignment.py
@@ -26,6 +26,8 @@
 import warnings
 from typing import Callable, Optional, Union
 
+from nvidia_resiliency_ext.shared_utils.log_manager import LogConfig
+
 from . import exception, utils
 from .state import Mode, State
 from .store import StoreMixin
@@ -177,7 +179,8 @@ def __call__(self, ctx: RankAssignmentCtx) -> RankAssignmentCtx:
             active_rank = None
             # Log deactivation if transitioning from ACTIVE to INACTIVE
             if state.mode == Mode.ACTIVE:
-                log = logging.getLogger(__name__)
+                log = logging.getLogger(LogConfig.name)
+
                 log.info(
                     f"[In-process] Rank deactivated (rank={state.rank}) due to max active world size limit ({active_world_size})"
                 )
@@ -224,7 +227,7 @@ def __call__(self, ctx: RankAssignmentCtx) -> RankAssignmentCtx:
             active_rank = None
             # Log deactivation if transitioning from ACTIVE to INACTIVE
             if state.mode == Mode.ACTIVE:
-                log = logging.getLogger(__name__)
+                log = logging.getLogger(LogConfig.name)
                 log.info(
                     f"[In-process] Rank deactivated (rank={state.rank}) due to divisibility requirement (active_world_size={active_world_size}, divisor={divisor})"
                 )
@@ -349,7 +352,7 @@ def __repr__(self):
         return f'{type(self).__name__}({self.name=})'
 
 
-def bounded_activate(node, counter, path=None):
+def bounded_activate(node, counter, path=None, current_state=None):
     if path is None:
         path = []
 
@@ -361,17 +364,29 @@ def bounded_activate(node, counter, path=None):
                 for ascendant in path
             )
         ):
+            # Log activation if this is the current rank
+            if current_state and current_state.initial_rank == node.state.initial_rank:
+                log = logging.getLogger(LogConfig.name)
+                log.info(
+                    f"[In-process] Rank activated (initial_rank={node.state.initial_rank}, active_rank={counter}) in topology tree"
+                )
             node.activate(counter)
             counter += 1
             for ascendant in path:
                 ascendant.active_count += 1
         else:
+            # Log deactivation if this is the current rank
+            if current_state and current_state.initial_rank == node.state.initial_rank:
+                log = logging.getLogger(LogConfig.name)
+                log.info(
+                    f"[In-process] Rank deactivated (initial_rank={node.state.initial_rank}) due to max_ranks constraint in topology layer"
+                )
             node.deactivate()
 
     path.append(node)
 
     for child in node.children.values():
-        counter = bounded_activate(child, counter, path)
+        counter = bounded_activate(child, counter, path, current_state)
     path.pop()
     return counter
 
@@ -574,7 +589,7 @@ def build_tree(self, state, store):
     def replace_with_inactive(self, terminated_active_ranks):
         replaced_terminate_active_ranks = set()
 
-        log = logging.getLogger(__name__)
+        log = logging.getLogger(LogConfig.name)
 
         for terminated_active_rank in sorted(terminated_active_ranks):
             terminated_active_node = self.rank_map[terminated_active_rank]
@@ -625,7 +640,7 @@ def replace_with_backfill(self, unhandled_terminations):
                 key=lambda node: node.state.active_rank,
             )
 
-            log = logging.getLogger(__name__)
+            log = logging.getLogger(LogConfig.name)
             for backfill_node, terminated_node in itertools.zip_longest(
                 reversed(largest_active_nodes),
                 terminated_nodes,
@@ -647,7 +662,7 @@ def replace_with_backfill(self, unhandled_terminations):
 
     def shift_ranks(self, replaced_active, unhandled_terminations):
         sorted_replaced_active = sorted(replaced_active)
-        log = logging.getLogger(__name__)
+        log = logging.getLogger(LogConfig.name)
 
         for n in self.rank_map.values():
             n.state.active_world_size -= len(unhandled_terminations)
@@ -672,7 +687,7 @@ def filter_active_world_size(self):
         new_active_world_size = self.world_size_filter(active_world_size)
         assert new_active_world_size <= active_world_size
 
-        log = logging.getLogger(__name__)
+        log = logging.getLogger(LogConfig.name)
         for leaf in self.tree.iter_leaves():
             leaf.state.active_world_size = new_active_world_size
             if leaf.state.mode == Mode.ACTIVE and leaf.state.active_rank >= new_active_world_size:
@@ -722,7 +737,7 @@ def __call__(self, ctx: RankAssignmentCtx) -> RankAssignmentCtx:
         if self.tree is None:
             self.build_tree(state, store)
 
-            active_world_size = bounded_activate(self.tree, 0)
+            active_world_size = bounded_activate(self.tree, 0, None, self.current_state)
             for node in self.rank_map.values():
                 node.state.active_world_size = active_world_size
 
@@ -738,7 +753,7 @@ def __call__(self, ctx: RankAssignmentCtx) -> RankAssignmentCtx:
             rank for rank in terminated_ranks if self.rank_map[rank].state.mode == Mode.ACTIVE
         )
 
-        log = logging.getLogger(__name__)
+        log = logging.getLogger(LogConfig.name)
         for terminated_rank in terminated_ranks:
             # If this rank is being terminated, log it
             if self.current_state.initial_rank == self.rank_map[terminated_rank].state.initial_rank:
@@ -808,7 +823,7 @@ def __call__(self, ctx: RankAssignmentCtx) -> RankAssignmentCtx:
             terminated_ranks = utils.format_rank_set(terminated_ranks)
             raise RankDiscarded(f'{rank=} {terminated_ranks=}')
         elif rank >= world_size:
-            log = logging.getLogger(__name__)
+            log = logging.getLogger(LogConfig.name)
             old_rank = rank
             rank = ordered_terminated_ranks[rank - world_size]
             log.info(
@@ -869,7 +884,7 @@ def __call__(self, ctx: RankAssignmentCtx) -> RankAssignmentCtx:
             old_rank = rank
             rank = rank - sum(rank > terminated_rank for terminated_rank in terminated_ranks)
             if old_rank != rank:
-                log = logging.getLogger(__name__)
+                log = logging.getLogger(LogConfig.name)
                 log.info(f"[In-process] Rank shifted (rank changed from {old_rank} to {rank})")
 
         state = dataclasses.replace(
@@ -982,7 +997,7 @@ def __call__(self, ctx: RankAssignmentCtx) -> RankAssignmentCtx:
 
             group_count = int(store.get(prefixed_key))
             if not self.condition(group_count):
-                log = logging.getLogger(__name__)
+                log = logging.getLogger(LogConfig.name)
                 log.info(
                     f"[In-process] Rank marked for termination (rank={rank}, group_key={key}, group_count={group_count}) due to failed group condition"
                 )