Commit e50e391

Merge pull request #184 from NVIDIA/trace_collector_to_nvrx_main
NVRX Logger: backtrace indentation fix and trace collector to use nvrx
2 parents b34d626 + ba67445 commit e50e391

File tree: 4 files changed, +42 -12 lines
src/nvidia_resiliency_ext/attribution/trace_analyzer/trace_collector.py

Lines changed: 5 additions & 4 deletions

@@ -13,8 +13,9 @@
 
 from nvidia_resiliency_ext.attribution.utils import capture_logs
 from nvidia_resiliency_ext.shared_utils.health_check import GPUHealthCheck, NicHealthCheck
+from nvidia_resiliency_ext.shared_utils.log_manager import LogConfig
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger(LogConfig.name)
 
 
 class TraceCollector(ABC):
@@ -65,6 +66,7 @@ def __init__(
         self.stack_trace = None
         self.dump_fn = torch._C._distributed_c10d._dump_nccl_trace
         self.json = json
+        logger = logging.getLogger(LogConfig.name)
         logger.info(f"{self.rank} created TorchFRTraceCollector")
 
     def collect(self):
@@ -112,11 +114,10 @@ def get_health_check_results(local_rank: int):
     - Returns the bypassed output strings for GPU and NIC health checks
     """
     health_check_results = {}
-
-    with capture_logs() as stderr_gpu:
+    with capture_logs(LogConfig.name) as stderr_gpu:
         gpu_health_check = GPUHealthCheck(device_index=local_rank)
         gpu_health = gpu_health_check._perform_health_check()
-    with capture_logs() as stderr_nic:
+    with capture_logs(LogConfig.name) as stderr_nic:
         nic_health_check = NicHealthCheck()
         nic_health_check.set_nic_device(local_rank)
         nic_health = nic_health_check._perform_health_check()
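
With both the module-level logger and the capture_logs() calls now keyed to LogConfig.name, the health-check output is intercepted from the shared NVRX logger rather than a per-module logger. Below is a minimal, self-contained sketch of that capture-by-logger-name pattern using only the standard library; capture_named_logs and the logger name "nvrx" are illustrative stand-ins, not the repository's capture_logs implementation or the real LogConfig.name value.

import contextlib
import io
import logging

NVRX_LOGGER_NAME = "nvrx"  # assumed stand-in for LogConfig.name

@contextlib.contextmanager
def capture_named_logs(logger_name: str):
    # Temporarily attach a StreamHandler to the named logger and yield its buffer,
    # so records emitted anywhere under that logger name can be inspected afterwards.
    buf = io.StringIO()
    handler = logging.StreamHandler(buf)
    logger = logging.getLogger(logger_name)
    logger.addHandler(handler)
    try:
        yield buf
    finally:
        logger.removeHandler(handler)

logging.getLogger(NVRX_LOGGER_NAME).setLevel(logging.INFO)
with capture_named_logs(NVRX_LOGGER_NAME) as captured:
    logging.getLogger(NVRX_LOGGER_NAME).info("GPU health check passed")
print(captured.getvalue())  # "GPU health check passed"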

src/nvidia_resiliency_ext/fault_tolerance/rank_monitor_server.py

Lines changed: 0 additions & 4 deletions

@@ -539,10 +539,6 @@ def run(
 
         try:
             setup_logger(force_reset=True, node_local_tmp_prefix="rankmonsvr")
-            rmlogger = RankMonitorLogger(
-                level=cfg.log_level, is_restarter_logger=is_restarter_logger
-            )
-
             logger = logging.getLogger(LogConfig.name)
 
             logger.debug(f"Starting RankMonitorServer... PID={os.getpid()}")

src/nvidia_resiliency_ext/shared_utils/log_node_local_tmp.py

Lines changed: 18 additions & 4 deletions

@@ -154,8 +154,10 @@ class LogMessage:
     def __init__(self, log_message: str):
         self.log_message = log_message
         self.hash_table = {}
+        self.log_message_valid = False
         match = LogMessage.log_pattern.match(log_message)
         if match:
+            self.log_message_valid = True
             log_fields = match.groupdict()
             for key, value in log_fields.items():
                 if key == 'asctime':
@@ -220,7 +222,7 @@ def _write_messages_to_file(self, messages: List[LogMessage], output):
         for msg in messages:
             try:
                 # The message is already formatted by the formatter, just write it
-                output.write(msg.log_message + '\n')
+                output.write(msg.log_message)
                 output.flush()
             except Exception as e:
                 # Fallback to stderr if output fails
@@ -366,13 +368,25 @@ def _process_message_file(self, msg_file: str):
             return
 
         # Process each line
+        # Multi-line logs (e.g., tracebacks) have a single header line (matches log_pattern)
+        # followed by one or more continuation lines. A non-header line is treated as a
+        # continuation of the previous record, and the entire block is collapsed into one log message.
         log_msg_q = queue.SimpleQueue()
+        old_log_msg: LogMessage = None
         for line in lines:
-            line = line.strip()
-            if not line:
+            lineChk = line.strip()
+            if not lineChk:
                 continue
             log_msg = LogMessage(line)
-            log_msg_q.put(log_msg)
+            if log_msg.log_message_valid:
+                old_log_msg = log_msg
+                log_msg_q.put(log_msg)
+            else:
+                if old_log_msg is not None:
+                    old_log_msg.log_message += line
+                else:
+                    old_log_msg = log_msg
+                    log_msg_q.put(log_msg)
 
         self._log_dict_queue[msg_file] = log_msg_q
 
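
The loop above folds each continuation line into the preceding record; because the raw line (with its original indentation and trailing newline) is appended rather than the stripped copy, traceback indentation survives aggregation, which is also why _write_messages_to_file no longer appends its own '\n'. A self-contained sketch of the same header/continuation collapsing rule is shown below; the header regex is a deliberately simplified stand-in, since the repository's LogMessage.log_pattern parses the full NVRX header fields.

import re

# Simplified stand-in for LogMessage.log_pattern; the real pattern parses the
# full NVRX log header, not just a timestamp and level.
HEADER_RE = re.compile(r"^\[\d{4}-\d{2}-\d{2} .+?\] \[\w+\] ")

def collapse_multiline(lines):
    # A header line starts a new record; any non-header, non-blank line is a
    # continuation and is appended (unstripped) to the previous record.
    records = []
    for line in lines:
        if not line.strip():
            continue
        if HEADER_RE.match(line) or not records:
            records.append(line)
        else:
            records[-1] += line
    return records

raw = [
    "[2025-01-01 00:00:00] [ERROR] Traceback (most recent call last):\n",
    '  File "store.py", line 303, in reentrant_barrier\n',
    "    self.wait([key], timeout)\n",
    "[2025-01-01 00:00:01] [INFO] next message\n",
]
print(len(collapse_multiline(raw)))  # 2 -- the traceback collapses into one record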

tests/shared_utils/test_logger.py

Lines changed: 19 additions & 0 deletions

@@ -19,6 +19,7 @@
 import os
 import random
 import shutil
+import textwrap
 import time
 import unittest
 from datetime import datetime
@@ -75,6 +76,21 @@ def gen_log_msg(logger, num_msg, log_type="info"):
             logger.info(f"My Info Logging Message {i}")
         if log_type == "debug":
             logger.debug(f"My Debug Logging Message {i}")
+        if log_type == "error":
+            msg = textwrap.dedent(
+                """\
+                monitor_process.py:316 Traceback (most recent call last):
+                  File "/usr/local/lib/python3.12/dist-packages/nvidia_resiliency_ext/inprocess/monitor_process.py", line 297, in run
+                    store.iteration_barrier(
+
+                  File "/usr/local/lib/python3.12/dist-packages/nvidia_resiliency_ext/inprocess/store.py", line 303, in reentrant_barrier
+                    self.wait([last_worker_arrived_key], timeout_chunk)
+                torch.distributed.DistNetworkError: Failed to recv, got 0 bytes. Connection was likely closed. Did the remote server shutdown or crash?
+
+
+                """
+            )
+            logger.error(msg)
 
 
 def worker_process(id, num_msg, file_size):
@@ -227,6 +243,9 @@ def test_single_msg(self):
             num_msg=1, file_size_kb=1024, pm_files=1, is_agg=True, log_type="info", dbg_on="0"
         )
 
+    def test_traceback_msg(self):
+        self.check_msg(2, 1024, 1, True, "error", "0")
+
     def test_single_dbg_msg(self):
         self.check_msg(
             num_msg=1, file_size_kb=1024, pm_files=1, is_agg=True, log_type="debug", dbg_on="1"