
Commit 081e80d

pytorchbot committed: 2025-11-03 nightly release (2f4b794)

1 parent 6f1e503, commit 081e80d

File tree: 2 files changed, +25 -16 lines


torchrec/distributed/benchmark/base.py

Lines changed: 24 additions & 15 deletions
@@ -425,17 +425,11 @@ def _load_config_file(
        if not config_path:
            return {}

-        try:
-            with open(config_path, "r") as f:
-                if is_json:
-                    return json.load(f) or {}
-                else:
-                    return yaml.safe_load(f) or {}
-        except Exception as e:
-            logger.error(
-                f"Failed to load config because {e}. Proceeding without it."
-            )
-            return {}
+        with open(config_path, "r") as f:
+            if is_json:
+                return json.load(f) or {}
+            else:
+                return yaml.safe_load(f) or {}

    @functools.wraps(func)
    def wrapper() -> Any:  # pyre-ignore [3]
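With the try/except removed, a missing or malformed config file now raises instead of being silently swallowed and replaced by an empty dict. A standalone sketch of the same loading pattern, for context (the helper name load_config is illustrative, not the torchrec API; it assumes PyYAML is installed, which base.py already imports as yaml):

import json
from typing import Any, Dict, Optional

import yaml


def load_config(config_path: Optional[str], is_json: bool = False) -> Dict[str, Any]:
    # No config path supplied: fall back to an empty mapping.
    if not config_path:
        return {}

    # Any I/O or parse error now propagates to the caller, so a bad
    # --json_config / --yaml_config value fails loudly instead of being
    # logged and ignored.
    with open(config_path, "r") as f:
        if is_json:
            return json.load(f) or {}
        return yaml.safe_load(f) or {}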
@@ -479,7 +473,12 @@ def wrapper() -> Any:  # pyre-ignore [3]
        # Merge the two dictionaries, JSON overrides YAML
        merged_defaults = {**yaml_defaults, **json_defaults}

-        seen_args = set()  # track all --<name> we've added
+        # track all --<name> we've added
+        seen_args = {
+            "json_config",
+            "yaml_config",
+            "loglevel",
+        }

        for _name, param in sig.parameters.items():
            cls = param.annotation
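Pre-seeding seen_args appears to protect the flags the decorator already registers itself (--json_config, --yaml_config, --loglevel): argparse raises ArgumentError if the same option string is added twice, so config fields with those names are skipped rather than re-added. A toy sketch of that dedup pattern (the field names are hypothetical):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--loglevel", type=str, default="WARNING")

# Option names already owned by the CLI wrapper itself.
seen_args = {"json_config", "yaml_config", "loglevel"}

# Field names discovered from a config dataclass (hypothetical).
for field_name in ("batch_size", "loglevel"):
    if field_name in seen_args:
        continue  # --loglevel is already registered above; adding it again would raise
    parser.add_argument(f"--{field_name}", type=int, default=0)
    seen_args.add(field_name)

args = parser.parse_args(["--batch_size", "32", "--loglevel", "INFO"])
print(args.batch_size, args.loglevel)  # 32 INFO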
@@ -548,7 +547,12 @@ def wrapper() -> Any:  # pyre-ignore [3]
        logger.info(config_instance)

        loglevel = logging._nameToLevel[args.loglevel.upper()]
-        logger.setLevel(loglevel)
+        # Set loglevel for all existing loggers
+        for existing_logger_name in logging.root.manager.loggerDict:
+            existing_logger = logging.getLogger(existing_logger_name)
+            existing_logger.setLevel(loglevel)
+        # Also set the root logger
+        logging.root.setLevel(loglevel)

        return func(**kwargs)
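Calling setLevel only on the module's own logger left loggers created elsewhere (for example in other torchrec modules) at their defaults. Walking logging.root.manager.loggerDict applies the requested level to every logger registered so far, and setting the root logger covers loggers created afterwards, since they fall back to the root's effective level. The same idea in isolation (the function name is illustrative):

import logging

logging.basicConfig()
module_logger = logging.getLogger("torchrec.some.module")  # stand-in for a library logger


def set_global_log_level(level_name: str) -> None:
    level = logging._nameToLevel[level_name.upper()]  # same mapping the decorator uses
    # Update every logger that has been registered so far...
    for name in logging.root.manager.loggerDict:
        logging.getLogger(name).setLevel(level)
    # ...and the root logger, which newly created loggers defer to.
    logging.root.setLevel(level)


set_global_log_level("INFO")
module_logger.info("visible now that the effective level is INFO")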

@@ -745,7 +749,7 @@ def _trace_handler(prof: torch.profiler.profile) -> None:
                f"{output_dir}/stacks-cuda-{name}.stacks", "self_cuda_time_total"
            )

-    if memory_snapshot:
+    if memory_snapshot and (all_rank_traces or rank == 0):
        torch.cuda.empty_cache()
        torch.cuda.memory._record_memory_history(
            max_entries=MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT
@@ -771,7 +775,7 @@ def _trace_handler(prof: torch.profiler.profile) -> None:
    else:
        torch.cuda.synchronize(rank)

-    if memory_snapshot:
+    if memory_snapshot and (all_rank_traces or rank == 0):
        try:
            torch.cuda.memory._dump_snapshot(
                f"{output_dir}/memory-{name}-rank{rank}.pickle"
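The added and (all_rank_traces or rank == 0) clause makes memory snapshots follow the same per-rank policy as the profiler traces: by default only rank 0 records memory history and dumps a pickle, and all_rank_traces opts every rank in, so multi-GPU runs do not write one snapshot per rank unless asked to. A reduced sketch of the record/dump pair around a workload (assumes a CUDA device; the wrapper name and the constant's value are placeholders):

from typing import Callable

import torch

MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT = 100_000  # placeholder value


def run_with_memory_snapshot(
    fn: Callable[[], None],
    rank: int,
    name: str,
    output_dir: str,
    memory_snapshot: bool,
    all_rank_traces: bool,
) -> None:
    take_snapshot = memory_snapshot and (all_rank_traces or rank == 0)

    if take_snapshot:
        torch.cuda.empty_cache()
        torch.cuda.memory._record_memory_history(
            max_entries=MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT
        )

    fn()  # the workload being benchmarked

    if take_snapshot:
        torch.cuda.memory._dump_snapshot(
            f"{output_dir}/memory-{name}-rank{rank}.pickle"
        )
        torch.cuda.memory._record_memory_history(enabled=None)  # stop recording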
@@ -857,6 +861,7 @@ class BenchFuncConfig:
    export_stacks: bool = False
    all_rank_traces: bool = False
    memory_snapshot: bool = False
+    loglevel: str = "WARNING"

    # pyre-ignore [2]
    def benchmark_func_kwargs(self, **kwargs_to_override) -> Dict[str, Any]:
@@ -873,6 +878,10 @@ def benchmark_func_kwargs(self, **kwargs_to_override) -> Dict[str, Any]:
            "memory_snapshot": self.memory_snapshot,
        } | kwargs_to_override

+    def set_log_level(self) -> None:
+        loglevel = logging._nameToLevel[self.loglevel.upper()]
+        logging.root.setLevel(loglevel)
+

def benchmark_func(
    name: str,
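BenchFuncConfig now carries the log level itself and can apply it with set_log_level(), so code that only holds the config object can configure logging without going through the CLI decorator. A hedged usage sketch, assuming the class accepts these fields as keyword arguments (the diff shows them as defaulted fields) and using a made-up benchmark name:

from torchrec.distributed.benchmark.base import BenchFuncConfig

# Defaults stay quiet (loglevel="WARNING"); opt into verbose logs for this run.
run_option = BenchFuncConfig(loglevel="INFO", memory_snapshot=True)
run_option.set_log_level()  # applies INFO to the root logger

# Assemble kwargs for benchmark_func; extra entries override the config's values.
func_kwargs = run_option.benchmark_func_kwargs(name="my_pipeline")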

torchrec/distributed/benchmark/benchmark_train_pipeline.py

Lines changed: 1 addition & 1 deletion
@@ -129,7 +129,7 @@ def runner(
        torch.cuda.is_available() and torch.cuda.device_count() >= world_size
    ), "CUDA not available or insufficient GPUs for the requested world_size"

-    torch.autograd.set_detect_anomaly(True)
+    run_option.set_log_level()
    with MultiProcessContext(
        rank=rank,
        world_size=world_size,
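runner executes inside each spawned benchmark process, so calling run_option.set_log_level() there applies the configured level in every worker, not just the parent; with a spawn-style start, logging configuration from the parent is not inherited. The always-on torch.autograd.set_detect_anomaly(True), a debugging aid with noticeable overhead, is dropped at the same time. A toy illustration of per-process log setup with torch.multiprocessing (not the torchrec runner):

import logging

import torch.multiprocessing as mp


def worker(rank: int, loglevel: str) -> None:
    # With the spawn start method each worker is a fresh interpreter,
    # so logging has to be configured here rather than in the parent.
    logging.basicConfig()
    logging.root.setLevel(logging._nameToLevel[loglevel.upper()])
    logging.getLogger(__name__).info("rank %d: logging configured", rank)


if __name__ == "__main__":
    mp.spawn(worker, args=("INFO",), nprocs=2, join=True)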
