
Commit 4223f69

Refactor tensor handling in __to_tensor method to optimize device management for scalar metrics.
Signed-off-by: Wil Kong <[email protected]>
Parent: 90f7c03


src/lightning/pytorch/core/module.py

Lines changed: 14 additions & 5 deletions
```diff
@@ -656,11 +656,20 @@ def __check_allowed(v: Any, name: str, value: Any) -> None:
         raise ValueError(f"`self.log({name}, {value})` was called, but `{type(v).__name__}` values cannot be logged")
 
     def __to_tensor(self, value: Union[Tensor, numbers.Number], name: str) -> Tensor:
-        value = (
-            value.clone().detach()
-            if isinstance(value, Tensor)
-            else torch.tensor(value, device=self.device, dtype=_get_default_dtype())
-        )
+        if isinstance(value, Tensor):
+            # Keep the tensor on its original device to avoid unnecessary transfers.
+            value = value.clone().detach()
+        else:
+            if self.device.type == "cuda":
+                # Place scalar metrics on CPU to avoid a CPU-GPU transfer and synchronization:
+                # `torch.tensor(value, device="cuda")` triggers such a synchronization, while the
+                # metric itself is only used on the CPU side, so keeping scalars on CPU is more efficient.
+                device = "cpu"
+            else:
+                # For non-CUDA devices, keep the original behavior.
+                device = self.device
+            value = torch.tensor(value, device=device, dtype=_get_default_dtype())
+
         if not torch.numel(value) == 1:
             raise ValueError(
                 f"`self.log({name}, {value})` was called, but the tensor must have a single element."
```
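For illustration, the new dispatch can be reduced to a minimal standalone sketch (my own, not part of the commit; `to_tensor` and the `device` argument stand in for the private method and `self.device`, and the real code additionally passes `dtype=_get_default_dtype()`):

```python
import numbers
from typing import Union

import torch
from torch import Tensor


def to_tensor(value: Union[Tensor, numbers.Number], device: torch.device) -> Tensor:
    """Sketch mirroring the patched __to_tensor dispatch."""
    if isinstance(value, Tensor):
        # Tensor inputs are cloned/detached in place on their current device.
        return value.clone().detach()
    # Python scalars logged from a CUDA module are materialized on CPU to
    # skip the host-to-device copy; other accelerators keep the old behavior.
    target = "cpu" if device.type == "cuda" else device
    return torch.tensor(value, device=target)


# A scalar logged on a CUDA module now yields a CPU tensor:
print(to_tensor(0.123, torch.device("cuda")).device)  # cpu
print(to_tensor(1, torch.device("cpu")).device)       # cpu
```

The asymmetry is deliberate: tensor inputs may already live on the GPU, so moving them would itself cost a transfer, while a Python scalar has no device yet and can be placed on CPU for free.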

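The synchronization claim in the diff's comment can be sanity-checked with a rough timing sketch (again my own, assuming a CUDA-capable machine; absolute numbers vary by hardware):

```python
import time

import torch


def bench(device: str, iters: int = 10_000) -> float:
    """Time creating `iters` scalar tensors on `device`."""
    if device == "cuda":
        torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(iters):
        torch.tensor(0.123, device=device)
    if device == "cuda":
        torch.cuda.synchronize()
    return time.perf_counter() - start


if torch.cuda.is_available():
    print(f"cpu : {bench('cpu'):.3f}s")
    # Typically far slower per call: each construction copies the scalar
    # host-to-device and blocks on that copy.
    print(f"cuda: {bench('cuda'):.3f}s")
else:
    print("CUDA not available; only the CPU path applies.")
```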