Commit 8968543

feat: support low cpu mem usage in spmd-fsdp

1 parent 63e20fb commit 8968543

File tree

1 file changed: +8 −0 lines changed

torch_xla/experimental/spmd_fully_sharded_data_parallel.py (+8)

@@ -11,6 +11,7 @@
 import torch_xla.core.xla_model as xm
 import torch_xla.distributed.spmd as spmd
 from torch_xla.distributed.fsdp.wrap import recursive_wrap
+from torch_xla.distributed.fsdp._init_utils import _materialize_module
 
 
 def _prepare_spmd_partition_spec(param):
@@ -95,6 +96,13 @@ def __init__(
       )
       self._auto_wrap(auto_wrap_kwargs, fsdp_kwargs)
 
+    _materialize_module(
+        module,
+        None,
+        [],
+        deferred_init_check_fn=lambda k: not isinstance(
+            k, SpmdFullyShardedDataParallel))
+
     # Let's move the module to xla device in case it's not moved
     # by the caller already.
     self._orig_module = module.to(xm.xla_device())
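
The added _materialize_module call materializes deferred-init (e.g. meta-device) submodules right after auto-wrapping, and the deferred_init_check_fn lambda skips children that are already wrapped in SpmdFullyShardedDataParallel so they are not re-initialized. Below is a minimal sketch of how the low-CPU-memory path could be used from user code; the two-layer model, the mesh shape, and the mesh= keyword argument are illustrative assumptions rather than part of this commit.

# Illustrative sketch only: the model, mesh shape, and `mesh=` kwarg are assumptions.
import numpy as np
import torch
import torch.nn as nn
import torch_xla.runtime as xr
import torch_xla.distributed.spmd as xs
from torch_xla.experimental.spmd_fully_sharded_data_parallel import (
    SpmdFullyShardedDataParallel as FSDPv2)

xr.use_spmd()

# Build the model on the meta device so no parameter storage is allocated
# in host CPU memory up front.
with torch.device('meta'):
  model = nn.Sequential(nn.Linear(4096, 4096), nn.Linear(4096, 4096))

num_devices = xr.global_runtime_device_count()
mesh = xs.Mesh(np.arange(num_devices), (num_devices, 1), ('fsdp', 'model'))

# With this change, wrapping materializes the meta-device parameters
# (via _materialize_module) before the module is moved to the XLA device,
# so the full weights never have to be resident on the host.
model = FSDPv2(model, mesh=mesh)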
