add prefetch

peterfu0 · facebook-github-bot · commit 6af25a25c1d6 · 2025-09-03T13:35:58.000-07:00
Differential Revision: D79404930
diff --git a/torchrec/distributed/train_pipeline/runtime_forwards.py b/torchrec/distributed/train_pipeline/runtime_forwards.py
@@ -58,6 +58,10 @@ def name(self) -> str:
     def args(self) -> CallArgs:
         return self._args
 
+    @classmethod
+    def prefetch(cls) -> bool:
+        return False  # default: no prefetch; forwards that prefetch override this to return True
+
     def set_context(self, context: TForwardContext) -> None:
         self._context = context
 
@@ -220,6 +224,72 @@ def detach_embeddings(
         pass
 
 
+class PrefetchPipelinedForwardCustomizedOrder(
+    BaseForward[PrefetchTrainPipelineContext]
+):
+    """
+    Forward for TrainPipelineCustomizedOrderSparseDist: compute_and_output_dist for
+    batch N is called at the end of step N - 1 and __call__ returns its awaitable.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        args: CallArgs,
+        module: ShardedModule,
+        context: PrefetchTrainPipelineContext,
+        prefetch_stream: Optional[torch.Stream] = None,
+    ) -> None:
+        super().__init__(
+            name=name,
+            args=args,
+            module=module,
+            context=context,
+            stream=prefetch_stream,
+        )
+        self._compute_and_output_dist_awaitable: Optional[
+            Awaitable[Multistreamable]
+        ] = None
+
+    @classmethod
+    def prefetch(cls) -> bool:
+        return True
+
+    def compute_and_output_dist(self) -> None:
+        assert (
+            self._name in self._context.module_input_post_prefetch
+        ), "Invalid PrefetchPipelinedForward usage, please do not directly call model.forward()"
+        data = self._context.module_input_post_prefetch.pop(self._name)
+        ctx = self._context.module_contexts_post_prefetch.pop(self._name)
+
+        # Make sure that both result of input_dist and context
+        # are properly transferred to the current stream.
+        if self._stream is not None:
+            torch.get_device_module(self._device).current_stream().wait_stream(
+                self._stream
+            )
+            cur_stream = torch.get_device_module(self._device).current_stream()
+
+            assert isinstance(
+                data, (torch.Tensor, Multistreamable)
+            ), f"{type(data)} must implement Multistreamable interface"
+            data.record_stream(cur_stream)
+
+            ctx.record_stream(cur_stream)
+
+        self._compute_and_output_dist_awaitable = self._module.compute_and_output_dist(
+            ctx, data
+        )
+
+    # pyre-ignore [2, 24]
+    def __call__(self, *input, **kwargs) -> Awaitable:
+        if self._compute_and_output_dist_awaitable is None:  # do not rely on truthiness
+            raise RuntimeError(
+                "compute_and_output_dist must be called before __call__",
+            )
+        return self._compute_and_output_dist_awaitable
+
+
 class PrefetchPipelinedForward(BaseForward[PrefetchTrainPipelineContext]):
     """
     This pipeline is used in PrefetchTrainPipelineSparseDist
@@ -241,6 +311,10 @@ def __init__(
             stream=prefetch_stream,
         )
 
+    @classmethod
+    def prefetch(cls) -> bool:
+        return True  # marks this forward as prefetch-enabled
+
     # pyre-ignore [2, 24]
     def __call__(self, *input, **kwargs) -> Awaitable:
         assert (
diff --git a/torchrec/distributed/train_pipeline/train_pipelines.py b/torchrec/distributed/train_pipeline/train_pipelines.py
@@ -583,6 +583,7 @@ def enqueue_batch(self, dataloader_iter: Iterator[In]) -> bool:
         batch, context = self.copy_batch_to_gpu(dataloader_iter)
         if batch is None:
             return False
+
         self.batches.append(batch)
         # pyre-ignore [6]
         self.contexts.append(context)
@@ -732,6 +733,7 @@ def _pipeline_model(
         batch: Optional[In],
         context: TrainPipelineContext,
         pipelined_forward: Type[PipelinedForward] = PipelinedForward,
+        prefetch_stream: Optional[torch.Stream] = None,
     ) -> None:
         (
             self._pipelined_modules,
@@ -742,7 +744,9 @@ def _pipeline_model(
         ) = _rewrite_model(
             model=self._model,
             context=context,
-            dist_stream=self._data_dist_stream,
+            dist_stream=(
+                self._data_dist_stream if prefetch_stream is None else prefetch_stream
+            ),
             default_stream=torch.get_device_module(self._device).current_stream(),
             batch=batch,
             apply_jit=self._apply_jit,
@@ -845,9 +849,11 @@ def wait_sparse_data_dist(self, context: TrainPipelineContext) -> None:
         """
         with record_function(f"## wait_sparse_data_dist {context.index} ##"):
             with self._stream_context(self._data_dist_stream):
+                # fused_splits_awaitables is empty
                 for names, awaitable in context.fused_splits_awaitables:
                     for name, request in zip(names, awaitable.wait()):
                         context.input_dist_tensors_requests[name] = request
+
         context.input_dist_splits_requests.clear()
         context.fused_splits_awaitables.clear()
 
@@ -1495,6 +1501,10 @@ def _fill_pipeline(self, dataloader_iter: Iterator[In]) -> None:
         self._batch_ip1 = self._copy_batch_to_gpu(dataloader_iter)
         self._start_sparse_data_dist(self._batch_ip1)
 
+    # i: prefetch is done
+    # ip1: input_dist is done, still needs prefetch
+    # ip2: does not exist yet; needs copy to GPU, then start input_dist
+    # TODO: consider ip2' (memcpy done, needs input_dist) and ip3' (does not exist yet, needs memcpy)
     def progress(self, dataloader_iter: Iterator[In]) -> Out:
         self._fill_pipeline(dataloader_iter)
 
@@ -1507,12 +1517,12 @@ def progress(self, dataloader_iter: Iterator[In]) -> Out:
 
         self._batch_ip2 = self._copy_batch_to_gpu(dataloader_iter)
 
-        self._wait_sparse_data_dist()
+        self._wait_sparse_data_dist()  # it waits for both i and ip1, as ip1(ip2 in previous round) started
         # forward
         with record_function("## forward ##"):
             losses, output = self._model_fwd(self._batch_i)
 
-        self._prefetch(self._batch_ip1)
+        self._prefetch(self._batch_ip1)  # prefetch 1
 
         if self._model.training:
             # backward
diff --git a/torchrec/distributed/train_pipeline/utils.py b/torchrec/distributed/train_pipeline/utils.py
@@ -29,7 +29,6 @@
 
 import torch
 from torch.profiler import record_function
-
 from torchrec.distributed.dist_data import KJTAllToAll, KJTAllToAllTensorsAwaitable
 from torchrec.distributed.embedding_sharding import (
     FusedKJTListSplitsAwaitable,
@@ -53,6 +52,7 @@
     KJTAllToAllForward,
     PipelinedForward,
     PrefetchPipelinedForward,
+    PrefetchPipelinedForwardCustomizedOrder,
     TForwardContext,
 )
 from torchrec.distributed.train_pipeline.tracing import (
@@ -61,7 +61,7 @@
     Tracer,
 )
 from torchrec.distributed.train_pipeline.types import CallArgs  # noqa
-from torchrec.distributed.types import Awaitable
+from torchrec.distributed.types import Awaitable, LazyAwaitable
 from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
 from torchrec.streamable import Multistreamable, Pipelineable
 
@@ -138,6 +138,7 @@ def _start_data_dist(
                 PrefetchPipelinedForward,
                 EmbeddingPipelinedForward,
                 InSyncEmbeddingPipelinedForward,
+                PrefetchPipelinedForwardCustomizedOrder,
             ),
         )
 
@@ -539,6 +540,10 @@ def get_next_batch(self, none_throws: bool = False) -> Optional[In]:
         return batch
 
 
+def _prefetch_enabled(forward: LazyAwaitable[Out]) -> bool:
+    return isinstance(forward, BaseForward) and forward.prefetch()  # call it: a bound method is always truthy
+
+
 def _prefetch_embeddings(
     batch: In,
     context: PrefetchTrainPipelineContext,
@@ -551,7 +556,11 @@ def _prefetch_embeddings(
     data_per_sharded_module = {}
     for sharded_module in pipelined_modules:
         forward = sharded_module.forward
-        assert isinstance(forward, PrefetchPipelinedForward)
+        # for backward compatibility, consider it valid if it is PrefetchPipelinedForward
+        # because the class might not have prefetch method
+        assert isinstance(forward, PrefetchPipelinedForward) or _prefetch_enabled(
+            forward
+        )
         assert forward._name in context.input_dist_tensors_requests
         request = context.input_dist_tensors_requests.pop(forward._name)
         assert isinstance(request, Awaitable)