
Commit 15c3d7a

iamzainhuda authored and facebook-github-bot committed
initialize both output dists for VBE in TW/TWRW (#3378)
Summary:
Pull Request resolved: #3378

There are rare cases when using VBE where one of the KJTs has the same batch size across all of its features. Such a KJT is not recognized as VBE at KJT init, which can cause issues in the forward pass. We now initialize both output dist comms to support this.

Differential Revision: D82478607

fbshipit-source-id: f91e7d1724ae09ff202b4b698a7fc0eedf177e43
1 parent 2b00007 commit 15c3d7a
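
The core pattern of the fix, sketched below as a minimal, self-contained example: keep separate attributes for the fixed-batch and variable-batch output dist modules, always create the fixed-batch one, and dispatch on the sharding context in the forward pass. The names used here (PooledDistStub, VariableDistStub, Ctx, OutputDistSketch) are illustrative stand-ins, not torchrec classes.

from typing import Optional


class PooledDistStub:
    # Stand-in for the fixed-batch-size all-to-all comm module.
    def __call__(self, embs: str) -> str:
        return f"pooled({embs})"


class VariableDistStub:
    # Stand-in for the variable-batch (VBE) all-to-all comm module.
    def __call__(self, embs: str) -> str:
        return f"variable({embs})"


class Ctx:
    # Stand-in for EmbeddingShardingContext.
    def __init__(self, variable_batch_per_feature: bool) -> None:
        self.variable_batch_per_feature = variable_batch_per_feature


class OutputDistSketch:
    def __init__(self) -> None:
        # Two separate fields instead of one Union-typed self._dist.
        self._dist: Optional[PooledDistStub] = None
        self._variable_dist: Optional[VariableDistStub] = None

    def create_output_dist(self, ctx: Optional[Ctx] = None) -> None:
        if ctx is not None and ctx.variable_batch_per_feature:
            self._variable_dist = VariableDistStub()
        # The fixed-batch dist is created unconditionally (no `else`), so a
        # KJT that was not recognized as VBE at init still has a usable
        # output dist in the forward pass.
        self._dist = PooledDistStub()

    def forward(self, embs: str, ctx: Optional[Ctx] = None) -> str:
        if ctx is not None and ctx.variable_batch_per_feature:
            assert self._variable_dist is not None, "variable batch dist is not initialized!"
            return self._variable_dist(embs)
        assert self._dist is not None
        return self._dist(embs)


dist = OutputDistSketch()
dist.create_output_dist(Ctx(variable_batch_per_feature=True))
print(dist.forward("embs", Ctx(True)))  # variable(embs)
print(dist.forward("embs"))             # pooled(embs) -- still initialized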

File tree

2 files changed: +37, -39 lines


torchrec/distributed/sharding/tw_sharding.py

Lines changed: 14 additions & 13 deletions
@@ -345,9 +345,8 @@ def __init__(
             else None
         )
         self._emb_dim_per_rank_per_feature = emb_dim_per_rank_per_feature
-        self._dist: Optional[
-            Union[PooledEmbeddingsAllToAll, VariableBatchPooledEmbeddingsAllToAll]
-        ] = None
+        self._dist: Optional[PooledEmbeddingsAllToAll] = None
+        self._variable_dist: Optional[VariableBatchPooledEmbeddingsAllToAll] = None

     def forward(
         self,
@@ -371,7 +370,10 @@ def forward(
         if sharding_ctx is None:
             return cast(PooledEmbeddingsAllToAll, self._dist)(local_embs)
         elif sharding_ctx.variable_batch_per_feature:
-            return cast(VariableBatchPooledEmbeddingsAllToAll, self._dist)(
+            assert (
+                self._variable_dist is not None
+            ), "variable batch dist is not initialized!"
+            return self._variable_dist(
                 local_embs,
                 batch_size_per_rank_per_feature=sharding_ctx.batch_size_per_rank_per_feature,
                 batch_size_per_feature_pre_a2a=sharding_ctx.batch_size_per_feature_pre_a2a,
@@ -386,21 +388,20 @@ def _create_output_dist_module(
         self, sharding_ctx: Optional[EmbeddingShardingContext] = None
     ) -> None:
         if sharding_ctx is not None and sharding_ctx.variable_batch_per_feature:
-            self._dist = VariableBatchPooledEmbeddingsAllToAll(
+            self._variable_dist = VariableBatchPooledEmbeddingsAllToAll(
                 pg=self._pg,
                 emb_dim_per_rank_per_feature=self._emb_dim_per_rank_per_feature,
                 device=self._device,
                 callbacks=None,
                 codecs=self._codecs,
             )
-        else:
-            self._dist = PooledEmbeddingsAllToAll(
-                pg=self._pg,
-                dim_sum_per_rank=self._dim_sum_per_rank,
-                device=self._device,
-                callbacks=self._callbacks,
-                codecs=self._codecs,
-            )
+        self._dist = PooledEmbeddingsAllToAll(
+            pg=self._pg,
+            dim_sum_per_rank=self._dim_sum_per_rank,
+            device=self._device,
+            callbacks=self._callbacks,
+            codecs=self._codecs,
+        )


 class TwPooledEmbeddingSharding(
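
One consequence of splitting the Union-typed self._dist into two Optional fields, as the diff above does, is that the variable-batch call site no longer needs typing.cast: asserting the dedicated attribute is not None narrows its type and also fails loudly if the module was never created. A small hedged illustration follows, with hypothetical classes A and B standing in for the two comm modules (not torchrec code):

from typing import Optional, Union, cast


class A:
    def run(self) -> str:
        return "fixed batch"


class B:
    def run(self) -> str:
        return "variable batch"


class Before:
    def __init__(self) -> None:
        self._dist: Optional[Union[A, B]] = None

    def forward_variable(self) -> str:
        # Old pattern: one attribute holds either type, so the call site
        # needs a cast to tell the type checker which one it expects
        # (and it would fail with AttributeError if _dist was never set).
        return cast(B, self._dist).run()


class After:
    def __init__(self) -> None:
        self._dist: Optional[A] = None
        self._variable_dist: Optional[B] = None

    def forward_variable(self) -> str:
        # New pattern: the assert narrows Optional[B] to B, no cast needed,
        # and it raises a clear error if the variable-batch dist is missing.
        assert self._variable_dist is not None, "variable batch dist is not initialized!"
        return self._variable_dist.run()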

torchrec/distributed/sharding/twrw_sharding.py

Lines changed: 23 additions & 26 deletions
@@ -472,18 +472,14 @@ def __init__(
             if qcomm_codecs_registry
             else None
         )
-        self._intra_dist: Optional[
-            Union[
-                PooledEmbeddingsReduceScatter,
-                VariableBatchPooledEmbeddingsReduceScatter,
-            ]
-        ] = None
-        self._cross_dist: Optional[
-            Union[
-                PooledEmbeddingsAllToAll,
-                VariableBatchPooledEmbeddingsAllToAll,
-            ]
+        self._intra_dist: Optional[PooledEmbeddingsReduceScatter] = None
+        self._cross_dist: Optional[PooledEmbeddingsAllToAll] = None
+        self._variable_intra_dist: Optional[
+            VariableBatchPooledEmbeddingsReduceScatter
         ] = None
+        self._variable_cross_dist: Optional[VariableBatchPooledEmbeddingsAllToAll] = (
+            None
+        )

     def forward(
         self,
@@ -514,13 +510,15 @@ def forward(
                 sharding_ctx.batch_size_per_rank_per_feature,
             )
             rs_result = cast(
-                VariableBatchPooledEmbeddingsReduceScatter, self._intra_dist
+                VariableBatchPooledEmbeddingsReduceScatter, self._variable_intra_dist
             )(
                 local_embs,
                 batch_size_per_rank_per_feature=batch_size_per_feature_sum_by_cross_group,
                 embedding_dims=self._emb_dim_per_node_per_feature[current_node],
             ).wait()
-            return cast(VariableBatchPooledEmbeddingsAllToAll, self._cross_dist)(
+            return cast(
+                VariableBatchPooledEmbeddingsAllToAll, self._variable_cross_dist
+            )(
                 rs_result,
                 batch_size_per_rank_per_feature=batch_size_per_rank_per_feature_by_cross_group[
                     local_rank
@@ -615,28 +613,27 @@ def _create_output_dist_modules(
         self, sharding_ctx: Optional[EmbeddingShardingContext] = None
     ) -> None:
         if sharding_ctx is not None and sharding_ctx.variable_batch_per_feature:
-            self._intra_dist = VariableBatchPooledEmbeddingsReduceScatter(
+            self._variable_intra_dist = VariableBatchPooledEmbeddingsReduceScatter(
                 pg=self._intra_pg,
                 codecs=self._intra_codecs,
             )
-            self._cross_dist = VariableBatchPooledEmbeddingsAllToAll(
+            self._variable_cross_dist = VariableBatchPooledEmbeddingsAllToAll(
                 pg=self._cross_pg,
                 emb_dim_per_rank_per_feature=self._emb_dim_per_node_per_feature,
                 device=self._device,
                 callbacks=None,  # don't pass permute callback, handle in LazyAwaitable
                 codecs=self._cross_codecs,
             )
-        else:
-            self._intra_dist = PooledEmbeddingsReduceScatter(
-                pg=self._intra_pg,
-                codecs=self._intra_codecs,
-            )
-            self._cross_dist = PooledEmbeddingsAllToAll(
-                pg=self._cross_pg,
-                dim_sum_per_rank=self._dim_sum_per_node,
-                device=self._device,
-                codecs=self._cross_codecs,
-            )
+        self._intra_dist = PooledEmbeddingsReduceScatter(
+            pg=self._intra_pg,
+            codecs=self._intra_codecs,
+        )
+        self._cross_dist = PooledEmbeddingsAllToAll(
+            pg=self._cross_pg,
+            dim_sum_per_rank=self._dim_sum_per_node,
+            device=self._device,
+            codecs=self._cross_codecs,
+        )


 class TwRwPooledEmbeddingSharding(
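
The TWRW output dist is two-stage: an intra-node reduce-scatter whose result feeds a cross-node all-to-all, and the diff above gives each stage its own fixed-batch and variable-batch field. Below is a minimal sketch of that composition under stated assumptions; ReduceScatterStub, AllToAllStub, and Ctx are illustrative stand-ins rather than torchrec classes (the real comm modules return awaitables that are .wait()-ed).

from typing import Optional


class ReduceScatterStub:
    # Stand-in for the intra-node reduce-scatter stage.
    def __call__(self, embs: str) -> str:
        return f"rs({embs})"


class AllToAllStub:
    # Stand-in for the cross-node all-to-all stage.
    def __call__(self, embs: str) -> str:
        return f"a2a({embs})"


class Ctx:
    # Stand-in for EmbeddingShardingContext.
    def __init__(self, variable_batch_per_feature: bool) -> None:
        self.variable_batch_per_feature = variable_batch_per_feature


class TwRwOutputDistSketch:
    def __init__(self) -> None:
        # Four fields instead of two Union-typed ones, mirroring the diff.
        self._intra_dist: Optional[ReduceScatterStub] = None
        self._cross_dist: Optional[AllToAllStub] = None
        self._variable_intra_dist: Optional[ReduceScatterStub] = None
        self._variable_cross_dist: Optional[AllToAllStub] = None

    def create_output_dists(self, ctx: Optional[Ctx] = None) -> None:
        if ctx is not None and ctx.variable_batch_per_feature:
            self._variable_intra_dist = ReduceScatterStub()
            self._variable_cross_dist = AllToAllStub()
        # Fixed-batch modules are always created, no longer behind `else`.
        self._intra_dist = ReduceScatterStub()
        self._cross_dist = AllToAllStub()

    def forward(self, embs: str, ctx: Optional[Ctx] = None) -> str:
        if ctx is not None and ctx.variable_batch_per_feature:
            assert self._variable_intra_dist is not None
            assert self._variable_cross_dist is not None
            return self._variable_cross_dist(self._variable_intra_dist(embs))
        assert self._intra_dist is not None and self._cross_dist is not None
        return self._cross_dist(self._intra_dist(embs))


sketch = TwRwOutputDistSketch()
sketch.create_output_dists(Ctx(variable_batch_per_feature=True))
print(sketch.forward("embs", Ctx(True)))  # a2a(rs(embs)) via variable-batch path
print(sketch.forward("embs"))             # a2a(rs(embs)) via fixed-batch path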
