Skip to content

Commit ee2355e

Browse files
Fei Yu authored and facebook-github-bot committed
directly pass update_util as int flag without syncing iter (#3293)
Summary: Pull Request resolved: #3293 As the title states, this issue continues to exist in the most recent ITEP experiments when we only apply ITEP on the baseline without changing the batch size and/or the number of trainers. From recent MAI results in f777920760, we see about a 3.5% QPS gap with ITEP enabled (393 vs 403 P90). Issues are visible in the trace https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree%2Ftraces%2Fdynocli%2Faps-aps-mai_to_flow-777920760-f777930060%2F0%2Frank-0.Aug_11_01_48_39.4443.pt.trace.json.gz&bucket=aps_traces {F1981311699} Reviewed By: doIIarplus Differential Revision: D67302872 fbshipit-source-id: dbb33a76e44f6dbeb62a80b19d4e7d97286ebec4
1 parent 3f8fdbb commit ee2355e

File tree

4 files changed

+456
-21
lines changed

4 files changed

+456
-21
lines changed

torchrec/distributed/itep_embeddingbag.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,10 @@ def compute(
177177
ctx: ITEPEmbeddingBagCollectionContext,
178178
dist_input: KJTList,
179179
) -> List[torch.Tensor]:
180+
# We need to explicitly move iter to CPU since it might be moved to GPU
181+
# after __init__. This should be done once.
182+
self._iter = self._iter.cpu()
183+
180184
if not ctx.is_reindexed:
181185
dist_input = self._reindex(dist_input)
182186
ctx.is_reindexed = True
@@ -196,6 +200,10 @@ def output_dist(
196200
def compute_and_output_dist(
197201
self, ctx: ITEPEmbeddingBagCollectionContext, input: KJTList
198202
) -> LazyAwaitable[KeyedTensor]:
203+
# We need to explicitly move iter to CPU since it might be moved to GPU
204+
# after __init__. This should be done once.
205+
self._iter = self._iter.cpu()
206+
199207
# Insert forward() function of GenericITEPModule into compute_and_output_dist()
200208
for i, (sharding, features) in enumerate(
201209
zip(
@@ -424,6 +432,10 @@ def compute(
424432
ctx: ITEPEmbeddingCollectionContext,
425433
dist_input: KJTList,
426434
) -> List[torch.Tensor]:
435+
# We need to explicitly move iter to CPU since it might be moved to GPU
436+
# after __init__. This should be done once.
437+
self._iter = self._iter.cpu()
438+
427439
for i, (sharding, features) in enumerate(
428440
zip(
429441
self._embedding_collection._sharding_type_to_sharding.keys(),
@@ -450,6 +462,10 @@ def output_dist(
450462
def compute_and_output_dist(
451463
self, ctx: ITEPEmbeddingCollectionContext, input: KJTList
452464
) -> LazyAwaitable[Dict[str, JaggedTensor]]:
465+
# We need to explicitly move iter to CPU since it might be moved to GPU
466+
# after __init__. This should be done once.
467+
self._iter = self._iter.cpu()
468+
453469
# Insert forward() function of GenericITEPModule into compute_and_output_dist()
454470
""" """
455471
for i, (sharding, features) in enumerate(

torchrec/modules/itep_embedding_modules.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def forward(
7979

8080
features = self._itep_module(features, self._iter.item())
8181
pooled_embeddings = self._embedding_bag_collection(features)
82+
8283
self._iter += 1
8384

8485
return pooled_embeddings

torchrec/modules/itep_modules.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -514,13 +514,13 @@ def forward(
514514
feature_offsets,
515515
) = self.get_remap_info(sparse_features)
516516

517-
update_utils: bool = (
517+
update_util: bool = (
518518
(cur_iter < 10)
519519
or (cur_iter < 100 and (cur_iter + 1) % 19 == 0)
520520
or ((cur_iter + 1) % 39 == 0)
521521
)
522522
full_values_list = None
523-
if update_utils and sparse_features.variable_stride_per_key():
523+
if update_util and sparse_features.variable_stride_per_key():
524524
if sparse_features.inverse_indices_or_none() is not None:
525525
# full util update mode require reconstructing original input indicies from VBE input
526526
full_values_list = self.get_full_values_list(sparse_features)
@@ -531,7 +531,7 @@ def forward(
531531
)
532532

533533
remapped_values = torch.ops.fbgemm.remap_indices_update_utils(
534-
cur_iter,
534+
int(cur_iter),
535535
buffer_idx,
536536
feature_lengths,
537537
feature_offsets,
@@ -540,6 +540,7 @@ def forward(
540540
self.row_util,
541541
self.buffer_offsets,
542542
full_values_list=full_values_list,
543+
update_util=update_util,
543544
)
544545

545546
sparse_features._values = remapped_values

0 commit comments

Comments
 (0)