
Commit 73f0d7f

TroyGarden authored and facebook-github-bot committed
add variable batch_size for training (#3388)
Summary:
Pull Request resolved: #3388
Pull Request resolved: #3387

# context

* APS uses a "variable batch size" during training, e.g., a smaller `batch_size` (like 32) to warm up, then a larger `batch_size` (like 64) for the rest of training:

```
batch_size_schedule:
  - batch_size: 32
    max_iters: 5
  - batch_size: 64
    max_iters: 999999999
```

* However, this becomes a problem for torch.export (PT2 IR), because the exported program assumes `batch_size` is constant.

NOTE: this "variable batch" concept is fundamentally different from "variable length" (VLE/VBE).

* In the variable batch scenario, each feature in the KJT shares the same `batch_size` within one batch/training iteration (it can only vary in a later iteration), so it follows the correlation `batch_size = len(kjt._lengths) // len(kjt._keys)`, and `kjt.stride()` returns the `batch_size` computed from `_lengths` and `_keys`.
* In the variable length scenario, each feature in the KJT can have a different `batch_size` within the same batch/training iteration, so there is no such correlation between `_lengths`, `_keys`, and `batch_size`.
* So this "variable batch size" **CANNOT** simply be handled by marking all input KJTs as variable length; instead, `batch_size` has to be marked as a dynamic shape via the `mark_dynamic_kjt` util function.

WARNING: it is the user's responsibility to make sure `variable_batch` is only used when `variable_length` is set to `False`; otherwise it will cause unexpected behavior with the dynamic shapes in torch.export.

Reviewed By: spmex, malaybag

Differential Revision: D82792378
1 parent c9ec08f commit 73f0d7f
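For orientation, here is a minimal end-to-end sketch of the flow this commit enables, condensed from the new test added in test_serializer.py below. The toy `Model` wrapper, table configs, and import paths are assumptions standing in for the test harness's `generate_model()`; treat this as an illustration, not the canonical API usage.

```
import torch
from torch import nn

from torchrec.ir.serializer import JsonSerializer
from torchrec.ir.utils import (
    decapsulate_ir_modules,
    encapsulate_ir_modules,
    mark_dynamic_kjt,
)
from torchrec.modules.embedding_configs import EmbeddingBagConfig
from torchrec.modules.embedding_modules import EmbeddingBagCollection
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor


class Model(nn.Module):
    """Toy EBC wrapper; an assumed stand-in for generate_model() in the test."""

    def __init__(self) -> None:
        super().__init__()
        self.ebc = EmbeddingBagCollection(
            tables=[
                EmbeddingBagConfig(
                    name=f"t_{f}",
                    embedding_dim=4,
                    num_embeddings=10,
                    feature_names=[f],
                )
                for f in ["f1", "f2", "f3"]
            ]
        )

    def forward(self, kjt: KeyedJaggedTensor) -> torch.Tensor:
        return self.ebc(kjt).values()


model = Model()

# Every key shares one batch size (2 here), so len(lengths) == len(keys) * batch_size.
feature = KeyedJaggedTensor.from_offsets_sync(
    keys=["f1", "f2", "f3"],
    values=torch.tensor([0, 1, 2, 3, 2, 3]),
    offsets=torch.tensor([0, 2, 2, 3, 4, 5, 6]),  # batch size = 2
)

# Mark only the batch size as dynamic; variable_length stays False.
collection = mark_dynamic_kjt(feature, variable_batch=True)

# Swap serializable sparse modules for their IR stand-ins, then export.
model, sparse_fqns = encapsulate_ir_modules(model, JsonSerializer)
ep = torch.export.export(
    model,
    (feature,),
    {},
    dynamic_shapes=collection.dynamic_shapes(model, (feature,)),
    strict=False,
    preserve_module_call_signature=tuple(sparse_fqns),
)

# The exported program accepts a KJT with a different batch size (3 here).
feature_b3 = KeyedJaggedTensor.from_offsets_sync(
    keys=["f1", "f2", "f3"],
    values=torch.tensor([0, 1, 2, 3, 2, 3, 4, 5, 6]),
    offsets=torch.tensor([0, 2, 2, 3, 4, 5, 7, 8, 8, 9]),  # batch size = 3
)
out = ep.module()(feature_b3)

# Round-trip back to eager modules, as the test does.
deserialized_model = decapsulate_ir_modules(torch.export.unflatten(ep), JsonSerializer)
deserialized_model.load_state_dict(model.state_dict())
```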

2 files changed (+84, -6 lines)

torchrec/ir/tests/test_serializer.py

Lines changed: 59 additions & 5 deletions
@@ -297,7 +297,7 @@ def test_serialize_deserialize_ebc(self) -> None:
         self.assertEqual(len(deserialized_out), len(eager_out))
         for deserialized, orginal in zip(deserialized_out, eager_out):
             self.assertEqual(deserialized.shape, orginal.shape)
-            self.assertTrue(torch.allclose(deserialized, orginal))
+            torch.testing.assert_close(deserialized, orginal)

     def test_serialize_deserialize_ebc_with_vbe_kjt(self) -> None:
         model = self.generate_model_for_vbe_kjt()
@@ -374,14 +374,14 @@ def test_serialize_deserialize_ebc_with_vbe_kjt(self) -> None:
         self.assertEqual(len(deserialized_out), len(eager_out))
         for deserialized, orginal in zip(deserialized_out, eager_out):
             self.assertEqual(deserialized.shape, orginal.shape)
-            self.assertTrue(torch.allclose(deserialized, orginal))
+            torch.testing.assert_close(deserialized, orginal)

         deserialized_out_2 = deserialized_model(kjt_2)

         self.assertEqual(len(deserialized_out_2), len(eager_out_2))
         for deserialized, orginal in zip(deserialized_out_2, eager_out_2):
             self.assertEqual(deserialized.shape, orginal.shape)
-            self.assertTrue(torch.allclose(deserialized, orginal))
+            torch.testing.assert_close(deserialized, orginal)

     def test_dynamic_shape_ebc_disabled_in_oss_compatibility(self) -> None:
         model = self.generate_model()
@@ -428,7 +428,61 @@ def test_dynamic_shape_ebc_disabled_in_oss_compatibility(self) -> None:

         for i, tensor in enumerate(deserialized_out):
             self.assertEqual(eager_out[i].shape, tensor.shape)
-            assert torch.allclose(eager_out[i], tensor)
+            torch.testing.assert_close(eager_out[i], tensor)
+
+    def test_variable_batch_size_ebc_disabled_in_oss_compatibility(self) -> None:
+        model = self.generate_model()
+        feature1 = KeyedJaggedTensor.from_offsets_sync(
+            keys=["f1", "f2", "f3"],
+            values=torch.tensor([0, 1, 2, 3, 2, 3]),
+            offsets=torch.tensor([0, 2, 2, 3, 4, 5, 6]),  # batch size = 2
+        )
+
+        feature2 = KeyedJaggedTensor.from_offsets_sync(
+            keys=["f1", "f2", "f3"],
+            values=torch.tensor([0, 1, 2, 3, 2, 3, 4, 5, 6]),
+            offsets=torch.tensor([0, 2, 2, 3, 4, 5, 7, 8, 8, 9]),  # batch size = 3
+        )
+        eager_out1 = model(feature1)
+        eager_out2 = model(feature2)
+        # feature1.lengths()
+        # feature2.lengths()
+
+        # Serialize EBC with sample input (feature1, batch size = 2)
+        collection = mark_dynamic_kjt(feature1, variable_batch=True)
+        model, sparse_fqns = encapsulate_ir_modules(model, JsonSerializer)
+        ep = torch.export.export(
+            model,
+            (feature1,),
+            {},
+            dynamic_shapes=collection.dynamic_shapes(model, (feature1,)),
+            strict=False,
+            # Allows KJT to not be unflattened and run a forward on unflattened EP
+            preserve_module_call_signature=tuple(sparse_fqns),
+        )
+
+        # Run forward on ExportedProgram
+        ep_output1 = ep.module()(feature1)
+        ep_output2 = ep.module()(feature2)
+
+        # other asserts
+        for eager_out, ep_out in [(eager_out1, ep_output1), (eager_out2, ep_output2)]:
+            for a, b in zip(eager_out, ep_out):
+                self.assertEqual(a.shape, b.shape)
+
+        # Deserialize EBC
+        unflatten_ep = torch.export.unflatten(ep)
+        deserialized_model = decapsulate_ir_modules(unflatten_ep, JsonSerializer)
+        deserialized_model.load_state_dict(model.state_dict())
+
+        # Run forward on deserialized model
+        deserialized_out1 = deserialized_model(feature1)
+        deserialized_out2 = deserialized_model(feature2)
+
+        for e, d in ([eager_out1, deserialized_out1], [eager_out2, deserialized_out2]):
+            for a, b in zip(e, d):
+                self.assertEqual(a.shape, b.shape)
+                torch.testing.assert_close(a, b)

     def test_ir_emb_lookup_device(self) -> None:
         model = self.generate_model()
@@ -573,7 +627,7 @@ def forward(self, features: KeyedJaggedTensor) -> List[torch.Tensor]:
         deserialized_out = deserialized_model(id_list_features)
         self.assertEqual(len(deserialized_out), len(eager_out))
         for x, y in zip(deserialized_out, eager_out):
-            self.assertTrue(torch.allclose(x, y))
+            torch.testing.assert_close(x, y)

     def test_regroup_as_dict_module(self) -> None:
         class Model(nn.Module):
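Aside from the new test, the recurring change in this file replaces `self.assertTrue(torch.allclose(...))` with `torch.testing.assert_close(...)`. A small stand-alone illustration of the difference (not part of the commit): `assert_close` raises with a mismatch report instead of collapsing the comparison to a bare boolean, which makes test failures easier to read.

```
import torch

a = torch.tensor([1.0, 2.0, 3.0])
b = torch.tensor([1.0, 2.0, 3.5])

# allclose only returns True/False, so a failing assertTrue(...) reports nothing
# about which elements differ or by how much.
assert not torch.allclose(a, b)

# assert_close raises AssertionError describing the number of mismatched
# elements and the greatest absolute/relative difference.
try:
    torch.testing.assert_close(a, b)
except AssertionError as err:
    print(err)
```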

torchrec/ir/utils.py

Lines changed: 25 additions & 1 deletion
@@ -195,6 +195,7 @@ def mark_dynamic_kjt(
     kjt: KeyedJaggedTensor,
     shapes_collection: Optional[ShapesCollection] = None,
     variable_length: bool = False,
+    variable_batch: bool = False,
     vlen: Optional[DIM] = None,
     llen: Optional[DIM] = None,
 ) -> ShapesCollection:
@@ -211,10 +212,18 @@ def mark_dynamic_kjt(
     it will use the default name "vlen" for values, and "llen", "lofs" if variable length.
     A passed-in dynamic dim is useful if the dynamic dim is already used in other places.

+    Variable batch size means the batch size is dynamic across training iterations;
+    within one iteration/batch, all features share the same batch size, so the KJT still
+    follows the correlation: len(lengths) == len(keys) * batch_size.
+
+    In the variable length scenario, the batch size can differ per feature within one
+    iteration/batch, so the correlation len(lengths) == len(keys) * batch_size does not hold.
+
     Args:
         kjt (KeyedJaggedTensor): The KJT to make dynamic.
         shapes_collection (Optional[ShapesCollection]): The collection to update.
-        variable_length (bool): Whether the KJT is variable length.
+        variable_length (bool): Whether the KJT is variable length, i.e. len(lengths) != len(keys) * batch_size.
+        variable_batch (bool): Whether the KJT has a variable batch size, i.e. len(lengths) == len(keys) * batch_size. Only effective when variable_length is False.
         vlen (Optional[DIM]): The dynamic length for the values. If it's None, it will use the default name "vlen".
         llen (Optional[DIM]): The dynamic length for the lengths, it's only used when variable_length is true. If it's None, it will use the default name "llen".
         batch_size (Optional[DIM]): The dynamic length for the batch_size, it's only used when variable_length and mark_batch_size are both true.
@@ -245,6 +254,21 @@ def _has_dim(t: Optional[torch.Tensor]) -> bool:
             shapes_collection[kjt._lengths] = (llen,)
         if _has_dim(kjt._offsets):
             shapes_collection[kjt._offsets] = (llen + 1,)
+    elif variable_batch:
+        # Variable batch size means the batch size is dynamic across training iterations;
+        # within one iteration/batch, all features share the same batch size.
+        #
+        # This is fundamentally different from variable length, where the batch size can
+        # differ per feature within one iteration/batch.
+        #
+        # It's the user's responsibility to make sure that in a variable batch scenario,
+        # variable_batch is only used with variable_length set to False; otherwise it
+        # will lead to unexpected behavior with the dynamic shapes in torch.export.
+        batch_size = _get_dim("batch_size")
+        if _has_dim(kjt._lengths):
+            shapes_collection[kjt._lengths] = (batch_size * len(kjt.keys()),)
+        if _has_dim(kjt._offsets):
+            shapes_collection[kjt._offsets] = (batch_size * len(kjt.keys()) + 1,)
     return shapes_collection
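To make the docstring's correlation concrete, here is a small stand-alone check (a sketch, not part of the commit) built from the same fixture used in the new test: with three keys and six length entries, the shared batch size is 2 and `kjt.stride()` recovers it.

```
import torch

from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

kjt = KeyedJaggedTensor.from_offsets_sync(
    keys=["f1", "f2", "f3"],
    values=torch.tensor([0, 1, 2, 3, 2, 3]),
    offsets=torch.tensor([0, 2, 2, 3, 4, 5, 6]),
)

# Lengths are the consecutive offset differences.
assert kjt.lengths().tolist() == [2, 0, 1, 1, 1, 1]

# Variable batch: one shared batch size per iteration, so
# len(lengths) == len(keys) * batch_size and stride() == batch_size.
batch_size = len(kjt.lengths()) // len(kjt.keys())
assert batch_size == 2
assert kjt.stride() == batch_size

# This is the relationship the new `elif variable_batch` branch encodes for
# torch.export: lengths get the symbolic shape (batch_size * len(keys),) and
# offsets (batch_size * len(keys) + 1,), with a single dynamic "batch_size" dim.
```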
