Commit be49522: checkpointing with orbax (#122)
1 parent 7c96b83

2 files changed: +238 -4 lines
keras_rs/src/layers/embedding/jax/checkpoint_utils.py (new file)

Lines changed: 104 additions & 0 deletions
"""A wrapper over orbax CheckpointManager for Keras3 Jax TPU Embeddings."""

from typing import Any

import keras
import orbax.checkpoint as ocp
from etils import epath


class JaxKeras3CheckpointManager(ocp.CheckpointManager):
    """A wrapper over orbax CheckpointManager for Keras3 Jax TPU Embeddings."""

    def __init__(
        self,
        model: keras.Model,
        checkpoint_dir: epath.PathLike,
        max_to_keep: int,
        steps_per_epoch: int = 1,
        **kwargs: Any,
    ):
        options = ocp.CheckpointManagerOptions(
            max_to_keep=max_to_keep, enable_async_checkpointing=False, **kwargs
        )
        self._model = model
        self._steps_per_epoch = steps_per_epoch
        self._checkpoint_dir = checkpoint_dir
        super().__init__(checkpoint_dir, options=options)

    def _get_state(self) -> tuple[dict[str, Any], Any | None]:
        """Gets the model state and metrics."""
        model_state = self._model.get_state_tree()
        state = {}
        metrics = None
        for k, v in model_state.items():
            if k == "metrics_variables":
                metrics = v
            else:
                state[k] = v
        return state, metrics

    def save_state(self, epoch: int) -> None:
        """Saves the model to the checkpoint directory.

        Args:
            epoch: The epoch number at which the state is saved.
        """
        state, metrics_value = self._get_state()
        self.save(
            epoch * self._steps_per_epoch,
            args=ocp.args.StandardSave(item=state),
            metrics=metrics_value,
        )

    def restore_state(self, step: int | None = None) -> None:
        """Restores the model from the checkpoint directory.

        Args:
            step: The step number to restore the state from. Default=None
                restores the latest step.
        """
        if step is None:
            step = self.latest_step()
        # Restore the model state only, not metrics.
        state, _ = self._get_state()
        restored_state = self.restore(
            step, args=ocp.args.StandardRestore(item=state)
        )
        self._model.set_state_tree(restored_state)


class JaxKeras3CheckpointCallback(keras.callbacks.Callback):
    """A callback for checkpointing and restoring state using Orbax."""

    def __init__(
        self,
        model: keras.Model,
        checkpoint_dir: epath.PathLike,
        max_to_keep: int,
        steps_per_epoch: int = 1,
        **kwargs: Any,
    ):
        if keras.backend.backend() != "jax":
            raise ValueError(
                "`JaxKeras3CheckpointCallback` is only supported on a "
                "`jax` backend."
            )
        self._checkpoint_manager = JaxKeras3CheckpointManager(
            model, checkpoint_dir, max_to_keep, steps_per_epoch, **kwargs
        )

    def on_train_begin(self, logs: dict[str, Any] | None = None) -> None:
        if not self.model.built or not self.model.optimizer.built:
            raise ValueError(
                "To use `JaxKeras3CheckpointCallback`, your model and "
                "optimizer must be built before you call `fit()`."
            )
        latest_epoch = self._checkpoint_manager.latest_step()
        if latest_epoch is not None:
            self._checkpoint_manager.restore_state(step=latest_epoch)

    def on_epoch_end(
        self, epoch: int, logs: dict[str, Any] | None = None
    ) -> None:
        self._checkpoint_manager.save_state(epoch)
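
A minimal usage sketch of the new callback (not part of the commit): the toy `Dense` model, the random data, and the `/tmp/keras_rs_ckpts` directory are illustrative assumptions, and the `jax` Keras backend is required. The callback restores the latest checkpoint in `on_train_begin` and saves at each `on_epoch_end`, so both the model and the optimizer must be built before `fit()`.

import numpy as np
import keras  # assumes KERAS_BACKEND=jax

from keras_rs.src.layers.embedding.jax import checkpoint_utils

# Illustrative stand-in model; any built Keras model works here.
model = keras.Sequential([keras.Input(shape=(8,)), keras.layers.Dense(4)])
model.compile(optimizer="adam", loss="mse", jit_compile=True)

# The callback raises in `on_train_begin` unless both the model and the
# optimizer are built, so build the optimizer explicitly before `fit()`.
model.optimizer.build(model.trainable_variables)

x = np.random.normal(size=(32, 8)).astype("float32")
y = np.random.normal(size=(32, 4)).astype("float32")

# On a rerun, `on_train_begin` restores the latest checkpoint; `on_epoch_end`
# saves at step `epoch * steps_per_epoch`.
callback = checkpoint_utils.JaxKeras3CheckpointCallback(
    model,
    "/tmp/keras_rs_ckpts",  # illustrative checkpoint directory
    max_to_keep=2,
    steps_per_epoch=1,
)
model.fit(x, y, epochs=3, callbacks=[callback])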

keras_rs/src/layers/embedding/jax/distributed_embedding_test.py

Lines changed: 134 additions & 4 deletions
@@ -1,3 +1,6 @@
+import dataclasses
+import os
+import tempfile
 import typing
 from typing import Any
 
@@ -10,10 +13,13 @@
 from absl.testing import parameterized
 from jax.experimental import layout as jax_layout
 from jax_tpu_embedding.sparsecore.lib.nn import embedding_spec
-from jax_tpu_embedding.sparsecore.lib.nn import table_stacking
+from jax_tpu_embedding.sparsecore.lib.nn import (
+    table_stacking as table_stacking_lib,
+)
 from jax_tpu_embedding.sparsecore.utils import utils as jte_utils
 
 from keras_rs.src.layers.embedding import test_utils as keras_test_utils
+from keras_rs.src.layers.embedding.jax import checkpoint_utils
 from keras_rs.src.layers.embedding.jax import config_conversion
 from keras_rs.src.layers.embedding.jax import (
     distributed_embedding as jax_distributed_embedding,
@@ -131,7 +137,7 @@ def test_sharded_matches_unsharded(self):
             feature_spec.table_spec.name: feature_spec.table_spec
             for feature_spec in feature_specs
         }
-        table_stacking.stack_tables(
+        table_stacking_lib.stack_tables(
             feature_specs,
             table_names=[table_config.name for table_config in table_configs],
             global_device_count=device_count,
@@ -198,7 +204,7 @@ def test_random_shards(self):
         num_sc_per_device = _num_sparsecores_per_device()
         num_table_shards = device_count * num_sc_per_device
 
-        table_stacking.stack_tables(
+        table_stacking_lib.stack_tables(
             feature_specs,
             table_names=[
                 table_spec.name for table_spec in table_specs.values()
@@ -257,7 +263,7 @@ def test_compilability(self):
         num_sc_per_device = _num_sparsecores_per_device()
         num_table_shards = device_count * num_sc_per_device
 
-        table_stacking.stack_tables(
+        table_stacking_lib.stack_tables(
             feature_specs,
             table_names=[
                 table_spec.name for table_spec in table_specs.values()
@@ -458,6 +464,130 @@ def loss_fn(y_true, y_pred):
         loss_after = model.evaluate(evaluation_dataset)
         np.testing.assert_array_less(loss_after, loss_before)
 
+    @parameterized.product(
+        ragged=[False, True],
+        target_stacking=[
+            "auto",
+            [["table:0", "table:1", "table:2"]],
+        ],
+    )
+    def test_save_and_restore(
+        self,
+        ragged: bool,
+        target_stacking: str | list[str] | list[list[str]],
+    ):
+        keras.distribution.set_distribution(keras.distribution.DataParallel())
+
+        table_configs = keras_test_utils.create_random_table_configs(
+            max_vocabulary_size=64,
+            max_embedding_dim=8,
+            optimizer=keras.optimizers.SGD(learning_rate=0.1),
+            seed=10,
+        )
+        feature_configs = keras_test_utils.create_random_feature_configs(
+            table_configs=table_configs,
+            batch_size=16,
+            seed=20,
+        )
+        feature_configs_dict = {
+            feature_config.name: feature_config
+            for feature_config in feature_configs
+        }
+
+        # Create tables for generating labels.
+        seed = keras.random.SeedGenerator(40)
+        tables = {
+            table_config.name: keras.random.uniform(
+                shape=(
+                    table_config.vocabulary_size,
+                    table_config.embedding_dim,
+                ),
+                minval=-5,
+                maxval=5,
+                dtype="float32",
+                seed=seed,
+            )
+            for table_config in table_configs
+        }
+
+        # Fit and evaluate.
+        def loss_fn(y_true, y_pred):
+            return jnp.mean(jnp.square(y_true - y_pred))
+
+        embedding_layer_name = "distributed_embedding_chkpt_test"
+        layer = jax_distributed_embedding.DistributedEmbedding(
+            feature_configs_dict,
+            table_stacking=target_stacking,
+            name=embedding_layer_name,
+        )
+        model = keras.Sequential([layer])
+        model.compile(jit_compile=True, loss=loss_fn)
+
+        # Fit model to different dataset.
+        training_dataset = keras_test_utils.RandomInputSampleDataset(
+            feature_configs_dict,
+            tables,
+            ragged=ragged,
+            num_batches=100,
+            seed=42,
+            preprocessor=lambda inputs, weights: layer.preprocess(
+                inputs, weights, training=True
+            ),
+        )
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            chkpt_path = os.path.join(tmp_dir, "checkpoint")
+
+            model.fit(
+                training_dataset,
+                epochs=2,
+                steps_per_epoch=1,
+                callbacks=[
+                    checkpoint_utils.JaxKeras3CheckpointCallback(
+                        model,
+                        chkpt_path,
+                        max_to_keep=1,
+                        steps_per_epoch=1,
+                    )
+                ],
+            )
+            # Set up a model with a zero initializer but otherwise the same
+            # feature configs to test restore. Keep the same embedding layer
+            # name to ensure the correct weights are restored.
+            feature_configs_with_zero_init = {
+                feature_config.name: dataclasses.replace(
+                    feature_config,
+                    table=dataclasses.replace(
+                        feature_config.table, initializer="zeros"
+                    ),
+                )
+                for feature_config in feature_configs
+            }
+            layer_for_restore = jax_distributed_embedding.DistributedEmbedding(
+                feature_configs_with_zero_init,
+                table_stacking=target_stacking,
+                name=embedding_layer_name,
+            )
+            input_shapes = jax.tree.map(
+                lambda f: f.input_shape, feature_configs_with_zero_init
+            )
+            layer_for_restore.build(input_shapes)
+            model_for_restore = keras.Sequential([layer_for_restore])
+            manager_for_restore = checkpoint_utils.JaxKeras3CheckpointManager(
+                model_for_restore,
+                chkpt_path,
+                max_to_keep=1,
+                steps_per_epoch=1,
+            )
+            model_for_restore.compile(jit_compile=True, loss=loss_fn)
+            model_for_restore.build()
+            model_for_restore.optimizer.build(model_for_restore.trainable_variables)
+            manager_for_restore.restore_state()
+            jax.tree.map(
+                np.testing.assert_array_equal,
+                model.trainable_variables,
+                model_for_restore.trainable_variables,
+            )
+
 
 if __name__ == "__main__":
     absltest.main()
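
The test above exercises save and restore through `fit()` and a fresh zero-initialized model. For completeness, a hedged sketch of using `JaxKeras3CheckpointManager` directly to restore outside of training; the toy model and the `/tmp/keras_rs_ckpts` directory are illustrative assumptions carried over from the earlier sketch, and checkpoints must already exist there.

import keras  # assumes KERAS_BACKEND=jax

from keras_rs.src.layers.embedding.jax import checkpoint_utils

# Same illustrative toy model as the earlier sketch; in practice this is the
# model whose state was checkpointed, with the same variable structure.
model = keras.Sequential([keras.Input(shape=(8,)), keras.layers.Dense(4)])
model.compile(optimizer="adam", loss="mse", jit_compile=True)
model.optimizer.build(model.trainable_variables)

manager = checkpoint_utils.JaxKeras3CheckpointManager(
    model,
    "/tmp/keras_rs_ckpts",  # must already contain saved checkpoints
    max_to_keep=2,
)
# Restores the latest step; pass `step=epoch * steps_per_epoch` to roll back
# to an earlier checkpoint instead.
manager.restore_state()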
