Update trainers to use Orbax checkpointing.

The config files have also been updated, since Orbax computes the "wait time" differently.

PiperOrigin-RevId: 557456501
google-research · Aug 16, 2023 · 760d672
1 parent c2b4ce0
commit 760d672
Show file tree

Hide file tree

Showing 8 changed files with 147 additions and 395 deletions.
diff --git a/vmoe/checkpoints/periodic_actions.py b/vmoe/checkpoints/periodic_actions.py
diff --git a/vmoe/checkpoints/periodic_actions_test.py b/vmoe/checkpoints/periodic_actions_test.py
diff --git a/vmoe/configs/eee_paper/eee_s32_last2_ilsvrc2012_ft_cifar100.py b/vmoe/configs/eee_paper/eee_s32_last2_ilsvrc2012_ft_cifar100.py
@@ -165,7 +165,7 @@ def get_config():
   config.save_checkpoint.every_steps = 1_000
   config.save_checkpoint.keep_last = 1
   config.save_checkpoint.num_shards = 32  # Target number of checkpoint shards.
-  config.save_checkpoint.wait_seconds = 1.0
+  config.save_checkpoint.wait_seconds = 300
   # Report training progress every 100 steps.
   config.report_progress = ml_collections.ConfigDict()
   config.report_progress.every_secs = None

diff --git a/vmoe/configs/vmoe_paper/common.py b/vmoe/configs/vmoe_paper/common.py
@@ -60,7 +60,7 @@ def get_base_config() -> ml_collections.ConfigDict:
   config.save_checkpoint = ml_collections.ConfigDict()
   config.save_checkpoint.every_steps = 1_000
   config.save_checkpoint.keep_last = 1
-  config.save_checkpoint.wait_seconds = 1.0
+  config.save_checkpoint.wait_seconds = 300
   # Report training progress every minute.
   config.report_progress = ml_collections.ConfigDict()
   config.report_progress.every_secs = None

diff --git a/vmoe/data/input_pipeline.py b/vmoe/data/input_pipeline.py
@@ -19,6 +19,7 @@
 """
 from typing import Any, Callable, Dict, Optional, Union
 
+from absl import logging
 from clu.data import dataset_iterator
 import jax
 import ml_collections
@@ -85,6 +86,11 @@ def get_dataset(
   Returns:
     A DatasetIterator.
   """
+  if variant == 'train' and shuffle_seed is not None:
+    logging.error('Deterministic training is not supported but you specified '
+                  'shuffle_seed=%d for training. This can potentially lead to '
+                  'data being repeated if restarts happen during training.',
+                  shuffle_seed)
   builder = vmoe.data.builder.get_dataset_builder(
       name=name,
       split=split,

diff --git a/vmoe/projects/soft_moe/configs/common.py b/vmoe/projects/soft_moe/configs/common.py
@@ -42,7 +42,7 @@ def get_base_config() -> ml_collections.ConfigDict:
   config.save_checkpoint = ml_collections.ConfigDict()
   config.save_checkpoint.every_steps = 1_000
   config.save_checkpoint.keep_last = 1
-  config.save_checkpoint.wait_seconds = 10
+  config.save_checkpoint.wait_seconds = 300
   # Report training progress every minute to avoid hitting maximum RPC/s quota.
   config.report_progress = ml_collections.ConfigDict()
   config.report_progress.every_secs = 60.0