From 760d672f34bedc4c52ff8336d728bf9c95769a35 Mon Sep 17 00:00:00 2001 From: Joan Puigcerver Date: Wed, 16 Aug 2023 05:44:43 -0700 Subject: [PATCH] Update trainers to use Orbax checkpointing. The config files have also been updated since orbax computes the "wait time" differently. PiperOrigin-RevId: 557456501 --- vmoe/checkpoints/periodic_actions.py | 208 ------------------ vmoe/checkpoints/periodic_actions_test.py | 118 ---------- .../eee_s32_last2_ilsvrc2012_ft_cifar100.py | 2 +- vmoe/configs/vmoe_paper/common.py | 2 +- vmoe/data/input_pipeline.py | 6 + vmoe/projects/soft_moe/configs/common.py | 2 +- vmoe/train/trainer.py | 133 +++++++---- vmoe/train/trainer_test.py | 71 ++++-- 8 files changed, 147 insertions(+), 395 deletions(-) delete mode 100644 vmoe/checkpoints/periodic_actions.py delete mode 100644 vmoe/checkpoints/periodic_actions_test.py diff --git a/vmoe/checkpoints/periodic_actions.py b/vmoe/checkpoints/periodic_actions.py deleted file mode 100644 index 7a801a8..0000000 --- a/vmoe/checkpoints/periodic_actions.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright 2023 Google LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""PeriodicAction that saves checkpoints periodically.""" -import multiprocessing -import os -from typing import Iterable, Optional - -from clu import periodic_actions -from clu.data import dataset_iterator as clu_dataset_iterator -import jax - -from vmoe import multihost_utils -from vmoe.checkpoints import base as checkpoints_base -from vmoe.checkpoints import partitioned as checkpoints_partitioned - - -DatasetIterator = clu_dataset_iterator.DatasetIterator -MapResult = checkpoints_partitioned.MapResult -PyTree = checkpoints_partitioned.PyTree -ThreadPool = checkpoints_partitioned.ThreadPool - - -class PeriodicSaveCheckpoint(periodic_actions.PeriodicCallback): - """Saves checkpoints of a partitioned training state periodically. - - Example: - saver = PeriodicSaveCheckpoint(prefix='/tmp/ckpt', every_steps=10) - for step in range(100): - state = update_state(...) - saver(step=step, state=state) # Saves at steps 0, 10, 20, 30, ... - """ - - def __init__( - self, - *, - prefix: str, - num_shards: int = 0, - num_threads: Optional[int] = None, - wait_seconds: Optional[int] = None, - every_steps: Optional[int] = None, - every_secs: Optional[float] = None, - on_steps: Optional[Iterable[int]] = None, - keep_last: Optional[int] = None, - keep_steps_multiple_of: Optional[int] = None, - execute_async: bool = True, - report_progress: Optional[periodic_actions.ReportProgress] = None, - report_progress_name: str = 'ckpt'): - """Initializer. - - Args: - prefix: Prefix for the checkpoint files. The step number is appended to - this when a checkpoint is written (e.g. prefix='ckpt_' gives checkpoints - 'ckpt_1', 'ckpt_2', ...). - num_shards: Number of checkpoint shards. If `num_shards <= 0`, the minimum - number of shards will be used. If `num_shards > 0`, this number is only - tentative. - num_threads: Number of threads to use for writing checkpoint shards. 
If - None, `multiprocessing.pool.cpu_count()` is used. - wait_seconds: If given, we wait at most this number of seconds for the - checkpoint writing to complete. Otherwise, TimeoutError is raised. - every_steps: If given, writes a checkpoint every `every_steps` steps. - every_secs: If given, writes a checkpoint every `every_secs` seconds. - on_steps: If given, writes a checkpoint on these particular steps. - keep_last: If given, we only keep the last `keep_last` checkpoints. - If None, only the last checkpoint is kept. - keep_steps_multiple_of: If given, all steps multiple of this number are - kept (in addition to the `keep_last` steps). - execute_async: If True, writes checkpoints shards asynchronously. - If False, waits `wait_seconds` for the writing to complete. Note that, - even if this is True, we always wait up to `wait_seconds` between two - consecutive checkpointing steps. - report_progress: When given, the `timed()` method of this `ReportProgress` - is used to time the saving of checkpoints. - report_progress_name: Name used by `ReportProgress.timed()`. - """ - self._thread_pool = ThreadPool(processes=num_threads) - self._async_result = None # type: Optional[MapResult] - self._wait_seconds = wait_seconds - self._makedirs(os.path.dirname(prefix)) - keep_last = max(keep_last or 1, 1) - keep_multiple = max(keep_steps_multiple_of or 0, 0) - super().__init__( - every_steps=every_steps, - every_secs=every_secs, - on_steps=on_steps, - callback_fn=self._make_callback_fn( - prefix, num_shards, wait_seconds, keep_last, keep_multiple, - execute_async, self._thread_pool, - report_progress, report_progress_name), - # Note: save_checkpoint() is still asynchronous. This just means that - # we wait until the callback_fn returns. - execute_async=False, - pass_step_and_time=True) - - def __del__(self): - if self._async_result: - self._block_async_result(self._wait_seconds) - self._thread_pool.close() - - @classmethod - def _makedirs(cls, workdir: str): - # Process 0 creates the workdir if it doesn't exist. All processes wait - # until this is done. - if jax.process_index() == 0 and not os.path.exists(workdir): - checkpoints_base.gfile.makedirs(workdir) - multihost_utils.sync_devices(f'checkpoints:mkdir:{workdir}') - - @classmethod - def _remove_old_checkpoints(cls, prefix: str, keep_last: int, - keep_multiple: int, thread_pool: ThreadPool): - - def _parse_step_from_filepath(filepath): - m = checkpoints_base.CHECKPOINT_REGEX.fullmatch(filepath) - step_str = m.group(2) if m else None - return int(step_str[1:]) if step_str else None - - def _find_step_numbers(filepaths): - for step in map(_parse_step_from_filepath, filepaths): - if step is not None: - yield step - - def _remove(): - # Find step number of pending shards. - workdir = os.path.dirname(prefix) - basename = os.path.basename(prefix) - prefix_tmp = os.path.join(workdir, f'.tmp.{basename}') + '*' - checkpoints_tmp = checkpoints_base.gfile.glob(prefix_tmp) - pending_steps = set(_find_step_numbers(checkpoints_tmp)) - # Find all completed shards. - checkpoints = checkpoints_base.gfile.glob(prefix + '*') - completed_steps = set(_find_step_numbers(checkpoints)) - # Keep `keep_last` completed steps. - keep_steps = set(sorted(completed_steps - pending_steps)[-keep_last:]) - # Keep steps multiple of `keep_multiple`. - if keep_multiple > 0: - keep_steps.update([ - step for step in completed_steps if step % keep_multiple == 0]) - # Always keep pending steps. - keep_steps.update(pending_steps) - # Remove checkpoints. 
- def match_remove_fn(filepath): - # Returns True (to remove) if the step is not in `keep_steps`. - step = _parse_step_from_filepath(filepath) - return (step not in keep_steps) if step is not None else False - checkpoints_base.remove_checkpoints( - checkpoints, match_remove_fn, thread_pool=thread_pool) - - # Only process 0 removes files. All processes wait untils this is done. - if jax.process_index() == 0: - _remove() - multihost_utils.sync_devices(f'checkpoints:remove:{prefix}') - - def _block_async_result(self, wait_seconds: Optional[int]): - try: - self._async_result.get(wait_seconds) - self._async_result = None - except multiprocessing.context.TimeoutError as exc: - raise TimeoutError('Timeout while writing checkpoint files after ' - f'{wait_seconds} seconds.') from exc - - def _make_callback_fn(self, prefix, num_shards, wait_seconds, keep_last, - keep_multiple, execute_async, thread_pool, - report_progress, report_progress_name): - - def callback_fn(step: int, t: float, state: PyTree, - iterator: Optional[DatasetIterator] = None): - del t # Unused. - # Wait up to `wait_seconds` seconds, until the previous checkpoint is - # completed before starting to write a new checkpoint. If the timeout - # expires, an exception is raised. This is to avoid having multiple copies - # of the model in the CPU memory. - if self._async_result: - self._block_async_result(wait_seconds) - multihost_utils.sync_devices(f'checkpoints:sync_pending:{prefix}') - # Remove outdated checkpoints before starting writing new ones. - self._remove_old_checkpoints( - prefix, keep_last, keep_multiple, thread_pool) - # Save new checkpoint. - self._async_result = checkpoints_partitioned.save_checkpoint( - prefix=f'{prefix}_{step}', - tree=state, - num_shards=num_shards, - thread_pool=thread_pool, - makedirs=False, - overwrite=True) - # Optionally, wait `wait_seconds` until the checkpointing is done, or - # raise an exception if writing doesn't finish in `wait_seconds`. - if not execute_async: - self._block_async_result(wait_seconds) - multihost_utils.sync_devices(f'checkpoints:no_async:{prefix}') - - if report_progress is None: - return callback_fn - else: - return report_progress.timed( - report_progress_name, wait_jax_async_dispatch=False)(callback_fn) diff --git a/vmoe/checkpoints/periodic_actions_test.py b/vmoe/checkpoints/periodic_actions_test.py deleted file mode 100644 index fb4ebc8..0000000 --- a/vmoe/checkpoints/periodic_actions_test.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright 2023 Google LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Tests for periodic_actions.""" -import os -import random -import time -from unittest import mock - -from absl.testing import absltest -from vmoe.checkpoints import periodic_actions - - -class PeriodicSaveCheckpointRemoveOldCheckpointsTest(absltest.TestCase): - - def setUp(self): - super().setUp() - self.workdir = self.create_tempdir() - for step in range(10): - self.workdir.create_file('ckpt_with_no_step') - self.workdir.create_file(f'not_a_ckpt_{step}') - self.workdir.create_file(f'ckpt_main_{step}') - self.workdir.create_file(f'ckpt_model_{step}-00000-of-00001') - self.workdir.create_file(f'ckpt_host_{step}-00000-of-00002') - self.workdir.create_file(f'ckpt_host_{step}-00001-of-00002') - - def test_remove_old_checkpoints_with_keep_multiple_of(self): - periodic_actions.PeriodicSaveCheckpoint._remove_old_checkpoints( - prefix=os.path.join(self.workdir.full_path, 'ckpt_'), - keep_last=2, - keep_multiple=5, - thread_pool=None) - expected = (['ckpt_with_no_step'] + - [f'not_a_ckpt_{step}' for step in range(10)] + - [f'ckpt_main_{step}' for step in [0, 5, 8, 9]] + - [f'ckpt_model_{step}-00000-of-00001' for step in [0, 5, 8, 9]] + - [f'ckpt_host_{step}-00000-of-00002' for step in [0, 5, 8, 9]] + - [f'ckpt_host_{step}-00001-of-00002' for step in [0, 5, 8, 9]]) - self.assertCountEqual(expected, os.listdir(self.workdir.full_path)) - - def test_remove_old_checkpoints_with_no_keep_multiples(self): - periodic_actions.PeriodicSaveCheckpoint._remove_old_checkpoints( - prefix=os.path.join(self.workdir.full_path, 'ckpt_'), - keep_last=2, - keep_multiple=0, - thread_pool=None) - expected = (['ckpt_with_no_step'] + - [f'not_a_ckpt_{step}' for step in range(10)] + - [f'ckpt_main_{step}' for step in [8, 9]] + - [f'ckpt_model_{step}-00000-of-00001' for step in [8, 9]] + - [f'ckpt_host_{step}-00000-of-00002' for step in [8, 9]] + - [f'ckpt_host_{step}-00001-of-00002' for step in [8, 9]]) - self.assertCountEqual(expected, os.listdir(self.workdir.full_path)) - - -class PeriodicSaveCheckpointTest(absltest.TestCase): - - @mock.patch.object( - periodic_actions.checkpoints_partitioned, 'save_checkpoint') - def test(self, mock_save_checkpoint): - # When calling save_checkpoint, we do nothing but we'll wait a few seconds. - def _save_checkpoint_side_effect(*args, thread_pool, **kwargs): - del args - del kwargs - wait = lambda: time.sleep(random.randint(1, 3)) - return thread_pool.apply_async(wait) - mock_save_checkpoint.side_effect = _save_checkpoint_side_effect - # Run a few steps, calling saver on each step. - prefix = os.path.join(self.create_tempdir().full_path, 'ckpt') - saver = periodic_actions.PeriodicSaveCheckpoint( - prefix=prefix, - every_steps=4) - for step in range(1, 10): - saver(step=step, state={}) - # Check that the saver was called twice, on steps 4 and 8. - call_args_list = mock_save_checkpoint.call_args_list - self.assertLen(call_args_list, 2) - self.assertEqual(call_args_list[0], - mock.call(prefix=prefix + '_4', num_shards=0, - makedirs=False, overwrite=True, tree={}, - thread_pool=mock.ANY)) - self.assertEqual(call_args_list[1], - mock.call(prefix=prefix + '_8', num_shards=0, - makedirs=False, overwrite=True, tree={}, - thread_pool=mock.ANY)) - saver.__del__() - - def test_report_progress(self): - mock_report_progress = mock.MagicMock( - periodic_actions.periodic_actions.ReportProgress) - # Run a few steps, calling saver on each step. 
- prefix = os.path.join(self.create_tempdir().full_path, 'ckpt') - saver = periodic_actions.PeriodicSaveCheckpoint( - prefix=prefix, - every_steps=4, - report_progress=mock_report_progress, - report_progress_name='foo') - for step in range(1, 10): - saver(step=step, state={}) - call_args_list = mock_report_progress.timed.call_args_list - self.assertLen(call_args_list, 1) - self.assertEqual(call_args_list[0], - mock.call('foo', wait_jax_async_dispatch=False)) - - -if __name__ == '__main__': - absltest.main() diff --git a/vmoe/configs/eee_paper/eee_s32_last2_ilsvrc2012_ft_cifar100.py b/vmoe/configs/eee_paper/eee_s32_last2_ilsvrc2012_ft_cifar100.py index d9f49fd..9a884b1 100644 --- a/vmoe/configs/eee_paper/eee_s32_last2_ilsvrc2012_ft_cifar100.py +++ b/vmoe/configs/eee_paper/eee_s32_last2_ilsvrc2012_ft_cifar100.py @@ -165,7 +165,7 @@ def get_config(): config.save_checkpoint.every_steps = 1_000 config.save_checkpoint.keep_last = 1 config.save_checkpoint.num_shards = 32 # Target number of checkpoint shards. - config.save_checkpoint.wait_seconds = 1.0 + config.save_checkpoint.wait_seconds = 300 # Report training progress every 100 steps. config.report_progress = ml_collections.ConfigDict() config.report_progress.every_secs = None diff --git a/vmoe/configs/vmoe_paper/common.py b/vmoe/configs/vmoe_paper/common.py index 494b61b..f4fd172 100644 --- a/vmoe/configs/vmoe_paper/common.py +++ b/vmoe/configs/vmoe_paper/common.py @@ -60,7 +60,7 @@ def get_base_config() -> ml_collections.ConfigDict: config.save_checkpoint = ml_collections.ConfigDict() config.save_checkpoint.every_steps = 1_000 config.save_checkpoint.keep_last = 1 - config.save_checkpoint.wait_seconds = 1.0 + config.save_checkpoint.wait_seconds = 300 # Report training progress every minute. config.report_progress = ml_collections.ConfigDict() config.report_progress.every_secs = None diff --git a/vmoe/data/input_pipeline.py b/vmoe/data/input_pipeline.py index 40b7183..60c0cb6 100644 --- a/vmoe/data/input_pipeline.py +++ b/vmoe/data/input_pipeline.py @@ -19,6 +19,7 @@ """ from typing import Any, Callable, Dict, Optional, Union +from absl import logging from clu.data import dataset_iterator import jax import ml_collections @@ -85,6 +86,11 @@ def get_dataset( Returns: A DatasetIterator. """ + if variant == 'train' and shuffle_seed is not None: + logging.error('Deterministic training is not supported but you specified ' + 'shuffle_seed=%d for training. This can potentially lead to ' + 'data being repeated if restarts happen during training.', + shuffle_seed) builder = vmoe.data.builder.get_dataset_builder( name=name, split=split, diff --git a/vmoe/projects/soft_moe/configs/common.py b/vmoe/projects/soft_moe/configs/common.py index 87f9021..649c031 100644 --- a/vmoe/projects/soft_moe/configs/common.py +++ b/vmoe/projects/soft_moe/configs/common.py @@ -42,7 +42,7 @@ def get_base_config() -> ml_collections.ConfigDict: config.save_checkpoint = ml_collections.ConfigDict() config.save_checkpoint.every_steps = 1_000 config.save_checkpoint.keep_last = 1 - config.save_checkpoint.wait_seconds = 10 + config.save_checkpoint.wait_seconds = 300 # Report training progress every minute to avoid hitting maximum RPC/s quota. 
config.report_progress = ml_collections.ConfigDict() config.report_progress.every_secs = 60.0 diff --git a/vmoe/train/trainer.py b/vmoe/train/trainer.py index 081ccca..eb2375a 100644 --- a/vmoe/train/trainer.py +++ b/vmoe/train/trainer.py @@ -36,12 +36,12 @@ import ml_collections import numpy as np import optax -from vmoe import checkpoints +import orbax.checkpoint +import tensorflow as tf from vmoe import initialization from vmoe import multihost_utils from vmoe import partitioning from vmoe import utils -from vmoe.checkpoints import periodic_actions as checkpoints_periodic_actions from vmoe.data import input_pipeline from vmoe.data import pjit_utils from vmoe.evaluate import ensemble @@ -62,7 +62,6 @@ Mesh = partitioning.Mesh NamedSharding = jax.sharding.NamedSharding PartitionSpec = partitioning.PartitionSpec -PeriodicCheckpointSaver = checkpoints_periodic_actions.PeriodicSaveCheckpoint PRNGKey = Union[jax.numpy.ndarray, jax.random.KeyArray] PyTree = Any ReportProgress = train_periodic_actions.ReportProgress @@ -132,18 +131,38 @@ def accum_fn(i, state): return new_grad_and_metrics_fn -def create_checkpoint_hook(*, workdir: str, progress_hook: ReportProgress, - train_steps: int, - **kwargs) -> PeriodicCheckpointSaver: - on_steps = set(kwargs.pop('on_steps', [])) - # Always save checkpoint on the last step. - on_steps.update((0, train_steps)) - return PeriodicCheckpointSaver( - prefix=os.path.join(workdir, 'ckpt'), - report_progress=progress_hook, - report_progress_name='ckpt', - on_steps=on_steps, - **kwargs) +def create_checkpoint_manager( + *, + workdir: str, + every_steps: int, + keep_last: Optional[int] = None, + keep_steps_multiple_of: Optional[int] = None, + wait_seconds: int = 300, +) -> orbax.checkpoint.CheckpointManager: + """Creates an Orbax checkpoint manager.""" + directory = os.path.join(workdir, 'ckpt') + if jax.process_index() == 0 and not tf.io.gfile.exists(directory): + tf.io.gfile.makedirs(directory) + multihost_utils.sync_devices('create-ckpt-dir') + ckpt_options = orbax.checkpoint.CheckpointManagerOptions( + save_interval_steps=every_steps, + max_to_keep=keep_last, + keep_period=keep_steps_multiple_of, + ) + ckpt_manager = orbax.checkpoint.CheckpointManager( + directory, + { + 'state': orbax.checkpoint.AsyncCheckpointer( + orbax.checkpoint.PyTreeCheckpointHandler(), + timeout_secs=wait_seconds, + ), + 'dataset_iterator': orbax.checkpoint.Checkpointer( + orbax.checkpoint.JsonCheckpointHandler() + ), + }, + options=ckpt_options, + ) + return ckpt_manager def create_evaluation_hook( @@ -301,10 +320,11 @@ def _initialize_fn(): def get_dataset_iterator( - dataset: DatasetIterator, prefetch_size: int, init_step: int, mesh: Mesh, - workdir: str): + dataset: DatasetIterator, prefetch_size: int, mesh: Mesh, + last_seen_index: Optional[int] = None): """Creates a dataset iterator with device prefetching.""" - del init_step, workdir + logging.warning("Your dataset iterator doesn't allow checkpointing!") + del last_seen_index return pjit_utils.prefetch_to_device(dataset, size=prefetch_size, mesh=mesh) @@ -350,19 +370,17 @@ def initialize(): def restore_or_create_train_state( *, - prefix: str, + ckpt_manager: orbax.checkpoint.CheckpointManager, initialize_fn: Callable[[], TrainState], axis_resources_regexes: partitioning.AxisResourcesRegexes, mesh: Optional[Mesh] = None, thread_pool: Optional[ThreadPool] = None, initialization_kwargs: Optional[Mapping[str, Any]] = None, -) -> TrainState: +) -> Tuple[TrainState, Optional[int]]: """Restores a TrainState from the latest complete 
checkpoint or creates one. Args: - prefix: Prefix used to find the checkpoint (e.g. '/tmp/ckpt'). This assumes - that checkpoints are partitioned. Thus, a complete checkpoint has files - such as '/tmp/ckpt_1.index' and '/tmp/ckpt_1.data-?????-of-?????'. + ckpt_manager: Checkpoint manager. initialize_fn: Function used to create and initialize a train state from scratch. axis_resources_regexes: Regular expressions specifying how the TrainState @@ -373,7 +391,7 @@ def restore_or_create_train_state( initialize the TrainState from an existing checkpoint. Returns: - A TrainState. + A TrainState and (optionally) the last_seen_index. """ mesh = mesh or maps.thread_resources.env.physical_mesh train_state_shape_dtype = jax.eval_shape(initialize_fn) @@ -388,15 +406,25 @@ def restore_or_create_train_state( lambda x, y: jax.ShapeDtypeStruct(x.shape, x.dtype, sharding=y), train_state_shape_dtype, train_state_axis_resources) - prefix = checkpoints.find_latest_complete_checkpoint_for_prefix( - prefix=prefix, suffixes=('.index', '.data')) - if prefix: - logging.info('Continue training from checkpoint prefix = %r', prefix) - # Restore train_state from checkpoints to CPU memory. - return checkpoints.restore_checkpoint_partitioned( - prefix=prefix, - tree=train_state, - thread_pool=thread_pool) + if (step := ckpt_manager.latest_step()) is not None: + logging.info('Continue training from checkpoint at step %d', step) + def _array_restore_args_fn(x: jax.ShapeDtypeStruct): + return orbax.checkpoint.ArrayRestoreArgs( + dtype=x.dtype, sharding=x.sharding, global_shape=x.shape) + restore_kwargs = { + 'state': { + 'restore_args': jax.tree_map(_array_restore_args_fn, train_state), + }, + } + items = ckpt_manager.restore( + step, + items={ + 'state': train_state, + 'dataset_iterator': {'last_seen_index': 0}, + }, + restore_kwargs=restore_kwargs) + return items['state'], items['dataset_iterator']['last_seen_index'] + if initialization_kwargs: logging.info('Partially initializing the TrainState: %r', initialization_kwargs) @@ -409,7 +437,7 @@ def restore_or_create_train_state( train_state_shape_dtype.params, include_stats=False, msg='Parameter overview:') return create_or_reuse_train_state( - train_state=train_state, initialize_fn=initialize_fn, mesh=mesh) + train_state=train_state, initialize_fn=initialize_fn, mesh=mesh), None def get_loss_fn(name: str, **kwargs): @@ -503,6 +531,9 @@ def initialize_train_state_from_checkpoint( elif name == 'initialize_from_vit': return initialization.initialize_from_vit(target=train_state, mesh=mesh, **kwargs) + elif name == 'initialize_from_orbax': + return initialization.initialize_from_orbax(target=train_state, mesh=mesh, + **kwargs) else: raise ValueError(f'Unknown initialization method: {name!r}') @@ -661,6 +692,8 @@ def train_and_evaluate(config: ml_collections.ConfigDict, workdir: str, datataset_element_shape_dtype = pjit_utils.get_dataset_shape_dtype_struct( datasets['train']) + ckpt_manager = create_checkpoint_manager( + workdir=workdir, **config.get('save_checkpoint', {})) train_state_initialize_fn = make_create_train_state_fn( model=create_flax_model(config=config.model, deterministic=False), optimizer_config=config.optimizer, @@ -669,8 +702,8 @@ def train_and_evaluate(config: ml_collections.ConfigDict, workdir: str, train_steps=train_steps, extra_rng_keys=tuple(config.get('extra_rng_keys', [])), seed=config.get('seed', 0)) - train_state = restore_or_create_train_state( - prefix=os.path.join(workdir, 'ckpt'), + train_state, last_seen_index = restore_or_create_train_state( + 
ckpt_manager=ckpt_manager, initialize_fn=train_state_initialize_fn, axis_resources_regexes=config.params_axis_resources, thread_pool=ThreadPool(), @@ -680,7 +713,8 @@ def train_and_evaluate(config: ml_collections.ConfigDict, workdir: str, tr_iter = get_dataset_iterator( dataset=datasets['train'], prefetch_size=config.dataset.train.get('prefetch_device', 1), - init_step=init_step, mesh=mesh, workdir=workdir) + mesh=mesh, + last_seen_index=last_seen_index) train_loss_fn, eval_loss_fn, label_pred_fn = get_loss_fn(**config.loss) summarizer = create_tree_summarizer(config.get('summarize_arrays')) train_step_fn = functools.partial( @@ -715,9 +749,6 @@ def train_and_evaluate(config: ml_collections.ConfigDict, workdir: str, progress_hook = create_progress_hook( writer=writer, first_step=init_step + 1, train_steps=train_steps, **config.get('report_progress', {})) - checkpoint_hook = create_checkpoint_hook( - workdir=workdir, progress_hook=progress_hook, - train_steps=train_steps, **config.get('save_checkpoint', {})) evaluation_hook, config_model_eval = create_evaluation_hook( base_model_config=config.model.copy_and_resolve_references(), writer=writer, @@ -739,8 +770,19 @@ def train_and_evaluate(config: ml_collections.ConfigDict, workdir: str, **config.get('fewshot', {})) # Run checkpoint hook just before starting the loop. This will save the train # state at initialization. - if init_step == 0: - checkpoint_hook(init_step, state=train_state, iterator=tr_iter) + def _save_checkpoint(step, ts, it, force=False): + last_seen_index = step * train_batch_size + with progress_hook.timed('ckpt', wait_jax_async_dispatch=False): + ckpt_manager.save( + step, + items={ + 'state': ts, + 'dataset_iterator': {'last_seen_index': last_seen_index}, + }, + force=force) + if init_step == 0 and not tf.io.gfile.exists(os.path.join(workdir, 'ckpt/0')): + multihost_utils.sync_devices('training:ckpt-first') + _save_checkpoint(init_step, train_state, tr_iter, force=True) # Explicitly compile train_step here and report the compilation time. 
t0 = time.time() train_step_pjit = train_step_pjit.lower( @@ -756,9 +798,14 @@ def train_and_evaluate(config: ml_collections.ConfigDict, workdir: str, batch['labels']) progress_hook( step, scalar_metrics={f'train/{k}': v for k, v in metrics.items()}) - checkpoint_hook(step, state=train_state, iterator=tr_iter) + _save_checkpoint(step, train_state, tr_iter) evaluation_hook(step, params=train_state.params) fewshot_hook(step, variables={'params': train_state.params}) + ckpt_manager.wait_until_finished() + if not tf.io.gfile.exists(os.path.join(workdir, f'ckpt/{train_steps}')): + multihost_utils.sync_devices('training:ckpt-last') + _save_checkpoint(train_steps, train_state, tr_iter, force=True) + ckpt_manager.wait_until_finished() multihost_utils.sync_devices('training:completed') logging.info('Training completed.') diff --git a/vmoe/train/trainer_test.py b/vmoe/train/trainer_test.py index 8843347..b304c54 100644 --- a/vmoe/train/trainer_test.py +++ b/vmoe/train/trainer_test.py @@ -14,7 +14,6 @@ """Tests for trainer.""" import functools -import os from unittest import mock from absl.testing import absltest @@ -29,6 +28,7 @@ import ml_collections import numpy as np import optax +import orbax.checkpoint import tensorflow as tf from vmoe.train import trainer @@ -192,8 +192,7 @@ class InitializeTrainStateFromCheckpointTest(absltest.TestCase): """Tests that the appropriate initialization functions are called.""" @mock.patch.object( - trainer.initialization, 'initialize_from_vmoe', - autospec=True) + trainer.initialization, 'initialize_from_vmoe', autospec=True) def test_initialize_from_vmoe( self, mock_initialize_from_vmoe): train_state = mock.create_autospec(trainer.TrainState, instance=True) @@ -206,8 +205,7 @@ def test_initialize_from_vmoe( rules=[]) @mock.patch.object( - trainer.initialization, 'initialize_from_vit', - autospec=True) + trainer.initialization, 'initialize_from_vit', autospec=True) def test_initialize_from_vit( self, mock_initialize_from_vit): train_state = mock.create_autospec(trainer.TrainState, instance=True) @@ -218,6 +216,17 @@ def test_initialize_from_vit( mock_initialize_from_vit.assert_called_once_with( target=train_state, mesh=mesh, filepath='/foo', rules=[]) + @mock.patch.object( + trainer.initialization, 'initialize_from_orbax', autospec=True) + def test_initialize_from_orbax(self, mock_initialize_from_orbax): + train_state = mock.create_autospec(trainer.TrainState, instance=True) + mesh = mock.create_autospec(jax.sharding.Mesh, instance=True) + _ = trainer.initialize_train_state_from_checkpoint( + train_state=train_state, name='initialize_from_orbax', mesh=mesh, + directory='/foo', rules=[]) + mock_initialize_from_orbax.assert_called_once_with( + target=train_state, mesh=mesh, directory='/foo', rules=[]) + def test_unknown_method_raises(self): train_state = mock.create_autospec(trainer.TrainState, instance=True) mesh = mock.create_autospec(jax.sharding.Mesh, instance=True) @@ -326,23 +335,23 @@ def initialize_fn(): def test_create_from_scratch(self): """Tests when training starts from scratch.""" - prefix = os.path.join(self.create_tempdir().full_path, 'ckpt_1') - train_state = trainer.restore_or_create_train_state( - prefix=prefix, initialize_fn=self.initialize_fn, - axis_resources_regexes=[], mesh=self.mesh, + ckpt_manager = mock.create_autospec(orbax.checkpoint.CheckpointManager, + instance=True) + ckpt_manager.latest_step.return_value = None + train_state, last_seen = trainer.restore_or_create_train_state( + ckpt_manager=ckpt_manager, + 
initialize_fn=self.initialize_fn, + axis_resources_regexes=[], + mesh=self.mesh, initialization_kwargs={}) chex.assert_trees_all_close(flax.core.unfreeze(train_state.params), { 'a': 1 * np.ones((5,), dtype=np.float32), 'b': 2 * np.ones((10,), dtype=np.float32), }) chex.assert_trees_all_equal(train_state.step, 0) + self.assertIsNone(last_seen) - @mock.patch.object(trainer.checkpoints, - 'find_latest_complete_checkpoint_for_prefix', - return_value='/foo/ckpt_1') - @mock.patch.object(trainer.checkpoints, 'restore_checkpoint_partitioned', - autospec=True) - def test_continue_training(self, mock_restore_checkpoint, _): + def test_continue_training(self): """Tests when training continues from an existing checkpoint.""" # Mock the call to restore_checkpoint_partitioned. def restore_checkpoint_side_effect(*args, **kwargs): @@ -356,19 +365,29 @@ def f(): }) return train_state with self.mesh: - return pjit.pjit(f, out_shardings=None)() - mock_restore_checkpoint.side_effect = restore_checkpoint_side_effect + state = pjit.pjit(f, out_shardings=None)() + return { + 'state': state, + 'dataset_iterator': {'last_seen_index': 16}, + } + ckpt_manager = mock.create_autospec(orbax.checkpoint.CheckpointManager, + instance=True) + ckpt_manager.latest_step.return_value = 3 + ckpt_manager.restore.side_effect = restore_checkpoint_side_effect # Call restore_or_create_train_state and check that the outputs are the # expected ones. - train_state = trainer.restore_or_create_train_state( - prefix='/foo/ckpt_1', initialize_fn=self.initialize_fn, - axis_resources_regexes=[], mesh=self.mesh, + train_state, last_seen = trainer.restore_or_create_train_state( + ckpt_manager=ckpt_manager, + initialize_fn=self.initialize_fn, + axis_resources_regexes=[], + mesh=self.mesh, initialization_kwargs={}) chex.assert_trees_all_close(train_state.params, { 'a': 3 * np.ones((5,), dtype=np.float32), 'b': 4 * np.ones((10,), dtype=np.float32), }) chex.assert_trees_all_equal(train_state.step, 3) + self.assertEqual(last_seen, 16) @mock.patch.object(trainer, 'initialize_train_state_from_checkpoint') def test_initialize_from_checkpoint(self, @@ -399,15 +418,21 @@ def initialize_train_state_from_ckpt_side_effect(*args, **kwargs): initialize_train_state_from_ckpt_side_effect) # Call restore_or_create_train_state and check that the outputs are the # expected ones. - train_state = trainer.restore_or_create_train_state( - prefix='/foo/ckpt_1', initialize_fn=self.initialize_fn, - axis_resources_regexes=[], mesh=self.mesh, + ckpt_manager = mock.create_autospec(orbax.checkpoint.CheckpointManager, + instance=True) + ckpt_manager.latest_step.return_value = None + train_state, last_seen = trainer.restore_or_create_train_state( + ckpt_manager=ckpt_manager, + initialize_fn=self.initialize_fn, + axis_resources_regexes=[], + mesh=self.mesh, initialization_kwargs={'foo': 'bar'}) chex.assert_trees_all_close(flax.core.unfreeze(train_state.params), { 'a': 1 * np.ones((5,), dtype=np.float32), 'b': 5 * np.ones((10,), dtype=np.float32), }) chex.assert_trees_all_equal(train_state.step, 0) + self.assertIsNone(last_seen) class TrainAndEvaluateTest(parameterized.TestCase):
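
Note (not part of the patch): the sketch below summarizes the Orbax checkpointing flow that this change introduces in place of `PeriodicSaveCheckpoint`, using only the `orbax.checkpoint` calls that appear in `vmoe/train/trainer.py` above (`CheckpointManager`, `CheckpointManagerOptions`, `AsyncCheckpointer`, `PyTreeCheckpointHandler`, `JsonCheckpointHandler`, `save`, `latest_step`, `restore`, `wait_until_finished`). The directory, the toy `train_state` dict, and the option values are placeholders; the real trainer passes its partitioned `TrainState`, per-item `ArrayRestoreArgs` for sharded arrays, and the values from `config.save_checkpoint`.

```python
# Minimal sketch of the save/restore cycle used by create_checkpoint_manager()
# and restore_or_create_train_state(). Paths and values are placeholders.
import numpy as np
import orbax.checkpoint

workdir = '/tmp/orbax_demo'                              # placeholder
train_state = {'params': np.zeros((4,)), 'step': 0}      # toy stand-in for TrainState

ckpt_manager = orbax.checkpoint.CheckpointManager(
    f'{workdir}/ckpt',
    {
        # The train state is written asynchronously; timeout_secs plays the role
        # of the old `wait_seconds` knob.
        'state': orbax.checkpoint.AsyncCheckpointer(
            orbax.checkpoint.PyTreeCheckpointHandler(), timeout_secs=300),
        # Iterator progress is tiny, so it is written synchronously as JSON.
        'dataset_iterator': orbax.checkpoint.Checkpointer(
            orbax.checkpoint.JsonCheckpointHandler()),
    },
    options=orbax.checkpoint.CheckpointManagerOptions(
        save_interval_steps=1_000,   # config.save_checkpoint.every_steps
        max_to_keep=1),              # config.save_checkpoint.keep_last
)

# save() is a no-op unless `step` matches save_interval_steps; force=True
# overrides that, which is how the trainer guarantees a checkpoint at step 0
# and at the final step.
ckpt_manager.save(
    0,
    items={'state': train_state, 'dataset_iterator': {'last_seen_index': 0}},
    force=True)
ckpt_manager.wait_until_finished()   # block until the async write completes

# On restart, restore the latest step (latest_step() is None when no
# checkpoint exists yet, in which case the trainer initializes from scratch).
if (step := ckpt_manager.latest_step()) is not None:
    restored = ckpt_manager.restore(
        step,
        items={'state': train_state, 'dataset_iterator': {'last_seen_index': 0}})
    train_state = restored['state']
    last_seen_index = restored['dataset_iterator']['last_seen_index']
```

Because the `'state'` item uses an `AsyncCheckpointer`, `save()` returns before the files are on disk and the wait happens either in `wait_until_finished()` or against the `timeout_secs` deadline; that is the "wait time is computed differently" caveat from the commit message, and why the configs now set `wait_seconds = 300` rather than the old `1.0`/`10` values.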