From 1cb732198f935e3162a8db92c2a92030ea24f84f Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 4 Jun 2024 12:39:43 -0400 Subject: [PATCH 1/3] autoresume check --- composer/trainer/trainer.py | 2 ++ tests/trainer/test_checkpoint.py | 14 ++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index eb5080eaee..7b8926980a 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1726,6 +1726,8 @@ def __init__( if autoresume: log.info('Searching for a previous checkpoint to autoresume') error_message = '' + if max_duration is not None: + error_message += 'The `max_duration` must be specified on trainer.__init__ when autoresume is enabled. ' if save_folder is None: error_message += 'The `save_folder` must be specified when autoresume is enabled. ' if save_overwrite: diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index d23b55875f..12210f7451 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -1212,21 +1212,23 @@ def test_load_weights_object_store(self, tmp_path): ) @pytest.mark.parametrize( - 'run_name,save_folder,save_overwrite,latest_filename', + 'run_name,save_folder,save_overwrite,latest_filename,max_duration', [ - [None, 'first', False, 'latest-rank{rank}.pt'], - ['big-chungus', None, False, 'latest-rank{rank}.pt'], - ['big-chungus', 'first', True, 'latest-rank{rank}.pt'], - ['big-chungus', 'first', False, None], + [None, 'first', False, 'latest-rank{rank}.pt', '2ep'], + ['big-chungus', None, False, 'latest-rank{rank}.pt', '2ep'], + ['big-chungus', 'first', True, 'latest-rank{rank}.pt', '2ep'], + ['big-chungus', 'first', False, None, '2ep'], + ['big-chungus', 'first', False, 'latest-rank{rank}.pt', None], ], ) - def test_autoresume_fail(self, run_name, save_folder, save_overwrite, latest_filename): + def test_autoresume_fail(self, run_name, save_folder, save_overwrite, latest_filename, 
max_duration): with pytest.raises(ValueError): self.get_trainer( latest_filename=latest_filename, save_overwrite=save_overwrite, save_folder=save_folder, run_name=run_name, + max_duration=max_duration, autoresume=True, ) From b382e1842a32ddde37fbfe2ffb783bb5a3ef7aec Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 4 Jun 2024 12:52:57 -0400 Subject: [PATCH 2/3] add checks --- composer/trainer/trainer.py | 16 ++++++++++++++-- tests/trainer/test_checkpoint.py | 22 ++++++++++++++++++++-- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 7b8926980a..cb42094f37 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1723,10 +1723,11 @@ def __init__( # Load Checkpoint self._rng_state = None # If autoresume is enabled, first check for existing checkpoints to load - if autoresume: + self.autoresume = autoresume + if self.autoresume: log.info('Searching for a previous checkpoint to autoresume') error_message = '' - if max_duration is not None: + if max_duration is None: error_message += 'The `max_duration` must be specified on trainer.__init__ when autoresume is enabled. ' if save_folder is None: error_message += 'The `save_folder` must be specified when autoresume is enabled. ' @@ -2190,10 +2191,21 @@ def fit( # Reset Time if reset_time: + if self.autoresume: + raise ValueError( + 'Cannot specify `reset_time=True` when autoresume is enabled. Please instead ' + 'specify `load_ignore_keys` when constructing the Trainer, which will only ' + 'run on the initial load and not any subsequent autoresumptions.', + ) self.state.timestamp = Timestamp() # Max Duration if duration is not None: + if self.autoresume: + raise ValueError( + '`duration` cannot be specified when autoresume is enabled. 
Please instead ' + 'specify `max_duration` when constructing the Trainer.', + ) duration = ensure_time(duration, TimeUnit.EPOCH) if duration.unit == TimeUnit.SECOND: raise ValueError('Wall clock time not an allowed time unit.') diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index 12210f7451..93777dc027 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -667,6 +667,7 @@ def get_trainer( max_duration: str = '2ep', latest_filename: str = 'latest-rank{rank}.pt', file_extension: str = '.pt', + use_scheduler: bool = True, **kwargs, ): if model is None: @@ -704,7 +705,7 @@ def get_trainer( save_filename='ep{epoch}' + file_extension, max_duration=max_duration, optimizers=optimizer, - schedulers=ExponentialScheduler(gamma=0.9), + schedulers=ExponentialScheduler(gamma=0.9) if use_scheduler else None, callbacks=callbacks, **kwargs, ) @@ -1221,7 +1222,7 @@ def test_load_weights_object_store(self, tmp_path): ['big-chungus', 'first', False, 'latest-rank{rank}.pt', None], ], ) - def test_autoresume_fail(self, run_name, save_folder, save_overwrite, latest_filename, max_duration): + def test_autoresume_fail_init(self, run_name, save_folder, save_overwrite, latest_filename, max_duration): with pytest.raises(ValueError): self.get_trainer( latest_filename=latest_filename, @@ -1230,8 +1231,25 @@ def test_autoresume_fail(self, run_name, save_folder, save_overwrite, latest_fil run_name=run_name, max_duration=max_duration, autoresume=True, + use_scheduler=False, ) + @pytest.mark.parametrize( + 'duration,reset_time', + [ + ['1ep', False], + [None, True], + ] + ) + def test_autoresume_fail_fit(self, duration: Optional[str], reset_time: bool): + trainer = self.get_trainer( + run_name='bigtrainer', + save_folder='first', + autoresume=True, + ) + with pytest.raises(ValueError): + trainer.fit(duration=duration, reset_time=reset_time) + def test_different_run_names(self): trainer_1 = self.get_trainer( From 
01a5f1c615fc2dd018fc054af973516c850b7751 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 4 Jun 2024 13:30:13 -0400 Subject: [PATCH 3/3] lint --- tests/trainer/test_checkpoint.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index 93777dc027..dc887fa5e2 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -1235,11 +1235,11 @@ def test_autoresume_fail_init(self, run_name, save_folder, save_overwrite, lates ) @pytest.mark.parametrize( - 'duration,reset_time', - [ - ['1ep', False], - [None, True], - ] + 'duration,reset_time', + [ + ['1ep', False], + [None, True], + ], ) def test_autoresume_fail_fit(self, duration: Optional[str], reset_time: bool): trainer = self.get_trainer(