diff --git a/checkpoint/orbax/checkpoint/checkpoint_manager.py b/checkpoint/orbax/checkpoint/checkpoint_manager.py index d2e510f0..d2705826 100644 --- a/checkpoint/orbax/checkpoint/checkpoint_manager.py +++ b/checkpoint/orbax/checkpoint/checkpoint_manager.py @@ -364,7 +364,9 @@ def _create_root_directory( if not directory.exists() and utils.is_primary_host( multiprocessing_options.primary_host ): - directory.mkdir(parents=True) + # We need exist_ok=True because the directory might have been created due + # to a race condition. + directory.mkdir(parents=True, exist_ok=True) logging.info('Created directory=%s', directory) multihost.sync_global_processes( multihost.unique_barrier_key(