From ef8e46ac2437e0dfad9cdeca77fa04db08d75314 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Tue, 28 Nov 2023 07:58:07 +0000 Subject: [PATCH] ReturnnForwardJobV2, wait for checkpoint a bit --- returnn/forward.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/returnn/forward.py b/returnn/forward.py index 0c009bd5..b835c723 100644 --- a/returnn/forward.py +++ b/returnn/forward.py @@ -304,9 +304,20 @@ def create_files(self): # check here if model actually exists if self.model_checkpoint is not None: - assert os.path.exists( - _get_model_path(self.model_checkpoint).get_path() - ), f"Provided model checkpoint does not exists: {self.model_checkpoint}" + import time + + num_tries = 0 + while True: + if os.path.exists(_get_model_path(self.model_checkpoint).get_path()): + break + num_tries += 1 + if num_tries > 10: + raise Exception(f"Provided model checkpoint does not exists: {self.model_checkpoint}") + print( + f"Provided model checkpoint does not exists: {self.model_checkpoint}. " + f"Waiting for 3s (try {num_tries})..." + ) + time.sleep(3) def run(self): """run"""