From 4327d7c3b4e707c491b3f04f13a5dd481e9a1f3d Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Fri, 5 Jan 2024 14:29:10 -0500 Subject: [PATCH] added handle for the empty return code file. --- nvflare/private/fed/client/client_executor.py | 2 +- nvflare/private/fed/server/server_engine.py | 2 +- nvflare/private/fed/utils/fed_utils.py | 18 +++++++++++------- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index e2b7ba868e..eae4b8abb0 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -427,7 +427,7 @@ def _wait_child_process_finish(self, client, job_id, allocated_resource, token, if child_process: child_process.wait() - return_code = get_return_code(child_process, job_id, workspace) + return_code = get_return_code(child_process, job_id, workspace, self.logger) self.logger.info(f"run ({job_id}): child worker process finished with RC {return_code}") if return_code in [ProcessExitCode.UNSAFE_COMPONENT, ProcessExitCode.CONFIG_ERROR]: diff --git a/nvflare/private/fed/server/server_engine.py b/nvflare/private/fed/server/server_engine.py index d9466ad8f4..d6b636b575 100644 --- a/nvflare/private/fed/server/server_engine.py +++ b/nvflare/private/fed/server/server_engine.py @@ -211,7 +211,7 @@ def wait_for_complete(self, workspace, job_id, process): break time.sleep(0.1) with self.lock: - return_code = get_return_code(process, job_id, workspace) + return_code = get_return_code(process, job_id, workspace, self.logger) # if process exit but with Execution exception if return_code and return_code != 0: self.logger.info(f"Job: {job_id} child process exit with return code {return_code}") diff --git a/nvflare/private/fed/utils/fed_utils.py b/nvflare/private/fed/utils/fed_utils.py index 19a28b7f98..7bb43be35a 100644 --- a/nvflare/private/fed/utils/fed_utils.py +++ b/nvflare/private/fed/utils/fed_utils.py @@ -317,16 +317,20 @@ def get_target_names(targets): return target_names -def get_return_code(process, job_id, workspace): +def get_return_code(process, job_id, workspace, logger): run_dir = os.path.join(workspace, job_id) rc_file = os.path.join(run_dir, FLMetaKey.PROCESS_RC_FILE) - try: - if os.path.exists(rc_file): + if os.path.exists(rc_file): + try: with open(rc_file, "r") as f: return_code = int(f.readline()) os.remove(rc_file) - else: + except Exception: + logger.warning( + f"Could not get the return_code from {rc_file} of the job:{job_id}, " + f"Return the RC from the process:{process.pid}" + ) return_code = process.poll() - return return_code - except Exception: - raise RuntimeError(f"Could not get the return_code of the {job_id} execution, process_id:{process.pid}") + else: + return_code = process.poll() + return return_code