Skip to content

Commit

Permalink
Added handling for the empty return code file.
Browse files: browse the repository at this point in the history
  • Loading branch information
yhwen authored and IsaacYangSLA committed Jan 9, 2024
1 parent 8cb6ab8 commit 4327d7c
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 9 deletions.
2 changes: 1 addition & 1 deletion nvflare/private/fed/client/client_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ def _wait_child_process_finish(self, client, job_id, allocated_resource, token,
if child_process:
child_process.wait()

return_code = get_return_code(child_process, job_id, workspace)
return_code = get_return_code(child_process, job_id, workspace, self.logger)

self.logger.info(f"run ({job_id}): child worker process finished with RC {return_code}")
if return_code in [ProcessExitCode.UNSAFE_COMPONENT, ProcessExitCode.CONFIG_ERROR]:
Expand Down
2 changes: 1 addition & 1 deletion nvflare/private/fed/server/server_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def wait_for_complete(self, workspace, job_id, process):
break
time.sleep(0.1)
with self.lock:
return_code = get_return_code(process, job_id, workspace)
return_code = get_return_code(process, job_id, workspace, self.logger)
# if process exit but with Execution exception
if return_code and return_code != 0:
self.logger.info(f"Job: {job_id} child process exit with return code {return_code}")
Expand Down
18 changes: 11 additions & 7 deletions nvflare/private/fed/utils/fed_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,16 +317,20 @@ def get_target_names(targets):
return target_names


def get_return_code(process, job_id, workspace):
def get_return_code(process, job_id, workspace, logger):
run_dir = os.path.join(workspace, job_id)
rc_file = os.path.join(run_dir, FLMetaKey.PROCESS_RC_FILE)
try:
if os.path.exists(rc_file):
if os.path.exists(rc_file):
try:
with open(rc_file, "r") as f:
return_code = int(f.readline())
os.remove(rc_file)
else:
except Exception:
logger.warning(
f"Could not get the return_code from {rc_file} of the job:{job_id}, "
f"Return the RC from the process:{process.pid}"
)
return_code = process.poll()
return return_code
except Exception:
raise RuntimeError(f"Could not get the return_code of the {job_id} execution, process_id:{process.pid}")
else:
return_code = process.poll()
return return_code

0 comments on commit 4327d7c

Please sign in to comment.