From 57152d3ab6ee1123d5ee448cb28d4b4b67b4e2a0 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Fri, 21 Jan 2022 17:51:14 -0500 Subject: [PATCH] Removed the use of tmp folders for the server and client. (#133) * Removed the use of tmp folders for the server and client. * Renamed one of the variable names back. * Removed unused import. * Removed the no-longer-needed sleep(). * Removed the unused import. --- .../private/fed/app/client/client_train.py | 20 ++----------------- .../private/fed/app/client/worker_process.py | 7 ++++--- .../private/fed/app/server/server_train.py | 16 +++------------ nvflare/private/fed/client/client_engine.py | 2 +- nvflare/private/fed/client/client_executor.py | 10 ++++++---- nvflare/private/fed/server/server_engine.py | 4 ---- 6 files changed, 16 insertions(+), 43 deletions(-) diff --git a/nvflare/private/fed/app/client/client_train.py b/nvflare/private/fed/app/client/client_train.py index ab26cf70ea..fee9b5f150 100644 --- a/nvflare/private/fed/app/client/client_train.py +++ b/nvflare/private/fed/app/client/client_train.py @@ -16,9 +16,7 @@ import argparse import os -import shutil import sys -import time from nvflare.fuel.common.excepts import ConfigError from nvflare.fuel.sec.audit import AuditService @@ -66,9 +64,7 @@ def main(): os.chdir(args.workspace) AuditService.initialize(audit_file_name="audit.log") - if rank == 0: - workspace = create_workspace(args) - time.sleep(rank * 2) + workspace = os.path.join(args.workspace, "startup") # trainer = WorkFlowFactory().create_client_trainer(train_configs, envs) conf = FLClientStarterConfiger( @@ -76,7 +72,7 @@ def main(): # wf_config_file_name="config_train.json", client_config_file_name=args.fed_client, # env_config_file_name="environment.json", - log_config_file_name=workspace + "/log.config", + log_config_file_name="log.config", kv_list=args.set, ) conf.configure() @@ -173,18 +169,6 @@ def remove_restart_file(args): os.remove(restart_file) -def create_workspace(args): - kv_vars = 
parse_vars(args.set) - workspace = "/tmp/fl/" + kv_vars.get("uid") - - if os.path.exists(workspace): - shutil.rmtree(workspace) - startup = os.path.join(args.workspace, "startup") - shutil.copytree(startup, workspace) - - return workspace - - def create_admin_agent( client_args, client_id, req_processors, secure_train, server_args, federated_client, args, is_multi_gpu, rank ): diff --git a/nvflare/private/fed/app/client/worker_process.py b/nvflare/private/fed/app/client/worker_process.py index e30b276e04..88ba61b367 100644 --- a/nvflare/private/fed/app/client/worker_process.py +++ b/nvflare/private/fed/app/client/worker_process.py @@ -35,6 +35,7 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument("--workspace", "-m", type=str, help="WORKSPACE folder", required=True) + parser.add_argument("--startup", "-w", type=str, help="startup folder", required=True) parser.add_argument( "--fed_client", "-s", type=str, help="an aggregation server specification json file", required=True @@ -88,17 +89,17 @@ def main(): ) ) - workspace = os.path.join("/tmp/fl", client_name) + startup = args.startup app_root = os.path.join(args.workspace, "run_" + str(run_number), "app_" + client_name) app_log_config = os.path.join(app_root, config_folder, "log.config") if os.path.exists(app_log_config): args.log_config = app_log_config else: - args.log_config = os.path.join(workspace, "log.config") + args.log_config = os.path.join(startup, "log.config") conf = FLClientStarterConfiger( - app_root=workspace, + app_root=startup, client_config_file_name=args.fed_client, log_config_file_name=args.log_config, kv_list=args.set, diff --git a/nvflare/private/fed/app/server/server_train.py b/nvflare/private/fed/app/server/server_train.py index f64927791f..0da1f8518b 100644 --- a/nvflare/private/fed/app/server/server_train.py +++ b/nvflare/private/fed/app/server/server_train.py @@ -17,7 +17,6 @@ import argparse import logging import os -import shutil import sys from 
nvflare.fuel.common.excepts import ConfigError @@ -63,15 +62,14 @@ def main(): try: os.chdir(args.workspace) - create_workspace(args) # YC: is this still useful? - # trainer = WorkFlowFactory().create_server_trainer(train_configs, envs) + startup = os.path.join(args.workspace, "startup") conf = FLServerStarterConfiger( - app_root="/tmp/fl_server", + app_root="startup", # wf_config_file_name="config_train.json", server_config_file_name=args.fed_server, # env_config_file_name="environment.json", - log_config_file_name="/tmp/fl_server/log.config", + log_config_file_name="log.config", kv_list=args.set, ) log_level = os.environ.get("FL_LOG_LEVEL", "") @@ -118,7 +116,6 @@ def main(): except ConfigError as ex: print("ConfigError:", str(ex)) finally: - # shutil.rmtree("/tmp/fl_server") pass @@ -217,13 +214,6 @@ def create_admin_server(fl_server, server_conf=None, args=None, secure_train=Fal return admin_server -def create_workspace(args): - if os.path.exists("/tmp/fl_server"): - shutil.rmtree("/tmp/fl_server") - startup = os.path.join(args.workspace, "startup") - shutil.copytree(startup, "/tmp/fl_server") - - if __name__ == "__main__": """ This is the main program when starting the NVIDIA FLARE server process. 
diff --git a/nvflare/private/fed/client/client_engine.py b/nvflare/private/fed/client/client_engine.py index 71dbf5fbde..1a69c9839e 100644 --- a/nvflare/private/fed/client/client_engine.py +++ b/nvflare/private/fed/client/client_engine.py @@ -45,7 +45,7 @@ def __init__(self, client, client_name, sender, args, rank, workers=5): self.args = args self.rank = rank self.client.process = None - self.client_executor = ProcessExecutor(client.client_name) + self.client_executor = ProcessExecutor(client.client_name, os.path.join(args.workspace, "startup")) self.run_number = -1 self.status = MachineStatus.STOPPED diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 6a79d11c9f..8f0a9fd407 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -29,8 +29,8 @@ class ClientExecutor(object): - def __init__(self, uid) -> None: - pipe_path = "/tmp/fl/" + uid + "/comm" + def __init__(self, uid, startup) -> None: + pipe_path = startup + "/comm" if not os.path.exists(pipe_path): os.makedirs(pipe_path) @@ -123,9 +123,10 @@ class ProcessExecutor(ClientExecutor): Run the Client executor in a child process. 
""" - def __init__(self, uid): - ClientExecutor.__init__(self, uid) + def __init__(self, uid, startup): + ClientExecutor.__init__(self, uid, startup) # self.client = client + self.startup = startup self.conn_client = None # self.pool = None @@ -171,6 +172,7 @@ def start_train(self, client, args, app_root, app_custom_folder, listen_port): command = ( f"{sys.executable} -m nvflare.private.fed.app.client.worker_process -m " + args.workspace + + " -w " + self.startup + " -s fed_client.json " " --set" + command_options + " print_conf=True" ) diff --git a/nvflare/private/fed/server/server_engine.py b/nvflare/private/fed/server/server_engine.py index 4e2406cd8f..5809cc000d 100644 --- a/nvflare/private/fed/server/server_engine.py +++ b/nvflare/private/fed/server/server_engine.py @@ -399,10 +399,6 @@ def start_server_training(server, args, app_root, run_number): if os.path.exists(restart_file): os.remove(restart_file) - if os.path.exists(os.path.join(app_root, args.env)): - env_config = args.env - else: - env_config = "/tmp/fl_server/environment.json" try: server_config_file_name = os.path.join(app_root, args.server_config)