diff --git a/docs/examples/xgboost.rst b/docs/examples/xgboost.rst
new file mode 100644
index 0000000000..97aa060bf5
--- /dev/null
+++ b/docs/examples/xgboost.rst
@@ -0,0 +1,27 @@
+Federated XGBoost
+=================
+
+
+Overview
+--------
+
+NVFlare supports federated learning using the popular gradient boosting library XGBoost.
+It uses the XGBoost library with the federated plugin (xgboost version >= 1.7.0rc1) to perform the learning.
+
+Using XGBoost with NVFlare has the following benefits compared with running federated XGBoost directly:
+
+* The XGBoost instance's life-cycle is managed by NVFlare. Both the XGBoost client and server
+  are started/stopped automatically by the NVFlare workflow.
+* For histogram-based XGBoost, the federated server can be configured automatically with an auto-assigned port number.
+* When mutual TLS is used, the certificates are managed by NVFlare using the existing
+  provisioning process.
+* No need to manually configure each instance. Instance-specific parameters
+  like :code:`rank` are assigned automatically by the NVFlare controller.
+
+Examples
+--------
+
+Basic components to run XGBoost are already included with the NVFlare distribution.
+Most XGBoost jobs can be created without custom code.
+
+Please refer to :code:`NVFlare/examples/xgboost` for more details.
diff --git a/docs/index.rst b/docs/index.rst
index 557f0b370e..ee9eae696c 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -33,7 +33,7 @@ NVIDIA FLARE is built on a componentized architecture that gives you the flexibi
 Getting Started
 ===============
-For first-time users and FL researchers, FLARE provides the FL Simulator that allows you to build, test, and deploy applications locally. The :ref:`Getting Started guide ` covers installation and walks through an example application using the FL Simulator.
+For first-time users and FL researchers, FLARE provides the :ref:`fl_simulator` that allows you to build, test, and deploy applications locally. The :ref:`Getting Started guide ` covers installation and walks through an example application using the FL Simulator.
 
 When you are ready for a secure, distributed deployment, the :ref:`Real World Federated Learning ` section covers the tools and process required to deploy and operate a secure, real-world FLARE project.
diff --git a/docs/programming_guide/provisioning_system.rst b/docs/programming_guide/provisioning_system.rst
index 67ee433efb..1928cfdbc0 100644
--- a/docs/programming_guide/provisioning_system.rst
+++ b/docs/programming_guide/provisioning_system.rst
@@ -283,6 +283,67 @@ A new builder to write 'gateway.conf' can be implemented as follows (for referen
             f.write(f"name = {gw.name}\n")
             f.write(f"port = {port}\n")
 
+.. _distribution_builder:
+
+Case 4: adding a builder to create zip archives of the startup kits
+--------------------------------------------------------------------
+DistributionBuilder was included in NVIDIA FLARE before version 2.2.1 but has been removed from the
+default builders. You can make this builder available and add it to project.yml if you want to zip the startup kits::
+
+    import os
+    import shutil
+    import subprocess
+
+    from nvflare.lighter.spec import Builder, Project
+    from nvflare.lighter.utils import generate_password
+
+    class DistributionBuilder(Builder):
+        def __init__(self, zip_password=False):
+            """Build the zip files for each folder.
+            Creates the zip files containing the archives for each startup kit. It will add password protection if the
+            argument (zip_password) is true.
+ Args: + zip_password: if true, will create zipped packages with passwords + """ + self.zip_password = zip_password + + def build(self, project: Project, ctx: dict): + """Create a zip for each individual folder. + Note that if zip_password is True, the zip command will be used to encrypt zip files. Users have to + install this zip utility before provisioning. In Ubuntu system, use this command to install zip utility: + sudo apt-get install zip + Args: + project (Project): project instance + ctx (dict): the provision context + """ + wip_dir = self.get_wip_dir(ctx) + dirs = [ + name + for name in os.listdir(wip_dir) + if os.path.isdir(os.path.join(wip_dir, name)) and "nvflare_" not in name + ] + for dir in dirs: + dest_zip_file = os.path.join(wip_dir, f"{dir}") + if self.zip_password: + pw = generate_password() + run_args = ["zip", "-rq", "-P", pw, dest_zip_file + ".zip", ".", "-i", "startup/*"] + os.chdir(dest_zip_file) + try: + subprocess.run(run_args) + print(f"Password {pw} on {dir}.zip") + except FileNotFoundError: + raise RuntimeError("Unable to zip folders with password. Maybe the zip utility is not installed.") + finally: + os.chdir(os.path.join(dest_zip_file, "..")) + else: + shutil.make_archive(dest_zip_file, "zip", root_dir=os.path.join(wip_dir, dir), base_dir="startup") + +If the above code is made available at ``nvflare.lighter.impl.workspace.DistributionBuilder``, add the following to your project.yml at the bottom of the list of builders:: + + path: nvflare.lighter.impl.workspace.DistributionBuilder + args: + zip_password: true + Takeaways for Custom Builders ----------------------------- From the cases shown previously, implementing your own Builders only requires the following steps: diff --git a/docs/publications_and_talks.md b/docs/publications_and_talks.md index 99faaa26eb..ea6648618e 100644 --- a/docs/publications_and_talks.md +++ b/docs/publications_and_talks.md @@ -40,6 +40,7 @@ NVIDIA FLARE related blogs and other media. #### 2019 * **2019-12** [Federated Learning powered by NVIDIA Clara](https://developer.nvidia.com/blog/federated-learning-clara/) (NVIDIA Technical Blog) +* **2019-10** [What is federated learning - in Chinese](https://blogs.nvidia.com.tw/2019/10/13/what-is-federated-learning/) (NVIDIA Technical Blog) * **2019-10** [NVIDIA Research: First Privacy-Preserving Federated Learning System for Medical Imaging](https://www.youtube.com/watch?v=Jy7ozgwovgg) (NVIDIA video) ## Talks @@ -58,3 +59,6 @@ Recent talks and Webinars covering federated learning research and NVIDIA FLARE. * **2021-09** [Federated Learning](https://www.youtube.com/watch?v=YeYO4JGTBb0&) ([MONAI MICCAI Bootcamp 2021](https://www.gpuhackathons.org/event/monai-miccai-bootcamp-2021)) * **2021-03** [NVIDIA FLARE: An Open Federated Learning Platform](https://www.nvidia.com/en-us/on-demand/session/gtcspring22-se1991/) ([GTC Spring 2022](https://www.nvidia.com/gtc/)) * **2021-03** [Federated Learning for Healthcare – Collaborative AI without Sharing Patient Data ](https://www.youtube.com/watch?v=xr_eJp3ctzw) ([Data Science Seminar](https://www.dkfz.de/en/datascience/seminar/Rieke.html)) + +#### 2020 +* **2020-11** [Federated Learning for Medical Imaging - in Chinese](https://www.youtube.com/watch?v=CiPdALrNEjU) (NVIDIA Taiwan) diff --git a/docs/quickstart.rst b/docs/quickstart.rst new file mode 100644 index 0000000000..a9a7b819bb --- /dev/null +++ b/docs/quickstart.rst @@ -0,0 +1,5 @@ +########## +Quickstart +########## + +See :ref:`quickstart`. 
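The password-protected archives created by the DistributionBuilder above use traditional zip (ZipCrypto) encryption, so on a machine without the ``unzip`` utility they can usually be extracted with Python's built-in ``zipfile`` module as well. A minimal sketch, with a placeholder archive name and password (use the values from your own provision output)::

    from zipfile import ZipFile

    # Placeholder archive name and password printed by the provision run
    with ZipFile("site-1.zip") as kit:
        kit.extractall(path="site-1", pwd=b"password-from-provision-output")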
diff --git a/docs/real_world_fl.rst b/docs/real_world_fl.rst index 3d822d74bf..da7c5d7859 100644 --- a/docs/real_world_fl.rst +++ b/docs/real_world_fl.rst @@ -8,7 +8,9 @@ This section shows how to use NVIDIA FLARE to deploy and operate an FL system. A reference application will be used here to show provisioning and basic operation of the system through the admin client. You will find information here about setting up the system with all the available components and the Admin API for -operating FL. For more details on what you can do with apps with custom components and +operating FL. For instructions on how to set up the :ref:`nvflare_dashboard_ui` added in 2.2.1 to +help gather information to provision a project and distribute startup kits, see :ref:`dashboard_api`. +For more details on what you can do with apps with custom components and the flexibility that the Controller and Worker APIs bring, see the :ref:`programming_guide`. You can also see some `example applications `_ integrating with @@ -24,5 +26,4 @@ to see the capabilities of the system and how it can be operated. real_world_fl/application real_world_fl/job real_world_fl/workspace - real_world_fl/authorization - + user_guide/federated_authorization diff --git a/docs/real_world_fl/application.rst b/docs/real_world_fl/application.rst index 0e0754aaa4..b3a651bde0 100644 --- a/docs/real_world_fl/application.rst +++ b/docs/real_world_fl/application.rst @@ -107,16 +107,15 @@ the client config should have the following in order to configure it as an Execu Configuration of Executor Tasks is ignored here. -Please follow :ref:`quickstart:Quickstart` to learn more. +Please follow :ref:`quickstart` to learn more. .. _troubleshooting_byoc: Troubleshooting BYOC ==================== -There is an ``enable_byoc`` flag for each participant that can be set at provisioning, and if that is disabled, even if -you have custom code in your application folder, it will not be loaded. There is also a setting for ``allow_byoc`` -through the authorization rule groups. This controls whether or not apps containing BYOC code will be allowed to be -uploaded and deployed. +In 2.2.1, authorization has been redesigned and BYOC is no longer controlled through settings at provisioning, but +instead by each site's authorization.json (in the local folder of the workspace). BYOC is a right and can be restricted +to certain roles or even orgs or users. See :ref:`federated_authorization` for details. ********* Resources diff --git a/docs/real_world_fl/authorization.rst b/docs/real_world_fl/authorization.rst deleted file mode 100644 index 30ef3d1b90..0000000000 --- a/docs/real_world_fl/authorization.rst +++ /dev/null @@ -1,236 +0,0 @@ -############# -Authorization -############# - -NVIDIA FLARE implements a role-based authorization framework that determines what a user can or cannot do based on the user's -assigned roles configured through AuthPolicyBuilder at provisioning. - -******************************** -Terminology for FL authorization -******************************** -The following concepts are used in defining an authorization policy for NVIDIA FLARE. - -Rights -====== -A right is a permission for a user to do certain things. For example, the right "train_all" allows the user to do -training for all orgs in a group. - -Rules -===== -A rule is a policy that an org wants to enforce. For example, the rule "allow_byoc" allows BYOC code to be included in -the application configurations deployed to the org's site. 
- -Roles -===== -Even though there may be any number of users, they usually are categorized into several types that share the same -authorization settings. Each such type is called a *role*. A user can be assigned to one or more roles. - -Groups -====== -Even though there could be many orgs in the study, they usually are categorized into several types that share the same -authorization settings. Each such type is called a *group*. An org can be configured to belong to groups, with a group -for specifying rules for sites of the org and a group for rights definitions. - -*************************** -Define authorization policy -*************************** - -Each org can specify its own policies: - - Orgs that share the same authorization policies are put in the same group, and authorization policies are defined for the group. - - For each group, the permission matrix is defined for role-right combinations. - - For each group, permission values are defined for each rule. - -The Right Space is a 3D [group, role, right] matrix of permission values, and the Rule Space is a 2D [group, rule] matrix of permission values. - -Right Evaluation -================ -To determine whether a user has a right on a site: - - - Determine the group(s) that the site belongs to - - Determine the role(s) of the user - - Check the Right Space for each [group, role, right] coordinate. If any point is True, then the result is True. This is what we call the "most generous" policy - as long as any of the user roles has the right in any of the groups the site belongs to, the right is granted. If there is no explicit definition for any point, the default value of the right is taken. - -.. note:: - - Note that what is important is the user's role(s). The user's org is not considered except for deciding if a user is - considered "self" for the site. This can in turn affect the right, for example, if site A is with group configured to - allow "Operate Self" but not "Operate All" for role "lead_researcher", a "lead_researcher" user of site A's org can - "Operate" on site A whereas a user that is only "lead_researcher" of another org does not have "Operate" rights. - -Rule Evaluation -=============== -Similar to right evaluation, we also adopt the "most generous" policy to determine the rule value of a site. - -Determine the group(s) that the site belongs to -Check the Rule Space for each [group, rule] coordinate. If any point is True, then the result is True. -If there is no explicit definition for any point, the default value of the rule is taken. - -Defined Rights -============== -Currently the following rights are defined: - -.. csv-table:: - :header: Right,Description - - Upload application,whether the user is allowed to upload applications. - Deploy All,whether all users of the corresponding role are allowed to deploy applications at sites of a certain group. - Deploy Self,whether users of the corresponding role and of the same org as sites of a certain group are allowed to deploy applications. - Train All,whether all users of the corresponding role are allowed to perform training actions at sites of a certain group. - Train Self,whether users of the corresponding role and of the same org as sites of a certain group are allowed to perform training actions - View All,whether all users of the corresponding role are allowed to view information at sites of a certain group. 
- View Self,whether users of the corresponding role and of the same org as sites of a certain group are allowed to view information - Operate All,whether all users of the corresponding role are allowed to operate at sites of a certain group. - Operate Self,whether users of the corresponding role and of the same org as sites of a certain group are allowed to operate - -.. note:: - - Rights are always in the context of a group, and note that as mentioned above, for evaluating Rights, the rights - group of the site's org is what is important, and the user's org is only important when "All" and "Self" have - different Rights in that group. Otherwise, the user's role(s) are all that will matter for determining Rights. - -Defined Rules -============= -Currently the following rules are defined: - -.. csv-table:: - :header: Rule,Description - - Allow BYOC,whether BYOC code is allowed in application configurations - Allow Custom Data List,whether custom data list is allowed in application configurations - -.. note:: - - For these rules to take effect, the user must provide the implementation of the ``TrainConfigValidator`` - and edit the ``config_fed_server.json``, otherwise they take no effect. - -************************* -Policy definition example -************************* - -The authorization policy is configured in the authz_policy section of study project YAML file: :ref:`project_yml`. When -using the provisioning tool to generate a set of packages, the authorization policy json file is included in the server's -startup kit zip file. - -Here is an example of the generated file:: - - { - "version": "1.0", - - "roles": { - "super": "super user of system", - "lead_researcher": "lead researcher of the study", - "site_researcher": "site researcher of the study", - "site_it": "site IT of the study", - "lead_it": "lead IT of the study" - }, - "groups": { - "relaxed": { - "desc": "the org group with relaxed policies", - "rules": { - "allow_byoc": true, - "allow_custom_datalist": true - } - }, - "strict": { - "desc": "the org group with strict policies", - "rules": { - "allow_byoc": false, - "allow_custom_datalist": false - } - }, - "general": { - "desc": "general group user rights", - "role_rights": { - "super": { - "operate_all": true, - "view_all": true, - "train_all": true - }, - "lead_researcher": { - "train_all": true, - "view_all": true - }, - "site_researcher": { - "train_self": true, - "view_self": true - }, - "lead_it": { - "operate_all": true, - "view_all": true - }, - "site_it": { - "operate_self": true, - "view_self": true - } - } - } - }, - "users": { - "admin@nvidia.com": { - "org": "nvidia", - "roles": ["super"] - }, - "researcher1@org2.com": { - "org": "org2", - "roles": ["lead_it", "site_researcher"] - }, - "researcher2@org1.com": { - "org": "org1", - "roles": ["site_researcher"] - } - }, - "orgs": { - "org1": ["general", "strict"], - "org2": ["general", "relaxed"], - "nvidia": ["general"] - }, - "sites": { - "org1-a": "org1", - "org1-b": "org1", - "org2": "org2", - "server": "nvidia" - } - } - -A few highlights: - - - Each right has a default value. Default values are used for "holes" in the Right Space. - - Each rule has a default value. Default values are used for "holes" in the Rule Space. - - Each user is assigned to a single org and one or more roles; - - Each site is assigned a single org; - - Each org is assigned to one or more groups; - - In each group, a rule and/or right matrix is defined. 
- -**************************** -Admin command authorizations -**************************** - -Each command from the admin user is subject to authorization. The command is executed only if the authorization is passed. - -Commands are grouped into the following action groups for rights: - -UPLOAD - uploading application configuration to the server. -=========================================================== -Command(s) in this group require the "upload application" right. -Furthermore, if the application contains BYOC code, the site's "allow_byoc" must be true. -Furthermore, if the application contains a custom data list, the site's "allow_custom_datalist" must be true. - -DEPLOY - deploy the application to a site -========================================= -Command(s) in this group require the "deploy all" or "deploy self" right. -Furthermore, if the application contains BYOC code, the site's "allow_byoc" must be true. -Furthermore, if the application contains a custom data list, the site's "allow_custom_datalist" must be true. - -TRAIN - training related actions (set run, start/abort training) -================================================================ -Command(s) in this group require the "train all" or "train self" right. - -VIEW - view training and/or system info (ls, head, tail, grep, pwd, …) -================================================================================ -Command(s) in this group require the "view all" or "view self" right. - -OPERATE - application operation (shutdown, restart server/clients, sys_info) -============================================================================ -Command(s) in this group require the "operate all" or "operate self" right. - - diff --git a/docs/real_world_fl/overview.rst b/docs/real_world_fl/overview.rst index 228d673ce1..707bdc0893 100644 --- a/docs/real_world_fl/overview.rst +++ b/docs/real_world_fl/overview.rst @@ -7,8 +7,8 @@ Introduction ************ NVIDIA FLARE utilizes provisioning and admin clients to reduce the amount of human coordination involved to set up a -federated learning project. A provisioning tool can be configured to create a startup kit for each site in an encrypted -package. These packages can then be delivered to each site ready to go, streamlining the process to provision, start, +federated learning project. A provisioning tool can be configured to create a startup kit for each site. +These packages can then be delivered to each site ready to go, streamlining the process to provision, start, and operate federated learning with a trusted setup. Provision - Start - Operate @@ -16,7 +16,7 @@ Provision - Start - Operate Provision --------- -Lead IT generates the packages for the server / clients / admins, protected with passwords +Project administrator generates the packages for the server / clients / admins Start ----- @@ -43,16 +43,23 @@ Provisioning a federated learning project The :ref:`provisioning` page has details on the contents of the provisioning tool and the underlying NVIDIA FLARE Open Provision API, which you can use to customize configurations to fit your own requirements. +.. note:: + + Starting in NVIDIA FLARE version 2.2.1, the :ref:`nvflare_dashboard_ui` has been introduced for an easier experience for + provisioning a project and distributing the startup kits. If you are using the Dashboard UI, see :ref:`dashboard_api` for + details on how to set it up, and you can skip the rest of this :ref:`provisioned_setup` section. 
+ Edit the :ref:`programming_guide/provisioning_system:Project yaml file` in the directory with the provisioning tool to meet your -project requirements (make sure the server, client sites, admin, orgs, enable_byoc settings, and everything else are right +project requirements (make sure the server, client sites, admin, orgs, and everything else are right for your project). Then run the provision command with (here we assume your project.yml is in current working directory):: - provision -p project.yml + nvflare provision -p project.yml The generated startup kits are created by default in a directory prefixed with "prod\_" within a folder of the project -name in the workspace folder created where provision.py is run. +name in the workspace folder created where provision.py is run. To create password protected zip archives for the startup +kits, see :ref:`distribution_builder`. .. attention:: @@ -64,42 +71,9 @@ name in the workspace folder created where provision.py is run. signed by :class:`SignatureBuilder` so the system will detect if any of the files have been altered and may not run. -The console displays a list of zip files and their passwords. We suggest you copy the console output -and "packages" folder to a safe location. The passwords shown below are for demonstration purposes only:: - - Project yaml file: /home/nvflare-venv/project.yml. - ┏━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓ - ┃ participant ┃ org ┃ destination ┃ password ┃ - ┡━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩ - │ localhost │ nvidia │ localhost.zip │ Dby2BhwJdLKfStWl │ - │ org1-a │ org1 │ org1-a.zip │ BlLXFKgWp0Qu28cS │ - │ org1-b │ org1 │ org1-b.zip │ Lu6w0aCf1RhAqSlJ │ - │ org2 │ org2 │ org2.zip │ KdotOafkUl2ziRh5 │ - │ org3 │ org3 │ org3.zip │ t05cHjnd4WNSo62x │ - │ admin@nvidia.com │ nvidia │ admin@nvidia.com.zip │ eyTrthBudD7noW1s │ - │ researcher@nvidia.com │ nvidia │ researcher@nvidia.com.zip │ s52b8h9ToIRuALpx │ - │ researcher@org1.com │ org1 │ researcher@org1.com.zip │ dmlt3ySsAeU0V9F7 │ - │ researcher@org2.com │ org2 │ researcher@org2.com.zip │ GJS6eb410q0ijlCZ │ - │ it@org2.com │ org2 │ it@org2.com.zip │ s3lYvaL2tqX0Wrjb │ - └───────────────────────┴────────┴───────────────────────────┴──────────────────┘ - -.. tip:: For security reasons, it is recommended to send the password to each participant separately from the package itself. - -After generating packages: Distribute and extract -================================================= -Please let each participant know that the packages are password protected. In Ubuntu, the following command can be used -to extract the packages:: - - unzip -P $PASSWORD $ZIP_FILE -d $DIRECTORY_TO_EXTRACT_TO - -Using ``-d $DIRECTORY_TO_EXTRACT_TO`` is optional, and without it, a "startup" folder will be extracted to the current -directory the package is in. Either way, the parent folder containing this "startup" folder (*$DIRECTORY_TO_EXTRACT_TO* -if the ``-d`` option was used) will be the server, client, or admin client workspace root directory, and the party -running the package will need write access there. - .. note:: - It is important that this "startup" folder is not renamed because the code relies upon this for operation. Please + It is important that the "startup" folder in each startup kit is not renamed because the code relies upon this for operation. Please note that a "transfer" directory and deployed applications will be created at the level of this "startup" folder. 
See the section on `Internal folder and file structures for NVIDIA FLARE`_ below for more details. @@ -109,24 +83,24 @@ Start: Instructions for each participant to start running FL with their startup .. attention:: Please always safeguard .key files! These are the critical keys for secure communication! -Overseer ($OVERSEER_NAME.zip) +Overseer ============================= One single Overseer will keep track of all the FL servers and communicate to all the participants through their Overseer Agents the active FL server or SP. -After unzipping the package for the Overseer, run the start.sh file from the "startup" folder you unzipped to start the Overseer. +In the package for the Overseer, run the start.sh file from the "startup" folder to start the Overseer. If clients from other machines cannot connect to the Overseer, make sure that the hostname (name of the server under participants in project.yml) specified when generating the startup kits in the provisioning process resolves to the correct IP. If the FL server is on an internal network without a DNS hostname, in Ubuntu, an entry may need to be added to ``/etc/hosts`` with the internal IP and the hostname. -Federated learning servers ($SERVER_NAME.zip) +Federated learning servers ============================================= Server will coordinate the federated learning training and be the main hub all clients and admin clients connect to. -After unzipping the package server.zip, run the start.sh file from the "startup" folder you unzipped to start the server. +In the package for each server, run the start.sh file from the "startup" folder to start the server. The rootCA.pem file is pointed to by "ssl_root_cert" in fed_server.json. If you plan to move/copy it to a different place, you will need to modify fed_server.json. The same applies to the other two files, server.crt and server.key. @@ -143,13 +117,13 @@ participants in project.yml) specified when generating the startup kits in the p correct IP. If the FL server is on an internal network without a DNS hostname, in Ubuntu, an entry may need to be added to ``/etc/hosts`` with the internal IP and the hostname. -Federated learning client ($CLIENT_NAME.zip) +Federated learning clients ============================================ Each site participating in federated learning training is a client. Each package for a client is named after the client name specified when provisioning the project. -After unzipping the package (for details see `After generating packages: Distribute and extract`_), run ``start.sh`` -from the "startup" folder you unzipped to start the client. +In the package for each client, run ``start.sh`` +from the "startup" folder to start the client. .. tip:: diff --git a/docs/user_guide/dashboard_api.rst b/docs/user_guide/dashboard_api.rst index b123f0e466..d4890476d3 100644 --- a/docs/user_guide/dashboard_api.rst +++ b/docs/user_guide/dashboard_api.rst @@ -3,11 +3,11 @@ ######################### Dashboard in NVIDIA FLARE ######################### -As mentioned in :ref:`provisioning`, NVIDIA FLARE system requires a set of startup kits -which include the private keys and certificates, signed by the root CA, in order to communicate to one another. 
-The new Dashboard in NVIDIA FLARE provides a simple way to collect information of clients and users from different organizations, +As mentioned in :ref:`provisioning`, the NVIDIA FLARE system requires a set of startup kits +which include the private keys and certificates (signed by the root CA) in order to communicate to one another. +The new :ref:`nvflare_dashboard_ui` in NVIDIA FLARE provides a simple way to collect information of clients and users from different organizations, as well as to generate those startup kits for users to download. - + Most of the details about provisioning can be found in :ref:`provisioning`. In this section, we focus on the user interaction with Dashboard and its backend API. ***************************** diff --git a/docs/user_guide/dashboard_ui.rst b/docs/user_guide/dashboard_ui.rst index 5156d1f37a..fc28176349 100644 --- a/docs/user_guide/dashboard_ui.rst +++ b/docs/user_guide/dashboard_ui.rst @@ -4,14 +4,14 @@ NVFLARE Dashboard UI ###################################################### -The NVFlare Dashboard is a new optional addition to NVIDIA FLARE in version 2.2 that allows for the project administrator +The NVFlare Dashboard is a new optional addition to NVIDIA FLARE in version 2.2.1 that allows for the project administrator to deploy a website to gather information about the sites and distribute startup kits. This simplifies the process of data collection and provisioning with users registering to join the project and provide their own information then downloading their own startup kits once the project admin has approved the registration. All the project information can be managed online with provisioning done on the fly. -As of version 2.2, users with role ``Member`` or ``Lead`` can register a user account and then download the startup kit +As of version 2.2.1, users with role ``Member`` or ``Lead`` can register a user account and then download the startup kit for the FLARE console once the account is approved. Users of the role ``Org Admin`` have the additional ability to specify the name and resource specifications for client sites diff --git a/docs/user_guide/federated_authorization.rst b/docs/user_guide/federated_authorization.rst index a812daddd4..0638cd9bbb 100644 --- a/docs/user_guide/federated_authorization.rst +++ b/docs/user_guide/federated_authorization.rst @@ -17,16 +17,18 @@ Here are some examples that an org can do: Centralized vs. Federated Authorization ======================================== -In NVFLARE 2.1 and before, authorization policy is centrally enforced by the FL Server. A recent security review determined this to be a weakness. In a true federated environment, each organization should be able to define and enforce their own authorization policy, instead of relying others (such as FL Server that is owned by a separate org) to do so. +In NVFLARE before version 2.2.1, the authorization policy was centrally enforced by the FL Server. In a true federated environment, each organization should be able to define and enforce their own authorization policy instead of relying others (such as FL Server that is owned by a separate org) to do so. 
-NVFLARE 2.2 changes to federated authorization where each organization defines and enforces its own authorization policy: +NVFLARE 2.2.1 changes the way authorization is implemented to federated authorization where each organization defines and enforces its own authorization policy: - - Each organization defines its policy in its own authorization.json + - Each organization defines its policy in its own authorization.json (in the local folder of the workspace) - This locally defined policy is loaded by FL Clients owned by the organization - The policy is also enforced by these FL Clients This decentralized authorization has an added benefit: since each organization takes care of its own authorization, there will be no need to update the policy of any other participants (FL Server or Clients) when a new orgs or clients are added. +See `Federated Policies (Github) `_ for a working example with federated site policies for authorization. + Simplified Authorization Policy Configuration ============================================== Since each organization defines its own policy, there will be no need to centrally define all orgs and users. The policy configuration for an org is simply a matrix of role/right permissions. Each role/right combination in the permission matrix answers this question: what kind of users of this role can have this right? @@ -35,7 +37,7 @@ To answer this question, the role/right combination defines one or more conditio Roles ----- -Users are classified into roles. NVFLARE 2.2 defines four roles: +Users are classified into roles. NVFLARE defines four roles starting in 2.2.1: - Project Admin - this role is responsible for the whole FL project; - Org Admin - this role is responsible for the administration of all sites in its org. Each org must have one Org Admin; @@ -44,7 +46,7 @@ Users are classified into roles. NVFLARE 2.2 defines four roles: Rights ------ -NVFLARE 2.2 supports more accurate right definitions to be more flexible: +NVFLARE 2.2.1 supports more accurate right definitions to be more flexible: - Each server-side admin command is a right! This makes it possible for an org to control each command explicitly; - Admin commands are grouped into categories. For example, commands like abort_job, delete_job, start_app are in manage_job category; all shell commands are put into the shell_commands category. Each category is also a right. @@ -52,7 +54,7 @@ NVFLARE 2.2 supports more accurate right definitions to be more flexible: This right system makes it easy to write simple policies that only use command categories. It also makes it possible to write policies to control individual commands. When both categories and commands are used, command-based control takes precedence over category-based control. -See Appendix One for command categories. +See :ref:`command_categories` for command categories. Controls and Conditions ----------------------- @@ -85,7 +87,7 @@ In addition, two words are used for extreme conditions: - Any user is allowed: any - No user is allowed: none -See Appendix Two for an example policy. +See :ref:`sample_auth_policy` for an example policy. Policy Evaluation ----------------- @@ -127,6 +129,7 @@ There are multiple commands (clone_job, delete_job, download_job, etc.) in the " Job management command authorization often evaluates the relationship between the subject user and the job submitter, as shown in the examples. +.. 
_command_categories: Appendix One - Command Categories ================================= @@ -171,9 +174,13 @@ Appendix One - Command Categories } +.. _sample_auth_policy: + Appendix Two - Sample Policy with Explanations ============================================== +This is an example authorization.json (in the local folder of the workspace for a site). + .. code-block:: shell { diff --git a/examples/xgboost/README.md b/examples/xgboost/README.md index 6de16a079e..1d17f288b4 100644 --- a/examples/xgboost/README.md +++ b/examples/xgboost/README.md @@ -120,12 +120,105 @@ The script will also generate 2 configs in `histogram-based/job_configs` for his - histogram-based training with uniform data split for 2 clients - histogram-based training with uniform data split for 5 clients +## Run experiments for tree-based and histogram-based settings +After you run the two scripts `data_split_gen.sh` and `job_config_gen.sh`, +please go to sub-folder [tree-based](tree-based) for running tree-based algorithms, +and sub-folder [histogram-based](histogram-based) for running histogram-based algorithms. + +## GPU support By default, CPU based training is used. +If the CUDA is installed on the site, tree construction and prediction can be +accelerated using GPUs. + +GPUs are enabled by using :code:`gpu_hist` as :code:`tree_method` parameter. +For example, +:: + "xgboost_params": { + "max_depth": 8, + "eta": 0.1, + "objective": "binary:logistic", + "eval_metric": "auc", + "tree_method": "gpu_hist", + "gpu_id": 0, + "nthread": 16 + } + For GPU based training, edit `job_config_gen.sh` to change `TREE_METHOD="hist"` to `TREE_METHOD="gpu_hist"`. +Then run the `job_config_gen.sh` again to generates new job configs for GPU-based training. + +### Multi GPU support + +Multiple GPUs can be supported by running one NVFlare client for each GPU. Each client +runs a different NVFlare app with the corresponding :code:`gpu_id` assigned. + +Assuming there are 2 physical client sites, each with 2 GPUs (id 0 and 1). +We can start 4 NVFlare client processes (site-1a, site-1b, site-2a, site-2b), one for each GPU. +The job layout looks like this, +:: + + xgb_multi_gpu_job + ├── app_server + │ └── config + │ └── config_fed_server.json + ├── app_site1_gpu0 + │ └── config + │ └── config_fed_client.json + ├── app_site1_gpu1 + │ └── config + │ └── config_fed_client.json + ├── app_site2_gpu0 + │ └── config + │ └── config_fed_client.json + ├── app_site2_gpu1 + │ └── config + │ └── config_fed_client.json + └── meta.json + +Each app is deployed to its own client site. Here is the :code:`meta.json`, +:: + + { + "name": "xgb_multi_gpu_job", + "resource_spec": { + "site-1a": { + "num_of_gpus": 1, + "mem_per_gpu_in_GiB": 1 + }, + "site-1b": { + "num_of_gpus": 1, + "mem_per_gpu_in_GiB": 1 + }, + "site-2a": { + "num_of_gpus": 1, + "mem_per_gpu_in_GiB": 1 + }, + "site-2b": { + "num_of_gpus": 1, + "mem_per_gpu_in_GiB": 1 + } + }, + "deploy_map": { + "app_server": [ + "server" + ], + "app_site1_gpu0": [ + "site-1a" + ], + "app_site1_gpu1": [ + "site-1b" + ], + "app_site2_gpu0": [ + "site-2a" + ], + "app_site2_gpu1": [ + "site-2b" + ] + }, + "min_clients": 4 + } + +For federated XGBoost, all clients must participate in the training. Therefore, +:code:`min_clients` must equal to the number of clients. 
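To make the connection to the underlying library explicit: the `xgboost_params` dictionary shown above is what ultimately reaches `xgboost.train()` as its `params` argument inside the executor. Here is a rough standalone sketch of the equivalent local training call (the HIGGS CSV path is only a placeholder; in a federated run the NVFlare executor issues this call for you):

```python
import xgboost as xgb

# Booster parameters, mirroring the "xgboost_params" block in the client config
params = {
    "max_depth": 8,
    "eta": 0.1,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "tree_method": "gpu_hist",
    "gpu_id": 0,
    "nthread": 16,
}

# DMatrix accepts the same "csv + label_column" URI style used in the job configs (placeholder path)
dtrain = xgb.DMatrix("/tmp/higgs.train.csv?format=csv&label_column=0")
bst = xgb.train(params, dtrain, num_boost_round=100)
```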
-## Run experiments for tree-based and histogram-based settings
-After you run the two scripts `data_split_gen.sh` and `job_config_gen.sh`,
-please go to sub-folder [tree-based](tree-based) for running tree-based algorithms,
-and sub-folder [histogram-based](histogram-based) for running histogram-based algorithms.
diff --git a/examples/xgboost/histogram-based/README.md b/examples/xgboost/histogram-based/README.md
index 3d1bace27f..b50c8a8485 100644
--- a/examples/xgboost/histogram-based/README.md
+++ b/examples/xgboost/histogram-based/README.md
@@ -28,6 +28,7 @@ To run in a federated setting, follow [Real-World FL](https://nvflare.readthedoc
 start the overseer, FL servers and FL clients.
 
 You need to download the HIGGS data on each client site.
+You will also need to install xgboost on each client site and the server site.
 You can still generate the data splits and job configs using the scripts provided.
 
@@ -37,3 +38,14 @@ inside the `/tmp/nvflare/xgboost_higgs_dataset` folder,
 since each site might save the HIGGS dataset in different places.
 
 Then you can use admin client to submit the job via `submit_job` command.
+
+## Customization
+
+The provided XGBoost executor can be customized using the Booster parameters
+provided in the `xgboost_params` argument.
+
+If the parameter change alone is not sufficient and code changes are required,
+a custom executor can be implemented to make calls to the xgboost library directly.
+
+The custom executor can inherit the base class `FedXGBHistogramExecutorSpec` and
+implement the `xgb_train()` and `load_data()` methods.
diff --git a/examples/xgboost/tree-based/README.md b/examples/xgboost/tree-based/README.md
index 82c575c9e8..37005d85ee 100644
--- a/examples/xgboost/tree-based/README.md
+++ b/examples/xgboost/tree-based/README.md
@@ -72,6 +72,7 @@ To run in a federated setting, follow [Real-World FL](https://nvflare.readthedoc
 start the overseer, FL servers and FL clients.
 
 You need to download the HIGGS data on each client site.
+You will also need to install xgboost on each client site and the server site.
 You can still generate the data splits and job configs using the scripts provided.
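To make the customization path described above more concrete, here is a rough sketch of such a subclass. It is based on the `FedXGBHistogramExecutor` constructor shown elsewhere in this change and only customizes data loading, reusing the built-in `xgb_train()` logic; the exact signature and return value of `load_data()` are assumptions to verify against the executor spec:

```python
import xgboost as xgb

from nvflare.app_opt.xgboost.histogram_based.executor import FedXGBHistogramExecutor


class CSVDataXGBExecutor(FedXGBHistogramExecutor):
    """Hypothetical subclass: only data loading is customized; xgb_train is inherited."""

    def __init__(self, data_root, num_rounds=100, early_stopping_round=2, xgboost_params=None, verbose_eval=False):
        super().__init__(
            num_rounds=num_rounds,
            early_stopping_round=early_stopping_round,
            xgboost_params=xgboost_params or {"objective": "binary:logistic", "tree_method": "hist"},
            verbose_eval=verbose_eval,
        )
        self.data_root = data_root

    def load_data(self):
        # Assumed contract: return the training and validation DMatrix objects.
        dtrain = xgb.DMatrix(f"{self.data_root}/higgs.train.csv?format=csv&label_column=0")
        dvalid = xgb.DMatrix(f"{self.data_root}/higgs.test.csv?format=csv&label_column=0")
        return dtrain, dvalid
```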
diff --git a/examples/xgboost/utils/prepare_data_split.py b/examples/xgboost/utils/prepare_data_split.py index 2036d18048..21b9cb161e 100644 --- a/examples/xgboost/utils/prepare_data_split.py +++ b/examples/xgboost/utils/prepare_data_split.py @@ -25,11 +25,16 @@ def data_split_args_parser(): parser.add_argument("--site_num", type=int, help="Total number of sites") parser.add_argument("--site_name_prefix", type=str, default="site-", help="Site name prefix") parser.add_argument("--size_total", type=int, help="Total number of instances") - parser.add_argument("--size_valid", type=int, - help="Validation size, the first N instances to be treated as validation data") - parser.add_argument("--split_method", type=str, default="uniform", - choices=["uniform", "linear", "square", "exponential"], - help="How to split the dataset") + parser.add_argument( + "--size_valid", type=int, help="Validation size, the first N instances to be treated as validation data" + ) + parser.add_argument( + "--split_method", + type=str, + default="uniform", + choices=["uniform", "linear", "square", "exponential"], + help="How to split the dataset", + ) parser.add_argument("--out_path", type=str, default="~/dataset", help="Output path for the data split json file") return parser diff --git a/nvflare/app_opt/xgboost/README.rst b/nvflare/app_opt/xgboost/README.rst deleted file mode 100644 index bde999291f..0000000000 --- a/nvflare/app_opt/xgboost/README.rst +++ /dev/null @@ -1,267 +0,0 @@ -Federated XGBoost -================= - - -Overview --------- - -NVFlare supports federated learning using popular gradient boosting library XGBoost. -It uses XGBoost library with federated plugin to perform the learning. - -Following components are provided to run XGBoost jobs in NVFlare, - -* :code:`nvflare.app_opt.xgboost.controller.XGBFedController`: The controller - that starts the XGBoost federated server and kicks off all the XGBoost job on - each NVFlare client. The configuration is generic for this component and - no modification is needed for most training jobs. -* :code:`nvflare.app_opt.xgboost.executor.XGBExecutor`: This is the executor - running on each NVFlare client, which starts XGBoost training. The - configuration for this component needs to be customized for every site and for - each training job because it contains job-specific parameters like location - of training data. - -Using XGBoost with NVFlare has following benefits compared with running federated XGBoost directly, - -* XGBoost instance's life-cycle is managed by NVFlare. Both XGBoost client and server - are started/stopped automatically by NVFlare workflow. -* XGBoost federated server can be configured automatically with auto-assigned port number. -* When mutual TLS is used, the certificates are managed by NVFlare using existing - provisioning process. -* No need to manually configure each instance. Instance specific parameters - like code:`rank` are assigned automatically by the NVFlare controller. - -Requirements ------------- - -The XGBoost library with federated plugin must be installed on all the sites involved -in the learning. - -Following the instructions here to build the XGBoost library with federated plugin, - -https://github.com/dmlc/xgboost/tree/master/plugin/federated#readme - -The Python package for XGBoost is also required. It can be installed using pip, -:: - pip install xgboost - -Usage ------ - -Basic components to run XGBoost are already included with NVFlare distribution. -Most XGBoost jobs can be created without custom code. 
- -Please refer to :code:`NVFlare/examples/hello-xgboost` for an example -of a simple XGBoost learning job. - -The server workflow is the same for all jobs, so the server configuration can be used -as is without modification. The default configuration starts the XGBoost federated -server on a random port. If a particular port (e.g. 4321) is required, it can be -configured as following, -:: - - { - "format_version": 2, - "server": { - "heart_beat_timeout": 600 - }, - "task_data_filters": [], - "task_result_filters": [], - "components": [], - "workflows": [ - { - "id": "xgb_controller", - "path": "nvflare.app_opt.xgboost.controller.XGBFedController", - "args": { - "train_timeout": 30000, - "port": 3456 - } - } - ] - } - -The client configuration uses an executor to run XGBoost. For example, -:: - - { - "format_version": 2, - "data_root": "/dataset/", - "components": [], - "executors": [ - { - "tasks": [ - "train" - ], - "executor": { - "id": "Executor", - "path": "nvflare.app_opt.xgboost.executor.XGBExecutor", - "args": { - "train_data": "{data_root}higgs.train.csv.1?format=csv&label_column=0", - "test_data": "{data_root}higgs.test.csv.1?format=csv&label_column=0", - "num_rounds": 100, - "early_stopping_round": 2, - "xgboost_params": { - "max_depth": 8, - "eta": 0.1, - "objective": "binary:logistic", - "eval_metric": "auc", - "tree_method": "hist", - "nthread": 16 - } - } - } - } - ], - - "task_result_filters": [], - "task_data_filters": [] - } - -These parameters need to be adjusted for each learning job and for each site. -Most parameters are self-explanatory. Here are descriptions for a -few commonly used ones, - -* :code:`train_data`: Location of the local training data. - This is directly fed into DMatrix so it can be in any format - that's supported by DMatrix. -* :code:`test_data`: Location of the local test data for verification. - Also in DMatrix format. -* :code:`xgboost_params`: This dict is passed to :code:`xgboost.train()` as the first - argument :code:`params`. It contains all the Booster parameters. - Please refer to XGBoost documentation for details: - https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.training - - -GPU Support ------------ - -If the CUDA is installed on the site, tree construction and prediction can be -accelerated using GPUs. - -GPUs are enabled by using :code:`gpu_hist` as :code:`tree_method` parameter. -For example, -:: - "xgboost_params": { - "max_depth": 8, - "eta": 0.1, - "objective": "binary:logistic", - "eval_metric": "auc", - "tree_method": "gpu_hist", - "gpu_id": 0, - "nthread": 16 - } - -Multiple GPUs can be supported by running one NVFlare client for each GPU. Each client -runs a different NVFlare app with the corresponding :code:`gpu_id` assigned. - -Assuming there are 2 client sites, each with 2 GPUs (id 0 and 1), 4 NVFlare -client sites are needed, one for each GPU. The job layout looks like this, -:: - - xgb_multi_gpu_job - ├── app_server - │ └── config - │ └── config_fed_server.json - ├── app_site1_gpu0 - │ └── config - │ └── config_fed_client.json - ├── app_site1_gpu1 - │ └── config - │ └── config_fed_client.json - ├── app_site2_gpu0 - │ └── config - │ └── config_fed_client.json - ├── app_site2_gpu1 - │ └── config - │ └── config_fed_client.json - └── meta.json - -Each app is deployed to its own client site. 
Here is the :code:`meta.json`, -:: - - { - "name": "xgb_multi_gpu_job", - "resource_spec": { - "site-1a": { - "num_of_gpus": 1, - "mem_per_gpu_in_GiB": 1 - }, - "site-1b": { - "num_of_gpus": 1, - "mem_per_gpu_in_GiB": 1 - }, - "site-2a": { - "num_of_gpus": 1, - "mem_per_gpu_in_GiB": 1 - }, - "site-2b": { - "num_of_gpus": 1, - "mem_per_gpu_in_GiB": 1 - } - }, - "deploy_map": { - "app_server": [ - "server" - ], - "app_site1_gpu0": [ - "site-1a" - ], - "app_site1_gpu1": [ - "site-1b" - ], - "app_site2_gpu0": [ - "site-2a" - ], - "app_site2_gpu1": [ - "site-2b" - ] - }, - "min_clients": 4 - } - -For federated XGBoost, all clients must participate in the training. There, -:code:`min_clients` must equal to the number of clients. - -Customization -------------- - -The provided XGBoost executor can be customized using Boost parameters -provided in :code:`xgboost_params` argument. - -If the parameter change alone is not sufficient and code changes are required, -a custom executor can be implemented to make calls to xgboost library directly. - -The executor must inherit the base class :code:`XGBExecutorBase` and implement -the :code:`xgb_train()` method. - -For example, following custom executor can be used if a particular objective -function is required, -:: - - class CustomXGBExecutor(XGBExecutorBase): - def xgb_train(self, params: XGBoostParams, fl_ctx: FLContext) -> Shareable: - with xgb.collective.CommunicatorContext(**params.communicator_env): - dtrain = xgb.DMatrix(params.train_data) - dtest = xgb.DMatrix(params.test_data) - watchlist = [(dtest, "eval"), (dtrain, "train")] - bst = xgb.train( - params.xgb_params, - dtrain, - params.num_rounds, - evals=watchlist, - early_stopping_rounds=params.early_stopping_rounds, - verbose_eval=params.verbose_eval, - callbacks=[callback.EvaluationMonitor(rank=self.rank)], - obj=squared_log, - ) - - # Save the model. - workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT) - run_number = fl_ctx.get_prop(FLContextKey.CURRENT_RUN) - run_dir = workspace.get_run_dir(run_number) - bst.save_model(os.path.join(run_dir, "test.model.json")) - xgb.collective.communicator_print("Finished training\n") - - return make_reply(ReturnCode.OK) - -In the above example, :code:`squared_log` function is used as the objective -function, instead of the default one. \ No newline at end of file diff --git a/nvflare/app_opt/xgboost/histogram_based/controller.py b/nvflare/app_opt/xgboost/histogram_based/controller.py index 1f6b12b7f7..715ca5c855 100644 --- a/nvflare/app_opt/xgboost/histogram_based/controller.py +++ b/nvflare/app_opt/xgboost/histogram_based/controller.py @@ -34,6 +34,10 @@ class XGBFedController(Controller): def __init__(self, train_timeout: int = 300, port: int = None): """Federated XGBoost training controller for histogram-base collaboration. + It starts the XGBoost federated server and kicks off all the XGBoost job on + each NVFlare client. The configuration is generic for this component and + no modification is needed for most training jobs. + Args: train_timeout (int, optional): Time to wait for clients to do local training in seconds. 
port (int, optional): the port to open XGBoost FL server diff --git a/nvflare/app_opt/xgboost/histogram_based/executor.py b/nvflare/app_opt/xgboost/histogram_based/executor.py index 6b73822222..684e7a8728 100644 --- a/nvflare/app_opt/xgboost/histogram_based/executor.py +++ b/nvflare/app_opt/xgboost/histogram_based/executor.py @@ -31,17 +31,7 @@ from nvflare.security.logging import secure_format_exception, secure_log_traceback from .constants import XGB_TRAIN_TASK, XGBShareableHeader -from .executor_spec import FedXGBHistogramExecutorSpec - - -class XGBoostParams: - """Container for all XGBoost parameters""" - - def __init__(self, xgb_params: dict, num_rounds=10, early_stopping_rounds=2, verbose_eval=False): - self.num_rounds = num_rounds - self.early_stopping_rounds = early_stopping_rounds - self.verbose_eval = verbose_eval - self.xgb_params: dict = xgb_params if xgb_params else {} +from .executor_spec import FedXGBHistogramExecutorSpec, XGBoostParams class FedXGBHistogramExecutor(FedXGBHistogramExecutorSpec, Executor, ABC): @@ -50,17 +40,21 @@ class FedXGBHistogramExecutor(FedXGBHistogramExecutorSpec, Executor, ABC): This class implements the basic xgb_train logic, the subclass must implement load_data. """ - def __init__(self, num_rounds, early_stopping_round, xgboost_params, verbose_eval=False): - """Federated XGBoost Executor Spec for histogram-base collaboration. + def __init__(self, num_rounds, early_stopping_round, xgboost_params: dict, verbose_eval=False): + """Federated XGBoost Executor for histogram-base collaboration. This class sets up the training environment for Federated XGBoost. - This is an abstract class, load_data and xgb_train method must be implemented by a subclass. + This is an abstract class, load_data method must be implemented by a subclass. + This is the executor running on each NVFlare client, which starts XGBoost training. Args: num_rounds: number of boosting rounds early_stopping_round: early stopping round - xgboost_params: parameters to passed in xgb - verbose_eval: verbose_eval in xgb + xgboost_params: This dict is passed to `xgboost.train()` as the first argument `params`. + It contains all the Booster parameters. + Please refer to XGBoost documentation for details: + https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.training + verbose_eval: verbose_eval in xgboost.train """ super().__init__() @@ -101,6 +95,7 @@ def xgb_train(self, params: XGBoostParams) -> xgb.core.Booster: early_stopping_rounds=params.early_stopping_rounds, verbose_eval=params.verbose_eval, callbacks=[callback.EvaluationMonitor(rank=self.rank)], + obj="squared_log", ) return bst diff --git a/nvflare/app_opt/xgboost/histogram_based/executor_spec.py b/nvflare/app_opt/xgboost/histogram_based/executor_spec.py index 17f9030468..7429ab5182 100644 --- a/nvflare/app_opt/xgboost/histogram_based/executor_spec.py +++ b/nvflare/app_opt/xgboost/histogram_based/executor_spec.py @@ -22,6 +22,14 @@ class XGBoostParams: """Container for all XGBoost parameters""" def __init__(self, xgb_params: dict, num_rounds=10, early_stopping_rounds=2, verbose_eval=False): + """ + + Args: + xgb_params: This dict is passed to `xgboost.train()` as the first argument `params`. + It contains all the Booster parameters. 
+                Please refer to XGBoost documentation for details:
+                https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.training
+        """
         self.num_rounds = num_rounds
         self.early_stopping_rounds = early_stopping_rounds
         self.verbose_eval = verbose_eval
         self.xgb_params: dict = xgb_params if xgb_params else {}
diff --git a/nvflare/fuel/utils/fobs/README.rst b/nvflare/fuel/utils/fobs/README.rst
index da980bc356..5f528298d8 100644
--- a/nvflare/fuel/utils/fobs/README.rst
+++ b/nvflare/fuel/utils/fobs/README.rst
@@ -154,6 +154,25 @@ by MessagePack, a decomposer is included in `fobs` module so no need to further
 The same decomposer can be registered multiple times. Only first one takes effect, the others
 are ignored with a warning message.
 
+Note that :code:`fobs_initialize()` may need to be called if decomposers are not registered.
+
+Enum Types
+----------
+
+FOBS supports enum types by default. Decomposers for all classes derived from :code:`Enum` are
+automatically registered using the generic enum decomposer.
+
+In the rare case that an enum class is too complicated for the generic decomposer to handle,
+a special decomposer can be written and registered. This prevents FOBS from
+auto-registering the generic decomposer for that enum type.
+
+The auto-registration of enum decomposers can be disabled as follows,
+
+::
+
+    fobs.auto_register_enum_types(False)
+
+
 Custom Types
 ------------