Merge branch 'main' into new_cuda_plugin

YuanTingHsieh · Dec 2, 2024 · a38a09c · a38a09c
2 parents 91602e8 + ccd28aa
commit a38a09c
Show file tree

Hide file tree

Showing 112 changed files with 4,124 additions and 738 deletions.
diff --git a/3rdParty/bitsandbytes.LICENSE.txt b/3rdParty/bitsandbytes.LICENSE.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) Facebook, Inc. and its affiliates.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/docs/resources/log.config b/docs/resources/log.config
@@ -1,27 +1,27 @@
 [loggers]
-keys=root,modelLogger
+keys=root
 
 [handlers]
-keys=consoleHandler
+keys=consoleHandler,errorFileHandler
 
 [formatters]
 keys=fullFormatter
 
 [logger_root]
 level=INFO
-handlers=consoleHandler
-
-[logger_modelLogger]
-level=DEBUG
-handlers=consoleHandler
-qualname=modelLogger
-propagate=0
+handlers=consoleHandler,errorFileHandler
 
 [handler_consoleHandler]
 class=StreamHandler
 level=DEBUG
 formatter=fullFormatter
 args=(sys.stdout,)
 
+[handler_errorFileHandler]
+class=FileHandler
+level=ERROR
+formatter=fullFormatter
+args=('error_log.txt', 'a')
+
 [formatter_fullFormatter]
 format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
diff --git a/examples/advanced/job_api/tf/README.md b/examples/advanced/job_api/tf/README.md
@@ -7,9 +7,8 @@ All examples in this folder are based on using [TensorFlow](https://tensorflow.o
 
 ## Simulated Federated Learning with CIFAR10 Using Tensorflow
 
-This example shows `Tensorflow`-based classic Federated Learning
-algorithms, namely FedAvg and FedOpt on CIFAR10
-dataset. This example is analogous to [the example using `Pytorch`
+This example demonstrates TensorFlow-based federated learning algorithms on the CIFAR-10 dataset.
+This example is analogous to [the example using `Pytorch`
 backend](https://github.com/NVIDIA/NVFlare/tree/main/examples/advanced/cifar10/cifar10-sim)
 on the same dataset, where same experiments
 were conducted and analyzed. You should expect the same
@@ -21,7 +20,7 @@ client-side training logics (details in file
 and the new
 [`FedJob`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/job_config/api.py)
 APIs were used to programmatically set up an
-`nvflare` job to be exported or ran by simulator (details in file
+NVFlare job to be exported or run by the simulator (details in file
 [`tf_fl_script_runner_cifar10.py`](tf_fl_script_runner_cifar10.py)),
 alleviating the need of writing job config files, simplifying
 development process.
@@ -65,12 +64,8 @@ script.
 > `export TF_FORCE_GPU_ALLOW_GROWTH=true && export
 > TF_GPU_ALLOCATOR=cuda_malloc_async`
 
-The set-up of all experiments in this example are kept the same as
-[the example using `Pytorch`
-backend](https://github.com/NVIDIA/NVFlare/tree/main/examples/advanced/cifar10/cifar10-sim). Refer
-to the `Pytorch` example for more details. Similar to the Pytorch
-example, we here also use Dirichelet sampling on CIFAR10 data labels
-to simulate data heterogeneity among data splits for different client
+We use Dirichlet sampling (implementation from FedMA (https://github.com/IBM/FedMA)) on
+CIFAR10 data labels to simulate data heterogeneity among data splits for different client
 sites, controlled by an alpha value, ranging from 0 (not including 0)
 to 1. A high alpha value indicates less data heterogeneity, i.e., an
 alpha value equal to 1.0 would result in homogeneous data distribution

diff --git a/examples/advanced/job_api/tf/run_jobs.sh b/examples/advanced/job_api/tf/run_jobs.sh
@@ -25,7 +25,7 @@ GPU_INDX=0
 WORKSPACE=/tmp
 
 # Run centralized training job
-python ./tf_fl_script_executor_cifar10.py \
+python ./tf_fl_script_runner_cifar10.py \
        --algo centralized \
        --n_clients 1 \
        --num_rounds 25 \
@@ -39,7 +39,7 @@ python ./tf_fl_script_executor_cifar10.py \
 # Run FedAvg with different alpha values
 for alpha in 1.0 0.5 0.3 0.1; do
 
-    python ./tf_fl_script_executor_cifar10.py \
+    python ./tf_fl_script_runner_cifar10.py \
        --algo fedavg \
        --n_clients 8 \
        --num_rounds 50 \
@@ -53,7 +53,7 @@ done
 
 
 # Run FedOpt job
-python ./tf_fl_script_executor_cifar10.py \
+python ./tf_fl_script_runner_cifar10.py \
        --algo fedopt \
        --n_clients 8 \
        --num_rounds 50 \
@@ -65,7 +65,7 @@ python ./tf_fl_script_executor_cifar10.py \
 
 
 # Run FedProx job.
-python ./tf_fl_script_executor_cifar10.py \
+python ./tf_fl_script_runner_cifar10.py \
        --algo fedprox \
        --n_clients 8 \
        --num_rounds 50 \
@@ -77,11 +77,11 @@ python ./tf_fl_script_executor_cifar10.py \
 
 
 # Run scaffold job
-python ./tf_fl_script_executor_cifar10.py \
+python ./tf_fl_script_runner_cifar10.py \
        --algo scaffold \
        --n_clients 8 \
        --num_rounds 50 \
        --batch_size 64 \
        --epochs 4 \
        --alpha 0.1 \
-       --gpu $GPU_INDX   
+       --gpu $GPU_INDX