
Commit

Merge branch 'main' into enhance_launcher_executor_and_subprocess_launcher
YuanTingHsieh authored Dec 4, 2024
2 parents a6756c5 + c6f2521 commit c9ea854
Showing 43 changed files with 512 additions and 294 deletions.
15 changes: 5 additions & 10 deletions examples/advanced/job_api/tf/README.md
@@ -7,9 +7,8 @@ All examples in this folder are based on using [TensorFlow](https://tensorflow.o

## Simulated Federated Learning with CIFAR10 Using Tensorflow

-This example shows `Tensorflow`-based classic Federated Learning
-algorithms, namely FedAvg and FedOpt on CIFAR10
-dataset. This example is analogous to [the example using `Pytorch`
+This example demonstrates TensorFlow-based federated learning algorithms on the CIFAR-10 dataset.
+This example is analogous to [the example using `Pytorch`
backend](https://github.com/NVIDIA/NVFlare/tree/main/examples/advanced/cifar10/cifar10-sim)
on the same dataset, where the same experiments
were conducted and analyzed. You should expect the same
@@ -21,7 +20,7 @@ client-side training logics (details in file
and the new
[`FedJob`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/job_config/api.py)
APIs were used to programmatically set up an
-`nvflare` job to be exported or ran by simulator (details in file
+NVFlare job to be exported or run by simulator (details in file
[`tf_fl_script_runner_cifar10.py`](tf_fl_script_runner_cifar10.py)),
alleviating the need to write job config files and simplifying the
development process.
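
To make the FedJob pattern concrete, here is a minimal sketch of the kind of
setup `tf_fl_script_runner_cifar10.py` performs. The controller choice, script
path, and counts are illustrative, and the imports assume a recent NVFlare
Job API:

```python
# Minimal sketch of programmatic job setup via FedJob (values are illustrative).
from nvflare.app_common.workflows.fedavg import FedAvg
from nvflare.job_config.api import FedJob
from nvflare.job_config.script_runner import ScriptRunner

job = FedJob(name="cifar10_tf_fedavg")
# Server side: a FedAvg controller coordinating 8 clients for 50 rounds.
job.to(FedAvg(num_clients=8, num_rounds=50), "server")
# Client side: every site runs the same training script on its own data split.
for i in range(8):
    runner = ScriptRunner(script="src/cifar10_tf_fl_alpha_split.py", script_args="--batch_size 128")
    job.to(runner, f"site-{i + 1}")

job.export_job("/tmp/nvflare/jobs")          # write out a job config folder, or ...
job.simulator_run("/tmp/nvflare/workspace")  # ... run it directly in the simulator
```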
@@ -65,12 +64,8 @@ script.
> `export TF_FORCE_GPU_ALLOW_GROWTH=true && export
> TF_GPU_ALLOCATOR=cuda_malloc_async`
-The set-up of all experiments in this example are kept the same as
-[the example using `Pytorch`
-backend](https://github.com/NVIDIA/NVFlare/tree/main/examples/advanced/cifar10/cifar10-sim). Refer
-to the `Pytorch` example for more details. Similar to the Pytorch
-example, we here also use Dirichelet sampling on CIFAR10 data labels
-to simulate data heterogeneity among data splits for different client
+We use Dirichlet sampling (implementation from FedMA (https://github.com/IBM/FedMA)) on
+CIFAR10 data labels to simulate data heterogeneity among data splits for different client
sites, controlled by an alpha value, ranging from 0 (not including 0)
to 1. A high alpha value indicates less data heterogeneity, i.e., an
alpha value equal to 1.0 would result in homogeneous data distribution
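
As a rough illustration of the sampling scheme (not the FedMA implementation
the example actually uses), a per-class Dirichlet split could look like this;
smaller `alpha` concentrates each class on fewer sites:

```python
# Illustrative Dirichlet label split; not the FedMA code referenced above.
import numpy as np

def dirichlet_split(labels: np.ndarray, n_sites: int, alpha: float, seed: int = 0):
    rng = np.random.default_rng(seed)
    site_indices = [[] for _ in range(n_sites)]
    for cls in np.unique(labels):
        cls_idx = rng.permutation(np.flatnonzero(labels == cls))
        # Fraction of this class given to each site; low alpha -> very uneven.
        props = rng.dirichlet(alpha * np.ones(n_sites))
        cuts = (np.cumsum(props)[:-1] * len(cls_idx)).astype(int)
        for site, chunk in enumerate(np.split(cls_idx, cuts)):
            site_indices[site].extend(chunk.tolist())
    return site_indices

# CIFAR10-like labels: 50k samples, 10 classes, split across 8 sites.
splits = dirichlet_split(np.random.default_rng(1).integers(0, 10, 50000), 8, alpha=0.1)
```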
12 changes: 6 additions & 6 deletions examples/advanced/job_api/tf/run_jobs.sh
@@ -25,7 +25,7 @@ GPU_INDX=0
WORKSPACE=/tmp

# Run centralized training job
-python ./tf_fl_script_executor_cifar10.py \
+python ./tf_fl_script_runner_cifar10.py \
--algo centralized \
--n_clients 1 \
--num_rounds 25 \
@@ -39,7 +39,7 @@ python ./tf_fl_script_executor_cifar10.py \
# Run FedAvg with different alpha values
for alpha in 1.0 0.5 0.3 0.1; do

-python ./tf_fl_script_executor_cifar10.py \
+python ./tf_fl_script_runner_cifar10.py \
--algo fedavg \
--n_clients 8 \
--num_rounds 50 \
@@ -53,7 +53,7 @@ done


# Run FedOpt job
-python ./tf_fl_script_executor_cifar10.py \
+python ./tf_fl_script_runner_cifar10.py \
--algo fedopt \
--n_clients 8 \
--num_rounds 50 \
@@ -65,7 +65,7 @@ python ./tf_fl_script_executor_cifar10.py \


# Run FedProx job.
-python ./tf_fl_script_executor_cifar10.py \
+python ./tf_fl_script_runner_cifar10.py \
--algo fedprox \
--n_clients 8 \
--num_rounds 50 \
@@ -77,11 +77,11 @@ python ./tf_fl_script_executor_cifar10.py \


# Run scaffold job
-python ./tf_fl_script_executor_cifar10.py \
+python ./tf_fl_script_runner_cifar10.py \
--algo scaffold \
--n_clients 8 \
--num_rounds 50 \
--batch_size 64 \
--epochs 4 \
--alpha 0.1 \
- --gpu $GPU_INDX
\ No newline at end of file
+ --gpu $GPU_INDX
2 changes: 2 additions & 0 deletions examples/advanced/xgboost/histogram-based/requirements.txt
@@ -3,5 +3,7 @@ pandas
scikit-learn
torch
tensorboard
+matplotlib
+shap
# require xgboost 2.2 version, for now need to install a nightly build
https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/federated-secure/xgboost-2.2.0.dev0%2B4601688195708f7c31fcceeb0e0ac735e7311e61-py3-none-manylinux_2_28_x86_64.whl
2 changes: 2 additions & 0 deletions examples/advanced/xgboost/tree-based/requirements.txt
@@ -3,5 +3,7 @@ pandas
scikit-learn
torch
tensorboard
+matplotlib
+shap
# require xgboost 2.2 version, for now need to install a nightly build
https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/federated-secure/xgboost-2.2.0.dev0%2B4601688195708f7c31fcceeb0e0ac735e7311e61-py3-none-manylinux_2_28_x86_64.whl
29 changes: 10 additions & 19 deletions examples/getting_started/tf/README.md
@@ -1,26 +1,21 @@
# Getting Started with NVFlare (TensorFlow)
[![TensorFlow Logo](https://upload.wikimedia.org/wikipedia/commons/a/ab/TensorFlow_logo.svg)](https://tensorflow.org/)

-We provide several examples to quickly get you started using NVFlare's Job API.
+We provide several examples to help you quickly get started with NVFlare.
All examples in this folder are based on using [TensorFlow](https://tensorflow.org/) as the model training framework.

## Simulated Federated Learning with CIFAR10 Using Tensorflow

-This example shows `Tensorflow`-based classic Federated Learning
-algorithms, namely FedAvg and FedOpt on CIFAR10
-dataset. This example is analogous to [the example using `Pytorch`
-backend](https://github.com/NVIDIA/NVFlare/tree/main/examples/advanced/cifar10/cifar10-sim)
-on the same dataset, where same experiments
-were conducted and analyzed. You should expect the same
-experimental results when comparing this example with the `Pytorch` one.
+This example demonstrates TensorFlow-based federated learning algorithms,
+FedAvg and FedOpt, on the CIFAR-10 dataset.

In this example, the latest Client APIs were used to implement
client-side training logics (details in file
[`cifar10_tf_fl_alpha_split.py`](src/cifar10_tf_fl_alpha_split.py)),
and the new
[`FedJob`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/job_config/api.py)
APIs were used to programmatically set up an
-`nvflare` job to be exported or ran by simulator (details in file
+NVFlare job to be exported or run by simulator (details in file
[`tf_fl_script_runner_cifar10.py`](tf_fl_script_runner_cifar10.py)),
alleviating the need to write job config files and simplifying the
development process.
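
For orientation, here is a schematic sketch of the Client API loop such a
training script runs. `model`, `x_train`, `y_train`, and the two weight
conversion helpers are placeholders for the example's real TensorFlow code:

```python
# Schematic Client API training loop; TF objects and helpers are placeholders.
import nvflare.client as flare
from nvflare.app_common.abstract.fl_model import FLModel

flare.init()
while flare.is_running():
    input_model = flare.receive()             # FLModel holding the global weights
    apply_weights(model, input_model.params)  # placeholder: copy params into the keras model
    model.fit(x_train, y_train, epochs=1)     # local training
    flare.send(FLModel(params=extract_weights(model)))  # placeholder: weights back to a dict
```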
@@ -64,12 +59,8 @@ script.
> `export TF_FORCE_GPU_ALLOW_GROWTH=true && export
> TF_GPU_ALLOCATOR=cuda_malloc_async`
-The set-up of all experiments in this example are kept the same as
-[the example using `Pytorch`
-backend](https://github.com/NVIDIA/NVFlare/tree/main/examples/advanced/cifar10/cifar10-sim). Refer
-to the `Pytorch` example for more details. Similar to the Pytorch
-example, we here also use Dirichelet sampling on CIFAR10 data labels
-to simulate data heterogeneity among data splits for different client
+We use Dirichlet sampling (implementation from FedMA (https://github.com/IBM/FedMA)) on
+CIFAR10 data labels to simulate data heterogeneity among data splits for different client
sites, controlled by an alpha value, ranging from 0 (not including 0)
to 1. A high alpha value indicates less data heterogeneity, i.e., an
alpha value equal to 1.0 would result in homogeneous data distribution
@@ -111,11 +102,11 @@ for alpha in 1.0 0.5 0.3 0.1; do
done
```

-## 2. Results
+## 3. Results

Now let's compare experimental results.

-### 2.1 Centralized training vs. FedAvg for homogeneous split
+### 3.1 Centralized training vs. FedAvg for homogeneous split
Let's first compare FedAvg with homogeneous data split
(i.e. `alpha=1.0`) and centralized training. As can be seen from the
figure and table below, FedAvg can achieve similar performance to
@@ -129,7 +120,7 @@ no difference in data distributions among different clients.

![Central vs. FedAvg](./figs/fedavg-vs-centralized.png)

-### 2.2 Impact of client data heterogeneity
+### 3.2 Impact of client data heterogeneity

Here we compare the impact of data heterogeneity by varying the
`alpha` value, where lower values cause higher heterogeneity. As can
@@ -145,7 +136,7 @@ as data heterogeneity becomes higher.

![Impact of client data
heterogeneity](./figs/fedavg-diff-alphas.png)

> [!NOTE]
> More examples can be found at https://nvidia.github.io/NVFlare.
35 changes: 0 additions & 35 deletions examples/getting_started/tf/run_jobs.sh
@@ -50,38 +50,3 @@ for alpha in 1.0 0.5 0.3 0.1; do
--workspace $WORKSPACE

done


-# Run FedOpt job
-python ./tf_fl_script_runner_cifar10.py \
- --algo fedopt \
- --n_clients 8 \
- --num_rounds 50 \
- --batch_size 64 \
- --epochs 4 \
- --alpha 0.1 \
- --gpu $GPU_INDX \
- --workspace $WORKSPACE

-# Run FedProx job.
-python ./tf_fl_script_runner_cifar10.py \
- --algo fedprox \
- --n_clients 8 \
- --num_rounds 50 \
- --batch_size 64 \
- --epochs 4 \
- --fedprox_mu 1e-5 \
- --alpha 0.1 \
- --gpu $GPU_INDX

-# Run scaffold job
-python ./tf_fl_script_runner_cifar10.py \
- --algo scaffold \
- --n_clients 8 \
- --num_rounds 50 \
- --batch_size 64 \
- --epochs 4 \
- --alpha 0.1 \
- --gpu $GPU_INDX
4 changes: 2 additions & 2 deletions examples/hello-world/hello-tf/README.md
@@ -4,7 +4,7 @@ Example of using [NVIDIA FLARE](https://nvflare.readthedocs.io/en/main/index.htm
using federated averaging ([FedAvg](https://arxiv.org/abs/1602.05629))
and [TensorFlow](https://tensorflow.org/) as the deep learning training framework.

-> **_NOTE:_** This example uses the [MNIST](http://yann.lecun.com/exdb/mnist/) handwritten digits dataset and will load its data within the trainer code.
+> **_NOTE:_** This example uses the [MNIST](https://www.tensorflow.org/datasets/catalog/mnist) handwritten digits dataset and will load its data within the trainer code.
See the [Hello TensorFlow](https://nvflare.readthedocs.io/en/main/examples/hello_tf_job_api.html#hello-tf-job-api) example documentation page for details on this
example.
@@ -48,7 +48,7 @@ In scenarios where multiple clients are involved, you have to prevent TensorFlow
by setting the following flags.

```bash
-TF_FORCE_GPU_ALLOW_GROWTH=true TF_GPU_ALLOCATOR=cuda_malloc_async
+TF_FORCE_GPU_ALLOW_GROWTH=true TF_GPU_ALLOCATOR=cuda_malloc_async python3 fedavg_script_runner_tf.py
```

If you possess more GPUs than clients, a good strategy is to run one client on each GPU.
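
If you drive the example through the Job API instead, the simulator can take an
explicit GPU assignment; a sketch, assuming `simulator_run` accepts a `gpu`
string as in recent NVFlare releases:

```python
# Sketch: map simulated clients onto GPUs (assumes the `gpu` parameter exists).
job.simulator_run("/tmp/nvflare/hello-tf", gpu="0,1")  # site-1 on GPU 0, site-2 on GPU 1
```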
2 changes: 1 addition & 1 deletion nvflare/apis/event_type.py
@@ -89,4 +89,4 @@ class EventType(object):

    AUTHORIZE_COMMAND_CHECK = "_authorize_command_check"
    BEFORE_BUILD_COMPONENT = "_before_build_component"
-    GET_JOB_LAUNCHER = "_get_job_launcher"
+    BEFORE_JOB_LAUNCH = "_before_job_launch"
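
The renamed event fires just before a job process is launched, so site code can
react to it. A sketch of a hypothetical listener (`JobLaunchAuditor` is not part
of NVFlare):

```python
# Hypothetical component reacting to the new event; not part of NVFlare itself.
from nvflare.apis.event_type import EventType
from nvflare.apis.fl_component import FLComponent
from nvflare.apis.fl_constant import FLContextKey
from nvflare.apis.fl_context import FLContext

class JobLaunchAuditor(FLComponent):
    def handle_event(self, event_type: str, fl_ctx: FLContext):
        if event_type == EventType.BEFORE_JOB_LAUNCH:
            job_meta = fl_ctx.get_prop(FLContextKey.JOB_META, default={})
            self.log_info(fl_ctx, f"about to launch job {job_meta.get('job_id')}")
```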
51 changes: 3 additions & 48 deletions nvflare/app_common/job_launcher/client_process_launcher.py
@@ -11,56 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import os
-import sys

-from nvflare.apis.fl_constant import FLContextKey, JobConstants
-from nvflare.apis.workspace import Workspace
from nvflare.app_common.job_launcher.process_launcher import ProcessJobLauncher
-from nvflare.private.fed.utils.fed_utils import add_custom_dir_to_path
+from nvflare.utils.job_launcher_utils import generate_client_command


class ClientProcessJobLauncher(ProcessJobLauncher):
-    def get_command(self, launch_data, fl_ctx) -> (str, dict):
-        new_env = os.environ.copy()
-        workspace_obj: Workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT)
-        args = fl_ctx.get_prop(FLContextKey.ARGS)
-        client = fl_ctx.get_prop(FLContextKey.SITE_OBJ)
-        job_id = launch_data.get(JobConstants.JOB_ID)
-        server_config = fl_ctx.get_prop(FLContextKey.SERVER_CONFIG)
-        if not server_config:
-            raise RuntimeError(f"missing {FLContextKey.SERVER_CONFIG} in FL context")
-        service = server_config[0].get("service", {})
-        if not isinstance(service, dict):
-            raise RuntimeError(f"expect server config data to be dict but got {type(service)}")
-
-        app_custom_folder = workspace_obj.get_app_custom_dir(job_id)
-        if app_custom_folder != "":
-            add_custom_dir_to_path(app_custom_folder, new_env)
-
-        command_options = ""
-        for t in args.set:
-            command_options += " " + t
-        command = (
-            f"{sys.executable} -m nvflare.private.fed.app.client.worker_process -m "
-            + args.workspace
-            + " -w "
-            + (workspace_obj.get_startup_kit_dir())
-            + " -t "
-            + client.token
-            + " -d "
-            + client.ssid
-            + " -n "
-            + job_id
-            + " -c "
-            + client.client_name
-            + " -p "
-            + str(client.cell.get_internal_listener_url())
-            + " -g "
-            + service.get("target")
-            + " -scheme "
-            + service.get("scheme", "grpc")
-            + " -s fed_client.json "
-            " --set" + command_options + " print_conf=True"
-        )
-        return command, new_env
+    def get_command(self, job_meta, fl_ctx) -> str:
+        return generate_client_command(job_meta, fl_ctx)
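
The deleted body above shows exactly what the client command contains;
presumably `generate_client_command` assembles roughly the same string. A
condensed, hypothetical reconstruction, not the actual
`nvflare.utils.job_launcher_utils` code:

```python
# Hypothetical condensed reconstruction of the helper; the real implementation
# also validates the server config and derives the -p/-g/-scheme options.
import sys

from nvflare.apis.fl_constant import FLContextKey, JobConstants

def generate_client_command_sketch(job_meta, fl_ctx) -> str:
    workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT)
    args = fl_ctx.get_prop(FLContextKey.ARGS)
    client = fl_ctx.get_prop(FLContextKey.SITE_OBJ)
    job_id = job_meta.get(JobConstants.JOB_ID)
    return (
        f"{sys.executable} -m nvflare.private.fed.app.client.worker_process"
        f" -m {args.workspace} -w {workspace.get_startup_kit_dir()}"
        f" -t {client.token} -d {client.ssid} -n {job_id} -c {client.client_name}"
        " -s fed_client.json --set print_conf=True"
    )
```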
22 changes: 15 additions & 7 deletions nvflare/app_common/job_launcher/process_launcher.py
@@ -18,10 +18,11 @@
from abc import abstractmethod

from nvflare.apis.event_type import EventType
-from nvflare.apis.fl_constant import FLContextKey
+from nvflare.apis.fl_constant import FLContextKey, JobConstants
from nvflare.apis.fl_context import FLContext
from nvflare.apis.job_launcher_spec import JobHandleSpec, JobLauncherSpec, JobReturnCode, add_launcher
-from nvflare.private.fed.utils.fed_utils import extract_job_image
+from nvflare.apis.workspace import Workspace
+from nvflare.utils.job_launcher_utils import add_custom_dir_to_path, extract_job_image

JOB_RETURN_CODE_MAPPING = {0: JobReturnCode.SUCCESS, 1: JobReturnCode.EXECUTION_ERROR, 9: JobReturnCode.ABORTED}

@@ -62,7 +63,14 @@ def __init__(self):

    def launch_job(self, job_meta: dict, fl_ctx: FLContext) -> JobHandleSpec:

-        command, new_env = self.get_command(job_meta, fl_ctx)
+        new_env = os.environ.copy()
+        workspace_obj: Workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT)
+        job_id = job_meta.get(JobConstants.JOB_ID)
+        app_custom_folder = workspace_obj.get_app_custom_dir(job_id)
+        if app_custom_folder != "":
+            add_custom_dir_to_path(app_custom_folder, new_env)
+
+        command = self.get_command(job_meta, fl_ctx)
        # use os.setsid to create new process group ID
        process = subprocess.Popen(shlex.split(command, True), preexec_fn=os.setsid, env=new_env)

@@ -71,22 +79,22 @@ def launch_job(self, job_meta: dict, fl_ctx: FLContext) -> JobHandleSpec:
        return ProcessHandle(process)

    def handle_event(self, event_type: str, fl_ctx: FLContext):
-        if event_type == EventType.GET_JOB_LAUNCHER:
+        if event_type == EventType.BEFORE_JOB_LAUNCH:
            job_meta = fl_ctx.get_prop(FLContextKey.JOB_META)
            job_image = extract_job_image(job_meta, fl_ctx.get_identity_name())
            if not job_image:
                add_launcher(self, fl_ctx)

    @abstractmethod
-    def get_command(self, launch_data, fl_ctx) -> (str, dict):
+    def get_command(self, job_meta, fl_ctx) -> str:
        """To generate the command to launch the job in a sub-process.

        Args:
            fl_ctx: FLContext
-            launch_data: job launcher data
+            job_meta: job meta data

        Returns:
-            launch command, environment dict
+            launch command
        """
        pass
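
Under the narrowed contract, a subclass now returns only the command string,
since `launch_job` prepares the environment itself. A sketch under that
assumption (`my_pkg.worker` is made up):

```python
# Sketch of a custom launcher against the new get_command contract.
import sys

from nvflare.app_common.job_launcher.process_launcher import ProcessJobLauncher

class MyJobLauncher(ProcessJobLauncher):
    def get_command(self, job_meta, fl_ctx) -> str:
        # Only the command is returned; env setup (PYTHONPATH etc.) is done by
        # the base class before the sub-process is spawned.
        return f"{sys.executable} -m my_pkg.worker --job {job_meta.get('job_id')}"
```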
