
Commit

Merge branch 'main' into enhance_launcher_executor_and_subprocess_launcher
YuanTingHsieh authored Dec 4, 2024
2 parents a6756c5 + c6f2521 commit c9ea854
Showing 43 changed files with 512 additions and 294 deletions.
15 changes: 5 additions & 10 deletions examples/advanced/job_api/tf/README.md
@@ -7,9 +7,8 @@ All examples in this folder are based on using [TensorFlow](https://tensorflow.o

## Simulated Federated Learning with CIFAR10 Using Tensorflow

-This example shows `Tensorflow`-based classic Federated Learning
-algorithms, namely FedAvg and FedOpt on CIFAR10
-dataset. This example is analogous to [the example using `Pytorch`
+This example demonstrates TensorFlow-based federated learning algorithms on the CIFAR-10 dataset.
+This example is analogous to [the example using `Pytorch`
backend](https://github.com/NVIDIA/NVFlare/tree/main/examples/advanced/cifar10/cifar10-sim)
on the same dataset, where the same experiments
were conducted and analyzed. You should expect the same
@@ -21,7 +20,7 @@ client-side training logics (details in file
and the new
[`FedJob`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/job_config/api.py)
APIs were used to programmatically set up an
-`nvflare` job to be exported or ran by simulator (details in file
+NVFlare job to be exported or run by simulator (details in file
[`tf_fl_script_runner_cifar10.py`](tf_fl_script_runner_cifar10.py)),
alleviating the need to write job config files and simplifying the
development process.
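
To make the FedJob pattern concrete, here is a minimal sketch of the kind of
setup `tf_fl_script_runner_cifar10.py` performs. The controller choice, script
path, and counts are illustrative, and the imports assume a recent NVFlare
Job API:

```python
# Minimal sketch of programmatic job setup via FedJob (values are illustrative).
from nvflare.app_common.workflows.fedavg import FedAvg
from nvflare.job_config.api import FedJob
from nvflare.job_config.script_runner import ScriptRunner

job = FedJob(name="cifar10_tf_fedavg")
# Server side: a FedAvg controller coordinating 8 clients for 50 rounds.
job.to(FedAvg(num_clients=8, num_rounds=50), "server")
# Client side: every site runs the same training script on its own data split.
for i in range(8):
    runner = ScriptRunner(script="src/cifar10_tf_fl_alpha_split.py", script_args="--batch_size 128")
    job.to(runner, f"site-{i + 1}")

job.export_job("/tmp/nvflare/jobs")          # write out a job config folder, or ...
job.simulator_run("/tmp/nvflare/workspace")  # ... run it directly in the simulator
```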
@@ -65,12 +64,8 @@ script.
> `export TF_FORCE_GPU_ALLOW_GROWTH=true && export
> TF_GPU_ALLOCATOR=cuda_malloc_async`
-The set-up of all experiments in this example are kept the same as
-[the example using `Pytorch`
-backend](https://github.com/NVIDIA/NVFlare/tree/main/examples/advanced/cifar10/cifar10-sim). Refer
-to the `Pytorch` example for more details. Similar to the Pytorch
-example, we here also use Dirichelet sampling on CIFAR10 data labels
-to simulate data heterogeneity among data splits for different client
+We use Dirichlet sampling (implementation from FedMA (https://github.com/IBM/FedMA)) on
+CIFAR10 data labels to simulate data heterogeneity among data splits for different client
sites, controlled by an alpha value, ranging from 0 (not including 0)
to 1. A high alpha value indicates less data heterogeneity, i.e., an
alpha value equal to 1.0 would result in homogeneous data distribution
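
As a rough illustration of the sampling scheme (not the FedMA implementation
the example actually uses), a per-class Dirichlet split could look like this;
smaller `alpha` concentrates each class on fewer sites:

```python
# Illustrative Dirichlet label split; not the FedMA code referenced above.
import numpy as np

def dirichlet_split(labels: np.ndarray, n_sites: int, alpha: float, seed: int = 0):
    rng = np.random.default_rng(seed)
    site_indices = [[] for _ in range(n_sites)]
    for cls in np.unique(labels):
        cls_idx = rng.permutation(np.flatnonzero(labels == cls))
        # Fraction of this class given to each site; low alpha -> very uneven.
        props = rng.dirichlet(alpha * np.ones(n_sites))
        cuts = (np.cumsum(props)[:-1] * len(cls_idx)).astype(int)
        for site, chunk in enumerate(np.split(cls_idx, cuts)):
            site_indices[site].extend(chunk.tolist())
    return site_indices

# CIFAR10-like labels: 50k samples, 10 classes, split across 8 sites.
splits = dirichlet_split(np.random.default_rng(1).integers(0, 10, 50000), 8, alpha=0.1)
```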
12 changes: 6 additions & 6 deletions examples/advanced/job_api/tf/run_jobs.sh
@@ -25,7 +25,7 @@ GPU_INDX=0
WORKSPACE=/tmp

# Run centralized training job
-python ./tf_fl_script_executor_cifar10.py \
+python ./tf_fl_script_runner_cifar10.py \
--algo centralized \
--n_clients 1 \
--num_rounds 25 \
@@ -39,7 +39,7 @@ python ./tf_fl_script_executor_cifar10.py \
# Run FedAvg with different alpha values
for alpha in 1.0 0.5 0.3 0.1; do

-python ./tf_fl_script_executor_cifar10.py \
+python ./tf_fl_script_runner_cifar10.py \
--algo fedavg \
--n_clients 8 \
--num_rounds 50 \
@@ -53,7 +53,7 @@ done


# Run FedOpt job
-python ./tf_fl_script_executor_cifar10.py \
+python ./tf_fl_script_runner_cifar10.py \
--algo fedopt \
--n_clients 8 \
--num_rounds 50 \
@@ -65,7 +65,7 @@ python ./tf_fl_script_executor_cifar10.py \


# Run FedProx job.
-python ./tf_fl_script_executor_cifar10.py \
+python ./tf_fl_script_runner_cifar10.py \
--algo fedprox \
--n_clients 8 \
--num_rounds 50 \
@@ -77,11 +77,11 @@ python ./tf_fl_script_executor_cifar10.py \


# Run scaffold job
-python ./tf_fl_script_executor_cifar10.py \
+python ./tf_fl_script_runner_cifar10.py \
--algo scaffold \
--n_clients 8 \
--num_rounds 50 \
--batch_size 64 \
--epochs 4 \
--alpha 0.1 \
- --gpu $GPU_INDX
\ No newline at end of file
+ --gpu $GPU_INDX
2 changes: 2 additions & 0 deletions examples/advanced/xgboost/histogram-based/requirements.txt
@@ -3,5 +3,7 @@ pandas
scikit-learn
torch
tensorboard
+matplotlib
+shap
# require xgboost 2.2 version, for now need to install a nightly build
https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/federated-secure/xgboost-2.2.0.dev0%2B4601688195708f7c31fcceeb0e0ac735e7311e61-py3-none-manylinux_2_28_x86_64.whl
2 changes: 2 additions & 0 deletions examples/advanced/xgboost/tree-based/requirements.txt
@@ -3,5 +3,7 @@ pandas
scikit-learn
torch
tensorboard
+matplotlib
+shap
# require xgboost 2.2 version, for now need to install a nightly build
https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/federated-secure/xgboost-2.2.0.dev0%2B4601688195708f7c31fcceeb0e0ac735e7311e61-py3-none-manylinux_2_28_x86_64.whl
29 changes: 10 additions & 19 deletions examples/getting_started/tf/README.md
@@ -1,26 +1,21 @@
# Getting Started with NVFlare (TensorFlow)
[![TensorFlow Logo](https://upload.wikimedia.org/wikipedia/commons/a/ab/TensorFlow_logo.svg)](https://tensorflow.org/)

-We provide several examples to quickly get you started using NVFlare's Job API.
+We provide several examples to help you quickly get started with NVFlare.
All examples in this folder are based on using [TensorFlow](https://tensorflow.org/) as the model training framework.

## Simulated Federated Learning with CIFAR10 Using Tensorflow

-This example shows `Tensorflow`-based classic Federated Learning
-algorithms, namely FedAvg and FedOpt on CIFAR10
-dataset. This example is analogous to [the example using `Pytorch`
-backend](https://github.com/NVIDIA/NVFlare/tree/main/examples/advanced/cifar10/cifar10-sim)
-on the same dataset, where same experiments
-were conducted and analyzed. You should expect the same
-experimental results when comparing this example with the `Pytorch` one.
+This example demonstrates TensorFlow-based federated learning algorithms,
+FedAvg and FedOpt, on the CIFAR-10 dataset.

In this example, the latest Client APIs were used to implement
client-side training logics (details in file
[`cifar10_tf_fl_alpha_split.py`](src/cifar10_tf_fl_alpha_split.py)),
and the new
[`FedJob`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/job_config/api.py)
APIs were used to programmatically set up an
-`nvflare` job to be exported or ran by simulator (details in file
+NVFlare job to be exported or run by simulator (details in file
[`tf_fl_script_runner_cifar10.py`](tf_fl_script_runner_cifar10.py)),
alleviating the need to write job config files and simplifying the
development process.
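
For orientation, here is a schematic sketch of the Client API loop such a
training script runs. `model`, `x_train`, `y_train`, and the two weight
conversion helpers are placeholders for the example's real TensorFlow code:

```python
# Schematic Client API training loop; TF objects and helpers are placeholders.
import nvflare.client as flare
from nvflare.app_common.abstract.fl_model import FLModel

flare.init()
while flare.is_running():
    input_model = flare.receive()             # FLModel holding the global weights
    apply_weights(model, input_model.params)  # placeholder: copy params into the keras model
    model.fit(x_train, y_train, epochs=1)     # local training
    flare.send(FLModel(params=extract_weights(model)))  # placeholder: weights back to a dict
```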
@@ -64,12 +59,8 @@ script.
> `export TF_FORCE_GPU_ALLOW_GROWTH=true && export
> TF_GPU_ALLOCATOR=cuda_malloc_async`
-The set-up of all experiments in this example are kept the same as
-[the example using `Pytorch`
-backend](https://github.com/NVIDIA/NVFlare/tree/main/examples/advanced/cifar10/cifar10-sim). Refer
-to the `Pytorch` example for more details. Similar to the Pytorch
-example, we here also use Dirichelet sampling on CIFAR10 data labels
-to simulate data heterogeneity among data splits for different client
+We use Dirichlet sampling (implementation from FedMA (https://github.com/IBM/FedMA)) on
+CIFAR10 data labels to simulate data heterogeneity among data splits for different client
sites, controlled by an alpha value, ranging from 0 (not including 0)
to 1. A high alpha value indicates less data heterogeneity, i.e., an
alpha value equal to 1.0 would result in homogeneous data distribution
@@ -111,11 +102,11 @@ for alpha in 1.0 0.5 0.3 0.1; do
done
```

-## 2. Results
+## 3. Results

Now let's compare experimental results.

-### 2.1 Centralized training vs. FedAvg for homogeneous split
+### 3.1 Centralized training vs. FedAvg for homogeneous split
Let's first compare FedAvg with homogeneous data split
(i.e. `alpha=1.0`) and centralized training. As can be seen from the
figure and table below, FedAvg can achieve similar performance to
@@ -129,7 +120,7 @@ no difference in data distributions among different clients.

![Central vs. FedAvg](./figs/fedavg-vs-centralized.png)

-### 2.2 Impact of client data heterogeneity
+### 3.2 Impact of client data heterogeneity

Here we compare the impact of data heterogeneity by varying the
`alpha` value, where lower values cause higher heterogeneity. As can
@@ -145,7 +136,7 @@ as data heterogeneity becomes higher.

![Impact of client data
heterogeneity](./figs/fedavg-diff-alphas.png)

> [!NOTE]
> More examples can be found at https://nvidia.github.io/NVFlare.
35 changes: 0 additions & 35 deletions examples/getting_started/tf/run_jobs.sh
@@ -50,38 +50,3 @@ for alpha in 1.0 0.5 0.3 0.1; do
--workspace $WORKSPACE

done


-# Run FedOpt job
-python ./tf_fl_script_runner_cifar10.py \
- --algo fedopt \
- --n_clients 8 \
- --num_rounds 50 \
- --batch_size 64 \
- --epochs 4 \
- --alpha 0.1 \
- --gpu $GPU_INDX \
- --workspace $WORKSPACE

-# Run FedProx job.
-python ./tf_fl_script_runner_cifar10.py \
- --algo fedprox \
- --n_clients 8 \
- --num_rounds 50 \
- --batch_size 64 \
- --epochs 4 \
- --fedprox_mu 1e-5 \
- --alpha 0.1 \
- --gpu $GPU_INDX

-# Run scaffold job
-python ./tf_fl_script_runner_cifar10.py \
- --algo scaffold \
- --n_clients 8 \
- --num_rounds 50 \
- --batch_size 64 \
- --epochs 4 \
- --alpha 0.1 \
- --gpu $GPU_INDX
4 changes: 2 additions & 2 deletions examples/hello-world/hello-tf/README.md
@@ -4,7 +4,7 @@ Example of using [NVIDIA FLARE](https://nvflare.readthedocs.io/en/main/index.htm
using federated averaging ([FedAvg](https://arxiv.org/abs/1602.05629))
and [TensorFlow](https://tensorflow.org/) as the deep learning training framework.

-> **_NOTE:_** This example uses the [MNIST](http://yann.lecun.com/exdb/mnist/) handwritten digits dataset and will load its data within the trainer code.
+> **_NOTE:_** This example uses the [MNIST](https://www.tensorflow.org/datasets/catalog/mnist) handwritten digits dataset and will load its data within the trainer code.
See the [Hello TensorFlow](https://nvflare.readthedocs.io/en/main/examples/hello_tf_job_api.html#hello-tf-job-api) example documentation page for details on this
example.
@@ -48,7 +48,7 @@ In scenarios where multiple clients are involved, you have to prevent TensorFlow
by setting the following flags.

```bash
-TF_FORCE_GPU_ALLOW_GROWTH=true TF_GPU_ALLOCATOR=cuda_malloc_async
+TF_FORCE_GPU_ALLOW_GROWTH=true TF_GPU_ALLOCATOR=cuda_malloc_async python3 fedavg_script_runner_tf.py
```

If you possess more GPUs than clients, a good strategy is to run one client on each GPU.
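
If you drive the example through the Job API instead, the simulator can take an
explicit GPU assignment; a sketch, assuming `simulator_run` accepts a `gpu`
string as in recent NVFlare releases:

```python
# Sketch: map simulated clients onto GPUs (assumes the `gpu` parameter exists).
job.simulator_run("/tmp/nvflare/hello-tf", gpu="0,1")  # site-1 on GPU 0, site-2 on GPU 1
```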
2 changes: 1 addition & 1 deletion nvflare/apis/event_type.py
@@ -89,4 +89,4 @@ class EventType(object):

    AUTHORIZE_COMMAND_CHECK = "_authorize_command_check"
    BEFORE_BUILD_COMPONENT = "_before_build_component"
-    GET_JOB_LAUNCHER = "_get_job_launcher"
+    BEFORE_JOB_LAUNCH = "_before_job_launch"
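
The renamed event fires just before a job process is launched, so site code can
react to it. A sketch of a hypothetical listener (`JobLaunchAuditor` is not part
of NVFlare):

```python
# Hypothetical component reacting to the new event; not part of NVFlare itself.
from nvflare.apis.event_type import EventType
from nvflare.apis.fl_component import FLComponent
from nvflare.apis.fl_constant import FLContextKey
from nvflare.apis.fl_context import FLContext

class JobLaunchAuditor(FLComponent):
    def handle_event(self, event_type: str, fl_ctx: FLContext):
        if event_type == EventType.BEFORE_JOB_LAUNCH:
            job_meta = fl_ctx.get_prop(FLContextKey.JOB_META, default={})
            self.log_info(fl_ctx, f"about to launch job {job_meta.get('job_id')}")
```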
51 changes: 3 additions & 48 deletions nvflare/app_common/job_launcher/client_process_launcher.py
@@ -11,56 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import os
-import sys

-from nvflare.apis.fl_constant import FLContextKey, JobConstants
-from nvflare.apis.workspace import Workspace
from nvflare.app_common.job_launcher.process_launcher import ProcessJobLauncher
-from nvflare.private.fed.utils.fed_utils import add_custom_dir_to_path
+from nvflare.utils.job_launcher_utils import generate_client_command


class ClientProcessJobLauncher(ProcessJobLauncher):
-    def get_command(self, launch_data, fl_ctx) -> (str, dict):
-        new_env = os.environ.copy()
-        workspace_obj: Workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT)
-        args = fl_ctx.get_prop(FLContextKey.ARGS)
-        client = fl_ctx.get_prop(FLContextKey.SITE_OBJ)
-        job_id = launch_data.get(JobConstants.JOB_ID)
-        server_config = fl_ctx.get_prop(FLContextKey.SERVER_CONFIG)
-        if not server_config:
-            raise RuntimeError(f"missing {FLContextKey.SERVER_CONFIG} in FL context")
-        service = server_config[0].get("service", {})
-        if not isinstance(service, dict):
-            raise RuntimeError(f"expect server config data to be dict but got {type(service)}")
-
-        app_custom_folder = workspace_obj.get_app_custom_dir(job_id)
-        if app_custom_folder != "":
-            add_custom_dir_to_path(app_custom_folder, new_env)
-
-        command_options = ""
-        for t in args.set:
-            command_options += " " + t
-        command = (
-            f"{sys.executable} -m nvflare.private.fed.app.client.worker_process -m "
-            + args.workspace
-            + " -w "
-            + (workspace_obj.get_startup_kit_dir())
-            + " -t "
-            + client.token
-            + " -d "
-            + client.ssid
-            + " -n "
-            + job_id
-            + " -c "
-            + client.client_name
-            + " -p "
-            + str(client.cell.get_internal_listener_url())
-            + " -g "
-            + service.get("target")
-            + " -scheme "
-            + service.get("scheme", "grpc")
-            + " -s fed_client.json "
-            " --set" + command_options + " print_conf=True"
-        )
-        return command, new_env
+    def get_command(self, job_meta, fl_ctx) -> str:
+        return generate_client_command(job_meta, fl_ctx)
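
The deleted body above shows exactly what the client command contains;
presumably `generate_client_command` assembles roughly the same string. A
condensed, hypothetical reconstruction, not the actual
`nvflare.utils.job_launcher_utils` code:

```python
# Hypothetical condensed reconstruction of the helper; the real implementation
# also validates the server config and derives the -p/-g/-scheme options.
import sys

from nvflare.apis.fl_constant import FLContextKey, JobConstants

def generate_client_command_sketch(job_meta, fl_ctx) -> str:
    workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT)
    args = fl_ctx.get_prop(FLContextKey.ARGS)
    client = fl_ctx.get_prop(FLContextKey.SITE_OBJ)
    job_id = job_meta.get(JobConstants.JOB_ID)
    return (
        f"{sys.executable} -m nvflare.private.fed.app.client.worker_process"
        f" -m {args.workspace} -w {workspace.get_startup_kit_dir()}"
        f" -t {client.token} -d {client.ssid} -n {job_id} -c {client.client_name}"
        " -s fed_client.json --set print_conf=True"
    )
```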
22 changes: 15 additions & 7 deletions nvflare/app_common/job_launcher/process_launcher.py
@@ -18,10 +18,11 @@
from abc import abstractmethod

from nvflare.apis.event_type import EventType
-from nvflare.apis.fl_constant import FLContextKey
+from nvflare.apis.fl_constant import FLContextKey, JobConstants
from nvflare.apis.fl_context import FLContext
from nvflare.apis.job_launcher_spec import JobHandleSpec, JobLauncherSpec, JobReturnCode, add_launcher
-from nvflare.private.fed.utils.fed_utils import extract_job_image
+from nvflare.apis.workspace import Workspace
+from nvflare.utils.job_launcher_utils import add_custom_dir_to_path, extract_job_image

JOB_RETURN_CODE_MAPPING = {0: JobReturnCode.SUCCESS, 1: JobReturnCode.EXECUTION_ERROR, 9: JobReturnCode.ABORTED}

@@ -62,7 +63,14 @@ def __init__(self):

    def launch_job(self, job_meta: dict, fl_ctx: FLContext) -> JobHandleSpec:

-        command, new_env = self.get_command(job_meta, fl_ctx)
+        new_env = os.environ.copy()
+        workspace_obj: Workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT)
+        job_id = job_meta.get(JobConstants.JOB_ID)
+        app_custom_folder = workspace_obj.get_app_custom_dir(job_id)
+        if app_custom_folder != "":
+            add_custom_dir_to_path(app_custom_folder, new_env)
+
+        command = self.get_command(job_meta, fl_ctx)
        # use os.setsid to create new process group ID
        process = subprocess.Popen(shlex.split(command, True), preexec_fn=os.setsid, env=new_env)

@@ -71,22 +79,22 @@ def launch_job(self, job_meta: dict, fl_ctx: FLContext) -> JobHandleSpec:
        return ProcessHandle(process)

    def handle_event(self, event_type: str, fl_ctx: FLContext):
-        if event_type == EventType.GET_JOB_LAUNCHER:
+        if event_type == EventType.BEFORE_JOB_LAUNCH:
            job_meta = fl_ctx.get_prop(FLContextKey.JOB_META)
            job_image = extract_job_image(job_meta, fl_ctx.get_identity_name())
            if not job_image:
                add_launcher(self, fl_ctx)

    @abstractmethod
-    def get_command(self, launch_data, fl_ctx) -> (str, dict):
+    def get_command(self, job_meta, fl_ctx) -> str:
        """To generate the command to launch the job in a sub-process.

        Args:
            fl_ctx: FLContext
-            launch_data: job launcher data
+            job_meta: job meta data

        Returns:
-            launch command, environment dict
+            launch command
        """
        pass
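
Under the narrowed contract, a subclass now returns only the command string,
since `launch_job` prepares the environment itself. A sketch under that
assumption (`my_pkg.worker` is made up):

```python
# Sketch of a custom launcher against the new get_command contract.
import sys

from nvflare.app_common.job_launcher.process_launcher import ProcessJobLauncher

class MyJobLauncher(ProcessJobLauncher):
    def get_command(self, job_meta, fl_ctx) -> str:
        # Only the command is returned; env setup (PYTHONPATH etc.) is done by
        # the base class before the sub-process is spawned.
        return f"{sys.executable} -m my_pkg.worker --job {job_meta.get('job_id')}"
```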
