From e62c7d0ad81e66cece48216abf5497925e71fe0c Mon Sep 17 00:00:00 2001
From: YuanTingHsieh
Date: Mon, 29 Jan 2024 20:14:59 -0800
Subject: [PATCH] Address final VDR feedbacks

---
 docs/programming_guide.rst                    |   1 +
 docs/programming_guide/execution_api_type.rst |  14 +-
 .../3rd_party_integration.rst                 | 268 +++++++-----------
 .../execution_api_type/client_api.rst         |   2 +-
 docs/release_notes/flare_240.rst              |   2 +-
 docs/resources/3rd_party_trainer.py           |  59 ++++
 examples/hello-world/ml-to-fl/np/README.md    |  23 +-
 examples/hello-world/ml-to-fl/pt/README.md    |  37 ++-
 .../pt/code/cifar10_lightning_ddp_fl.py       |   4 +-
 examples/hello-world/ml-to-fl/tf/README.md    |  30 +-
 .../app/config/config_fed_server.json         |   4 +-
 nvflare/client/lightning/__init__.py          |  10 +
 nvflare/lighter/dummy_project.yml             |   7 +-
 setup.cfg                                     |   4 +
 14 files changed, 267 insertions(+), 198 deletions(-)
 create mode 100644 docs/resources/3rd_party_trainer.py

diff --git a/docs/programming_guide.rst b/docs/programming_guide.rst
index 835839245f..28e8b7992b 100644
--- a/docs/programming_guide.rst
+++ b/docs/programming_guide.rst
@@ -37,6 +37,7 @@ Please refer to :ref:`application` for more details.
 
    programming_guide/workflows_and_controllers
    programming_guide/execution_api_type
+   programming_guide/fl_model
    programming_guide/shareable
    programming_guide/data_exchange_object
    programming_guide/fl_context
diff --git a/docs/programming_guide/execution_api_type.rst b/docs/programming_guide/execution_api_type.rst
index 1095368afc..174778b9c5 100644
--- a/docs/programming_guide/execution_api_type.rst
+++ b/docs/programming_guide/execution_api_type.rst
@@ -1,11 +1,12 @@
 .. _execution_api_type:
 
-##################
-Execution API Type
-##################
+#######################
+From Local to Federated
+#######################
 
 In the FLARE system, a federated learning algorithm is defined in a Job
 format (for details, please refer to :ref:`job`).
+
 A Job consists of multiple "workflows" and "executors."
 
 The simplified job execution flow is as follows:
@@ -16,8 +17,8 @@ The simplified job execution flow is as follows:
   - If it is not done, it schedules a new task
   - If it is done, it proceeds to the next workflow in the Job.
 
-Users need to adapt their local training logic into FLARE's task execution
-abstractions to make their training federated.
+Users need to adapt their local training or computing logic into FLARE's task
+execution abstractions to make their training or computing federated.
 
 We offer various levels of abstraction for writing task execution code,
 catering to use cases that span from complete customizability to easy user adaptation.
@@ -32,7 +33,8 @@ The Client API uses the :class:`FLModel`
diff --git a/docs/programming_guide/execution_api_type/3rd_party_integration.rst b/docs/programming_guide/execution_api_type/3rd_party_integration.rst
 - :class:`FlareAgentWithCellPipe`
 
-You can create the FlareAgent as the following code:
+You can create the FlareAgentWithCellPipe as shown in the following code:
 
 .. code-block:: python
 
     agent = FlareAgentWithCellPipe(
         root_url="grpc://server:8002",
-        flare_site_name=args.site_name,
+        site_name=args.site_name,
         agent_id=args.agent_id,
         workspace_dir=args.workspace,
         secure_mode=True,
@@ -142,72 +142,15 @@ If this call is missed, the program may not exit properly.
 
     agent.stop()
 
-Putting Together
-----------------
+5. Putting Together
+-------------------
 
 Now that we have learned all the necessary steps, we can put them together into
 the following example code of this usage pattern:
 
-.. code-block:: python
-
-    import argparse
-    import logging
-
-    from nvflare.client.defs import RC, AgentClosed, MetaKey
-    from nvflare.client.flare_agent import FlareAgentWithCellPipe
-
-    NUMPY_KEY = "numpy_key"
-
-
-    def main():
-
-        logging.basicConfig()
-        logging.getLogger().setLevel(logging.INFO)
-
-        parser = argparse.ArgumentParser()
-        parser.add_argument("--workspace", "-w", type=str, help="workspace folder", required=False, default=".")
-        parser.add_argument("--site_name", "-s", type=str, help="flare site name", required=True)
-        parser.add_argument("--agent_id", "-a", type=str, help="agent id", required=True)
-
-        args = parser.parse_args()
+.. literalinclude:: ../../resources/3rd_party_trainer.py
+    :language: python
 
-        # 1. create the agent
-        agent = FlareAgentWithCellPipe(
-            root_url="grpc://server:8002",
-            flare_site_name=args.site_name,
-            agent_id=args.agent_id,
-            workspace_dir=args.workspace,
-            secure_mode=True,
-            submit_result_timeout=2.0,
-            heartbeat_timeout=120.0,
-        )
-
-        # 2. start the agent
-        agent.start()
-
-        # 3. processing tasks
-        while True:
-            print("getting task ...")
-            try:
-                task = agent.get_task()
-            except AgentClosed:
-                print("agent closed - exit")
-                break
-
-            print(f"got task: {task}")
-            rc, meta, result = train(task.data)  # perform train task
-            submitted = agent.submit_result(TaskResult(data=result, meta=meta, return_code=rc))
-            print(f"result submitted: {submitted}")
-
-        # 4. stop the agent
-        agent.stop()
-
-
-    def train(model):
-        ...
-
-    if __name__ == "__main__":
-        main()
 
 Notes:
 
@@ -250,81 +193,61 @@ An example looks like:
 
     - name: site_1
      type: client
      org: nvidia
-      listening_host: site_1.maglev.nvidia.com
+      listening_host: localhost
    - name: site_2
      type: client
      org: nvidia
-      listening_host: site_2.maglev.nvidia.com
+      listening_host: localhost
 
 Once the project is provisioned, check the "startup" kit generated for the clients.
 You should see the following files, among others:
 client.crt, client.key, server.crt, server.key, rootCA.pem
 
-Note that the specified listening_port of a site must be accessible to the trainer of the site.
-
-Step Two - Setup for Adhoc Direct Connection between FL Client and Trainer
---------------------------------------------------------------------------
-
-FL client and the trainer can always talk to each other via the server,
-but it could be slow, especially if the server is located far away.
-The enable adhoc direct connections between the FL client and the trainer,
-configure the comm_config.json on the client site as follows:
-
-.. code-block:: json
-
-    {
-      "allow_adhoc_conns": true,
-      "use_aio_grpc": true,
-      "adhoc": {
-        "scheme": "tcp",
-        "resources": {
-          "host": "nvclient",
-          "secure": true
-        }
-      }
-    }
-
-This file must be placed into the site's "local" folder within its workspace.
-
-Pay attention to the following:
-
-- For most cases, the "scheme" should be set to "tcp" to get the best performance.
-  If "tcp" cannot be used, you can use "grpc".
-- In "resources":
+Note that the specified listening_host of a site must be a hostname that
+the external trainer can reach via the network.
 
-  - If FL client and the trainer are within the same trusted network,
-    you can set "secure" to false; otherwise set it to true.
-  - The value of the "host" must match the "listening_host" value of the site used in provision.
-
-Step Three - Prepare Job Configuration
---------------------------------------
+Step Two - Prepare Job Configuration
+------------------------------------
 
 For each job, configure the config_fed_client.json
 to use :class:`TaskExchanger`
 as the executor.
 
 .. code-block:: json
 
    {
      "format_version": 2,
      "executors": [
        {
          "tasks": [
            "train"
          ],
          "executor": {
            "path": "nvflare.app_common.executors.task_exchanger.TaskExchanger",
            "args": {
              "pipe_id": "pipe",
              "peer_read_timeout": 30,
              "heartbeat_timeout": 60
            }
          }
        }
      ],
      "task_result_filters": [],
      "task_data_filters": [],
      "components": [
        {
          "id": "pipe",
          "path": "nvflare.fuel.utils.pipe.cell_pipe.CellPipe",
          "args": {
            "mode": "PASSIVE",
            "site_name": "{SITE_NAME}",
            "token": "{SITE_NAME}",
            "root_url": "{ROOT_URL}",
            "secure_mode": "{SECURE_MODE}",
            "workspace_dir": "{WORKSPACE}"
          }
        }
      ]
    }
 
 Make sure the parameters of the :class:`TaskExchanger`
 are configured properly, and change the default values as needed.
 
 Please refer to the API page for a detailed explanation of each argument:
 :class:`TaskExchanger`
 
-Step Four - Trainer Setup
--------------------------
+Step Three - Trainer Setup
+--------------------------
 
-The trainer program must have access to a local file system, and you must create a "workspace" folder.
-This workspace should be used for all jobs.
+For each client site, you will have an FL client and a trainer process.
 
-Copy the "startup" folder of the provisioned site, and put it in the designated workspace folder.
-If needed, any additional config files required by the trainer can also be placed in the workspace folder.
+To make our integration work, please follow these steps to set up the
+trainer process on each client site:
 
-Ensure to set the FlareAgent's "workspace_dir" to the workspace folder and
-that the correct "agent_id" value is passed to both the FL client and the training process.
+  - Make sure the trainer process has access to a local file system.
+  - Create a "workspace" folder to be used by this trainer process.
+    This workspace will be used for all jobs.
+  - Copy the "startup" folder of the client site into this "workspace" folder.
+    If needed, any additional config files required by the trainer can also
+    be placed in this "workspace" folder.
+  - Create the trainer script following the steps in the above section.
+    Set the FlareAgentWithCellPipe's "workspace_dir" to the path of the
+    "workspace" folder you just created, and make sure its "agent_id" value
+    is the same as the "token" value of the CellPipe component in the job
+    configuration above.
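
To make the pairing concrete, the snippet below is a minimal sketch (the site
name, server URL, and workspace path are placeholders to adapt) of a trainer-side
agent whose settings mirror the CellPipe component configured above; in
particular, "agent_id" must equal that component's "token" value:

.. code-block:: python

    from nvflare.client.flare_agent import FlareAgentWithCellPipe

    agent = FlareAgentWithCellPipe(
        root_url="grpc://server:8002",       # same FL server root URL the CellPipe uses
        site_name="site_1",                  # the client site this trainer process serves
        agent_id="site_1",                   # must match the CellPipe component's "token"
        workspace_dir="/path/to/workspace",  # the "workspace" folder prepared above
        secure_mode=True,
        submit_result_timeout=2.0,
        heartbeat_timeout=120.0,
    )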
 Verification
 ============
 
-The FL client (TaskExchanger) and your trainer process (FlareAgent) do not have
-to be started at exactly the same time.
+The FL client (TaskExchanger) and your trainer process (FlareAgentWithCellPipe)
+do not have to be started at exactly the same time.
+
 Whichever is started first will wait for the other for ``heartbeat_timeout`` seconds.
 Once they both are started and connected, you can verify they are directly
 connected using the Admin console's ``cells`` commands.
 
-The following example shows two clients (red, blue) connected to their external
-trainers via the agent_id "ext_trainer_1":
+The following example shows two clients (site-1, site-2) connected to their
+external trainers via the agent_id/token "ext_trainer":
 
 .. code-block:: shell
 
     > cells
     server
-    server.44c08365-e829-4bc1-a034-cda5a252fe73
-    red
-    red.44c08365-e829-4bc1-a034-cda5a252fe73
-    blue
-    blue.44c08365-e829-4bc1-a034-cda5a252fe73
-    red--ndas_1
-    blue--ndas_1
-    Total Cells: 8
-    Done [21695 usecs] 2023-10-16 19:28:37.523651
+    server.10d1d3b7-fb50-4c83-9575-e510f32c5d21
+    site-1
+    site-1.10d1d3b7-fb50-4c83-9575-e510f32c5d21
+    site-2
+    site-2.10d1d3b7-fb50-4c83-9575-e510f32c5d21
+    site-1_ext_trainer_active
+    site-2_ext_trainer_active
+    site-2_ext_trainer_passive
+    site-1_ext_trainer_passive
+    Total Cells: 10
 
 The ``cells`` command lists all cells.
-Notice that the job 44c08365-e829-4bc1-a034-cda5a252fe73 is running on both "blue" and "red" clients.
-Also notice that there are two corresponding ext_trainer cells (red-ext_trainer_1, and blue-ext_trainer1).
 
-.. code-block:: shell
+Notice that the job ``10d1d3b7-fb50-4c83-9575-e510f32c5d21`` is running on both
+the "site-1" and "site-2" clients.
 
-    > peers blue--ext_trainer_1
-    server
-    blue.44c08365-e829-4bc1-a034-cda5a252fe73
-    Total Agents: 2
-    Done [14526 usecs] 2023-10-16 19:28:44.407505
+Also notice that there are two pairs of corresponding cells
+(site-1_ext_trainer_active, site-1_ext_trainer_passive) and
+(site-2_ext_trainer_active, site-2_ext_trainer_passive).
 
-The ``peers`` command shows the cells directly connected to the specified cell.
-Here you see that the blue-ext_trainer_1 is directly connected to two cells:
-the server and the FL client (blue.44c08365-e829-4bc1-a034-cda5a252fe73).
 
-.. code-block:: shell
+Optional - Setup for Adhoc Direct Connection between FL Client and Trainer
+===========================================================================
 
-    > conns blue--ext_trainer_1
-    {
-      "bb_ext_connector": {
-        "url": "grpc://server:8002",
-        "handle": "CH00001",
-        "type": "connector"
-      },
-      "adhoc_connectors": {
-        "blue.44c08365-e829-4bc1-a034-cda5a252fe73": {
-          "url": "stcp://nvclient:11947",
-          "handle": "CH00002",
-          "type": "connector"
-        }
-      }
-    }
+The FL client and the trainer can always talk to each other via the server,
+but this could be slow, especially if the server is located far away.
+To enable adhoc direct connections between the FL client and the trainer,
+configure the comm_config.json on the client site as follows:
 
-The ``conns`` command shows the connectors on the specified cell.
-Here you see that blue--ext_trainer_1 has two connectors:
-one connects the server on ``grpc://server:8002``, and another connects to
-``blue.44c08365-e829-4bc1-a034-cda5a252fe73 on stcp://nvclient:11947``.
-Note that this port (11947) is opened by the FL client dynamically.
+.. code-block:: json
+
+    {
+      "allow_adhoc_conns": true,
+      "use_aio_grpc": true,
+      "adhoc": {
+        "scheme": "tcp",
+        "resources": {
+          "host": "localhost",
+          "secure": true
+        }
+      }
+    }
+
+This file must be placed into the site's "local" folder within its workspace.
+
+Pay attention to the following:
+
+- For most cases, the "scheme" should be set to "tcp" to get the best performance.
+  If "tcp" cannot be used, you can use "grpc".
+- In "resources":
+
+  - If the FL client and the trainer are within the same trusted network,
+    you can set "secure" to false; otherwise set it to true.
+  - The value of "host" must match the "listening_host" value of the site used in provision.
diff --git a/docs/programming_guide/execution_api_type/client_api.rst b/docs/programming_guide/execution_api_type/client_api.rst
index 0861a446e3..aff5da50eb 100644
--- a/docs/programming_guide/execution_api_type/client_api.rst
+++ b/docs/programming_guide/execution_api_type/client_api.rst
@@ -133,7 +133,7 @@ Below is a table overview of key Client APIs.
      - API Doc Link
    * - patch
      - Patches the PyTorch Lightning Trainer for usage with FLARE.
-     - :func:`train`
+     - :func:`patch`
 
 .. list-table:: Metrics Logger
    :widths: 25 25 50
diff --git a/docs/release_notes/flare_240.rst b/docs/release_notes/flare_240.rst
index d2fe661ad5..4741b386f0 100644
--- a/docs/release_notes/flare_240.rst
+++ b/docs/release_notes/flare_240.rst
@@ -168,7 +168,7 @@ Improved Job Configuration File Processing
 - OS Environment Variables - OS environment variables can be referenced via the dollar sign
 - Parameterized Variable Definition - for creating configuration templates that can be reused and resolved into different concrete configurations
 
-See more details in the :ref:`configuration_files` documentation.
+See more details in the :ref:`configurations` documentation.
 
 POC Command Upgrade
 ===================
diff --git a/docs/resources/3rd_party_trainer.py b/docs/resources/3rd_party_trainer.py
new file mode 100644
index 0000000000..1ffdd085bb
--- /dev/null
+++ b/docs/resources/3rd_party_trainer.py
@@ -0,0 +1,59 @@
+import argparse
+import logging
+
+from nvflare.client.flare_agent import AgentClosed, FlareAgentWithCellPipe
+
+NUMPY_KEY = "numpy_key"
+
+
+def main():
+
+    logging.basicConfig()
+    logging.getLogger().setLevel(logging.INFO)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--workspace", "-w", type=str, help="workspace folder", required=False, default=".")
+    parser.add_argument("--site_name", "-s", type=str, help="flare site name", required=True)
+    parser.add_argument("--agent_id", "-a", type=str, help="agent id", required=True)
+
+    args = parser.parse_args()
+
+    # 1. create the agent
+    agent = FlareAgentWithCellPipe(
+        root_url="grpc://server:8002",
+        site_name=args.site_name,
+        agent_id=args.agent_id,
+        workspace_dir=args.workspace,
+        secure_mode=True,
+        submit_result_timeout=2.0,
+        heartbeat_timeout=120.0,
+    )
+
+    # 2. start the agent
+    agent.start()
+
+    # 3. processing tasks
+    while True:
+        print("getting task ...")
+        try:
+            task = agent.get_task()
+        except AgentClosed:
+            print("agent closed - exit")
+            break
+
+        print(f"got task: {task}")
+        result = train(task.data)  # perform train task
+        submitted = agent.submit_result(result)
+        print(f"result submitted: {submitted}")
+
+    # 4. stop the agent
+    agent.stop()
+
+
+def train(model):
+    print(f"training on {model}")
+    return model
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/hello-world/ml-to-fl/np/README.md b/examples/hello-world/ml-to-fl/np/README.md
index c6dea03fc7..1de78e0e5a 100644
--- a/examples/hello-world/ml-to-fl/np/README.md
+++ b/examples/hello-world/ml-to-fl/np/README.md
@@ -1,4 +1,4 @@
-# Configurations of NVFlare Client API
+# NVFlare Client API
 
 We will demonstrate how to send back model parameters or model parameter differences in different approaches in the following examples:
 
@@ -18,6 +18,25 @@ We demonstrate how to launch training script once and have training script keeps
 
 1. [Launch once for the whole job](#launch-once-for-the-whole-job)
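
All of these variations share the same Client API skeleton. The sketch below
shows that shared pattern (the `+ 1.0` "training" step is a toy stand-in for
real logic, and `params_type` would be `DIFF` when sending differences):

```python
import nvflare.client as flare
from nvflare.app_common.abstract.fl_model import FLModel, ParamsType


def main():
    flare.init()  # initialize the Client API once per process
    while flare.is_running():  # serve every round when the script is launched once
        input_model = flare.receive()  # FLModel carrying the current global parameters
        params = {k: v + 1.0 for k, v in input_model.params.items()}  # toy "training" step
        flare.send(FLModel(params=params, params_type=ParamsType.FULL))  # return FULL params


if __name__ == "__main__":
    main()
```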
 
+## Software Requirements
+
+Please install the requirements first; it is suggested to install them inside a virtual environment:
+
+```bash
+pip install -r requirements.txt
+```
+
+Please also configure the job templates folder:
+
+```bash
+nvflare config -jt ../../../../job_templates/
+nvflare job list_templates
+```
+
+## Minimum Hardware Requirements
+
+1 CPU
+
 
 ## Send model parameters back to the NVFlare server
 
@@ -29,8 +48,6 @@ To send back the whole model parameters, we need to make sure the "params_transf
 
 Let's reuse the job template from [sag_np](../../../../job_templates/sag_np/):
 
 ```bash
-nvflare config -jt ../../../../job_templates/
-nvflare job list_templates
 nvflare job create -force -j ./jobs/np_param_full_transfer_full -w sag_np -sd ./code/ \
 -f config_fed_client.conf app_script=train_full.py params_transfer_type=FULL launch_once=false
 ```
diff --git a/examples/hello-world/ml-to-fl/pt/README.md b/examples/hello-world/ml-to-fl/pt/README.md
index 7789515df8..09c1a8b0db 100644
--- a/examples/hello-world/ml-to-fl/pt/README.md
+++ b/examples/hello-world/ml-to-fl/pt/README.md
@@ -1,5 +1,19 @@
 # PyTorch Deep Learning to Federated Learning transition with NVFlare
 
+We will demonstrate how to transform existing DL code into an FL application step by step:
+
+ 1. [Show a baseline training script](#the-baseline)
+ 2. [How to modify an existing training script using DL2FL Client API](#transform-cifar10-dl-training-code-to-fl-including-best-model-selection-using-client-api)
+ 3. [How to modify a structured script using DL2FL decorator](#the-decorator-use-case)
+ 4. [How to modify a PyTorch Lightning script using DL2FL Lightning Client API](#transform-cifar10-pytorch-lightning-training-code-to-fl-with-nvflare-client-lightning-integration-api)
+
+If you have multiple GPUs, please refer to the following examples:
+
+ 1. [How to modify a PyTorch DDP training script using DL2FL Client API](#transform-cifar10-pytorch--ddp-training-code-to-fl-using-client-api)
+ 2. [How to modify a PyTorch Lightning DDP training script using DL2FL Lightning Client API](#transform-cifar10-pytorch-lightning--ddp-training-code-to-fl-with-nvflare-client-lightning-integration-api)
+
+## Software Requirements
+
 Please install the requirements first; it is suggested to install them inside a virtual environment:
 
 ```bash
@@ -13,17 +27,22 @@ nvflare config -jt ../../../../job_templates/
 nvflare job list_templates
 ```
 
-We will demonstrate how to transform an existing DL code into an FL application step-by-step:
+## Minimum Hardware Requirements
 
- 1. [Show a baseline training script](#the-baseline)
- 2. [How to modify an existing training script using DL2FL Client API](#transform-cifar10-dl-training-code-to-fl-including-best-model-selection-using-client-api)
- 3. [How to modify a structured script using DL2FL decorator](#the-decorator-use-case)
- 4. [How to modify a PyTorch Lightning script using DL2FL Lightning Client API](#transform-cifar10-pytorch-lightning-training-code-to-fl-with-nvflare-client-lightning-integration-api)
+Each example has different requirements:
 
-If you have multi GPU please refer to the following examples:
+| Example name | Minimum requirements |
+| ------------ | -------------------- |
+| [Show a baseline training script](#the-baseline) | 1 CPU or 1 GPU* |
+| [How to modify an existing training script using DL2FL Client API](#transform-cifar10-dl-training-code-to-fl-including-best-model-selection-using-client-api) | 1 CPU or 1 GPU* |
+| [How to modify a structured script using DL2FL decorator](#the-decorator-use-case) | 1 CPU or 1 GPU* |
+| [How to modify a PyTorch Lightning script using DL2FL Lightning Client API](#transform-cifar10-pytorch-lightning-training-code-to-fl-with-nvflare-client-lightning-integration-api) | 1 CPU or 1 GPU* |
+| [How to modify a PyTorch DDP training script using DL2FL Client API](#transform-cifar10-pytorch--ddp-training-code-to-fl-using-client-api) | 2 GPUs |
+| [How to modify a PyTorch Lightning DDP training script using DL2FL Lightning Client API](#transform-cifar10-pytorch-lightning--ddp-training-code-to-fl-with-nvflare-client-lightning-integration-api) | 2 CPUs or 2 GPUs** |
 
- 1. [How to modify a PyTorch DDP training script using DL2FL Client API](#transform-cifar10-pytorch--ddp-training-code-to-fl-using-client-api)
- 2. [How to modify a PyTorch Lightning DDP training script using DL2FL Lightning Client API](#transform-cifar10-pytorch-lightning--ddp-training-code-to-fl-with-nvflare-client-lightning-integration-api)
+
+\* depends on whether you use `device=cpu` or `device=cuda`
+\*\* depends on whether `torch.cuda.is_available()` returns True
 
 ## The baseline
 
@@ -200,8 +219,6 @@ nvflare simulator -n 2 -t 2 ./jobs/lightning -w lightning_workspace
 
 We follow the official [PyTorch documentation](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html#initialize-ddp-with-torch-distributed-run-torchrun) and write a [./code/cifar10_ddp_original.py](./code/cifar10_ddp_original.py).
 
-Note that this example requires at least 2 GPUs on your machine.
-
 Note that we wrap the evaluation logic into a method for better usability.
 
 It can be run using the torch distributed run:
diff --git a/examples/hello-world/ml-to-fl/pt/code/cifar10_lightning_ddp_fl.py b/examples/hello-world/ml-to-fl/pt/code/cifar10_lightning_ddp_fl.py
index 62a513f548..0398c9d3b0 100644
--- a/examples/hello-world/ml-to-fl/pt/code/cifar10_lightning_ddp_fl.py
+++ b/examples/hello-world/ml-to-fl/pt/code/cifar10_lightning_ddp_fl.py
@@ -72,7 +72,9 @@ def main():
     model = LitNet()
     cifar10_dm = CIFAR10DataModule()
 
-    trainer = Trainer(max_epochs=1, strategy="ddp", devices=2 if torch.cuda.is_available() else None)
+    trainer = Trainer(
+        max_epochs=1, strategy="ddp", devices=2, accelerator="gpu" if torch.cuda.is_available() else "cpu"
+    )
 
     # (2) patch the lightning trainer
     flare.patch(trainer)
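
For orientation, the heart of the Lightning integration that this file exercises
is just the patched-trainer loop. A minimal sketch (assuming `LitNet` is
importable from the example's `lit_net.py`, and trimming the DDP settings):

```python
from pytorch_lightning import Trainer

from lit_net import LitNet  # the example's own LightningModule

# (1) import nvflare lightning client API
import nvflare.client.lightning as flare


def main():
    model = LitNet()
    trainer = Trainer(max_epochs=1, devices=1)
    # (2) patch the lightning trainer; fit/validate now exchange models with FLARE
    flare.patch(trainer)
    while flare.is_running():
        trainer.validate(model)  # evaluate the freshly received global model
        trainer.fit(model)       # local training; the result is sent back automatically


if __name__ == "__main__":
    main()
```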
diff --git a/examples/hello-world/ml-to-fl/tf/README.md b/examples/hello-world/ml-to-fl/tf/README.md
index 80b879425b..e2444844f4 100644
--- a/examples/hello-world/ml-to-fl/tf/README.md
+++ b/examples/hello-world/ml-to-fl/tf/README.md
@@ -1,18 +1,38 @@
 # TensorFlow Deep Learning to Federated Learning transition with NVFlare
 
+We will demonstrate how to transform existing DL code into an FL application step by step:
+
+1. [How to modify an existing training script using DL2FL Client API](#transform-cifar10-tensorflow-training-code-to-fl-with-nvflare-client-api)
+
+2. [How to modify an existing multi GPU training script using DL2FL Client API](#transform-cifar10-tensorflow-multi-gpu-training-code-to-fl-with-nvflare-client-api)
+
+## Software Requirements
+
 Please install the requirements first; it is suggested to install them inside a virtual environment:
 
 ```bash
 pip install -r requirements.txt
 ```
 
-Note that for running with GPUs, we recommend using [NVIDIA TensorFlow docker](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tensorflow)
+Please also configure the job templates folder:
 
-We will demonstrate how to transform an existing DL code into an FL application step-by-step:
+```bash
+nvflare config -jt ../../../../job_templates/
+nvflare job list_templates
+```
 
-1. [How to modify an existing training script using DL2FL Client API](#transform-cifar10-tensorflow-training-code-to-fl-with-nvflare-client-api)
+## Minimum Hardware Requirements
+
+| Example name | Minimum requirements |
+| ------------ | -------------------- |
+| [How to modify an existing training script using DL2FL Client API](#transform-cifar10-tensorflow-training-code-to-fl-with-nvflare-client-api) | 1 CPU or 1 GPU* |
+| [How to modify an existing multi GPU training script using DL2FL Client API](#transform-cifar10-tensorflow-multi-gpu-training-code-to-fl-with-nvflare-client-api) | 2 CPUs or 2 GPUs* |
+
+\* depends on whether TF can find a GPU
+
+
+Note that for running with GPUs, we recommend using the [NVIDIA TensorFlow docker](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tensorflow) container.
 
-2. [How to modify an existing multi GPU training script using DL2FL Client API](#transform-cifar10-tensorflow-multi-gpu-training-code-to-fl-with-nvflare-client-api)
 
 ## Transform CIFAR10 TensorFlow training code to FL with NVFLARE Client API
 
@@ -46,7 +66,6 @@ Please refer to [JOB CLI tutorial](../../../tutorials/job_cli.ipynb) on how to g
 We choose the [tensorflow job template](../../../../job_templates/sag_tf/) and run the following command to create the job:
 
 ```bash
-nvflare config -jt ../../../../job_templates
 nvflare job create -force -j ./jobs/tensorflow -w sag_tf -sd ./code/ -f config_fed_client.conf app_script=cifar10_tf_fl.py
 ```
 
@@ -82,7 +101,6 @@ Please refer to [JOB CLI tutorial](../../../tutorials/job_cli.ipynb) on how to g
 We choose the [tensorflow job template](../../../../job_templates/sag_tf/) and run the following command to create the job:
 
 ```bash
-nvflare config -jt ../../../../job_templates
 nvflare job create -force -j ./jobs/tensorflow_multi_gpu -w sag_tf -sd ./code/ -f config_fed_client.conf app_script=cifar10_tf_multi_gpu_fl.py
 ```
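
The corresponding client code follows the same receive/train/send pattern as the
PyTorch examples. A minimal sketch with a toy Keras model and random data (see
[./code/cifar10_tf_fl.py](./code/cifar10_tf_fl.py) for the real script; the
layer-name-keyed params convention is assumed from it):

```python
import numpy as np
import tensorflow as tf

import nvflare.client as flare
from nvflare.app_common.abstract.fl_model import FLModel, ParamsType


def main():
    flare.init()
    model = tf.keras.Sequential([tf.keras.layers.Dense(10, input_shape=(32,), name="dense1")])
    model.compile(optimizer="sgd", loss="mse")
    x, y = np.random.rand(16, 32), np.random.rand(16, 10)  # dummy data in place of CIFAR-10
    while flare.is_running():
        input_model = flare.receive()
        for layer_name, weights in input_model.params.items():
            model.get_layer(layer_name).set_weights(weights)  # load the global weights
        model.fit(x, y, epochs=1, verbose=0)  # local training
        params = {layer.name: layer.get_weights() for layer in model.layers}
        flare.send(FLModel(params=params, params_type=ParamsType.FULL))


if __name__ == "__main__":
    main()
```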
diff --git a/integration/monai/examples/spleen_ct_segmentation_local/jobs/spleen_ct_segmentation_local/app/config/config_fed_server.json b/integration/monai/examples/spleen_ct_segmentation_local/jobs/spleen_ct_segmentation_local/app/config/config_fed_server.json
index 749b59deb5..6a31890fbf 100644
--- a/integration/monai/examples/spleen_ct_segmentation_local/jobs/spleen_ct_segmentation_local/app/config/config_fed_server.json
+++ b/integration/monai/examples/spleen_ct_segmentation_local/jobs/spleen_ct_segmentation_local/app/config/config_fed_server.json
@@ -52,10 +52,10 @@
           "experiment_name": "monai-spleen-experiment",
           "run_name": "monai-spleen-with-mlflow",
           "experiment_tags": {
-            "mlflow-note-content": "## **MONAI experiment with spleen bundle with MLflow**"
+            "\"mlflow.note.content\"": "## **MONAI experiment with spleen bundle with MLflow**"
           },
           "run_tags": {
-            "mlflow-note-content": "## Federated Experiment tracking with MLflow \n### Example of using **[NVIDIA FLARE](https://nvflare.readthedocs.io/en/main/index.html)** to train and run MONAI-bundle using federated averaging ([FedAvg]([FedAvg](https://arxiv.org/abs/1602.05629))) and [PyTorch](https://pytorch.org/) as the deep learning training framework. This example also highlights the FLARE streaming capability from the clients to the server for server delivery to MLflow.\n"
+            "\"mlflow.note.content\"": "## Federated Experiment tracking with MLflow \n### Example of using **[NVIDIA FLARE](https://nvflare.readthedocs.io/en/main/index.html)** to train and run MONAI-bundle using federated averaging ([FedAvg](https://arxiv.org/abs/1602.05629)) and [PyTorch](https://pytorch.org/) as the deep learning training framework. This example also highlights the FLARE streaming capability from the clients to the server for server delivery to MLflow.\n"
           }
         },
         "artifact_location": "artifacts"
diff --git a/nvflare/client/lightning/__init__.py b/nvflare/client/lightning/__init__.py
index a3f1d5acbb..395e6728ab 100644
--- a/nvflare/client/lightning/__init__.py
+++ b/nvflare/client/lightning/__init__.py
@@ -12,6 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+"""PyTorch Lightning API integration module for simplified imports.
+
+Usage:
+    from nvflare.client.lightning import patch
+
+For detailed information on usage and the API, refer to:
+    :mod:`nvflare.app_opt.lightning.api`
+
+"""
+
 from nvflare.fuel.utils.import_utils import optional_import
 
 pytorch_lightning, ok = optional_import(module="pytorch_lightning")
diff --git a/nvflare/lighter/dummy_project.yml b/nvflare/lighter/dummy_project.yml
index fb5a759b95..57311da4ae 100644
--- a/nvflare/lighter/dummy_project.yml
+++ b/nvflare/lighter/dummy_project.yml
@@ -12,9 +12,10 @@ participants:
   - name: site-1
     type: client
     org: nvidia
-    # listening_host will enable creating one pair of cert/private key for this client
-    # so it can behave like a server for client api. The value must be a hostname that
-    # client api can reach via network.
+    # Specifying listening_host will enable the creation of one pair of
+    # certificate/private key for this client, allowing the client to function
+    # as a server for 3rd-party integration.
+    # The value must be a hostname that the external trainer can reach via the network.
     # listening_host: site-1-lh
   - name: site-2
     type: client
diff --git a/setup.cfg b/setup.cfg
index 23868dbe05..7b1b6e834f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -46,6 +46,8 @@ PT =
 SKLEARN =
     scikit-learn
 TRACKING =
+    mlflow
+    wandb
     tensorboard
 CONFIG =
     omegaconf
@@ -55,6 +57,8 @@ app_opt =
     %(PT)s
     %(SKLEARN)s
    %(TRACKING)s
+    pytorch_lightning
+    xgboost
 app_opt_mac =
     %(PT)s
     %(SKLEARN)s
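
As a usage note on the new TRACKING extras above: client code can stream metrics
through FLARE's experiment-tracking writers. A minimal sketch (assuming the
MLflowWriter from nvflare.client.tracking; the metric name and value are
illustrative):

```python
import nvflare.client as flare
from nvflare.client.tracking import MLflowWriter

flare.init()
writer = MLflowWriter()
# streamed from the client to the server, which forwards it to MLflow
writer.log_metric(key="train_loss", value=0.42, step=1)
```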