From 4c83a5e9def9f948b8d3645e3775450d771729ef Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 21 Feb 2024 17:57:24 +0000 Subject: [PATCH 001/103] init --- src/py/flwr/common/constant.py | 2 + src/py/flwr/server/app.py | 77 ++++++++++++++++++- .../server/superlink/fleet/vce/__init__.py | 15 ++++ .../server/superlink/fleet/vce/vce_api.py | 68 ++++++++++++++++ 4 files changed, 160 insertions(+), 2 deletions(-) create mode 100644 src/py/flwr/server/superlink/fleet/vce/__init__.py create mode 100644 src/py/flwr/server/superlink/fleet/vce/vce_api.py diff --git a/src/py/flwr/common/constant.py b/src/py/flwr/common/constant.py index 811fff73f06..2946a594e68 100644 --- a/src/py/flwr/common/constant.py +++ b/src/py/flwr/common/constant.py @@ -28,10 +28,12 @@ TRANSPORT_TYPE_GRPC_BIDI = "grpc-bidi" TRANSPORT_TYPE_GRPC_RERE = "grpc-rere" TRANSPORT_TYPE_REST = "rest" +TRANSPORT_TYPE_VCE = "vce" TRANSPORT_TYPES = [ TRANSPORT_TYPE_GRPC_BIDI, TRANSPORT_TYPE_GRPC_RERE, TRANSPORT_TYPE_REST, + TRANSPORT_TYPE_VCE, ] MESSAGE_TYPE_GET_PROPERTIES = "get_properties" diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index dbbf63b0fe5..75fa372d084 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -14,9 +14,9 @@ # ============================================================================== """Flower server app.""" - import argparse import importlib.util +import json import sys import threading from logging import ERROR, INFO, WARN @@ -24,7 +24,7 @@ from pathlib import Path from signal import SIGINT, SIGTERM, signal from types import FrameType -from typing import List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union import grpc @@ -34,6 +34,7 @@ MISSING_EXTRA_REST, TRANSPORT_TYPE_GRPC_RERE, TRANSPORT_TYPE_REST, + TRANSPORT_TYPE_VCE, ) from flwr.common.logger import log from flwr.proto.driver_pb2_grpc import ( # pylint: disable=E0611 @@ -315,6 +316,15 @@ def run_fleet_api() -> None: certificates=certificates, ) 
grpc_servers.append(fleet_server) + elif args.fleet_api_type == TRANSPORT_TYPE_VCE: + _run_fleet_api_vce( + num_supernodes=args.num_supernodes, + client_app_str=args.client_app, + backend=args.backend, + backend_config=args.backend_config, + working_dir=args.dir, + state_factory=state_factory, + ) else: raise ValueError(f"Unknown fleet_api_type: {args.fleet_api_type}") @@ -537,6 +547,29 @@ def _run_fleet_api_grpc_rere( return fleet_grpc_server +# pylint: disable=import-outside-toplevel,too-many-arguments +def _run_fleet_api_vce( + num_supernodes: int, + client_app_str: str, + backend: str, + backend_config: Dict[str, Union[str, int, float]], + working_dir: str, + state_factory: StateFactory, +) -> None: + from flwr.server.superlink.fleet.vce.vce_api import start_vce + + log(INFO, "Flower VCE: Starting Fleet API (VirtualClientEngine)") + + start_vce( + num_supernodes=num_supernodes, + client_app_str=client_app_str, + backend_str=backend, + backend_config=backend_config, + state_factory=state_factory, + working_dir=working_dir, + ) + + # pylint: disable=import-outside-toplevel,too-many-arguments def _run_fleet_api_rest( host: str, @@ -714,6 +747,14 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: help="Start a Fleet API server (REST, experimental)", ) + ex_group.add_argument( + "--vce", + action="store_const", + dest="fleet_api_type", + const=TRANSPORT_TYPE_VCE, + help="Start a Fleet API server (VirtualClientEngine)", + ) + # Fleet API gRPC-rere options grpc_rere_group = parser.add_argument_group( "Fleet API (gRPC-rere) server options", "" @@ -749,3 +790,35 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: type=int, default=1, ) + + # Fleet API VCE options + vce_group = parser.add_argument_group("Fleet API (VCE) server options", "") + vce_group.add_argument( + "--client-app", + help="For example: `client:app` or `project.package.module:wrapper.app`.", + ) + vce_group.add_argument( + "--num-supernodes", + type=int, + help="Number 
of SuperNodes connected to the SuperLink.", + ) + vce_group.add_argument( + "--backend", + default="ray", + type=str, + help="Simulation Backend that process a ClientApp.", + ) + vce_group.add_argument( + "--backend-config", + type=json.loads, + default='{"num_cpus":2, "num_gpus":0.0}', + help='A dict in the form \'{"":, "":}\' to ' + "configure a backend. Pay close attention to how the quotes and double quotes " + "are set.", + ) + parser.add_argument( + "--dir", + default="", + help="Add specified directory to the PYTHONPATH." + " Default: current working directory.", + ) diff --git a/src/py/flwr/server/superlink/fleet/vce/__init__.py b/src/py/flwr/server/superlink/fleet/vce/__init__.py new file mode 100644 index 00000000000..563f77595e1 --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Fleet VirtualClientEngine side.""" diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py new file mode 100644 index 00000000000..a6160125f1e --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -0,0 +1,68 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Fleet VirtualClientEngine API.""" + + +from logging import INFO +from typing import Dict, Union + +from flwr.client.clientapp import ClientApp, load_client_app +from flwr.client.node_state import NodeState +from flwr.common.logger import log +from flwr.server.superlink.state import StateFactory + +NodeToPartitionMapping = Dict[int, int] + + +def _register_nodes( + num_nodes: int, state_factory: StateFactory +) -> NodeToPartitionMapping: + """Registre nodes with the StateFactory and create node-id:partition-id mapping.""" + nodes_mapping: NodeToPartitionMapping = {} + state = state_factory.state() + for i in range(num_nodes): + node_id = state.create_node() + nodes_mapping[node_id] = i + log(INFO, "Registered %i nodes", len(nodes_mapping)) + return nodes_mapping + + +# pylint: disable=too-many-arguments,unused-argument +def start_vce( + num_supernodes: int, + client_app_str: str, + backend_str: str, + backend_config: Dict[str, Union[str, int, float]], + state_factory: StateFactory, + working_dir: str, +) -> None: + """Start Fleet API with the VirtualClientEngine (VCE).""" + # Register SuperNodes + nodes_mapping = _register_nodes( + num_nodes=num_supernodes, state_factory=state_factory + ) + + # Construct mapping of NodeStates + node_states: Dict[int, NodeState] = {} + for node_id in nodes_mapping: + node_states[node_id] = NodeState() + + 
log(INFO, "client_app_str = %s", client_app_str) + + def _load() -> ClientApp: + app: ClientApp = load_client_app(client_app_str) + return app + + # start backend From a85db409e037e7bdd243394fa108d315e55c0b22 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 21 Feb 2024 18:16:47 +0000 Subject: [PATCH 002/103] base backend --- .../superlink/fleet/vce/backend/__init__.py | 21 ++++++++ .../superlink/fleet/vce/backend/backend.py | 53 +++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 src/py/flwr/server/superlink/fleet/vce/backend/__init__.py create mode 100644 src/py/flwr/server/superlink/fleet/vce/backend/backend.py diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py new file mode 100644 index 00000000000..3ff90c288a5 --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""VirtualClientEngine Backends.""" + +from .backend import Backend + +__all__ = [ + "Backend", +] diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py new file mode 100644 index 00000000000..ed6f7857d93 --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -0,0 +1,53 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Generic Backend class for Fleet API using the VCE.""" + + +from abc import ABC, abstractmethod +from typing import Callable, Tuple + +from flwr.client.clientapp import ClientApp +from flwr.common.context import Context +from flwr.common.message import Message + + +class Backend(ABC): + """Abstract base class for a Backend.""" + + async def build(self) -> None: + """Build backend asynchronously. + + Different components need to be inplace before workers in a backend are ready to + accept jobs. When this method finish executed, the backend should be fully ready + to run jobs. + """ + + @property + def num_workers(self) -> int: + """Return number of workers in the backend. + + This is the number of TaskIns that can be run concurrently. 
+ """ + return 0 + + @abstractmethod + async def process_message( + self, + app: Callable[[], ClientApp], + message: Message, + context: Context, + node_id: int, + ) -> Tuple[Message, Context]: + """Submit a job to the backend.""" From b77031219b3b5a74b9fcbace60ee9b67950f6971 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 21 Feb 2024 18:17:10 +0000 Subject: [PATCH 003/103] update --- src/py/flwr/server/superlink/fleet/vce/backend/backend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index ed6f7857d93..1c83e604c65 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -48,6 +48,5 @@ async def process_message( app: Callable[[], ClientApp], message: Message, context: Context, - node_id: int, ) -> Tuple[Message, Context]: """Submit a job to the backend.""" From b9c64554ce24eb5114d8949191f9e7eda54eac88 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 21 Feb 2024 18:25:02 +0000 Subject: [PATCH 004/103] update docstrings --- src/py/flwr/server/superlink/fleet/vce/backend/backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index 1c83e604c65..ff28724b14f 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -30,7 +30,7 @@ async def build(self) -> None: """Build backend asynchronously. Different components need to be inplace before workers in a backend are ready to - accept jobs. When this method finish executed, the backend should be fully ready + accept jobs. When this method finish executing, the backend should be fully ready to run jobs. 
""" @@ -38,7 +38,7 @@ async def build(self) -> None: def num_workers(self) -> int: """Return number of workers in the backend. - This is the number of TaskIns that can be run concurrently. + This is the number of TaskIns that can be processed concurrently. """ return 0 From cd48539ee8185333d9edaa16d0de7147a38601ec Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 21 Feb 2024 18:26:40 +0000 Subject: [PATCH 005/103] minor fixes --- src/py/flwr/server/app.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index 75fa372d084..e16ab4dc4b8 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -14,6 +14,7 @@ # ============================================================================== """Flower server app.""" + import argparse import importlib.util import json @@ -800,13 +801,13 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: vce_group.add_argument( "--num-supernodes", type=int, - help="Number of SuperNodes connected to the SuperLink.", + help="Number of SuperNodes to register with the SuperLink.", ) vce_group.add_argument( "--backend", default="ray", type=str, - help="Simulation Backend that process a ClientApp.", + help="Simulation Backend that processes a ClientApp.", ) vce_group.add_argument( "--backend-config", @@ -819,6 +820,6 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: parser.add_argument( "--dir", default="", - help="Add specified directory to the PYTHONPATH." + help="Add a specified directory to the PYTHONPATH." 
" Default: current working directory.", ) From 2791163b68004b14ad11945111ba36ebde38fef1 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 09:43:14 +0000 Subject: [PATCH 006/103] updates --- src/py/flwr/server/app.py | 7 ++++--- .../superlink/fleet/vce/backend/__init__.py | 3 ++- .../server/superlink/fleet/vce/backend/backend.py | 15 +++++++++++---- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 6 ++++-- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index 75fa372d084..b2e5cefe45d 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -24,7 +24,7 @@ from pathlib import Path from signal import SIGINT, SIGTERM, signal from types import FrameType -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple import grpc @@ -55,6 +55,7 @@ start_grpc_server, ) from .superlink.fleet.grpc_rere.fleet_servicer import FleetServicer +from .superlink.fleet.vce.backend import BackendConfig from .superlink.state import StateFactory ADDRESS_DRIVER_API = "0.0.0.0:9091" @@ -552,11 +553,11 @@ def _run_fleet_api_vce( num_supernodes: int, client_app_str: str, backend: str, - backend_config: Dict[str, Union[str, int, float]], + backend_config: BackendConfig, working_dir: str, state_factory: StateFactory, ) -> None: - from flwr.server.superlink.fleet.vce.vce_api import start_vce + from .superlink.fleet.vce.vce_api import start_vce log(INFO, "Flower VCE: Starting Fleet API (VirtualClientEngine)") diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py index 3ff90c288a5..305cb32c16e 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py @@ -14,8 +14,9 @@ # ============================================================================== """VirtualClientEngine Backends.""" -from .backend import 
Backend +from .backend import Backend, BackendConfig __all__ = [ "Backend", + "BackendConfig", ] diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index ff28724b14f..90745f12e71 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -16,22 +16,25 @@ from abc import ABC, abstractmethod -from typing import Callable, Tuple +from typing import Callable, Dict, Tuple, Union from flwr.client.clientapp import ClientApp from flwr.common.context import Context from flwr.common.message import Message +BackendConfig = Dict[str, Union[str, int, float]] + class Backend(ABC): """Abstract base class for a Backend.""" - async def build(self) -> None: + @abstractmethod + async def build(self, backend_config: BackendConfig) -> None: """Build backend asynchronously. Different components need to be inplace before workers in a backend are ready to - accept jobs. When this method finish executing, the backend should be fully ready - to run jobs. + accept jobs. When this method finish executing, the backend should be fully + ready to run jobs. 
""" @property @@ -42,6 +45,10 @@ def num_workers(self) -> int: """ return 0 + @abstractmethod + def is_worker_idle(self) -> bool: + """Report whether a backend worker is idle and can therefore run a ClientApp.""" + @abstractmethod async def process_message( self, diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index a6160125f1e..0c9b1589e89 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -16,13 +16,15 @@ from logging import INFO -from typing import Dict, Union +from typing import Dict from flwr.client.clientapp import ClientApp, load_client_app from flwr.client.node_state import NodeState from flwr.common.logger import log from flwr.server.superlink.state import StateFactory +from .backend import BackendConfig + NodeToPartitionMapping = Dict[int, int] @@ -44,7 +46,7 @@ def start_vce( num_supernodes: int, client_app_str: str, backend_str: str, - backend_config: Dict[str, Union[str, int, float]], + backend_config: BackendConfig, state_factory: StateFactory, working_dir: str, ) -> None: From 4ca33ece05702923c3ddd65ae5bf5529e8387241 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 09:59:27 +0000 Subject: [PATCH 007/103] backend-config should contain value types --- src/py/flwr/server/app.py | 7 ++++--- src/py/flwr/server/superlink/fleet/vce/backend/backend.py | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index b2e5cefe45d..ec64a6e8518 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -812,10 +812,11 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: vce_group.add_argument( "--backend-config", type=json.loads, - default='{"num_cpus":2, "num_gpus":0.0}', + default='{"client_resources": {"num_cpus":2, "num_gpus":0.0}', help='A dict in the form \'{"":, "":}\' to ' - "configure a backend. 
Pay close attention to how the quotes and double quotes " - "are set.", + "configure a backend. Values supported in are those included by " + "`flwr.common.typing.ConfigsRecordValues`. " + "Pay close attention to how the quotes and double quotes are set.", ) parser.add_argument( "--dir", diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index 90745f12e71..3f428061e9a 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -16,13 +16,14 @@ from abc import ABC, abstractmethod -from typing import Callable, Dict, Tuple, Union +from typing import Callable, Dict, Tuple from flwr.client.clientapp import ClientApp from flwr.common.context import Context from flwr.common.message import Message +from flwr.common.typing import ConfigsRecordValues -BackendConfig = Dict[str, Union[str, int, float]] +BackendConfig = Dict[str, ConfigsRecordValues] class Backend(ABC): From 1b6564ac578174f82bee1604813a6467e7d50840 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 10:00:27 +0000 Subject: [PATCH 008/103] fix --- src/py/flwr/server/superlink/fleet/vce/backend/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index 3f428061e9a..28a080b1252 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -23,7 +23,7 @@ from flwr.common.message import Message from flwr.common.typing import ConfigsRecordValues -BackendConfig = Dict[str, ConfigsRecordValues] +BackendConfig = Dict[str, Dict[str, ConfigsRecordValues]] class Backend(ABC): From bad872788a7814072ff97cc11a1b00ce7146c14d Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 10:12:12 +0000 Subject: [PATCH 009/103] w/ previous --- 
src/py/flwr/server/superlink/fleet/vce/backend/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index 28a080b1252..9b7cc18f3c0 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -30,7 +30,7 @@ class Backend(ABC): """Abstract base class for a Backend.""" @abstractmethod - async def build(self, backend_config: BackendConfig) -> None: + async def build(self) -> None: """Build backend asynchronously. Different components need to be inplace before workers in a backend are ready to From a68172fbef7d4bb4416e11cc856f5cd4170314c1 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 11:22:07 +0000 Subject: [PATCH 010/103] fix --- src/py/flwr/server/app.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index e16ab4dc4b8..f686cf1f9bb 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -317,15 +317,6 @@ def run_fleet_api() -> None: certificates=certificates, ) grpc_servers.append(fleet_server) - elif args.fleet_api_type == TRANSPORT_TYPE_VCE: - _run_fleet_api_vce( - num_supernodes=args.num_supernodes, - client_app_str=args.client_app, - backend=args.backend, - backend_config=args.backend_config, - working_dir=args.dir, - state_factory=state_factory, - ) else: raise ValueError(f"Unknown fleet_api_type: {args.fleet_api_type}") @@ -412,6 +403,15 @@ def run_superlink() -> None: certificates=certificates, ) grpc_servers.append(fleet_server) + elif args.fleet_api_type == TRANSPORT_TYPE_VCE: + _run_fleet_api_vce( + num_supernodes=args.num_supernodes, + client_app_str=args.client_app, + backend=args.backend, + backend_config=args.backend_config, + working_dir=args.dir, + state_factory=state_factory, + ) else: raise 
ValueError(f"Unknown fleet_api_type: {args.fleet_api_type}") From 68818666aa41aa5f14d1c06aa1eb6de8480eb116 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 11:25:17 +0000 Subject: [PATCH 011/103] fix for json.loads --- src/py/flwr/server/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index d7f8471a0aa..7acc69be5d5 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -813,7 +813,7 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: vce_group.add_argument( "--backend-config", type=json.loads, - default='{"client_resources": {"num_cpus":2, "num_gpus":0.0}', + default='{"client_resources": {"num_cpus":2, "num_gpus":0.0}}', help='A dict in the form \'{"":, "":}\' to ' "configure a backend. Values supported in are those included by " "`flwr.common.typing.ConfigsRecordValues`. " From 935e3337e774b6777ff3ebc9b3ccd2b99980dbfd Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 12:09:06 +0000 Subject: [PATCH 012/103] keep backend-config as json string --- src/py/flwr/server/app.py | 22 +++++++++---------- .../server/superlink/fleet/vce/vce_api.py | 9 +++++--- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index f686cf1f9bb..e11a58a19d2 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -17,7 +17,6 @@ import argparse import importlib.util -import json import sys import threading from logging import ERROR, INFO, WARN @@ -25,7 +24,7 @@ from pathlib import Path from signal import SIGINT, SIGTERM, signal from types import FrameType -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple import grpc @@ -408,7 +407,7 @@ def run_superlink() -> None: num_supernodes=args.num_supernodes, client_app_str=args.client_app, backend=args.backend, - backend_config=args.backend_config, + 
backend_config_json_str=args.backend_config, working_dir=args.dir, state_factory=state_factory, ) @@ -553,11 +552,11 @@ def _run_fleet_api_vce( num_supernodes: int, client_app_str: str, backend: str, - backend_config: Dict[str, Union[str, int, float]], + backend_config_json_str: str, working_dir: str, state_factory: StateFactory, ) -> None: - from flwr.server.superlink.fleet.vce.vce_api import start_vce + from .superlink.fleet.vce.vce_api import start_vce log(INFO, "Flower VCE: Starting Fleet API (VirtualClientEngine)") @@ -565,7 +564,7 @@ def _run_fleet_api_vce( num_supernodes=num_supernodes, client_app_str=client_app_str, backend_str=backend, - backend_config=backend_config, + backend_config_json_str=backend_config_json_str, state_factory=state_factory, working_dir=working_dir, ) @@ -811,11 +810,12 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: ) vce_group.add_argument( "--backend-config", - type=json.loads, - default='{"num_cpus":2, "num_gpus":0.0}', - help='A dict in the form \'{"":, "":}\' to ' - "configure a backend. Pay close attention to how the quotes and double quotes " - "are set.", + type=str, + default='{"client_resources": {"num_cpus":2, "num_gpus":0.0}}', + help='A JSON-like dict, e.g. \'{"":, "":}\' to ' + "configure a backend. Values supported in are those included by " + "`flwr.common.typing.ConfigsRecordValues`. 
" + "Pay close attention to how the quotes and double quotes are set.", ) parser.add_argument( "--dir", diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index a6160125f1e..88144b1c3c0 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -14,9 +14,9 @@ # ============================================================================== """Fleet VirtualClientEngine API.""" - +import json from logging import INFO -from typing import Dict, Union +from typing import Dict from flwr.client.clientapp import ClientApp, load_client_app from flwr.client.node_state import NodeState @@ -44,7 +44,7 @@ def start_vce( num_supernodes: int, client_app_str: str, backend_str: str, - backend_config: Dict[str, Union[str, int, float]], + backend_config_json_str: str, state_factory: StateFactory, working_dir: str, ) -> None: @@ -59,6 +59,9 @@ def start_vce( for node_id in nodes_mapping: node_states[node_id] = NodeState() + # Load backend config + _ = json.loads(backend_config_json_str) + log(INFO, "client_app_str = %s", client_app_str) def _load() -> ClientApp: From adfe198f60253b85fd6fbd2a572281d34c247880 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 13:10:56 +0000 Subject: [PATCH 013/103] added `RayBackend` and `SimpleActorPool` --- .../superlink/fleet/vce/backend/__init__.py | 7 + .../superlink/fleet/vce/backend/backend.py | 3 + .../superlink/fleet/vce/backend/raybackend.py | 148 ++++++++++++++++++ .../server/superlink/fleet/vce/vce_api.py | 18 ++- .../simulation/ray_transport/ray_actor.py | 73 ++++++++- 5 files changed, 246 insertions(+), 3 deletions(-) create mode 100644 src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py index 305cb32c16e..dd954907234 100644 --- 
a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py @@ -14,9 +14,16 @@ # ============================================================================== """VirtualClientEngine Backends.""" +from typing import Dict, Type + from .backend import Backend, BackendConfig +from .raybackend import RayBackend __all__ = [ "Backend", "BackendConfig", + "RayBackend", ] + +# mappy of supported backends +supported_backends: Dict[str, Type[Backend]] = {"ray": RayBackend} diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index 9b7cc18f3c0..4cc5432ce5f 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -29,6 +29,9 @@ class Backend(ABC): """Abstract base class for a Backend.""" + def __init__(self, backend_config: BackendConfig, work_dir: str) -> None: + """Construct a backend.""" + @abstractmethod async def build(self) -> None: """Build backend asynchronously. diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py new file mode 100644 index 00000000000..66511a16e0c --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -0,0 +1,148 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Ray backend for the Fleet API using the VCE.""" + +import asyncio +import pathlib +from logging import INFO +from typing import Callable, Dict, List, Tuple, Union + +from flwr.client.clientapp import ClientApp +from flwr.common.context import Context +from flwr.common.logger import log +from flwr.common.message import Message +from flwr.simulation.ray_transport.ray_actor import ( + BasicActorPool, + ClientAppActor, + init_ray, +) + +from .backend import Backend, BackendConfig + +ClienteResourcesDict = Dict[str, Union[int, float]] + + +class RayBackend(Backend): + """A backend that submits jobs to a `BasicActorPool`.""" + + def __init__( + self, + backend_config: BackendConfig, + work_dir: str, + ) -> None: + """Prepare RayBackend by initialising Ray and creating the ActorPool.""" + log(INFO, "Backend config: %s", backend_config) + + # Init ray and append working dir if needed + runtime_env = ( + self._configure_runtime_env(work_dir=work_dir) if work_dir else None + ) + init_ray(runtime_env=runtime_env) + + # Validate client resources + self.client_resources_key = "client_resources" + + # Create actor pool + client_resources = self._validate_client_resources(config=backend_config) + self.pool = BasicActorPool( + actor_type=ClientAppActor, + client_resources=client_resources, + ) + + def _configure_runtime_env(self, work_dir: str) -> Dict[str, Union[str, List[str]]]: + """Return list of files/subdirectories to exclude relateive to work_dir. + + Without this, Ray will push everything to the Ray Cluster. 
+ """ + runtime_env: Dict[str, Union[str, List[str]]] = {"working_dir": work_dir} + + if runtime_env: + excludes = [] + path = pathlib.Path(work_dir) + for p in path.rglob("*"): + # exclude files need to be relative to the working_dir + excludes.append(str(p.relative_to(path))) + runtime_env["excludes"] = excludes + + return runtime_env + + def _validate_client_resources(self, config: BackendConfig) -> ClienteResourcesDict: + client_resources_config = config.get(self.client_resources_key) + client_resources: ClienteResourcesDict = {} + valid_types = (int, float) + if client_resources_config: + for k, v in client_resources_config.items(): + assert isinstance(k, str), ValueError( + f"client resources keys are expected to be `str` but you used " + f"{type(k)} for `{k}`" + ) + assert isinstance(v, valid_types), ValueError( + f"client resources are expected to be of type {valid_types} but " + f"found `{type(v)}` for key `{k}`", + ) + client_resources[k] = v + + else: + client_resources = {"num_cpus": 2, "num_gpus": 0.0} + log( + INFO, + "`%s` not specified in backend config. Applying default setting: %s", + self.client_resources_key, + client_resources, + ) + + return client_resources + + @property + def num_workers(self) -> int: + """Return number of actors in pool.""" + return self.pool.num_actors + + def is_worker_idle(self) -> bool: + """Report whether the pool has idle actors.""" + return self.pool.is_actor_available() + + async def build(self) -> None: + """Build pool of Ray actors that this backend will submit jobs to.""" + await self.pool.add_actors_to_pool(self.pool.actors_capacity) + log(INFO, "Constructed ActorPool with: %i actors", self.pool.num_actors) + + async def process_message( + self, + app: Callable[[], ClientApp], + message: Message, + context: Context, + ) -> Tuple[Message, Context]: + """Run ClientApp that process a given message. + + Return output message and updated context. 
+ """ + node_id = message.metadata.dst_node_id + + # Submite a task to the pool + future = await self.pool.submit( + lambda a, a_fn, mssg, cid, state: a.run.remote(a_fn, mssg, cid, state), + (app, message, str(node_id), context), + ) + + await asyncio.wait([future]) + + # Fetch result + ( + out_mssg, + updated_context, + ) = await self.pool.fetch_result_and_return_actor_to_pool(future) + + return out_mssg, updated_context diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 88144b1c3c0..57e5fa77bb1 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -15,7 +15,7 @@ """Fleet VirtualClientEngine API.""" import json -from logging import INFO +from logging import ERROR, INFO from typing import Dict from flwr.client.clientapp import ClientApp, load_client_app @@ -23,6 +23,8 @@ from flwr.common.logger import log from flwr.server.superlink.state import StateFactory +from .backend import supported_backends + NodeToPartitionMapping = Dict[int, int] @@ -60,7 +62,19 @@ def start_vce( node_states[node_id] = NodeState() # Load backend config - _ = json.loads(backend_config_json_str) + backend_config = json.loads(backend_config_json_str) + + try: + backend_type = supported_backends[backend_str] + _ = backend_type(backend_config, work_dir=working_dir) + except KeyError as ex: + log( + ERROR, + "Backennd type `%s`, is not supported. 
Use any of %s", + backend_str, + list(supported_backends.keys()), + ) + raise ex log(INFO, "client_app_str = %s", client_app_str) diff --git a/src/py/flwr/simulation/ray_transport/ray_actor.py b/src/py/flwr/simulation/ray_transport/ray_actor.py index 70a220dc2a1..e2de8f8b947 100644 --- a/src/py/flwr/simulation/ray_transport/ray_actor.py +++ b/src/py/flwr/simulation/ray_transport/ray_actor.py @@ -14,7 +14,7 @@ # ============================================================================== """Ray-based Flower Actor and ActorPool implementation.""" - +import asyncio import threading import traceback from abc import ABC @@ -414,3 +414,74 @@ def get_client_result( # Fetch result belonging to the VirtualClient calling this method # Return both result from tasks and (potentially) updated run context return self._fetch_future_result(cid) + + +def init_ray(*args: Any, **kwargs: Any) -> None: + """Intialises Ray if not already initialised.""" + if not ray.is_initialized(): + ray.init(*args, **kwargs) + + +class BasicActorPool: + """A basic actor pool.""" + + def __init__( + self, + actor_type: Type[VirtualClientEngineActor], + client_resources: Dict[str, Union[int, float]], + ): + self.client_resources = client_resources + + # Queue of idle actors + self.pool: asyncio.Queue[Type[VirtualClientEngineActor]] = asyncio.Queue() + self.num_actors = 0 + + # A function that creates an actor + self.create_actor_fn = lambda: actor_type.options( # type: ignore + **client_resources + ).remote() + + # Figure out how many actors can be created given the cluster resources + # and the resources the user indicates each VirtualClient will need + self.actors_capacity = pool_size_from_resources(client_resources) + self._future_to_actor: Dict[Any, Type[VirtualClientEngineActor]] = {} + + def is_actor_available(self) -> bool: + """Return true if there is an idle actor.""" + return self.pool.qsize() > 0 + + async def add_actors_to_pool(self, num_actors: int) -> None: + """Add actors to the pool. 
+ + This method may be executed also if new resources are added to your Ray cluster + (e.g. you add a new node). + """ + for _ in range(num_actors): + await self.pool.put(self.create_actor_fn()) # type: ignore + self.num_actors += num_actors + + async def submit( + self, actor_fn: Any, job: Tuple[ClientAppFn, Message, str, Context] + ) -> Any: + """On idle actor, submit job and return future.""" + # Remove idle actor from pool + actor = await self.pool.get() + # Submit job to actor + app_fn, mssg, cid, context = job + future = actor_fn(actor, app_fn, mssg, cid, context) + # Keep track of future:actor (so we can fetch the actor upon job completion + # and add it back to the pool) + self._future_to_actor[future] = actor + return future + + async def fetch_result_and_return_actor_to_pool( + self, future: Any + ) -> Tuple[Message, Context]: + """Pull result given a future and add actor back to pool.""" + # Get actor that ran job + actor = self._future_to_actor.pop(future) + await self.pool.put(actor) + # Retrieve result for object store + # Instead of doing ray.get(future) we await it + _, out_mssg, updated_context = await future + return out_mssg, updated_context From d0bab9a59fd21f6298e9d94ebc081a25a5323435 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 14:35:38 +0000 Subject: [PATCH 014/103] complete VCE loop; works with `simulation-pytorch` example --- examples/simulation-pytorch/README.md | 21 ++-- examples/simulation-pytorch/sim.py | 49 +++++--- .../server/superlink/fleet/vce/vce_api.py | 119 +++++++++++++++++- 3 files changed, 157 insertions(+), 32 deletions(-) diff --git a/examples/simulation-pytorch/README.md b/examples/simulation-pytorch/README.md index 5ba5ec70dc3..f8e3c87770a 100644 --- a/examples/simulation-pytorch/README.md +++ b/examples/simulation-pytorch/README.md @@ -54,17 +54,13 @@ Write the command below in your terminal to install the dependencies according t pip install -r requirements.txt ``` -### Run Federated Learning Example 
+### Run with `start_simulation` -```bash -# You can run the example without activating your environemnt -poetry run python sim.py +Ensure you have activated your environment then: -# Or by first activating it -poetry shell +```bash # and then run the example python sim.py -# you can exit your environment by typing "exit" ``` You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 1xCPU core. For example: @@ -79,4 +75,15 @@ python sim.py --num_cpus=2 python sim.py --num_cpus=2 --num_gpus=0.2 ``` +### Run with `super-link` and `server-app` + +Ensure you have activated your environment, then: + +``` +flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_app + +# on a different terminal +flower-server-app sim:server_app --insecure +``` + Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. diff --git a/examples/simulation-pytorch/sim.py b/examples/simulation-pytorch/sim.py index 0a6ed8ebb9b..139cbf60bb8 100644 --- a/examples/simulation-pytorch/sim.py +++ b/examples/simulation-pytorch/sim.py @@ -29,9 +29,9 @@ default=0.0, help="Ratio of GPU memory to assign to a virtual client", ) -parser.add_argument("--num_rounds", type=int, default=10, help="Number of FL rounds.") NUM_CLIENTS = 100 +NUM_ROUNDS = 10 # Flower client, adapted from Pytorch quickstart example @@ -167,28 +167,37 @@ def evaluate( return evaluate +# Download MNIST dataset and partition it +mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) +centralized_testset = mnist_fds.load_full("test") + +# Configure the strategy +strategy = fl.server.strategy.FedAvg( + fraction_fit=0.1, # Sample 10% of available clients for training + fraction_evaluate=0.05, # Sample 5% of available clients for evaluation + min_available_clients=10, + on_fit_config_fn=fit_config, + 
evaluate_metrics_aggregation_fn=weighted_average, # Aggregate federated metrics + # evaluate_fn=get_evaluate_fn(centralized_testset), # Global evaluation function +) + + +# Run via `flower-client-app client:app` +client_app = fl.client.ClientApp( + client_fn=get_client_fn(mnist_fds), +) + + +server_app = fl.server.ServerApp( + config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), + strategy=strategy, +) + + def main(): # Parse input arguments args = parser.parse_args() - # Download MNIST dataset and partition it - mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) - centralized_testset = mnist_fds.load_full("test") - - # Configure the strategy - strategy = fl.server.strategy.FedAvg( - fraction_fit=0.1, # Sample 10% of available clients for training - fraction_evaluate=0.05, # Sample 5% of available clients for evaluation - min_fit_clients=10, # Never sample less than 10 clients for training - min_evaluate_clients=5, # Never sample less than 5 clients for evaluation - min_available_clients=int( - NUM_CLIENTS * 0.75 - ), # Wait until at least 75 clients are available - on_fit_config_fn=fit_config, - evaluate_metrics_aggregation_fn=weighted_average, # Aggregate federated metrics - evaluate_fn=get_evaluate_fn(centralized_testset), # Global evaluation function - ) - # Resources to be assigned to each virtual client client_resources = { "num_cpus": args.num_cpus, @@ -200,7 +209,7 @@ def main(): client_fn=get_client_fn(mnist_fds), num_clients=NUM_CLIENTS, client_resources=client_resources, - config=fl.server.ServerConfig(num_rounds=args.num_rounds), + config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), strategy=strategy, actor_kwargs={ "on_actor_init_fn": disable_progress_bar # disable tqdm on each actor/process spawning virtual clients diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 57e5fa77bb1..aacba9d1140 100644 --- 
a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -14,17 +14,23 @@ # ============================================================================== """Fleet VirtualClientEngine API.""" + +import asyncio import json -from logging import ERROR, INFO -from typing import Dict +import traceback +from logging import DEBUG, ERROR, INFO +from typing import Callable, Dict from flwr.client.clientapp import ClientApp, load_client_app from flwr.client.node_state import NodeState from flwr.common.logger import log +from flwr.common.serde import message_from_taskins, message_to_taskres +from flwr.proto.task_pb2 import TaskIns # pylint: disable=E0611 from flwr.server.superlink.state import StateFactory -from .backend import supported_backends +from .backend import Backend, supported_backends +TaskInsQueue = asyncio.Queue[TaskIns] NodeToPartitionMapping = Dict[int, int] @@ -41,6 +47,99 @@ def _register_nodes( return nodes_mapping +# pylint: disable=too-many-arguments +async def worker( + app: Callable[[], ClientApp], + queue: TaskInsQueue, + node_states: Dict[int, NodeState], + state_factory: StateFactory, + nodes_mapping: NodeToPartitionMapping, + backend: Backend, +) -> None: + """Get TaskIns from queue and pass it to an actor in the pool to execute it.""" + state = state_factory.state() + while True: + try: + task_ins = await queue.get() + node_id = task_ins.task.consumer.node_id + + # Register and retrive runstate + node_states[node_id].register_context(run_id=task_ins.run_id) + context = node_states[node_id].retrieve_context(run_id=task_ins.run_id) + + # Convert TaskIns to Message + message = message_from_taskins(task_ins) + # Replace node-id with data partition id + message.metadata.dst_node_id = nodes_mapping[node_id] + + # Let backend process message + out_mssg, updated_context = await backend.process_message( + app, message, context + ) + + # Update Context + node_states[node_id].update_context( + 
task_ins.run_id, context=updated_context + ) + + # Undo change node_id for partition choice + out_mssg.metadata._src_node_id = ( # pylint: disable=protected-access + task_ins.task.consumer.node_id + ) + # Convert to TaskRes + task_res = message_to_taskres(out_mssg) + # Store TaskRes in state + state.store_task_res(task_res) + + except Exception as ex: # pylint: disable=broad-exception-caught + # pylint: disable=fixme + # TODO: gen TaskRes with relevant error, add it to state_factory + log(ERROR, ex) + log(ERROR, traceback.format_exc()) + break + + +async def generate_pull_requests( + queue: TaskInsQueue, + state_factory: StateFactory, + nodes_mapping: NodeToPartitionMapping, +) -> None: + """Generate TaskIns and add it to the queue.""" + state = state_factory.state() + while True: + for node_id in nodes_mapping.keys(): + task_ins = state.get_task_ins(node_id=node_id, limit=1) + if task_ins: + await queue.put(task_ins[0]) + log(DEBUG, "TaskIns in queue: %i", queue.qsize()) + # pylint: disable=fixme + await asyncio.sleep(1.0) # TODO: revisit + + +async def run( + app: Callable[[], ClientApp], + backend: Backend, + nodes_mapping: NodeToPartitionMapping, + state_factory: StateFactory, + node_states: Dict[int, NodeState], +) -> None: + """Run the VCE async.""" + # pylint: disable=fixme + queue: TaskInsQueue = asyncio.Queue(64) # TODO: revisit + + # Build backend + await backend.build() + worker_tasks = [ + asyncio.create_task( + worker(app, queue, node_states, state_factory, nodes_mapping, backend) + ) + for _ in range(backend.num_workers) + ] + asyncio.create_task(generate_pull_requests(queue, state_factory, nodes_mapping)) + await queue.join() + await asyncio.gather(*worker_tasks) + + # pylint: disable=too-many-arguments,unused-argument def start_vce( num_supernodes: int, @@ -66,7 +165,7 @@ def start_vce( try: backend_type = supported_backends[backend_str] - _ = backend_type(backend_config, work_dir=working_dir) + backend = backend_type(backend_config, 
work_dir=working_dir) except KeyError as ex: log( ERROR, @@ -82,4 +181,14 @@ def _load() -> ClientApp: app: ClientApp = load_client_app(client_app_str) return app - # start backend + app = _load + + asyncio.run( + run( + app, + backend, + nodes_mapping, + state_factory, + node_states, + ) + ) From 5e0ee74dd505737abea8550125658a16badfc30c Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 14:47:19 +0000 Subject: [PATCH 015/103] fix exclude generation logic --- src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 66511a16e0c..5d552ea758b 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -73,7 +73,8 @@ def _configure_runtime_env(self, work_dir: str) -> Dict[str, Union[str, List[str path = pathlib.Path(work_dir) for p in path.rglob("*"): # exclude files need to be relative to the working_dir - excludes.append(str(p.relative_to(path))) + if p.is_file() and not str(p).endswith('.py'): + excludes.append(str(p.relative_to(path))) runtime_env["excludes"] = excludes return runtime_env From 4853813cc8462ed097477b3294271d9b8ef6e269 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 15:54:53 +0000 Subject: [PATCH 016/103] simulation-tf w/ Flower-next; updates pytorch example too --- examples/simulation-pytorch/README.md | 4 +- examples/simulation-pytorch/sim.py | 7 ++-- examples/simulation-tensorflow/README.md | 21 ++++++---- examples/simulation-tensorflow/sim.py | 52 ++++++++++++++---------- 4 files changed, 50 insertions(+), 34 deletions(-) diff --git a/examples/simulation-pytorch/README.md b/examples/simulation-pytorch/README.md index f8e3c87770a..33cef10cc03 100644 --- a/examples/simulation-pytorch/README.md +++ b/examples/simulation-pytorch/README.md @@ 
-54,7 +54,7 @@ Write the command below in your terminal to install the dependencies according t pip install -r requirements.txt ``` -### Run with `start_simulation` +### Run with `start_simulation()` Ensure you have activated your environment then: @@ -75,7 +75,7 @@ python sim.py --num_cpus=2 python sim.py --num_cpus=2 --num_gpus=0.2 ``` -### Run with `super-link` and `server-app` +### Run with Flower-Next (`super-link` and `server-app`) Ensure you have activated your environment, then: diff --git a/examples/simulation-pytorch/sim.py b/examples/simulation-pytorch/sim.py index 139cbf60bb8..84a00e3f092 100644 --- a/examples/simulation-pytorch/sim.py +++ b/examples/simulation-pytorch/sim.py @@ -178,16 +178,15 @@ def evaluate( min_available_clients=10, on_fit_config_fn=fit_config, evaluate_metrics_aggregation_fn=weighted_average, # Aggregate federated metrics - # evaluate_fn=get_evaluate_fn(centralized_testset), # Global evaluation function + evaluate_fn=get_evaluate_fn(centralized_testset), # Global evaluation function ) - -# Run via `flower-client-app client:app` +# ClientApp for Flower-Next client_app = fl.client.ClientApp( client_fn=get_client_fn(mnist_fds), ) - +# ServerApp for Flower-Next server_app = fl.server.ServerApp( config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), strategy=strategy, diff --git a/examples/simulation-tensorflow/README.md b/examples/simulation-tensorflow/README.md index 75be823db2e..900cdbebe52 100644 --- a/examples/simulation-tensorflow/README.md +++ b/examples/simulation-tensorflow/README.md @@ -53,17 +53,13 @@ Write the command below in your terminal to install the dependencies according t pip install -r requirements.txt ``` -### Run Federated Learning Example +### Run with `start_simulation()` -```bash -# You can run the example without activating your environemnt -poetry run python sim.py +Ensure you have activated your environment then: -# Or by first activating it -poetry shell +```bash # and then run the example python sim.py -# 
you can exit your environment by typing "exit" ``` You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 1xCPU core. For example: @@ -78,4 +74,15 @@ python sim.py --num_cpus=2 python sim.py --num_cpus=2 --num_gpus=0.2 ``` +### Run with Flower-Next (`super-link` and `server-app`) + +Ensure you have activated your environment, then: + +``` +flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_app + +# on a different terminal +flower-server-app sim:server_app --insecure +``` + Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. diff --git a/examples/simulation-tensorflow/sim.py b/examples/simulation-tensorflow/sim.py index 043c624a40a..5db708e3651 100644 --- a/examples/simulation-tensorflow/sim.py +++ b/examples/simulation-tensorflow/sim.py @@ -29,9 +29,9 @@ default=0.0, help="Ratio of GPU memory to assign to a virtual client", ) -parser.add_argument("--num_rounds", type=int, default=10, help="Number of FL rounds.") NUM_CLIENTS = 100 +NUM_ROUNDS = 10 VERBOSE = 0 @@ -129,29 +129,39 @@ def evaluate( return evaluate +# Download MNIST dataset and partition it +mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) +# Get the whole test set for centralised evaluation +centralized_testset = mnist_fds.load_full("test").to_tf_dataset( + columns="image", label_cols="label", batch_size=64 +) + +# Create FedAvg strategy +strategy = fl.server.strategy.FedAvg( + fraction_fit=0.1, # Sample 10% of available clients for training + fraction_evaluate=0.05, # Sample 5% of available clients for evaluation + min_fit_clients=10, # Never sample less than 10 clients for training + evaluate_metrics_aggregation_fn=weighted_average, # aggregates federated metrics + evaluate_fn=get_evaluate_fn(centralized_testset), # global evaluation function +) + + +# ClientApp for 
Flower-Next +client_app = fl.client.ClientApp( + client_fn=get_client_fn(mnist_fds), +) + +# ServerApp for Flower-Next +server_app = fl.server.ServerApp( + config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), + strategy=strategy, +) + + def main() -> None: # Parse input arguments args = parser.parse_args() - # Download MNIST dataset and partition it - mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) - # Get the whole test set for centralised evaluation - centralized_testset = mnist_fds.load_full("test").to_tf_dataset( - columns="image", label_cols="label", batch_size=64 - ) - - # Create FedAvg strategy - strategy = fl.server.strategy.FedAvg( - fraction_fit=0.1, # Sample 10% of available clients for training - fraction_evaluate=0.05, # Sample 5% of available clients for evaluation - min_fit_clients=10, # Never sample less than 10 clients for training - min_evaluate_clients=5, # Never sample less than 5 clients for evaluation - min_available_clients=int( - NUM_CLIENTS * 0.75 - ), # Wait until at least 75 clients are available - evaluate_metrics_aggregation_fn=weighted_average, # aggregates federated metrics - evaluate_fn=get_evaluate_fn(centralized_testset), # global evaluation function - ) # With a dictionary, you tell Flower's VirtualClientEngine that each # client needs exclusive access to these many resources in order to run @@ -164,7 +174,7 @@ def main() -> None: fl.simulation.start_simulation( client_fn=get_client_fn(mnist_fds), num_clients=NUM_CLIENTS, - config=fl.server.ServerConfig(num_rounds=args.num_rounds), + config=fl.server.ServerConfig(NUM_ROUNDS), strategy=strategy, client_resources=client_resources, actor_kwargs={ From 31787cf508b933d9d6f4d2e03eb76e3df4827760 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 16:33:12 +0000 Subject: [PATCH 017/103] format --- examples/simulation-tensorflow/sim.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/simulation-tensorflow/sim.py 
b/examples/simulation-tensorflow/sim.py index 5db708e3651..6f28eaee170 100644 --- a/examples/simulation-tensorflow/sim.py +++ b/examples/simulation-tensorflow/sim.py @@ -162,7 +162,6 @@ def main() -> None: # Parse input arguments args = parser.parse_args() - # With a dictionary, you tell Flower's VirtualClientEngine that each # client needs exclusive access to these many resources in order to run client_resources = { From 8522022c8395eafdbf8229411c9baddcaa280cbf Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 16:35:11 +0000 Subject: [PATCH 018/103] passing actor init kwargs --- .../flwr/server/superlink/fleet/vce/backend/raybackend.py | 4 +++- src/py/flwr/simulation/ray_transport/ray_actor.py | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 5d552ea758b..741cdee93a7 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -55,10 +55,12 @@ def __init__( self.client_resources_key = "client_resources" # Create actor pool + actor_kwargs = backend_config.get("actor_kwargs", {}) client_resources = self._validate_client_resources(config=backend_config) self.pool = BasicActorPool( actor_type=ClientAppActor, client_resources=client_resources, + actor_kwargs=actor_kwargs, ) def _configure_runtime_env(self, work_dir: str) -> Dict[str, Union[str, List[str]]]: @@ -73,7 +75,7 @@ def _configure_runtime_env(self, work_dir: str) -> Dict[str, Union[str, List[str path = pathlib.Path(work_dir) for p in path.rglob("*"): # exclude files need to be relative to the working_dir - if p.is_file() and not str(p).endswith('.py'): + if p.is_file() and not str(p).endswith(".py"): excludes.append(str(p.relative_to(path))) runtime_env["excludes"] = excludes diff --git a/src/py/flwr/simulation/ray_transport/ray_actor.py 
b/src/py/flwr/simulation/ray_transport/ray_actor.py index e2de8f8b947..b48e448b681 100644 --- a/src/py/flwr/simulation/ray_transport/ray_actor.py +++ b/src/py/flwr/simulation/ray_transport/ray_actor.py @@ -429,6 +429,7 @@ def __init__( self, actor_type: Type[VirtualClientEngineActor], client_resources: Dict[str, Union[int, float]], + actor_kwargs: Dict[str, Any], ): self.client_resources = client_resources @@ -436,10 +437,13 @@ def __init__( self.pool: asyncio.Queue[Type[VirtualClientEngineActor]] = asyncio.Queue() self.num_actors = 0 + # Resolve arguments to pass during actor init + actor_args = {} if actor_kwargs is None else actor_kwargs + # A function that creates an actor self.create_actor_fn = lambda: actor_type.options( # type: ignore **client_resources - ).remote() + ).remote(**actor_args) # Figure out how many actors can be created given the cluster resources # and the resources the user indicates each VirtualClient will need From 92005133226bf8809e7ab35642b5d7812883b34d Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 17:04:31 +0000 Subject: [PATCH 019/103] updated examples --- examples/simulation-pytorch/README.md | 20 ++++++++++++++++---- examples/simulation-tensorflow/README.md | 23 ++++++++++++++++++----- examples/simulation-tensorflow/sim.py | 1 + 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/examples/simulation-pytorch/README.md b/examples/simulation-pytorch/README.md index 33cef10cc03..8b21e845ddc 100644 --- a/examples/simulation-pytorch/README.md +++ b/examples/simulation-pytorch/README.md @@ -67,12 +67,12 @@ You can adjust the CPU/GPU resources you assign to each of your virtual clients. 
```bash # Will assign 2xCPUs to each client -python sim.py --num_cpus=2 +python sim.py --num_cpus=4 -# Will assign 2xCPUs and 20% of the GPU's VRAM to each client -# This means that you can have 5 concurrent clients on each GPU +# Will assign 4xCPUs and 25% of the GPU's VRAM to each client +# This means that you can have 4 concurrent clients on each GPU # (assuming you have enough CPUs) -python sim.py --num_cpus=2 --num_gpus=0.2 +python sim.py --num_cpus=4 --num_gpus=0.25 ``` ### Run with Flower-Next (`super-link` and `server-app`) @@ -86,4 +86,16 @@ flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_a flower-server-app sim:server_app --insecure ``` +You can change the default resources assigned to each `ClientApp` by means the `--backend-config` argument: + +```bash +# Tells the VCE to resever 4x CPUs and 25% of available VRAM for each ClientApp +flower-superlink --insecure --vce --num-supernodes 100 \ + --client-app sim:client_app \ + --backend-config='{"client_resources": {"num_cpus":4, "num_gpus":0.25}}' + +# Then you can launch the `flower-server-app` command as shown earlier. +``` + + Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. diff --git a/examples/simulation-tensorflow/README.md b/examples/simulation-tensorflow/README.md index 900cdbebe52..7e0225ad524 100644 --- a/examples/simulation-tensorflow/README.md +++ b/examples/simulation-tensorflow/README.md @@ -66,13 +66,14 @@ You can adjust the CPU/GPU resources you assign to each of your virtual clients. 
```bash # Will assign 2xCPUs to each client -python sim.py --num_cpus=2 +python sim.py --num_cpus=4 -# Will assign 2xCPUs and 20% of the GPU's VRAM to each client -# This means that you can have 5 concurrent clients on each GPU +# Will assign 4xCPUs and 25% of the GPU's VRAM to each client +# This means that you can have 4 concurrent clients on each GPU # (assuming you have enough CPUs) -python sim.py --num_cpus=2 --num_gpus=0.2 +python sim.py --num_cpus=4 --num_gpus=0.25 ``` +Because TensorFlow by default maps all the available VRAM, we need to [enable GPU memory growth](https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth), see how it is done in the example (`sim.py`) for both the "main" process (where the server/strategy runs) and for the clients (using the `actor_kwargs`) ### Run with Flower-Next (`super-link` and `server-app`) @@ -85,4 +86,16 @@ flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_a flower-server-app sim:server_app --insecure ``` -Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. +You can change the default resources assigned to each `ClientApp` by means the `--backend-config` argument. Note that we need to flag that the backend is going to use `TensorFlow`. In this way it will enable GPU memory growth. + +```bash +# Tells the VCE to resever 4x CPUs and 25% of available VRAM for each ClientApp +flower-superlink --insecure --vce --num-supernodes 100 \ + --client-app sim:client_app \ + --backend-config='{"client_resources": {"num_cpus":4, "num_gpus":0.25}, "tensorflow": 1}' + +# Then you can launch the `flower-server-app` command as shown earlier. +``` + + +Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. 
diff --git a/examples/simulation-tensorflow/sim.py b/examples/simulation-tensorflow/sim.py index 6f28eaee170..dbba71ac2cf 100644 --- a/examples/simulation-tensorflow/sim.py +++ b/examples/simulation-tensorflow/sim.py @@ -152,6 +152,7 @@ def evaluate( ) # ServerApp for Flower-Next +# TODO: Unclear how to enable GPU growth for the ServerApp server_app = fl.server.ServerApp( config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), strategy=strategy, From d8935b35eb9efe36328e3bd4043eea9f14759b64 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 17:05:06 +0000 Subject: [PATCH 020/103] auto enable GPU growth if 'tensorflow' passed --- .../flwr/server/superlink/fleet/vce/backend/raybackend.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 741cdee93a7..1710ff7d937 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -28,6 +28,8 @@ ClientAppActor, init_ray, ) +from flwr.simulation.ray_transport.utils import enable_tf_gpu_growth + from .backend import Backend, BackendConfig @@ -55,7 +57,9 @@ def __init__( self.client_resources_key = "client_resources" # Create actor pool - actor_kwargs = backend_config.get("actor_kwargs", {}) + use_tf = backend_config.get("tensorflow", False) + actor_kwargs = {"on_actor_init_fn": enable_tf_gpu_growth } if use_tf else {} + client_resources = self._validate_client_resources(config=backend_config) self.pool = BasicActorPool( actor_type=ClientAppActor, From b108be22c97d1ec21f0111ca125eb608f22b3828 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 17:43:27 +0000 Subject: [PATCH 021/103] return to default 1xCPU for virtual client --- examples/simulation-pytorch/README.md | 10 +++++----- examples/simulation-tensorflow/README.md | 14 +++++++------- src/py/flwr/server/app.py | 2 +- 3 
files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/simulation-pytorch/README.md b/examples/simulation-pytorch/README.md index 8b21e845ddc..b95dcdceb09 100644 --- a/examples/simulation-pytorch/README.md +++ b/examples/simulation-pytorch/README.md @@ -67,12 +67,12 @@ You can adjust the CPU/GPU resources you assign to each of your virtual clients. ```bash # Will assign 2xCPUs to each client -python sim.py --num_cpus=4 +python sim.py --num_cpus=2 -# Will assign 4xCPUs and 25% of the GPU's VRAM to each client +# Will assign 2xCPUs and 25% of the GPU's VRAM to each client # This means that you can have 4 concurrent clients on each GPU # (assuming you have enough CPUs) -python sim.py --num_cpus=4 --num_gpus=0.25 +python sim.py --num_cpus=2 --num_gpus=0.25 ``` ### Run with Flower-Next (`super-link` and `server-app`) @@ -89,10 +89,10 @@ flower-server-app sim:server_app --insecure You can change the default resources assigned to each `ClientApp` by means the `--backend-config` argument: ```bash -# Tells the VCE to resever 4x CPUs and 25% of available VRAM for each ClientApp +# Tells the VCE to resever 2x CPUs and 25% of available VRAM for each ClientApp flower-superlink --insecure --vce --num-supernodes 100 \ --client-app sim:client_app \ - --backend-config='{"client_resources": {"num_cpus":4, "num_gpus":0.25}}' + --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}}' # Then you can launch the `flower-server-app` command as shown earlier. ``` diff --git a/examples/simulation-tensorflow/README.md b/examples/simulation-tensorflow/README.md index 7e0225ad524..8718987eb6e 100644 --- a/examples/simulation-tensorflow/README.md +++ b/examples/simulation-tensorflow/README.md @@ -62,16 +62,16 @@ Ensure you have activated your environment then: python sim.py ``` -You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 1xCPU core. 
For example: +You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 2xCPU core. For example: ```bash # Will assign 2xCPUs to each client -python sim.py --num_cpus=4 +python sim.py --num_cpus=2 -# Will assign 4xCPUs and 25% of the GPU's VRAM to each client +# Will assign 2xCPUs and 25% of the GPU's VRAM to each client # This means that you can have 4 concurrent clients on each GPU # (assuming you have enough CPUs) -python sim.py --num_cpus=4 --num_gpus=0.25 +python sim.py --num_cpus=2 --num_gpus=0.25 ``` Because TensorFlow by default maps all the available VRAM, we need to [enable GPU memory growth](https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth), see how it is done in the example (`sim.py`) for both the "main" process (where the server/strategy runs) and for the clients (using the `actor_kwargs`) @@ -86,13 +86,13 @@ flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_a flower-server-app sim:server_app --insecure ``` -You can change the default resources assigned to each `ClientApp` by means the `--backend-config` argument. Note that we need to flag that the backend is going to use `TensorFlow`. In this way it will enable GPU memory growth. +You can change the default resources assigned to each `ClientApp` using the `--backend-config` argument. Note that we need to flag that the backend is going to use `TensorFlow`. In this way, it will enable GPU memory growth. 
```bash -# Tells the VCE to resever 4x CPUs and 25% of available VRAM for each ClientApp +# Tells the VCE to resever 2x CPUs and 25% of available VRAM for each ClientApp flower-superlink --insecure --vce --num-supernodes 100 \ --client-app sim:client_app \ - --backend-config='{"client_resources": {"num_cpus":4, "num_gpus":0.25}, "tensorflow": 1}' + --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}, "tensorflow": 1}' # Then you can launch the `flower-server-app` command as shown earlier. ``` diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index e11a58a19d2..8eb5a96bf42 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -811,7 +811,7 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: vce_group.add_argument( "--backend-config", type=str, - default='{"client_resources": {"num_cpus":2, "num_gpus":0.0}}', + default='{"client_resources": {"num_cpus":1, "num_gpus":0.0}}', help='A JSON-like dict, e.g. \'{"":, "":}\' to ' "configure a backend. Values supported in are those included by " "`flwr.common.typing.ConfigsRecordValues`. 
" From 0e02b05cd60b6f364182321f9f0ef8734334ea4e Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 18:50:58 +0000 Subject: [PATCH 022/103] moved import --- src/py/flwr/server/app.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index e11a58a19d2..0a24c1e36e4 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -55,6 +55,7 @@ start_grpc_server, ) from .superlink.fleet.grpc_rere.fleet_servicer import FleetServicer +from .superlink.fleet.vce.vce_api import start_vce from .superlink.state import StateFactory ADDRESS_DRIVER_API = "0.0.0.0:9091" @@ -547,7 +548,7 @@ def _run_fleet_api_grpc_rere( return fleet_grpc_server -# pylint: disable=import-outside-toplevel,too-many-arguments +# pylint: disable=too-many-arguments def _run_fleet_api_vce( num_supernodes: int, client_app_str: str, @@ -556,8 +557,6 @@ def _run_fleet_api_vce( working_dir: str, state_factory: StateFactory, ) -> None: - from .superlink.fleet.vce.vce_api import start_vce - log(INFO, "Flower VCE: Starting Fleet API (VirtualClientEngine)") start_vce( From fd67f22d09e6097667dbe56e22c53e2d7c96fc01 Mon Sep 17 00:00:00 2001 From: Javier Date: Thu, 22 Feb 2024 19:29:45 +0000 Subject: [PATCH 023/103] Apply suggestions from code review Co-authored-by: Daniel J. 
Beutel --- src/py/flwr/server/app.py | 7 ++++--- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index 0a24c1e36e4..84eca40e995 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -799,13 +799,13 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: vce_group.add_argument( "--num-supernodes", type=int, - help="Number of SuperNodes to register with the SuperLink.", + help="Number of simulated SuperNodes.", ) vce_group.add_argument( "--backend", default="ray", type=str, - help="Simulation Backend that processes a ClientApp.", + help="Simulation backend that executes the ClientApp.", ) vce_group.add_argument( "--backend-config", @@ -819,6 +819,7 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: parser.add_argument( "--dir", default="", - help="Add a specified directory to the PYTHONPATH." + help="Add specified directory to the PYTHONPATH and load" + "ClientApp from there." 
" Default: current working directory.", ) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 88144b1c3c0..9357693a0e8 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -29,7 +29,7 @@ def _register_nodes( num_nodes: int, state_factory: StateFactory ) -> NodeToPartitionMapping: - """Registre nodes with the StateFactory and create node-id:partition-id mapping.""" + """Register nodes with the StateFactory and create node-id:partition-id mapping.""" nodes_mapping: NodeToPartitionMapping = {} state = state_factory.state() for i in range(num_nodes): From cf004d8ed4bbb520a88b05bc0ce888809149dd17 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 19:43:23 +0000 Subject: [PATCH 024/103] renamed vars; exporting --- src/py/flwr/server/app.py | 27 +++++++++---------- .../server/superlink/fleet/vce/__init__.py | 6 +++++ .../server/superlink/fleet/vce/vce_api.py | 12 ++++----- 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index 84eca40e995..c8cdef9ff32 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -55,7 +55,7 @@ start_grpc_server, ) from .superlink.fleet.grpc_rere.fleet_servicer import FleetServicer -from .superlink.fleet.vce.vce_api import start_vce +from .superlink.fleet.vce import start_vce from .superlink.state import StateFactory ADDRESS_DRIVER_API = "0.0.0.0:9091" @@ -406,9 +406,9 @@ def run_superlink() -> None: elif args.fleet_api_type == TRANSPORT_TYPE_VCE: _run_fleet_api_vce( num_supernodes=args.num_supernodes, - client_app_str=args.client_app, - backend=args.backend, - backend_config_json_str=args.backend_config, + client_app_module_name=args.client_app, + backend_name=args.backend, + backend_config_json_stream=args.backend_config, working_dir=args.dir, state_factory=state_factory, ) @@ -551,9 +551,9 @@ def 
_run_fleet_api_grpc_rere( # pylint: disable=too-many-arguments def _run_fleet_api_vce( num_supernodes: int, - client_app_str: str, - backend: str, - backend_config_json_str: str, + client_app_module_name: str, + backend_name: str, + backend_config_json_stream: str, working_dir: str, state_factory: StateFactory, ) -> None: @@ -561,9 +561,9 @@ def _run_fleet_api_vce( start_vce( num_supernodes=num_supernodes, - client_app_str=client_app_str, - backend_str=backend, - backend_config_json_str=backend_config_json_str, + client_app_module_name=client_app_module_name, + backend_name=backend_name, + backend_config_json_stream=backend_config_json_stream, state_factory=state_factory, working_dir=working_dir, ) @@ -810,11 +810,10 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: vce_group.add_argument( "--backend-config", type=str, - default='{"client_resources": {"num_cpus":2, "num_gpus":0.0}}', - help='A JSON-like dict, e.g. \'{"":, "":}\' to ' + default='{"client_resources": {"num_cpus":1, "num_gpus":0.0}, "tensorflow": 0}', + help='A JSON formatted stream, e.g \'{"":, "":}\' to ' "configure a backend. Values supported in are those included by " - "`flwr.common.typing.ConfigsRecordValues`. " - "Pay close attention to how the quotes and double quotes are set.", + "`flwr.common.typing.ConfigsRecordValues`. ", ) parser.add_argument( "--dir", diff --git a/src/py/flwr/server/superlink/fleet/vce/__init__.py b/src/py/flwr/server/superlink/fleet/vce/__init__.py index 563f77595e1..72cd76f7376 100644 --- a/src/py/flwr/server/superlink/fleet/vce/__init__.py +++ b/src/py/flwr/server/superlink/fleet/vce/__init__.py @@ -13,3 +13,9 @@ # limitations under the License. 
# ============================================================================== """Fleet VirtualClientEngine side.""" + +from .vce_api import start_vce + +__all__ = [ + "start_vce", +] diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 9357693a0e8..8c76b401b91 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -42,9 +42,9 @@ def _register_nodes( # pylint: disable=too-many-arguments,unused-argument def start_vce( num_supernodes: int, - client_app_str: str, - backend_str: str, - backend_config_json_str: str, + client_app_module_name: str, + backend_name: str, + backend_config_json_stream: str, state_factory: StateFactory, working_dir: str, ) -> None: @@ -60,12 +60,12 @@ def start_vce( node_states[node_id] = NodeState() # Load backend config - _ = json.loads(backend_config_json_str) + _ = json.loads(backend_config_json_stream) - log(INFO, "client_app_str = %s", client_app_str) + log(INFO, "client_app_str = %s", client_app_module_name) def _load() -> ClientApp: - app: ClientApp = load_client_app(client_app_str) + app: ClientApp = load_client_app(client_app_module_name) return app # start backend From a521b402b6d45a542b32c04845078bf2bcc45cc8 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 20:58:01 +0000 Subject: [PATCH 025/103] moved --- src/py/flwr/server/app.py | 47 ++------------------------------ src/py/flwr/server/compat/app.py | 3 +- src/py/flwr/server/server.py | 47 ++++++++++++++++++++++++++++++-- src/py/flwr/simulation/app.py | 3 +- 4 files changed, 50 insertions(+), 50 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index c8cdef9ff32..ac7a8339b31 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -44,11 +44,11 @@ add_FleetServicer_to_server, ) -from .client_manager import ClientManager, SimpleClientManager +from .client_manager import ClientManager 
from .history import History -from .server import Server +from .server import Server, init_defaults, run_fl from .server_config import ServerConfig -from .strategy import FedAvg, Strategy +from .strategy import Strategy from .superlink.driver.driver_servicer import DriverServicer from .superlink.fleet.grpc_bidi.grpc_server import ( generic_create_grpc_server, @@ -185,47 +185,6 @@ def start_server( # pylint: disable=too-many-arguments,too-many-locals return hist -def init_defaults( - server: Optional[Server], - config: Optional[ServerConfig], - strategy: Optional[Strategy], - client_manager: Optional[ClientManager], -) -> Tuple[Server, ServerConfig]: - """Create server instance if none was given.""" - if server is None: - if client_manager is None: - client_manager = SimpleClientManager() - if strategy is None: - strategy = FedAvg() - server = Server(client_manager=client_manager, strategy=strategy) - elif strategy is not None: - log(WARN, "Both server and strategy were provided, ignoring strategy") - - # Set default config values - if config is None: - config = ServerConfig() - - return server, config - - -def run_fl( - server: Server, - config: ServerConfig, -) -> History: - """Train a model on the given server and return the History object.""" - hist = server.fit(num_rounds=config.num_rounds, timeout=config.round_timeout) - log(INFO, "app_fit: losses_distributed %s", str(hist.losses_distributed)) - log(INFO, "app_fit: metrics_distributed_fit %s", str(hist.metrics_distributed_fit)) - log(INFO, "app_fit: metrics_distributed %s", str(hist.metrics_distributed)) - log(INFO, "app_fit: losses_centralized %s", str(hist.losses_centralized)) - log(INFO, "app_fit: metrics_centralized %s", str(hist.metrics_centralized)) - - # Graceful shutdown - server.disconnect_all_clients(timeout=config.round_timeout) - - return hist - - def run_driver_api() -> None: """Run Flower server (Driver API).""" log(INFO, "Starting Flower server (Driver API)") diff --git 
a/src/py/flwr/server/compat/app.py b/src/py/flwr/server/compat/app.py index c0255391b88..3df779ebf99 100644 --- a/src/py/flwr/server/compat/app.py +++ b/src/py/flwr/server/compat/app.py @@ -26,10 +26,9 @@ from flwr.common.address import parse_address from flwr.common.logger import log, warn_deprecated_feature from flwr.proto import driver_pb2 # pylint: disable=E0611 -from flwr.server.app import init_defaults, run_fl from flwr.server.client_manager import ClientManager from flwr.server.history import History -from flwr.server.server import Server +from flwr.server.server import Server, init_defaults, run_fl from flwr.server.server_config import ServerConfig from flwr.server.strategy import Strategy diff --git a/src/py/flwr/server/server.py b/src/py/flwr/server/server.py index cf3a4d9aa07..ea62587b7de 100644 --- a/src/py/flwr/server/server.py +++ b/src/py/flwr/server/server.py @@ -17,7 +17,7 @@ import concurrent.futures import timeit -from logging import DEBUG, INFO +from logging import DEBUG, INFO, WARN from typing import Dict, List, Optional, Tuple, Union from flwr.common import ( @@ -33,11 +33,13 @@ ) from flwr.common.logger import log from flwr.common.typing import GetParametersIns -from flwr.server.client_manager import ClientManager +from flwr.server.client_manager import ClientManager, SimpleClientManager from flwr.server.client_proxy import ClientProxy from flwr.server.history import History from flwr.server.strategy import FedAvg, Strategy +from .server_config import ServerConfig + FitResultsAndFailures = Tuple[ List[Tuple[ClientProxy, FitRes]], List[Union[Tuple[ClientProxy, FitRes], BaseException]], @@ -441,3 +443,44 @@ def _handle_finished_future_after_evaluate( # Not successful, client returned a result where the status code is not OK failures.append(result) + + +def init_defaults( + server: Optional[Server], + config: Optional[ServerConfig], + strategy: Optional[Strategy], + client_manager: Optional[ClientManager], +) -> Tuple[Server, ServerConfig]: + 
"""Create server instance if none was given.""" + if server is None: + if client_manager is None: + client_manager = SimpleClientManager() + if strategy is None: + strategy = FedAvg() + server = Server(client_manager=client_manager, strategy=strategy) + elif strategy is not None: + log(WARN, "Both server and strategy were provided, ignoring strategy") + + # Set default config values + if config is None: + config = ServerConfig() + + return server, config + + +def run_fl( + server: Server, + config: ServerConfig, +) -> History: + """Train a model on the given server and return the History object.""" + hist = server.fit(num_rounds=config.num_rounds, timeout=config.round_timeout) + log(INFO, "app_fit: losses_distributed %s", str(hist.losses_distributed)) + log(INFO, "app_fit: metrics_distributed_fit %s", str(hist.metrics_distributed_fit)) + log(INFO, "app_fit: metrics_distributed %s", str(hist.metrics_distributed)) + log(INFO, "app_fit: losses_centralized %s", str(hist.losses_centralized)) + log(INFO, "app_fit: metrics_centralized %s", str(hist.metrics_centralized)) + + # Graceful shutdown + server.disconnect_all_clients(timeout=config.round_timeout) + + return hist diff --git a/src/py/flwr/simulation/app.py b/src/py/flwr/simulation/app.py index f3ffe632bbe..ff18f37664b 100644 --- a/src/py/flwr/simulation/app.py +++ b/src/py/flwr/simulation/app.py @@ -28,10 +28,9 @@ from flwr.client import ClientFn from flwr.common import EventType, event from flwr.common.logger import log -from flwr.server import Server -from flwr.server.app import init_defaults, run_fl from flwr.server.client_manager import ClientManager from flwr.server.history import History +from flwr.server.server import Server, init_defaults, run_fl from flwr.server.server_config import ServerConfig from flwr.server.strategy import Strategy from flwr.simulation.ray_transport.ray_actor import ( From 443551fb65721739fcfffe96760a7d2c4b3814cf Mon Sep 17 00:00:00 2001 From: jafermarq Date: Fri, 23 Feb 2024 00:31:28 
+0000 Subject: [PATCH 026/103] revisited imports readiness for chosen backend --- .../superlink/fleet/vce/backend/__init__.py | 29 +++++++++++++++---- .../superlink/fleet/vce/backend/raybackend.py | 1 + .../server/superlink/fleet/vce/vce_api.py | 9 ++++-- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py index dd954907234..80e93f74e4b 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py @@ -14,16 +14,35 @@ # ============================================================================== """VirtualClientEngine Backends.""" +import importlib from typing import Dict, Type from .backend import Backend, BackendConfig -from .raybackend import RayBackend + +is_ray_installed = importlib.util.find_spec("ray") is not None + +# mapping of supported backends +supported_backends: Dict[str, Type[Backend]] = {} + +# To log backend-specific error message when chosen backend isn't available +error_messages_backends: Dict[str, str] = {} + +if is_ray_installed: + from .raybackend import RayBackend + + supported_backends["ray"] = RayBackend +else: + error_messages_backends[ + "ray" + ] = """Unable to import module `ray`. 
+ + To install the necessary dependencies, install `flwr` with the `simulation` extra: + + pip install -U flwr["simulation"] + """ + __all__ = [ "Backend", "BackendConfig", - "RayBackend", ] - -# mappy of supported backends -supported_backends: Dict[str, Type[Backend]] = {"ray": RayBackend} diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 741cdee93a7..ce66300c361 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -43,6 +43,7 @@ def __init__( work_dir: str, ) -> None: """Prepare RayBackend by initialising Ray and creating the ActorPool.""" + log(INFO, "Initialising: %s", self.__class__.__name__) log(INFO, "Backend config: %s", backend_config) # Init ray and append working dir if needed diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index ed14cb76769..74ddbc1d279 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -23,7 +23,7 @@ from flwr.common.logger import log from flwr.server.superlink.state import StateFactory -from .backend import supported_backends +from .backend import error_messages_backends, supported_backends NodeToPartitionMapping = Dict[int, int] @@ -62,6 +62,7 @@ def start_vce( node_states[node_id] = NodeState() # Load backend config + log(INFO, "Supported backends: %s", list(supported_backends.keys())) backend_config = json.loads(backend_config_json_stream) try: @@ -70,10 +71,14 @@ def start_vce( except KeyError as ex: log( ERROR, - "Backennd type `%s`, is not supported. Use any of %s", + "Backend `%s`, is not supported. 
Use any of %s or add support " + "for a new backend.", backend_name, list(supported_backends.keys()), ) + if backend_name in error_messages_backends: + log(ERROR, error_messages_backends[backend_name]) + raise ex log(INFO, "client_app_str = %s", client_app_module_name) From 79f363e73014c62e694a247739b8927c821dae0a Mon Sep 17 00:00:00 2001 From: Javier Date: Fri, 23 Feb 2024 16:02:34 +0000 Subject: [PATCH 027/103] Apply suggestions from code review Co-authored-by: Daniel J. Beutel --- .../flwr/server/superlink/fleet/vce/backend/__init__.py | 2 +- .../flwr/server/superlink/fleet/vce/backend/raybackend.py | 8 ++++---- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py index 80e93f74e4b..8c351743dbd 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py @@ -21,7 +21,7 @@ is_ray_installed = importlib.util.find_spec("ray") is not None -# mapping of supported backends +# Mapping of supported backends supported_backends: Dict[str, Type[Backend]] = {} # To log backend-specific error message when chosen backend isn't available diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index ce66300c361..b1099ab78f8 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Ray backend for the Fleet API using the VCE.""" +"""Ray backend for the Fleet API using the Simulation Engine.""" import asyncio import pathlib @@ -31,7 +31,7 @@ from .backend import Backend, BackendConfig -ClienteResourcesDict = Dict[str, Union[int, float]] +ClientResourcesDict = Dict[str, Union[int, float]] class RayBackend(Backend): @@ -65,7 +65,7 @@ def __init__( ) def _configure_runtime_env(self, work_dir: str) -> Dict[str, Union[str, List[str]]]: - """Return list of files/subdirectories to exclude relateive to work_dir. + """Return list of files/subdirectories to exclude relative to work_dir. Without this, Ray will push everything to the Ray Cluster. """ @@ -75,7 +75,7 @@ def _configure_runtime_env(self, work_dir: str) -> Dict[str, Union[str, List[str excludes = [] path = pathlib.Path(work_dir) for p in path.rglob("*"): - # exclude files need to be relative to the working_dir + # Exclude files need to be relative to the working_dir if p.is_file() and not str(p).endswith(".py"): excludes.append(str(p.relative_to(path))) runtime_env["excludes"] = excludes diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 74ddbc1d279..c91bae9ddab 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -81,7 +81,7 @@ def start_vce( raise ex - log(INFO, "client_app_str = %s", client_app_module_name) + log(INFO, "client_app_module_name = %s", client_app_module_name) def _load() -> ClientApp: app: ClientApp = load_client_app(client_app_module_name) From 12fa44c03c5724ab3a03b1496ca70b82ba69034a Mon Sep 17 00:00:00 2001 From: jafermarq Date: Fri, 23 Feb 2024 16:54:35 +0000 Subject: [PATCH 028/103] remove suprefluous if --- .../superlink/fleet/vce/backend/raybackend.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git 
a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index b1099ab78f8..f223d8ba9cb 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -71,20 +71,19 @@ def _configure_runtime_env(self, work_dir: str) -> Dict[str, Union[str, List[str """ runtime_env: Dict[str, Union[str, List[str]]] = {"working_dir": work_dir} - if runtime_env: - excludes = [] - path = pathlib.Path(work_dir) - for p in path.rglob("*"): - # Exclude files need to be relative to the working_dir - if p.is_file() and not str(p).endswith(".py"): - excludes.append(str(p.relative_to(path))) - runtime_env["excludes"] = excludes + excludes = [] + path = pathlib.Path(work_dir) + for p in path.rglob("*"): + # Exclude files need to be relative to the working_dir + if p.is_file() and not str(p).endswith(".py"): + excludes.append(str(p.relative_to(path))) + runtime_env["excludes"] = excludes return runtime_env - def _validate_client_resources(self, config: BackendConfig) -> ClienteResourcesDict: + def _validate_client_resources(self, config: BackendConfig) -> ClientResourcesDict: client_resources_config = config.get(self.client_resources_key) - client_resources: ClienteResourcesDict = {} + client_resources: ClientResourcesDict = {} valid_types = (int, float) if client_resources_config: for k, v in client_resources_config.items(): From c30904620401c847c59c79cf9e10121680f62c38 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Fri, 23 Feb 2024 17:15:02 +0000 Subject: [PATCH 029/103] fixes --- .../superlink/fleet/vce/backend/raybackend.py | 18 ++++++++++-------- .../flwr/simulation/ray_transport/ray_actor.py | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index f223d8ba9cb..24620aab083 100644 --- 
a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -87,14 +87,16 @@ def _validate_client_resources(self, config: BackendConfig) -> ClientResourcesDi valid_types = (int, float) if client_resources_config: for k, v in client_resources_config.items(): - assert isinstance(k, str), ValueError( - f"client resources keys are expected to be `str` but you used " - f"{type(k)} for `{k}`" - ) - assert isinstance(v, valid_types), ValueError( - f"client resources are expected to be of type {valid_types} but " - f"found `{type(v)}` for key `{k}`", - ) + if not isinstance(k, str): + raise ValueError( + f"client resources keys are expected to be `str` but you used " + f"{type(k)} for `{k}`" + ) + if not isinstance(v, valid_types): + raise ValueError( + f"client resources are expected to be of type {valid_types} " + f"but found `{type(v)}` for key `{k}`", + ) client_resources[k] = v else: diff --git a/src/py/flwr/simulation/ray_transport/ray_actor.py b/src/py/flwr/simulation/ray_transport/ray_actor.py index b48e448b681..e899ce28261 100644 --- a/src/py/flwr/simulation/ray_transport/ray_actor.py +++ b/src/py/flwr/simulation/ray_transport/ray_actor.py @@ -434,7 +434,7 @@ def __init__( self.client_resources = client_resources # Queue of idle actors - self.pool: asyncio.Queue[Type[VirtualClientEngineActor]] = asyncio.Queue() + self.pool: "asyncio.Queue[Type[VirtualClientEngineActor]]" = asyncio.Queue() self.num_actors = 0 # Resolve arguments to pass during actor init From b16d0b81d8b74e4953bed446befdacf4c7d40950 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Sat, 24 Feb 2024 09:49:30 +0000 Subject: [PATCH 030/103] init; need resolve circular imports --- pyproject.toml | 1 + src/py/flwr/simulation/__init__.py | 3 + src/py/flwr/simulation/run_simulation.py | 122 +++++++++++++++++++++++ 3 files changed, 126 insertions(+) create mode 100644 src/py/flwr/simulation/run_simulation.py diff --git a/pyproject.toml 
b/pyproject.toml index 6bd5c74f29a..743670c6419 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,7 @@ flower-fleet-api = "flwr.server:run_fleet_api" flower-superlink = "flwr.server:run_superlink" flower-client-app = "flwr.client:run_client_app" flower-server-app = "flwr.server:run_server_app" +flower-simulation = "flwr.simulation:run_simulation" [tool.poetry.dependencies] python = "^3.8" diff --git a/src/py/flwr/simulation/__init__.py b/src/py/flwr/simulation/__init__.py index 724ea927391..b283de70c58 100644 --- a/src/py/flwr/simulation/__init__.py +++ b/src/py/flwr/simulation/__init__.py @@ -17,6 +17,8 @@ import importlib +from flwr.simulation.run_simulation import run_simulation + is_ray_installed = importlib.util.find_spec("ray") is not None if is_ray_installed: @@ -36,4 +38,5 @@ def start_simulation(*args, **kwargs): # type: ignore __all__ = [ "start_simulation", + "run_simulation", ] diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py new file mode 100644 index 00000000000..70e44b61211 --- /dev/null +++ b/src/py/flwr/simulation/run_simulation.py @@ -0,0 +1,122 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Flower Simulation.""" + +import argparse +import threading + +import grpc + +from flwr.common import EventType, event +from flwr.server.driver.driver import Driver +from flwr.server.run_serverapp import run +from flwr.server.superlink.state import StateFactory + + +def run_simulation() -> None: + """.""" + # TODO: below create circular imports + from flwr.server.app import _register_exit_handlers, _run_driver_api_grpc + from flwr.server.superlink.fleet.vce import start_vce + + args = _parse_args_run_simulation().parse_args() + + # Initialize StateFactory + state_factory = StateFactory(":flwr-in-memory-state:") + + # Start Driver API + driver_server: grpc.Server = _run_driver_api_grpc( + address="0.0.0.0:9091", + state_factory=state_factory, + certificates=None, + ) + + # Superlink with Simulation Engine + superlink_th = threading.Thread( + target=start_vce, + args=( + args.num_supernodes, + args.client_app, + args.backend, + args.backend_config, + state_factory, + args.dir, + ), + daemon=False, + ) + + event(EventType.RUN_SUPERLINK_ENTER) + superlink_th.start() + + # Initialize Driver + driver = Driver( + driver_service_address="0.0.0.0:9091", + root_certificates=None, + ) + + # Launch server app + run(args.server_app, driver, args.dir) + + _register_exit_handlers( + grpc_servers=[driver_server], + bckg_threads=[superlink_th], + event_type=EventType.RUN_SUPERLINK_LEAVE, + ) + + +def _parse_args_run_simulation() -> argparse.ArgumentParser: + """Parse flower-simulation command line arguments.""" + parser = argparse.ArgumentParser( + description="Start a Flower Simulation", + ) + parser.add_argument( + "--client-app", + required=True, + help="For example: `client:app` or `project.package.module:wrapper.app`", + ) + parser.add_argument( + "--server-app", + required=True, + help="For example: `server:app` or `project.package.module:wrapper.app`", + ) + parser.add_argument( + 
"--num-supernodes", + type=int, + required=True, + help="Number of simulated SuperNodes.", + ) + parser.add_argument( + "--backend", + default="ray", + type=str, + help="Simulation backend that executes the ClientApp.", + ) + parser.add_argument( + "--backend-config", + type=str, + default='{"client_resources": {"num_cpus":1, "num_gpus":0.0}, "tensorflow": 0}', + help='A JSON formatted stream, e.g \'{"":, "":}\' to ' + "configure a backend. Values supported in are those included by " + "`flwr.common.typing.ConfigsRecordValues`. ", + ) + parser.add_argument( + "--dir", + default="", + help="Add specified directory to the PYTHONPATH and load" + "ClientApp and ServerApp from there." + " Default: current working directory.", + ) + + return parser From 93918db466054926bb96eb936fe3aa0b4ad16321 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Sun, 25 Feb 2024 22:25:16 +0000 Subject: [PATCH 031/103] gracefully shutdown --- .../superlink/fleet/vce/backend/backend.py | 4 +++ .../superlink/fleet/vce/backend/raybackend.py | 4 +++ .../server/superlink/fleet/vce/vce_api.py | 31 +++++++++++++++++-- .../simulation/ray_transport/ray_actor.py | 18 +++++++++-- src/py/flwr/simulation/run_simulation.py | 11 ++++++- 5 files changed, 61 insertions(+), 7 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index 2df4be76e7a..f2796a5758a 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -53,6 +53,10 @@ def num_workers(self) -> int: def is_worker_idle(self) -> bool: """Report whether a backend worker is idle and can therefore run a ClientApp.""" + @abstractmethod + async def terminate(self) -> None: + """Terminate backend.""" + @abstractmethod async def process_message( self, diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 
8a33d07404b..cc3cf434849 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -154,3 +154,7 @@ async def process_message( ) = await self.pool.fetch_result_and_return_actor_to_pool(future) return out_mssg, updated_context + + async def terminate(self) -> None: + """Terminate all actors in actor pool.""" + await self.pool.terminate_all_actors() diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 679742de4b1..2c2cbaca018 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -91,6 +91,10 @@ async def worker( # Store TaskRes in state state.store_task_res(task_res) + except asyncio.CancelledError as e: + log(DEBUG, f"Async worker: {e}") + break + except Exception as ex: # pylint: disable=broad-exception-caught # pylint: disable=fixme # TODO: gen TaskRes with relevant error, add it to state_factory @@ -103,10 +107,11 @@ async def generate_pull_requests( queue: TaskInsQueue, state_factory: StateFactory, nodes_mapping: NodeToPartitionMapping, + f_stop: asyncio.Event, ) -> None: """Generate TaskIns and add it to the queue.""" state = state_factory.state() - while True: + while not (f_stop.is_set()): for node_id in nodes_mapping.keys(): task_ins = state.get_task_ins(node_id=node_id, limit=1) if task_ins: @@ -114,6 +119,7 @@ async def generate_pull_requests( log(DEBUG, "TaskIns in queue: %i", queue.qsize()) # pylint: disable=fixme await asyncio.sleep(1.0) # TODO: revisit + log(DEBUG, "Async producer: Stopped pulling from StateFactory.") async def run( @@ -122,6 +128,7 @@ async def run( nodes_mapping: NodeToPartitionMapping, state_factory: StateFactory, node_states: Dict[int, NodeState], + f_stop: asyncio.Event, ) -> None: """Run the VCE async.""" # pylint: disable=fixme @@ -135,10 +142,26 @@ async def run( ) for _ in range(backend.num_workers) ] - 
asyncio.create_task(generate_pull_requests(queue, state_factory, nodes_mapping)) - await queue.join() + producer = asyncio.create_task( + generate_pull_requests(queue, state_factory, nodes_mapping, f_stop) + ) + + await asyncio.gather(producer) + + # Produced task terminated, now cancel worker tasks + for w_t in worker_tasks: + _ = w_t.cancel("Terminate on Simulation Engine shutdown.") + + # print('requested cancel') + while not all(w_t.done() for w_t in worker_tasks): + log(DEBUG, "Terminating async workers...") + await asyncio.sleep(0.5) + await asyncio.gather(*worker_tasks) + # Terminate backend + await backend.terminate() + # pylint: disable=too-many-arguments,unused-argument def start_vce( @@ -148,6 +171,7 @@ def start_vce( backend_config_json_stream: str, state_factory: StateFactory, working_dir: str, + f_stop: asyncio.Event, ) -> None: """Start Fleet API with the VirtualClientEngine (VCE).""" # Register SuperNodes @@ -195,5 +219,6 @@ def _load() -> ClientApp: nodes_mapping, state_factory, node_states, + f_stop, ) ) diff --git a/src/py/flwr/simulation/ray_transport/ray_actor.py b/src/py/flwr/simulation/ray_transport/ray_actor.py index e899ce28261..5ac0b2c2748 100644 --- a/src/py/flwr/simulation/ray_transport/ray_actor.py +++ b/src/py/flwr/simulation/ray_transport/ray_actor.py @@ -18,7 +18,7 @@ import threading import traceback from abc import ABC -from logging import ERROR, WARNING +from logging import DEBUG, ERROR, WARNING from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union import ray @@ -46,7 +46,7 @@ class VirtualClientEngineActor(ABC): def terminate(self) -> None: """Manually terminate Actor object.""" - log(WARNING, "Manually terminating %s}", self.__class__.__name__) + log(WARNING, "Manually terminating %s", self.__class__.__name__) ray.actor.exit_actor() def run( @@ -434,7 +434,9 @@ def __init__( self.client_resources = client_resources # Queue of idle actors - self.pool: "asyncio.Queue[Type[VirtualClientEngineActor]]" = 
asyncio.Queue() + self.pool: "asyncio.Queue[Type[VirtualClientEngineActor]]" = asyncio.Queue( + maxsize=1024 + ) self.num_actors = 0 # Resolve arguments to pass during actor init @@ -464,6 +466,16 @@ async def add_actors_to_pool(self, num_actors: int) -> None: await self.pool.put(self.create_actor_fn()) # type: ignore self.num_actors += num_actors + async def terminate_all_actors(self) -> None: + """Terminate actors in pool.""" + num_terminated = 0 + while self.pool.qsize(): + actor = await self.pool.get() + actor.terminate.remote() # type: ignore + num_terminated += 1 + + log(DEBUG, "Terminated %i actors", num_terminated) + async def submit( self, actor_fn: Any, job: Tuple[ClientAppFn, Message, str, Context] ) -> Any: diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 70e44b61211..e15807adeb3 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -15,6 +15,7 @@ """Flower Simulation.""" import argparse +import asyncio import threading import grpc @@ -44,6 +45,7 @@ def run_simulation() -> None: ) # Superlink with Simulation Engine + f_stop = asyncio.Event() superlink_th = threading.Thread( target=start_vce, args=( @@ -53,6 +55,7 @@ def run_simulation() -> None: args.backend_config, state_factory, args.dir, + f_stop, ), daemon=False, ) @@ -69,11 +72,17 @@ def run_simulation() -> None: # Launch server app run(args.server_app, driver, args.dir) + del driver + + # Trigger stop event + f_stop.set() + _register_exit_handlers( grpc_servers=[driver_server], bckg_threads=[superlink_th], event_type=EventType.RUN_SUPERLINK_LEAVE, ) + superlink_th.join() def _parse_args_run_simulation() -> argparse.ArgumentParser: @@ -106,7 +115,7 @@ def _parse_args_run_simulation() -> argparse.ArgumentParser: parser.add_argument( "--backend-config", type=str, - default='{"client_resources": {"num_cpus":1, "num_gpus":0.0}, "tensorflow": 0}', + default='{"client_resources": {"num_cpus":2, 
"num_gpus":0.0}, "tensorflow": 0}', help='A JSON formatted stream, e.g \'{"":, "":}\' to ' "configure a backend. Values supported in are those included by " "`flwr.common.typing.ConfigsRecordValues`. ", From 0e4ab143ac76c5340404ae22162e5d1f3c79408f Mon Sep 17 00:00:00 2001 From: jafermarq Date: Sun, 25 Feb 2024 22:31:49 +0000 Subject: [PATCH 032/103] terminate method for backend; asyncio event to trigger stop --- .../superlink/fleet/vce/backend/backend.py | 4 ++++ .../superlink/fleet/vce/backend/raybackend.py | 9 ++++++++- .../flwr/server/superlink/fleet/vce/vce_api.py | 4 +++- .../flwr/simulation/ray_transport/ray_actor.py | 18 +++++++++++++++--- 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index 2df4be76e7a..f2796a5758a 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -53,6 +53,10 @@ def num_workers(self) -> int: def is_worker_idle(self) -> bool: """Report whether a backend worker is idle and can therefore run a ClientApp.""" + @abstractmethod + async def terminate(self) -> None: + """Terminate backend.""" + @abstractmethod async def process_message( self, diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 24620aab083..cc3cf434849 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -28,6 +28,7 @@ ClientAppActor, init_ray, ) +from flwr.simulation.ray_transport.utils import enable_tf_gpu_growth from .backend import Backend, BackendConfig @@ -56,7 +57,9 @@ def __init__( self.client_resources_key = "client_resources" # Create actor pool - actor_kwargs = backend_config.get("actor_kwargs", {}) + use_tf = backend_config.get("tensorflow", False) + actor_kwargs = 
{"on_actor_init_fn": enable_tf_gpu_growth} if use_tf else {} + client_resources = self._validate_client_resources(config=backend_config) self.pool = BasicActorPool( actor_type=ClientAppActor, @@ -151,3 +154,7 @@ async def process_message( ) = await self.pool.fetch_result_and_return_actor_to_pool(future) return out_mssg, updated_context + + async def terminate(self) -> None: + """Terminate all actors in actor pool.""" + await self.pool.terminate_all_actors() diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index c91bae9ddab..666e7e7d9ec 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -14,9 +14,10 @@ # ============================================================================== """Fleet VirtualClientEngine API.""" +import asyncio import json from logging import ERROR, INFO -from typing import Dict +from typing import Dict, Optional from flwr.client.clientapp import ClientApp, load_client_app from flwr.client.node_state import NodeState @@ -49,6 +50,7 @@ def start_vce( backend_config_json_stream: str, state_factory: StateFactory, working_dir: str, + f_stop: Optional[asyncio.Event] = None, ) -> None: """Start Fleet API with the VirtualClientEngine (VCE).""" # Register SuperNodes diff --git a/src/py/flwr/simulation/ray_transport/ray_actor.py b/src/py/flwr/simulation/ray_transport/ray_actor.py index e899ce28261..5ac0b2c2748 100644 --- a/src/py/flwr/simulation/ray_transport/ray_actor.py +++ b/src/py/flwr/simulation/ray_transport/ray_actor.py @@ -18,7 +18,7 @@ import threading import traceback from abc import ABC -from logging import ERROR, WARNING +from logging import DEBUG, ERROR, WARNING from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union import ray @@ -46,7 +46,7 @@ class VirtualClientEngineActor(ABC): def terminate(self) -> None: """Manually terminate Actor object.""" - log(WARNING, "Manually 
terminating %s}", self.__class__.__name__) + log(WARNING, "Manually terminating %s", self.__class__.__name__) ray.actor.exit_actor() def run( @@ -434,7 +434,9 @@ def __init__( self.client_resources = client_resources # Queue of idle actors - self.pool: "asyncio.Queue[Type[VirtualClientEngineActor]]" = asyncio.Queue() + self.pool: "asyncio.Queue[Type[VirtualClientEngineActor]]" = asyncio.Queue( + maxsize=1024 + ) self.num_actors = 0 # Resolve arguments to pass during actor init @@ -464,6 +466,16 @@ async def add_actors_to_pool(self, num_actors: int) -> None: await self.pool.put(self.create_actor_fn()) # type: ignore self.num_actors += num_actors + async def terminate_all_actors(self) -> None: + """Terminate actors in pool.""" + num_terminated = 0 + while self.pool.qsize(): + actor = await self.pool.get() + actor.terminate.remote() # type: ignore + num_terminated += 1 + + log(DEBUG, "Terminated %i actors", num_terminated) + async def submit( self, actor_fn: Any, job: Tuple[ClientAppFn, Message, str, Context] ) -> Any: From 21e9932e89965be7c9958b46ddcce7632cf7311a Mon Sep 17 00:00:00 2001 From: jafermarq Date: Sun, 25 Feb 2024 22:56:14 +0000 Subject: [PATCH 033/103] propagate terminate asyncio logic --- src/py/flwr/server/app.py | 6 ++- .../server/superlink/fleet/vce/vce_api.py | 38 +++++++++++++++---- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index ac7a8339b31..eecd80fcf17 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -14,8 +14,8 @@ # ============================================================================== """Flower server app.""" - import argparse +import asyncio import importlib.util import sys import threading @@ -363,6 +363,7 @@ def run_superlink() -> None: ) grpc_servers.append(fleet_server) elif args.fleet_api_type == TRANSPORT_TYPE_VCE: + f_stop = asyncio.Event() # Does nothing _run_fleet_api_vce( num_supernodes=args.num_supernodes, 
client_app_module_name=args.client_app, @@ -370,6 +371,7 @@ def run_superlink() -> None: backend_config_json_stream=args.backend_config, working_dir=args.dir, state_factory=state_factory, + f_stop=f_stop, ) else: raise ValueError(f"Unknown fleet_api_type: {args.fleet_api_type}") @@ -515,6 +517,7 @@ def _run_fleet_api_vce( backend_config_json_stream: str, working_dir: str, state_factory: StateFactory, + f_stop: asyncio.Event, ) -> None: log(INFO, "Flower VCE: Starting Fleet API (VirtualClientEngine)") @@ -525,6 +528,7 @@ def _run_fleet_api_vce( backend_config_json_stream=backend_config_json_stream, state_factory=state_factory, working_dir=working_dir, + f_stop=f_stop, ) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 56e5e5b75a0..6312ab17359 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -19,7 +19,7 @@ import json import traceback from logging import DEBUG, ERROR, INFO -from typing import Callable, Dict, Optional +from typing import Callable, Dict from flwr.client.clientapp import ClientApp, load_client_app from flwr.client.node_state import NodeState @@ -47,7 +47,7 @@ def _register_nodes( return nodes_mapping -# pylint: disable=too-many-arguments +# pylint: disable=too-many-arguments,too-many-locals async def worker( app: Callable[[], ClientApp], queue: TaskInsQueue, @@ -91,6 +91,10 @@ async def worker( # Store TaskRes in state state.store_task_res(task_res) + except asyncio.CancelledError as e: + log(DEBUG, "Async worker: %s", e) + break + except Exception as ex: # pylint: disable=broad-exception-caught # pylint: disable=fixme # TODO: gen TaskRes with relevant error, add it to state_factory @@ -103,10 +107,11 @@ async def generate_pull_requests( queue: TaskInsQueue, state_factory: StateFactory, nodes_mapping: NodeToPartitionMapping, + f_stop: asyncio.Event, ) -> None: """Generate TaskIns and add it to the queue.""" 
state = state_factory.state() - while True: + while not f_stop.is_set(): for node_id in nodes_mapping.keys(): task_ins = state.get_task_ins(node_id=node_id, limit=1) if task_ins: @@ -114,6 +119,7 @@ async def generate_pull_requests( log(DEBUG, "TaskIns in queue: %i", queue.qsize()) # pylint: disable=fixme await asyncio.sleep(1.0) # TODO: revisit + log(DEBUG, "Async producer: Stopped pulling from StateFactory.") async def run( @@ -122,6 +128,7 @@ async def run( nodes_mapping: NodeToPartitionMapping, state_factory: StateFactory, node_states: Dict[int, NodeState], + f_stop: asyncio.Event, ) -> None: """Run the VCE async.""" # pylint: disable=fixme @@ -135,12 +142,28 @@ async def run( ) for _ in range(backend.num_workers) ] - asyncio.create_task(generate_pull_requests(queue, state_factory, nodes_mapping)) - await queue.join() + producer = asyncio.create_task( + generate_pull_requests(queue, state_factory, nodes_mapping, f_stop) + ) + + await asyncio.gather(producer) + + # Produced task terminated, now cancel worker tasks + for w_t in worker_tasks: + _ = w_t.cancel("Terminate on Simulation Engine shutdown.") + + # print('requested cancel') + while not all(w_t.done() for w_t in worker_tasks): + log(DEBUG, "Terminating async workers...") + await asyncio.sleep(0.5) + await asyncio.gather(*worker_tasks) + # Terminate backend + await backend.terminate() + -# pylint: disable=too-many-arguments,unused-argument +# pylint: disable=too-many-arguments,unused-argument,too-many-locals def start_vce( num_supernodes: int, client_app_module_name: str, @@ -148,7 +171,7 @@ def start_vce( backend_config_json_stream: str, state_factory: StateFactory, working_dir: str, - f_stop: Optional[asyncio.Event] = None, + f_stop: asyncio.Event, ) -> None: """Start Fleet API with the VirtualClientEngine (VCE).""" # Register SuperNodes @@ -196,5 +219,6 @@ def _load() -> ClientApp: nodes_mapping, state_factory, node_states, + f_stop, ) ) From f8b57c561024f9edf98bcba9efac42009cdc2f8d Mon Sep 17 00:00:00 
2001 From: jafermarq Date: Mon, 26 Feb 2024 11:50:34 +0000 Subject: [PATCH 034/103] added build/process/terminate tests --- .../fleet/vce/backend/raybackend_test.py | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py new file mode 100644 index 00000000000..441329d159e --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -0,0 +1,140 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Test for Ray backend for the Fleet API using the Simulation Engine.""" + +import asyncio +from math import pi +from typing import Callable, Dict, Optional, Tuple + +import ray + +from flwr.client import Client, NumPyClient +from flwr.client.clientapp import ClientApp +from flwr.common import ( + Config, + ConfigsRecord, + Context, + GetPropertiesIns, + Message, + Metadata, + RecordSet, + Scalar, +) +from flwr.common.constant import MESSAGE_TYPE_GET_PROPERTIES +from flwr.common.recordset_compat import getpropertiesins_to_recordset + +from .raybackend import RayBackend + + +class DummyClient(NumPyClient): + """A dummy NumPyClient for tests.""" + + def get_properties(self, config: Config) -> Dict[str, Scalar]: + """Return properties by doing a simple calculation.""" + result = float(config["factor"]) * pi + + # store something in context + self.context.state.configs_records["result"] = ConfigsRecord({"result": result}) + return {"result": result} + + +def get_dummy_client(cid: str) -> Client: # pylint: disable=unused-argument + """Return a DummyClient converted to Client type.""" + return DummyClient().to_client() + + +def _load_app() -> ClientApp: + return ClientApp(client_fn=get_dummy_client) + + +async def backend_build_process_and_termination( + backend: RayBackend, + process_args: Optional[Tuple[Callable[[], ClientApp], Message, Context]] = None, +) -> Tuple[Message, Context] | None: + """Build, process job and terminate RayBackend.""" + await backend.build() + to_return = None + + if process_args: + to_return = await backend.process_message(*process_args) + + await backend.terminate() + + ray.shutdown() + + return to_return + + +def test_backend_creation_and_termination() -> None: + """Test creation of RayBackend and its termination.""" + backend = RayBackend(backend_config={}, work_dir="") + asyncio.run( + backend_build_process_and_termination(backend=backend, 
process_args=None) + ) + + +def test_backend_creation_submit_and_termination() -> None: + """Test submit.""" + backend = RayBackend(backend_config={}, work_dir="") + + # Define ClientApp + client_app_callable = _load_app + + # Construct a Message + mult_factor = 2024 + getproperties_ins = GetPropertiesIns(config={"factor": mult_factor}) + recordset = getpropertiesins_to_recordset(getproperties_ins) + message = Message( + content=recordset, + metadata=Metadata( + run_id=0, + message_id="", + group_id="", + src_node_id=0, + dst_node_id=0, + reply_to_message="", + ttl="", + message_type=MESSAGE_TYPE_GET_PROPERTIES, + ), + ) + + # Construct emtpy Context + context = Context(state=RecordSet()) + + res = asyncio.run( + backend_build_process_and_termination( + backend=backend, process_args=(client_app_callable, message, context) + ) + ) + + if res is None: + raise AssertionError("This shouldn't happen") + + out_mssg, updated_context = res + + # Verify message content is as expected + content = out_mssg.content + assert ( + content.configs_records["getpropertiesres.properties"]["result"] + == pi * mult_factor + ) + + # Verify context is correct + obtained_result_in_context = updated_context.state.configs_records["result"][ + "result" + ] + assert obtained_result_in_context == pi * mult_factor + + From 39e3234884ca36fde8a541256e6f1737cbf2dfd1 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 11:55:34 +0000 Subject: [PATCH 035/103] format --- .../flwr/server/superlink/fleet/vce/backend/raybackend_test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index 441329d159e..d31fe6c3416 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -136,5 +136,3 @@ def test_backend_creation_submit_and_termination() -> None: "result" ] assert 
obtained_result_in_context == pi * mult_factor - - From 8ea4b08100fb6e7ac0a72b40cab83b994fd44cf4 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 11:58:58 +0000 Subject: [PATCH 036/103] fix for py3.8 --- .../server/superlink/fleet/vce/backend/raybackend_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index d31fe6c3416..bb33491db90 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -16,7 +16,7 @@ import asyncio from math import pi -from typing import Callable, Dict, Optional, Tuple +from typing import Callable, Dict, Optional, Tuple, Union import ray @@ -62,7 +62,7 @@ def _load_app() -> ClientApp: async def backend_build_process_and_termination( backend: RayBackend, process_args: Optional[Tuple[Callable[[], ClientApp], Message, Context]] = None, -) -> Tuple[Message, Context] | None: +) -> Union[Tuple[Message, Context], None]: """Build, process job and terminate RayBackend.""" await backend.build() to_return = None From 35c55d41eabdd20915a47c92b74a5cc2926b6248 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 12:48:44 +0000 Subject: [PATCH 037/103] fix py3.11 --- src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index cc3cf434849..1864e48fe16 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -145,7 +145,7 @@ async def process_message( (app, message, str(node_id), context), ) - await asyncio.wait([future]) + await future # Fetch result ( From 49bc661c1d764b9402dd0713eabf33ffea0738d6 Mon 
Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 12:51:17 +0000 Subject: [PATCH 038/103] fix import --- src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 1864e48fe16..b29d76b239e 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -14,7 +14,6 @@ # ============================================================================== """Ray backend for the Fleet API using the Simulation Engine.""" -import asyncio import pathlib from logging import INFO from typing import Callable, Dict, List, Tuple, Union From 4506a1706f624821000976b639eb56d2779d0c73 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 13:11:22 +0000 Subject: [PATCH 039/103] wrapped asyncio test under `IsolatedAsyncioTestCase` class --- .../fleet/vce/backend/raybackend_test.py | 111 +++++++++--------- 1 file changed, 57 insertions(+), 54 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index bb33491db90..f0cca527ab9 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -17,6 +17,7 @@ import asyncio from math import pi from typing import Callable, Dict, Optional, Tuple, Union +from unittest import IsolatedAsyncioTestCase import ray @@ -77,62 +78,64 @@ async def backend_build_process_and_termination( return to_return -def test_backend_creation_and_termination() -> None: - """Test creation of RayBackend and its termination.""" - backend = RayBackend(backend_config={}, work_dir="") - asyncio.run( - backend_build_process_and_termination(backend=backend, process_args=None) - ) - - -def 
test_backend_creation_submit_and_termination() -> None: - """Test submit.""" - backend = RayBackend(backend_config={}, work_dir="") - - # Define ClientApp - client_app_callable = _load_app - - # Construct a Message - mult_factor = 2024 - getproperties_ins = GetPropertiesIns(config={"factor": mult_factor}) - recordset = getpropertiesins_to_recordset(getproperties_ins) - message = Message( - content=recordset, - metadata=Metadata( - run_id=0, - message_id="", - group_id="", - src_node_id=0, - dst_node_id=0, - reply_to_message="", - ttl="", - message_type=MESSAGE_TYPE_GET_PROPERTIES, - ), - ) - - # Construct emtpy Context - context = Context(state=RecordSet()) - - res = asyncio.run( - backend_build_process_and_termination( - backend=backend, process_args=(client_app_callable, message, context) +class AsyncTestRayBackend(IsolatedAsyncioTestCase): + """A basic class that allows runnig multliple asyncio tests.""" + + def test_backend_creation_and_termination(self) -> None: + """Test creation of RayBackend and its termination.""" + backend = RayBackend(backend_config={}, work_dir="") + asyncio.run( + backend_build_process_and_termination(backend=backend, process_args=None) + ) + + def test_backend_creation_submit_and_termination(self) -> None: + """Test submit.""" + backend = RayBackend(backend_config={}, work_dir="") + + # Define ClientApp + client_app_callable = _load_app + + # Construct a Message + mult_factor = 2024 + getproperties_ins = GetPropertiesIns(config={"factor": mult_factor}) + recordset = getpropertiesins_to_recordset(getproperties_ins) + message = Message( + content=recordset, + metadata=Metadata( + run_id=0, + message_id="", + group_id="", + src_node_id=0, + dst_node_id=0, + reply_to_message="", + ttl="", + message_type=MESSAGE_TYPE_GET_PROPERTIES, + ), ) - ) - if res is None: - raise AssertionError("This shouldn't happen") + # Construct emtpy Context + context = Context(state=RecordSet()) - out_mssg, updated_context = res + res = asyncio.run( + 
backend_build_process_and_termination( + backend=backend, process_args=(client_app_callable, message, context) + ) + ) - # Verify message content is as expected - content = out_mssg.content - assert ( - content.configs_records["getpropertiesres.properties"]["result"] - == pi * mult_factor - ) + if res is None: + raise AssertionError("This shouldn't happen") + + out_mssg, updated_context = res + + # Verify message content is as expected + content = out_mssg.content + assert ( + content.configs_records["getpropertiesres.properties"]["result"] + == pi * mult_factor + ) - # Verify context is correct - obtained_result_in_context = updated_context.state.configs_records["result"][ - "result" - ] - assert obtained_result_in_context == pi * mult_factor + # Verify context is correct + obtained_result_in_context = updated_context.state.configs_records["result"][ + "result" + ] + assert obtained_result_in_context == pi * mult_factor From ed5b181361b6682b8a8c7f912ec07cbdf1462419 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 14:18:53 +0000 Subject: [PATCH 040/103] start/shutdown tests --- .../server/superlink/fleet/vce/vce_api.py | 2 +- .../superlink/fleet/vce/vce_api_test.py | 59 +++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 src/py/flwr/server/superlink/fleet/vce/vce_api_test.py diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 3365e8d9471..7dc86dac01a 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Fleet VirtualClientEngine API.""" +"""Fleet Simulation Engine API.""" import asyncio diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py new file mode 100644 index 00000000000..987f8ce27c1 --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -0,0 +1,59 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Test Fleet Simulation Engine API.""" + +import asyncio +import threading +from time import sleep +from unittest import IsolatedAsyncioTestCase + +from flwr.server.superlink.state import StateFactory + +from . 
import start_vce + + +class AsyncTestFleetSimulationEngine(IsolatedAsyncioTestCase): + """A basic class to test Fleet Simulation Enginge funcionality.""" + + def test_start_and_shutdown(self) -> None: + """Start Simulation Engine Fleet and terminate it.""" + f_stop = asyncio.Event() + + # Initialize StateFactory + state_factory = StateFactory(":flwr-in-memory-state:") + + superlink_th = threading.Thread( + target=start_vce, + args=( + 50, + "", + "ray", + "{}", # an empty json stream (represents an empty config) + state_factory, + "", + f_stop, + ), + daemon=False, + ) + + superlink_th.start() + + # Sleep for some time + sleep(10) + + # Trigger stop event + f_stop.set() + + superlink_th.join() From 2c05cdd066a1dd7209e197eb4e2f3c5210b71205 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 17:49:55 +0000 Subject: [PATCH 041/103] full loop tests; tweaks --- .../superlink/fleet/vce/backend/raybackend.py | 4 + .../fleet/vce/backend/raybackend_test.py | 4 - .../server/superlink/fleet/vce/vce_api.py | 27 ++- .../superlink/fleet/vce/vce_api_test.py | 188 +++++++++++++++--- 4 files changed, 183 insertions(+), 40 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index b29d76b239e..5c81501d62d 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -18,6 +18,8 @@ from logging import INFO from typing import Callable, Dict, List, Tuple, Union +import ray + from flwr.client.clientapp import ClientApp from flwr.common.context import Context from flwr.common.logger import log @@ -157,3 +159,5 @@ async def process_message( async def terminate(self) -> None: """Terminate all actors in actor pool.""" await self.pool.terminate_all_actors() + ray.shutdown() + log(INFO, "Terminated %s", self.__class__.__name__) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py 
b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index f0cca527ab9..bef0d8ec7e5 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -19,8 +19,6 @@ from typing import Callable, Dict, Optional, Tuple, Union from unittest import IsolatedAsyncioTestCase -import ray - from flwr.client import Client, NumPyClient from flwr.client.clientapp import ClientApp from flwr.common import ( @@ -73,8 +71,6 @@ async def backend_build_process_and_termination( await backend.terminate() - ray.shutdown() - return to_return diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 7dc86dac01a..881765213da 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -19,7 +19,7 @@ import json import traceback from logging import DEBUG, ERROR, INFO -from typing import Callable, Dict +from typing import Callable, Dict, Optional from flwr.client.clientapp import ClientApp, load_client_app from flwr.client.node_state import NodeState @@ -30,7 +30,6 @@ from .backend import Backend, error_messages_backends, supported_backends -TaskInsQueue = asyncio.Queue[TaskIns] NodeToPartitionMapping = Dict[int, int] @@ -50,7 +49,7 @@ def _register_nodes( # pylint: disable=too-many-arguments,too-many-locals async def worker( app: Callable[[], ClientApp], - queue: TaskInsQueue, + queue: "asyncio.Queue[TaskIns]", node_states: Dict[int, NodeState], state_factory: StateFactory, nodes_mapping: NodeToPartitionMapping, @@ -60,7 +59,7 @@ async def worker( state = state_factory.state() while True: try: - task_ins = await queue.get() + task_ins: TaskIns = await queue.get() node_id = task_ins.task.consumer.node_id # Register and retrive runstate @@ -104,7 +103,7 @@ async def worker( async def generate_pull_requests( - queue: TaskInsQueue, + queue: "asyncio.Queue[TaskIns]", 
state_factory: StateFactory, nodes_mapping: NodeToPartitionMapping, f_stop: asyncio.Event, @@ -132,7 +131,7 @@ async def run( ) -> None: """Run the VCE async.""" # pylint: disable=fixme - queue: TaskInsQueue = asyncio.Queue(128) + queue: "asyncio.Queue[TaskIns]" = asyncio.Queue(128) # Build backend await backend.build() @@ -150,7 +149,7 @@ async def run( # Produced task terminated, now cancel worker tasks for w_t in worker_tasks: - _ = w_t.cancel("Terminate on Simulation Engine shutdown.") + _ = w_t.cancel() # print('requested cancel') while not all(w_t.done() for w_t in worker_tasks): @@ -172,12 +171,18 @@ def start_vce( state_factory: StateFactory, working_dir: str, f_stop: asyncio.Event, + existing_nodes_mapping: Optional[NodeToPartitionMapping] = None, ) -> None: """Start Fleet API with the VirtualClientEngine (VCE).""" - # Register SuperNodes - nodes_mapping = _register_nodes( - num_nodes=num_supernodes, state_factory=state_factory - ) + if existing_nodes_mapping: + # Use mapping constructed externally. This also means nodes + # have previously being registered. + nodes_mapping = existing_nodes_mapping + else: + # Register SuperNodes + nodes_mapping = _register_nodes( + num_nodes=num_supernodes, state_factory=state_factory + ) # Construct mapping of NodeStates node_states: Dict[int, NodeState] = {} diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py index 987f8ce27c1..6abdd046f81 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -13,47 +13,185 @@ # limitations under the License. 
# ============================================================================== """Test Fleet Simulation Engine API.""" - import asyncio import threading +from itertools import cycle +from math import pi from time import sleep +from typing import Dict, Optional, Set from unittest import IsolatedAsyncioTestCase +from uuid import UUID + +from flwr.client import Client, NumPyClient +from flwr.client.clientapp import ClientApp +from flwr.common import ( + Config, + ConfigsRecord, + GetPropertiesIns, + Message, + Metadata, + Scalar, +) +from flwr.common.constant import MESSAGE_TYPE_GET_PROPERTIES +from flwr.common.recordset_compat import getpropertiesins_to_recordset +from flwr.common.serde import message_from_taskres, message_to_taskins +from flwr.server.superlink.fleet.vce.vce_api import ( + NodeToPartitionMapping, + _register_nodes, + start_vce, +) +from flwr.server.superlink.state import InMemoryState, StateFactory + + +class DummyClient(NumPyClient): + """A dummy NumPyClient for tests.""" + + def get_properties(self, config: Config) -> Dict[str, Scalar]: + """Return properties by doing a simple calculation.""" + result = float(config["factor"]) * pi + + # store something in context + self.context.state.configs_records["result"] = ConfigsRecord({"result": result}) + return {"result": result} + + +def get_dummy_client(cid: str) -> Client: # pylint: disable=unused-argument + """Return a DummyClient converted to Client type.""" + return DummyClient().to_client() + + +client_app = ClientApp( + client_fn=get_dummy_client, +) + -from flwr.server.superlink.state import StateFactory +def terminate_simulation(f_stop: asyncio.Event, sleep_duration: int) -> None: + """Set event to terminate Simulation Engine after `sleep_duration` seconds.""" + sleep(sleep_duration) + f_stop.set() -from . 
import start_vce +def start_and_shutdown( + existing_state_factory: Optional[StateFactory] = None, + nodes_mapping: Optional[NodeToPartitionMapping] = None, + duration: int = 10, +) -> None: + """Start Simulation Engine and terminate after specified number of seconds.""" + f_stop = asyncio.Event() + + # Initialize StateFactory + if nodes_mapping: + if existing_state_factory is None: + raise ValueError( + "If you specify a node mapping, you must pass a StateFactory." + ) + state_factory = existing_state_factory + else: + state_factory = StateFactory(":flwr-in-memory-state:") + + # Setup thread that will set the f_stop event, triggering the termination of all + # asyncio logic in the Simulation Engine. It will also terminate the Backend. + termination_th = threading.Thread( + target=terminate_simulation, args=(f_stop, duration) + ) + termination_th.start() + + start_vce( + num_supernodes=50, + client_app_module_name="vce_api_test:client_app", + backend_name="ray", + backend_config_json_stream="{}", # an empty json stream (an empty config) + state_factory=state_factory, + working_dir="", + f_stop=f_stop, + existing_nodes_mapping=nodes_mapping, + ) + + # Trigger stop event + f_stop.set() + + termination_th.join() -class AsyncTestFleetSimulationEngine(IsolatedAsyncioTestCase): - """A basic class to test Fleet Simulation Enginge funcionality.""" + +class AsyncTestFleetSimulationEngineRayBackend(IsolatedAsyncioTestCase): + """A basic class that enables testing asyncio functionalities.""" def test_start_and_shutdown(self) -> None: """Start Simulation Engine Fleet and terminate it.""" - f_stop = asyncio.Event() + start_and_shutdown() + + # pylint: disable=too-many-locals + def test_start_and_shutdown_with_tasks_in_state(self) -> None: + """Run Simulation Engine with some TasksIns in State. + + This test creates a few nodes and submits a few messages that need to be + executed by the Backend. In order for that to happen the asyncio + producer/consumer logic must function. 
+ """ + num_messages = 113 + num_nodes = 59 - # Initialize StateFactory + # Register a state and a run_id in it + run_id = 1234 state_factory = StateFactory(":flwr-in-memory-state:") + state: InMemoryState = state_factory.state() # type: ignore + state.run_ids.add(run_id) - superlink_th = threading.Thread( - target=start_vce, - args=( - 50, - "", - "ray", - "{}", # an empty json stream (represents an empty config) - state_factory, - "", - f_stop, - ), - daemon=False, + # Register a few nodes + nodes_mapping = _register_nodes( + num_nodes=num_nodes, state_factory=state_factory ) - superlink_th.start() + # Artificially add TaskIns to state so they can be processed + # by the Simulation Engine logic + nodes_cycle = cycle( + nodes_mapping.keys() + ) # we have more messages than supernodes + task_ids: Set[UUID] = set() # so we can retrieve them later + expected_results = {} + for i in range(num_messages): + dst_node_id = next(nodes_cycle) + # Construct a Message + mult_factor = 2024 + i + getproperties_ins = GetPropertiesIns(config={"factor": mult_factor}) + recordset = getpropertiesins_to_recordset(getproperties_ins) + message = Message( + content=recordset, + metadata=Metadata( + run_id=run_id, + message_id="", + group_id="", + src_node_id=0, + dst_node_id=dst_node_id, # indicate destination node + reply_to_message="", + ttl="", + message_type=MESSAGE_TYPE_GET_PROPERTIES, + ), + ) + # Convert Message to TaskIns + taskins = message_to_taskins(message) + # Instert in state + task_id = state.store_task_ins(taskins) + if task_id: + # Add to UUID set + task_ids.add(task_id) + # Store expected output for check later on + expected_results[task_id] = mult_factor * pi + + # Run + start_and_shutdown(state_factory, nodes_mapping) + + # Get all TaskRes + task_res_list = state.get_task_res(task_ids=task_ids, limit=len(task_ids)) - # Sleep for some time - sleep(10) + # Check results by first converting to Message + for task_res in task_res_list: - # Trigger stop event - 
f_stop.set() + message = message_from_taskres(task_res) - superlink_th.join() + # Verify message content is as expected + content = message.content + assert ( + content.configs_records["getpropertiesres.properties"]["result"] + == expected_results[UUID(task_res.task.ancestry[0])] + ) From 98fb4b458e7d9cda53b44c327b381e340c3b18f2 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 17:54:20 +0000 Subject: [PATCH 042/103] . --- src/py/flwr/server/superlink/fleet/vce/__init__.py | 2 +- src/py/flwr/server/superlink/fleet/vce/backend/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/__init__.py b/src/py/flwr/server/superlink/fleet/vce/__init__.py index 72cd76f7376..57d39688b52 100644 --- a/src/py/flwr/server/superlink/fleet/vce/__init__.py +++ b/src/py/flwr/server/superlink/fleet/vce/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Fleet VirtualClientEngine side.""" +"""Fleet Simulation Engine side.""" from .vce_api import start_vce diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py index 8c351743dbd..d751cf4bcae 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""VirtualClientEngine Backends.""" +"""Simulation Engine Backends.""" import importlib from typing import Dict, Type From 65c8b79df30d94231e8fe46314c482fa04f362db Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 18:11:08 +0000 Subject: [PATCH 043/103] undoing changes to simulation examples --- examples/simulation-pytorch/README.md | 38 +++++------------ examples/simulation-pytorch/sim.py | 48 +++++++++------------ examples/simulation-tensorflow/README.md | 42 +++++------------- examples/simulation-tensorflow/sim.py | 54 ++++++++++-------------- 4 files changed, 63 insertions(+), 119 deletions(-) diff --git a/examples/simulation-pytorch/README.md b/examples/simulation-pytorch/README.md index 963e77bc568..5ba5ec70dc3 100644 --- a/examples/simulation-pytorch/README.md +++ b/examples/simulation-pytorch/README.md @@ -54,13 +54,17 @@ Write the command below in your terminal to install the dependencies according t pip install -r requirements.txt ``` -### Run with `start_simulation()` - -Ensure you have activated your environment then: +### Run Federated Learning Example ```bash +# You can run the example without activating your environemnt +poetry run python sim.py + +# Or by first activating it +poetry shell # and then run the example python sim.py +# you can exit your environment by typing "exit" ``` You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 1xCPU core. For example: @@ -69,32 +73,10 @@ You can adjust the CPU/GPU resources you assign to each of your virtual clients. 
# Will assign 2xCPUs to each client python sim.py --num_cpus=2 -# Will assign 2xCPUs and 25% of the GPU's VRAM to each client -# This means that you can have 4 concurrent clients on each GPU +# Will assign 2xCPUs and 20% of the GPU's VRAM to each client +# This means that you can have 5 concurrent clients on each GPU # (assuming you have enough CPUs) -python sim.py --num_cpus=2 --num_gpus=0.25 -``` - -### Run with Flower-Next (`super-link` and `server-app`) - -Ensure you have activated your environment, then: - -``` -flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_app - -# on a different terminal -flower-server-app sim:server_app --insecure -``` - -You can change the default resources assigned to each `ClientApp` by means the `--backend-config` argument: - -```bash -# Tells the VCE to resever 2x CPUs and 25% of available VRAM for each ClientApp -flower-superlink --insecure --vce --num-supernodes 100 \ - --client-app sim:client_app \ - --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}}' - -# Then you can launch the `flower-server-app` command as shown earlier. +python sim.py --num_cpus=2 --num_gpus=0.2 ``` Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. 
diff --git a/examples/simulation-pytorch/sim.py b/examples/simulation-pytorch/sim.py index 84a00e3f092..0a6ed8ebb9b 100644 --- a/examples/simulation-pytorch/sim.py +++ b/examples/simulation-pytorch/sim.py @@ -29,9 +29,9 @@ default=0.0, help="Ratio of GPU memory to assign to a virtual client", ) +parser.add_argument("--num_rounds", type=int, default=10, help="Number of FL rounds.") NUM_CLIENTS = 100 -NUM_ROUNDS = 10 # Flower client, adapted from Pytorch quickstart example @@ -167,36 +167,28 @@ def evaluate( return evaluate -# Download MNIST dataset and partition it -mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) -centralized_testset = mnist_fds.load_full("test") - -# Configure the strategy -strategy = fl.server.strategy.FedAvg( - fraction_fit=0.1, # Sample 10% of available clients for training - fraction_evaluate=0.05, # Sample 5% of available clients for evaluation - min_available_clients=10, - on_fit_config_fn=fit_config, - evaluate_metrics_aggregation_fn=weighted_average, # Aggregate federated metrics - evaluate_fn=get_evaluate_fn(centralized_testset), # Global evaluation function -) - -# ClientApp for Flower-Next -client_app = fl.client.ClientApp( - client_fn=get_client_fn(mnist_fds), -) - -# ServerApp for Flower-Next -server_app = fl.server.ServerApp( - config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), - strategy=strategy, -) - - def main(): # Parse input arguments args = parser.parse_args() + # Download MNIST dataset and partition it + mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) + centralized_testset = mnist_fds.load_full("test") + + # Configure the strategy + strategy = fl.server.strategy.FedAvg( + fraction_fit=0.1, # Sample 10% of available clients for training + fraction_evaluate=0.05, # Sample 5% of available clients for evaluation + min_fit_clients=10, # Never sample less than 10 clients for training + min_evaluate_clients=5, # Never sample less than 5 clients for evaluation + 
min_available_clients=int( + NUM_CLIENTS * 0.75 + ), # Wait until at least 75 clients are available + on_fit_config_fn=fit_config, + evaluate_metrics_aggregation_fn=weighted_average, # Aggregate federated metrics + evaluate_fn=get_evaluate_fn(centralized_testset), # Global evaluation function + ) + # Resources to be assigned to each virtual client client_resources = { "num_cpus": args.num_cpus, @@ -208,7 +200,7 @@ def main(): client_fn=get_client_fn(mnist_fds), num_clients=NUM_CLIENTS, client_resources=client_resources, - config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), + config=fl.server.ServerConfig(num_rounds=args.num_rounds), strategy=strategy, actor_kwargs={ "on_actor_init_fn": disable_progress_bar # disable tqdm on each actor/process spawning virtual clients diff --git a/examples/simulation-tensorflow/README.md b/examples/simulation-tensorflow/README.md index f6f0a22fdd7..75be823db2e 100644 --- a/examples/simulation-tensorflow/README.md +++ b/examples/simulation-tensorflow/README.md @@ -53,49 +53,29 @@ Write the command below in your terminal to install the dependencies according t pip install -r requirements.txt ``` -### Run with `start_simulation()` - -Ensure you have activated your environment then: +### Run Federated Learning Example ```bash +# You can run the example without activating your environemnt +poetry run python sim.py + +# Or by first activating it +poetry shell # and then run the example python sim.py +# you can exit your environment by typing "exit" ``` -You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 2xCPU core. For example: +You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 1xCPU core. 
For example: ```bash # Will assign 2xCPUs to each client python sim.py --num_cpus=2 -# Will assign 2xCPUs and 25% of the GPU's VRAM to each client -# This means that you can have 4 concurrent clients on each GPU +# Will assign 2xCPUs and 20% of the GPU's VRAM to each client +# This means that you can have 5 concurrent clients on each GPU # (assuming you have enough CPUs) -python sim.py --num_cpus=2 --num_gpus=0.25 -``` - -Because TensorFlow by default maps all the available VRAM, we need to [enable GPU memory growth](https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth), see how it is done in the example (`sim.py`) for both the "main" process (where the server/strategy runs) and for the clients (using the `actor_kwargs`) - -### Run with Flower-Next (`super-link` and `server-app`) - -Ensure you have activated your environment, then: - -``` -flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_app - -# on a different terminal -flower-server-app sim:server_app --insecure -``` - -You can change the default resources assigned to each `ClientApp` using the `--backend-config` argument. Note that we need to flag that the backend is going to use `TensorFlow`. In this way, it will enable GPU memory growth. - -```bash -# Tells the VCE to resever 2x CPUs and 25% of available VRAM for each ClientApp -flower-superlink --insecure --vce --num-supernodes 100 \ - --client-app sim:client_app \ - --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}, "tensorflow": 1}' - -# Then you can launch the `flower-server-app` command as shown earlier. +python sim.py --num_cpus=2 --num_gpus=0.2 ``` Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. 
diff --git a/examples/simulation-tensorflow/sim.py b/examples/simulation-tensorflow/sim.py index dbba71ac2cf..043c624a40a 100644 --- a/examples/simulation-tensorflow/sim.py +++ b/examples/simulation-tensorflow/sim.py @@ -29,9 +29,9 @@ default=0.0, help="Ratio of GPU memory to assign to a virtual client", ) +parser.add_argument("--num_rounds", type=int, default=10, help="Number of FL rounds.") NUM_CLIENTS = 100 -NUM_ROUNDS = 10 VERBOSE = 0 @@ -129,40 +129,30 @@ def evaluate( return evaluate -# Download MNIST dataset and partition it -mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) -# Get the whole test set for centralised evaluation -centralized_testset = mnist_fds.load_full("test").to_tf_dataset( - columns="image", label_cols="label", batch_size=64 -) - -# Create FedAvg strategy -strategy = fl.server.strategy.FedAvg( - fraction_fit=0.1, # Sample 10% of available clients for training - fraction_evaluate=0.05, # Sample 5% of available clients for evaluation - min_fit_clients=10, # Never sample less than 10 clients for training - evaluate_metrics_aggregation_fn=weighted_average, # aggregates federated metrics - evaluate_fn=get_evaluate_fn(centralized_testset), # global evaluation function -) - - -# ClientApp for Flower-Next -client_app = fl.client.ClientApp( - client_fn=get_client_fn(mnist_fds), -) - -# ServerApp for Flower-Next -# TODO: Unclear how to enable GPU growth for the ServerApp -server_app = fl.server.ServerApp( - config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), - strategy=strategy, -) - - def main() -> None: # Parse input arguments args = parser.parse_args() + # Download MNIST dataset and partition it + mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) + # Get the whole test set for centralised evaluation + centralized_testset = mnist_fds.load_full("test").to_tf_dataset( + columns="image", label_cols="label", batch_size=64 + ) + + # Create FedAvg strategy + strategy = 
fl.server.strategy.FedAvg( + fraction_fit=0.1, # Sample 10% of available clients for training + fraction_evaluate=0.05, # Sample 5% of available clients for evaluation + min_fit_clients=10, # Never sample less than 10 clients for training + min_evaluate_clients=5, # Never sample less than 5 clients for evaluation + min_available_clients=int( + NUM_CLIENTS * 0.75 + ), # Wait until at least 75 clients are available + evaluate_metrics_aggregation_fn=weighted_average, # aggregates federated metrics + evaluate_fn=get_evaluate_fn(centralized_testset), # global evaluation function + ) + # With a dictionary, you tell Flower's VirtualClientEngine that each # client needs exclusive access to these many resources in order to run client_resources = { @@ -174,7 +164,7 @@ def main() -> None: fl.simulation.start_simulation( client_fn=get_client_fn(mnist_fds), num_clients=NUM_CLIENTS, - config=fl.server.ServerConfig(NUM_ROUNDS), + config=fl.server.ServerConfig(num_rounds=args.num_rounds), strategy=strategy, client_resources=client_resources, actor_kwargs={ From 87d7a4cd4a0f21251fd9d6f42618c636f47c7839 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 18:18:12 +0000 Subject: [PATCH 044/103] adding back examples --- examples/simulation-pytorch/README.md | 38 ++++++++++++----- examples/simulation-pytorch/sim.py | 48 ++++++++++++--------- examples/simulation-tensorflow/README.md | 42 +++++++++++++----- examples/simulation-tensorflow/sim.py | 54 ++++++++++++++---------- 4 files changed, 119 insertions(+), 63 deletions(-) diff --git a/examples/simulation-pytorch/README.md b/examples/simulation-pytorch/README.md index 5ba5ec70dc3..963e77bc568 100644 --- a/examples/simulation-pytorch/README.md +++ b/examples/simulation-pytorch/README.md @@ -54,17 +54,13 @@ Write the command below in your terminal to install the dependencies according t pip install -r requirements.txt ``` -### Run Federated Learning Example +### Run with `start_simulation()` -```bash -# You can run the 
example without activating your environemnt -poetry run python sim.py +Ensure you have activated your environment then: -# Or by first activating it -poetry shell +```bash # and then run the example python sim.py -# you can exit your environment by typing "exit" ``` You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 1xCPU core. For example: @@ -73,10 +69,32 @@ You can adjust the CPU/GPU resources you assign to each of your virtual clients. # Will assign 2xCPUs to each client python sim.py --num_cpus=2 -# Will assign 2xCPUs and 20% of the GPU's VRAM to each client -# This means that you can have 5 concurrent clients on each GPU +# Will assign 2xCPUs and 25% of the GPU's VRAM to each client +# This means that you can have 4 concurrent clients on each GPU # (assuming you have enough CPUs) -python sim.py --num_cpus=2 --num_gpus=0.2 +python sim.py --num_cpus=2 --num_gpus=0.25 +``` + +### Run with Flower-Next (`super-link` and `server-app`) + +Ensure you have activated your environment, then: + +``` +flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_app + +# on a different terminal +flower-server-app sim:server_app --insecure +``` + +You can change the default resources assigned to each `ClientApp` by means the `--backend-config` argument: + +```bash +# Tells the VCE to resever 2x CPUs and 25% of available VRAM for each ClientApp +flower-superlink --insecure --vce --num-supernodes 100 \ + --client-app sim:client_app \ + --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}}' + +# Then you can launch the `flower-server-app` command as shown earlier. ``` Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. 
diff --git a/examples/simulation-pytorch/sim.py b/examples/simulation-pytorch/sim.py index 0a6ed8ebb9b..84a00e3f092 100644 --- a/examples/simulation-pytorch/sim.py +++ b/examples/simulation-pytorch/sim.py @@ -29,9 +29,9 @@ default=0.0, help="Ratio of GPU memory to assign to a virtual client", ) -parser.add_argument("--num_rounds", type=int, default=10, help="Number of FL rounds.") NUM_CLIENTS = 100 +NUM_ROUNDS = 10 # Flower client, adapted from Pytorch quickstart example @@ -167,28 +167,36 @@ def evaluate( return evaluate +# Download MNIST dataset and partition it +mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) +centralized_testset = mnist_fds.load_full("test") + +# Configure the strategy +strategy = fl.server.strategy.FedAvg( + fraction_fit=0.1, # Sample 10% of available clients for training + fraction_evaluate=0.05, # Sample 5% of available clients for evaluation + min_available_clients=10, + on_fit_config_fn=fit_config, + evaluate_metrics_aggregation_fn=weighted_average, # Aggregate federated metrics + evaluate_fn=get_evaluate_fn(centralized_testset), # Global evaluation function +) + +# ClientApp for Flower-Next +client_app = fl.client.ClientApp( + client_fn=get_client_fn(mnist_fds), +) + +# ServerApp for Flower-Next +server_app = fl.server.ServerApp( + config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), + strategy=strategy, +) + + def main(): # Parse input arguments args = parser.parse_args() - # Download MNIST dataset and partition it - mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) - centralized_testset = mnist_fds.load_full("test") - - # Configure the strategy - strategy = fl.server.strategy.FedAvg( - fraction_fit=0.1, # Sample 10% of available clients for training - fraction_evaluate=0.05, # Sample 5% of available clients for evaluation - min_fit_clients=10, # Never sample less than 10 clients for training - min_evaluate_clients=5, # Never sample less than 5 clients for evaluation - 
min_available_clients=int( - NUM_CLIENTS * 0.75 - ), # Wait until at least 75 clients are available - on_fit_config_fn=fit_config, - evaluate_metrics_aggregation_fn=weighted_average, # Aggregate federated metrics - evaluate_fn=get_evaluate_fn(centralized_testset), # Global evaluation function - ) - # Resources to be assigned to each virtual client client_resources = { "num_cpus": args.num_cpus, @@ -200,7 +208,7 @@ def main(): client_fn=get_client_fn(mnist_fds), num_clients=NUM_CLIENTS, client_resources=client_resources, - config=fl.server.ServerConfig(num_rounds=args.num_rounds), + config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), strategy=strategy, actor_kwargs={ "on_actor_init_fn": disable_progress_bar # disable tqdm on each actor/process spawning virtual clients diff --git a/examples/simulation-tensorflow/README.md b/examples/simulation-tensorflow/README.md index 75be823db2e..f6f0a22fdd7 100644 --- a/examples/simulation-tensorflow/README.md +++ b/examples/simulation-tensorflow/README.md @@ -53,29 +53,49 @@ Write the command below in your terminal to install the dependencies according t pip install -r requirements.txt ``` -### Run Federated Learning Example +### Run with `start_simulation()` -```bash -# You can run the example without activating your environemnt -poetry run python sim.py +Ensure you have activated your environment then: -# Or by first activating it -poetry shell +```bash # and then run the example python sim.py -# you can exit your environment by typing "exit" ``` -You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 1xCPU core. For example: +You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 2xCPU core. 
For example: ```bash # Will assign 2xCPUs to each client python sim.py --num_cpus=2 -# Will assign 2xCPUs and 20% of the GPU's VRAM to each client -# This means that you can have 5 concurrent clients on each GPU +# Will assign 2xCPUs and 25% of the GPU's VRAM to each client +# This means that you can have 4 concurrent clients on each GPU # (assuming you have enough CPUs) -python sim.py --num_cpus=2 --num_gpus=0.2 +python sim.py --num_cpus=2 --num_gpus=0.25 +``` + +Because TensorFlow by default maps all the available VRAM, we need to [enable GPU memory growth](https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth), see how it is done in the example (`sim.py`) for both the "main" process (where the server/strategy runs) and for the clients (using the `actor_kwargs`) + +### Run with Flower-Next (`super-link` and `server-app`) + +Ensure you have activated your environment, then: + +``` +flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_app + +# on a different terminal +flower-server-app sim:server_app --insecure +``` + +You can change the default resources assigned to each `ClientApp` using the `--backend-config` argument. Note that we need to flag that the backend is going to use `TensorFlow`. In this way, it will enable GPU memory growth. + +```bash +# Tells the VCE to resever 2x CPUs and 25% of available VRAM for each ClientApp +flower-superlink --insecure --vce --num-supernodes 100 \ + --client-app sim:client_app \ + --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}, "tensorflow": 1}' + +# Then you can launch the `flower-server-app` command as shown earlier. ``` Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. 
diff --git a/examples/simulation-tensorflow/sim.py b/examples/simulation-tensorflow/sim.py index 043c624a40a..dbba71ac2cf 100644 --- a/examples/simulation-tensorflow/sim.py +++ b/examples/simulation-tensorflow/sim.py @@ -29,9 +29,9 @@ default=0.0, help="Ratio of GPU memory to assign to a virtual client", ) -parser.add_argument("--num_rounds", type=int, default=10, help="Number of FL rounds.") NUM_CLIENTS = 100 +NUM_ROUNDS = 10 VERBOSE = 0 @@ -129,30 +129,40 @@ def evaluate( return evaluate +# Download MNIST dataset and partition it +mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) +# Get the whole test set for centralised evaluation +centralized_testset = mnist_fds.load_full("test").to_tf_dataset( + columns="image", label_cols="label", batch_size=64 +) + +# Create FedAvg strategy +strategy = fl.server.strategy.FedAvg( + fraction_fit=0.1, # Sample 10% of available clients for training + fraction_evaluate=0.05, # Sample 5% of available clients for evaluation + min_fit_clients=10, # Never sample less than 10 clients for training + evaluate_metrics_aggregation_fn=weighted_average, # aggregates federated metrics + evaluate_fn=get_evaluate_fn(centralized_testset), # global evaluation function +) + + +# ClientApp for Flower-Next +client_app = fl.client.ClientApp( + client_fn=get_client_fn(mnist_fds), +) + +# ServerApp for Flower-Next +# TODO: Unclear how to enable GPU growth for the ServerApp +server_app = fl.server.ServerApp( + config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), + strategy=strategy, +) + + def main() -> None: # Parse input arguments args = parser.parse_args() - # Download MNIST dataset and partition it - mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) - # Get the whole test set for centralised evaluation - centralized_testset = mnist_fds.load_full("test").to_tf_dataset( - columns="image", label_cols="label", batch_size=64 - ) - - # Create FedAvg strategy - strategy = 
fl.server.strategy.FedAvg( - fraction_fit=0.1, # Sample 10% of available clients for training - fraction_evaluate=0.05, # Sample 5% of available clients for evaluation - min_fit_clients=10, # Never sample less than 10 clients for training - min_evaluate_clients=5, # Never sample less than 5 clients for evaluation - min_available_clients=int( - NUM_CLIENTS * 0.75 - ), # Wait until at least 75 clients are available - evaluate_metrics_aggregation_fn=weighted_average, # aggregates federated metrics - evaluate_fn=get_evaluate_fn(centralized_testset), # global evaluation function - ) - # With a dictionary, you tell Flower's VirtualClientEngine that each # client needs exclusive access to these many resources in order to run client_resources = { @@ -164,7 +174,7 @@ def main() -> None: fl.simulation.start_simulation( client_fn=get_client_fn(mnist_fds), num_clients=NUM_CLIENTS, - config=fl.server.ServerConfig(num_rounds=args.num_rounds), + config=fl.server.ServerConfig(NUM_ROUNDS), strategy=strategy, client_resources=client_resources, actor_kwargs={ From 35ab1f384fb642a4e6ab0463c451ad053861f54d Mon Sep 17 00:00:00 2001 From: Javier Date: Mon, 26 Feb 2024 19:42:07 +0000 Subject: [PATCH 045/103] Apply suggestions from code review Co-authored-by: Daniel J. 
Beutel --- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 881765213da..21da4b7070b 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -68,7 +68,7 @@ async def worker( # Convert TaskIns to Message message = message_from_taskins(task_ins) - # Replace node-id with data partition id + # Replace node ID with data partition ID message.metadata.dst_node_id = nodes_mapping[node_id] # Let backend process message @@ -108,7 +108,7 @@ async def generate_pull_requests( nodes_mapping: NodeToPartitionMapping, f_stop: asyncio.Event, ) -> None: - """Generate TaskIns and add it to the queue.""" + """Retrieve TaskIns and add it to the queue.""" state = state_factory.state() while not f_stop.is_set(): for node_id in nodes_mapping.keys(): @@ -151,7 +151,6 @@ async def run( for w_t in worker_tasks: _ = w_t.cancel() - # print('requested cancel') while not all(w_t.done() for w_t in worker_tasks): log(DEBUG, "Terminating async workers...") await asyncio.sleep(0.5) From 5b3365a0f1a53a7ae7d2ed8de1ecf94221b327a4 Mon Sep 17 00:00:00 2001 From: Javier Date: Mon, 26 Feb 2024 19:51:21 +0000 Subject: [PATCH 046/103] Apply suggestions from code review Co-authored-by: Daniel J. 
Beutel --- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 21da4b7070b..914e69ac4ce 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -62,7 +62,7 @@ async def worker( task_ins: TaskIns = await queue.get() node_id = task_ins.task.consumer.node_id - # Register and retrive runstate + # Register and retrieve runstate node_states[node_id].register_context(run_id=task_ins.run_id) context = node_states[node_id].retrieve_context(run_id=task_ins.run_id) From 785ac918f457e9ada79a8f9a4be0d903f2e0c71c Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 20:38:07 +0000 Subject: [PATCH 047/103] introduced `partition_id`. --- .../client/message_handler/message_handler.py | 2 +- .../message_handler/message_handler_test.py | 2 ++ src/py/flwr/common/message.py | 23 ++++++++++++++++--- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/py/flwr/client/message_handler/message_handler.py b/src/py/flwr/client/message_handler/message_handler.py index e7e6c7e05c7..87cace88ec2 100644 --- a/src/py/flwr/client/message_handler/message_handler.py +++ b/src/py/flwr/client/message_handler/message_handler.py @@ -98,7 +98,7 @@ def handle_legacy_message_from_msgtype( client_fn: ClientFn, message: Message, context: Context ) -> Message: """Handle legacy message in the inner most mod.""" - client = client_fn(str(message.metadata.dst_node_id)) + client = client_fn(str(message.metadata.partition_id)) # Check if NumPyClient is returend if isinstance(client, NumPyClient): diff --git a/src/py/flwr/client/message_handler/message_handler_test.py b/src/py/flwr/client/message_handler/message_handler_test.py index 9fc126f2792..c24b51972f3 100644 --- a/src/py/flwr/client/message_handler/message_handler_test.py +++ 
b/src/py/flwr/client/message_handler/message_handler_test.py @@ -269,6 +269,8 @@ def test_invalid_message_run_id(self) -> None: invalid_metadata_list: List[Metadata] = [] attrs = list(vars(self.valid_out_metadata).keys()) for attr in attrs: + if attr == "_partition_id": + continue if attr == "_ttl": # Skip configurable ttl continue # Make an invalid metadata diff --git a/src/py/flwr/common/message.py b/src/py/flwr/common/message.py index 14dae0f6ee5..49ac6227ecc 100644 --- a/src/py/flwr/common/message.py +++ b/src/py/flwr/common/message.py @@ -15,9 +15,8 @@ """Message.""" -from __future__ import annotations - from dataclasses import dataclass +from typing import Optional, Union from .record import RecordSet @@ -46,6 +45,10 @@ class Metadata: # pylint: disable=too-many-instance-attributes message_type : str A string that encodes the action to be executed on the receiving end. + partition_id : Optional[int] + An identifier that can be used when loading a particular + data partition for a ClientApp. Making use of this identifier + is more relevant when conducting simulations. 
     """

     _run_id: int
@@ -56,6 +59,7 @@ class Metadata:  # pylint: disable=too-many-instance-attributes
     _group_id: str
     _ttl: str
     _message_type: str
+    _partition_id: Optional[int]

     def __init__(  # pylint: disable=too-many-arguments
         self,
@@ -67,6 +71,7 @@ def __init__(  # pylint: disable=too-many-arguments
         group_id: str,
         ttl: str,
         message_type: str,
+        partition_id: Optional[int] = None,
     ) -> None:
         self._run_id = run_id
         self._message_id = message_id
@@ -76,6 +81,7 @@ def __init__(  # pylint: disable=too-many-arguments
         self._group_id = group_id
         self._ttl = ttl
         self._message_type = message_type
+        self._partition_id = partition_id

     @property
     def run_id(self) -> int:
@@ -137,6 +143,16 @@ def message_type(self, value: str) -> None:
         """Set message_type."""
         self._message_type = value

+    @property
+    def partition_id(self) -> Union[int, None]:
+        """An identifier telling which data partition a ClientApp should use."""
+        return self._partition_id
+
+    @partition_id.setter
+    def partition_id(self, value: int) -> None:
+        """Set partition_id."""
+        self._partition_id = value
+

 @dataclass
 class Message:
@@ -173,7 +189,7 @@ def content(self, value: RecordSet) -> None:
         """Set content."""
         self._content = value

-    def create_reply(self, content: RecordSet, ttl: str) -> Message:
+    def create_reply(self, content: RecordSet, ttl: str) -> "Message":
        """Create a reply to this message with specified content and TTL.

         The method generates a new `Message` as a reply to this message.
@@ -202,6 +218,7 @@ def create_reply(self, content: RecordSet, ttl: str) -> Message: group_id=self.metadata.group_id, ttl=ttl, message_type=self.metadata.message_type, + partition_id=self.metadata.partition_id, ), content=content, ) From eba053a702fafba05fee9b3aae8fb25d5e0068f3 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 20:44:00 +0000 Subject: [PATCH 048/103] fix for ray proxies and tests --- src/py/flwr/simulation/ray_transport/ray_client_proxy.py | 1 + src/py/flwr/simulation/ray_transport/ray_client_proxy_test.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/ray_transport/ray_client_proxy.py b/src/py/flwr/simulation/ray_transport/ray_client_proxy.py index 405e0920c5a..a45321ed236 100644 --- a/src/py/flwr/simulation/ray_transport/ray_client_proxy.py +++ b/src/py/flwr/simulation/ray_transport/ray_client_proxy.py @@ -111,6 +111,7 @@ def _wrap_recordset_in_message( reply_to_message="", ttl=str(timeout) if timeout else "", message_type=message_type, + partition_id=int(self.cid), ), ) diff --git a/src/py/flwr/simulation/ray_transport/ray_client_proxy_test.py b/src/py/flwr/simulation/ray_transport/ray_client_proxy_test.py index 3eeabe0292c..24fe3546e7d 100644 --- a/src/py/flwr/simulation/ray_transport/ray_client_proxy_test.py +++ b/src/py/flwr/simulation/ray_transport/ray_client_proxy_test.py @@ -198,10 +198,11 @@ def _load_app() -> ClientApp: message_id="", group_id="", src_node_id=0, - dst_node_id=int(cid), + dst_node_id=12345, reply_to_message="", ttl="", message_type=MESSAGE_TYPE_GET_PROPERTIES, + partition_id=int(cid), ), ) pool.submit_client_job( From 27d2bb1c1b5b88f727dc760c777ebbd5f7cfe3b3 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 21:35:41 +0000 Subject: [PATCH 049/103] re written --- src/py/flwr/common/message.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/py/flwr/common/message.py b/src/py/flwr/common/message.py index 
49ac6227ecc..1e1132e42e2 100644 --- a/src/py/flwr/common/message.py +++ b/src/py/flwr/common/message.py @@ -14,9 +14,9 @@ # ============================================================================== """Message.""" +from __future__ import annotations from dataclasses import dataclass -from typing import Optional, Union from .record import RecordSet @@ -59,7 +59,7 @@ class Metadata: # pylint: disable=too-many-instance-attributes _group_id: str _ttl: str _message_type: str - _partition_id: Optional[int] + _partition_id: int | None def __init__( # pylint: disable=too-many-arguments self, @@ -71,7 +71,7 @@ def __init__( # pylint: disable=too-many-arguments group_id: str, ttl: str, message_type: str, - partition_id: Optional[int] = None, + partition_id: int | None = None, ) -> None: self._run_id = run_id self._message_id = message_id @@ -144,7 +144,7 @@ def message_type(self, value: str) -> None: self._message_type = value @property - def partition_id(self) -> Union[int, None]: + def partition_id(self) -> int | None: """An identifier telling which data partition a ClientApp should use.""" return self._partition_id @@ -189,7 +189,7 @@ def content(self, value: RecordSet) -> None: """Set content.""" self._content = value - def create_reply(self, content: RecordSet, ttl: str) -> "Message": + def create_reply(self, content: RecordSet, ttl: str) -> Message: """Create a reply to this message with specified content and TTL. The method generates a new `Message` as a reply to this message. 
From 1969aac37519073f1ef5960acd0e1c9d67e080ff Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 21:45:43 +0000 Subject: [PATCH 050/103] using `metadata.partition_id` --- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 914e69ac4ce..11aad9fd9f3 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -69,7 +69,7 @@ async def worker( # Convert TaskIns to Message message = message_from_taskins(task_ins) # Replace node ID with data partition ID - message.metadata.dst_node_id = nodes_mapping[node_id] + message.metadata.partition_id = nodes_mapping[node_id] # Let backend process message out_mssg, updated_context = await backend.process_message( @@ -81,10 +81,6 @@ async def worker( task_ins.run_id, context=updated_context ) - # Undo change node_id for partition choice - out_mssg.metadata._src_node_id = ( # pylint: disable=protected-access - task_ins.task.consumer.node_id - ) # Convert to TaskRes task_res = message_to_taskres(out_mssg) # Store TaskRes in state @@ -95,8 +91,7 @@ async def worker( break except Exception as ex: # pylint: disable=broad-exception-caught - # pylint: disable=fixme - # TODO: gen TaskRes with relevant error, add it to state_factory + log(ERROR, ex) log(ERROR, traceback.format_exc()) break @@ -116,8 +111,8 @@ async def generate_pull_requests( if task_ins: await queue.put(task_ins[0]) log(DEBUG, "TaskIns in queue: %i", queue.qsize()) - # pylint: disable=fixme - await asyncio.sleep(1.0) # TODO: revisit + + await asyncio.sleep(1.0) log(DEBUG, "Async producer: Stopped pulling from StateFactory.") From 8f1ca09b6a24c640b22c0335bf109ab23382c691 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 01:04:41 +0000 Subject: [PATCH 051/103] more efficient --- 
.../server/superlink/state/in_memory_state.py | 52 ++++++++++++++----- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/src/py/flwr/server/superlink/state/in_memory_state.py b/src/py/flwr/server/superlink/state/in_memory_state.py index 690fadc032d..9e6c458519c 100644 --- a/src/py/flwr/server/superlink/state/in_memory_state.py +++ b/src/py/flwr/server/superlink/state/in_memory_state.py @@ -35,6 +35,7 @@ def __init__(self) -> None: self.node_ids: Set[int] = set() self.run_ids: Set[int] = set() self.task_ins_store: Dict[UUID, TaskIns] = {} + self.task_ins_mapping: Dict[int, List[UUID]] = {} self.task_res_store: Dict[UUID, TaskRes] = {} self.lock = threading.Lock() @@ -61,6 +62,14 @@ def store_task_ins(self, task_ins: TaskIns) -> Optional[UUID]: task_ins.task.ttl = ttl.isoformat() with self.lock: self.task_ins_store[task_id] = task_ins + node_id = task_ins.task.consumer.node_id + if node_id: + # If not an annonymous node, let's construct or + # update the node_id:task_id mapping + if node_id in self.task_ins_mapping: + self.task_ins_mapping[node_id].append(task_id) + else: + self.task_ins_mapping[node_id] = [task_id] # Return the new task_id return task_id @@ -75,22 +84,37 @@ def get_task_ins( # Find TaskIns for node_id that were not delivered yet task_ins_list: List[TaskIns] = [] with self.lock: - for _, task_ins in self.task_ins_store.items(): - # pylint: disable=too-many-boolean-expressions + # If not annoymous clients, we can get TaskIns efficiently + # by making use of node_id:task_id mapping + if node_id: if ( - node_id is not None # Not anonymous - and task_ins.task.consumer.anonymous is False - and task_ins.task.consumer.node_id == node_id - and task_ins.task.delivered_at == "" - ) or ( - node_id is None # Anonymous - and task_ins.task.consumer.anonymous is True - and task_ins.task.consumer.node_id == 0 - and task_ins.task.delivered_at == "" + node_id not in self.task_ins_mapping + or len(self.task_ins_mapping[node_id]) == 0 ): - 
task_ins_list.append(task_ins) - if limit and len(task_ins_list) == limit: - break + return task_ins_list + task_ids = self.task_ins_mapping[node_id] + num = limit if limit else len(task_ids) + while len(task_ins_list) < num: + # Remove + uuid = task_ids.pop(0) + # Fetch + taskins = self.task_ins_store[uuid] + # Update + self.task_ins_mapping[node_id] = task_ids + + task_ins_list.append(taskins) + else: + for _, task_ins in self.task_ins_store.items(): + # pylint: disable=too-many-boolean-expressions + if ( + node_id is None # Anonymous + and task_ins.task.consumer.anonymous is True + and task_ins.task.consumer.node_id == 0 + and task_ins.task.delivered_at == "" + ): + task_ins_list.append(task_ins) + if limit and len(task_ins_list) == limit: + break # Mark all of them as delivered delivered_at = now().isoformat() From f44b595b90ef47d989b9eb5039d9e1c75961df57 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 01:06:56 +0000 Subject: [PATCH 052/103] undo --- .../server/superlink/state/in_memory_state.py | 52 +++++-------------- 1 file changed, 14 insertions(+), 38 deletions(-) diff --git a/src/py/flwr/server/superlink/state/in_memory_state.py b/src/py/flwr/server/superlink/state/in_memory_state.py index 9e6c458519c..690fadc032d 100644 --- a/src/py/flwr/server/superlink/state/in_memory_state.py +++ b/src/py/flwr/server/superlink/state/in_memory_state.py @@ -35,7 +35,6 @@ def __init__(self) -> None: self.node_ids: Set[int] = set() self.run_ids: Set[int] = set() self.task_ins_store: Dict[UUID, TaskIns] = {} - self.task_ins_mapping: Dict[int, List[UUID]] = {} self.task_res_store: Dict[UUID, TaskRes] = {} self.lock = threading.Lock() @@ -62,14 +61,6 @@ def store_task_ins(self, task_ins: TaskIns) -> Optional[UUID]: task_ins.task.ttl = ttl.isoformat() with self.lock: self.task_ins_store[task_id] = task_ins - node_id = task_ins.task.consumer.node_id - if node_id: - # If not an annonymous node, let's construct or - # update the node_id:task_id mapping - if 
node_id in self.task_ins_mapping: - self.task_ins_mapping[node_id].append(task_id) - else: - self.task_ins_mapping[node_id] = [task_id] # Return the new task_id return task_id @@ -84,37 +75,22 @@ def get_task_ins( # Find TaskIns for node_id that were not delivered yet task_ins_list: List[TaskIns] = [] with self.lock: - # If not annoymous clients, we can get TaskIns efficiently - # by making use of node_id:task_id mapping - if node_id: + for _, task_ins in self.task_ins_store.items(): + # pylint: disable=too-many-boolean-expressions if ( - node_id not in self.task_ins_mapping - or len(self.task_ins_mapping[node_id]) == 0 + node_id is not None # Not anonymous + and task_ins.task.consumer.anonymous is False + and task_ins.task.consumer.node_id == node_id + and task_ins.task.delivered_at == "" + ) or ( + node_id is None # Anonymous + and task_ins.task.consumer.anonymous is True + and task_ins.task.consumer.node_id == 0 + and task_ins.task.delivered_at == "" ): - return task_ins_list - task_ids = self.task_ins_mapping[node_id] - num = limit if limit else len(task_ids) - while len(task_ins_list) < num: - # Remove - uuid = task_ids.pop(0) - # Fetch - taskins = self.task_ins_store[uuid] - # Update - self.task_ins_mapping[node_id] = task_ids - - task_ins_list.append(taskins) - else: - for _, task_ins in self.task_ins_store.items(): - # pylint: disable=too-many-boolean-expressions - if ( - node_id is None # Anonymous - and task_ins.task.consumer.anonymous is True - and task_ins.task.consumer.node_id == 0 - and task_ins.task.delivered_at == "" - ): - task_ins_list.append(task_ins) - if limit and len(task_ins_list) == limit: - break + task_ins_list.append(task_ins) + if limit and len(task_ins_list) == limit: + break # Mark all of them as delivered delivered_at = now().isoformat() From ab55b0cbccada54cbe40614f62af1263421f5c34 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 09:53:44 +0000 Subject: [PATCH 053/103] more tests --- 
.../server/superlink/fleet/vce/vce_api.py | 25 ++++++++-- .../superlink/fleet/vce/vce_api_test.py | 47 +++++++++++++------ 2 files changed, 53 insertions(+), 19 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 11aad9fd9f3..e665903188e 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -158,21 +158,38 @@ async def run( # pylint: disable=too-many-arguments,unused-argument,too-many-locals def start_vce( - num_supernodes: int, client_app_module_name: str, backend_name: str, backend_config_json_stream: str, - state_factory: StateFactory, working_dir: str, f_stop: asyncio.Event, + num_supernodes: Optional[int] = None, + state_factory: Optional[StateFactory] = None, existing_nodes_mapping: Optional[NodeToPartitionMapping] = None, ) -> None: - """Start Fleet API with the VirtualClientEngine (VCE).""" + """Start Fleet API with the Simulation Engine.""" + if num_supernodes is not None and existing_nodes_mapping is not None: + raise ValueError( + "Both `num_supernodes` and `existing_nodes_mapping` are provided, " + "but only one is allowed." + ) if existing_nodes_mapping: + if state_factory is None: + raise ValueError( + "You passed `existing_nodes_mapping` but no `state_factory` was passed." + ) + log(INFO, "Using exiting NodeToPartitionMapping and StateFactory.") # Use mapping constructed externally. This also means nodes # have previously being registered. 
nodes_mapping = existing_nodes_mapping - else: + + if not state_factory: + log(INFO, "A StateFactory was not supplied to the SimulationEngine.") + # Create an empty in-memory state factory + state_factory = StateFactory(":flwr-in-memory-state:") + log(INFO, "Created new %s.", state_factory.__class__.__name__) + + if num_supernodes: # Register SuperNodes nodes_mapping = _register_nodes( num_nodes=num_supernodes, state_factory=state_factory diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py index 6abdd046f81..3967c734617 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -72,23 +72,15 @@ def terminate_simulation(f_stop: asyncio.Event, sleep_duration: int) -> None: def start_and_shutdown( - existing_state_factory: Optional[StateFactory] = None, + backend: str = "ray", + num_supernodes: Optional[int] = None, + state_factory: Optional[StateFactory] = None, nodes_mapping: Optional[NodeToPartitionMapping] = None, duration: int = 10, ) -> None: """Start Simulation Engine and terminate after specified number of seconds.""" f_stop = asyncio.Event() - # Initialize StateFactory - if nodes_mapping: - if existing_state_factory is None: - raise ValueError( - "If you specify a node mapping, you must pass a StateFactory." - ) - state_factory = existing_state_factory - else: - state_factory = StateFactory(":flwr-in-memory-state:") - # Setup thread that will set the f_stop event, triggering the termination of all # asyncio logic in the Simulation Engine. It will also terminate the Backend. 
termination_th = threading.Thread( @@ -97,9 +89,9 @@ def start_and_shutdown( termination_th.start() start_vce( - num_supernodes=50, + num_supernodes=num_supernodes, client_app_module_name="vce_api_test:client_app", - backend_name="ray", + backend_name=backend, backend_config_json_stream="{}", # an empty json stream (an empty config) state_factory=state_factory, working_dir="", @@ -118,7 +110,32 @@ class AsyncTestFleetSimulationEngineRayBackend(IsolatedAsyncioTestCase): def test_start_and_shutdown(self) -> None: """Start Simulation Engine Fleet and terminate it.""" - start_and_shutdown() + start_and_shutdown(num_supernodes=50) + + def test_with_nonexistent_backend(self) -> None: + """Test specifying a backend that does not exist.""" + with self.assertRaises(KeyError): + start_and_shutdown(num_supernodes=50, backend="this-backend-does-not-exist") + + def test_erroneous_arguments_num_supernodes_and_existing_mapping(self) -> None: + """Test ValueError if a node mapping is passed but also num_supernodes. + + Passing `num_supernodes` does nothing since we assume that if a node mapping + is supplied, nodes have been registered externally already. Therefore passing + `num_supernodes` might give the impression that that many nodes will be registered. + We don't do that since a mapping already exists. + """ + with self.assertRaises(ValueError): + start_and_shutdown(num_supernodes=50, nodes_mapping={0: 1}) + + def test_erroneous_arguments_existing_mapping_but_no_state_factory(self) -> None: + """Test ValueError if a node mapping is passed but no state. + + Passing a node mapping indicates that (externally) nodes have registered with a + state factory. Therefore, that state factory should be passed too. 
+ """ + with self.assertRaises(ValueError): + start_and_shutdown(nodes_mapping={0: 1}) # pylint: disable=too-many-locals def test_start_and_shutdown_with_tasks_in_state(self) -> None: @@ -179,7 +196,7 @@ def test_start_and_shutdown_with_tasks_in_state(self) -> None: expected_results[task_id] = mult_factor * pi # Run - start_and_shutdown(state_factory, nodes_mapping) + start_and_shutdown(state_factory=state_factory, nodes_mapping=nodes_mapping) # Get all TaskRes task_res_list = state.get_task_res(task_ids=task_ids, limit=len(task_ids)) From 28dda2dde4ec460453a91b280fc89426731d8d62 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 11:11:49 +0000 Subject: [PATCH 054/103] more --- .../server/superlink/fleet/vce/vce_api.py | 15 +- .../superlink/fleet/vce/vce_api_test.py | 150 +++++++++++++----- 2 files changed, 119 insertions(+), 46 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index e665903188e..c7f94a4c554 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -21,7 +21,7 @@ from logging import DEBUG, ERROR, INFO from typing import Callable, Dict, Optional -from flwr.client.clientapp import ClientApp, load_client_app +from flwr.client.clientapp import ClientApp, LoadClientAppError, load_client_app from flwr.client.node_state import NodeState from flwr.common.logger import log from flwr.common.serde import message_from_taskins, message_to_taskres @@ -90,8 +90,12 @@ async def worker( log(DEBUG, "Async worker: %s", e) break - except Exception as ex: # pylint: disable=broad-exception-caught + except LoadClientAppError as app_ex: + log(ERROR, "Async worker: %s", app_ex) + log(ERROR, traceback.format_exc()) + raise + except Exception as ex: # pylint: disable=broad-exception-caught log(ERROR, ex) log(ERROR, traceback.format_exc()) break @@ -173,6 +177,13 @@ def start_vce( "Both `num_supernodes` and 
`existing_nodes_mapping` are provided, " "but only one is allowed." ) + if num_supernodes is None: + if state_factory is None or existing_nodes_mapping is None: + raise ValueError( + "If not passing an existing `state_factory` and associated " + "`existing_nodes_mapping` you must supply `num_supernodes` to indicate " + "how many nodes to insert into a new StateFactory that will be created." + ) if existing_nodes_mapping: if state_factory is None: raise ValueError( diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py index 3967c734617..7cb0583791d 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -13,9 +13,12 @@ # limitations under the License. # ============================================================================== """Test Fleet Simulation Engine API.""" + + import asyncio import threading from itertools import cycle +from json import JSONDecodeError from math import pi from time import sleep from typing import Dict, Optional, Set @@ -71,12 +74,62 @@ def terminate_simulation(f_stop: asyncio.Event, sleep_duration: int) -> None: f_stop.set() +# pylint: disable=too-many-locals +def register_messages_into_state( + state_factory: StateFactory, + nodes_mapping: NodeToPartitionMapping, + run_id: int, + num_messages: int, +) -> Dict[UUID, float]: + """Register `num_messages` into the state factory.""" + state: InMemoryState = state_factory.state() # type: ignore + state.run_ids.add(run_id) + # Artificially add TaskIns to state so they can be processed + # by the Simulation Engine logic + nodes_cycle = cycle(nodes_mapping.keys()) # we have more messages than supernodes + task_ids: Set[UUID] = set() # so we can retrieve them later + expected_results = {} + for i in range(num_messages): + dst_node_id = next(nodes_cycle) + # Construct a Message + mult_factor = 2024 + i + getproperties_ins = 
GetPropertiesIns(config={"factor": mult_factor}) + recordset = getpropertiesins_to_recordset(getproperties_ins) + message = Message( + content=recordset, + metadata=Metadata( + run_id=run_id, + message_id="", + group_id="", + src_node_id=0, + dst_node_id=dst_node_id, # indicate destination node + reply_to_message="", + ttl="", + message_type=MESSAGE_TYPE_GET_PROPERTIES, + ), + ) + # Convert Message to TaskIns + taskins = message_to_taskins(message) + # Instert in state + task_id = state.store_task_ins(taskins) + if task_id: + # Add to UUID set + task_ids.add(task_id) + # Store expected output for check later on + expected_results[task_id] = mult_factor * pi + + return expected_results + + +# pylint: disable=too-many-arguments def start_and_shutdown( backend: str = "ray", + clientapp_module: str = "vce_api_test:client_app", num_supernodes: Optional[int] = None, state_factory: Optional[StateFactory] = None, nodes_mapping: Optional[NodeToPartitionMapping] = None, duration: int = 10, + backend_config: str = "{}", ) -> None: """Start Simulation Engine and terminate after specified number of seconds.""" f_stop = asyncio.Event() @@ -90,9 +143,9 @@ def start_and_shutdown( start_vce( num_supernodes=num_supernodes, - client_app_module_name="vce_api_test:client_app", + client_app_module_name=clientapp_module, backend_name=backend, - backend_config_json_stream="{}", # an empty json stream (an empty config) + backend_config_json_stream=backend_config, state_factory=state_factory, working_dir="", f_stop=f_stop, @@ -108,9 +161,43 @@ def start_and_shutdown( class AsyncTestFleetSimulationEngineRayBackend(IsolatedAsyncioTestCase): """A basic class that enables testing asyncio functionalities.""" - def test_start_and_shutdown(self) -> None: - """Start Simulation Engine Fleet and terminate it.""" - start_and_shutdown(num_supernodes=50) + def test_erroneous_no_supernodes_client_mapping(self) -> None: + """Test with unset arguments.""" + with self.assertRaises(ValueError): + 
start_and_shutdown() + + # def test_erroneous_clientapp_module_name(self) -> None: + # """Tests attempt to load a ClientApp that can't be found.""" + # from flwr.client.clientapp import LoadClientAppError + # num_messages = 7 + # num_nodes = 59 + + # # Register a state and a run_id in it + # run_id = 1234 + # state_factory = StateFactory(":flwr-in-memory-state:") + + # # Register a few nodes + # nodes_mapping = _register_nodes( + # num_nodes=num_nodes, state_factory=state_factory + # ) + + # _ = register_messages_into_state( + # state_factory=state_factory, + # nodes_mapping=nodes_mapping, + # run_id=run_id, + # num_messages=num_messages, + # ) + # with self.assertRaises(LoadClientAppError): + # start_and_shutdown( + # clientapp_module="totally_fictitious_app:client", + # state_factory=state_factory, + # nodes_mapping=nodes_mapping, + # ) + + def test_erroneous_backend_config(self) -> None: + """Backend Config should be a JSON stream.""" + with self.assertRaises(JSONDecodeError): + start_and_shutdown(num_supernodes=50, backend_config="not a proper config") def test_with_nonexistent_backend(self) -> None: """Test specifying a backend that does not exist.""" @@ -122,8 +209,8 @@ def test_erroneous_arguments_num_supernodes_and_existing_mapping(self) -> None: Passing `num_supernodes` does nothing since we assume that if a node mapping is supplied, nodes have been registered externally already. Therefore passing - `num_supernodes` might give the impression that that many nodes will be registered. - We don't do that since a mapping already exists. + `num_supernodes` might give the impression that that many nodes will be + registered. We don't do that since a mapping already exists. 
""" with self.assertRaises(ValueError): start_and_shutdown(num_supernodes=50, nodes_mapping={0: 1}) @@ -137,6 +224,10 @@ def test_erroneous_arguments_existing_mapping_but_no_state_factory(self) -> None with self.assertRaises(ValueError): start_and_shutdown(nodes_mapping={0: 1}) + def test_start_and_shutdown(self) -> None: + """Start Simulation Engine Fleet and terminate it.""" + start_and_shutdown(num_supernodes=50) + # pylint: disable=too-many-locals def test_start_and_shutdown_with_tasks_in_state(self) -> None: """Run Simulation Engine with some TasksIns in State. @@ -151,54 +242,25 @@ def test_start_and_shutdown_with_tasks_in_state(self) -> None: # Register a state and a run_id in it run_id = 1234 state_factory = StateFactory(":flwr-in-memory-state:") - state: InMemoryState = state_factory.state() # type: ignore - state.run_ids.add(run_id) # Register a few nodes nodes_mapping = _register_nodes( num_nodes=num_nodes, state_factory=state_factory ) - # Artificially add TaskIns to state so they can be processed - # by the Simulation Engine logic - nodes_cycle = cycle( - nodes_mapping.keys() - ) # we have more messages than supernodes - task_ids: Set[UUID] = set() # so we can retrieve them later - expected_results = {} - for i in range(num_messages): - dst_node_id = next(nodes_cycle) - # Construct a Message - mult_factor = 2024 + i - getproperties_ins = GetPropertiesIns(config={"factor": mult_factor}) - recordset = getpropertiesins_to_recordset(getproperties_ins) - message = Message( - content=recordset, - metadata=Metadata( - run_id=run_id, - message_id="", - group_id="", - src_node_id=0, - dst_node_id=dst_node_id, # indicate destination node - reply_to_message="", - ttl="", - message_type=MESSAGE_TYPE_GET_PROPERTIES, - ), - ) - # Convert Message to TaskIns - taskins = message_to_taskins(message) - # Instert in state - task_id = state.store_task_ins(taskins) - if task_id: - # Add to UUID set - task_ids.add(task_id) - # Store expected output for check later on - 
expected_results[task_id] = mult_factor * pi + expected_results = register_messages_into_state( + state_factory=state_factory, + nodes_mapping=nodes_mapping, + run_id=run_id, + num_messages=num_messages, + ) # Run start_and_shutdown(state_factory=state_factory, nodes_mapping=nodes_mapping) # Get all TaskRes + state = state_factory.state() + task_ids = set(expected_results.keys()) task_res_list = state.get_task_res(task_ids=task_ids, limit=len(task_ids)) # Check results by first converting to Message From 5cd047eaf5a26b7c93b34815f47aacb4311992c3 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 11:42:59 +0000 Subject: [PATCH 055/103] minor update --- .../flwr/server/superlink/fleet/vce/backend/raybackend.py | 2 +- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 5c81501d62d..7494ea7c285 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -138,7 +138,7 @@ async def process_message( Return output message and updated context. 
""" - node_id = message.metadata.dst_node_id + node_id = message.metadata.partition_id # Submite a task to the pool future = await self.pool.submit( diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index c7f94a4c554..79d41a3de77 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -21,7 +21,7 @@ from logging import DEBUG, ERROR, INFO from typing import Callable, Dict, Optional -from flwr.client.clientapp import ClientApp, LoadClientAppError, load_client_app +from flwr.client.clientapp import ClientApp, load_client_app, LoadClientAppError from flwr.client.node_state import NodeState from flwr.common.logger import log from flwr.common.serde import message_from_taskins, message_to_taskres @@ -89,12 +89,12 @@ async def worker( except asyncio.CancelledError as e: log(DEBUG, "Async worker: %s", e) break - + except LoadClientAppError as app_ex: log(ERROR, "Async worker: %s", app_ex) log(ERROR, traceback.format_exc()) raise - + except Exception as ex: # pylint: disable=broad-exception-caught log(ERROR, ex) log(ERROR, traceback.format_exc()) From 4be09c29d4b6f15a968d0ce8d04b557d5812a533 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 11:44:51 +0000 Subject: [PATCH 056/103] handle loading of non-existing ClientApp --- .../superlink/fleet/vce/backend/raybackend.py | 42 ++++++--- .../fleet/vce/backend/raybackend_test.py | 85 ++++++++++++------- .../simulation/ray_transport/ray_actor.py | 5 +- 3 files changed, 87 insertions(+), 45 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index b29d76b239e..709680bdba0 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -15,10 +15,12 @@ """Ray backend for the Fleet API using the Simulation Engine.""" 
import pathlib -from logging import INFO +from logging import ERROR, INFO from typing import Callable, Dict, List, Tuple, Union -from flwr.client.clientapp import ClientApp +import ray + +from flwr.client.clientapp import ClientApp, LoadClientAppError from flwr.common.context import Context from flwr.common.logger import log from flwr.common.message import Message @@ -138,22 +140,34 @@ async def process_message( """ node_id = message.metadata.dst_node_id - # Submite a task to the pool - future = await self.pool.submit( - lambda a, a_fn, mssg, cid, state: a.run.remote(a_fn, mssg, cid, state), - (app, message, str(node_id), context), - ) + try: + # Submite a task to the pool + future = await self.pool.submit( + lambda a, a_fn, mssg, cid, state: a.run.remote(a_fn, mssg, cid, state), + (app, message, str(node_id), context), + ) - await future + await future - # Fetch result - ( - out_mssg, - updated_context, - ) = await self.pool.fetch_result_and_return_actor_to_pool(future) + # Fetch result + ( + out_mssg, + updated_context, + ) = await self.pool.fetch_result_and_return_actor_to_pool(future) - return out_mssg, updated_context + return out_mssg, updated_context + + except LoadClientAppError as load_ex: + log( + ERROR, + "An exception was raised when processing a message. 
Terminating %s", + self.__class__.__name__, + ) + await self.terminate() + raise load_ex async def terminate(self) -> None: """Terminate all actors in actor pool.""" await self.pool.terminate_all_actors() + ray.shutdown() + log(INFO, "Terminated %s", self.__class__.__name__) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index f0cca527ab9..3a9c7cd529b 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -19,10 +19,8 @@ from typing import Callable, Dict, Optional, Tuple, Union from unittest import IsolatedAsyncioTestCase -import ray - from flwr.client import Client, NumPyClient -from flwr.client.clientapp import ClientApp +from flwr.client.clientapp import ClientApp, LoadClientAppError, load_client_app from flwr.common import ( Config, ConfigsRecord, @@ -60,6 +58,14 @@ def _load_app() -> ClientApp: return ClientApp(client_fn=get_dummy_client) +def _load_from_module(client_app_module_name: str) -> Callable[[], ClientApp]: + def _load_app() -> ClientApp: + app: ClientApp = load_client_app(client_app_module_name) + return app + + return _load_app + + async def backend_build_process_and_termination( backend: RayBackend, process_args: Optional[Tuple[Callable[[], ClientApp], Message, Context]] = None, @@ -73,11 +79,38 @@ async def backend_build_process_and_termination( await backend.terminate() - ray.shutdown() - return to_return +def _create_message_and_context() -> Tuple[Message, Context, float]: + + # Construct a Message + mult_factor = 2024 + getproperties_ins = GetPropertiesIns(config={"factor": mult_factor}) + recordset = getpropertiesins_to_recordset(getproperties_ins) + message = Message( + content=recordset, + metadata=Metadata( + run_id=0, + message_id="", + group_id="", + src_node_id=0, + dst_node_id=0, + reply_to_message="", + ttl="", + 
message_type=MESSAGE_TYPE_GET_PROPERTIES, + ), + ) + + # Construct emtpy Context + context = Context(state=RecordSet()) + + # Expected output + expected_output = pi * mult_factor + + return message, context, expected_output + + class AsyncTestRayBackend(IsolatedAsyncioTestCase): """A basic class that allows runnig multliple asyncio tests.""" @@ -88,33 +121,16 @@ def test_backend_creation_and_termination(self) -> None: backend_build_process_and_termination(backend=backend, process_args=None) ) - def test_backend_creation_submit_and_termination(self) -> None: - """Test submit.""" + def test_backend_creation_submit_and_termination( + self, client_app_loader: Callable[[], ClientApp] = _load_app + ) -> None: + """Test submitting a message to a given ClientApp.""" backend = RayBackend(backend_config={}, work_dir="") # Define ClientApp - client_app_callable = _load_app - - # Construct a Message - mult_factor = 2024 - getproperties_ins = GetPropertiesIns(config={"factor": mult_factor}) - recordset = getpropertiesins_to_recordset(getproperties_ins) - message = Message( - content=recordset, - metadata=Metadata( - run_id=0, - message_id="", - group_id="", - src_node_id=0, - dst_node_id=0, - reply_to_message="", - ttl="", - message_type=MESSAGE_TYPE_GET_PROPERTIES, - ), - ) + client_app_callable = client_app_loader - # Construct emtpy Context - context = Context(state=RecordSet()) + message, context, expected_output = _create_message_and_context() res = asyncio.run( backend_build_process_and_termination( @@ -131,11 +147,20 @@ def test_backend_creation_submit_and_termination(self) -> None: content = out_mssg.content assert ( content.configs_records["getpropertiesres.properties"]["result"] - == pi * mult_factor + == expected_output ) # Verify context is correct obtained_result_in_context = updated_context.state.configs_records["result"][ "result" ] - assert obtained_result_in_context == pi * mult_factor + assert obtained_result_in_context == expected_output + + def 
test_backend_creation_submit_and_termination_non_existent_client_app( + self, + ) -> None: + """Testing with ClientApp module that does not exist.""" + with self.assertRaises(LoadClientAppError): + self.test_backend_creation_submit_and_termination( + client_app_loader=_load_from_module("a_non_existing_module:app") + ) diff --git a/src/py/flwr/simulation/ray_transport/ray_actor.py b/src/py/flwr/simulation/ray_transport/ray_actor.py index 5ac0b2c2748..482506df94b 100644 --- a/src/py/flwr/simulation/ray_transport/ray_actor.py +++ b/src/py/flwr/simulation/ray_transport/ray_actor.py @@ -25,7 +25,7 @@ from ray import ObjectRef from ray.util.actor_pool import ActorPool -from flwr.client.clientapp import ClientApp +from flwr.client.clientapp import ClientApp, LoadClientAppError from flwr.common import Context, Message from flwr.common.logger import log @@ -67,6 +67,9 @@ def run( # Handle task message out_message = app(message=message, context=context) + except LoadClientAppError as load_ex: + raise load_ex + except Exception as ex: client_trace = traceback.format_exc() mssg = ( From 3c616e9df491c7e64feb72392257f2b314dee104 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 13:25:29 +0000 Subject: [PATCH 057/103] better tests; reorg --- .../superlink/fleet/vce/backend/raybackend.py | 3 + .../fleet/vce/backend/raybackend_test.py | 74 ++++++++++--------- 2 files changed, 41 insertions(+), 36 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 709680bdba0..7f885e2cfa7 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -48,6 +48,9 @@ def __init__( log(INFO, "Initialising: %s", self.__class__.__name__) log(INFO, "Backend config: %s", backend_config) + if not pathlib.Path(work_dir).exists(): + raise ValueError(f"Specified work_dir {work_dir} does not exist.") + # Init ray 
and append working dir if needed runtime_env = ( self._configure_runtime_env(work_dir=work_dir) if work_dir else None diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index 3a9c7cd529b..92ca60db230 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -16,46 +16,17 @@ import asyncio from math import pi -from typing import Callable, Dict, Optional, Tuple, Union +from pathlib import Path +from typing import Callable, Optional, Tuple, Union from unittest import IsolatedAsyncioTestCase -from flwr.client import Client, NumPyClient from flwr.client.clientapp import ClientApp, LoadClientAppError, load_client_app -from flwr.common import ( - Config, - ConfigsRecord, - Context, - GetPropertiesIns, - Message, - Metadata, - RecordSet, - Scalar, -) +from flwr.common import Context, GetPropertiesIns, Message, Metadata, RecordSet from flwr.common.constant import MESSAGE_TYPE_GET_PROPERTIES from flwr.common.recordset_compat import getpropertiesins_to_recordset from .raybackend import RayBackend - - -class DummyClient(NumPyClient): - """A dummy NumPyClient for tests.""" - - def get_properties(self, config: Config) -> Dict[str, Scalar]: - """Return properties by doing a simple calculation.""" - result = float(config["factor"]) * pi - - # store something in context - self.context.state.configs_records["result"] = ConfigsRecord({"result": result}) - return {"result": result} - - -def get_dummy_client(cid: str) -> Client: # pylint: disable=unused-argument - """Return a DummyClient converted to Client type.""" - return DummyClient().to_client() - - -def _load_app() -> ClientApp: - return ClientApp(client_fn=get_dummy_client) +from .test.client import _load_app def _load_from_module(client_app_module_name: str) -> Callable[[], ClientApp]: @@ -122,10 +93,12 @@ def 
test_backend_creation_and_termination(self) -> None: ) def test_backend_creation_submit_and_termination( - self, client_app_loader: Callable[[], ClientApp] = _load_app + self, + client_app_loader: Callable[[], ClientApp] = _load_app, + workdir: str = "", ) -> None: """Test submitting a message to a given ClientApp.""" - backend = RayBackend(backend_config={}, work_dir="") + backend = RayBackend(backend_config={}, work_dir=workdir) # Define ClientApp client_app_callable = client_app_loader @@ -156,7 +129,7 @@ def test_backend_creation_submit_and_termination( ] assert obtained_result_in_context == expected_output - def test_backend_creation_submit_and_termination_non_existent_client_app( + def test_backend_creation_submit_and_termination_non_existing_client_app( self, ) -> None: """Testing with ClientApp module that does not exist.""" @@ -164,3 +137,32 @@ def test_backend_creation_submit_and_termination_non_existent_client_app( self.test_backend_creation_submit_and_termination( client_app_loader=_load_from_module("a_non_existing_module:app") ) + + def test_backend_creation_submit_and_termination_existing_client_app( + self, + ) -> None: + """Testing with ClientApp module that exist.""" + # Resolve what should be the workdir to pass upon Backend initialisation + file_path = Path(__file__) + print(f"{file_path = }") + working_dir = Path.cwd() + print(f"{working_dir = }") + rel_workdir = file_path.relative_to(working_dir) + + # Susbtract lats element and append "test" (to make it point ot .test dir) + rel_workdir_str = str(rel_workdir.parent / "test") + + self.test_backend_creation_submit_and_termination( + client_app_loader=_load_from_module("client:client_app"), + workdir=rel_workdir_str, + ) + + def test_backend_creation_submit_and_termination_existing_client_app_unsetworkdir( + self, + ) -> None: + """Testing with ClientApp module that exist but the passed workdir does not.""" + with self.assertRaises(ValueError): + self.test_backend_creation_submit_and_termination( 
+ client_app_loader=_load_from_module("test.client:client_app"), + workdir="/?&%$^#%@$!", + ) From aed442041ac40c221daae98cc81bdeacb59eeba4 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 14:36:14 +0000 Subject: [PATCH 058/103] update --- .../superlink/fleet/vce/vce_api_test.py | 55 +++++++------------ 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py index c71c33c1a96..26ea5d52905 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -20,21 +20,14 @@ from itertools import cycle from json import JSONDecodeError from math import pi +from pathlib import Path from time import sleep from typing import Dict, Optional, Set from unittest import IsolatedAsyncioTestCase from uuid import UUID -from flwr.client import Client, NumPyClient -from flwr.client.clientapp import ClientApp, LoadClientAppError -from flwr.common import ( - Config, - ConfigsRecord, - GetPropertiesIns, - Message, - Metadata, - Scalar, -) +from flwr.client.clientapp import LoadClientAppError +from flwr.common import GetPropertiesIns, Message, Metadata from flwr.common.constant import MESSAGE_TYPE_GET_PROPERTIES from flwr.common.recordset_compat import getpropertiesins_to_recordset from flwr.common.serde import message_from_taskres, message_to_taskins @@ -46,28 +39,6 @@ from flwr.server.superlink.state import InMemoryState, StateFactory -class DummyClient(NumPyClient): - """A dummy NumPyClient for tests.""" - - def get_properties(self, config: Config) -> Dict[str, Scalar]: - """Return properties by doing a simple calculation.""" - result = float(config["factor"]) * pi - - # store something in context - self.context.state.configs_records["result"] = ConfigsRecord({"result": result}) - return {"result": result} - - -def get_dummy_client(cid: str) -> Client: # pylint: disable=unused-argument - 
"""Return a DummyClient converted to Client type.""" - return DummyClient().to_client() - - -client_app = ClientApp( - client_fn=get_dummy_client, -) - - def terminate_simulation(f_stop: asyncio.Event, sleep_duration: int) -> None: """Set event to terminate Simulation Engine after `sleep_duration` seconds.""" sleep(sleep_duration) @@ -121,10 +92,21 @@ def register_messages_into_state( return expected_results +def _autoresolve_working_dir(rel_client_app_dir: str = "backend/test") -> str: + """Correctly resolve working directory.""" + file_path = Path(__file__) + working_dir = Path.cwd() + rel_workdir = file_path.relative_to(working_dir) + + # Susbtract lats element and append "backend/test" (wher the client module is.) + return str(rel_workdir.parent / rel_client_app_dir) + + # pylint: disable=too-many-arguments def start_and_shutdown( backend: str = "ray", - clientapp_module: str = "vce_api_test:client_app", + clientapp_module: str = "client:client_app", + working_dir: str = "", num_supernodes: Optional[int] = None, state_factory: Optional[StateFactory] = None, nodes_mapping: Optional[NodeToPartitionMapping] = None, @@ -141,13 +123,18 @@ def start_and_shutdown( ) termination_th.start() + # Resolve working directory if not passed + if not working_dir: + working_dir = _autoresolve_working_dir() + print(f"---> {working_dir = }") + start_vce( num_supernodes=num_supernodes, client_app_module_name=clientapp_module, backend_name=backend, backend_config_json_stream=backend_config, state_factory=state_factory, - working_dir="", + working_dir=working_dir, f_stop=f_stop, existing_nodes_mapping=nodes_mapping, ) From 96519dc164fddd18fb93dff90ed4c4f32988cb90 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 14:36:56 +0000 Subject: [PATCH 059/103] w/ previous --- .../fleet/vce/backend/raybackend_test.py | 2 - .../fleet/vce/backend/test/__init__.py | 15 ++++++ .../fleet/vce/backend/test/client.py | 48 +++++++++++++++++++ 3 files changed, 63 insertions(+), 2 
deletions(-) create mode 100644 src/py/flwr/server/superlink/fleet/vce/backend/test/__init__.py create mode 100644 src/py/flwr/server/superlink/fleet/vce/backend/test/client.py diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index 92ca60db230..24dfb0fd120 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -144,9 +144,7 @@ def test_backend_creation_submit_and_termination_existing_client_app( """Testing with ClientApp module that exist.""" # Resolve what should be the workdir to pass upon Backend initialisation file_path = Path(__file__) - print(f"{file_path = }") working_dir = Path.cwd() - print(f"{working_dir = }") rel_workdir = file_path.relative_to(working_dir) # Susbtract lats element and append "test" (to make it point ot .test dir) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/test/__init__.py b/src/py/flwr/server/superlink/fleet/vce/backend/test/__init__.py new file mode 100644 index 00000000000..96bab3a5c6f --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/backend/test/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Components for Simulation Engine tests.""" diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/test/client.py b/src/py/flwr/server/superlink/fleet/vce/backend/test/client.py new file mode 100644 index 00000000000..4d0cdf6e2a7 --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/backend/test/client.py @@ -0,0 +1,48 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""A ClientApp for Backend tests.""" + +from math import pi +from typing import Dict + +from flwr.client import Client, NumPyClient +from flwr.client.clientapp import ClientApp +from flwr.common import Config, ConfigsRecord, Scalar + + +class DummyClient(NumPyClient): + """A dummy NumPyClient for tests.""" + + def get_properties(self, config: Config) -> Dict[str, Scalar]: + """Return properties by doing a simple calculation.""" + result = float(config["factor"]) * pi + + # store something in context + self.context.state.configs_records["result"] = ConfigsRecord({"result": result}) + return {"result": result} + + +def get_dummy_client(cid: str) -> Client: # pylint: disable=unused-argument + """Return a DummyClient converted to Client type.""" + return DummyClient().to_client() + + +def _load_app() -> ClientApp: + return ClientApp(client_fn=get_dummy_client) + + +client_app = ClientApp( + client_fn=get_dummy_client, +) From 1aa3b364e19eeb8f924698b153eddd3efe31a8b8 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 15:02:50 +0000 Subject: [PATCH 060/103] post merge update --- src/py/flwr/simulation/run_simulation.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index e15807adeb3..7e9af626f86 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -48,15 +48,15 @@ def run_simulation() -> None: f_stop = asyncio.Event() superlink_th = threading.Thread( target=start_vce, - args=( - args.num_supernodes, - args.client_app, - args.backend, - args.backend_config, - state_factory, - args.dir, - f_stop, - ), + kwargs={ + "num_supernodes": args.num_supernodes, + "client_app_module_name": args.client_app, + "backend_name": args.backend, + "backend_config_json_stream": args.backend_config, + "working_dir": args.dir, + 
"state_factory": state_factory, + "f_stop": f_stop, + }, daemon=False, ) From 21f03a93fcbca2f9c6cde9f5368ae1292389bcb6 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 15:05:06 +0000 Subject: [PATCH 061/103] fix --- src/py/flwr/cli/new/new_test.py | 34 +++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/src/py/flwr/cli/new/new_test.py b/src/py/flwr/cli/new/new_test.py index 39717bc67ab..7a4832013b0 100644 --- a/src/py/flwr/cli/new/new_test.py +++ b/src/py/flwr/cli/new/new_test.py @@ -77,17 +77,23 @@ def test_new(tmp_path: str) -> None: "client.py", } - # Change into the temprorary directory - os.chdir(tmp_path) - - # Execute - new(project_name=project_name, framework=framework) - - # Assert - file_list = os.listdir(os.path.join(tmp_path, project_name.lower())) - assert set(file_list) == expected_files_top_level - - file_list = os.listdir( - os.path.join(tmp_path, project_name.lower(), project_name.lower()) - ) - assert set(file_list) == expected_files_module + # Current directory + origin = os.getcwd() + + try: + # Change into the temprorary directory + os.chdir(tmp_path) + + # Execute + new(project_name=project_name, framework=framework) + + # Assert + file_list = os.listdir(os.path.join(tmp_path, project_name.lower())) + assert set(file_list) == expected_files_top_level + + file_list = os.listdir( + os.path.join(tmp_path, project_name.lower(), project_name.lower()) + ) + assert set(file_list) == expected_files_module + finally: + os.chdir(origin) From 4d8ee734c3f99a4b05fb13941237d437a46286ff Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 15:57:55 +0000 Subject: [PATCH 062/103] minor tweak --- src/py/flwr/simulation/run_simulation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 7e9af626f86..ee886877c09 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ 
b/src/py/flwr/simulation/run_simulation.py @@ -38,8 +38,9 @@ def run_simulation() -> None: state_factory = StateFactory(":flwr-in-memory-state:") # Start Driver API + driver_address = "0.0.0.0:9098" driver_server: grpc.Server = _run_driver_api_grpc( - address="0.0.0.0:9091", + address=driver_address, state_factory=state_factory, certificates=None, ) @@ -65,7 +66,7 @@ def run_simulation() -> None: # Initialize Driver driver = Driver( - driver_service_address="0.0.0.0:9091", + driver_service_address=driver_address, root_certificates=None, ) From 1ac8e2b99708927cf30e3ebcf08fba1b84140a13 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 16:43:21 +0000 Subject: [PATCH 063/103] runs passing server-app; client-app modules --- examples/simulation-pytorch/sim.py | 52 ++++++++++--------- src/py/flwr/server/run_serverapp.py | 14 ++++- .../server/superlink/fleet/vce/vce_api.py | 13 ++++- src/py/flwr/simulation/run_simulation.py | 43 +++++++++++---- 4 files changed, 85 insertions(+), 37 deletions(-) diff --git a/examples/simulation-pytorch/sim.py b/examples/simulation-pytorch/sim.py index 84a00e3f092..9bad47bb9b0 100644 --- a/examples/simulation-pytorch/sim.py +++ b/examples/simulation-pytorch/sim.py @@ -193,28 +193,30 @@ def evaluate( ) -def main(): - # Parse input arguments - args = parser.parse_args() - - # Resources to be assigned to each virtual client - client_resources = { - "num_cpus": args.num_cpus, - "num_gpus": args.num_gpus, - } - - # Start simulation - fl.simulation.start_simulation( - client_fn=get_client_fn(mnist_fds), - num_clients=NUM_CLIENTS, - client_resources=client_resources, - config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), - strategy=strategy, - actor_kwargs={ - "on_actor_init_fn": disable_progress_bar # disable tqdm on each actor/process spawning virtual clients - }, - ) - - -if __name__ == "__main__": - main() +fl.simulation.run_simulation(server_app=server_app, client_app=client_app, num_supernodes=NUM_CLIENTS) + +# def main(): +# # 
Parse input arguments +# args = parser.parse_args() + +# # Resources to be assigned to each virtual client +# client_resources = { +# "num_cpus": args.num_cpus, +# "num_gpus": args.num_gpus, +# } + +# # Start simulation +# fl.simulation.start_simulation( +# client_fn=get_client_fn(mnist_fds), +# num_clients=NUM_CLIENTS, +# client_resources=client_resources, +# config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), +# strategy=strategy, +# actor_kwargs={ +# "on_actor_init_fn": disable_progress_bar # disable tqdm on each actor/process spawning virtual clients +# }, +# ) + + +# if __name__ == "__main__": +# main() diff --git a/src/py/flwr/server/run_serverapp.py b/src/py/flwr/server/run_serverapp.py index e7205ebd144..c6710918448 100644 --- a/src/py/flwr/server/run_serverapp.py +++ b/src/py/flwr/server/run_serverapp.py @@ -19,6 +19,7 @@ import sys from logging import DEBUG, WARN from pathlib import Path +from typing import Optional from flwr.common import Context, EventType, RecordSet, event from flwr.common.logger import log @@ -27,13 +28,22 @@ from .server_app import ServerApp, load_server_app -def run(server_app_attr: str, driver: Driver, server_app_dir: str) -> None: +def run( + server_app_attr: str, + driver: Driver, + server_app_dir: str, + loaded_server_app: Optional[ServerApp] = None, +) -> None: """Run ServerApp with a given Driver.""" if server_app_dir is not None: sys.path.insert(0, server_app_dir) def _load() -> ServerApp: - server_app: ServerApp = load_server_app(server_app_attr) + server_app: ServerApp = ( + load_server_app(server_app_attr) + if loaded_server_app is None + else loaded_server_app + ) return server_app server_app = _load() diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index c7f94a4c554..11eeab542f1 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -167,11 +167,18 @@ def start_vce( 
backend_config_json_stream: str, working_dir: str, f_stop: asyncio.Event, + client_app: Optional[ClientApp] = None, num_supernodes: Optional[int] = None, state_factory: Optional[StateFactory] = None, existing_nodes_mapping: Optional[NodeToPartitionMapping] = None, ) -> None: """Start Fleet API with the Simulation Engine.""" + if client_app_module_name is not None and client_app is not None: + raise ValueError( + "Both `client_app_module_name` and `client_app` are provided, " + "but only one is allowed." + ) + if num_supernodes is not None and existing_nodes_mapping is not None: raise ValueError( "Both `num_supernodes` and `existing_nodes_mapping` are provided, " @@ -234,7 +241,11 @@ def start_vce( log(INFO, "client_app_module_name = %s", client_app_module_name) def _load() -> ClientApp: - app: ClientApp = load_client_app(client_app_module_name) + app: ClientApp = ( + load_client_app(client_app_module_name) + if client_app is None + else client_app + ) return app app = _load diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index ee886877c09..8b5eb50e8af 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -17,23 +17,47 @@ import argparse import asyncio import threading +from typing import Optional import grpc +from flwr.client import ClientApp from flwr.common import EventType, event from flwr.server.driver.driver import Driver from flwr.server.run_serverapp import run +from flwr.server.server_app import ServerApp from flwr.server.superlink.state import StateFactory -def run_simulation() -> None: +def run_from_cli() -> None: + """.""" + args = _parse_args_run_simulation().parse_args() + + run_simulation( + num_supernodes=args.num_supernodes, + client_app_module_name=args.client_app, + backend_name=args.backend, + backend_config_json_stream=args.backend_config, + working_dir=args.dir, + server_app_module_name=args.server_app, + ) + + +def run_simulation( + 
num_supernodes: int, + server_app: Optional[ServerApp] = None, + client_app: Optional[ClientApp] = None, + backend_name: str = "ray", + backend_config: str = "{}", + client_app_module_name: Optional[str] = None, + server_app_module_name: Optional[str] = None, + working_dir: str = "", +) -> None: """.""" # TODO: below create circular imports from flwr.server.app import _register_exit_handlers, _run_driver_api_grpc from flwr.server.superlink.fleet.vce import start_vce - args = _parse_args_run_simulation().parse_args() - # Initialize StateFactory state_factory = StateFactory(":flwr-in-memory-state:") @@ -50,11 +74,12 @@ def run_simulation() -> None: superlink_th = threading.Thread( target=start_vce, kwargs={ - "num_supernodes": args.num_supernodes, - "client_app_module_name": args.client_app, - "backend_name": args.backend, - "backend_config_json_stream": args.backend_config, - "working_dir": args.dir, + "num_supernodes": num_supernodes, + "client_app_module_name": client_app_module_name, + "client_app": client_app, + "backend_name": backend_name, + "backend_config_json_stream": backend_config, + "working_dir": working_dir, "state_factory": state_factory, "f_stop": f_stop, }, @@ -71,7 +96,7 @@ def run_simulation() -> None: ) # Launch server app - run(args.server_app, driver, args.dir) + run(server_app_module_name, driver, working_dir, loaded_server_app=server_app) del driver From c36660427469850d43eeb2f88c48f2798703bd51 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 16:45:11 +0000 Subject: [PATCH 064/103] w/ previous --- src/py/flwr/simulation/__init__.py | 4 ++-- src/py/flwr/simulation/run_simulation.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/py/flwr/simulation/__init__.py b/src/py/flwr/simulation/__init__.py index b283de70c58..764127465cb 100644 --- a/src/py/flwr/simulation/__init__.py +++ b/src/py/flwr/simulation/__init__.py @@ -17,7 +17,7 @@ import importlib -from flwr.simulation.run_simulation import 
run_simulation +from flwr.simulation.run_simulation import run_simulation_from_cli is_ray_installed = importlib.util.find_spec("ray") is not None @@ -38,5 +38,5 @@ def start_simulation(*args, **kwargs): # type: ignore __all__ = [ "start_simulation", - "run_simulation", + "run_simulation_from_cli", ] diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 8b5eb50e8af..319c85e7798 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -29,7 +29,7 @@ from flwr.server.superlink.state import StateFactory -def run_from_cli() -> None: +def run_simulation_from_cli() -> None: """.""" args = _parse_args_run_simulation().parse_args() @@ -37,7 +37,7 @@ def run_from_cli() -> None: num_supernodes=args.num_supernodes, client_app_module_name=args.client_app, backend_name=args.backend, - backend_config_json_stream=args.backend_config, + backend_config=args.backend_config, working_dir=args.dir, server_app_module_name=args.server_app, ) From a62a0d13f39f1a05a32e03e92bf577967896d5f1 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 16:55:51 +0000 Subject: [PATCH 065/103] wip --- examples/simulation-pytorch/sim.ipynb | 51 ++++++++++++++---------- pyproject.toml | 2 +- src/py/flwr/simulation/__init__.py | 3 +- src/py/flwr/simulation/run_simulation.py | 2 +- 4 files changed, 35 insertions(+), 23 deletions(-) diff --git a/examples/simulation-pytorch/sim.ipynb b/examples/simulation-pytorch/sim.ipynb index 93a79d2f0e0..b2c2d9c0580 100644 --- a/examples/simulation-pytorch/sim.ipynb +++ b/examples/simulation-pytorch/sim.ipynb @@ -511,10 +511,7 @@ " # Create and return client\n", " return FlowerClient(trainloader, valloader).to_client()\n", "\n", - " return client_fn\n", - "\n", - "\n", - "client_fn_callback = get_client_fn(mnist_fds)" + " return client_fn" ] }, { @@ -536,25 +533,27 @@ }, "outputs": [], "source": [ - "# With a dictionary, you tell Flower's VirtualClientEngine that 
each\n", - "# client needs exclusive access to these many resources in order to run\n", - "client_resources = {\"num_cpus\": 1, \"num_gpus\": 0.0}\n", - "\n", - "# Let's disable tqdm progress bar in the main thread (used by the server)\n", - "disable_progress_bar()\n", - "\n", - "history = fl.simulation.start_simulation(\n", - " client_fn=client_fn_callback, # a callback to construct a client\n", - " num_clients=NUM_CLIENTS, # total number of clients in the experiment\n", - " config=fl.server.ServerConfig(num_rounds=10), # let's run for 10 rounds\n", - " strategy=strategy, # the strategy that will orchestrate the whole FL pipeline\n", - " client_resources=client_resources,\n", - " actor_kwargs={\n", - " \"on_actor_init_fn\": disable_progress_bar # disable tqdm on each actor/process spawning virtual clients\n", - " },\n", + "# ClientApp for Flower-Next\n", + "client_app = fl.client.ClientApp(\n", + " client_fn=get_client_fn(mnist_fds),\n", + ")\n", + "\n", + "# ServerApp for Flower-Next\n", + "server_app = fl.server.ServerApp(\n", + " config=fl.server.ServerConfig(num_rounds=10),\n", + " strategy=strategy,\n", ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fl.simulation.run_simulation(server_app=server_app, client_app=client_app, num_supernodes=NUM_CLIENTS)" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -622,6 +621,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index 743670c6419..b45f960063d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,7 @@ flower-fleet-api = "flwr.server:run_fleet_api" flower-superlink = 
"flwr.server:run_superlink" flower-client-app = "flwr.client:run_client_app" flower-server-app = "flwr.server:run_server_app" -flower-simulation = "flwr.simulation:run_simulation" +flower-simulation = "flwr.simulation:run_simulation_from_cli" [tool.poetry.dependencies] python = "^3.8" diff --git a/src/py/flwr/simulation/__init__.py b/src/py/flwr/simulation/__init__.py index 764127465cb..af87232f15d 100644 --- a/src/py/flwr/simulation/__init__.py +++ b/src/py/flwr/simulation/__init__.py @@ -17,7 +17,7 @@ import importlib -from flwr.simulation.run_simulation import run_simulation_from_cli +from flwr.simulation.run_simulation import run_simulation_from_cli, run_simulation is_ray_installed = importlib.util.find_spec("ray") is not None @@ -39,4 +39,5 @@ def start_simulation(*args, **kwargs): # type: ignore __all__ = [ "start_simulation", "run_simulation_from_cli", + "run_simulation" ] diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 319c85e7798..6568e05e5d4 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -62,7 +62,7 @@ def run_simulation( state_factory = StateFactory(":flwr-in-memory-state:") # Start Driver API - driver_address = "0.0.0.0:9098" + driver_address = "0.0.0.0:9091" driver_server: grpc.Server = _run_driver_api_grpc( address=driver_address, state_factory=state_factory, From c45c4afabd519ff588013b914e796270b0cdeef9 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 19:18:52 +0000 Subject: [PATCH 066/103] no need for separate test/ dir --- .../fleet/vce/backend/raybackend_test.py | 50 ++++++++++++++++--- .../fleet/vce/backend/test/__init__.py | 15 ------ .../fleet/vce/backend/test/client.py | 48 ------------------ 3 files changed, 42 insertions(+), 71 deletions(-) delete mode 100644 src/py/flwr/server/superlink/fleet/vce/backend/test/__init__.py delete mode 100644 src/py/flwr/server/superlink/fleet/vce/backend/test/client.py diff --git 
a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index 24dfb0fd120..8ac9df35d45 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -17,16 +17,50 @@ import asyncio from math import pi from pathlib import Path -from typing import Callable, Optional, Tuple, Union +from typing import Callable, Dict, Optional, Tuple, Union from unittest import IsolatedAsyncioTestCase +from flwr.client import Client, NumPyClient from flwr.client.clientapp import ClientApp, LoadClientAppError, load_client_app -from flwr.common import Context, GetPropertiesIns, Message, Metadata, RecordSet +from flwr.common import ( + Config, + ConfigsRecord, + Context, + GetPropertiesIns, + Message, + Metadata, + RecordSet, + Scalar, +) from flwr.common.constant import MESSAGE_TYPE_GET_PROPERTIES from flwr.common.recordset_compat import getpropertiesins_to_recordset +from flwr.server.superlink.fleet.vce.backend.raybackend import RayBackend -from .raybackend import RayBackend -from .test.client import _load_app + +class DummyClient(NumPyClient): + """A dummy NumPyClient for tests.""" + + def get_properties(self, config: Config) -> Dict[str, Scalar]: + """Return properties by doing a simple calculation.""" + result = float(config["factor"]) * pi + + # store something in context + self.context.state.configs_records["result"] = ConfigsRecord({"result": result}) + return {"result": result} + + +def get_dummy_client(cid: str) -> Client: # pylint: disable=unused-argument + """Return a DummyClient converted to Client type.""" + return DummyClient().to_client() + + +def _load_app() -> ClientApp: + return ClientApp(client_fn=get_dummy_client) + + +client_app = ClientApp( + client_fn=get_dummy_client, +) def _load_from_module(client_app_module_name: str) -> Callable[[], ClientApp]: @@ -147,11 +181,11 @@ def 
test_backend_creation_submit_and_termination_existing_client_app( working_dir = Path.cwd() rel_workdir = file_path.relative_to(working_dir) - # Susbtract lats element and append "test" (to make it point ot .test dir) - rel_workdir_str = str(rel_workdir.parent / "test") + # Susbtract last element + rel_workdir_str = str(rel_workdir.parent) self.test_backend_creation_submit_and_termination( - client_app_loader=_load_from_module("client:client_app"), + client_app_loader=_load_from_module("raybackend_test:client_app"), workdir=rel_workdir_str, ) @@ -161,6 +195,6 @@ def test_backend_creation_submit_and_termination_existing_client_app_unsetworkdi """Testing with ClientApp module that exist but the passed workdir does not.""" with self.assertRaises(ValueError): self.test_backend_creation_submit_and_termination( - client_app_loader=_load_from_module("test.client:client_app"), + client_app_loader=_load_from_module("raybackend_test:client_app"), workdir="/?&%$^#%@$!", ) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/test/__init__.py b/src/py/flwr/server/superlink/fleet/vce/backend/test/__init__.py deleted file mode 100644 index 96bab3a5c6f..00000000000 --- a/src/py/flwr/server/superlink/fleet/vce/backend/test/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright 2024 Flower Labs GmbH. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Components for Simulation Engine tests.""" diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/test/client.py b/src/py/flwr/server/superlink/fleet/vce/backend/test/client.py deleted file mode 100644 index 4d0cdf6e2a7..00000000000 --- a/src/py/flwr/server/superlink/fleet/vce/backend/test/client.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2024 Flower Labs GmbH. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""A ClientApp for Backend tests.""" - -from math import pi -from typing import Dict - -from flwr.client import Client, NumPyClient -from flwr.client.clientapp import ClientApp -from flwr.common import Config, ConfigsRecord, Scalar - - -class DummyClient(NumPyClient): - """A dummy NumPyClient for tests.""" - - def get_properties(self, config: Config) -> Dict[str, Scalar]: - """Return properties by doing a simple calculation.""" - result = float(config["factor"]) * pi - - # store something in context - self.context.state.configs_records["result"] = ConfigsRecord({"result": result}) - return {"result": result} - - -def get_dummy_client(cid: str) -> Client: # pylint: disable=unused-argument - """Return a DummyClient converted to Client type.""" - return DummyClient().to_client() - - -def _load_app() -> ClientApp: - return ClientApp(client_fn=get_dummy_client) - - -client_app = ClientApp( - client_fn=get_dummy_client, -) From c9492f067d4acc507748c78a03e08b2bf99cf143 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 19:36:31 +0000 Subject: [PATCH 067/103] update --- src/py/flwr/server/superlink/fleet/vce/vce_api_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py index 26ea5d52905..d345cf7bb7e 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -92,7 +92,7 @@ def register_messages_into_state( return expected_results -def _autoresolve_working_dir(rel_client_app_dir: str = "backend/test") -> str: +def _autoresolve_working_dir(rel_client_app_dir: str = "backend") -> str: """Correctly resolve working directory.""" file_path = Path(__file__) working_dir = Path.cwd() @@ -105,7 +105,7 @@ def _autoresolve_working_dir(rel_client_app_dir: str = "backend/test") -> 
str: # pylint: disable=too-many-arguments def start_and_shutdown( backend: str = "ray", - clientapp_module: str = "client:client_app", + clientapp_module: str = "raybackend_test:client_app", working_dir: str = "", num_supernodes: Optional[int] = None, state_factory: Optional[StateFactory] = None, @@ -126,7 +126,6 @@ def start_and_shutdown( # Resolve working directory if not passed if not working_dir: working_dir = _autoresolve_working_dir() - print(f"---> {working_dir = }") start_vce( num_supernodes=num_supernodes, @@ -220,7 +219,8 @@ def test_start_and_shutdown_with_tasks_in_state(self) -> None: This test creates a few nodes and submits a few messages that need to be executed by the Backend. In order for that to happen the asyncio - producer/consumer logic must function. + producer/consumer logic must function. This also severs to evaluate + a valid ClientApp. """ num_messages = 113 num_nodes = 59 From 82878f6df700de1030d0e6285c88b7e53bb46b23 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 19:59:20 +0000 Subject: [PATCH 068/103] updates --- src/py/flwr/simulation/run_simulation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index ee886877c09..6006fa57c82 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -21,15 +21,17 @@ import grpc from flwr.common import EventType, event +from flwr.common.exit_handlers import register_exit_handlers from flwr.server.driver.driver import Driver from flwr.server.run_serverapp import run + from flwr.server.superlink.state import StateFactory def run_simulation() -> None: """.""" # TODO: below create circular imports - from flwr.server.app import _register_exit_handlers, _run_driver_api_grpc + from flwr.server.app import _run_driver_api_grpc from flwr.server.superlink.fleet.vce import start_vce args = _parse_args_run_simulation().parse_args() @@ -78,7 
+80,7 @@ def run_simulation() -> None: # Trigger stop event f_stop.set() - _register_exit_handlers( + register_exit_handlers( grpc_servers=[driver_server], bckg_threads=[superlink_th], event_type=EventType.RUN_SUPERLINK_LEAVE, From b3d397b731efc6c0d629814fa261c8e506445a47 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 22:06:04 +0000 Subject: [PATCH 069/103] better handling of exceptions in vce's ; adjust test for --- .../superlink/fleet/vce/backend/raybackend.py | 3 +- .../fleet/vce/backend/raybackend_test.py | 9 ++ .../server/superlink/fleet/vce/vce_api.py | 100 +++++++++++++----- .../superlink/fleet/vce/vce_api_test.py | 9 +- 4 files changed, 89 insertions(+), 32 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 7eb21e3b20d..06a6fc72975 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -163,10 +163,9 @@ async def process_message( except LoadClientAppError as load_ex: log( ERROR, - "An exception was raised when processing a message. 
Terminating %s", + "An exception was raised when processing a message by %s", self.__class__.__name__, ) - await self.terminate() raise load_ex async def terminate(self) -> None: diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index fd246b5fc2a..e14c466e7b8 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -20,6 +20,8 @@ from typing import Callable, Dict, Optional, Tuple, Union from unittest import IsolatedAsyncioTestCase +import ray + from flwr.client import Client, NumPyClient from flwr.client.client_app import ClientApp, LoadClientAppError, load_client_app from flwr.common import ( @@ -119,6 +121,11 @@ def _create_message_and_context() -> Tuple[Message, Context, float]: class AsyncTestRayBackend(IsolatedAsyncioTestCase): """A basic class that allows runnig multliple asyncio tests.""" + async def on_cleanup(self) -> None: + """Ensure Ray has shutdown.""" + if ray.is_initialized(): + ray.shutdown() + def test_backend_creation_and_termination(self) -> None: """Test creation of RayBackend and its termination.""" backend = RayBackend(backend_config={}, work_dir="") @@ -171,6 +178,7 @@ def test_backend_creation_submit_and_termination_non_existing_client_app( self.test_backend_creation_submit_and_termination( client_app_loader=_load_from_module("a_non_existing_module:app") ) + self.addAsyncCleanup(self.on_cleanup) def test_backend_creation_submit_and_termination_existing_client_app( self, @@ -198,3 +206,4 @@ def test_backend_creation_submit_and_termination_existing_client_app_unsetworkdi client_app_loader=_load_from_module("raybackend_test:client_app"), workdir="/?&%$^#%@$!", ) + self.addAsyncCleanup(self.on_cleanup) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 82dda285158..761712875cc 100644 
--- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -18,8 +18,8 @@ import asyncio import json import traceback -from logging import DEBUG, ERROR, INFO -from typing import Callable, Dict, Optional +from logging import DEBUG, ERROR, INFO, WARN +from typing import Callable, Dict, List, Optional from flwr.client.client_app import ClientApp, LoadClientAppError, load_client_app from flwr.client.node_state import NodeState @@ -101,21 +101,50 @@ async def worker( break -async def generate_pull_requests( +async def add_taskins_to_queue( queue: "asyncio.Queue[TaskIns]", state_factory: StateFactory, nodes_mapping: NodeToPartitionMapping, + backend: Backend, + consumers: List["asyncio.Task[None]"], f_stop: asyncio.Event, ) -> None: """Retrieve TaskIns and add it to the queue.""" state = state_factory.state() + num_initial_consumers = len(consumers) while not f_stop.is_set(): for node_id in nodes_mapping.keys(): task_ins = state.get_task_ins(node_id=node_id, limit=1) if task_ins: await queue.put(task_ins[0]) - log(DEBUG, "TaskIns in queue: %i", queue.qsize()) + # Count consumers that are running + num_active = sum(not (cc.done()) for cc in consumers) + + # Alert if number of consumers decreased by half + if num_active < num_initial_consumers // 2: + log( + WARN, + "Number of active workers has more than halved: (%i/%i active)", + num_active, + num_initial_consumers, + ) + + # Break if consumers died + if num_active == 0: + raise RuntimeError("All workers have died. 
Ending Simulation.") + + # Log some stats + log( + DEBUG, + "Simulation Engine stats: " + "(Active workers: (%i/%i) | %s (%i workers) | Tasks in queue: %i)", + num_active, + num_initial_consumers, + backend.__class__.__name__, + backend.num_workers, + queue.qsize(), + ) await asyncio.sleep(1.0) log(DEBUG, "Async producer: Stopped pulling from StateFactory.") @@ -132,32 +161,55 @@ async def run( # pylint: disable=fixme queue: "asyncio.Queue[TaskIns]" = asyncio.Queue(128) - # Build backend - await backend.build() - worker_tasks = [ - asyncio.create_task( - worker(app, queue, node_states, state_factory, nodes_mapping, backend) + try: + # Build backend + await backend.build() + + # Add workers (they submit Messages to Backend) + worker_tasks = [ + asyncio.create_task( + worker(app, queue, node_states, state_factory, nodes_mapping, backend) + ) + for _ in range(backend.num_workers) + ] + # Create producer (adds TaskIns into Queue) + producer = asyncio.create_task( + add_taskins_to_queue( + queue, state_factory, nodes_mapping, backend, worker_tasks, f_stop + ) ) - for _ in range(backend.num_workers) - ] - producer = asyncio.create_task( - generate_pull_requests(queue, state_factory, nodes_mapping, f_stop) - ) - await asyncio.gather(producer) + # Wait for producer to finish + # The producer runs forever until f_stop is set or until + # all worker (consumer) coroutines are completed. Workers + # also run forever and only end if an exception is raised. + await asyncio.gather(producer) + + except Exception as ex: + + log(ERROR, "An exception occured!! 
%s", ex) + log(ERROR, traceback.format_exc()) + log(WARN, "Stopping Simulation Engine.") + + # Manually trigger stopping event + f_stop.set() + + # Raise exception + raise RuntimeError("Simulation Engine crashed.") from ex - # Produced task terminated, now cancel worker tasks - for w_t in worker_tasks: - _ = w_t.cancel() + finally: + # Produced task terminated, now cancel worker tasks + for w_t in worker_tasks: + _ = w_t.cancel() - while not all(w_t.done() for w_t in worker_tasks): - log(DEBUG, "Terminating async workers...") - await asyncio.sleep(0.5) + while not all(w_t.done() for w_t in worker_tasks): + log(DEBUG, "Terminating async workers...") + await asyncio.sleep(0.5) - await asyncio.gather(*worker_tasks) + await asyncio.gather(*[w_t for w_t in worker_tasks if not w_t.done()]) - # Terminate backend - await backend.terminate() + # Terminate backend + await backend.terminate() # pylint: disable=too-many-arguments,unused-argument,too-many-locals diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py index 3dc7b57aa35..5bcff233759 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -26,7 +26,6 @@ from unittest import IsolatedAsyncioTestCase from uuid import UUID -from flwr.client.client_app import LoadClientAppError from flwr.common import GetPropertiesIns, Message, Metadata from flwr.common.constant import MESSAGE_TYPE_GET_PROPERTIES from flwr.common.recordset_compat import getpropertiesins_to_recordset @@ -138,9 +137,6 @@ def start_and_shutdown( existing_nodes_mapping=nodes_mapping, ) - # Trigger stop event - f_stop.set() - termination_th.join() @@ -172,11 +168,12 @@ def test_erroneous_clientapp_module_name(self) -> None: run_id=run_id, num_messages=num_messages, ) - with self.assertRaises(LoadClientAppError): + with self.assertRaises(RuntimeError): start_and_shutdown( 
clientapp_module="totally_fictitious_app:client", state_factory=state_factory, nodes_mapping=nodes_mapping, + duration=10, ) def test_erroneous_backend_config(self) -> None: @@ -222,7 +219,7 @@ def test_start_and_shutdown_with_tasks_in_state(self) -> None: producer/consumer logic must function. This also severs to evaluate a valid ClientApp. """ - num_messages = 113 + num_messages = 229 num_nodes = 59 # Register a state and a run_id in it From bd7b1aa26a48ed0e171be0f227fb38f538f21016 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 23:08:57 +0000 Subject: [PATCH 070/103] completed tests. --- .../superlink/fleet/vce/vce_api_test.py | 117 +++++++++++------- 1 file changed, 75 insertions(+), 42 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py index 5bcff233759..ea2de2e636b 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -22,7 +22,7 @@ from math import pi from pathlib import Path from time import sleep -from typing import Dict, Optional, Set +from typing import Dict, Optional, Set, Tuple from unittest import IsolatedAsyncioTestCase from uuid import UUID @@ -44,12 +44,36 @@ def terminate_simulation(f_stop: asyncio.Event, sleep_duration: int) -> None: f_stop.set() +def init_state_factory_nodes_mapping( + num_nodes: int, + num_messages: int, + erroneous_message: Optional[bool] = False, +) -> Tuple[StateFactory, NodeToPartitionMapping, Dict[UUID, float]]: + """Instatiate StateFactory, register nodes and pre-insert messages in the state.""" + # Register a state and a run_id in it + run_id = 1234 + state_factory = StateFactory(":flwr-in-memory-state:") + + # Register a few nodes + nodes_mapping = _register_nodes(num_nodes=num_nodes, state_factory=state_factory) + + expected_results = register_messages_into_state( + state_factory=state_factory, + nodes_mapping=nodes_mapping, + run_id=run_id, 
+ num_messages=num_messages, + erroneous_message=erroneous_message, + ) + return state_factory, nodes_mapping, expected_results + + # pylint: disable=too-many-locals def register_messages_into_state( state_factory: StateFactory, nodes_mapping: NodeToPartitionMapping, run_id: int, num_messages: int, + erroneous_message: Optional[bool] = False, ) -> Dict[UUID, float]: """Register `num_messages` into the state factory.""" state: InMemoryState = state_factory.state() # type: ignore @@ -75,7 +99,11 @@ def register_messages_into_state( dst_node_id=dst_node_id, # indicate destination node reply_to_message="", ttl="", - message_type=MESSAGE_TYPE_GET_PROPERTIES, + message_type=( + "a bad message" + if erroneous_message + else MESSAGE_TYPE_GET_PROPERTIES + ), ), ) # Convert Message to TaskIns @@ -109,18 +137,24 @@ def start_and_shutdown( num_supernodes: Optional[int] = None, state_factory: Optional[StateFactory] = None, nodes_mapping: Optional[NodeToPartitionMapping] = None, - duration: int = 10, + duration: int = 0, backend_config: str = "{}", ) -> None: - """Start Simulation Engine and terminate after specified number of seconds.""" + """Start Simulation Engine and terminate after specified number of seconds. + + Some tests need to be terminated by triggering externally an asyncio.Event. This + is enabled whtn passing `duration`>0. + """ f_stop = asyncio.Event() - # Setup thread that will set the f_stop event, triggering the termination of all - # asyncio logic in the Simulation Engine. It will also terminate the Backend. - termination_th = threading.Thread( - target=terminate_simulation, args=(f_stop, duration) - ) - termination_th.start() + if duration: + + # Setup thread that will set the f_stop event, triggering the termination of all + # asyncio logic in the Simulation Engine. It will also terminate the Backend. 
+ termination_th = threading.Thread( + target=terminate_simulation, args=(f_stop, duration) + ) + termination_th.start() # Resolve working directory if not passed if not working_dir: @@ -137,7 +171,8 @@ def start_and_shutdown( existing_nodes_mapping=nodes_mapping, ) - termination_th.join() + if duration: + termination_th.join() class AsyncTestFleetSimulationEngineRayBackend(IsolatedAsyncioTestCase): @@ -146,34 +181,40 @@ class AsyncTestFleetSimulationEngineRayBackend(IsolatedAsyncioTestCase): def test_erroneous_no_supernodes_client_mapping(self) -> None: """Test with unset arguments.""" with self.assertRaises(ValueError): - start_and_shutdown() + start_and_shutdown(duration=2) def test_erroneous_clientapp_module_name(self) -> None: """Tests attempt to load a ClientApp that can't be found.""" num_messages = 7 num_nodes = 59 - # Register a state and a run_id in it - run_id = 1234 - state_factory = StateFactory(":flwr-in-memory-state:") - - # Register a few nodes - nodes_mapping = _register_nodes( - num_nodes=num_nodes, state_factory=state_factory + state_factory, nodes_mapping, _ = init_state_factory_nodes_mapping( + num_nodes=num_nodes, num_messages=num_messages ) + with self.assertRaises(RuntimeError): + start_and_shutdown( + clientapp_module="totally_fictitious_app:client", + state_factory=state_factory, + nodes_mapping=nodes_mapping, + ) + + def test_erroneous_messages(self) -> None: + """Test handling of error in async worker (consumer). + + We register messages which will trigger an error when handling, triggering an + error. 
+ """ + num_messages = 100 + num_nodes = 59 - _ = register_messages_into_state( - state_factory=state_factory, - nodes_mapping=nodes_mapping, - run_id=run_id, - num_messages=num_messages, + state_factory, nodes_mapping, _ = init_state_factory_nodes_mapping( + num_nodes=num_nodes, num_messages=num_messages, erroneous_message=True ) + with self.assertRaises(RuntimeError): start_and_shutdown( - clientapp_module="totally_fictitious_app:client", state_factory=state_factory, nodes_mapping=nodes_mapping, - duration=10, ) def test_erroneous_backend_config(self) -> None: @@ -208,7 +249,7 @@ def test_erroneous_arguments_existing_mapping_but_no_state_factory(self) -> None def test_start_and_shutdown(self) -> None: """Start Simulation Engine Fleet and terminate it.""" - start_and_shutdown(num_supernodes=50) + start_and_shutdown(num_supernodes=50, duration=10) # pylint: disable=too-many-locals def test_start_and_shutdown_with_tasks_in_state(self) -> None: @@ -222,24 +263,16 @@ def test_start_and_shutdown_with_tasks_in_state(self) -> None: num_messages = 229 num_nodes = 59 - # Register a state and a run_id in it - run_id = 1234 - state_factory = StateFactory(":flwr-in-memory-state:") - - # Register a few nodes - nodes_mapping = _register_nodes( - num_nodes=num_nodes, state_factory=state_factory - ) - - expected_results = register_messages_into_state( - state_factory=state_factory, - nodes_mapping=nodes_mapping, - run_id=run_id, - num_messages=num_messages, + state_factory, nodes_mapping, expected_results = ( + init_state_factory_nodes_mapping( + num_nodes=num_nodes, num_messages=num_messages + ) ) # Run - start_and_shutdown(state_factory=state_factory, nodes_mapping=nodes_mapping) + start_and_shutdown( + state_factory=state_factory, nodes_mapping=nodes_mapping, duration=10 + ) # Get all TaskRes state = state_factory.state() From 0dce992c5982f0cde4760ac1feb6cbf2cd400e59 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 23:30:43 +0000 Subject: [PATCH 071/103] update 
import --- src/py/flwr/simulation/run_simulation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 6006fa57c82..b61951c0b3f 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -24,7 +24,7 @@ from flwr.common.exit_handlers import register_exit_handlers from flwr.server.driver.driver import Driver from flwr.server.run_serverapp import run - +from flwr.server.superlink.fleet import vce from flwr.server.superlink.state import StateFactory @@ -32,7 +32,6 @@ def run_simulation() -> None: """.""" # TODO: below create circular imports from flwr.server.app import _run_driver_api_grpc - from flwr.server.superlink.fleet.vce import start_vce args = _parse_args_run_simulation().parse_args() @@ -50,7 +49,7 @@ def run_simulation() -> None: # Superlink with Simulation Engine f_stop = asyncio.Event() superlink_th = threading.Thread( - target=start_vce, + target=vce.start_vce, kwargs={ "num_supernodes": args.num_supernodes, "client_app_module_name": args.client_app, From 19366315aefa5b9c168d9b73f95cd1f877793c18 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 28 Feb 2024 00:05:12 +0000 Subject: [PATCH 072/103] wip --- src/py/flwr/simulation/run_simulation.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index b61951c0b3f..03baae738c2 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -39,9 +39,8 @@ def run_simulation() -> None: state_factory = StateFactory(":flwr-in-memory-state:") # Start Driver API - driver_address = "0.0.0.0:9098" driver_server: grpc.Server = _run_driver_api_grpc( - address=driver_address, + address=args.driver_api_address, state_factory=state_factory, certificates=None, ) @@ -67,7 +66,7 @@ def run_simulation() -> None: # 
Initialize Driver driver = Driver( - driver_service_address=driver_address, + driver_service_address=args.driver_api_address, root_certificates=None, ) @@ -102,6 +101,12 @@ def _parse_args_run_simulation() -> argparse.ArgumentParser: required=True, help="For example: `server:app` or `project.package.module:wrapper.app`", ) + parser.add_argument( + "--driver-api-address", + default="0.0.0.0:9091", + type=str, + help="For example: `server:app` or `project.package.module:wrapper.app`", + ) parser.add_argument( "--num-supernodes", type=int, From 6e3271b30f843aee79a7305238e75d6eebdd3c7a Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 28 Feb 2024 08:34:39 +0000 Subject: [PATCH 073/103] minior formatting --- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 761712875cc..7583506e221 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -138,7 +138,7 @@ async def add_taskins_to_queue( log( DEBUG, "Simulation Engine stats: " - "(Active workers: (%i/%i) | %s (%i workers) | Tasks in queue: %i)", + "Active workers: (%i/%i) | %s (%i workers) | Tasks in queue: %i)", num_active, num_initial_consumers, backend.__class__.__name__, From 67777c5d93076f9fabcae5e0a25b839cb251b429 Mon Sep 17 00:00:00 2001 From: Javier Date: Wed, 28 Feb 2024 12:54:12 +0000 Subject: [PATCH 074/103] Apply suggestions from code review Co-authored-by: Daniel J. 
Beutel --- src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py | 2 +- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 06a6fc72975..4a729f22436 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -141,7 +141,7 @@ async def process_message( Return output message and updated context. """ - node_id = message.metadata.partition_id + partition_id = message.metadata.partition_id try: # Submite a task to the pool diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 7583506e221..1aad6aa95f9 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -68,7 +68,7 @@ async def worker( # Convert TaskIns to Message message = message_from_taskins(task_ins) - # Replace node ID with data partition ID + # Set partition_id message.metadata.partition_id = nodes_mapping[node_id] # Let backend process message @@ -239,7 +239,7 @@ def start_vce( if existing_nodes_mapping: if state_factory is None: raise ValueError( - "You passed `existing_nodes_mapping` but no `state_factory` was passed." + "`existing_nodes_mapping` was passed, but no `state_factory` was passed." ) log(INFO, "Using exiting NodeToPartitionMapping and StateFactory.") # Use mapping constructed externally. 
This also means nodes From 46eac84ade3063b12c8add3335d178da480ed362 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 28 Feb 2024 12:56:37 +0000 Subject: [PATCH 075/103] fixes post review --- .../superlink/fleet/vce/backend/raybackend.py | 2 +- .../flwr/server/superlink/fleet/vce/vce_api.py | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 4a729f22436..8ef0d54622a 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -147,7 +147,7 @@ async def process_message( # Submite a task to the pool future = await self.pool.submit( lambda a, a_fn, mssg, cid, state: a.run.remote(a_fn, mssg, cid, state), - (app, message, str(node_id), context), + (app, message, str(partition_id), context), ) await future diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 1aad6aa95f9..5cc62911dd5 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -48,7 +48,7 @@ def _register_nodes( # pylint: disable=too-many-arguments,too-many-locals async def worker( - app: Callable[[], ClientApp], + app_fn: Callable[[], ClientApp], queue: "asyncio.Queue[TaskIns]", node_states: Dict[int, NodeState], state_factory: StateFactory, @@ -73,7 +73,7 @@ async def worker( # Let backend process message out_mssg, updated_context = await backend.process_message( - app, message, context + app_fn, message, context ) # Update Context @@ -150,7 +150,7 @@ async def add_taskins_to_queue( async def run( - app: Callable[[], ClientApp], + app_fn: Callable[[], ClientApp], backend: Backend, nodes_mapping: NodeToPartitionMapping, state_factory: StateFactory, @@ -168,7 +168,9 @@ async def run( # Add workers (they submit Messages to Backend) 
worker_tasks = [ asyncio.create_task( - worker(app, queue, node_states, state_factory, nodes_mapping, backend) + worker( + app_fn, queue, node_states, state_factory, nodes_mapping, backend + ) ) for _ in range(backend.num_workers) ] @@ -239,7 +241,8 @@ def start_vce( if existing_nodes_mapping: if state_factory is None: raise ValueError( - "`existing_nodes_mapping` was passed, but no `state_factory` was passed." + "`existing_nodes_mapping` was passed, but no `state_factory` was " + "passed." ) log(INFO, "Using exiting NodeToPartitionMapping and StateFactory.") # Use mapping constructed externally. This also means nodes @@ -289,11 +292,11 @@ def _load() -> ClientApp: app: ClientApp = load_client_app(client_app_module_name) return app - app = _load + app_fn = _load asyncio.run( run( - app, + app_fn, backend, nodes_mapping, state_factory, From cc6a1451fa0ca51c9b39244e14cfa3c2d3fc55c1 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 28 Feb 2024 16:16:37 +0000 Subject: [PATCH 076/103] instantiating backend in asyncio event loop --- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 5cc62911dd5..ad858cbb997 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -151,17 +151,20 @@ async def add_taskins_to_queue( async def run( app_fn: Callable[[], ClientApp], - backend: Backend, + backend_fn: Callable[[], Backend], nodes_mapping: NodeToPartitionMapping, state_factory: StateFactory, node_states: Dict[int, NodeState], f_stop: asyncio.Event, ) -> None: """Run the VCE async.""" - # pylint: disable=fixme queue: "asyncio.Queue[TaskIns]" = asyncio.Queue(128) try: + + # Instantiate backend + backend = backend_fn() + # Build backend await backend.build() @@ -272,7 +275,6 @@ def start_vce( try: backend_type = 
supported_backends[backend_name] - backend = backend_type(backend_config, work_dir=working_dir) except KeyError as ex: log( ERROR, @@ -286,6 +288,10 @@ def start_vce( raise ex + def backend_fn() -> Backend: + """Instantiate a Backend.""" + return backend_type(backend_config, work_dir=working_dir) + log(INFO, "client_app_module_name = %s", client_app_module_name) def _load() -> ClientApp: @@ -297,7 +303,7 @@ def _load() -> ClientApp: asyncio.run( run( app_fn, - backend, + backend_fn, nodes_mapping, state_factory, node_states, From 662579eae1a2bcdc7d2f99dce5ff644084dd4e0f Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 28 Feb 2024 16:23:49 +0000 Subject: [PATCH 077/103] minor --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 03baae738c2..8c343bb23e6 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -61,8 +61,8 @@ def run_simulation() -> None: daemon=False, ) - event(EventType.RUN_SUPERLINK_ENTER) superlink_th.start() + event(EventType.RUN_SUPERLINK_ENTER) # Initialize Driver driver = Driver( From 6dd034b67c77eb769e8822dc7adbc0b7dad7cc0d Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 28 Feb 2024 16:31:45 +0000 Subject: [PATCH 078/103] updated TF notebook --- examples/simulation-tensorflow/sim.ipynb | 53 +++++++++++++++++------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/examples/simulation-tensorflow/sim.ipynb b/examples/simulation-tensorflow/sim.ipynb index 9acfba99237..21639877be2 100644 --- a/examples/simulation-tensorflow/sim.ipynb +++ b/examples/simulation-tensorflow/sim.ipynb @@ -17,8 +17,8 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -q flwr[\"simulation\"] tensorflow\n", - "!pip install -q flwr_datasets[\"vision\"]" + "!pip install -q \"flwr[simulation]\" tensorflow\n", + "!pip install -q 
\"flwr_datasets[vision]\"" ] }, { @@ -34,7 +34,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install matplotlib" + "!pip install -q matplotlib" ] }, { @@ -265,20 +265,33 @@ " evaluate_fn=get_evaluate_fn(centralized_testset), # global evaluation function\n", ")\n", "\n", - "# With a dictionary, you tell Flower's VirtualClientEngine that each\n", - "# client needs exclusive access to these many resources in order to run\n", - "client_resources = {\"num_cpus\": 1, \"num_gpus\": 0.0}\n", - "\n", - "# Start simulation\n", - "history = fl.simulation.start_simulation(\n", + "# ClientApp for Flower-Next\n", + "client_app = fl.client.ClientApp(\n", " client_fn=get_client_fn(mnist_fds),\n", - " num_clients=NUM_CLIENTS,\n", + ")\n", + "\n", + "# ServerApp for Flower-Next\n", + "server_app = fl.server.ServerApp(\n", " config=fl.server.ServerConfig(num_rounds=10),\n", " strategy=strategy,\n", - " client_resources=client_resources,\n", - " actor_kwargs={\n", - " \"on_actor_init_fn\": enable_tf_gpu_growth # Enable GPU growth upon actor init.\n", - " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's lauch the simulation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fl.simulation.run_simulation(\n", + " server_app=server_app, client_app=client_app, num_supernodes=NUM_CLIENTS\n", ")" ] }, @@ -340,6 +353,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" } }, "nbformat": 4, From 2aba8954f569e8263fd1f2f3be745497506a82b2 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 28 Feb 2024 18:03:34 +0000 Subject: [PATCH 079/103] moved `run_driver_api_grpc()` --- 
src/py/flwr/server/app.py | 33 ++---------- .../server/superlink/driver/driver_grpc.py | 54 +++++++++++++++++++ 2 files changed, 57 insertions(+), 30 deletions(-) create mode 100644 src/py/flwr/server/superlink/driver/driver_grpc.py diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index cf6b716bd18..788ebeb8a45 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -36,9 +36,6 @@ ) from flwr.common.exit_handlers import register_exit_handlers from flwr.common.logger import log -from flwr.proto.driver_pb2_grpc import ( # pylint: disable=E0611 - add_DriverServicer_to_server, -) from flwr.proto.fleet_pb2_grpc import ( # pylint: disable=E0611 add_FleetServicer_to_server, ) @@ -48,7 +45,7 @@ from .server import Server, init_defaults, run_fl from .server_config import ServerConfig from .strategy import Strategy -from .superlink.driver.driver_servicer import DriverServicer +from .superlink.driver.driver_grpc import run_driver_api_grpc from .superlink.fleet.grpc_bidi.grpc_server import ( generic_create_grpc_server, start_grpc_server, @@ -204,7 +201,7 @@ def run_driver_api() -> None: state_factory = StateFactory(args.database) # Start server - grpc_server: grpc.Server = _run_driver_api_grpc( + grpc_server: grpc.Server = run_driver_api_grpc( address=address, state_factory=state_factory, certificates=certificates, @@ -313,7 +310,7 @@ def run_superlink() -> None: state_factory = StateFactory(args.database) # Start Driver API - driver_server: grpc.Server = _run_driver_api_grpc( + driver_server: grpc.Server = run_driver_api_grpc( address=address, state_factory=state_factory, certificates=certificates, @@ -414,30 +411,6 @@ def _try_obtain_certificates( return certificates -def _run_driver_api_grpc( - address: str, - state_factory: StateFactory, - certificates: Optional[Tuple[bytes, bytes, bytes]], -) -> grpc.Server: - """Run Driver API (gRPC, request-response).""" - # Create Driver API gRPC server - driver_servicer: grpc.Server = 
DriverServicer( - state_factory=state_factory, - ) - driver_add_servicer_to_server_fn = add_DriverServicer_to_server - driver_grpc_server = generic_create_grpc_server( - servicer_and_add_fn=(driver_servicer, driver_add_servicer_to_server_fn), - server_address=address, - max_message_length=GRPC_MAX_MESSAGE_LENGTH, - certificates=certificates, - ) - - log(INFO, "Flower ECE: Starting Driver API (gRPC-rere) on %s", address) - driver_grpc_server.start() - - return driver_grpc_server - - def _run_fleet_api_grpc_rere( address: str, state_factory: StateFactory, diff --git a/src/py/flwr/server/superlink/driver/driver_grpc.py b/src/py/flwr/server/superlink/driver/driver_grpc.py new file mode 100644 index 00000000000..f74000bc59c --- /dev/null +++ b/src/py/flwr/server/superlink/driver/driver_grpc.py @@ -0,0 +1,54 @@ +# Copyright 2020 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Driver gRPC API.""" + +from logging import INFO +from typing import Optional, Tuple + +import grpc + +from flwr.common import GRPC_MAX_MESSAGE_LENGTH +from flwr.common.logger import log +from flwr.proto.driver_pb2_grpc import ( # pylint: disable=E0611 + add_DriverServicer_to_server, +) +from flwr.server.superlink.state import StateFactory + +from ..fleet.grpc_bidi.grpc_server import generic_create_grpc_server +from .driver_servicer import DriverServicer + + +def run_driver_api_grpc( + address: str, + state_factory: StateFactory, + certificates: Optional[Tuple[bytes, bytes, bytes]], +) -> grpc.Server: + """Run Driver API (gRPC, request-response).""" + # Create Driver API gRPC server + driver_servicer: grpc.Server = DriverServicer( + state_factory=state_factory, + ) + driver_add_servicer_to_server_fn = add_DriverServicer_to_server + driver_grpc_server = generic_create_grpc_server( + servicer_and_add_fn=(driver_servicer, driver_add_servicer_to_server_fn), + server_address=address, + max_message_length=GRPC_MAX_MESSAGE_LENGTH, + certificates=certificates, + ) + + log(INFO, "Flower ECE: Starting Driver API (gRPC-rere) on %s", address) + driver_grpc_server.start() + + return driver_grpc_server From f3d2c639339f2deabaae7666156a5a2687c30915 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 28 Feb 2024 18:10:27 +0000 Subject: [PATCH 080/103] update and format --- src/py/flwr/simulation/run_simulation.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 8c343bb23e6..1e7c13a0ba6 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -24,22 +24,20 @@ from flwr.common.exit_handlers import register_exit_handlers from flwr.server.driver.driver import Driver from flwr.server.run_serverapp import run +from 
flwr.server.superlink.driver.driver_grpc import run_driver_api_grpc from flwr.server.superlink.fleet import vce from flwr.server.superlink.state import StateFactory def run_simulation() -> None: - """.""" - # TODO: below create circular imports - from flwr.server.app import _run_driver_api_grpc - + """Run Simulation Engine.""" args = _parse_args_run_simulation().parse_args() # Initialize StateFactory state_factory = StateFactory(":flwr-in-memory-state:") # Start Driver API - driver_server: grpc.Server = _run_driver_api_grpc( + driver_server: grpc.Server = run_driver_api_grpc( address=args.driver_api_address, state_factory=state_factory, certificates=None, From c2a4dc8f36a47d59aeb9bf1e6f8db2d45c051507 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 29 Feb 2024 00:13:57 +0000 Subject: [PATCH 081/103] better exception handling; updated simp.py examples --- examples/simulation-pytorch/README.md | 21 +++---- examples/simulation-pytorch/sim.py | 4 +- examples/simulation-tensorflow/README.md | 23 +++---- examples/simulation-tensorflow/sim.py | 6 +- src/py/flwr/simulation/run_simulation.py | 77 ++++++++++++++++++------ 5 files changed, 82 insertions(+), 49 deletions(-) diff --git a/examples/simulation-pytorch/README.md b/examples/simulation-pytorch/README.md index 963e77bc568..339cae67320 100644 --- a/examples/simulation-pytorch/README.md +++ b/examples/simulation-pytorch/README.md @@ -75,26 +75,23 @@ python sim.py --num_cpus=2 python sim.py --num_cpus=2 --num_gpus=0.25 ``` -### Run with Flower-Next (`super-link` and `server-app`) +### Run with Flower-Next -Ensure you have activated your environment, then: +Ensure you have activated your environment, then execute the command below. All `ClientApp` instances will run on CPU but the `ServerApp` will run on the GPU if one is available. Note that this is the case because the `Simulation Engine` only exposes certain resources to the `ClientApp` (based on the `client_resources` in `--backend-config`). 
-``` -flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_app - -# on a different terminal -flower-server-app sim:server_app --insecure +```bash +# Run with the default backend-config. +# `--server-app` points to the `server` object in the sim.py file in this example. +# `--client-app` points to the `client` object in the sim.py file in this example. +flower-simulation --client-app=sim:client --server-app=sim:server --num-supernodes=100 ``` -You can change the default resources assigned to each `ClientApp` by means the `--backend-config` argument: +You can change the default resources assigned to each `ClientApp` by means of the `--backend-config` argument: ```bash # Tells the VCE to resever 2x CPUs and 25% of available VRAM for each ClientApp -flower-superlink --insecure --vce --num-supernodes 100 \ - --client-app sim:client_app \ +flower-simulation --client-app=sim:client --server-app=sim:server --num-supernodes=100 \ --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}}' - -# Then you can launch the `flower-server-app` command as shown earlier. ``` Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. 
diff --git a/examples/simulation-pytorch/sim.py b/examples/simulation-pytorch/sim.py index 84a00e3f092..ca9e6f0e836 100644 --- a/examples/simulation-pytorch/sim.py +++ b/examples/simulation-pytorch/sim.py @@ -182,12 +182,12 @@ def evaluate( ) # ClientApp for Flower-Next -client_app = fl.client.ClientApp( +client = fl.client.ClientApp( client_fn=get_client_fn(mnist_fds), ) # ServerApp for Flower-Next -server_app = fl.server.ServerApp( +server = fl.server.ServerApp( config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), strategy=strategy, ) diff --git a/examples/simulation-tensorflow/README.md b/examples/simulation-tensorflow/README.md index f6f0a22fdd7..a49fda9c0b3 100644 --- a/examples/simulation-tensorflow/README.md +++ b/examples/simulation-tensorflow/README.md @@ -76,26 +76,23 @@ python sim.py --num_cpus=2 --num_gpus=0.25 Because TensorFlow by default maps all the available VRAM, we need to [enable GPU memory growth](https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth), see how it is done in the example (`sim.py`) for both the "main" process (where the server/strategy runs) and for the clients (using the `actor_kwargs`) -### Run with Flower-Next (`super-link` and `server-app`) +### Run with Flower-Next -Ensure you have activated your environment, then: +Ensure you have activated your environment, then execute the command below. All `ClientApp` instances will run on CPU but the `ServerApp` will run on the GPU if one is available. Note that this is the case because the `Simulation Engine` only exposes certain resources to the `ClientApp` (based on the `client_resources` in `--backend-config`). For TensorFlow simulations, it is desirable to make use of TF's [memory growth](https://www.tensorflow.org/api_docs/python/tf/config/experimental/set_memory_growth) feature. You can enable that easily with the `--enable-tf-gpu-growth` flag. 
-``` -flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_app - -# on a different terminal -flower-server-app sim:server_app --insecure +```bash +# Run with the default backend-config. +# `--server-app` points to the `server` object in the sim.py file in this example. +# `--client-app` points to the `client` object in the sim.py file in this example. +flower-simulation --client-app=sim:client --server-app=sim:server --num-supernodes=100 --enable-tf-gpu-growth ``` -You can change the default resources assigned to each `ClientApp` using the `--backend-config` argument. Note that we need to flag that the backend is going to use `TensorFlow`. In this way, it will enable GPU memory growth. +You can change the default resources assigned to each `ClientApp` using the `--backend-config` argument. ```bash # Tells the VCE to resever 2x CPUs and 25% of available VRAM for each ClientApp -flower-superlink --insecure --vce --num-supernodes 100 \ - --client-app sim:client_app \ - --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}, "tensorflow": 1}' - -# Then you can launch the `flower-server-app` command as shown earlier. +flower-simulation --client-app=sim:client --server-app=sim:server --num-supernodes=100 \ + --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}}' --enable-tf-gpu-growth ``` Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. 
diff --git a/examples/simulation-tensorflow/sim.py b/examples/simulation-tensorflow/sim.py index dbba71ac2cf..2a19e131fe7 100644 --- a/examples/simulation-tensorflow/sim.py +++ b/examples/simulation-tensorflow/sim.py @@ -1,5 +1,4 @@ import os -import math import argparse from typing import Dict, List, Tuple @@ -147,13 +146,12 @@ def evaluate( # ClientApp for Flower-Next -client_app = fl.client.ClientApp( +client = fl.client.ClientApp( client_fn=get_client_fn(mnist_fds), ) # ServerApp for Flower-Next -# TODO: Unclear how to enable GPU growth for the ServerApp -server_app = fl.server.ServerApp( +server = fl.server.ServerApp( config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), strategy=strategy, ) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 1e7c13a0ba6..ffdaad01a11 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -16,23 +16,43 @@ import argparse import asyncio +import json import threading +import traceback +from logging import ERROR, INFO, WARNING import grpc -from flwr.common import EventType, event +from flwr.common import EventType, event, log from flwr.common.exit_handlers import register_exit_handlers from flwr.server.driver.driver import Driver from flwr.server.run_serverapp import run from flwr.server.superlink.driver.driver_grpc import run_driver_api_grpc from flwr.server.superlink.fleet import vce from flwr.server.superlink.state import StateFactory +from flwr.simulation.ray_transport.utils import enable_tf_gpu_growth def run_simulation() -> None: """Run Simulation Engine.""" args = _parse_args_run_simulation().parse_args() + # Load JSON config + backend_config_dict = json.loads(args.backend_config) + + # Enable GPU memory growth (relevant only for TF) + if args.enable_tf_gpu_growth: + log(INFO, "Enabling GPU growth for Tensorflow on the main thread.") + enable_tf_gpu_growth() + # Check that Backend config has also enabled using GPU growth + 
use_tf = backend_config_dict.get("tensorflow", False) + if not use_tf: + log(WARNING, "Enabling GPU growth for your backend.") + backend_config_dict["tensorflow"] = True + + # Convert back to JSON stream + backend_config = json.dumps(backend_config_dict) + # Initialize StateFactory state_factory = StateFactory(":flwr-in-memory-state:") @@ -51,7 +71,7 @@ def run_simulation() -> None: "num_supernodes": args.num_supernodes, "client_app_module_name": args.client_app, "backend_name": args.backend, - "backend_config_json_stream": args.backend_config, + "backend_config_json_stream": backend_config, "working_dir": args.dir, "state_factory": state_factory, "f_stop": f_stop, @@ -62,26 +82,37 @@ def run_simulation() -> None: superlink_th.start() event(EventType.RUN_SUPERLINK_ENTER) - # Initialize Driver - driver = Driver( - driver_service_address=args.driver_api_address, - root_certificates=None, - ) + try: + # Initialize Driver + driver = Driver( + driver_service_address=args.driver_api_address, + root_certificates=None, + ) - # Launch server app - run(args.server_app, driver, args.dir) + # Launch server app + run(args.server_app, driver, args.dir) - del driver + except Exception as ex: - # Trigger stop event - f_stop.set() + log(ERROR, "An exception occured !! %s", ex) + log(ERROR, traceback.format_exc()) + raise RuntimeError( + "An error was encountered by the Simulation Engine. Ending Simulation." 
+ ) from ex - register_exit_handlers( - grpc_servers=[driver_server], - bckg_threads=[superlink_th], - event_type=EventType.RUN_SUPERLINK_LEAVE, - ) - superlink_th.join() + finally: + + del driver + + # Trigger stop event + f_stop.set() + + register_exit_handlers( + grpc_servers=[driver_server], + bckg_threads=[superlink_th], + event_type=EventType.RUN_SUPERLINK_LEAVE, + ) + superlink_th.join() def _parse_args_run_simulation() -> argparse.ArgumentParser: @@ -117,6 +148,16 @@ def _parse_args_run_simulation() -> argparse.ArgumentParser: type=str, help="Simulation backend that executes the ClientApp.", ) + parser.add_argument( + "--enable-tf-gpu-growth", + action="store_true", + help="Enables GPU growth on the main thread. This is desirable if you make " + "use of a TensorFlow model on your `ServerApp` while having your `ClientApp` " + "running on the same GPU. Without enabling this, you might encounter an " + "out-of-memory error becasue TensorFlow by default allocates all GPU memory." + "Read mor about how `tf.config.experimental.set_memory_growth()` works in " + "the TensorFlow documentation: https://www.tensorflow.org/api/stable.", + ) parser.add_argument( "--backend-config", type=str, From 94c264984c0af850b232ee291e663f60d59a699d Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 29 Feb 2024 09:24:29 +0000 Subject: [PATCH 082/103] updates --- examples/simulation-pytorch/sim.ipynb | 4 --- examples/simulation-tensorflow/sim.ipynb | 5 +++- src/py/flwr/server/run_serverapp.py | 28 +++++++++++------- src/py/flwr/simulation/run_simulation.py | 36 ++++++++++++++++-------- 4 files changed, 46 insertions(+), 27 deletions(-) diff --git a/examples/simulation-pytorch/sim.ipynb b/examples/simulation-pytorch/sim.ipynb index 85fb67f6602..e351228a19e 100644 --- a/examples/simulation-pytorch/sim.ipynb +++ b/examples/simulation-pytorch/sim.ipynb @@ -629,10 +629,6 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" - }, - "language_info": { - "name": 
"python", - "version": "3.8.16" } }, "nbformat": 4, diff --git a/examples/simulation-tensorflow/sim.ipynb b/examples/simulation-tensorflow/sim.ipynb index c506c505855..6c08666b6e4 100644 --- a/examples/simulation-tensorflow/sim.ipynb +++ b/examples/simulation-tensorflow/sim.ipynb @@ -291,7 +291,10 @@ "outputs": [], "source": [ "fl.simulation.run_simulation(\n", - " server_app=server_app, client_app=client_app, num_supernodes=NUM_CLIENTS, enable_tf_gpu_growth=True\n", + " server_app=server_app,\n", + " client_app=client_app,\n", + " num_supernodes=NUM_CLIENTS,\n", + " enable_tf_gpu_growth=True,\n", ")" ] }, diff --git a/src/py/flwr/server/run_serverapp.py b/src/py/flwr/server/run_serverapp.py index 4ff3ede06a2..4de966bfc87 100644 --- a/src/py/flwr/server/run_serverapp.py +++ b/src/py/flwr/server/run_serverapp.py @@ -29,24 +29,32 @@ def run( - server_app_attr: str, driver: Driver, server_app_dir: str, + server_app_attr: Optional[str] = None, loaded_server_app: Optional[ServerApp] = None, ) -> None: """Run ServerApp with a given Driver.""" + if not (server_app_attr is None) ^ (loaded_server_app is None): + raise ValueError( + "Either `server_app_attr` should `loaded_server_app` be set " + "but not both. 
" + ) + if server_app_dir is not None: sys.path.insert(0, server_app_dir) - def _load() -> ServerApp: - server_app: ServerApp = ( - load_server_app(server_app_attr) - if loaded_server_app is None - else loaded_server_app - ) - return server_app + # Load ServerApp if needed + if server_app_attr: + + def _load() -> ServerApp: + server_app: ServerApp = load_server_app(server_app_attr) + return server_app + + server_app = _load() - server_app = _load() + if loaded_server_app: + server_app = loaded_server_app # Initialize Context context = Context(state=RecordSet()) @@ -114,7 +122,7 @@ def run_server_app() -> None: ) # Run the Server App with the Driver - run(server_app_attr, driver, server_app_dir) + run(driver=driver, server_app_dir=server_app_dir, server_app_attr=server_app_attr) # Clean up driver.__del__() # pylint: disable=unnecessary-dunder-call diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index dbd05e9e86f..d1aa6746908 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -20,13 +20,14 @@ import threading import traceback from logging import ERROR, INFO, WARNING -from typing import Optional +from typing import Dict, Optional import grpc from flwr.client import ClientApp from flwr.common import EventType, event, log from flwr.common.exit_handlers import register_exit_handlers +from flwr.common.typing import ConfigsRecordValues from flwr.server.driver.driver import Driver from flwr.server.run_serverapp import run from flwr.server.server_app import ServerApp @@ -42,12 +43,15 @@ def run_simulation_from_cli() -> None: """Run Simulation Engine from the CLI.""" args = _parse_args_run_simulation().parse_args() + # Load JSON config + backend_config_dict = json.loads(args.backend_config) + run_simulation( num_supernodes=args.num_supernodes, client_app_module_name=args.client_app, server_app_module_name=args.server_app, backend_name=args.backend, - 
backend_config=args.backend_config, + backend_config=backend_config_dict, working_dir=args.dir, driver_api_address=args.driver_api_address, enable_tf_gpu_growth=args.enable_tf_gpu_growth, @@ -60,7 +64,7 @@ def run_simulation( client_app: Optional[ClientApp] = None, server_app: Optional[ServerApp] = None, backend_name: str = "ray", - backend_config: str = "{}", + backend_config: Optional[Dict[str, ConfigsRecordValues]] = None, client_app_module_name: Optional[str] = None, server_app_module_name: Optional[str] = None, working_dir: str = "", @@ -86,9 +90,9 @@ def run_simulation( backend_name : str (default: ray) A simulation backend that runs `ClientApp`s. - backend_config : str - 'A JSON formatted stream, e.g \'{"":, "":}\' to - configure a backend. Values supported in are those included by + backend_config : Optional[Dict[str, ConfigsRecordValues]] + 'A dictionary, e.g {"":, "":} to configure a + backend. Values supported in are those included by `flwr.common.typing.ConfigsRecordValues`. client_app_module_name : str @@ -114,18 +118,21 @@ def run_simulation( all GPU memory. Read mor about how `tf.config.experimental.set_memory_growth()` works in the TensorFlow documentation: https://www.tensorflow.org/api/stable. 
""" - # Load JSON config - backend_config_dict = json.loads(backend_config) + if backend_config is None: + backend_config = {} # Enable GPU memory growth (relevant only for TF) if enable_tf_gpu_growth: log(INFO, "Enabling GPU growth for Tensorflow on the main thread.") enable_gpu_growth() # Check that Backend config has also enabled using GPU growth - use_tf = backend_config_dict.get("tensorflow", False) + use_tf = backend_config.get("tensorflow", False) if not use_tf: log(WARNING, "Enabling GPU growth for your backend.") - backend_config_dict["tensorflow"] = True + backend_config["tensorflow"] = True + + # Convert config to original JSON-stream format + backend_config_stream = json.dumps(backend_config) # Initialize StateFactory state_factory = StateFactory(":flwr-in-memory-state:") @@ -146,7 +153,7 @@ def run_simulation( "client_app_module_name": client_app_module_name, "client_app": client_app, "backend_name": backend_name, - "backend_config_json_stream": backend_config, + "backend_config_json_stream": backend_config_stream, "working_dir": working_dir, "state_factory": state_factory, "f_stop": f_stop, @@ -165,7 +172,12 @@ def run_simulation( ) # Launch server app - run(server_app_module_name, driver, working_dir, loaded_server_app=server_app) + run( + driver=driver, + server_app_dir=working_dir, + server_app_attr=server_app_module_name, + loaded_server_app=server_app, + ) except Exception as ex: From b00e8405287fed9e786344e8bec2569b235d31ed Mon Sep 17 00:00:00 2001 From: jafermarq Date: Fri, 1 Mar 2024 16:42:03 +0000 Subject: [PATCH 083/103] better --- src/py/flwr/server/run_serverapp.py | 15 +++- src/py/flwr/simulation/run_simulation.py | 102 ++++++++++++++++------- 2 files changed, 85 insertions(+), 32 deletions(-) diff --git a/src/py/flwr/server/run_serverapp.py b/src/py/flwr/server/run_serverapp.py index 19fd16fb0c1..d4f21cbf20d 100644 --- a/src/py/flwr/server/run_serverapp.py +++ b/src/py/flwr/server/run_serverapp.py @@ -16,9 +16,11 @@ import argparse 
+import asyncio import sys from logging import DEBUG, WARN from pathlib import Path +from typing import Optional from flwr.common import Context, EventType, RecordSet, event from flwr.common.logger import log @@ -27,7 +29,12 @@ from .server_app import ServerApp, load_server_app -def run(server_app_attr: str, driver: Driver, server_app_dir: str) -> None: +def run( + server_app_attr: str, + driver: Driver, + server_app_dir: str, + stop_event: Optional[asyncio.Event] = None, +) -> None: """Run ServerApp with a given Driver.""" if server_app_dir is not None: sys.path.insert(0, server_app_dir) @@ -44,6 +51,12 @@ def _load() -> ServerApp: # Call ServerApp server_app(driver=driver, context=context) + log(DEBUG, "ServerApp finished running.") + # Upon completion, trigger stop event if one was passed + if stop_event is not None: + log(DEBUG, "Triggering stop event.") + stop_event.set() + def run_server_app() -> None: """Run Flower server app.""" diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index ebe76944e77..5459506335e 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -20,11 +20,12 @@ import threading import traceback from logging import ERROR, INFO, WARNING +from time import sleep +from typing import Any, Callable import grpc from flwr.common import EventType, event, log -from flwr.common.exit_handlers import register_exit_handlers from flwr.server.driver.driver import Driver from flwr.server.run_serverapp import run from flwr.server.superlink.driver.driver_grpc import run_driver_api_grpc @@ -33,6 +34,49 @@ from flwr.simulation.ray_transport.utils import enable_tf_gpu_growth +def run_serverapp_th( + server_app_attr: str, + driver: Driver, + server_app_dir: str, + f_stop: asyncio.Event, + delay_launch: int = 3, +) -> threading.Thread: + """Run SeverApp in a thread.""" + serverapp_th = threading.Thread( + target=run, + kwargs={ + "server_app_attr": server_app_attr, + 
"driver": driver, + "server_app_dir": server_app_dir, + "stop_event": f_stop, # will be set when `run()` finishes + # will trigger the shutdown of the Simulation Engine + }, + ) + sleep(delay_launch) + serverapp_th.start() + return serverapp_th + + +def get_thread_exception_hook(stop_event: asyncio.Event) -> Callable[[Any], None]: + """Return a callback for when the serverapp thread raises an exception.""" + + def execepthook(args: Any) -> None: + """Upon exception raised, log exception and trigger stop event.""" + # log + log( + ERROR, + "The ServerApp thread triggered exception (%s): %s", + args.exc_type, + args.exc_value, + ) + log(ERROR, traceback.format_exc()) + # Set stop event + stop_event.set() + log(WARNING, "Triggered stop event for Simulation Engine.") + + return execepthook + + def run_simulation() -> None: """Run Simulation Engine.""" args = _parse_args_run_simulation().parse_args() @@ -63,56 +107,52 @@ def run_simulation() -> None: certificates=None, ) - # SuperLink with Simulation Engine f_stop = asyncio.Event() - superlink_th = threading.Thread( - target=vce.start_vce, - kwargs={ - "num_supernodes": args.num_supernodes, - "client_app_module_name": args.client_app, - "backend_name": args.backend, - "backend_config_json_stream": backend_config, - "working_dir": args.dir, - "state_factory": state_factory, - "f_stop": f_stop, - }, - daemon=False, - ) - - superlink_th.start() - event(EventType.RUN_SUPERLINK_ENTER) - + serverapp_th = None try: + # Initialize Driver driver = Driver( driver_service_address=args.driver_api_address, root_certificates=None, ) - # Launch server app - run(args.server_app, driver, args.dir) + # Get and run ServerApp thread + serverapp_th = run_serverapp_th(args.server_app, driver, args.dir, f_stop) + # Setup an exception hook + threading.excepthook = get_thread_exception_hook(f_stop) + + # SuperLink with Simulation Engine + event(EventType.RUN_SUPERLINK_ENTER) + vce.start_vce( + num_supernodes=args.num_supernodes, + 
client_app_module_name=args.client_app, + backend_name=args.backend, + backend_config_json_stream=backend_config, + working_dir=args.dir, + state_factory=state_factory, + f_stop=f_stop, + ) except Exception as ex: log(ERROR, "An exception occurred: %s", ex) log(ERROR, traceback.format_exc()) - raise RuntimeError( - "An error was encountered by the Simulation Engine. Ending simulation." - ) from ex + raise RuntimeError("An error was encountered. Ending simulation.") from ex finally: + # Stop Driver + driver_server.stop(grace=0) del driver - # Trigger stop event f_stop.set() - register_exit_handlers( - grpc_servers=[driver_server], - bckg_threads=[superlink_th], - event_type=EventType.RUN_SUPERLINK_LEAVE, - ) - superlink_th.join() + event(EventType.RUN_SUPERLINK_LEAVE) + if serverapp_th: + serverapp_th.join() + + log(INFO, "Stopping Simulation Engine now.") def _parse_args_run_simulation() -> argparse.ArgumentParser: From 6f9bd9e6af2c5bf4968ae6a9438e93c805f3ea71 Mon Sep 17 00:00:00 2001 From: "Daniel J. Beutel" Date: Fri, 1 Mar 2024 17:48:29 +0100 Subject: [PATCH 084/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 5459506335e..11d68beefc7 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -58,7 +58,7 @@ def run_serverapp_th( def get_thread_exception_hook(stop_event: asyncio.Event) -> Callable[[Any], None]: - """Return a callback for when the serverapp thread raises an exception.""" + """Return a callback for when the ServerApp thread raises an exception.""" def execepthook(args: Any) -> None: """Upon exception raised, log exception and trigger stop event.""" From 32d8b331ddbcbbca53c9afe179082812aa820583 Mon Sep 17 00:00:00 2001 From: "Daniel J. 
Beutel" Date: Fri, 1 Mar 2024 17:49:31 +0100 Subject: [PATCH 085/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 11d68beefc7..71cd45fdd0a 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -110,7 +110,6 @@ def run_simulation() -> None: f_stop = asyncio.Event() serverapp_th = None try: - # Initialize Driver driver = Driver( driver_service_address=args.driver_api_address, From 57a84e585fff92fd4570ce8b7a5a9cc64090e180 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Fri, 1 Mar 2024 17:36:00 +0000 Subject: [PATCH 086/103] pyling, mypy fixes --- src/py/flwr/server/run_serverapp.py | 14 ++++++-------- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 13 +++++++------ src/py/flwr/simulation/run_simulation.py | 5 +++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/py/flwr/server/run_serverapp.py b/src/py/flwr/server/run_serverapp.py index fb9eebb3c4c..a1608fd15ec 100644 --- a/src/py/flwr/server/run_serverapp.py +++ b/src/py/flwr/server/run_serverapp.py @@ -47,16 +47,14 @@ def run( sys.path.insert(0, server_app_dir) # Load ServerApp if needed - if server_app_attr: - - def _load() -> ServerApp: + def _load() -> ServerApp: + if server_app_attr: server_app: ServerApp = load_server_app(server_app_attr) - return server_app - - server_app = _load() + if loaded_server_app: + server_app = loaded_server_app + return server_app - if loaded_server_app: - server_app = loaded_server_app + server_app = _load() # Initialize Context context = Context(state=RecordSet()) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 4226e0109be..c03b57ddbb5 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py 
@@ -219,12 +219,12 @@ async def run( # pylint: disable=too-many-arguments,unused-argument,too-many-locals def start_vce( - client_app_module_name: str, backend_name: str, backend_config_json_stream: str, working_dir: str, f_stop: asyncio.Event, client_app: Optional[ClientApp] = None, + client_app_module_name: Optional[str] = None, num_supernodes: Optional[int] = None, state_factory: Optional[StateFactory] = None, existing_nodes_mapping: Optional[NodeToPartitionMapping] = None, @@ -301,12 +301,13 @@ def backend_fn() -> Backend: log(INFO, "client_app_module_name = %s", client_app_module_name) + # Load ClientApp if needed def _load() -> ClientApp: - app: ClientApp = ( - load_client_app(client_app_module_name) - if client_app is None - else client_app - ) + + if client_app_module_name: + app: ClientApp = load_client_app(client_app_module_name) + if client_app: + app = client_app return app app_fn = _load diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 9a357726bdd..b911784c1f6 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -58,9 +58,10 @@ def run_simulation_from_cli() -> None: ) +# pylint: disable=too-many-arguments def run_serverapp_th( - server_app_attr: str, - server_app: ServerApp, + server_app_attr: Optional[str], + server_app: Optional[ServerApp], driver: Driver, server_app_dir: str, f_stop: asyncio.Event, From 5ef65ee657a7af15e4bec73f3a9d340af946263c Mon Sep 17 00:00:00 2001 From: jafermarq Date: Fri, 1 Mar 2024 21:34:06 +0000 Subject: [PATCH 087/103] handling asyncio event loop running by default in colab/jupyter --- src/py/flwr/simulation/run_simulation.py | 170 +++++++++++++++-------- 1 file changed, 112 insertions(+), 58 deletions(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index b911784c1f6..e87e4b48881 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ 
b/src/py/flwr/simulation/run_simulation.py @@ -19,7 +19,7 @@ import json import threading import traceback -from logging import ERROR, INFO, WARNING +from logging import DEBUG, ERROR, INFO, WARNING from time import sleep from typing import Any, Callable, Dict, Optional @@ -40,7 +40,7 @@ def run_simulation_from_cli() -> None: - """Run Simulation Engine from the CLI.""" + """Start Simulation Engine from the CLI.""" args = _parse_args_run_simulation().parse_args() # Load JSON config @@ -104,6 +104,87 @@ def execepthook(args: Any) -> None: return execepthook +def _main_loop( + num_supernodes: int, + backend_name: str, + backend_config_stream: str, + driver_api_address: str, + working_dir: str, + client_app: Optional[ClientApp] = None, + client_app_module_name: Optional[str] = None, + server_app: Optional[ServerApp] = None, + server_app_module_name: Optional[str] = None, +) -> None: + """Launch SuperLink with Simulation Engine, then ServerApp on a separate thread. + + Everything runs on the main thread or a separate one, depening on whether the main + thread already contains a running Asyncio event loop. This is the case if running + the Simulation Engine on a Jupyter/Colab notebook. 
+ """ + # Initialize StateFactory + state_factory = StateFactory(":flwr-in-memory-state:") + + # Start Driver API + driver_server: grpc.Server = run_driver_api_grpc( + address=driver_api_address, + state_factory=state_factory, + certificates=None, + ) + + f_stop = asyncio.Event() + serverapp_th = None + try: + # Initialize Driver + driver = Driver( + driver_service_address=driver_api_address, + root_certificates=None, + ) + + # Get and run ServerApp thread + serverapp_th = run_serverapp_th( + server_app_attr=server_app_module_name, + server_app=server_app, + driver=driver, + server_app_dir=working_dir, + f_stop=f_stop, + ) + # Setup an exception hook + threading.excepthook = get_thread_exception_hook(f_stop) + + # SuperLink with Simulation Engine + event(EventType.RUN_SUPERLINK_ENTER) + vce.start_vce( + num_supernodes=num_supernodes, + client_app_module_name=client_app_module_name, + client_app=client_app, + backend_name=backend_name, + backend_config_json_stream=backend_config_stream, + working_dir=working_dir, + state_factory=state_factory, + f_stop=f_stop, + ) + + except Exception as ex: + + log(ERROR, "An exception occured !! %s", ex) + log(ERROR, traceback.format_exc()) + raise RuntimeError("An error was encountered. 
Ending simulation.") from ex + + finally: + + # Stop Driver + driver_server.stop(grace=0) + del driver + # Trigger stop event + f_stop.set() + + event(EventType.RUN_SUPERLINK_LEAVE) + if serverapp_th: + serverapp_th.join() + + log(INFO, "Stopping Simulation Engine now.") + + # pylint: disable=too-many-arguments,too-many-locals def run_simulation( num_supernodes: int, @@ -180,68 +261,41 @@ def run_simulation( # Convert config to original JSON-stream format backend_config_stream = json.dumps(backend_config) - # Initialize StateFactory - state_factory = StateFactory(":flwr-in-memory-state:") - - # Start Driver API - driver_server: grpc.Server = run_driver_api_grpc( - address=driver_api_address, - state_factory=state_factory, - certificates=None, + simulation_engine_th = None + args = ( + num_supernodes, + backend_name, + backend_config_stream, + driver_api_address, + working_dir, + client_app, + client_app_module_name, + server_app, + server_app_module_name, ) - - f_stop = asyncio.Event() - serverapp_th = None + # Detect if there is an Asyncio event loop already running. + # If yes, run everything on a separate thread. In environmnets + # like Jupyter/Colab notebooks, there is an event loop present. 
+ run_in_thread = False try: - # Initialize Driver - driver = Driver( - driver_service_address=driver_api_address, - root_certificates=None, - ) - - # Get and run ServerApp thread - serverapp_th = run_serverapp_th( - server_app_attr=server_app_module_name, - server_app=server_app, - driver=driver, - server_app_dir=working_dir, - f_stop=f_stop, - ) - # Setup an exception hook - threading.excepthook = get_thread_exception_hook(f_stop) - - # SuperLink with Simulation Engine - event(EventType.RUN_SUPERLINK_ENTER) - vce.start_vce( - num_supernodes=num_supernodes, - client_app_module_name=client_app_module_name, - client_app=client_app, - backend_name=backend_name, - backend_config_json_stream=backend_config_stream, - working_dir=working_dir, - state_factory=state_factory, - f_stop=f_stop, - ) + _ = ( + asyncio.get_running_loop() + ) # Raises RuntimeError if no event loop is present + log(DEBUG, "Asyncio event loop already running.") - except Exception as ex: + run_in_thread = True - log(ERROR, "An exception occured !! %s", ex) - log(ERROR, traceback.format_exc()) - raise RuntimeError("An error was encountered. 
Ending simulation.") from ex + except RuntimeError: + log(DEBUG, "No asyncio event loop runnig") finally: - - # Stop Driver - driver_server.stop(grace=0) - del driver - # Trigger stop event - f_stop.set() - - event(EventType.RUN_SUPERLINK_LEAVE) - if serverapp_th: - serverapp_th.join() - - log(INFO, "Stopping Simulation Engine now.") + if run_in_thread: + log(DEBUG, "Starting Simulation Engine on a new thread.") + simulation_engine_th = threading.Thread(target=_main_loop, args=args) + simulation_engine_th.start() + else: + log(DEBUG, "Starting Simulation Engine on the main thread.") + _main_loop(*args) def _parse_args_run_simulation() -> argparse.ArgumentParser: From 5f16beecbfc14ab10214d4410fb9e8efddd34e3c Mon Sep 17 00:00:00 2001 From: jafermarq Date: Fri, 1 Mar 2024 21:57:19 +0000 Subject: [PATCH 088/103] join thread, else bad things happen --- src/py/flwr/simulation/run_simulation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index e87e4b48881..54e8c861029 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -293,6 +293,7 @@ def run_simulation( log(DEBUG, "Starting Simulation Engine on a new thread.") simulation_engine_th = threading.Thread(target=_main_loop, args=args) simulation_engine_th.start() + simulation_engine_th.join() else: log(DEBUG, "Starting Simulation Engine on the main thread.") _main_loop(*args) From 232a82b4f2cf9444cd9af8a2e8e8bacaf3767352 Mon Sep 17 00:00:00 2001 From: "Daniel J. 
Beutel" Date: Sat, 2 Mar 2024 17:37:14 +0100 Subject: [PATCH 089/103] Update src/py/flwr/server/run_serverapp.py --- src/py/flwr/server/run_serverapp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/server/run_serverapp.py b/src/py/flwr/server/run_serverapp.py index a1608fd15ec..c57a4a30c8a 100644 --- a/src/py/flwr/server/run_serverapp.py +++ b/src/py/flwr/server/run_serverapp.py @@ -39,7 +39,7 @@ def run( """Run ServerApp with a given Driver.""" if not (server_app_attr is None) ^ (loaded_server_app is None): raise ValueError( - "Either `server_app_attr` should `loaded_server_app` be set " + "Either `server_app_attr` or `loaded_server_app` should be set " "but not both. " ) From 7a569284196a7d0c69c9a8e6a026dc092437f076 Mon Sep 17 00:00:00 2001 From: "Daniel J. Beutel" Date: Sat, 2 Mar 2024 17:44:08 +0100 Subject: [PATCH 090/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 54e8c861029..a0b5be6f643 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -40,7 +40,7 @@ def run_simulation_from_cli() -> None: - """Start Simulation Engine from the CLI.""" + """Run Simulation Engine from the CLI.""" args = _parse_args_run_simulation().parse_args() # Load JSON config From 2859e3e27b54f9818bb1574439ce46a574586927 Mon Sep 17 00:00:00 2001 From: "Daniel J. 
Beutel" Date: Sat, 2 Mar 2024 17:53:49 +0100 Subject: [PATCH 091/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index a0b5be6f643..a6f91957837 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -166,7 +166,7 @@ def _main_loop( except Exception as ex: - log(ERROR, "An exception occured !! %s", ex) + log(ERROR, "An exception occurred !! %s", ex) log(ERROR, traceback.format_exc()) raise RuntimeError("An error was encountered. Ending simulation.") from ex From 8e0f95cfc8efd21a17402491476034c2bd96a03f Mon Sep 17 00:00:00 2001 From: jafermarq Date: Sat, 2 Mar 2024 23:11:17 +0000 Subject: [PATCH 092/103] exposing relevant args to entry point through python env / notebook; unifying names --- src/py/flwr/server/app.py | 6 +- .../server/superlink/fleet/vce/vce_api.py | 12 +-- .../superlink/fleet/vce/vce_api_test.py | 8 +- src/py/flwr/simulation/run_simulation.py | 81 +++++++++++++++---- 4 files changed, 80 insertions(+), 27 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index 788ebeb8a45..01b1f622212 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -362,7 +362,7 @@ def run_superlink() -> None: f_stop = asyncio.Event() # Does nothing _run_fleet_api_vce( num_supernodes=args.num_supernodes, - client_app_module_name=args.client_app, + client_app_attr=args.client_app, backend_name=args.backend, backend_config_json_stream=args.backend_config, working_dir=args.dir, @@ -438,7 +438,7 @@ def _run_fleet_api_grpc_rere( # pylint: disable=too-many-arguments def _run_fleet_api_vce( num_supernodes: int, - client_app_module_name: str, + client_app_attr: str, backend_name: str, backend_config_json_stream: str, working_dir: str, @@ -449,7 +449,7 @@ def _run_fleet_api_vce( start_vce( 
num_supernodes=num_supernodes, - client_app_module_name=client_app_module_name, + client_app_attr=client_app_attr, backend_name=backend_name, backend_config_json_stream=backend_config_json_stream, state_factory=state_factory, diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index c03b57ddbb5..d42379960a6 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -224,15 +224,15 @@ def start_vce( working_dir: str, f_stop: asyncio.Event, client_app: Optional[ClientApp] = None, - client_app_module_name: Optional[str] = None, + client_app_attr: Optional[str] = None, num_supernodes: Optional[int] = None, state_factory: Optional[StateFactory] = None, existing_nodes_mapping: Optional[NodeToPartitionMapping] = None, ) -> None: """Start Fleet API with the Simulation Engine.""" - if client_app_module_name is not None and client_app is not None: + if client_app_attr is not None and client_app is not None: raise ValueError( - "Both `client_app_module_name` and `client_app` are provided, " + "Both `client_app_attr` and `client_app` are provided, " "but only one is allowed." 
) @@ -299,13 +299,13 @@ def backend_fn() -> Backend: """Instantiate a Backend.""" return backend_type(backend_config, work_dir=working_dir) - log(INFO, "client_app_module_name = %s", client_app_module_name) + log(INFO, "client_app_attr = %s", client_app_attr) # Load ClientApp if needed def _load() -> ClientApp: - if client_app_module_name: - app: ClientApp = load_client_app(client_app_module_name) + if client_app_attr: + app: ClientApp = load_client_app(client_app_attr) if client_app: app = client_app return app diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py index ea2de2e636b..16cb45c1262 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -132,7 +132,7 @@ def _autoresolve_working_dir(rel_client_app_dir: str = "backend") -> str: # pylint: disable=too-many-arguments def start_and_shutdown( backend: str = "ray", - clientapp_module: str = "raybackend_test:client_app", + client_app_attr: str = "raybackend_test:client_app", working_dir: str = "", num_supernodes: Optional[int] = None, state_factory: Optional[StateFactory] = None, @@ -162,7 +162,7 @@ def start_and_shutdown( start_vce( num_supernodes=num_supernodes, - client_app_module_name=clientapp_module, + client_app_attr=client_app_attr, backend_name=backend, backend_config_json_stream=backend_config, state_factory=state_factory, @@ -183,7 +183,7 @@ def test_erroneous_no_supernodes_client_mapping(self) -> None: with self.assertRaises(ValueError): start_and_shutdown(duration=2) - def test_erroneous_clientapp_module_name(self) -> None: + def test_erroneous_client_app_attr(self) -> None: """Tests attempt to load a ClientApp that can't be found.""" num_messages = 7 num_nodes = 59 @@ -193,7 +193,7 @@ def test_erroneous_clientapp_module_name(self) -> None: ) with self.assertRaises(RuntimeError): start_and_shutdown( - clientapp_module="totally_fictitious_app:client", + 
client_app_attr="totally_fictitious_app:client", state_factory=state_factory, nodes_mapping=nodes_mapping, ) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index a6f91957837..9eb8012161c 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -39,6 +39,7 @@ ) +# Entry point from CLI def run_simulation_from_cli() -> None: """Run Simulation Engine from the CLI.""" args = _parse_args_run_simulation().parse_args() @@ -46,10 +47,10 @@ def run_simulation_from_cli() -> None: # Load JSON config backend_config_dict = json.loads(args.backend_config) - run_simulation( + _run_simulation( num_supernodes=args.num_supernodes, - client_app_module_name=args.client_app, - server_app_module_name=args.server_app, + client_app_attr=args.client_app, + server_app_attr=args.server_app, backend_name=args.backend, backend_config=backend_config_dict, working_dir=args.dir, @@ -58,6 +59,58 @@ def run_simulation_from_cli() -> None: ) +# Entry point from Python session (script or notebook) +# pylint: disable=too-many-arguments +def run_simulation( + num_supernodes: int, + client_app: ClientApp, + server_app: ServerApp, + backend_name: str = "ray", + backend_config: Optional[Dict[str, ConfigsRecordValues]] = None, + enable_tf_gpu_growth: bool = False, +) -> None: + r"""Launch the Simulation Engine. + + Parameters + ---------- + num_supernodes : int + Number of nodes that run a ClientApp. They can be sampled by a + Driver in the ServerApp and receive a Message describing what the ClientApp + should perform. + + client_app : ClientApp + The `ClientApp` to be executed by each of the `SuperNodes`. It will receive + messages sent by the `ServerApp`. + + server_app : ServerApp + The `ServerApp` to be executed. + + backend_name : str (default: ray) + A simulation backend that runs `ClientApp`s. 
+ + backend_config : Optional[Dict[str, ConfigsRecordValues]] + 'A dictionary, e.g {"":, "":} to configure a + backend. Values supported in are those included by + `flwr.common.typing.ConfigsRecordValues`. + + enable_tf_gpu_growth : bool (default: False) + A boolean to indicate whether to enable GPU growth on the main thread. This is + desirable if you make use of a TensorFlow model on your `ServerApp` while + having your `ClientApp` running on the same GPU. Without enabling this, you + might encounter an out-of-memory error becasue TensorFlow by default allocates + all GPU memory. Read mor about how `tf.config.experimental.set_memory_growth()` + works in the TensorFlow documentation: https://www.tensorflow.org/api/stable. + """ + _run_simulation( + num_supernodes=num_supernodes, + client_app=client_app, + server_app=server_app, + backend_name=backend_name, + backend_config=backend_config, + enable_tf_gpu_growth=enable_tf_gpu_growth, + ) + + # pylint: disable=too-many-arguments def run_serverapp_th( server_app_attr: Optional[str], @@ -111,9 +164,9 @@ def _main_loop( driver_api_address: str, working_dir: str, client_app: Optional[ClientApp] = None, - client_app_module_name: Optional[str] = None, + client_app_attr: Optional[str] = None, server_app: Optional[ServerApp] = None, - server_app_module_name: Optional[str] = None, + server_app_attr: Optional[str] = None, ) -> None: """Launch SuperLink with Simulation Engine, then ServerApp on a separate thread. 
@@ -142,7 +195,7 @@ def _main_loop( # Get and run ServerApp thread serverapp_th = run_serverapp_th( - server_app_attr=server_app_module_name, + server_app_attr=server_app_attr, server_app=server_app, driver=driver, server_app_dir=working_dir, @@ -155,7 +208,7 @@ def _main_loop( event(EventType.RUN_SUPERLINK_ENTER) vce.start_vce( num_supernodes=num_supernodes, - client_app_module_name=client_app_module_name, + client_app_attr=client_app_attr, client_app=client_app, backend_name=backend_name, backend_config_json_stream=backend_config_stream, @@ -186,14 +239,14 @@ def _main_loop( # pylint: disable=too-many-arguments,too-many-locals -def run_simulation( +def _run_simulation( num_supernodes: int, client_app: Optional[ClientApp] = None, server_app: Optional[ServerApp] = None, backend_name: str = "ray", backend_config: Optional[Dict[str, ConfigsRecordValues]] = None, - client_app_module_name: Optional[str] = None, - server_app_module_name: Optional[str] = None, + client_app_attr: Optional[str] = None, + server_app_attr: Optional[str] = None, working_dir: str = "", driver_api_address: str = "0.0.0.0:9091", enable_tf_gpu_growth: bool = False, @@ -222,11 +275,11 @@ def run_simulation( backend. Values supported in are those included by `flwr.common.typing.ConfigsRecordValues`. - client_app_module_name : str + client_app_attr : str A path to a `ClientApp` module to be loaded: For example: `client:app` or `project.package.module:wrapper.app`." - server_app_module_name : str + server_app_attr : str A path to a `ServerApp` module to be loaded: For example: `server:app` or `project.package.module:wrapper.app`." @@ -269,9 +322,9 @@ def run_simulation( driver_api_address, working_dir, client_app, - client_app_module_name, + client_app_attr, server_app, - server_app_module_name, + server_app_attr, ) # Detect if there is an Asyncio event loop already running. # If yes, run everything on a separate thread. 
In environmnets From 86f3761bc0625b2a49c5ad063e73ccf99b41fd9d Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 4 Mar 2024 20:24:58 +0000 Subject: [PATCH 093/103] moved input args; fix enable GPU growth in ServerApp thread; other minor --- src/py/flwr/server/run_serverapp.py | 6 --- src/py/flwr/simulation/run_simulation.py | 49 +++++++++++++++++------- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/src/py/flwr/server/run_serverapp.py b/src/py/flwr/server/run_serverapp.py index c57a4a30c8a..4431397a28e 100644 --- a/src/py/flwr/server/run_serverapp.py +++ b/src/py/flwr/server/run_serverapp.py @@ -16,7 +16,6 @@ import argparse -import asyncio import sys from logging import DEBUG, WARN from pathlib import Path @@ -34,7 +33,6 @@ def run( server_app_dir: str, server_app_attr: Optional[str] = None, loaded_server_app: Optional[ServerApp] = None, - stop_event: Optional[asyncio.Event] = None, ) -> None: """Run ServerApp with a given Driver.""" if not (server_app_attr is None) ^ (loaded_server_app is None): @@ -63,10 +61,6 @@ def _load() -> ServerApp: server_app(driver=driver, context=context) log(DEBUG, "ServerApp finished running.") - # Upon completion, trigger stop event if one was passed - if stop_event is not None: - log(DEBUG, "Triggering stop event.") - stop_event.set() def run_server_app() -> None: diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 44f50706af1..8f9bea29100 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -62,9 +62,9 @@ def run_simulation_from_cli() -> None: # Entry point from Python session (script or notebook) # pylint: disable=too-many-arguments def run_simulation( - num_supernodes: int, - client_app: ClientApp, server_app: ServerApp, + client_app: ClientApp, + num_supernodes: int, backend_name: str = "ray", backend_config: Optional[Dict[str, ConfigsRecordValues]] = None, enable_tf_gpu_growth: bool = False, @@ -73,17 +73,17 
@@ def run_simulation( Parameters ---------- - num_supernodes : int - Number of nodes that run a ClientApp. They can be sampled by a - Driver in the ServerApp and receive a Message describing what the ClientApp - should perform. + server_app : ServerApp + The `ServerApp` to be executed. client_app : ClientApp The `ClientApp` to be executed by each of the `SuperNodes`. It will receive messages sent by the `ServerApp`. - server_app : ServerApp - The `ServerApp` to be executed. + num_supernodes : int + Number of nodes that run a ClientApp. They can be sampled by a + Driver in the ServerApp and receive a Message describing what the ClientApp + should perform. backend_name : str (default: ray) A simulation backend that runs `ClientApp`s. @@ -118,18 +118,38 @@ def run_serverapp_th( driver: Driver, server_app_dir: str, f_stop: asyncio.Event, + enable_tf_gpu_growth: bool, delay_launch: int = 3, ) -> threading.Thread: """Run SeverApp in a thread.""" + + def server_th_with_start_checks( # type: ignore + tf_gpu_growth: bool, stop_event: asyncio.Event, **kwargs + ) -> None: + """Run SeverApp, after check if GPU memory grouwth has to be set.""" + try: + if tf_gpu_growth: + log(INFO, "Enabling GPU growth for Tensorflow on the main thread.") + enable_gpu_growth() + run(**kwargs) + except Exception as ex: # pylint: disable=broad-exception-caught + log(ERROR, "ServerApp thread raised an exception: %s", ex) + log(ERROR, traceback.format_exc()) + finally: + log(DEBUG, "ServerApp finished running.") + # Upon completion, trigger stop event if one was passed + if stop_event is not None: + log(DEBUG, "Triggering stop event.") + stop_event.set() + serverapp_th = threading.Thread( - target=run, + target=server_th_with_start_checks, + args=(enable_tf_gpu_growth, f_stop), kwargs={ "server_app_attr": server_app_attr, "loaded_server_app": server_app, "driver": driver, "server_app_dir": server_app_dir, - "stop_event": f_stop, # will be set when `run()` finishes - # will trigger the shutdown of 
the Simulation Engine }, ) sleep(delay_launch) @@ -157,12 +177,14 @@ def execepthook(args: Any) -> None: return execepthook +# pylint: disable=too-many-locals def _main_loop( num_supernodes: int, backend_name: str, backend_config_stream: str, driver_api_address: str, working_dir: str, + enable_tf_gpu_growth: bool, client_app: Optional[ClientApp] = None, client_app_attr: Optional[str] = None, server_app: Optional[ServerApp] = None, @@ -200,6 +222,7 @@ def _main_loop( driver=driver, server_app_dir=working_dir, f_stop=f_stop, + enable_tf_gpu_growth=enable_tf_gpu_growth, ) # Setup an exception hook threading.excepthook = get_thread_exception_hook(f_stop) @@ -301,10 +324,7 @@ def _run_simulation( if backend_config is None: backend_config = {} - # Enable GPU memory growth (relevant only for TF) if enable_tf_gpu_growth: - log(INFO, "Enabling GPU growth for Tensorflow on the main thread.") - enable_gpu_growth() # Check that Backend config has also enabled using GPU growth use_tf = backend_config.get("tensorflow", False) if not use_tf: @@ -321,6 +341,7 @@ def _run_simulation( backend_config_stream, driver_api_address, working_dir, + enable_tf_gpu_growth, client_app, client_app_attr, server_app, From f3ed0c9f8efe24b268d1c8de6102742837df8415 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 4 Mar 2024 21:07:41 +0000 Subject: [PATCH 094/103] simplifications; option `--verbose` --- src/py/flwr/simulation/run_simulation.py | 64 +++++++++++++----------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 8f9bea29100..2ac8d55f7c8 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -17,11 +17,12 @@ import argparse import asyncio import json +import logging import threading import traceback from logging import DEBUG, ERROR, INFO, WARNING from time import sleep -from typing import Any, Callable, Dict, Optional +from typing import 
Dict, Optional import grpc @@ -48,14 +49,15 @@ def run_simulation_from_cli() -> None: backend_config_dict = json.loads(args.backend_config) _run_simulation( - num_supernodes=args.num_supernodes, - client_app_attr=args.client_app, server_app_attr=args.server_app, + client_app_attr=args.client_app, + num_supernodes=args.num_supernodes, backend_name=args.backend, backend_config=backend_config_dict, working_dir=args.dir, driver_api_address=args.driver_api_address, enable_tf_gpu_growth=args.enable_tf_gpu_growth, + verbose_logging=args.verbose, ) @@ -68,6 +70,7 @@ def run_simulation( backend_name: str = "ray", backend_config: Optional[Dict[str, ConfigsRecordValues]] = None, enable_tf_gpu_growth: bool = False, + verbose_logging: bool = False, ) -> None: r"""Launch the Simulation Engine. @@ -100,6 +103,10 @@ def run_simulation( might encounter an out-of-memory error becasue TensorFlow by default allocates all GPU memory. Read mor about how `tf.config.experimental.set_memory_growth()` works in the TensorFlow documentation: https://www.tensorflow.org/api/stable. + + verbose_logging : bool (default: False) + When diabled, only INFO, WARNING and ERROR log messages will be shown. If + enabled, DEBUG-level logs will be displayed. """ _run_simulation( num_supernodes=num_supernodes, @@ -108,6 +115,7 @@ def run_simulation( backend_name=backend_name, backend_config=backend_config, enable_tf_gpu_growth=enable_tf_gpu_growth, + verbose_logging=verbose_logging, ) @@ -126,11 +134,16 @@ def run_serverapp_th( def server_th_with_start_checks( # type: ignore tf_gpu_growth: bool, stop_event: asyncio.Event, **kwargs ) -> None: - """Run SeverApp, after check if GPU memory grouwth has to be set.""" + """Run SeverApp, after check if GPU memory grouwth has to be set. + + Upon exception, trigger stop event for Simulation Engine. 
+ """ try: if tf_gpu_growth: log(INFO, "Enabling GPU growth for Tensorflow on the main thread.") enable_gpu_growth() + + # Run ServerApp run(**kwargs) except Exception as ex: # pylint: disable=broad-exception-caught log(ERROR, "ServerApp thread raised an exception: %s", ex) @@ -139,8 +152,8 @@ def server_th_with_start_checks( # type: ignore log(DEBUG, "ServerApp finished running.") # Upon completion, trigger stop event if one was passed if stop_event is not None: - log(DEBUG, "Triggering stop event.") stop_event.set() + log(WARNING, "Triggered stop event for Simulation Engine.") serverapp_th = threading.Thread( target=server_th_with_start_checks, @@ -157,26 +170,6 @@ def server_th_with_start_checks( # type: ignore return serverapp_th -def get_thread_exception_hook(stop_event: asyncio.Event) -> Callable[[Any], None]: - """Return a callback for when the ServerApp thread raises an exception.""" - - def execepthook(args: Any) -> None: - """Upon exception raised, log exception and trigger stop event.""" - # log - log( - ERROR, - "The ServerApp thread triggered exception (%s): %s", - args.exc_type, - args.exc_value, - ) - log(ERROR, traceback.format_exc()) - # Set stop event - stop_event.set() - log(WARNING, "Triggered stop event for Simulation Engine.") - - return execepthook - - # pylint: disable=too-many-locals def _main_loop( num_supernodes: int, @@ -224,8 +217,6 @@ def _main_loop( f_stop=f_stop, enable_tf_gpu_growth=enable_tf_gpu_growth, ) - # Setup an exception hook - threading.excepthook = get_thread_exception_hook(f_stop) # SuperLink with Simulation Engine event(EventType.RUN_SUPERLINK_ENTER) @@ -241,13 +232,11 @@ def _main_loop( ) except Exception as ex: - log(ERROR, "An exception occurred !! %s", ex) log(ERROR, traceback.format_exc()) raise RuntimeError("An error was encountered. 
Ending simulation.") from ex finally: - # Stop Driver driver_server.stop(grace=0) del driver @@ -273,6 +262,7 @@ def _run_simulation( working_dir: str = "", driver_api_address: str = "0.0.0.0:9091", enable_tf_gpu_growth: bool = False, + verbose_logging: bool = False, ) -> None: r"""Launch the Simulation Engine. @@ -320,7 +310,16 @@ def _run_simulation( might encounter an out-of-memory error becasue TensorFlow by default allocates all GPU memory. Read mor about how `tf.config.experimental.set_memory_growth()` works in the TensorFlow documentation: https://www.tensorflow.org/api/stable. + + verbose_logging : bool (default: False) + When diabled, only INFO, WARNING and ERROR log messages will be shown. If + enabled, DEBUG-level logs will be displayed. """ + # Set logging level + if not verbose_logging: + logger = logging.getLogger("flwr") + logger.setLevel(INFO) + if backend_config is None: backend_config = {} @@ -432,4 +431,11 @@ def _parse_args_run_simulation() -> argparse.ArgumentParser: " Default: current working directory.", ) + parser.add_argument( + "--verbose", + action="store_true", + help="When unset, only INFO, WARNING and ERROR log messages will be shown. " + "If set, DEBUG-level logs will be displayed. 
", + ) + return parser From 43bd2e18b6c28dbac0607d90c117bda7952f683c Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 5 Mar 2024 09:48:55 +0000 Subject: [PATCH 095/103] discarded changes to notebooks --- examples/simulation-pytorch/sim.ipynb | 55 +++++++++++------------- examples/simulation-tensorflow/sim.ipynb | 44 ++++++------------- 2 files changed, 38 insertions(+), 61 deletions(-) diff --git a/examples/simulation-pytorch/sim.ipynb b/examples/simulation-pytorch/sim.ipynb index 762911fdc5c..e27721a7fa5 100644 --- a/examples/simulation-pytorch/sim.ipynb +++ b/examples/simulation-pytorch/sim.ipynb @@ -20,8 +20,9 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -q \"flwr[simulation]\"\n", - "!pip install -q \"flwr_datasets[vision]\"" + "# depending on your shell, you might need to add `\\` before `[` and `]`.\n", + "!pip install -q flwr[simulation]\n", + "!pip install flwr_datasets[vision]" ] }, { @@ -62,7 +63,7 @@ }, "outputs": [], "source": [ - "!pip install -q matplotlib" + "!pip install matplotlib" ] }, { @@ -510,7 +511,10 @@ " # Create and return client\n", " return FlowerClient(trainloader, valloader).to_client()\n", "\n", - " return client_fn" + " return client_fn\n", + "\n", + "\n", + "client_fn_callback = get_client_fn(mnist_fds)" ] }, { @@ -532,33 +536,22 @@ }, "outputs": [], "source": [ - "# ClientApp for Flower-Next\n", - "client_app = fl.client.ClientApp(\n", - " client_fn=get_client_fn(mnist_fds),\n", - ")\n", - "\n", - "# ServerApp for Flower-Next\n", - "server_app = fl.server.ServerApp(\n", - " config=fl.server.ServerConfig(num_rounds=10),\n", - " strategy=strategy,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, launch the simulation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fl.simulation.run_simulation(\n", - " server_app=server_app, client_app=client_app, num_supernodes=NUM_CLIENTS\n", + "# With a dictionary, you 
tell Flower's VirtualClientEngine that each\n", + "# client needs exclusive access to these many resources in order to run\n", + "client_resources = {\"num_cpus\": 1, \"num_gpus\": 0.0}\n", + "\n", + "# Let's disable tqdm progress bar in the main thread (used by the server)\n", + "disable_progress_bar()\n", + "\n", + "history = fl.simulation.start_simulation(\n", + " client_fn=client_fn_callback, # a callback to construct a client\n", + " num_clients=NUM_CLIENTS, # total number of clients in the experiment\n", + " config=fl.server.ServerConfig(num_rounds=10), # let's run for 10 rounds\n", + " strategy=strategy, # the strategy that will orchestrate the whole FL pipeline\n", + " client_resources=client_resources,\n", + " actor_kwargs={\n", + " \"on_actor_init_fn\": disable_progress_bar # disable tqdm on each actor/process spawning virtual clients\n", + " },\n", ")" ] }, diff --git a/examples/simulation-tensorflow/sim.ipynb b/examples/simulation-tensorflow/sim.ipynb index 6c08666b6e4..9acfba99237 100644 --- a/examples/simulation-tensorflow/sim.ipynb +++ b/examples/simulation-tensorflow/sim.ipynb @@ -17,8 +17,8 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -q \"flwr[simulation]\" tensorflow\n", - "!pip install -q \"flwr_datasets[vision]\"" + "!pip install -q flwr[\"simulation\"] tensorflow\n", + "!pip install -q flwr_datasets[\"vision\"]" ] }, { @@ -34,7 +34,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -q matplotlib" + "!pip install matplotlib" ] }, { @@ -265,36 +265,20 @@ " evaluate_fn=get_evaluate_fn(centralized_testset), # global evaluation function\n", ")\n", "\n", - "# ClientApp for Flower-Next\n", - "client_app = fl.client.ClientApp(\n", - " client_fn=get_client_fn(mnist_fds),\n", - ")\n", + "# With a dictionary, you tell Flower's VirtualClientEngine that each\n", + "# client needs exclusive access to these many resources in order to run\n", + "client_resources = {\"num_cpus\": 1, \"num_gpus\": 0.0}\n", "\n", - "# ServerApp for 
Flower-Next\n", - "server_app = fl.server.ServerApp(\n", + "# Start simulation\n", + "history = fl.simulation.start_simulation(\n", + " client_fn=get_client_fn(mnist_fds),\n", + " num_clients=NUM_CLIENTS,\n", " config=fl.server.ServerConfig(num_rounds=10),\n", " strategy=strategy,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's lauch the simulation:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fl.simulation.run_simulation(\n", - " server_app=server_app,\n", - " client_app=client_app,\n", - " num_supernodes=NUM_CLIENTS,\n", - " enable_tf_gpu_growth=True,\n", + " client_resources=client_resources,\n", + " actor_kwargs={\n", + " \"on_actor_init_fn\": enable_tf_gpu_growth # Enable GPU growth upon actor init.\n", + " },\n", ")" ] }, From 91683fe5c13ec3eab92bbbaf44be8bee190c3cae Mon Sep 17 00:00:00 2001 From: "Daniel J. Beutel" Date: Tue, 5 Mar 2024 10:53:17 +0100 Subject: [PATCH 096/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 2ac8d55f7c8..b214cdb2da7 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -72,7 +72,7 @@ def run_simulation( enable_tf_gpu_growth: bool = False, verbose_logging: bool = False, ) -> None: - r"""Launch the Simulation Engine. + r"""Run a Flower App using the Simulation Engine. Parameters ---------- From 8531ab6e6cb524de8f5b7236ff9f22c9993d87b5 Mon Sep 17 00:00:00 2001 From: "Daniel J. 
Beutel" Date: Tue, 5 Mar 2024 10:54:28 +0100 Subject: [PATCH 097/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index b214cdb2da7..7a5a8dc0c5e 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -77,7 +77,7 @@ def run_simulation( Parameters ---------- server_app : ServerApp - The `ServerApp` to be executed. + The `ServerApp` to be executed. It will send messages to different `ClientApp` instances. client_app : ClientApp The `ClientApp` to be executed by each of the `SuperNodes`. It will receive From 0de86ad155b1fb608452558724c53a2970eac0fc Mon Sep 17 00:00:00 2001 From: "Daniel J. Beutel" Date: Tue, 5 Mar 2024 10:55:22 +0100 Subject: [PATCH 098/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 7a5a8dc0c5e..94950e68f92 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -77,7 +77,8 @@ def run_simulation( Parameters ---------- server_app : ServerApp - The `ServerApp` to be executed. It will send messages to different `ClientApp` instances. + The `ServerApp` to be executed. It will send messages to different `ClientApp` + instances. client_app : ClientApp The `ClientApp` to be executed by each of the `SuperNodes`. It will receive From a7e875a5f557fdd320fb3a5306c042939534cb69 Mon Sep 17 00:00:00 2001 From: "Daniel J. 
Beutel" Date: Tue, 5 Mar 2024 11:00:08 +0100 Subject: [PATCH 099/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 94950e68f92..3cd40643210 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -78,7 +78,7 @@ def run_simulation( ---------- server_app : ServerApp The `ServerApp` to be executed. It will send messages to different `ClientApp` - instances. + instances running on different (virtual) SuperNodes. client_app : ClientApp The `ClientApp` to be executed by each of the `SuperNodes`. It will receive From 6a18a1e22624b9499a73caf63b9a8bcbc6630872 Mon Sep 17 00:00:00 2001 From: "Daniel J. Beutel" Date: Tue, 5 Mar 2024 11:01:00 +0100 Subject: [PATCH 100/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 3cd40643210..85a87c4bede 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -81,7 +81,7 @@ def run_simulation( instances running on different (virtual) SuperNodes. client_app : ClientApp - The `ClientApp` to be executed by each of the `SuperNodes`. It will receive + The `ClientApp` to be executed by each of the SuperNodes. It will receive messages sent by the `ServerApp`. num_supernodes : int From bceabb06d1950ec3c5dbbca27c75ea121161ad8f Mon Sep 17 00:00:00 2001 From: "Daniel J. 
Beutel" Date: Tue, 5 Mar 2024 11:02:01 +0100 Subject: [PATCH 101/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 85a87c4bede..a48fb833edd 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -93,7 +93,7 @@ def run_simulation( A simulation backend that runs `ClientApp`s. backend_config : Optional[Dict[str, ConfigsRecordValues]] - 'A dictionary, e.g {"":, "":} to configure a + 'A dictionary, e.g {"": , "": } to configure a backend. Values supported in are those included by `flwr.common.typing.ConfigsRecordValues`. From 037eda43b5898add84b4131836c289f93e8c2fe5 Mon Sep 17 00:00:00 2001 From: "Daniel J. Beutel" Date: Tue, 5 Mar 2024 11:03:47 +0100 Subject: [PATCH 102/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index a48fb833edd..2cc3b21af4c 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -101,7 +101,7 @@ def run_simulation( A boolean to indicate whether to enable GPU growth on the main thread. This is desirable if you make use of a TensorFlow model on your `ServerApp` while having your `ClientApp` running on the same GPU. Without enabling this, you - might encounter an out-of-memory error becasue TensorFlow by default allocates + might encounter an out-of-memory error because TensorFlow, by default, allocates all GPU memory. Read mor about how `tf.config.experimental.set_memory_growth()` works in the TensorFlow documentation: https://www.tensorflow.org/api/stable. From af9760975a79bac5a69a9ca0f69f87358cf4f06a Mon Sep 17 00:00:00 2001 From: "Daniel J. 
Beutel" Date: Tue, 5 Mar 2024 11:04:13 +0100 Subject: [PATCH 103/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 2cc3b21af4c..cb68221ea58 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -102,7 +102,7 @@ def run_simulation( desirable if you make use of a TensorFlow model on your `ServerApp` while having your `ClientApp` running on the same GPU. Without enabling this, you might encounter an out-of-memory error because TensorFlow, by default, allocates - all GPU memory. Read mor about how `tf.config.experimental.set_memory_growth()` + all GPU memory. Read more about how `tf.config.experimental.set_memory_growth()` works in the TensorFlow documentation: https://www.tensorflow.org/api/stable. verbose_logging : bool (default: False)