From 4c83a5e9def9f948b8d3645e3775450d771729ef Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 21 Feb 2024 17:57:24 +0000 Subject: [PATCH 001/103] init --- src/py/flwr/common/constant.py | 2 + src/py/flwr/server/app.py | 77 ++++++++++++++++++- .../server/superlink/fleet/vce/__init__.py | 15 ++++ .../server/superlink/fleet/vce/vce_api.py | 68 ++++++++++++++++ 4 files changed, 160 insertions(+), 2 deletions(-) create mode 100644 src/py/flwr/server/superlink/fleet/vce/__init__.py create mode 100644 src/py/flwr/server/superlink/fleet/vce/vce_api.py diff --git a/src/py/flwr/common/constant.py b/src/py/flwr/common/constant.py index 811fff73f06..2946a594e68 100644 --- a/src/py/flwr/common/constant.py +++ b/src/py/flwr/common/constant.py @@ -28,10 +28,12 @@ TRANSPORT_TYPE_GRPC_BIDI = "grpc-bidi" TRANSPORT_TYPE_GRPC_RERE = "grpc-rere" TRANSPORT_TYPE_REST = "rest" +TRANSPORT_TYPE_VCE = "vce" TRANSPORT_TYPES = [ TRANSPORT_TYPE_GRPC_BIDI, TRANSPORT_TYPE_GRPC_RERE, TRANSPORT_TYPE_REST, + TRANSPORT_TYPE_VCE, ] MESSAGE_TYPE_GET_PROPERTIES = "get_properties" diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index dbbf63b0fe5..75fa372d084 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -14,9 +14,9 @@ # ============================================================================== """Flower server app.""" - import argparse import importlib.util +import json import sys import threading from logging import ERROR, INFO, WARN @@ -24,7 +24,7 @@ from pathlib import Path from signal import SIGINT, SIGTERM, signal from types import FrameType -from typing import List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union import grpc @@ -34,6 +34,7 @@ MISSING_EXTRA_REST, TRANSPORT_TYPE_GRPC_RERE, TRANSPORT_TYPE_REST, + TRANSPORT_TYPE_VCE, ) from flwr.common.logger import log from flwr.proto.driver_pb2_grpc import ( # pylint: disable=E0611 @@ -315,6 +316,15 @@ def run_fleet_api() -> None: certificates=certificates, ) 
grpc_servers.append(fleet_server) + elif args.fleet_api_type == TRANSPORT_TYPE_VCE: + _run_fleet_api_vce( + num_supernodes=args.num_supernodes, + client_app_str=args.client_app, + backend=args.backend, + backend_config=args.backend_config, + working_dir=args.dir, + state_factory=state_factory, + ) else: raise ValueError(f"Unknown fleet_api_type: {args.fleet_api_type}") @@ -537,6 +547,29 @@ def _run_fleet_api_grpc_rere( return fleet_grpc_server +# pylint: disable=import-outside-toplevel,too-many-arguments +def _run_fleet_api_vce( + num_supernodes: int, + client_app_str: str, + backend: str, + backend_config: Dict[str, Union[str, int, float]], + working_dir: str, + state_factory: StateFactory, +) -> None: + from flwr.server.superlink.fleet.vce.vce_api import start_vce + + log(INFO, "Flower VCE: Starting Fleet API (VirtualClientEngine)") + + start_vce( + num_supernodes=num_supernodes, + client_app_str=client_app_str, + backend_str=backend, + backend_config=backend_config, + state_factory=state_factory, + working_dir=working_dir, + ) + + # pylint: disable=import-outside-toplevel,too-many-arguments def _run_fleet_api_rest( host: str, @@ -714,6 +747,14 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: help="Start a Fleet API server (REST, experimental)", ) + ex_group.add_argument( + "--vce", + action="store_const", + dest="fleet_api_type", + const=TRANSPORT_TYPE_VCE, + help="Start a Fleet API server (VirtualClientEngine)", + ) + # Fleet API gRPC-rere options grpc_rere_group = parser.add_argument_group( "Fleet API (gRPC-rere) server options", "" @@ -749,3 +790,35 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: type=int, default=1, ) + + # Fleet API VCE options + vce_group = parser.add_argument_group("Fleet API (VCE) server options", "") + vce_group.add_argument( + "--client-app", + help="For example: `client:app` or `project.package.module:wrapper.app`.", + ) + vce_group.add_argument( + "--num-supernodes", + type=int, + help="Number 
of SuperNodes connected to the SuperLink.", + ) + vce_group.add_argument( + "--backend", + default="ray", + type=str, + help="Simulation Backend that process a ClientApp.", + ) + vce_group.add_argument( + "--backend-config", + type=json.loads, + default='{"num_cpus":2, "num_gpus":0.0}', + help='A dict in the form \'{"":, "":}\' to ' + "configure a backend. Pay close attention to how the quotes and double quotes " + "are set.", + ) + parser.add_argument( + "--dir", + default="", + help="Add specified directory to the PYTHONPATH." + " Default: current working directory.", + ) diff --git a/src/py/flwr/server/superlink/fleet/vce/__init__.py b/src/py/flwr/server/superlink/fleet/vce/__init__.py new file mode 100644 index 00000000000..563f77595e1 --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Fleet VirtualClientEngine side.""" diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py new file mode 100644 index 00000000000..a6160125f1e --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -0,0 +1,68 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Fleet VirtualClientEngine API.""" + + +from logging import INFO +from typing import Dict, Union + +from flwr.client.clientapp import ClientApp, load_client_app +from flwr.client.node_state import NodeState +from flwr.common.logger import log +from flwr.server.superlink.state import StateFactory + +NodeToPartitionMapping = Dict[int, int] + + +def _register_nodes( + num_nodes: int, state_factory: StateFactory +) -> NodeToPartitionMapping: + """Registre nodes with the StateFactory and create node-id:partition-id mapping.""" + nodes_mapping: NodeToPartitionMapping = {} + state = state_factory.state() + for i in range(num_nodes): + node_id = state.create_node() + nodes_mapping[node_id] = i + log(INFO, "Registered %i nodes", len(nodes_mapping)) + return nodes_mapping + + +# pylint: disable=too-many-arguments,unused-argument +def start_vce( + num_supernodes: int, + client_app_str: str, + backend_str: str, + backend_config: Dict[str, Union[str, int, float]], + state_factory: StateFactory, + working_dir: str, +) -> None: + """Start Fleet API with the VirtualClientEngine (VCE).""" + # Register SuperNodes + nodes_mapping = _register_nodes( + num_nodes=num_supernodes, state_factory=state_factory + ) + + # Construct mapping of NodeStates + node_states: Dict[int, NodeState] = {} + for node_id in nodes_mapping: + node_states[node_id] = NodeState() + + 
log(INFO, "client_app_str = %s", client_app_str) + + def _load() -> ClientApp: + app: ClientApp = load_client_app(client_app_str) + return app + + # start backend From a85db409e037e7bdd243394fa108d315e55c0b22 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 21 Feb 2024 18:16:47 +0000 Subject: [PATCH 002/103] base backend --- .../superlink/fleet/vce/backend/__init__.py | 21 ++++++++ .../superlink/fleet/vce/backend/backend.py | 53 +++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 src/py/flwr/server/superlink/fleet/vce/backend/__init__.py create mode 100644 src/py/flwr/server/superlink/fleet/vce/backend/backend.py diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py new file mode 100644 index 00000000000..3ff90c288a5 --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""VirtualClientEngine Backends.""" + +from .backend import Backend + +__all__ = [ + "Backend", +] diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py new file mode 100644 index 00000000000..ed6f7857d93 --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -0,0 +1,53 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Generic Backend class for Fleet API using the VCE.""" + + +from abc import ABC, abstractmethod +from typing import Callable, Tuple + +from flwr.client.clientapp import ClientApp +from flwr.common.context import Context +from flwr.common.message import Message + + +class Backend(ABC): + """Abstract base class for a Backend.""" + + async def build(self) -> None: + """Build backend asynchronously. + + Different components need to be inplace before workers in a backend are ready to + accept jobs. When this method finish executed, the backend should be fully ready + to run jobs. + """ + + @property + def num_workers(self) -> int: + """Return number of workers in the backend. + + This is the number of TaskIns that can be run concurrently. 
+ """ + return 0 + + @abstractmethod + async def process_message( + self, + app: Callable[[], ClientApp], + message: Message, + context: Context, + node_id: int, + ) -> Tuple[Message, Context]: + """Submit a job to the backend.""" From b77031219b3b5a74b9fcbace60ee9b67950f6971 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 21 Feb 2024 18:17:10 +0000 Subject: [PATCH 003/103] update --- src/py/flwr/server/superlink/fleet/vce/backend/backend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index ed6f7857d93..1c83e604c65 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -48,6 +48,5 @@ async def process_message( app: Callable[[], ClientApp], message: Message, context: Context, - node_id: int, ) -> Tuple[Message, Context]: """Submit a job to the backend.""" From b9c64554ce24eb5114d8949191f9e7eda54eac88 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 21 Feb 2024 18:25:02 +0000 Subject: [PATCH 004/103] update docstrings --- src/py/flwr/server/superlink/fleet/vce/backend/backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index 1c83e604c65..ff28724b14f 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -30,7 +30,7 @@ async def build(self) -> None: """Build backend asynchronously. Different components need to be inplace before workers in a backend are ready to - accept jobs. When this method finish executed, the backend should be fully ready + accept jobs. When this method finish executing, the backend should be fully ready to run jobs. 
""" @@ -38,7 +38,7 @@ async def build(self) -> None: def num_workers(self) -> int: """Return number of workers in the backend. - This is the number of TaskIns that can be run concurrently. + This is the number of TaskIns that can be processed concurrently. """ return 0 From cd48539ee8185333d9edaa16d0de7147a38601ec Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 21 Feb 2024 18:26:40 +0000 Subject: [PATCH 005/103] minor fixes --- src/py/flwr/server/app.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index 75fa372d084..e16ab4dc4b8 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -14,6 +14,7 @@ # ============================================================================== """Flower server app.""" + import argparse import importlib.util import json @@ -800,13 +801,13 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: vce_group.add_argument( "--num-supernodes", type=int, - help="Number of SuperNodes connected to the SuperLink.", + help="Number of SuperNodes to register with the SuperLink.", ) vce_group.add_argument( "--backend", default="ray", type=str, - help="Simulation Backend that process a ClientApp.", + help="Simulation Backend that processes a ClientApp.", ) vce_group.add_argument( "--backend-config", @@ -819,6 +820,6 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: parser.add_argument( "--dir", default="", - help="Add specified directory to the PYTHONPATH." + help="Add a specified directory to the PYTHONPATH." 
" Default: current working directory.", ) From 2791163b68004b14ad11945111ba36ebde38fef1 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 09:43:14 +0000 Subject: [PATCH 006/103] updates --- src/py/flwr/server/app.py | 7 ++++--- .../superlink/fleet/vce/backend/__init__.py | 3 ++- .../server/superlink/fleet/vce/backend/backend.py | 15 +++++++++++---- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 6 ++++-- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index 75fa372d084..b2e5cefe45d 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -24,7 +24,7 @@ from pathlib import Path from signal import SIGINT, SIGTERM, signal from types import FrameType -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple import grpc @@ -55,6 +55,7 @@ start_grpc_server, ) from .superlink.fleet.grpc_rere.fleet_servicer import FleetServicer +from .superlink.fleet.vce.backend import BackendConfig from .superlink.state import StateFactory ADDRESS_DRIVER_API = "0.0.0.0:9091" @@ -552,11 +553,11 @@ def _run_fleet_api_vce( num_supernodes: int, client_app_str: str, backend: str, - backend_config: Dict[str, Union[str, int, float]], + backend_config: BackendConfig, working_dir: str, state_factory: StateFactory, ) -> None: - from flwr.server.superlink.fleet.vce.vce_api import start_vce + from .superlink.fleet.vce.vce_api import start_vce log(INFO, "Flower VCE: Starting Fleet API (VirtualClientEngine)") diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py index 3ff90c288a5..305cb32c16e 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py @@ -14,8 +14,9 @@ # ============================================================================== """VirtualClientEngine Backends.""" -from .backend import 
Backend +from .backend import Backend, BackendConfig __all__ = [ "Backend", + "BackendConfig", ] diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index ff28724b14f..90745f12e71 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -16,22 +16,25 @@ from abc import ABC, abstractmethod -from typing import Callable, Tuple +from typing import Callable, Dict, Tuple, Union from flwr.client.clientapp import ClientApp from flwr.common.context import Context from flwr.common.message import Message +BackendConfig = Dict[str, Union[str, int, float]] + class Backend(ABC): """Abstract base class for a Backend.""" - async def build(self) -> None: + @abstractmethod + async def build(self, backend_config: BackendConfig) -> None: """Build backend asynchronously. Different components need to be inplace before workers in a backend are ready to - accept jobs. When this method finish executing, the backend should be fully ready - to run jobs. + accept jobs. When this method finish executing, the backend should be fully + ready to run jobs. 
""" @property @@ -42,6 +45,10 @@ def num_workers(self) -> int: """ return 0 + @abstractmethod + def is_worker_idle(self) -> bool: + """Report whether a backend worker is idle and can therefore run a ClientApp.""" + @abstractmethod async def process_message( self, diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index a6160125f1e..0c9b1589e89 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -16,13 +16,15 @@ from logging import INFO -from typing import Dict, Union +from typing import Dict from flwr.client.clientapp import ClientApp, load_client_app from flwr.client.node_state import NodeState from flwr.common.logger import log from flwr.server.superlink.state import StateFactory +from .backend import BackendConfig + NodeToPartitionMapping = Dict[int, int] @@ -44,7 +46,7 @@ def start_vce( num_supernodes: int, client_app_str: str, backend_str: str, - backend_config: Dict[str, Union[str, int, float]], + backend_config: BackendConfig, state_factory: StateFactory, working_dir: str, ) -> None: From 4ca33ece05702923c3ddd65ae5bf5529e8387241 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 09:59:27 +0000 Subject: [PATCH 007/103] backend-config should contain value types --- src/py/flwr/server/app.py | 7 ++++--- src/py/flwr/server/superlink/fleet/vce/backend/backend.py | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index b2e5cefe45d..ec64a6e8518 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -812,10 +812,11 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: vce_group.add_argument( "--backend-config", type=json.loads, - default='{"num_cpus":2, "num_gpus":0.0}', + default='{"client_resources": {"num_cpus":2, "num_gpus":0.0}', help='A dict in the form \'{"":, "":}\' to ' - "configure a backend. 
Pay close attention to how the quotes and double quotes " - "are set.", + "configure a backend. Values supported in are those included by " + "`flwr.common.typing.ConfigsRecordValues`. " + "Pay close attention to how the quotes and double quotes are set.", ) parser.add_argument( "--dir", diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index 90745f12e71..3f428061e9a 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -16,13 +16,14 @@ from abc import ABC, abstractmethod -from typing import Callable, Dict, Tuple, Union +from typing import Callable, Dict, Tuple from flwr.client.clientapp import ClientApp from flwr.common.context import Context from flwr.common.message import Message +from flwr.common.typing import ConfigsRecordValues -BackendConfig = Dict[str, Union[str, int, float]] +BackendConfig = Dict[str, ConfigsRecordValues] class Backend(ABC): From 1b6564ac578174f82bee1604813a6467e7d50840 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 10:00:27 +0000 Subject: [PATCH 008/103] fix --- src/py/flwr/server/superlink/fleet/vce/backend/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index 3f428061e9a..28a080b1252 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -23,7 +23,7 @@ from flwr.common.message import Message from flwr.common.typing import ConfigsRecordValues -BackendConfig = Dict[str, ConfigsRecordValues] +BackendConfig = Dict[str, Dict[str, ConfigsRecordValues]] class Backend(ABC): From bad872788a7814072ff97cc11a1b00ce7146c14d Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 10:12:12 +0000 Subject: [PATCH 009/103] w/ previous --- 
src/py/flwr/server/superlink/fleet/vce/backend/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index 28a080b1252..9b7cc18f3c0 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -30,7 +30,7 @@ class Backend(ABC): """Abstract base class for a Backend.""" @abstractmethod - async def build(self, backend_config: BackendConfig) -> None: + async def build(self) -> None: """Build backend asynchronously. Different components need to be inplace before workers in a backend are ready to From a68172fbef7d4bb4416e11cc856f5cd4170314c1 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 11:22:07 +0000 Subject: [PATCH 010/103] fix --- src/py/flwr/server/app.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index e16ab4dc4b8..f686cf1f9bb 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -317,15 +317,6 @@ def run_fleet_api() -> None: certificates=certificates, ) grpc_servers.append(fleet_server) - elif args.fleet_api_type == TRANSPORT_TYPE_VCE: - _run_fleet_api_vce( - num_supernodes=args.num_supernodes, - client_app_str=args.client_app, - backend=args.backend, - backend_config=args.backend_config, - working_dir=args.dir, - state_factory=state_factory, - ) else: raise ValueError(f"Unknown fleet_api_type: {args.fleet_api_type}") @@ -412,6 +403,15 @@ def run_superlink() -> None: certificates=certificates, ) grpc_servers.append(fleet_server) + elif args.fleet_api_type == TRANSPORT_TYPE_VCE: + _run_fleet_api_vce( + num_supernodes=args.num_supernodes, + client_app_str=args.client_app, + backend=args.backend, + backend_config=args.backend_config, + working_dir=args.dir, + state_factory=state_factory, + ) else: raise 
ValueError(f"Unknown fleet_api_type: {args.fleet_api_type}") From 68818666aa41aa5f14d1c06aa1eb6de8480eb116 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 11:25:17 +0000 Subject: [PATCH 011/103] fix for json.loads --- src/py/flwr/server/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index d7f8471a0aa..7acc69be5d5 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -813,7 +813,7 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: vce_group.add_argument( "--backend-config", type=json.loads, - default='{"client_resources": {"num_cpus":2, "num_gpus":0.0}', + default='{"client_resources": {"num_cpus":2, "num_gpus":0.0}}', help='A dict in the form \'{"":, "":}\' to ' "configure a backend. Values supported in are those included by " "`flwr.common.typing.ConfigsRecordValues`. " From 935e3337e774b6777ff3ebc9b3ccd2b99980dbfd Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 12:09:06 +0000 Subject: [PATCH 012/103] keep backend-config as json string --- src/py/flwr/server/app.py | 22 +++++++++---------- .../server/superlink/fleet/vce/vce_api.py | 9 +++++--- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index f686cf1f9bb..e11a58a19d2 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -17,7 +17,6 @@ import argparse import importlib.util -import json import sys import threading from logging import ERROR, INFO, WARN @@ -25,7 +24,7 @@ from pathlib import Path from signal import SIGINT, SIGTERM, signal from types import FrameType -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple import grpc @@ -408,7 +407,7 @@ def run_superlink() -> None: num_supernodes=args.num_supernodes, client_app_str=args.client_app, backend=args.backend, - backend_config=args.backend_config, + 
backend_config_json_str=args.backend_config, working_dir=args.dir, state_factory=state_factory, ) @@ -553,11 +552,11 @@ def _run_fleet_api_vce( num_supernodes: int, client_app_str: str, backend: str, - backend_config: Dict[str, Union[str, int, float]], + backend_config_json_str: str, working_dir: str, state_factory: StateFactory, ) -> None: - from flwr.server.superlink.fleet.vce.vce_api import start_vce + from .superlink.fleet.vce.vce_api import start_vce log(INFO, "Flower VCE: Starting Fleet API (VirtualClientEngine)") @@ -565,7 +564,7 @@ def _run_fleet_api_vce( num_supernodes=num_supernodes, client_app_str=client_app_str, backend_str=backend, - backend_config=backend_config, + backend_config_json_str=backend_config_json_str, state_factory=state_factory, working_dir=working_dir, ) @@ -811,11 +810,12 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: ) vce_group.add_argument( "--backend-config", - type=json.loads, - default='{"num_cpus":2, "num_gpus":0.0}', - help='A dict in the form \'{"":, "":}\' to ' - "configure a backend. Pay close attention to how the quotes and double quotes " - "are set.", + type=str, + default='{"client_resources": {"num_cpus":2, "num_gpus":0.0}}', + help='A JSON-like dict, e.g. \'{"":, "":}\' to ' + "configure a backend. Values supported in are those included by " + "`flwr.common.typing.ConfigsRecordValues`. 
" + "Pay close attention to how the quotes and double quotes are set.", ) parser.add_argument( "--dir", diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index a6160125f1e..88144b1c3c0 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -14,9 +14,9 @@ # ============================================================================== """Fleet VirtualClientEngine API.""" - +import json from logging import INFO -from typing import Dict, Union +from typing import Dict from flwr.client.clientapp import ClientApp, load_client_app from flwr.client.node_state import NodeState @@ -44,7 +44,7 @@ def start_vce( num_supernodes: int, client_app_str: str, backend_str: str, - backend_config: Dict[str, Union[str, int, float]], + backend_config_json_str: str, state_factory: StateFactory, working_dir: str, ) -> None: @@ -59,6 +59,9 @@ def start_vce( for node_id in nodes_mapping: node_states[node_id] = NodeState() + # Load backend config + _ = json.loads(backend_config_json_str) + log(INFO, "client_app_str = %s", client_app_str) def _load() -> ClientApp: From adfe198f60253b85fd6fbd2a572281d34c247880 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 13:10:56 +0000 Subject: [PATCH 013/103] added `RayBackend` and `SimpleActorPool` --- .../superlink/fleet/vce/backend/__init__.py | 7 + .../superlink/fleet/vce/backend/backend.py | 3 + .../superlink/fleet/vce/backend/raybackend.py | 148 ++++++++++++++++++ .../server/superlink/fleet/vce/vce_api.py | 18 ++- .../simulation/ray_transport/ray_actor.py | 73 ++++++++- 5 files changed, 246 insertions(+), 3 deletions(-) create mode 100644 src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py index 305cb32c16e..dd954907234 100644 --- 
a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py @@ -14,9 +14,16 @@ # ============================================================================== """VirtualClientEngine Backends.""" +from typing import Dict, Type + from .backend import Backend, BackendConfig +from .raybackend import RayBackend __all__ = [ "Backend", "BackendConfig", + "RayBackend", ] + +# mappy of supported backends +supported_backends: Dict[str, Type[Backend]] = {"ray": RayBackend} diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index 9b7cc18f3c0..4cc5432ce5f 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -29,6 +29,9 @@ class Backend(ABC): """Abstract base class for a Backend.""" + def __init__(self, backend_config: BackendConfig, work_dir: str) -> None: + """Construct a backend.""" + @abstractmethod async def build(self) -> None: """Build backend asynchronously. diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py new file mode 100644 index 00000000000..66511a16e0c --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -0,0 +1,148 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Ray backend for the Fleet API using the VCE.""" + +import asyncio +import pathlib +from logging import INFO +from typing import Callable, Dict, List, Tuple, Union + +from flwr.client.clientapp import ClientApp +from flwr.common.context import Context +from flwr.common.logger import log +from flwr.common.message import Message +from flwr.simulation.ray_transport.ray_actor import ( + BasicActorPool, + ClientAppActor, + init_ray, +) + +from .backend import Backend, BackendConfig + +ClienteResourcesDict = Dict[str, Union[int, float]] + + +class RayBackend(Backend): + """A backend that submits jobs to a `BasicActorPool`.""" + + def __init__( + self, + backend_config: BackendConfig, + work_dir: str, + ) -> None: + """Prepare RayBackend by initialising Ray and creating the ActorPool.""" + log(INFO, "Backend config: %s", backend_config) + + # Init ray and append working dir if needed + runtime_env = ( + self._configure_runtime_env(work_dir=work_dir) if work_dir else None + ) + init_ray(runtime_env=runtime_env) + + # Validate client resources + self.client_resources_key = "client_resources" + + # Create actor pool + client_resources = self._validate_client_resources(config=backend_config) + self.pool = BasicActorPool( + actor_type=ClientAppActor, + client_resources=client_resources, + ) + + def _configure_runtime_env(self, work_dir: str) -> Dict[str, Union[str, List[str]]]: + """Return list of files/subdirectories to exclude relateive to work_dir. + + Without this, Ray will push everything to the Ray Cluster. 
+ """ + runtime_env: Dict[str, Union[str, List[str]]] = {"working_dir": work_dir} + + if runtime_env: + excludes = [] + path = pathlib.Path(work_dir) + for p in path.rglob("*"): + # exclude files need to be relative to the working_dir + excludes.append(str(p.relative_to(path))) + runtime_env["excludes"] = excludes + + return runtime_env + + def _validate_client_resources(self, config: BackendConfig) -> ClienteResourcesDict: + client_resources_config = config.get(self.client_resources_key) + client_resources: ClienteResourcesDict = {} + valid_types = (int, float) + if client_resources_config: + for k, v in client_resources_config.items(): + assert isinstance(k, str), ValueError( + f"client resources keys are expected to be `str` but you used " + f"{type(k)} for `{k}`" + ) + assert isinstance(v, valid_types), ValueError( + f"client resources are expected to be of type {valid_types} but " + f"found `{type(v)}` for key `{k}`", + ) + client_resources[k] = v + + else: + client_resources = {"num_cpus": 2, "num_gpus": 0.0} + log( + INFO, + "`%s` not specified in backend config. Applying default setting: %s", + self.client_resources_key, + client_resources, + ) + + return client_resources + + @property + def num_workers(self) -> int: + """Return number of actors in pool.""" + return self.pool.num_actors + + def is_worker_idle(self) -> bool: + """Report whether the pool has idle actors.""" + return self.pool.is_actor_available() + + async def build(self) -> None: + """Build pool of Ray actors that this backend will submit jobs to.""" + await self.pool.add_actors_to_pool(self.pool.actors_capacity) + log(INFO, "Constructed ActorPool with: %i actors", self.pool.num_actors) + + async def process_message( + self, + app: Callable[[], ClientApp], + message: Message, + context: Context, + ) -> Tuple[Message, Context]: + """Run ClientApp that process a given message. + + Return output message and updated context. 
+ """ + node_id = message.metadata.dst_node_id + + # Submite a task to the pool + future = await self.pool.submit( + lambda a, a_fn, mssg, cid, state: a.run.remote(a_fn, mssg, cid, state), + (app, message, str(node_id), context), + ) + + await asyncio.wait([future]) + + # Fetch result + ( + out_mssg, + updated_context, + ) = await self.pool.fetch_result_and_return_actor_to_pool(future) + + return out_mssg, updated_context diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 88144b1c3c0..57e5fa77bb1 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -15,7 +15,7 @@ """Fleet VirtualClientEngine API.""" import json -from logging import INFO +from logging import ERROR, INFO from typing import Dict from flwr.client.clientapp import ClientApp, load_client_app @@ -23,6 +23,8 @@ from flwr.common.logger import log from flwr.server.superlink.state import StateFactory +from .backend import supported_backends + NodeToPartitionMapping = Dict[int, int] @@ -60,7 +62,19 @@ def start_vce( node_states[node_id] = NodeState() # Load backend config - _ = json.loads(backend_config_json_str) + backend_config = json.loads(backend_config_json_str) + + try: + backend_type = supported_backends[backend_str] + _ = backend_type(backend_config, work_dir=working_dir) + except KeyError as ex: + log( + ERROR, + "Backennd type `%s`, is not supported. 
Use any of %s", + backend_str, + list(supported_backends.keys()), + ) + raise ex log(INFO, "client_app_str = %s", client_app_str) diff --git a/src/py/flwr/simulation/ray_transport/ray_actor.py b/src/py/flwr/simulation/ray_transport/ray_actor.py index 70a220dc2a1..e2de8f8b947 100644 --- a/src/py/flwr/simulation/ray_transport/ray_actor.py +++ b/src/py/flwr/simulation/ray_transport/ray_actor.py @@ -14,7 +14,7 @@ # ============================================================================== """Ray-based Flower Actor and ActorPool implementation.""" - +import asyncio import threading import traceback from abc import ABC @@ -414,3 +414,74 @@ def get_client_result( # Fetch result belonging to the VirtualClient calling this method # Return both result from tasks and (potentially) updated run context return self._fetch_future_result(cid) + + +def init_ray(*args: Any, **kwargs: Any) -> None: + """Intialises Ray if not already initialised.""" + if not ray.is_initialized(): + ray.init(*args, **kwargs) + + +class BasicActorPool: + """A basic actor pool.""" + + def __init__( + self, + actor_type: Type[VirtualClientEngineActor], + client_resources: Dict[str, Union[int, float]], + ): + self.client_resources = client_resources + + # Queue of idle actors + self.pool: asyncio.Queue[Type[VirtualClientEngineActor]] = asyncio.Queue() + self.num_actors = 0 + + # A function that creates an actor + self.create_actor_fn = lambda: actor_type.options( # type: ignore + **client_resources + ).remote() + + # Figure out how many actors can be created given the cluster resources + # and the resources the user indicates each VirtualClient will need + self.actors_capacity = pool_size_from_resources(client_resources) + self._future_to_actor: Dict[Any, Type[VirtualClientEngineActor]] = {} + + def is_actor_available(self) -> bool: + """Return true if there is an idle actor.""" + return self.pool.qsize() > 0 + + async def add_actors_to_pool(self, num_actors: int) -> None: + """Add actors to the pool. 
+ + This method may be executed also if new resources are added to your Ray cluster + (e.g. you add a new node). + """ + for _ in range(num_actors): + await self.pool.put(self.create_actor_fn()) # type: ignore + self.num_actors += num_actors + + async def submit( + self, actor_fn: Any, job: Tuple[ClientAppFn, Message, str, Context] + ) -> Any: + """On idle actor, submit job and return future.""" + # Remove idle actor from pool + actor = await self.pool.get() + # Submit job to actor + app_fn, mssg, cid, context = job + future = actor_fn(actor, app_fn, mssg, cid, context) + # Keep track of future:actor (so we can fetch the actor upon job completion + # and add it back to the pool) + self._future_to_actor[future] = actor + return future + + async def fetch_result_and_return_actor_to_pool( + self, future: Any + ) -> Tuple[Message, Context]: + """Pull result given a future and add actor back to pool.""" + # Get actor that ran job + actor = self._future_to_actor.pop(future) + await self.pool.put(actor) + # Retrieve result for object store + # Instead of doing ray.get(future) we await it + _, out_mssg, updated_context = await future + return out_mssg, updated_context From d0bab9a59fd21f6298e9d94ebc081a25a5323435 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 14:35:38 +0000 Subject: [PATCH 014/103] complete VCE loop; works with `simulation-pytorch` example --- examples/simulation-pytorch/README.md | 21 ++-- examples/simulation-pytorch/sim.py | 49 +++++--- .../server/superlink/fleet/vce/vce_api.py | 119 +++++++++++++++++- 3 files changed, 157 insertions(+), 32 deletions(-) diff --git a/examples/simulation-pytorch/README.md b/examples/simulation-pytorch/README.md index 5ba5ec70dc3..f8e3c87770a 100644 --- a/examples/simulation-pytorch/README.md +++ b/examples/simulation-pytorch/README.md @@ -54,17 +54,13 @@ Write the command below in your terminal to install the dependencies according t pip install -r requirements.txt ``` -### Run Federated Learning Example 
+### Run with `start_simulation` -```bash -# You can run the example without activating your environemnt -poetry run python sim.py +Ensure you have activated your environment then: -# Or by first activating it -poetry shell +```bash # and then run the example python sim.py -# you can exit your environment by typing "exit" ``` You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 1xCPU core. For example: @@ -79,4 +75,15 @@ python sim.py --num_cpus=2 python sim.py --num_cpus=2 --num_gpus=0.2 ``` +### Run with `super-link` and `server-app` + +Ensure you have activated your environment, then: + +``` +flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_app + +# on a different terminal +flower-server-app sim:server_app --insecure +``` + Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. diff --git a/examples/simulation-pytorch/sim.py b/examples/simulation-pytorch/sim.py index 0a6ed8ebb9b..139cbf60bb8 100644 --- a/examples/simulation-pytorch/sim.py +++ b/examples/simulation-pytorch/sim.py @@ -29,9 +29,9 @@ default=0.0, help="Ratio of GPU memory to assign to a virtual client", ) -parser.add_argument("--num_rounds", type=int, default=10, help="Number of FL rounds.") NUM_CLIENTS = 100 +NUM_ROUNDS = 10 # Flower client, adapted from Pytorch quickstart example @@ -167,28 +167,37 @@ def evaluate( return evaluate +# Download MNIST dataset and partition it +mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) +centralized_testset = mnist_fds.load_full("test") + +# Configure the strategy +strategy = fl.server.strategy.FedAvg( + fraction_fit=0.1, # Sample 10% of available clients for training + fraction_evaluate=0.05, # Sample 5% of available clients for evaluation + min_available_clients=10, + on_fit_config_fn=fit_config, + 
evaluate_metrics_aggregation_fn=weighted_average, # Aggregate federated metrics + # evaluate_fn=get_evaluate_fn(centralized_testset), # Global evaluation function +) + + +# Run via `flower-client-app client:app` +client_app = fl.client.ClientApp( + client_fn=get_client_fn(mnist_fds), +) + + +server_app = fl.server.ServerApp( + config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), + strategy=strategy, +) + + def main(): # Parse input arguments args = parser.parse_args() - # Download MNIST dataset and partition it - mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) - centralized_testset = mnist_fds.load_full("test") - - # Configure the strategy - strategy = fl.server.strategy.FedAvg( - fraction_fit=0.1, # Sample 10% of available clients for training - fraction_evaluate=0.05, # Sample 5% of available clients for evaluation - min_fit_clients=10, # Never sample less than 10 clients for training - min_evaluate_clients=5, # Never sample less than 5 clients for evaluation - min_available_clients=int( - NUM_CLIENTS * 0.75 - ), # Wait until at least 75 clients are available - on_fit_config_fn=fit_config, - evaluate_metrics_aggregation_fn=weighted_average, # Aggregate federated metrics - evaluate_fn=get_evaluate_fn(centralized_testset), # Global evaluation function - ) - # Resources to be assigned to each virtual client client_resources = { "num_cpus": args.num_cpus, @@ -200,7 +209,7 @@ def main(): client_fn=get_client_fn(mnist_fds), num_clients=NUM_CLIENTS, client_resources=client_resources, - config=fl.server.ServerConfig(num_rounds=args.num_rounds), + config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), strategy=strategy, actor_kwargs={ "on_actor_init_fn": disable_progress_bar # disable tqdm on each actor/process spawning virtual clients diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 57e5fa77bb1..aacba9d1140 100644 --- 
a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -14,17 +14,23 @@ # ============================================================================== """Fleet VirtualClientEngine API.""" + +import asyncio import json -from logging import ERROR, INFO -from typing import Dict +import traceback +from logging import DEBUG, ERROR, INFO +from typing import Callable, Dict from flwr.client.clientapp import ClientApp, load_client_app from flwr.client.node_state import NodeState from flwr.common.logger import log +from flwr.common.serde import message_from_taskins, message_to_taskres +from flwr.proto.task_pb2 import TaskIns # pylint: disable=E0611 from flwr.server.superlink.state import StateFactory -from .backend import supported_backends +from .backend import Backend, supported_backends +TaskInsQueue = asyncio.Queue[TaskIns] NodeToPartitionMapping = Dict[int, int] @@ -41,6 +47,99 @@ def _register_nodes( return nodes_mapping +# pylint: disable=too-many-arguments +async def worker( + app: Callable[[], ClientApp], + queue: TaskInsQueue, + node_states: Dict[int, NodeState], + state_factory: StateFactory, + nodes_mapping: NodeToPartitionMapping, + backend: Backend, +) -> None: + """Get TaskIns from queue and pass it to an actor in the pool to execute it.""" + state = state_factory.state() + while True: + try: + task_ins = await queue.get() + node_id = task_ins.task.consumer.node_id + + # Register and retrive runstate + node_states[node_id].register_context(run_id=task_ins.run_id) + context = node_states[node_id].retrieve_context(run_id=task_ins.run_id) + + # Convert TaskIns to Message + message = message_from_taskins(task_ins) + # Replace node-id with data partition id + message.metadata.dst_node_id = nodes_mapping[node_id] + + # Let backend process message + out_mssg, updated_context = await backend.process_message( + app, message, context + ) + + # Update Context + node_states[node_id].update_context( + 
task_ins.run_id, context=updated_context + ) + + # Undo change node_id for partition choice + out_mssg.metadata._src_node_id = ( # pylint: disable=protected-access + task_ins.task.consumer.node_id + ) + # Convert to TaskRes + task_res = message_to_taskres(out_mssg) + # Store TaskRes in state + state.store_task_res(task_res) + + except Exception as ex: # pylint: disable=broad-exception-caught + # pylint: disable=fixme + # TODO: gen TaskRes with relevant error, add it to state_factory + log(ERROR, ex) + log(ERROR, traceback.format_exc()) + break + + +async def generate_pull_requests( + queue: TaskInsQueue, + state_factory: StateFactory, + nodes_mapping: NodeToPartitionMapping, +) -> None: + """Generate TaskIns and add it to the queue.""" + state = state_factory.state() + while True: + for node_id in nodes_mapping.keys(): + task_ins = state.get_task_ins(node_id=node_id, limit=1) + if task_ins: + await queue.put(task_ins[0]) + log(DEBUG, "TaskIns in queue: %i", queue.qsize()) + # pylint: disable=fixme + await asyncio.sleep(1.0) # TODO: revisit + + +async def run( + app: Callable[[], ClientApp], + backend: Backend, + nodes_mapping: NodeToPartitionMapping, + state_factory: StateFactory, + node_states: Dict[int, NodeState], +) -> None: + """Run the VCE async.""" + # pylint: disable=fixme + queue: TaskInsQueue = asyncio.Queue(64) # TODO: revisit + + # Build backend + await backend.build() + worker_tasks = [ + asyncio.create_task( + worker(app, queue, node_states, state_factory, nodes_mapping, backend) + ) + for _ in range(backend.num_workers) + ] + asyncio.create_task(generate_pull_requests(queue, state_factory, nodes_mapping)) + await queue.join() + await asyncio.gather(*worker_tasks) + + # pylint: disable=too-many-arguments,unused-argument def start_vce( num_supernodes: int, @@ -66,7 +165,7 @@ def start_vce( try: backend_type = supported_backends[backend_str] - _ = backend_type(backend_config, work_dir=working_dir) + backend = backend_type(backend_config, 
work_dir=working_dir) except KeyError as ex: log( ERROR, @@ -82,4 +181,14 @@ def _load() -> ClientApp: app: ClientApp = load_client_app(client_app_str) return app - # start backend + app = _load + + asyncio.run( + run( + app, + backend, + nodes_mapping, + state_factory, + node_states, + ) + ) From 5e0ee74dd505737abea8550125658a16badfc30c Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 14:47:19 +0000 Subject: [PATCH 015/103] fix exclude generation logic --- src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 66511a16e0c..5d552ea758b 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -73,7 +73,8 @@ def _configure_runtime_env(self, work_dir: str) -> Dict[str, Union[str, List[str path = pathlib.Path(work_dir) for p in path.rglob("*"): # exclude files need to be relative to the working_dir - excludes.append(str(p.relative_to(path))) + if p.is_file() and not str(p).endswith('.py'): + excludes.append(str(p.relative_to(path))) runtime_env["excludes"] = excludes return runtime_env From 4853813cc8462ed097477b3294271d9b8ef6e269 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 15:54:53 +0000 Subject: [PATCH 016/103] simulation-tf w/ Flower-next; updates pytorch example too --- examples/simulation-pytorch/README.md | 4 +- examples/simulation-pytorch/sim.py | 7 ++-- examples/simulation-tensorflow/README.md | 21 ++++++---- examples/simulation-tensorflow/sim.py | 52 ++++++++++++++---------- 4 files changed, 50 insertions(+), 34 deletions(-) diff --git a/examples/simulation-pytorch/README.md b/examples/simulation-pytorch/README.md index f8e3c87770a..33cef10cc03 100644 --- a/examples/simulation-pytorch/README.md +++ b/examples/simulation-pytorch/README.md @@ 
-54,7 +54,7 @@ Write the command below in your terminal to install the dependencies according t pip install -r requirements.txt ``` -### Run with `start_simulation` +### Run with `start_simulation()` Ensure you have activated your environment then: @@ -75,7 +75,7 @@ python sim.py --num_cpus=2 python sim.py --num_cpus=2 --num_gpus=0.2 ``` -### Run with `super-link` and `server-app` +### Run with Flower-Next (`super-link` and `server-app`) Ensure you have activated your environment, then: diff --git a/examples/simulation-pytorch/sim.py b/examples/simulation-pytorch/sim.py index 139cbf60bb8..84a00e3f092 100644 --- a/examples/simulation-pytorch/sim.py +++ b/examples/simulation-pytorch/sim.py @@ -178,16 +178,15 @@ def evaluate( min_available_clients=10, on_fit_config_fn=fit_config, evaluate_metrics_aggregation_fn=weighted_average, # Aggregate federated metrics - # evaluate_fn=get_evaluate_fn(centralized_testset), # Global evaluation function + evaluate_fn=get_evaluate_fn(centralized_testset), # Global evaluation function ) - -# Run via `flower-client-app client:app` +# ClientApp for Flower-Next client_app = fl.client.ClientApp( client_fn=get_client_fn(mnist_fds), ) - +# ServerApp for Flower-Next server_app = fl.server.ServerApp( config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), strategy=strategy, diff --git a/examples/simulation-tensorflow/README.md b/examples/simulation-tensorflow/README.md index 75be823db2e..900cdbebe52 100644 --- a/examples/simulation-tensorflow/README.md +++ b/examples/simulation-tensorflow/README.md @@ -53,17 +53,13 @@ Write the command below in your terminal to install the dependencies according t pip install -r requirements.txt ``` -### Run Federated Learning Example +### Run with `start_simulation()` -```bash -# You can run the example without activating your environemnt -poetry run python sim.py +Ensure you have activated your environment then: -# Or by first activating it -poetry shell +```bash # and then run the example python sim.py -# 
you can exit your environment by typing "exit" ``` You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 1xCPU core. For example: @@ -78,4 +74,15 @@ python sim.py --num_cpus=2 python sim.py --num_cpus=2 --num_gpus=0.2 ``` +### Run with Flower-Next (`super-link` and `server-app`) + +Ensure you have activated your environment, then: + +``` +flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_app + +# on a different terminal +flower-server-app sim:server_app --insecure +``` + Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. diff --git a/examples/simulation-tensorflow/sim.py b/examples/simulation-tensorflow/sim.py index 043c624a40a..5db708e3651 100644 --- a/examples/simulation-tensorflow/sim.py +++ b/examples/simulation-tensorflow/sim.py @@ -29,9 +29,9 @@ default=0.0, help="Ratio of GPU memory to assign to a virtual client", ) -parser.add_argument("--num_rounds", type=int, default=10, help="Number of FL rounds.") NUM_CLIENTS = 100 +NUM_ROUNDS = 10 VERBOSE = 0 @@ -129,29 +129,39 @@ def evaluate( return evaluate +# Download MNIST dataset and partition it +mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) +# Get the whole test set for centralised evaluation +centralized_testset = mnist_fds.load_full("test").to_tf_dataset( + columns="image", label_cols="label", batch_size=64 +) + +# Create FedAvg strategy +strategy = fl.server.strategy.FedAvg( + fraction_fit=0.1, # Sample 10% of available clients for training + fraction_evaluate=0.05, # Sample 5% of available clients for evaluation + min_fit_clients=10, # Never sample less than 10 clients for training + evaluate_metrics_aggregation_fn=weighted_average, # aggregates federated metrics + evaluate_fn=get_evaluate_fn(centralized_testset), # global evaluation function +) + + +# ClientApp for 
Flower-Next +client_app = fl.client.ClientApp( + client_fn=get_client_fn(mnist_fds), +) + +# ServerApp for Flower-Next +server_app = fl.server.ServerApp( + config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), + strategy=strategy, +) + + def main() -> None: # Parse input arguments args = parser.parse_args() - # Download MNIST dataset and partition it - mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) - # Get the whole test set for centralised evaluation - centralized_testset = mnist_fds.load_full("test").to_tf_dataset( - columns="image", label_cols="label", batch_size=64 - ) - - # Create FedAvg strategy - strategy = fl.server.strategy.FedAvg( - fraction_fit=0.1, # Sample 10% of available clients for training - fraction_evaluate=0.05, # Sample 5% of available clients for evaluation - min_fit_clients=10, # Never sample less than 10 clients for training - min_evaluate_clients=5, # Never sample less than 5 clients for evaluation - min_available_clients=int( - NUM_CLIENTS * 0.75 - ), # Wait until at least 75 clients are available - evaluate_metrics_aggregation_fn=weighted_average, # aggregates federated metrics - evaluate_fn=get_evaluate_fn(centralized_testset), # global evaluation function - ) # With a dictionary, you tell Flower's VirtualClientEngine that each # client needs exclusive access to these many resources in order to run @@ -164,7 +174,7 @@ def main() -> None: fl.simulation.start_simulation( client_fn=get_client_fn(mnist_fds), num_clients=NUM_CLIENTS, - config=fl.server.ServerConfig(num_rounds=args.num_rounds), + config=fl.server.ServerConfig(NUM_ROUNDS), strategy=strategy, client_resources=client_resources, actor_kwargs={ From 31787cf508b933d9d6f4d2e03eb76e3df4827760 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 16:33:12 +0000 Subject: [PATCH 017/103] format --- examples/simulation-tensorflow/sim.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/simulation-tensorflow/sim.py 
b/examples/simulation-tensorflow/sim.py index 5db708e3651..6f28eaee170 100644 --- a/examples/simulation-tensorflow/sim.py +++ b/examples/simulation-tensorflow/sim.py @@ -162,7 +162,6 @@ def main() -> None: # Parse input arguments args = parser.parse_args() - # With a dictionary, you tell Flower's VirtualClientEngine that each # client needs exclusive access to these many resources in order to run client_resources = { From 8522022c8395eafdbf8229411c9baddcaa280cbf Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 16:35:11 +0000 Subject: [PATCH 018/103] passing actor init kwargs --- .../flwr/server/superlink/fleet/vce/backend/raybackend.py | 4 +++- src/py/flwr/simulation/ray_transport/ray_actor.py | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 5d552ea758b..741cdee93a7 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -55,10 +55,12 @@ def __init__( self.client_resources_key = "client_resources" # Create actor pool + actor_kwargs = backend_config.get("actor_kwargs", {}) client_resources = self._validate_client_resources(config=backend_config) self.pool = BasicActorPool( actor_type=ClientAppActor, client_resources=client_resources, + actor_kwargs=actor_kwargs, ) def _configure_runtime_env(self, work_dir: str) -> Dict[str, Union[str, List[str]]]: @@ -73,7 +75,7 @@ def _configure_runtime_env(self, work_dir: str) -> Dict[str, Union[str, List[str path = pathlib.Path(work_dir) for p in path.rglob("*"): # exclude files need to be relative to the working_dir - if p.is_file() and not str(p).endswith('.py'): + if p.is_file() and not str(p).endswith(".py"): excludes.append(str(p.relative_to(path))) runtime_env["excludes"] = excludes diff --git a/src/py/flwr/simulation/ray_transport/ray_actor.py 
b/src/py/flwr/simulation/ray_transport/ray_actor.py index e2de8f8b947..b48e448b681 100644 --- a/src/py/flwr/simulation/ray_transport/ray_actor.py +++ b/src/py/flwr/simulation/ray_transport/ray_actor.py @@ -429,6 +429,7 @@ def __init__( self, actor_type: Type[VirtualClientEngineActor], client_resources: Dict[str, Union[int, float]], + actor_kwargs: Dict[str, Any], ): self.client_resources = client_resources @@ -436,10 +437,13 @@ def __init__( self.pool: asyncio.Queue[Type[VirtualClientEngineActor]] = asyncio.Queue() self.num_actors = 0 + # Resolve arguments to pass during actor init + actor_args = {} if actor_kwargs is None else actor_kwargs + # A function that creates an actor self.create_actor_fn = lambda: actor_type.options( # type: ignore **client_resources - ).remote() + ).remote(**actor_args) # Figure out how many actors can be created given the cluster resources # and the resources the user indicates each VirtualClient will need From 92005133226bf8809e7ab35642b5d7812883b34d Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 17:04:31 +0000 Subject: [PATCH 019/103] updated examples --- examples/simulation-pytorch/README.md | 20 ++++++++++++++++---- examples/simulation-tensorflow/README.md | 23 ++++++++++++++++++----- examples/simulation-tensorflow/sim.py | 1 + 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/examples/simulation-pytorch/README.md b/examples/simulation-pytorch/README.md index 33cef10cc03..8b21e845ddc 100644 --- a/examples/simulation-pytorch/README.md +++ b/examples/simulation-pytorch/README.md @@ -67,12 +67,12 @@ You can adjust the CPU/GPU resources you assign to each of your virtual clients. 
```bash # Will assign 2xCPUs to each client -python sim.py --num_cpus=2 +python sim.py --num_cpus=4 -# Will assign 2xCPUs and 20% of the GPU's VRAM to each client -# This means that you can have 5 concurrent clients on each GPU +# Will assign 4xCPUs and 25% of the GPU's VRAM to each client +# This means that you can have 4 concurrent clients on each GPU # (assuming you have enough CPUs) -python sim.py --num_cpus=2 --num_gpus=0.2 +python sim.py --num_cpus=4 --num_gpus=0.25 ``` ### Run with Flower-Next (`super-link` and `server-app`) @@ -86,4 +86,16 @@ flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_a flower-server-app sim:server_app --insecure ``` +You can change the default resources assigned to each `ClientApp` by means the `--backend-config` argument: + +```bash +# Tells the VCE to resever 4x CPUs and 25% of available VRAM for each ClientApp +flower-superlink --insecure --vce --num-supernodes 100 \ + --client-app sim:client_app \ + --backend-config='{"client_resources": {"num_cpus":4, "num_gpus":0.25}}' + +# Then you can launch the `flower-server-app` command as shown earlier. +``` + + Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. diff --git a/examples/simulation-tensorflow/README.md b/examples/simulation-tensorflow/README.md index 900cdbebe52..7e0225ad524 100644 --- a/examples/simulation-tensorflow/README.md +++ b/examples/simulation-tensorflow/README.md @@ -66,13 +66,14 @@ You can adjust the CPU/GPU resources you assign to each of your virtual clients. 
```bash # Will assign 2xCPUs to each client -python sim.py --num_cpus=2 +python sim.py --num_cpus=4 -# Will assign 2xCPUs and 20% of the GPU's VRAM to each client -# This means that you can have 5 concurrent clients on each GPU +# Will assign 4xCPUs and 25% of the GPU's VRAM to each client +# This means that you can have 4 concurrent clients on each GPU # (assuming you have enough CPUs) -python sim.py --num_cpus=2 --num_gpus=0.2 +python sim.py --num_cpus=4 --num_gpus=0.25 ``` +Because TensorFlow by default maps all the available VRAM, we need to [enable GPU memory growth](https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth), see how it is done in the example (`sim.py`) for both the "main" process (where the server/strategy runs) and for the clients (using the `actor_kwargs`) ### Run with Flower-Next (`super-link` and `server-app`) @@ -85,4 +86,16 @@ flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_a flower-server-app sim:server_app --insecure ``` -Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. +You can change the default resources assigned to each `ClientApp` by means the `--backend-config` argument. Note that we need to flag that the backend is going to use `TensorFlow`. In this way it will enable GPU memory growth. + +```bash +# Tells the VCE to resever 4x CPUs and 25% of available VRAM for each ClientApp +flower-superlink --insecure --vce --num-supernodes 100 \ + --client-app sim:client_app \ + --backend-config='{"client_resources": {"num_cpus":4, "num_gpus":0.25}, "tensorflow": 1}' + +# Then you can launch the `flower-server-app` command as shown earlier. +``` + + +Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. 
diff --git a/examples/simulation-tensorflow/sim.py b/examples/simulation-tensorflow/sim.py index 6f28eaee170..dbba71ac2cf 100644 --- a/examples/simulation-tensorflow/sim.py +++ b/examples/simulation-tensorflow/sim.py @@ -152,6 +152,7 @@ def evaluate( ) # ServerApp for Flower-Next +# TODO: Unclear how to enable GPU growth for the ServerApp server_app = fl.server.ServerApp( config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), strategy=strategy, From d8935b35eb9efe36328e3bd4043eea9f14759b64 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 17:05:06 +0000 Subject: [PATCH 020/103] auto enable GPU growth if 'tensorflow' passed --- .../flwr/server/superlink/fleet/vce/backend/raybackend.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 741cdee93a7..1710ff7d937 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -28,6 +28,8 @@ ClientAppActor, init_ray, ) +from flwr.simulation.ray_transport.utils import enable_tf_gpu_growth + from .backend import Backend, BackendConfig @@ -55,7 +57,9 @@ def __init__( self.client_resources_key = "client_resources" # Create actor pool - actor_kwargs = backend_config.get("actor_kwargs", {}) + use_tf = backend_config.get("tensorflow", False) + actor_kwargs = {"on_actor_init_fn": enable_tf_gpu_growth } if use_tf else {} + client_resources = self._validate_client_resources(config=backend_config) self.pool = BasicActorPool( actor_type=ClientAppActor, From b108be22c97d1ec21f0111ca125eb608f22b3828 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 17:43:27 +0000 Subject: [PATCH 021/103] return to default 1xCPU for virtual client --- examples/simulation-pytorch/README.md | 10 +++++----- examples/simulation-tensorflow/README.md | 14 +++++++------- src/py/flwr/server/app.py | 2 +- 3 
files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/simulation-pytorch/README.md b/examples/simulation-pytorch/README.md index 8b21e845ddc..b95dcdceb09 100644 --- a/examples/simulation-pytorch/README.md +++ b/examples/simulation-pytorch/README.md @@ -67,12 +67,12 @@ You can adjust the CPU/GPU resources you assign to each of your virtual clients. ```bash # Will assign 2xCPUs to each client -python sim.py --num_cpus=4 +python sim.py --num_cpus=2 -# Will assign 4xCPUs and 25% of the GPU's VRAM to each client +# Will assign 2xCPUs and 25% of the GPU's VRAM to each client # This means that you can have 4 concurrent clients on each GPU # (assuming you have enough CPUs) -python sim.py --num_cpus=4 --num_gpus=0.25 +python sim.py --num_cpus=2 --num_gpus=0.25 ``` ### Run with Flower-Next (`super-link` and `server-app`) @@ -89,10 +89,10 @@ flower-server-app sim:server_app --insecure You can change the default resources assigned to each `ClientApp` by means the `--backend-config` argument: ```bash -# Tells the VCE to resever 4x CPUs and 25% of available VRAM for each ClientApp +# Tells the VCE to resever 2x CPUs and 25% of available VRAM for each ClientApp flower-superlink --insecure --vce --num-supernodes 100 \ --client-app sim:client_app \ - --backend-config='{"client_resources": {"num_cpus":4, "num_gpus":0.25}}' + --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}}' # Then you can launch the `flower-server-app` command as shown earlier. ``` diff --git a/examples/simulation-tensorflow/README.md b/examples/simulation-tensorflow/README.md index 7e0225ad524..8718987eb6e 100644 --- a/examples/simulation-tensorflow/README.md +++ b/examples/simulation-tensorflow/README.md @@ -62,16 +62,16 @@ Ensure you have activated your environment then: python sim.py ``` -You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 1xCPU core. 
For example: +You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 2xCPU core. For example: ```bash # Will assign 2xCPUs to each client -python sim.py --num_cpus=4 +python sim.py --num_cpus=2 -# Will assign 4xCPUs and 25% of the GPU's VRAM to each client +# Will assign 2xCPUs and 25% of the GPU's VRAM to each client # This means that you can have 4 concurrent clients on each GPU # (assuming you have enough CPUs) -python sim.py --num_cpus=4 --num_gpus=0.25 +python sim.py --num_cpus=2 --num_gpus=0.25 ``` Because TensorFlow by default maps all the available VRAM, we need to [enable GPU memory growth](https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth), see how it is done in the example (`sim.py`) for both the "main" process (where the server/strategy runs) and for the clients (using the `actor_kwargs`) @@ -86,13 +86,13 @@ flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_a flower-server-app sim:server_app --insecure ``` -You can change the default resources assigned to each `ClientApp` by means the `--backend-config` argument. Note that we need to flag that the backend is going to use `TensorFlow`. In this way it will enable GPU memory growth. +You can change the default resources assigned to each `ClientApp` using the `--backend-config` argument. Note that we need to flag that the backend is going to use `TensorFlow`. In this way, it will enable GPU memory growth. 
```bash -# Tells the VCE to resever 4x CPUs and 25% of available VRAM for each ClientApp +# Tells the VCE to resever 2x CPUs and 25% of available VRAM for each ClientApp flower-superlink --insecure --vce --num-supernodes 100 \ --client-app sim:client_app \ - --backend-config='{"client_resources": {"num_cpus":4, "num_gpus":0.25}, "tensorflow": 1}' + --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}, "tensorflow": 1}' # Then you can launch the `flower-server-app` command as shown earlier. ``` diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index e11a58a19d2..8eb5a96bf42 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -811,7 +811,7 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: vce_group.add_argument( "--backend-config", type=str, - default='{"client_resources": {"num_cpus":2, "num_gpus":0.0}}', + default='{"client_resources": {"num_cpus":1, "num_gpus":0.0}}', help='A JSON-like dict, e.g. \'{"":, "":}\' to ' "configure a backend. Values supported in are those included by " "`flwr.common.typing.ConfigsRecordValues`. 
" From 0e02b05cd60b6f364182321f9f0ef8734334ea4e Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 18:50:58 +0000 Subject: [PATCH 022/103] moved import --- src/py/flwr/server/app.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index e11a58a19d2..0a24c1e36e4 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -55,6 +55,7 @@ start_grpc_server, ) from .superlink.fleet.grpc_rere.fleet_servicer import FleetServicer +from .superlink.fleet.vce.vce_api import start_vce from .superlink.state import StateFactory ADDRESS_DRIVER_API = "0.0.0.0:9091" @@ -547,7 +548,7 @@ def _run_fleet_api_grpc_rere( return fleet_grpc_server -# pylint: disable=import-outside-toplevel,too-many-arguments +# pylint: disable=too-many-arguments def _run_fleet_api_vce( num_supernodes: int, client_app_str: str, @@ -556,8 +557,6 @@ def _run_fleet_api_vce( working_dir: str, state_factory: StateFactory, ) -> None: - from .superlink.fleet.vce.vce_api import start_vce - log(INFO, "Flower VCE: Starting Fleet API (VirtualClientEngine)") start_vce( From fd67f22d09e6097667dbe56e22c53e2d7c96fc01 Mon Sep 17 00:00:00 2001 From: Javier Date: Thu, 22 Feb 2024 19:29:45 +0000 Subject: [PATCH 023/103] Apply suggestions from code review Co-authored-by: Daniel J. 
Beutel --- src/py/flwr/server/app.py | 7 ++++--- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index 0a24c1e36e4..84eca40e995 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -799,13 +799,13 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: vce_group.add_argument( "--num-supernodes", type=int, - help="Number of SuperNodes to register with the SuperLink.", + help="Number of simulated SuperNodes.", ) vce_group.add_argument( "--backend", default="ray", type=str, - help="Simulation Backend that processes a ClientApp.", + help="Simulation backend that executes the ClientApp.", ) vce_group.add_argument( "--backend-config", @@ -819,6 +819,7 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: parser.add_argument( "--dir", default="", - help="Add a specified directory to the PYTHONPATH." + help="Add specified directory to the PYTHONPATH and load" + "ClientApp from there." 
" Default: current working directory.", ) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 88144b1c3c0..9357693a0e8 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -29,7 +29,7 @@ def _register_nodes( num_nodes: int, state_factory: StateFactory ) -> NodeToPartitionMapping: - """Registre nodes with the StateFactory and create node-id:partition-id mapping.""" + """Register nodes with the StateFactory and create node-id:partition-id mapping.""" nodes_mapping: NodeToPartitionMapping = {} state = state_factory.state() for i in range(num_nodes): From cf004d8ed4bbb520a88b05bc0ce888809149dd17 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 19:43:23 +0000 Subject: [PATCH 024/103] renamed vars; exporting --- src/py/flwr/server/app.py | 27 +++++++++---------- .../server/superlink/fleet/vce/__init__.py | 6 +++++ .../server/superlink/fleet/vce/vce_api.py | 12 ++++----- 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index 84eca40e995..c8cdef9ff32 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -55,7 +55,7 @@ start_grpc_server, ) from .superlink.fleet.grpc_rere.fleet_servicer import FleetServicer -from .superlink.fleet.vce.vce_api import start_vce +from .superlink.fleet.vce import start_vce from .superlink.state import StateFactory ADDRESS_DRIVER_API = "0.0.0.0:9091" @@ -406,9 +406,9 @@ def run_superlink() -> None: elif args.fleet_api_type == TRANSPORT_TYPE_VCE: _run_fleet_api_vce( num_supernodes=args.num_supernodes, - client_app_str=args.client_app, - backend=args.backend, - backend_config_json_str=args.backend_config, + client_app_module_name=args.client_app, + backend_name=args.backend, + backend_config_json_stream=args.backend_config, working_dir=args.dir, state_factory=state_factory, ) @@ -551,9 +551,9 @@ def 
_run_fleet_api_grpc_rere( # pylint: disable=too-many-arguments def _run_fleet_api_vce( num_supernodes: int, - client_app_str: str, - backend: str, - backend_config_json_str: str, + client_app_module_name: str, + backend_name: str, + backend_config_json_stream: str, working_dir: str, state_factory: StateFactory, ) -> None: @@ -561,9 +561,9 @@ def _run_fleet_api_vce( start_vce( num_supernodes=num_supernodes, - client_app_str=client_app_str, - backend_str=backend, - backend_config_json_str=backend_config_json_str, + client_app_module_name=client_app_module_name, + backend_name=backend_name, + backend_config_json_stream=backend_config_json_stream, state_factory=state_factory, working_dir=working_dir, ) @@ -810,11 +810,10 @@ def _add_args_fleet_api(parser: argparse.ArgumentParser) -> None: vce_group.add_argument( "--backend-config", type=str, - default='{"client_resources": {"num_cpus":2, "num_gpus":0.0}}', - help='A JSON-like dict, e.g. \'{"":, "":}\' to ' + default='{"client_resources": {"num_cpus":1, "num_gpus":0.0}, "tensorflow": 0}', + help='A JSON formatted stream, e.g \'{"":, "":}\' to ' "configure a backend. Values supported in are those included by " - "`flwr.common.typing.ConfigsRecordValues`. " - "Pay close attention to how the quotes and double quotes are set.", + "`flwr.common.typing.ConfigsRecordValues`. ", ) parser.add_argument( "--dir", diff --git a/src/py/flwr/server/superlink/fleet/vce/__init__.py b/src/py/flwr/server/superlink/fleet/vce/__init__.py index 563f77595e1..72cd76f7376 100644 --- a/src/py/flwr/server/superlink/fleet/vce/__init__.py +++ b/src/py/flwr/server/superlink/fleet/vce/__init__.py @@ -13,3 +13,9 @@ # limitations under the License. 
# ============================================================================== """Fleet VirtualClientEngine side.""" + +from .vce_api import start_vce + +__all__ = [ + "start_vce", +] diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 9357693a0e8..8c76b401b91 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -42,9 +42,9 @@ def _register_nodes( # pylint: disable=too-many-arguments,unused-argument def start_vce( num_supernodes: int, - client_app_str: str, - backend_str: str, - backend_config_json_str: str, + client_app_module_name: str, + backend_name: str, + backend_config_json_stream: str, state_factory: StateFactory, working_dir: str, ) -> None: @@ -60,12 +60,12 @@ def start_vce( node_states[node_id] = NodeState() # Load backend config - _ = json.loads(backend_config_json_str) + _ = json.loads(backend_config_json_stream) - log(INFO, "client_app_str = %s", client_app_str) + log(INFO, "client_app_str = %s", client_app_module_name) def _load() -> ClientApp: - app: ClientApp = load_client_app(client_app_str) + app: ClientApp = load_client_app(client_app_module_name) return app # start backend From a521b402b6d45a542b32c04845078bf2bcc45cc8 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 22 Feb 2024 20:58:01 +0000 Subject: [PATCH 025/103] moved --- src/py/flwr/server/app.py | 47 ++------------------------------ src/py/flwr/server/compat/app.py | 3 +- src/py/flwr/server/server.py | 47 ++++++++++++++++++++++++++++++-- src/py/flwr/simulation/app.py | 3 +- 4 files changed, 50 insertions(+), 50 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index c8cdef9ff32..ac7a8339b31 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -44,11 +44,11 @@ add_FleetServicer_to_server, ) -from .client_manager import ClientManager, SimpleClientManager +from .client_manager import ClientManager 
from .history import History -from .server import Server +from .server import Server, init_defaults, run_fl from .server_config import ServerConfig -from .strategy import FedAvg, Strategy +from .strategy import Strategy from .superlink.driver.driver_servicer import DriverServicer from .superlink.fleet.grpc_bidi.grpc_server import ( generic_create_grpc_server, @@ -185,47 +185,6 @@ def start_server( # pylint: disable=too-many-arguments,too-many-locals return hist -def init_defaults( - server: Optional[Server], - config: Optional[ServerConfig], - strategy: Optional[Strategy], - client_manager: Optional[ClientManager], -) -> Tuple[Server, ServerConfig]: - """Create server instance if none was given.""" - if server is None: - if client_manager is None: - client_manager = SimpleClientManager() - if strategy is None: - strategy = FedAvg() - server = Server(client_manager=client_manager, strategy=strategy) - elif strategy is not None: - log(WARN, "Both server and strategy were provided, ignoring strategy") - - # Set default config values - if config is None: - config = ServerConfig() - - return server, config - - -def run_fl( - server: Server, - config: ServerConfig, -) -> History: - """Train a model on the given server and return the History object.""" - hist = server.fit(num_rounds=config.num_rounds, timeout=config.round_timeout) - log(INFO, "app_fit: losses_distributed %s", str(hist.losses_distributed)) - log(INFO, "app_fit: metrics_distributed_fit %s", str(hist.metrics_distributed_fit)) - log(INFO, "app_fit: metrics_distributed %s", str(hist.metrics_distributed)) - log(INFO, "app_fit: losses_centralized %s", str(hist.losses_centralized)) - log(INFO, "app_fit: metrics_centralized %s", str(hist.metrics_centralized)) - - # Graceful shutdown - server.disconnect_all_clients(timeout=config.round_timeout) - - return hist - - def run_driver_api() -> None: """Run Flower server (Driver API).""" log(INFO, "Starting Flower server (Driver API)") diff --git 
a/src/py/flwr/server/compat/app.py b/src/py/flwr/server/compat/app.py index c0255391b88..3df779ebf99 100644 --- a/src/py/flwr/server/compat/app.py +++ b/src/py/flwr/server/compat/app.py @@ -26,10 +26,9 @@ from flwr.common.address import parse_address from flwr.common.logger import log, warn_deprecated_feature from flwr.proto import driver_pb2 # pylint: disable=E0611 -from flwr.server.app import init_defaults, run_fl from flwr.server.client_manager import ClientManager from flwr.server.history import History -from flwr.server.server import Server +from flwr.server.server import Server, init_defaults, run_fl from flwr.server.server_config import ServerConfig from flwr.server.strategy import Strategy diff --git a/src/py/flwr/server/server.py b/src/py/flwr/server/server.py index cf3a4d9aa07..ea62587b7de 100644 --- a/src/py/flwr/server/server.py +++ b/src/py/flwr/server/server.py @@ -17,7 +17,7 @@ import concurrent.futures import timeit -from logging import DEBUG, INFO +from logging import DEBUG, INFO, WARN from typing import Dict, List, Optional, Tuple, Union from flwr.common import ( @@ -33,11 +33,13 @@ ) from flwr.common.logger import log from flwr.common.typing import GetParametersIns -from flwr.server.client_manager import ClientManager +from flwr.server.client_manager import ClientManager, SimpleClientManager from flwr.server.client_proxy import ClientProxy from flwr.server.history import History from flwr.server.strategy import FedAvg, Strategy +from .server_config import ServerConfig + FitResultsAndFailures = Tuple[ List[Tuple[ClientProxy, FitRes]], List[Union[Tuple[ClientProxy, FitRes], BaseException]], @@ -441,3 +443,44 @@ def _handle_finished_future_after_evaluate( # Not successful, client returned a result where the status code is not OK failures.append(result) + + +def init_defaults( + server: Optional[Server], + config: Optional[ServerConfig], + strategy: Optional[Strategy], + client_manager: Optional[ClientManager], +) -> Tuple[Server, ServerConfig]: + 
"""Create server instance if none was given.""" + if server is None: + if client_manager is None: + client_manager = SimpleClientManager() + if strategy is None: + strategy = FedAvg() + server = Server(client_manager=client_manager, strategy=strategy) + elif strategy is not None: + log(WARN, "Both server and strategy were provided, ignoring strategy") + + # Set default config values + if config is None: + config = ServerConfig() + + return server, config + + +def run_fl( + server: Server, + config: ServerConfig, +) -> History: + """Train a model on the given server and return the History object.""" + hist = server.fit(num_rounds=config.num_rounds, timeout=config.round_timeout) + log(INFO, "app_fit: losses_distributed %s", str(hist.losses_distributed)) + log(INFO, "app_fit: metrics_distributed_fit %s", str(hist.metrics_distributed_fit)) + log(INFO, "app_fit: metrics_distributed %s", str(hist.metrics_distributed)) + log(INFO, "app_fit: losses_centralized %s", str(hist.losses_centralized)) + log(INFO, "app_fit: metrics_centralized %s", str(hist.metrics_centralized)) + + # Graceful shutdown + server.disconnect_all_clients(timeout=config.round_timeout) + + return hist diff --git a/src/py/flwr/simulation/app.py b/src/py/flwr/simulation/app.py index f3ffe632bbe..ff18f37664b 100644 --- a/src/py/flwr/simulation/app.py +++ b/src/py/flwr/simulation/app.py @@ -28,10 +28,9 @@ from flwr.client import ClientFn from flwr.common import EventType, event from flwr.common.logger import log -from flwr.server import Server -from flwr.server.app import init_defaults, run_fl from flwr.server.client_manager import ClientManager from flwr.server.history import History +from flwr.server.server import Server, init_defaults, run_fl from flwr.server.server_config import ServerConfig from flwr.server.strategy import Strategy from flwr.simulation.ray_transport.ray_actor import ( From 443551fb65721739fcfffe96760a7d2c4b3814cf Mon Sep 17 00:00:00 2001 From: jafermarq Date: Fri, 23 Feb 2024 00:31:28 
+0000 Subject: [PATCH 026/103] revisited imports readiness for chosen backend --- .../superlink/fleet/vce/backend/__init__.py | 29 +++++++++++++++---- .../superlink/fleet/vce/backend/raybackend.py | 1 + .../server/superlink/fleet/vce/vce_api.py | 9 ++++-- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py index dd954907234..80e93f74e4b 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py @@ -14,16 +14,35 @@ # ============================================================================== """VirtualClientEngine Backends.""" +import importlib from typing import Dict, Type from .backend import Backend, BackendConfig -from .raybackend import RayBackend + +is_ray_installed = importlib.util.find_spec("ray") is not None + +# mapping of supported backends +supported_backends: Dict[str, Type[Backend]] = {} + +# To log backend-specific error message when chosen backend isn't available +error_messages_backends: Dict[str, str] = {} + +if is_ray_installed: + from .raybackend import RayBackend + + supported_backends["ray"] = RayBackend +else: + error_messages_backends[ + "ray" + ] = """Unable to import module `ray`. 
+ + To install the necessary dependencies, install `flwr` with the `simulation` extra: + + pip install -U flwr["simulation"] + """ + __all__ = [ "Backend", "BackendConfig", - "RayBackend", ] - -# mappy of supported backends -supported_backends: Dict[str, Type[Backend]] = {"ray": RayBackend} diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 741cdee93a7..ce66300c361 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -43,6 +43,7 @@ def __init__( work_dir: str, ) -> None: """Prepare RayBackend by initialising Ray and creating the ActorPool.""" + log(INFO, "Initialising: %s", self.__class__.__name__) log(INFO, "Backend config: %s", backend_config) # Init ray and append working dir if needed diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index ed14cb76769..74ddbc1d279 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -23,7 +23,7 @@ from flwr.common.logger import log from flwr.server.superlink.state import StateFactory -from .backend import supported_backends +from .backend import error_messages_backends, supported_backends NodeToPartitionMapping = Dict[int, int] @@ -62,6 +62,7 @@ def start_vce( node_states[node_id] = NodeState() # Load backend config + log(INFO, "Supported backends: %s", list(supported_backends.keys())) backend_config = json.loads(backend_config_json_stream) try: @@ -70,10 +71,14 @@ def start_vce( except KeyError as ex: log( ERROR, - "Backennd type `%s`, is not supported. Use any of %s", + "Backend `%s`, is not supported. 
Use any of %s or add support " + "for a new backend.", backend_name, list(supported_backends.keys()), ) + if backend_name in error_messages_backends: + log(ERROR, error_messages_backends[backend_name]) + raise ex log(INFO, "client_app_str = %s", client_app_module_name) From 79f363e73014c62e694a247739b8927c821dae0a Mon Sep 17 00:00:00 2001 From: Javier Date: Fri, 23 Feb 2024 16:02:34 +0000 Subject: [PATCH 027/103] Apply suggestions from code review Co-authored-by: Daniel J. Beutel --- .../flwr/server/superlink/fleet/vce/backend/__init__.py | 2 +- .../flwr/server/superlink/fleet/vce/backend/raybackend.py | 8 ++++---- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py index 80e93f74e4b..8c351743dbd 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py @@ -21,7 +21,7 @@ is_ray_installed = importlib.util.find_spec("ray") is not None -# mapping of supported backends +# Mapping of supported backends supported_backends: Dict[str, Type[Backend]] = {} # To log backend-specific error message when chosen backend isn't available diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index ce66300c361..b1099ab78f8 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Ray backend for the Fleet API using the VCE.""" +"""Ray backend for the Fleet API using the Simulation Engine.""" import asyncio import pathlib @@ -31,7 +31,7 @@ from .backend import Backend, BackendConfig -ClienteResourcesDict = Dict[str, Union[int, float]] +ClientResourcesDict = Dict[str, Union[int, float]] class RayBackend(Backend): @@ -65,7 +65,7 @@ def __init__( ) def _configure_runtime_env(self, work_dir: str) -> Dict[str, Union[str, List[str]]]: - """Return list of files/subdirectories to exclude relateive to work_dir. + """Return list of files/subdirectories to exclude relative to work_dir. Without this, Ray will push everything to the Ray Cluster. """ @@ -75,7 +75,7 @@ def _configure_runtime_env(self, work_dir: str) -> Dict[str, Union[str, List[str excludes = [] path = pathlib.Path(work_dir) for p in path.rglob("*"): - # exclude files need to be relative to the working_dir + # Exclude files need to be relative to the working_dir if p.is_file() and not str(p).endswith(".py"): excludes.append(str(p.relative_to(path))) runtime_env["excludes"] = excludes diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 74ddbc1d279..c91bae9ddab 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -81,7 +81,7 @@ def start_vce( raise ex - log(INFO, "client_app_str = %s", client_app_module_name) + log(INFO, "client_app_module_name = %s", client_app_module_name) def _load() -> ClientApp: app: ClientApp = load_client_app(client_app_module_name) From 12fa44c03c5724ab3a03b1496ca70b82ba69034a Mon Sep 17 00:00:00 2001 From: jafermarq Date: Fri, 23 Feb 2024 16:54:35 +0000 Subject: [PATCH 028/103] remove suprefluous if --- .../superlink/fleet/vce/backend/raybackend.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git 
a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index b1099ab78f8..f223d8ba9cb 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -71,20 +71,19 @@ def _configure_runtime_env(self, work_dir: str) -> Dict[str, Union[str, List[str """ runtime_env: Dict[str, Union[str, List[str]]] = {"working_dir": work_dir} - if runtime_env: - excludes = [] - path = pathlib.Path(work_dir) - for p in path.rglob("*"): - # Exclude files need to be relative to the working_dir - if p.is_file() and not str(p).endswith(".py"): - excludes.append(str(p.relative_to(path))) - runtime_env["excludes"] = excludes + excludes = [] + path = pathlib.Path(work_dir) + for p in path.rglob("*"): + # Exclude files need to be relative to the working_dir + if p.is_file() and not str(p).endswith(".py"): + excludes.append(str(p.relative_to(path))) + runtime_env["excludes"] = excludes return runtime_env - def _validate_client_resources(self, config: BackendConfig) -> ClienteResourcesDict: + def _validate_client_resources(self, config: BackendConfig) -> ClientResourcesDict: client_resources_config = config.get(self.client_resources_key) - client_resources: ClienteResourcesDict = {} + client_resources: ClientResourcesDict = {} valid_types = (int, float) if client_resources_config: for k, v in client_resources_config.items(): From c30904620401c847c59c79cf9e10121680f62c38 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Fri, 23 Feb 2024 17:15:02 +0000 Subject: [PATCH 029/103] fixes --- .../superlink/fleet/vce/backend/raybackend.py | 18 ++++++++++-------- .../flwr/simulation/ray_transport/ray_actor.py | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index f223d8ba9cb..24620aab083 100644 --- 
a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -87,14 +87,16 @@ def _validate_client_resources(self, config: BackendConfig) -> ClientResourcesDi valid_types = (int, float) if client_resources_config: for k, v in client_resources_config.items(): - assert isinstance(k, str), ValueError( - f"client resources keys are expected to be `str` but you used " - f"{type(k)} for `{k}`" - ) - assert isinstance(v, valid_types), ValueError( - f"client resources are expected to be of type {valid_types} but " - f"found `{type(v)}` for key `{k}`", - ) + if not isinstance(k, str): + raise ValueError( + f"client resources keys are expected to be `str` but you used " + f"{type(k)} for `{k}`" + ) + if not isinstance(v, valid_types): + raise ValueError( + f"client resources are expected to be of type {valid_types} " + f"but found `{type(v)}` for key `{k}`", + ) client_resources[k] = v else: diff --git a/src/py/flwr/simulation/ray_transport/ray_actor.py b/src/py/flwr/simulation/ray_transport/ray_actor.py index b48e448b681..e899ce28261 100644 --- a/src/py/flwr/simulation/ray_transport/ray_actor.py +++ b/src/py/flwr/simulation/ray_transport/ray_actor.py @@ -434,7 +434,7 @@ def __init__( self.client_resources = client_resources # Queue of idle actors - self.pool: asyncio.Queue[Type[VirtualClientEngineActor]] = asyncio.Queue() + self.pool: "asyncio.Queue[Type[VirtualClientEngineActor]]" = asyncio.Queue() self.num_actors = 0 # Resolve arguments to pass during actor init From b16d0b81d8b74e4953bed446befdacf4c7d40950 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Sat, 24 Feb 2024 09:49:30 +0000 Subject: [PATCH 030/103] init; need resolve circular imports --- pyproject.toml | 1 + src/py/flwr/simulation/__init__.py | 3 + src/py/flwr/simulation/run_simulation.py | 122 +++++++++++++++++++++++ 3 files changed, 126 insertions(+) create mode 100644 src/py/flwr/simulation/run_simulation.py diff --git a/pyproject.toml 
b/pyproject.toml index 6bd5c74f29a..743670c6419 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,7 @@ flower-fleet-api = "flwr.server:run_fleet_api" flower-superlink = "flwr.server:run_superlink" flower-client-app = "flwr.client:run_client_app" flower-server-app = "flwr.server:run_server_app" +flower-simulation = "flwr.simulation:run_simulation" [tool.poetry.dependencies] python = "^3.8" diff --git a/src/py/flwr/simulation/__init__.py b/src/py/flwr/simulation/__init__.py index 724ea927391..b283de70c58 100644 --- a/src/py/flwr/simulation/__init__.py +++ b/src/py/flwr/simulation/__init__.py @@ -17,6 +17,8 @@ import importlib +from flwr.simulation.run_simulation import run_simulation + is_ray_installed = importlib.util.find_spec("ray") is not None if is_ray_installed: @@ -36,4 +38,5 @@ def start_simulation(*args, **kwargs): # type: ignore __all__ = [ "start_simulation", + "run_simulation", ] diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py new file mode 100644 index 00000000000..70e44b61211 --- /dev/null +++ b/src/py/flwr/simulation/run_simulation.py @@ -0,0 +1,122 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Flower Simulation.""" + +import argparse +import threading + +import grpc + +from flwr.common import EventType, event +from flwr.server.driver.driver import Driver +from flwr.server.run_serverapp import run +from flwr.server.superlink.state import StateFactory + + +def run_simulation() -> None: + """.""" + # TODO: below create circular imports + from flwr.server.app import _register_exit_handlers, _run_driver_api_grpc + from flwr.server.superlink.fleet.vce import start_vce + + args = _parse_args_run_simulation().parse_args() + + # Initialize StateFactory + state_factory = StateFactory(":flwr-in-memory-state:") + + # Start Driver API + driver_server: grpc.Server = _run_driver_api_grpc( + address="0.0.0.0:9091", + state_factory=state_factory, + certificates=None, + ) + + # Superlink with Simulation Engine + superlink_th = threading.Thread( + target=start_vce, + args=( + args.num_supernodes, + args.client_app, + args.backend, + args.backend_config, + state_factory, + args.dir, + ), + daemon=False, + ) + + event(EventType.RUN_SUPERLINK_ENTER) + superlink_th.start() + + # Initialize Driver + driver = Driver( + driver_service_address="0.0.0.0:9091", + root_certificates=None, + ) + + # Launch server app + run(args.server_app, driver, args.dir) + + _register_exit_handlers( + grpc_servers=[driver_server], + bckg_threads=[superlink_th], + event_type=EventType.RUN_SUPERLINK_LEAVE, + ) + + +def _parse_args_run_simulation() -> argparse.ArgumentParser: + """Parse flower-simulation command line arguments.""" + parser = argparse.ArgumentParser( + description="Start a Flower Simulation", + ) + parser.add_argument( + "--client-app", + required=True, + help="For example: `client:app` or `project.package.module:wrapper.app`", + ) + parser.add_argument( + "--server-app", + required=True, + help="For example: `server:app` or `project.package.module:wrapper.app`", + ) + parser.add_argument( + 
"--num-supernodes", + type=int, + required=True, + help="Number of simulated SuperNodes.", + ) + parser.add_argument( + "--backend", + default="ray", + type=str, + help="Simulation backend that executes the ClientApp.", + ) + parser.add_argument( + "--backend-config", + type=str, + default='{"client_resources": {"num_cpus":1, "num_gpus":0.0}, "tensorflow": 0}', + help='A JSON formatted stream, e.g \'{"":, "":}\' to ' + "configure a backend. Values supported in are those included by " + "`flwr.common.typing.ConfigsRecordValues`. ", + ) + parser.add_argument( + "--dir", + default="", + help="Add specified directory to the PYTHONPATH and load" + "ClientApp and ServerApp from there." + " Default: current working directory.", + ) + + return parser From 93918db466054926bb96eb936fe3aa0b4ad16321 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Sun, 25 Feb 2024 22:25:16 +0000 Subject: [PATCH 031/103] gracefully shutdown --- .../superlink/fleet/vce/backend/backend.py | 4 +++ .../superlink/fleet/vce/backend/raybackend.py | 4 +++ .../server/superlink/fleet/vce/vce_api.py | 31 +++++++++++++++++-- .../simulation/ray_transport/ray_actor.py | 18 +++++++++-- src/py/flwr/simulation/run_simulation.py | 11 ++++++- 5 files changed, 61 insertions(+), 7 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index 2df4be76e7a..f2796a5758a 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -53,6 +53,10 @@ def num_workers(self) -> int: def is_worker_idle(self) -> bool: """Report whether a backend worker is idle and can therefore run a ClientApp.""" + @abstractmethod + async def terminate(self) -> None: + """Terminate backend.""" + @abstractmethod async def process_message( self, diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 
8a33d07404b..cc3cf434849 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -154,3 +154,7 @@ async def process_message( ) = await self.pool.fetch_result_and_return_actor_to_pool(future) return out_mssg, updated_context + + async def terminate(self) -> None: + """Terminate all actors in actor pool.""" + await self.pool.terminate_all_actors() diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 679742de4b1..2c2cbaca018 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -91,6 +91,10 @@ async def worker( # Store TaskRes in state state.store_task_res(task_res) + except asyncio.CancelledError as e: + log(DEBUG, f"Async worker: {e}") + break + except Exception as ex: # pylint: disable=broad-exception-caught # pylint: disable=fixme # TODO: gen TaskRes with relevant error, add it to state_factory @@ -103,10 +107,11 @@ async def generate_pull_requests( queue: TaskInsQueue, state_factory: StateFactory, nodes_mapping: NodeToPartitionMapping, + f_stop: asyncio.Event, ) -> None: """Generate TaskIns and add it to the queue.""" state = state_factory.state() - while True: + while not (f_stop.is_set()): for node_id in nodes_mapping.keys(): task_ins = state.get_task_ins(node_id=node_id, limit=1) if task_ins: @@ -114,6 +119,7 @@ async def generate_pull_requests( log(DEBUG, "TaskIns in queue: %i", queue.qsize()) # pylint: disable=fixme await asyncio.sleep(1.0) # TODO: revisit + log(DEBUG, "Async producer: Stopped pulling from StateFactory.") async def run( @@ -122,6 +128,7 @@ async def run( nodes_mapping: NodeToPartitionMapping, state_factory: StateFactory, node_states: Dict[int, NodeState], + f_stop: asyncio.Event, ) -> None: """Run the VCE async.""" # pylint: disable=fixme @@ -135,10 +142,26 @@ async def run( ) for _ in range(backend.num_workers) ] - 
asyncio.create_task(generate_pull_requests(queue, state_factory, nodes_mapping)) - await queue.join() + producer = asyncio.create_task( + generate_pull_requests(queue, state_factory, nodes_mapping, f_stop) + ) + + await asyncio.gather(producer) + + # Produced task terminated, now cancel worker tasks + for w_t in worker_tasks: + _ = w_t.cancel("Terminate on Simulation Engine shutdown.") + + # print('requested cancel') + while not all(w_t.done() for w_t in worker_tasks): + log(DEBUG, "Terminating async workers...") + await asyncio.sleep(0.5) + await asyncio.gather(*worker_tasks) + # Terminate backend + await backend.terminate() + # pylint: disable=too-many-arguments,unused-argument def start_vce( @@ -148,6 +171,7 @@ def start_vce( backend_config_json_stream: str, state_factory: StateFactory, working_dir: str, + f_stop: asyncio.Event, ) -> None: """Start Fleet API with the VirtualClientEngine (VCE).""" # Register SuperNodes @@ -195,5 +219,6 @@ def _load() -> ClientApp: nodes_mapping, state_factory, node_states, + f_stop, ) ) diff --git a/src/py/flwr/simulation/ray_transport/ray_actor.py b/src/py/flwr/simulation/ray_transport/ray_actor.py index e899ce28261..5ac0b2c2748 100644 --- a/src/py/flwr/simulation/ray_transport/ray_actor.py +++ b/src/py/flwr/simulation/ray_transport/ray_actor.py @@ -18,7 +18,7 @@ import threading import traceback from abc import ABC -from logging import ERROR, WARNING +from logging import DEBUG, ERROR, WARNING from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union import ray @@ -46,7 +46,7 @@ class VirtualClientEngineActor(ABC): def terminate(self) -> None: """Manually terminate Actor object.""" - log(WARNING, "Manually terminating %s}", self.__class__.__name__) + log(WARNING, "Manually terminating %s", self.__class__.__name__) ray.actor.exit_actor() def run( @@ -434,7 +434,9 @@ def __init__( self.client_resources = client_resources # Queue of idle actors - self.pool: "asyncio.Queue[Type[VirtualClientEngineActor]]" = 
asyncio.Queue() + self.pool: "asyncio.Queue[Type[VirtualClientEngineActor]]" = asyncio.Queue( + maxsize=1024 + ) self.num_actors = 0 # Resolve arguments to pass during actor init @@ -464,6 +466,16 @@ async def add_actors_to_pool(self, num_actors: int) -> None: await self.pool.put(self.create_actor_fn()) # type: ignore self.num_actors += num_actors + async def terminate_all_actors(self) -> None: + """Terminate actors in pool.""" + num_terminated = 0 + while self.pool.qsize(): + actor = await self.pool.get() + actor.terminate.remote() # type: ignore + num_terminated += 1 + + log(DEBUG, "Terminated %i actors", num_terminated) + async def submit( self, actor_fn: Any, job: Tuple[ClientAppFn, Message, str, Context] ) -> Any: diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 70e44b61211..e15807adeb3 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -15,6 +15,7 @@ """Flower Simulation.""" import argparse +import asyncio import threading import grpc @@ -44,6 +45,7 @@ def run_simulation() -> None: ) # Superlink with Simulation Engine + f_stop = asyncio.Event() superlink_th = threading.Thread( target=start_vce, args=( @@ -53,6 +55,7 @@ def run_simulation() -> None: args.backend_config, state_factory, args.dir, + f_stop, ), daemon=False, ) @@ -69,11 +72,17 @@ def run_simulation() -> None: # Launch server app run(args.server_app, driver, args.dir) + del driver + + # Trigger stop event + f_stop.set() + _register_exit_handlers( grpc_servers=[driver_server], bckg_threads=[superlink_th], event_type=EventType.RUN_SUPERLINK_LEAVE, ) + superlink_th.join() def _parse_args_run_simulation() -> argparse.ArgumentParser: @@ -106,7 +115,7 @@ def _parse_args_run_simulation() -> argparse.ArgumentParser: parser.add_argument( "--backend-config", type=str, - default='{"client_resources": {"num_cpus":1, "num_gpus":0.0}, "tensorflow": 0}', + default='{"client_resources": {"num_cpus":2, 
"num_gpus":0.0}, "tensorflow": 0}', help='A JSON formatted stream, e.g \'{"":, "":}\' to ' "configure a backend. Values supported in are those included by " "`flwr.common.typing.ConfigsRecordValues`. ", From 0e4ab143ac76c5340404ae22162e5d1f3c79408f Mon Sep 17 00:00:00 2001 From: jafermarq Date: Sun, 25 Feb 2024 22:31:49 +0000 Subject: [PATCH 032/103] terminate method for backend; asyncio event to trigger stop --- .../superlink/fleet/vce/backend/backend.py | 4 ++++ .../superlink/fleet/vce/backend/raybackend.py | 9 ++++++++- .../flwr/server/superlink/fleet/vce/vce_api.py | 4 +++- .../flwr/simulation/ray_transport/ray_actor.py | 18 +++++++++++++++--- 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py index 2df4be76e7a..f2796a5758a 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/backend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/backend.py @@ -53,6 +53,10 @@ def num_workers(self) -> int: def is_worker_idle(self) -> bool: """Report whether a backend worker is idle and can therefore run a ClientApp.""" + @abstractmethod + async def terminate(self) -> None: + """Terminate backend.""" + @abstractmethod async def process_message( self, diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 24620aab083..cc3cf434849 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -28,6 +28,7 @@ ClientAppActor, init_ray, ) +from flwr.simulation.ray_transport.utils import enable_tf_gpu_growth from .backend import Backend, BackendConfig @@ -56,7 +57,9 @@ def __init__( self.client_resources_key = "client_resources" # Create actor pool - actor_kwargs = backend_config.get("actor_kwargs", {}) + use_tf = backend_config.get("tensorflow", False) + actor_kwargs = 
{"on_actor_init_fn": enable_tf_gpu_growth} if use_tf else {} + client_resources = self._validate_client_resources(config=backend_config) self.pool = BasicActorPool( actor_type=ClientAppActor, @@ -151,3 +154,7 @@ async def process_message( ) = await self.pool.fetch_result_and_return_actor_to_pool(future) return out_mssg, updated_context + + async def terminate(self) -> None: + """Terminate all actors in actor pool.""" + await self.pool.terminate_all_actors() diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index c91bae9ddab..666e7e7d9ec 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -14,9 +14,10 @@ # ============================================================================== """Fleet VirtualClientEngine API.""" +import asyncio import json from logging import ERROR, INFO -from typing import Dict +from typing import Dict, Optional from flwr.client.clientapp import ClientApp, load_client_app from flwr.client.node_state import NodeState @@ -49,6 +50,7 @@ def start_vce( backend_config_json_stream: str, state_factory: StateFactory, working_dir: str, + f_stop: Optional[asyncio.Event] = None, ) -> None: """Start Fleet API with the VirtualClientEngine (VCE).""" # Register SuperNodes diff --git a/src/py/flwr/simulation/ray_transport/ray_actor.py b/src/py/flwr/simulation/ray_transport/ray_actor.py index e899ce28261..5ac0b2c2748 100644 --- a/src/py/flwr/simulation/ray_transport/ray_actor.py +++ b/src/py/flwr/simulation/ray_transport/ray_actor.py @@ -18,7 +18,7 @@ import threading import traceback from abc import ABC -from logging import ERROR, WARNING +from logging import DEBUG, ERROR, WARNING from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union import ray @@ -46,7 +46,7 @@ class VirtualClientEngineActor(ABC): def terminate(self) -> None: """Manually terminate Actor object.""" - log(WARNING, "Manually 
terminating %s}", self.__class__.__name__) + log(WARNING, "Manually terminating %s", self.__class__.__name__) ray.actor.exit_actor() def run( @@ -434,7 +434,9 @@ def __init__( self.client_resources = client_resources # Queue of idle actors - self.pool: "asyncio.Queue[Type[VirtualClientEngineActor]]" = asyncio.Queue() + self.pool: "asyncio.Queue[Type[VirtualClientEngineActor]]" = asyncio.Queue( + maxsize=1024 + ) self.num_actors = 0 # Resolve arguments to pass during actor init @@ -464,6 +466,16 @@ async def add_actors_to_pool(self, num_actors: int) -> None: await self.pool.put(self.create_actor_fn()) # type: ignore self.num_actors += num_actors + async def terminate_all_actors(self) -> None: + """Terminate actors in pool.""" + num_terminated = 0 + while self.pool.qsize(): + actor = await self.pool.get() + actor.terminate.remote() # type: ignore + num_terminated += 1 + + log(DEBUG, "Terminated %i actors", num_terminated) + async def submit( self, actor_fn: Any, job: Tuple[ClientAppFn, Message, str, Context] ) -> Any: From 21e9932e89965be7c9958b46ddcce7632cf7311a Mon Sep 17 00:00:00 2001 From: jafermarq Date: Sun, 25 Feb 2024 22:56:14 +0000 Subject: [PATCH 033/103] propagate terminate asyncio logic --- src/py/flwr/server/app.py | 6 ++- .../server/superlink/fleet/vce/vce_api.py | 38 +++++++++++++++---- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index ac7a8339b31..eecd80fcf17 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -14,8 +14,8 @@ # ============================================================================== """Flower server app.""" - import argparse +import asyncio import importlib.util import sys import threading @@ -363,6 +363,7 @@ def run_superlink() -> None: ) grpc_servers.append(fleet_server) elif args.fleet_api_type == TRANSPORT_TYPE_VCE: + f_stop = asyncio.Event() # Does nothing _run_fleet_api_vce( num_supernodes=args.num_supernodes, 
client_app_module_name=args.client_app, @@ -370,6 +371,7 @@ def run_superlink() -> None: backend_config_json_stream=args.backend_config, working_dir=args.dir, state_factory=state_factory, + f_stop=f_stop, ) else: raise ValueError(f"Unknown fleet_api_type: {args.fleet_api_type}") @@ -515,6 +517,7 @@ def _run_fleet_api_vce( backend_config_json_stream: str, working_dir: str, state_factory: StateFactory, + f_stop: asyncio.Event, ) -> None: log(INFO, "Flower VCE: Starting Fleet API (VirtualClientEngine)") @@ -525,6 +528,7 @@ def _run_fleet_api_vce( backend_config_json_stream=backend_config_json_stream, state_factory=state_factory, working_dir=working_dir, + f_stop=f_stop, ) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 56e5e5b75a0..6312ab17359 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -19,7 +19,7 @@ import json import traceback from logging import DEBUG, ERROR, INFO -from typing import Callable, Dict, Optional +from typing import Callable, Dict from flwr.client.clientapp import ClientApp, load_client_app from flwr.client.node_state import NodeState @@ -47,7 +47,7 @@ def _register_nodes( return nodes_mapping -# pylint: disable=too-many-arguments +# pylint: disable=too-many-arguments,too-many-locals async def worker( app: Callable[[], ClientApp], queue: TaskInsQueue, @@ -91,6 +91,10 @@ async def worker( # Store TaskRes in state state.store_task_res(task_res) + except asyncio.CancelledError as e: + log(DEBUG, "Async worker: %s", e) + break + except Exception as ex: # pylint: disable=broad-exception-caught # pylint: disable=fixme # TODO: gen TaskRes with relevant error, add it to state_factory @@ -103,10 +107,11 @@ async def generate_pull_requests( queue: TaskInsQueue, state_factory: StateFactory, nodes_mapping: NodeToPartitionMapping, + f_stop: asyncio.Event, ) -> None: """Generate TaskIns and add it to the queue.""" 
state = state_factory.state() - while True: + while not f_stop.is_set(): for node_id in nodes_mapping.keys(): task_ins = state.get_task_ins(node_id=node_id, limit=1) if task_ins: @@ -114,6 +119,7 @@ async def generate_pull_requests( log(DEBUG, "TaskIns in queue: %i", queue.qsize()) # pylint: disable=fixme await asyncio.sleep(1.0) # TODO: revisit + log(DEBUG, "Async producer: Stopped pulling from StateFactory.") async def run( @@ -122,6 +128,7 @@ async def run( nodes_mapping: NodeToPartitionMapping, state_factory: StateFactory, node_states: Dict[int, NodeState], + f_stop: asyncio.Event, ) -> None: """Run the VCE async.""" # pylint: disable=fixme @@ -135,12 +142,28 @@ async def run( ) for _ in range(backend.num_workers) ] - asyncio.create_task(generate_pull_requests(queue, state_factory, nodes_mapping)) - await queue.join() + producer = asyncio.create_task( + generate_pull_requests(queue, state_factory, nodes_mapping, f_stop) + ) + + await asyncio.gather(producer) + + # Produced task terminated, now cancel worker tasks + for w_t in worker_tasks: + _ = w_t.cancel("Terminate on Simulation Engine shutdown.") + + # print('requested cancel') + while not all(w_t.done() for w_t in worker_tasks): + log(DEBUG, "Terminating async workers...") + await asyncio.sleep(0.5) + await asyncio.gather(*worker_tasks) + # Terminate backend + await backend.terminate() + -# pylint: disable=too-many-arguments,unused-argument +# pylint: disable=too-many-arguments,unused-argument,too-many-locals def start_vce( num_supernodes: int, client_app_module_name: str, @@ -148,7 +171,7 @@ def start_vce( backend_config_json_stream: str, state_factory: StateFactory, working_dir: str, - f_stop: Optional[asyncio.Event] = None, + f_stop: asyncio.Event, ) -> None: """Start Fleet API with the VirtualClientEngine (VCE).""" # Register SuperNodes @@ -196,5 +219,6 @@ def _load() -> ClientApp: nodes_mapping, state_factory, node_states, + f_stop, ) ) From f8b57c561024f9edf98bcba9efac42009cdc2f8d Mon Sep 17 00:00:00 
2001 From: jafermarq Date: Mon, 26 Feb 2024 11:50:34 +0000 Subject: [PATCH 034/103] added build/process/terminate tests --- .../fleet/vce/backend/raybackend_test.py | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py new file mode 100644 index 00000000000..441329d159e --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -0,0 +1,140 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Test for Ray backend for the Fleet API using the Simulation Engine.""" + +import asyncio +from math import pi +from typing import Callable, Dict, Optional, Tuple + +import ray + +from flwr.client import Client, NumPyClient +from flwr.client.clientapp import ClientApp +from flwr.common import ( + Config, + ConfigsRecord, + Context, + GetPropertiesIns, + Message, + Metadata, + RecordSet, + Scalar, +) +from flwr.common.constant import MESSAGE_TYPE_GET_PROPERTIES +from flwr.common.recordset_compat import getpropertiesins_to_recordset + +from .raybackend import RayBackend + + +class DummyClient(NumPyClient): + """A dummy NumPyClient for tests.""" + + def get_properties(self, config: Config) -> Dict[str, Scalar]: + """Return properties by doing a simple calculation.""" + result = float(config["factor"]) * pi + + # store something in context + self.context.state.configs_records["result"] = ConfigsRecord({"result": result}) + return {"result": result} + + +def get_dummy_client(cid: str) -> Client: # pylint: disable=unused-argument + """Return a DummyClient converted to Client type.""" + return DummyClient().to_client() + + +def _load_app() -> ClientApp: + return ClientApp(client_fn=get_dummy_client) + + +async def backend_build_process_and_termination( + backend: RayBackend, + process_args: Optional[Tuple[Callable[[], ClientApp], Message, Context]] = None, +) -> Tuple[Message, Context] | None: + """Build, process job and terminate RayBackend.""" + await backend.build() + to_return = None + + if process_args: + to_return = await backend.process_message(*process_args) + + await backend.terminate() + + ray.shutdown() + + return to_return + + +def test_backend_creation_and_termination() -> None: + """Test creation of RayBackend and its termination.""" + backend = RayBackend(backend_config={}, work_dir="") + asyncio.run( + backend_build_process_and_termination(backend=backend, 
process_args=None) + ) + + +def test_backend_creation_submit_and_termination() -> None: + """Test submit.""" + backend = RayBackend(backend_config={}, work_dir="") + + # Define ClientApp + client_app_callable = _load_app + + # Construct a Message + mult_factor = 2024 + getproperties_ins = GetPropertiesIns(config={"factor": mult_factor}) + recordset = getpropertiesins_to_recordset(getproperties_ins) + message = Message( + content=recordset, + metadata=Metadata( + run_id=0, + message_id="", + group_id="", + src_node_id=0, + dst_node_id=0, + reply_to_message="", + ttl="", + message_type=MESSAGE_TYPE_GET_PROPERTIES, + ), + ) + + # Construct emtpy Context + context = Context(state=RecordSet()) + + res = asyncio.run( + backend_build_process_and_termination( + backend=backend, process_args=(client_app_callable, message, context) + ) + ) + + if res is None: + raise AssertionError("This shouldn't happen") + + out_mssg, updated_context = res + + # Verify message content is as expected + content = out_mssg.content + assert ( + content.configs_records["getpropertiesres.properties"]["result"] + == pi * mult_factor + ) + + # Verify context is correct + obtained_result_in_context = updated_context.state.configs_records["result"][ + "result" + ] + assert obtained_result_in_context == pi * mult_factor + + From 39e3234884ca36fde8a541256e6f1737cbf2dfd1 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 11:55:34 +0000 Subject: [PATCH 035/103] format --- .../flwr/server/superlink/fleet/vce/backend/raybackend_test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index 441329d159e..d31fe6c3416 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -136,5 +136,3 @@ def test_backend_creation_submit_and_termination() -> None: "result" ] assert 
obtained_result_in_context == pi * mult_factor - - From 8ea4b08100fb6e7ac0a72b40cab83b994fd44cf4 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 11:58:58 +0000 Subject: [PATCH 036/103] fix for py3.8 --- .../server/superlink/fleet/vce/backend/raybackend_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index d31fe6c3416..bb33491db90 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -16,7 +16,7 @@ import asyncio from math import pi -from typing import Callable, Dict, Optional, Tuple +from typing import Callable, Dict, Optional, Tuple, Union import ray @@ -62,7 +62,7 @@ def _load_app() -> ClientApp: async def backend_build_process_and_termination( backend: RayBackend, process_args: Optional[Tuple[Callable[[], ClientApp], Message, Context]] = None, -) -> Tuple[Message, Context] | None: +) -> Union[Tuple[Message, Context], None]: """Build, process job and terminate RayBackend.""" await backend.build() to_return = None From 35c55d41eabdd20915a47c92b74a5cc2926b6248 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 12:48:44 +0000 Subject: [PATCH 037/103] fix py3.11 --- src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index cc3cf434849..1864e48fe16 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -145,7 +145,7 @@ async def process_message( (app, message, str(node_id), context), ) - await asyncio.wait([future]) + await future # Fetch result ( From 49bc661c1d764b9402dd0713eabf33ffea0738d6 Mon 
Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 12:51:17 +0000 Subject: [PATCH 038/103] fix import --- src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 1864e48fe16..b29d76b239e 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -14,7 +14,6 @@ # ============================================================================== """Ray backend for the Fleet API using the Simulation Engine.""" -import asyncio import pathlib from logging import INFO from typing import Callable, Dict, List, Tuple, Union From 4506a1706f624821000976b639eb56d2779d0c73 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 13:11:22 +0000 Subject: [PATCH 039/103] wrapped asyncio test under `IsolatedAsyncioTestCase` class --- .../fleet/vce/backend/raybackend_test.py | 111 +++++++++--------- 1 file changed, 57 insertions(+), 54 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index bb33491db90..f0cca527ab9 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -17,6 +17,7 @@ import asyncio from math import pi from typing import Callable, Dict, Optional, Tuple, Union +from unittest import IsolatedAsyncioTestCase import ray @@ -77,62 +78,64 @@ async def backend_build_process_and_termination( return to_return -def test_backend_creation_and_termination() -> None: - """Test creation of RayBackend and its termination.""" - backend = RayBackend(backend_config={}, work_dir="") - asyncio.run( - backend_build_process_and_termination(backend=backend, process_args=None) - ) - - -def 
test_backend_creation_submit_and_termination() -> None: - """Test submit.""" - backend = RayBackend(backend_config={}, work_dir="") - - # Define ClientApp - client_app_callable = _load_app - - # Construct a Message - mult_factor = 2024 - getproperties_ins = GetPropertiesIns(config={"factor": mult_factor}) - recordset = getpropertiesins_to_recordset(getproperties_ins) - message = Message( - content=recordset, - metadata=Metadata( - run_id=0, - message_id="", - group_id="", - src_node_id=0, - dst_node_id=0, - reply_to_message="", - ttl="", - message_type=MESSAGE_TYPE_GET_PROPERTIES, - ), - ) - - # Construct emtpy Context - context = Context(state=RecordSet()) - - res = asyncio.run( - backend_build_process_and_termination( - backend=backend, process_args=(client_app_callable, message, context) +class AsyncTestRayBackend(IsolatedAsyncioTestCase): + """A basic class that allows runnig multliple asyncio tests.""" + + def test_backend_creation_and_termination(self) -> None: + """Test creation of RayBackend and its termination.""" + backend = RayBackend(backend_config={}, work_dir="") + asyncio.run( + backend_build_process_and_termination(backend=backend, process_args=None) + ) + + def test_backend_creation_submit_and_termination(self) -> None: + """Test submit.""" + backend = RayBackend(backend_config={}, work_dir="") + + # Define ClientApp + client_app_callable = _load_app + + # Construct a Message + mult_factor = 2024 + getproperties_ins = GetPropertiesIns(config={"factor": mult_factor}) + recordset = getpropertiesins_to_recordset(getproperties_ins) + message = Message( + content=recordset, + metadata=Metadata( + run_id=0, + message_id="", + group_id="", + src_node_id=0, + dst_node_id=0, + reply_to_message="", + ttl="", + message_type=MESSAGE_TYPE_GET_PROPERTIES, + ), ) - ) - if res is None: - raise AssertionError("This shouldn't happen") + # Construct emtpy Context + context = Context(state=RecordSet()) - out_mssg, updated_context = res + res = asyncio.run( + 
backend_build_process_and_termination( + backend=backend, process_args=(client_app_callable, message, context) + ) + ) - # Verify message content is as expected - content = out_mssg.content - assert ( - content.configs_records["getpropertiesres.properties"]["result"] - == pi * mult_factor - ) + if res is None: + raise AssertionError("This shouldn't happen") + + out_mssg, updated_context = res + + # Verify message content is as expected + content = out_mssg.content + assert ( + content.configs_records["getpropertiesres.properties"]["result"] + == pi * mult_factor + ) - # Verify context is correct - obtained_result_in_context = updated_context.state.configs_records["result"][ - "result" - ] - assert obtained_result_in_context == pi * mult_factor + # Verify context is correct + obtained_result_in_context = updated_context.state.configs_records["result"][ + "result" + ] + assert obtained_result_in_context == pi * mult_factor From ed5b181361b6682b8a8c7f912ec07cbdf1462419 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 14:18:53 +0000 Subject: [PATCH 040/103] start/shutdown tests --- .../server/superlink/fleet/vce/vce_api.py | 2 +- .../superlink/fleet/vce/vce_api_test.py | 59 +++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 src/py/flwr/server/superlink/fleet/vce/vce_api_test.py diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 3365e8d9471..7dc86dac01a 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Fleet VirtualClientEngine API.""" +"""Fleet Simulation Engine API.""" import asyncio diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py new file mode 100644 index 00000000000..987f8ce27c1 --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -0,0 +1,59 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Test Fleet Simulation Engine API.""" + +import asyncio +import threading +from time import sleep +from unittest import IsolatedAsyncioTestCase + +from flwr.server.superlink.state import StateFactory + +from . 
import start_vce + + +class AsyncTestFleetSimulationEngine(IsolatedAsyncioTestCase): + """A basic class to test Fleet Simulation Enginge funcionality.""" + + def test_start_and_shutdown(self) -> None: + """Start Simulation Engine Fleet and terminate it.""" + f_stop = asyncio.Event() + + # Initialize StateFactory + state_factory = StateFactory(":flwr-in-memory-state:") + + superlink_th = threading.Thread( + target=start_vce, + args=( + 50, + "", + "ray", + "{}", # an empty json stream (represents an empty config) + state_factory, + "", + f_stop, + ), + daemon=False, + ) + + superlink_th.start() + + # Sleep for some time + sleep(10) + + # Trigger stop event + f_stop.set() + + superlink_th.join() From 2c05cdd066a1dd7209e197eb4e2f3c5210b71205 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 17:49:55 +0000 Subject: [PATCH 041/103] full loop tests; tweaks --- .../superlink/fleet/vce/backend/raybackend.py | 4 + .../fleet/vce/backend/raybackend_test.py | 4 - .../server/superlink/fleet/vce/vce_api.py | 27 ++- .../superlink/fleet/vce/vce_api_test.py | 188 +++++++++++++++--- 4 files changed, 183 insertions(+), 40 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index b29d76b239e..5c81501d62d 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -18,6 +18,8 @@ from logging import INFO from typing import Callable, Dict, List, Tuple, Union +import ray + from flwr.client.clientapp import ClientApp from flwr.common.context import Context from flwr.common.logger import log @@ -157,3 +159,5 @@ async def process_message( async def terminate(self) -> None: """Terminate all actors in actor pool.""" await self.pool.terminate_all_actors() + ray.shutdown() + log(INFO, "Terminated %s", self.__class__.__name__) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py 
b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index f0cca527ab9..bef0d8ec7e5 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -19,8 +19,6 @@ from typing import Callable, Dict, Optional, Tuple, Union from unittest import IsolatedAsyncioTestCase -import ray - from flwr.client import Client, NumPyClient from flwr.client.clientapp import ClientApp from flwr.common import ( @@ -73,8 +71,6 @@ async def backend_build_process_and_termination( await backend.terminate() - ray.shutdown() - return to_return diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 7dc86dac01a..881765213da 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -19,7 +19,7 @@ import json import traceback from logging import DEBUG, ERROR, INFO -from typing import Callable, Dict +from typing import Callable, Dict, Optional from flwr.client.clientapp import ClientApp, load_client_app from flwr.client.node_state import NodeState @@ -30,7 +30,6 @@ from .backend import Backend, error_messages_backends, supported_backends -TaskInsQueue = asyncio.Queue[TaskIns] NodeToPartitionMapping = Dict[int, int] @@ -50,7 +49,7 @@ def _register_nodes( # pylint: disable=too-many-arguments,too-many-locals async def worker( app: Callable[[], ClientApp], - queue: TaskInsQueue, + queue: "asyncio.Queue[TaskIns]", node_states: Dict[int, NodeState], state_factory: StateFactory, nodes_mapping: NodeToPartitionMapping, @@ -60,7 +59,7 @@ async def worker( state = state_factory.state() while True: try: - task_ins = await queue.get() + task_ins: TaskIns = await queue.get() node_id = task_ins.task.consumer.node_id # Register and retrive runstate @@ -104,7 +103,7 @@ async def worker( async def generate_pull_requests( - queue: TaskInsQueue, + queue: "asyncio.Queue[TaskIns]", 
state_factory: StateFactory, nodes_mapping: NodeToPartitionMapping, f_stop: asyncio.Event, @@ -132,7 +131,7 @@ async def run( ) -> None: """Run the VCE async.""" # pylint: disable=fixme - queue: TaskInsQueue = asyncio.Queue(128) + queue: "asyncio.Queue[TaskIns]" = asyncio.Queue(128) # Build backend await backend.build() @@ -150,7 +149,7 @@ async def run( # Produced task terminated, now cancel worker tasks for w_t in worker_tasks: - _ = w_t.cancel("Terminate on Simulation Engine shutdown.") + _ = w_t.cancel() # print('requested cancel') while not all(w_t.done() for w_t in worker_tasks): @@ -172,12 +171,18 @@ def start_vce( state_factory: StateFactory, working_dir: str, f_stop: asyncio.Event, + existing_nodes_mapping: Optional[NodeToPartitionMapping] = None, ) -> None: """Start Fleet API with the VirtualClientEngine (VCE).""" - # Register SuperNodes - nodes_mapping = _register_nodes( - num_nodes=num_supernodes, state_factory=state_factory - ) + if existing_nodes_mapping: + # Use mapping constructed externally. This also means nodes + # have previously being registered. + nodes_mapping = existing_nodes_mapping + else: + # Register SuperNodes + nodes_mapping = _register_nodes( + num_nodes=num_supernodes, state_factory=state_factory + ) # Construct mapping of NodeStates node_states: Dict[int, NodeState] = {} diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py index 987f8ce27c1..6abdd046f81 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -13,47 +13,185 @@ # limitations under the License. 
# ============================================================================== """Test Fleet Simulation Engine API.""" - import asyncio import threading +from itertools import cycle +from math import pi from time import sleep +from typing import Dict, Optional, Set from unittest import IsolatedAsyncioTestCase +from uuid import UUID + +from flwr.client import Client, NumPyClient +from flwr.client.clientapp import ClientApp +from flwr.common import ( + Config, + ConfigsRecord, + GetPropertiesIns, + Message, + Metadata, + Scalar, +) +from flwr.common.constant import MESSAGE_TYPE_GET_PROPERTIES +from flwr.common.recordset_compat import getpropertiesins_to_recordset +from flwr.common.serde import message_from_taskres, message_to_taskins +from flwr.server.superlink.fleet.vce.vce_api import ( + NodeToPartitionMapping, + _register_nodes, + start_vce, +) +from flwr.server.superlink.state import InMemoryState, StateFactory + + +class DummyClient(NumPyClient): + """A dummy NumPyClient for tests.""" + + def get_properties(self, config: Config) -> Dict[str, Scalar]: + """Return properties by doing a simple calculation.""" + result = float(config["factor"]) * pi + + # store something in context + self.context.state.configs_records["result"] = ConfigsRecord({"result": result}) + return {"result": result} + + +def get_dummy_client(cid: str) -> Client: # pylint: disable=unused-argument + """Return a DummyClient converted to Client type.""" + return DummyClient().to_client() + + +client_app = ClientApp( + client_fn=get_dummy_client, +) + -from flwr.server.superlink.state import StateFactory +def terminate_simulation(f_stop: asyncio.Event, sleep_duration: int) -> None: + """Set event to terminate Simulation Engine after `sleep_duration` seconds.""" + sleep(sleep_duration) + f_stop.set() -from . 
import start_vce +def start_and_shutdown( + existing_state_factory: Optional[StateFactory] = None, + nodes_mapping: Optional[NodeToPartitionMapping] = None, + duration: int = 10, +) -> None: + """Start Simulation Engine and terminate after specified number of seconds.""" + f_stop = asyncio.Event() + + # Initialize StateFactory + if nodes_mapping: + if existing_state_factory is None: + raise ValueError( + "If you specify a node mapping, you must pass a StateFactory." + ) + state_factory = existing_state_factory + else: + state_factory = StateFactory(":flwr-in-memory-state:") + + # Setup thread that will set the f_stop event, triggering the termination of all + # asyncio logic in the Simulation Engine. It will also terminate the Backend. + termination_th = threading.Thread( + target=terminate_simulation, args=(f_stop, duration) + ) + termination_th.start() + + start_vce( + num_supernodes=50, + client_app_module_name="vce_api_test:client_app", + backend_name="ray", + backend_config_json_stream="{}", # an empty json stream (an empty config) + state_factory=state_factory, + working_dir="", + f_stop=f_stop, + existing_nodes_mapping=nodes_mapping, + ) + + # Trigger stop event + f_stop.set() + + termination_th.join() -class AsyncTestFleetSimulationEngine(IsolatedAsyncioTestCase): - """A basic class to test Fleet Simulation Enginge funcionality.""" + +class AsyncTestFleetSimulationEngineRayBackend(IsolatedAsyncioTestCase): + """A basic class that enables testing asyncio functionalities.""" def test_start_and_shutdown(self) -> None: """Start Simulation Engine Fleet and terminate it.""" - f_stop = asyncio.Event() + start_and_shutdown() + + # pylint: disable=too-many-locals + def test_start_and_shutdown_with_tasks_in_state(self) -> None: + """Run Simulation Engine with some TasksIns in State. + + This test creates a few nodes and submits a few messages that need to be + executed by the Backend. In order for that to happen the asyncio + producer/consumer logic must function. 
+ """ + num_messages = 113 + num_nodes = 59 - # Initialize StateFactory + # Register a state and a run_id in it + run_id = 1234 state_factory = StateFactory(":flwr-in-memory-state:") + state: InMemoryState = state_factory.state() # type: ignore + state.run_ids.add(run_id) - superlink_th = threading.Thread( - target=start_vce, - args=( - 50, - "", - "ray", - "{}", # an empty json stream (represents an empty config) - state_factory, - "", - f_stop, - ), - daemon=False, + # Register a few nodes + nodes_mapping = _register_nodes( + num_nodes=num_nodes, state_factory=state_factory ) - superlink_th.start() + # Artificially add TaskIns to state so they can be processed + # by the Simulation Engine logic + nodes_cycle = cycle( + nodes_mapping.keys() + ) # we have more messages than supernodes + task_ids: Set[UUID] = set() # so we can retrieve them later + expected_results = {} + for i in range(num_messages): + dst_node_id = next(nodes_cycle) + # Construct a Message + mult_factor = 2024 + i + getproperties_ins = GetPropertiesIns(config={"factor": mult_factor}) + recordset = getpropertiesins_to_recordset(getproperties_ins) + message = Message( + content=recordset, + metadata=Metadata( + run_id=run_id, + message_id="", + group_id="", + src_node_id=0, + dst_node_id=dst_node_id, # indicate destination node + reply_to_message="", + ttl="", + message_type=MESSAGE_TYPE_GET_PROPERTIES, + ), + ) + # Convert Message to TaskIns + taskins = message_to_taskins(message) + # Instert in state + task_id = state.store_task_ins(taskins) + if task_id: + # Add to UUID set + task_ids.add(task_id) + # Store expected output for check later on + expected_results[task_id] = mult_factor * pi + + # Run + start_and_shutdown(state_factory, nodes_mapping) + + # Get all TaskRes + task_res_list = state.get_task_res(task_ids=task_ids, limit=len(task_ids)) - # Sleep for some time - sleep(10) + # Check results by first converting to Message + for task_res in task_res_list: - # Trigger stop event - 
f_stop.set() + message = message_from_taskres(task_res) - superlink_th.join() + # Verify message content is as expected + content = message.content + assert ( + content.configs_records["getpropertiesres.properties"]["result"] + == expected_results[UUID(task_res.task.ancestry[0])] + ) From 98fb4b458e7d9cda53b44c327b381e340c3b18f2 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 17:54:20 +0000 Subject: [PATCH 042/103] . --- src/py/flwr/server/superlink/fleet/vce/__init__.py | 2 +- src/py/flwr/server/superlink/fleet/vce/backend/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/__init__.py b/src/py/flwr/server/superlink/fleet/vce/__init__.py index 72cd76f7376..57d39688b52 100644 --- a/src/py/flwr/server/superlink/fleet/vce/__init__.py +++ b/src/py/flwr/server/superlink/fleet/vce/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Fleet VirtualClientEngine side.""" +"""Fleet Simulation Engine side.""" from .vce_api import start_vce diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py index 8c351743dbd..d751cf4bcae 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""VirtualClientEngine Backends.""" +"""Simulation Engine Backends.""" import importlib from typing import Dict, Type From 65c8b79df30d94231e8fe46314c482fa04f362db Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 18:11:08 +0000 Subject: [PATCH 043/103] undoing changes to simulation examples --- examples/simulation-pytorch/README.md | 38 +++++------------ examples/simulation-pytorch/sim.py | 48 +++++++++------------ examples/simulation-tensorflow/README.md | 42 +++++------------- examples/simulation-tensorflow/sim.py | 54 ++++++++++-------------- 4 files changed, 63 insertions(+), 119 deletions(-) diff --git a/examples/simulation-pytorch/README.md b/examples/simulation-pytorch/README.md index 963e77bc568..5ba5ec70dc3 100644 --- a/examples/simulation-pytorch/README.md +++ b/examples/simulation-pytorch/README.md @@ -54,13 +54,17 @@ Write the command below in your terminal to install the dependencies according t pip install -r requirements.txt ``` -### Run with `start_simulation()` - -Ensure you have activated your environment then: +### Run Federated Learning Example ```bash +# You can run the example without activating your environemnt +poetry run python sim.py + +# Or by first activating it +poetry shell # and then run the example python sim.py +# you can exit your environment by typing "exit" ``` You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 1xCPU core. For example: @@ -69,32 +73,10 @@ You can adjust the CPU/GPU resources you assign to each of your virtual clients. 
# Will assign 2xCPUs to each client python sim.py --num_cpus=2 -# Will assign 2xCPUs and 25% of the GPU's VRAM to each client -# This means that you can have 4 concurrent clients on each GPU +# Will assign 2xCPUs and 20% of the GPU's VRAM to each client +# This means that you can have 5 concurrent clients on each GPU # (assuming you have enough CPUs) -python sim.py --num_cpus=2 --num_gpus=0.25 -``` - -### Run with Flower-Next (`super-link` and `server-app`) - -Ensure you have activated your environment, then: - -``` -flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_app - -# on a different terminal -flower-server-app sim:server_app --insecure -``` - -You can change the default resources assigned to each `ClientApp` by means the `--backend-config` argument: - -```bash -# Tells the VCE to resever 2x CPUs and 25% of available VRAM for each ClientApp -flower-superlink --insecure --vce --num-supernodes 100 \ - --client-app sim:client_app \ - --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}}' - -# Then you can launch the `flower-server-app` command as shown earlier. +python sim.py --num_cpus=2 --num_gpus=0.2 ``` Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. 
diff --git a/examples/simulation-pytorch/sim.py b/examples/simulation-pytorch/sim.py index 84a00e3f092..0a6ed8ebb9b 100644 --- a/examples/simulation-pytorch/sim.py +++ b/examples/simulation-pytorch/sim.py @@ -29,9 +29,9 @@ default=0.0, help="Ratio of GPU memory to assign to a virtual client", ) +parser.add_argument("--num_rounds", type=int, default=10, help="Number of FL rounds.") NUM_CLIENTS = 100 -NUM_ROUNDS = 10 # Flower client, adapted from Pytorch quickstart example @@ -167,36 +167,28 @@ def evaluate( return evaluate -# Download MNIST dataset and partition it -mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) -centralized_testset = mnist_fds.load_full("test") - -# Configure the strategy -strategy = fl.server.strategy.FedAvg( - fraction_fit=0.1, # Sample 10% of available clients for training - fraction_evaluate=0.05, # Sample 5% of available clients for evaluation - min_available_clients=10, - on_fit_config_fn=fit_config, - evaluate_metrics_aggregation_fn=weighted_average, # Aggregate federated metrics - evaluate_fn=get_evaluate_fn(centralized_testset), # Global evaluation function -) - -# ClientApp for Flower-Next -client_app = fl.client.ClientApp( - client_fn=get_client_fn(mnist_fds), -) - -# ServerApp for Flower-Next -server_app = fl.server.ServerApp( - config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), - strategy=strategy, -) - - def main(): # Parse input arguments args = parser.parse_args() + # Download MNIST dataset and partition it + mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) + centralized_testset = mnist_fds.load_full("test") + + # Configure the strategy + strategy = fl.server.strategy.FedAvg( + fraction_fit=0.1, # Sample 10% of available clients for training + fraction_evaluate=0.05, # Sample 5% of available clients for evaluation + min_fit_clients=10, # Never sample less than 10 clients for training + min_evaluate_clients=5, # Never sample less than 5 clients for evaluation + 
min_available_clients=int( + NUM_CLIENTS * 0.75 + ), # Wait until at least 75 clients are available + on_fit_config_fn=fit_config, + evaluate_metrics_aggregation_fn=weighted_average, # Aggregate federated metrics + evaluate_fn=get_evaluate_fn(centralized_testset), # Global evaluation function + ) + # Resources to be assigned to each virtual client client_resources = { "num_cpus": args.num_cpus, @@ -208,7 +200,7 @@ def main(): client_fn=get_client_fn(mnist_fds), num_clients=NUM_CLIENTS, client_resources=client_resources, - config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), + config=fl.server.ServerConfig(num_rounds=args.num_rounds), strategy=strategy, actor_kwargs={ "on_actor_init_fn": disable_progress_bar # disable tqdm on each actor/process spawning virtual clients diff --git a/examples/simulation-tensorflow/README.md b/examples/simulation-tensorflow/README.md index f6f0a22fdd7..75be823db2e 100644 --- a/examples/simulation-tensorflow/README.md +++ b/examples/simulation-tensorflow/README.md @@ -53,49 +53,29 @@ Write the command below in your terminal to install the dependencies according t pip install -r requirements.txt ``` -### Run with `start_simulation()` - -Ensure you have activated your environment then: +### Run Federated Learning Example ```bash +# You can run the example without activating your environemnt +poetry run python sim.py + +# Or by first activating it +poetry shell # and then run the example python sim.py +# you can exit your environment by typing "exit" ``` -You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 2xCPU core. For example: +You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 1xCPU core. 
For example: ```bash # Will assign 2xCPUs to each client python sim.py --num_cpus=2 -# Will assign 2xCPUs and 25% of the GPU's VRAM to each client -# This means that you can have 4 concurrent clients on each GPU +# Will assign 2xCPUs and 20% of the GPU's VRAM to each client +# This means that you can have 5 concurrent clients on each GPU # (assuming you have enough CPUs) -python sim.py --num_cpus=2 --num_gpus=0.25 -``` - -Because TensorFlow by default maps all the available VRAM, we need to [enable GPU memory growth](https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth), see how it is done in the example (`sim.py`) for both the "main" process (where the server/strategy runs) and for the clients (using the `actor_kwargs`) - -### Run with Flower-Next (`super-link` and `server-app`) - -Ensure you have activated your environment, then: - -``` -flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_app - -# on a different terminal -flower-server-app sim:server_app --insecure -``` - -You can change the default resources assigned to each `ClientApp` using the `--backend-config` argument. Note that we need to flag that the backend is going to use `TensorFlow`. In this way, it will enable GPU memory growth. - -```bash -# Tells the VCE to resever 2x CPUs and 25% of available VRAM for each ClientApp -flower-superlink --insecure --vce --num-supernodes 100 \ - --client-app sim:client_app \ - --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}, "tensorflow": 1}' - -# Then you can launch the `flower-server-app` command as shown earlier. +python sim.py --num_cpus=2 --num_gpus=0.2 ``` Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. 
diff --git a/examples/simulation-tensorflow/sim.py b/examples/simulation-tensorflow/sim.py index dbba71ac2cf..043c624a40a 100644 --- a/examples/simulation-tensorflow/sim.py +++ b/examples/simulation-tensorflow/sim.py @@ -29,9 +29,9 @@ default=0.0, help="Ratio of GPU memory to assign to a virtual client", ) +parser.add_argument("--num_rounds", type=int, default=10, help="Number of FL rounds.") NUM_CLIENTS = 100 -NUM_ROUNDS = 10 VERBOSE = 0 @@ -129,40 +129,30 @@ def evaluate( return evaluate -# Download MNIST dataset and partition it -mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) -# Get the whole test set for centralised evaluation -centralized_testset = mnist_fds.load_full("test").to_tf_dataset( - columns="image", label_cols="label", batch_size=64 -) - -# Create FedAvg strategy -strategy = fl.server.strategy.FedAvg( - fraction_fit=0.1, # Sample 10% of available clients for training - fraction_evaluate=0.05, # Sample 5% of available clients for evaluation - min_fit_clients=10, # Never sample less than 10 clients for training - evaluate_metrics_aggregation_fn=weighted_average, # aggregates federated metrics - evaluate_fn=get_evaluate_fn(centralized_testset), # global evaluation function -) - - -# ClientApp for Flower-Next -client_app = fl.client.ClientApp( - client_fn=get_client_fn(mnist_fds), -) - -# ServerApp for Flower-Next -# TODO: Unclear how to enable GPU growth for the ServerApp -server_app = fl.server.ServerApp( - config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), - strategy=strategy, -) - - def main() -> None: # Parse input arguments args = parser.parse_args() + # Download MNIST dataset and partition it + mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) + # Get the whole test set for centralised evaluation + centralized_testset = mnist_fds.load_full("test").to_tf_dataset( + columns="image", label_cols="label", batch_size=64 + ) + + # Create FedAvg strategy + strategy = 
fl.server.strategy.FedAvg( + fraction_fit=0.1, # Sample 10% of available clients for training + fraction_evaluate=0.05, # Sample 5% of available clients for evaluation + min_fit_clients=10, # Never sample less than 10 clients for training + min_evaluate_clients=5, # Never sample less than 5 clients for evaluation + min_available_clients=int( + NUM_CLIENTS * 0.75 + ), # Wait until at least 75 clients are available + evaluate_metrics_aggregation_fn=weighted_average, # aggregates federated metrics + evaluate_fn=get_evaluate_fn(centralized_testset), # global evaluation function + ) + # With a dictionary, you tell Flower's VirtualClientEngine that each # client needs exclusive access to these many resources in order to run client_resources = { @@ -174,7 +164,7 @@ def main() -> None: fl.simulation.start_simulation( client_fn=get_client_fn(mnist_fds), num_clients=NUM_CLIENTS, - config=fl.server.ServerConfig(NUM_ROUNDS), + config=fl.server.ServerConfig(num_rounds=args.num_rounds), strategy=strategy, client_resources=client_resources, actor_kwargs={ From 87d7a4cd4a0f21251fd9d6f42618c636f47c7839 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 18:18:12 +0000 Subject: [PATCH 044/103] adding back examples --- examples/simulation-pytorch/README.md | 38 ++++++++++++----- examples/simulation-pytorch/sim.py | 48 ++++++++++++--------- examples/simulation-tensorflow/README.md | 42 +++++++++++++----- examples/simulation-tensorflow/sim.py | 54 ++++++++++++++---------- 4 files changed, 119 insertions(+), 63 deletions(-) diff --git a/examples/simulation-pytorch/README.md b/examples/simulation-pytorch/README.md index 5ba5ec70dc3..963e77bc568 100644 --- a/examples/simulation-pytorch/README.md +++ b/examples/simulation-pytorch/README.md @@ -54,17 +54,13 @@ Write the command below in your terminal to install the dependencies according t pip install -r requirements.txt ``` -### Run Federated Learning Example +### Run with `start_simulation()` -```bash -# You can run the 
example without activating your environemnt -poetry run python sim.py +Ensure you have activated your environment then: -# Or by first activating it -poetry shell +```bash # and then run the example python sim.py -# you can exit your environment by typing "exit" ``` You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 1xCPU core. For example: @@ -73,10 +69,32 @@ You can adjust the CPU/GPU resources you assign to each of your virtual clients. # Will assign 2xCPUs to each client python sim.py --num_cpus=2 -# Will assign 2xCPUs and 20% of the GPU's VRAM to each client -# This means that you can have 5 concurrent clients on each GPU +# Will assign 2xCPUs and 25% of the GPU's VRAM to each client +# This means that you can have 4 concurrent clients on each GPU # (assuming you have enough CPUs) -python sim.py --num_cpus=2 --num_gpus=0.2 +python sim.py --num_cpus=2 --num_gpus=0.25 +``` + +### Run with Flower-Next (`super-link` and `server-app`) + +Ensure you have activated your environment, then: + +``` +flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_app + +# on a different terminal +flower-server-app sim:server_app --insecure +``` + +You can change the default resources assigned to each `ClientApp` by means the `--backend-config` argument: + +```bash +# Tells the VCE to resever 2x CPUs and 25% of available VRAM for each ClientApp +flower-superlink --insecure --vce --num-supernodes 100 \ + --client-app sim:client_app \ + --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}}' + +# Then you can launch the `flower-server-app` command as shown earlier. ``` Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. 
diff --git a/examples/simulation-pytorch/sim.py b/examples/simulation-pytorch/sim.py index 0a6ed8ebb9b..84a00e3f092 100644 --- a/examples/simulation-pytorch/sim.py +++ b/examples/simulation-pytorch/sim.py @@ -29,9 +29,9 @@ default=0.0, help="Ratio of GPU memory to assign to a virtual client", ) -parser.add_argument("--num_rounds", type=int, default=10, help="Number of FL rounds.") NUM_CLIENTS = 100 +NUM_ROUNDS = 10 # Flower client, adapted from Pytorch quickstart example @@ -167,28 +167,36 @@ def evaluate( return evaluate +# Download MNIST dataset and partition it +mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) +centralized_testset = mnist_fds.load_full("test") + +# Configure the strategy +strategy = fl.server.strategy.FedAvg( + fraction_fit=0.1, # Sample 10% of available clients for training + fraction_evaluate=0.05, # Sample 5% of available clients for evaluation + min_available_clients=10, + on_fit_config_fn=fit_config, + evaluate_metrics_aggregation_fn=weighted_average, # Aggregate federated metrics + evaluate_fn=get_evaluate_fn(centralized_testset), # Global evaluation function +) + +# ClientApp for Flower-Next +client_app = fl.client.ClientApp( + client_fn=get_client_fn(mnist_fds), +) + +# ServerApp for Flower-Next +server_app = fl.server.ServerApp( + config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), + strategy=strategy, +) + + def main(): # Parse input arguments args = parser.parse_args() - # Download MNIST dataset and partition it - mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) - centralized_testset = mnist_fds.load_full("test") - - # Configure the strategy - strategy = fl.server.strategy.FedAvg( - fraction_fit=0.1, # Sample 10% of available clients for training - fraction_evaluate=0.05, # Sample 5% of available clients for evaluation - min_fit_clients=10, # Never sample less than 10 clients for training - min_evaluate_clients=5, # Never sample less than 5 clients for evaluation - 
min_available_clients=int( - NUM_CLIENTS * 0.75 - ), # Wait until at least 75 clients are available - on_fit_config_fn=fit_config, - evaluate_metrics_aggregation_fn=weighted_average, # Aggregate federated metrics - evaluate_fn=get_evaluate_fn(centralized_testset), # Global evaluation function - ) - # Resources to be assigned to each virtual client client_resources = { "num_cpus": args.num_cpus, @@ -200,7 +208,7 @@ def main(): client_fn=get_client_fn(mnist_fds), num_clients=NUM_CLIENTS, client_resources=client_resources, - config=fl.server.ServerConfig(num_rounds=args.num_rounds), + config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), strategy=strategy, actor_kwargs={ "on_actor_init_fn": disable_progress_bar # disable tqdm on each actor/process spawning virtual clients diff --git a/examples/simulation-tensorflow/README.md b/examples/simulation-tensorflow/README.md index 75be823db2e..f6f0a22fdd7 100644 --- a/examples/simulation-tensorflow/README.md +++ b/examples/simulation-tensorflow/README.md @@ -53,29 +53,49 @@ Write the command below in your terminal to install the dependencies according t pip install -r requirements.txt ``` -### Run Federated Learning Example +### Run with `start_simulation()` -```bash -# You can run the example without activating your environemnt -poetry run python sim.py +Ensure you have activated your environment then: -# Or by first activating it -poetry shell +```bash # and then run the example python sim.py -# you can exit your environment by typing "exit" ``` -You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 1xCPU core. For example: +You can adjust the CPU/GPU resources you assign to each of your virtual clients. By default, your clients will only use 2xCPU core. 
For example: ```bash # Will assign 2xCPUs to each client python sim.py --num_cpus=2 -# Will assign 2xCPUs and 20% of the GPU's VRAM to each client -# This means that you can have 5 concurrent clients on each GPU +# Will assign 2xCPUs and 25% of the GPU's VRAM to each client +# This means that you can have 4 concurrent clients on each GPU # (assuming you have enough CPUs) -python sim.py --num_cpus=2 --num_gpus=0.2 +python sim.py --num_cpus=2 --num_gpus=0.25 +``` + +Because TensorFlow by default maps all the available VRAM, we need to [enable GPU memory growth](https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth), see how it is done in the example (`sim.py`) for both the "main" process (where the server/strategy runs) and for the clients (using the `actor_kwargs`) + +### Run with Flower-Next (`super-link` and `server-app`) + +Ensure you have activated your environment, then: + +``` +flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_app + +# on a different terminal +flower-server-app sim:server_app --insecure +``` + +You can change the default resources assigned to each `ClientApp` using the `--backend-config` argument. Note that we need to flag that the backend is going to use `TensorFlow`. In this way, it will enable GPU memory growth. + +```bash +# Tells the VCE to resever 2x CPUs and 25% of available VRAM for each ClientApp +flower-superlink --insecure --vce --num-supernodes 100 \ + --client-app sim:client_app \ + --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}, "tensorflow": 1}' + +# Then you can launch the `flower-server-app` command as shown earlier. ``` Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. 
diff --git a/examples/simulation-tensorflow/sim.py b/examples/simulation-tensorflow/sim.py index 043c624a40a..dbba71ac2cf 100644 --- a/examples/simulation-tensorflow/sim.py +++ b/examples/simulation-tensorflow/sim.py @@ -29,9 +29,9 @@ default=0.0, help="Ratio of GPU memory to assign to a virtual client", ) -parser.add_argument("--num_rounds", type=int, default=10, help="Number of FL rounds.") NUM_CLIENTS = 100 +NUM_ROUNDS = 10 VERBOSE = 0 @@ -129,30 +129,40 @@ def evaluate( return evaluate +# Download MNIST dataset and partition it +mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) +# Get the whole test set for centralised evaluation +centralized_testset = mnist_fds.load_full("test").to_tf_dataset( + columns="image", label_cols="label", batch_size=64 +) + +# Create FedAvg strategy +strategy = fl.server.strategy.FedAvg( + fraction_fit=0.1, # Sample 10% of available clients for training + fraction_evaluate=0.05, # Sample 5% of available clients for evaluation + min_fit_clients=10, # Never sample less than 10 clients for training + evaluate_metrics_aggregation_fn=weighted_average, # aggregates federated metrics + evaluate_fn=get_evaluate_fn(centralized_testset), # global evaluation function +) + + +# ClientApp for Flower-Next +client_app = fl.client.ClientApp( + client_fn=get_client_fn(mnist_fds), +) + +# ServerApp for Flower-Next +# TODO: Unclear how to enable GPU growth for the ServerApp +server_app = fl.server.ServerApp( + config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), + strategy=strategy, +) + + def main() -> None: # Parse input arguments args = parser.parse_args() - # Download MNIST dataset and partition it - mnist_fds = FederatedDataset(dataset="mnist", partitioners={"train": NUM_CLIENTS}) - # Get the whole test set for centralised evaluation - centralized_testset = mnist_fds.load_full("test").to_tf_dataset( - columns="image", label_cols="label", batch_size=64 - ) - - # Create FedAvg strategy - strategy = 
fl.server.strategy.FedAvg( - fraction_fit=0.1, # Sample 10% of available clients for training - fraction_evaluate=0.05, # Sample 5% of available clients for evaluation - min_fit_clients=10, # Never sample less than 10 clients for training - min_evaluate_clients=5, # Never sample less than 5 clients for evaluation - min_available_clients=int( - NUM_CLIENTS * 0.75 - ), # Wait until at least 75 clients are available - evaluate_metrics_aggregation_fn=weighted_average, # aggregates federated metrics - evaluate_fn=get_evaluate_fn(centralized_testset), # global evaluation function - ) - # With a dictionary, you tell Flower's VirtualClientEngine that each # client needs exclusive access to these many resources in order to run client_resources = { @@ -164,7 +174,7 @@ def main() -> None: fl.simulation.start_simulation( client_fn=get_client_fn(mnist_fds), num_clients=NUM_CLIENTS, - config=fl.server.ServerConfig(num_rounds=args.num_rounds), + config=fl.server.ServerConfig(NUM_ROUNDS), strategy=strategy, client_resources=client_resources, actor_kwargs={ From 35ab1f384fb642a4e6ab0463c451ad053861f54d Mon Sep 17 00:00:00 2001 From: Javier Date: Mon, 26 Feb 2024 19:42:07 +0000 Subject: [PATCH 045/103] Apply suggestions from code review Co-authored-by: Daniel J. 
Beutel --- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 881765213da..21da4b7070b 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -68,7 +68,7 @@ async def worker( # Convert TaskIns to Message message = message_from_taskins(task_ins) - # Replace node-id with data partition id + # Replace node ID with data partition ID message.metadata.dst_node_id = nodes_mapping[node_id] # Let backend process message @@ -108,7 +108,7 @@ async def generate_pull_requests( nodes_mapping: NodeToPartitionMapping, f_stop: asyncio.Event, ) -> None: - """Generate TaskIns and add it to the queue.""" + """Retrieve TaskIns and add it to the queue.""" state = state_factory.state() while not f_stop.is_set(): for node_id in nodes_mapping.keys(): @@ -151,7 +151,6 @@ async def run( for w_t in worker_tasks: _ = w_t.cancel() - # print('requested cancel') while not all(w_t.done() for w_t in worker_tasks): log(DEBUG, "Terminating async workers...") await asyncio.sleep(0.5) From 5b3365a0f1a53a7ae7d2ed8de1ecf94221b327a4 Mon Sep 17 00:00:00 2001 From: Javier Date: Mon, 26 Feb 2024 19:51:21 +0000 Subject: [PATCH 046/103] Apply suggestions from code review Co-authored-by: Daniel J. 
Beutel --- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 21da4b7070b..914e69ac4ce 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -62,7 +62,7 @@ async def worker( task_ins: TaskIns = await queue.get() node_id = task_ins.task.consumer.node_id - # Register and retrive runstate + # Register and retrieve runstate node_states[node_id].register_context(run_id=task_ins.run_id) context = node_states[node_id].retrieve_context(run_id=task_ins.run_id) From 785ac918f457e9ada79a8f9a4be0d903f2e0c71c Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 20:38:07 +0000 Subject: [PATCH 047/103] introduced `partition_id`. --- .../client/message_handler/message_handler.py | 2 +- .../message_handler/message_handler_test.py | 2 ++ src/py/flwr/common/message.py | 23 ++++++++++++++++--- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/py/flwr/client/message_handler/message_handler.py b/src/py/flwr/client/message_handler/message_handler.py index e7e6c7e05c7..87cace88ec2 100644 --- a/src/py/flwr/client/message_handler/message_handler.py +++ b/src/py/flwr/client/message_handler/message_handler.py @@ -98,7 +98,7 @@ def handle_legacy_message_from_msgtype( client_fn: ClientFn, message: Message, context: Context ) -> Message: """Handle legacy message in the inner most mod.""" - client = client_fn(str(message.metadata.dst_node_id)) + client = client_fn(str(message.metadata.partition_id)) # Check if NumPyClient is returend if isinstance(client, NumPyClient): diff --git a/src/py/flwr/client/message_handler/message_handler_test.py b/src/py/flwr/client/message_handler/message_handler_test.py index 9fc126f2792..c24b51972f3 100644 --- a/src/py/flwr/client/message_handler/message_handler_test.py +++ 
b/src/py/flwr/client/message_handler/message_handler_test.py @@ -269,6 +269,8 @@ def test_invalid_message_run_id(self) -> None: invalid_metadata_list: List[Metadata] = [] attrs = list(vars(self.valid_out_metadata).keys()) for attr in attrs: + if attr == "_partition_id": + continue if attr == "_ttl": # Skip configurable ttl continue # Make an invalid metadata diff --git a/src/py/flwr/common/message.py b/src/py/flwr/common/message.py index 14dae0f6ee5..49ac6227ecc 100644 --- a/src/py/flwr/common/message.py +++ b/src/py/flwr/common/message.py @@ -15,9 +15,8 @@ """Message.""" -from __future__ import annotations - from dataclasses import dataclass +from typing import Optional, Union from .record import RecordSet @@ -46,6 +45,10 @@ class Metadata: # pylint: disable=too-many-instance-attributes message_type : str A string that encodes the action to be executed on the receiving end. + partition_id : Optional[int] + An identifier that can be used when loading a particular + data partition for a ClientApp. Making use of this identifier + is more relevant when conducting simulations. 
     """

     _run_id: int
@@ -56,6 +59,7 @@ class Metadata:  # pylint: disable=too-many-instance-attributes
     _group_id: str
     _ttl: str
     _message_type: str
+    _partition_id: Optional[int]

     def __init__(  # pylint: disable=too-many-arguments
         self,
@@ -67,6 +71,7 @@ def __init__(  # pylint: disable=too-many-arguments
         group_id: str,
         ttl: str,
         message_type: str,
+        partition_id: Optional[int] = None,
     ) -> None:
         self._run_id = run_id
         self._message_id = message_id
@@ -76,6 +81,7 @@ def __init__(  # pylint: disable=too-many-arguments
         self._group_id = group_id
         self._ttl = ttl
         self._message_type = message_type
+        self._partition_id = partition_id

     @property
     def run_id(self) -> int:
@@ -137,6 +143,16 @@ def message_type(self, value: str) -> None:
         """Set message_type."""
         self._message_type = value

+    @property
+    def partition_id(self) -> Union[int, None]:
+        """An identifier telling which data partition a ClientApp should use."""
+        return self._partition_id
+
+    @partition_id.setter
+    def partition_id(self, value: int) -> None:
+        """Set partition_id."""
+        self._partition_id = value
+

 @dataclass
 class Message:
@@ -173,7 +189,7 @@ def content(self, value: RecordSet) -> None:
         """Set content."""
         self._content = value

-    def create_reply(self, content: RecordSet, ttl: str) -> Message:
+    def create_reply(self, content: RecordSet, ttl: str) -> "Message":
        """Create a reply to this message with specified content and TTL.

         The method generates a new `Message` as a reply to this message.
@@ -202,6 +218,7 @@ def create_reply(self, content: RecordSet, ttl: str) -> Message: group_id=self.metadata.group_id, ttl=ttl, message_type=self.metadata.message_type, + partition_id=self.metadata.partition_id, ), content=content, ) From eba053a702fafba05fee9b3aae8fb25d5e0068f3 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 20:44:00 +0000 Subject: [PATCH 048/103] fix for ray proxies and tests --- src/py/flwr/simulation/ray_transport/ray_client_proxy.py | 1 + src/py/flwr/simulation/ray_transport/ray_client_proxy_test.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/ray_transport/ray_client_proxy.py b/src/py/flwr/simulation/ray_transport/ray_client_proxy.py index 405e0920c5a..a45321ed236 100644 --- a/src/py/flwr/simulation/ray_transport/ray_client_proxy.py +++ b/src/py/flwr/simulation/ray_transport/ray_client_proxy.py @@ -111,6 +111,7 @@ def _wrap_recordset_in_message( reply_to_message="", ttl=str(timeout) if timeout else "", message_type=message_type, + partition_id=int(self.cid), ), ) diff --git a/src/py/flwr/simulation/ray_transport/ray_client_proxy_test.py b/src/py/flwr/simulation/ray_transport/ray_client_proxy_test.py index 3eeabe0292c..24fe3546e7d 100644 --- a/src/py/flwr/simulation/ray_transport/ray_client_proxy_test.py +++ b/src/py/flwr/simulation/ray_transport/ray_client_proxy_test.py @@ -198,10 +198,11 @@ def _load_app() -> ClientApp: message_id="", group_id="", src_node_id=0, - dst_node_id=int(cid), + dst_node_id=12345, reply_to_message="", ttl="", message_type=MESSAGE_TYPE_GET_PROPERTIES, + partition_id=int(cid), ), ) pool.submit_client_job( From 27d2bb1c1b5b88f727dc760c777ebbd5f7cfe3b3 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 21:35:41 +0000 Subject: [PATCH 049/103] re written --- src/py/flwr/common/message.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/py/flwr/common/message.py b/src/py/flwr/common/message.py index 
49ac6227ecc..1e1132e42e2 100644 --- a/src/py/flwr/common/message.py +++ b/src/py/flwr/common/message.py @@ -14,9 +14,9 @@ # ============================================================================== """Message.""" +from __future__ import annotations from dataclasses import dataclass -from typing import Optional, Union from .record import RecordSet @@ -59,7 +59,7 @@ class Metadata: # pylint: disable=too-many-instance-attributes _group_id: str _ttl: str _message_type: str - _partition_id: Optional[int] + _partition_id: int | None def __init__( # pylint: disable=too-many-arguments self, @@ -71,7 +71,7 @@ def __init__( # pylint: disable=too-many-arguments group_id: str, ttl: str, message_type: str, - partition_id: Optional[int] = None, + partition_id: int | None = None, ) -> None: self._run_id = run_id self._message_id = message_id @@ -144,7 +144,7 @@ def message_type(self, value: str) -> None: self._message_type = value @property - def partition_id(self) -> Union[int, None]: + def partition_id(self) -> int | None: """An identifier telling which data partition a ClientApp should use.""" return self._partition_id @@ -189,7 +189,7 @@ def content(self, value: RecordSet) -> None: """Set content.""" self._content = value - def create_reply(self, content: RecordSet, ttl: str) -> "Message": + def create_reply(self, content: RecordSet, ttl: str) -> Message: """Create a reply to this message with specified content and TTL. The method generates a new `Message` as a reply to this message. 
From 1969aac37519073f1ef5960acd0e1c9d67e080ff Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 26 Feb 2024 21:45:43 +0000 Subject: [PATCH 050/103] using `metadata.partition_id` --- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 914e69ac4ce..11aad9fd9f3 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -69,7 +69,7 @@ async def worker( # Convert TaskIns to Message message = message_from_taskins(task_ins) # Replace node ID with data partition ID - message.metadata.dst_node_id = nodes_mapping[node_id] + message.metadata.partition_id = nodes_mapping[node_id] # Let backend process message out_mssg, updated_context = await backend.process_message( @@ -81,10 +81,6 @@ async def worker( task_ins.run_id, context=updated_context ) - # Undo change node_id for partition choice - out_mssg.metadata._src_node_id = ( # pylint: disable=protected-access - task_ins.task.consumer.node_id - ) # Convert to TaskRes task_res = message_to_taskres(out_mssg) # Store TaskRes in state @@ -95,8 +91,7 @@ async def worker( break except Exception as ex: # pylint: disable=broad-exception-caught - # pylint: disable=fixme - # TODO: gen TaskRes with relevant error, add it to state_factory + log(ERROR, ex) log(ERROR, traceback.format_exc()) break @@ -116,8 +111,8 @@ async def generate_pull_requests( if task_ins: await queue.put(task_ins[0]) log(DEBUG, "TaskIns in queue: %i", queue.qsize()) - # pylint: disable=fixme - await asyncio.sleep(1.0) # TODO: revisit + + await asyncio.sleep(1.0) log(DEBUG, "Async producer: Stopped pulling from StateFactory.") From 8f1ca09b6a24c640b22c0335bf109ab23382c691 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 01:04:41 +0000 Subject: [PATCH 051/103] more efficient --- 
.../server/superlink/state/in_memory_state.py | 52 ++++++++++++++----- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/src/py/flwr/server/superlink/state/in_memory_state.py b/src/py/flwr/server/superlink/state/in_memory_state.py index 690fadc032d..9e6c458519c 100644 --- a/src/py/flwr/server/superlink/state/in_memory_state.py +++ b/src/py/flwr/server/superlink/state/in_memory_state.py @@ -35,6 +35,7 @@ def __init__(self) -> None: self.node_ids: Set[int] = set() self.run_ids: Set[int] = set() self.task_ins_store: Dict[UUID, TaskIns] = {} + self.task_ins_mapping: Dict[int, List[UUID]] = {} self.task_res_store: Dict[UUID, TaskRes] = {} self.lock = threading.Lock() @@ -61,6 +62,14 @@ def store_task_ins(self, task_ins: TaskIns) -> Optional[UUID]: task_ins.task.ttl = ttl.isoformat() with self.lock: self.task_ins_store[task_id] = task_ins + node_id = task_ins.task.consumer.node_id + if node_id: + # If not an annonymous node, let's construct or + # update the node_id:task_id mapping + if node_id in self.task_ins_mapping: + self.task_ins_mapping[node_id].append(task_id) + else: + self.task_ins_mapping[node_id] = [task_id] # Return the new task_id return task_id @@ -75,22 +84,37 @@ def get_task_ins( # Find TaskIns for node_id that were not delivered yet task_ins_list: List[TaskIns] = [] with self.lock: - for _, task_ins in self.task_ins_store.items(): - # pylint: disable=too-many-boolean-expressions + # If not annoymous clients, we can get TaskIns efficiently + # by making use of node_id:task_id mapping + if node_id: if ( - node_id is not None # Not anonymous - and task_ins.task.consumer.anonymous is False - and task_ins.task.consumer.node_id == node_id - and task_ins.task.delivered_at == "" - ) or ( - node_id is None # Anonymous - and task_ins.task.consumer.anonymous is True - and task_ins.task.consumer.node_id == 0 - and task_ins.task.delivered_at == "" + node_id not in self.task_ins_mapping + or len(self.task_ins_mapping[node_id]) == 0 ): - 
task_ins_list.append(task_ins) - if limit and len(task_ins_list) == limit: - break + return task_ins_list + task_ids = self.task_ins_mapping[node_id] + num = limit if limit else len(task_ids) + while len(task_ins_list) < num: + # Remove + uuid = task_ids.pop(0) + # Fetch + taskins = self.task_ins_store[uuid] + # Update + self.task_ins_mapping[node_id] = task_ids + + task_ins_list.append(taskins) + else: + for _, task_ins in self.task_ins_store.items(): + # pylint: disable=too-many-boolean-expressions + if ( + node_id is None # Anonymous + and task_ins.task.consumer.anonymous is True + and task_ins.task.consumer.node_id == 0 + and task_ins.task.delivered_at == "" + ): + task_ins_list.append(task_ins) + if limit and len(task_ins_list) == limit: + break # Mark all of them as delivered delivered_at = now().isoformat() From f44b595b90ef47d989b9eb5039d9e1c75961df57 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 01:06:56 +0000 Subject: [PATCH 052/103] undo --- .../server/superlink/state/in_memory_state.py | 52 +++++-------------- 1 file changed, 14 insertions(+), 38 deletions(-) diff --git a/src/py/flwr/server/superlink/state/in_memory_state.py b/src/py/flwr/server/superlink/state/in_memory_state.py index 9e6c458519c..690fadc032d 100644 --- a/src/py/flwr/server/superlink/state/in_memory_state.py +++ b/src/py/flwr/server/superlink/state/in_memory_state.py @@ -35,7 +35,6 @@ def __init__(self) -> None: self.node_ids: Set[int] = set() self.run_ids: Set[int] = set() self.task_ins_store: Dict[UUID, TaskIns] = {} - self.task_ins_mapping: Dict[int, List[UUID]] = {} self.task_res_store: Dict[UUID, TaskRes] = {} self.lock = threading.Lock() @@ -62,14 +61,6 @@ def store_task_ins(self, task_ins: TaskIns) -> Optional[UUID]: task_ins.task.ttl = ttl.isoformat() with self.lock: self.task_ins_store[task_id] = task_ins - node_id = task_ins.task.consumer.node_id - if node_id: - # If not an annonymous node, let's construct or - # update the node_id:task_id mapping - if 
node_id in self.task_ins_mapping: - self.task_ins_mapping[node_id].append(task_id) - else: - self.task_ins_mapping[node_id] = [task_id] # Return the new task_id return task_id @@ -84,37 +75,22 @@ def get_task_ins( # Find TaskIns for node_id that were not delivered yet task_ins_list: List[TaskIns] = [] with self.lock: - # If not annoymous clients, we can get TaskIns efficiently - # by making use of node_id:task_id mapping - if node_id: + for _, task_ins in self.task_ins_store.items(): + # pylint: disable=too-many-boolean-expressions if ( - node_id not in self.task_ins_mapping - or len(self.task_ins_mapping[node_id]) == 0 + node_id is not None # Not anonymous + and task_ins.task.consumer.anonymous is False + and task_ins.task.consumer.node_id == node_id + and task_ins.task.delivered_at == "" + ) or ( + node_id is None # Anonymous + and task_ins.task.consumer.anonymous is True + and task_ins.task.consumer.node_id == 0 + and task_ins.task.delivered_at == "" ): - return task_ins_list - task_ids = self.task_ins_mapping[node_id] - num = limit if limit else len(task_ids) - while len(task_ins_list) < num: - # Remove - uuid = task_ids.pop(0) - # Fetch - taskins = self.task_ins_store[uuid] - # Update - self.task_ins_mapping[node_id] = task_ids - - task_ins_list.append(taskins) - else: - for _, task_ins in self.task_ins_store.items(): - # pylint: disable=too-many-boolean-expressions - if ( - node_id is None # Anonymous - and task_ins.task.consumer.anonymous is True - and task_ins.task.consumer.node_id == 0 - and task_ins.task.delivered_at == "" - ): - task_ins_list.append(task_ins) - if limit and len(task_ins_list) == limit: - break + task_ins_list.append(task_ins) + if limit and len(task_ins_list) == limit: + break # Mark all of them as delivered delivered_at = now().isoformat() From ab55b0cbccada54cbe40614f62af1263421f5c34 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 09:53:44 +0000 Subject: [PATCH 053/103] more tests --- 
.../server/superlink/fleet/vce/vce_api.py | 25 ++++++++-- .../superlink/fleet/vce/vce_api_test.py | 47 +++++++++++++------ 2 files changed, 53 insertions(+), 19 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 11aad9fd9f3..e665903188e 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -158,21 +158,38 @@ async def run( # pylint: disable=too-many-arguments,unused-argument,too-many-locals def start_vce( - num_supernodes: int, client_app_module_name: str, backend_name: str, backend_config_json_stream: str, - state_factory: StateFactory, working_dir: str, f_stop: asyncio.Event, + num_supernodes: Optional[int] = None, + state_factory: Optional[StateFactory] = None, existing_nodes_mapping: Optional[NodeToPartitionMapping] = None, ) -> None: - """Start Fleet API with the VirtualClientEngine (VCE).""" + """Start Fleet API with the Simulation Engine.""" + if num_supernodes is not None and existing_nodes_mapping is not None: + raise ValueError( + "Both `num_supernodes` and `existing_nodes_mapping` are provided, " + "but only one is allowed." + ) if existing_nodes_mapping: + if state_factory is None: + raise ValueError( + "You passed `existing_nodes_mapping` but no `state_factory` was passed." + ) + log(INFO, "Using exiting NodeToPartitionMapping and StateFactory.") # Use mapping constructed externally. This also means nodes # have previously being registered. 
nodes_mapping = existing_nodes_mapping - else: + + if not state_factory: + log(INFO, "A StateFactory was not supplied to the SimulationEngine.") + # Create an empty in-memory state factory + state_factory = StateFactory(":flwr-in-memory-state:") + log(INFO, "Created new %s.", state_factory.__class__.__name__) + + if num_supernodes: # Register SuperNodes nodes_mapping = _register_nodes( num_nodes=num_supernodes, state_factory=state_factory diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py index 6abdd046f81..3967c734617 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -72,23 +72,15 @@ def terminate_simulation(f_stop: asyncio.Event, sleep_duration: int) -> None: def start_and_shutdown( - existing_state_factory: Optional[StateFactory] = None, + backend: str = "ray", + num_supernodes: Optional[int] = None, + state_factory: Optional[StateFactory] = None, nodes_mapping: Optional[NodeToPartitionMapping] = None, duration: int = 10, ) -> None: """Start Simulation Engine and terminate after specified number of seconds.""" f_stop = asyncio.Event() - # Initialize StateFactory - if nodes_mapping: - if existing_state_factory is None: - raise ValueError( - "If you specify a node mapping, you must pass a StateFactory." - ) - state_factory = existing_state_factory - else: - state_factory = StateFactory(":flwr-in-memory-state:") - # Setup thread that will set the f_stop event, triggering the termination of all # asyncio logic in the Simulation Engine. It will also terminate the Backend. 
termination_th = threading.Thread( @@ -97,9 +89,9 @@ def start_and_shutdown( termination_th.start() start_vce( - num_supernodes=50, + num_supernodes=num_supernodes, client_app_module_name="vce_api_test:client_app", - backend_name="ray", + backend_name=backend, backend_config_json_stream="{}", # an empty json stream (an empty config) state_factory=state_factory, working_dir="", @@ -118,7 +110,32 @@ class AsyncTestFleetSimulationEngineRayBackend(IsolatedAsyncioTestCase): def test_start_and_shutdown(self) -> None: """Start Simulation Engine Fleet and terminate it.""" - start_and_shutdown() + start_and_shutdown(num_supernodes=50) + + def test_with_nonexistent_backend(self) -> None: + """Test specifying a backend that does not exist.""" + with self.assertRaises(KeyError): + start_and_shutdown(num_supernodes=50, backend="this-backend-does-not-exist") + + def test_erroneous_arguments_num_supernodes_and_existing_mapping(self) -> None: + """Test ValueError if a node mapping is passed but also num_supernodes. + + Passing `num_supernodes` does nothing since we assume that if a node mapping + is supplied, nodes have been registered externally already. Therefore passing + `num_supernodes` might give the impression that that many nodes will be registered. + We don't do that since a mapping already exists. + """ + with self.assertRaises(ValueError): + start_and_shutdown(num_supernodes=50, nodes_mapping={0: 1}) + + def test_erroneous_arguments_existing_mapping_but_no_state_factory(self) -> None: + """Test ValueError if a node mapping is passed but no state. + + Passing a node mapping indicates that (externally) nodes have registered with a + state factory. Therefore, that state factory should be passed too. 
+ """ + with self.assertRaises(ValueError): + start_and_shutdown(nodes_mapping={0: 1}) # pylint: disable=too-many-locals def test_start_and_shutdown_with_tasks_in_state(self) -> None: @@ -179,7 +196,7 @@ def test_start_and_shutdown_with_tasks_in_state(self) -> None: expected_results[task_id] = mult_factor * pi # Run - start_and_shutdown(state_factory, nodes_mapping) + start_and_shutdown(state_factory=state_factory, nodes_mapping=nodes_mapping) # Get all TaskRes task_res_list = state.get_task_res(task_ids=task_ids, limit=len(task_ids)) From 28dda2dde4ec460453a91b280fc89426731d8d62 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 11:11:49 +0000 Subject: [PATCH 054/103] more --- .../server/superlink/fleet/vce/vce_api.py | 15 +- .../superlink/fleet/vce/vce_api_test.py | 150 +++++++++++++----- 2 files changed, 119 insertions(+), 46 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index e665903188e..c7f94a4c554 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -21,7 +21,7 @@ from logging import DEBUG, ERROR, INFO from typing import Callable, Dict, Optional -from flwr.client.clientapp import ClientApp, load_client_app +from flwr.client.clientapp import ClientApp, LoadClientAppError, load_client_app from flwr.client.node_state import NodeState from flwr.common.logger import log from flwr.common.serde import message_from_taskins, message_to_taskres @@ -90,8 +90,12 @@ async def worker( log(DEBUG, "Async worker: %s", e) break - except Exception as ex: # pylint: disable=broad-exception-caught + except LoadClientAppError as app_ex: + log(ERROR, "Async worker: %s", app_ex) + log(ERROR, traceback.format_exc()) + raise + except Exception as ex: # pylint: disable=broad-exception-caught log(ERROR, ex) log(ERROR, traceback.format_exc()) break @@ -173,6 +177,13 @@ def start_vce( "Both `num_supernodes` and 
`existing_nodes_mapping` are provided, " "but only one is allowed." ) + if num_supernodes is None: + if state_factory is None or existing_nodes_mapping is None: + raise ValueError( + "If not passing an existing `state_factory` and associated " + "`existing_nodes_mapping` you must supply `num_supernodes` to indicate " + "how many nodes to insert into a new StateFactory that will be created." + ) if existing_nodes_mapping: if state_factory is None: raise ValueError( diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py index 3967c734617..7cb0583791d 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -13,9 +13,12 @@ # limitations under the License. # ============================================================================== """Test Fleet Simulation Engine API.""" + + import asyncio import threading from itertools import cycle +from json import JSONDecodeError from math import pi from time import sleep from typing import Dict, Optional, Set @@ -71,12 +74,62 @@ def terminate_simulation(f_stop: asyncio.Event, sleep_duration: int) -> None: f_stop.set() +# pylint: disable=too-many-locals +def register_messages_into_state( + state_factory: StateFactory, + nodes_mapping: NodeToPartitionMapping, + run_id: int, + num_messages: int, +) -> Dict[UUID, float]: + """Register `num_messages` into the state factory.""" + state: InMemoryState = state_factory.state() # type: ignore + state.run_ids.add(run_id) + # Artificially add TaskIns to state so they can be processed + # by the Simulation Engine logic + nodes_cycle = cycle(nodes_mapping.keys()) # we have more messages than supernodes + task_ids: Set[UUID] = set() # so we can retrieve them later + expected_results = {} + for i in range(num_messages): + dst_node_id = next(nodes_cycle) + # Construct a Message + mult_factor = 2024 + i + getproperties_ins = 
GetPropertiesIns(config={"factor": mult_factor}) + recordset = getpropertiesins_to_recordset(getproperties_ins) + message = Message( + content=recordset, + metadata=Metadata( + run_id=run_id, + message_id="", + group_id="", + src_node_id=0, + dst_node_id=dst_node_id, # indicate destination node + reply_to_message="", + ttl="", + message_type=MESSAGE_TYPE_GET_PROPERTIES, + ), + ) + # Convert Message to TaskIns + taskins = message_to_taskins(message) + # Instert in state + task_id = state.store_task_ins(taskins) + if task_id: + # Add to UUID set + task_ids.add(task_id) + # Store expected output for check later on + expected_results[task_id] = mult_factor * pi + + return expected_results + + +# pylint: disable=too-many-arguments def start_and_shutdown( backend: str = "ray", + clientapp_module: str = "vce_api_test:client_app", num_supernodes: Optional[int] = None, state_factory: Optional[StateFactory] = None, nodes_mapping: Optional[NodeToPartitionMapping] = None, duration: int = 10, + backend_config: str = "{}", ) -> None: """Start Simulation Engine and terminate after specified number of seconds.""" f_stop = asyncio.Event() @@ -90,9 +143,9 @@ def start_and_shutdown( start_vce( num_supernodes=num_supernodes, - client_app_module_name="vce_api_test:client_app", + client_app_module_name=clientapp_module, backend_name=backend, - backend_config_json_stream="{}", # an empty json stream (an empty config) + backend_config_json_stream=backend_config, state_factory=state_factory, working_dir="", f_stop=f_stop, @@ -108,9 +161,43 @@ def start_and_shutdown( class AsyncTestFleetSimulationEngineRayBackend(IsolatedAsyncioTestCase): """A basic class that enables testing asyncio functionalities.""" - def test_start_and_shutdown(self) -> None: - """Start Simulation Engine Fleet and terminate it.""" - start_and_shutdown(num_supernodes=50) + def test_erroneous_no_supernodes_client_mapping(self) -> None: + """Test with unset arguments.""" + with self.assertRaises(ValueError): + 
start_and_shutdown() + + # def test_erroneous_clientapp_module_name(self) -> None: + # """Tests attempt to load a ClientApp that can't be found.""" + # from flwr.client.clientapp import LoadClientAppError + # num_messages = 7 + # num_nodes = 59 + + # # Register a state and a run_id in it + # run_id = 1234 + # state_factory = StateFactory(":flwr-in-memory-state:") + + # # Register a few nodes + # nodes_mapping = _register_nodes( + # num_nodes=num_nodes, state_factory=state_factory + # ) + + # _ = register_messages_into_state( + # state_factory=state_factory, + # nodes_mapping=nodes_mapping, + # run_id=run_id, + # num_messages=num_messages, + # ) + # with self.assertRaises(LoadClientAppError): + # start_and_shutdown( + # clientapp_module="totally_fictitious_app:client", + # state_factory=state_factory, + # nodes_mapping=nodes_mapping, + # ) + + def test_erroneous_backend_config(self) -> None: + """Backend Config should be a JSON stream.""" + with self.assertRaises(JSONDecodeError): + start_and_shutdown(num_supernodes=50, backend_config="not a proper config") def test_with_nonexistent_backend(self) -> None: """Test specifying a backend that does not exist.""" @@ -122,8 +209,8 @@ def test_erroneous_arguments_num_supernodes_and_existing_mapping(self) -> None: Passing `num_supernodes` does nothing since we assume that if a node mapping is supplied, nodes have been registered externally already. Therefore passing - `num_supernodes` might give the impression that that many nodes will be registered. - We don't do that since a mapping already exists. + `num_supernodes` might give the impression that that many nodes will be + registered. We don't do that since a mapping already exists. 
""" with self.assertRaises(ValueError): start_and_shutdown(num_supernodes=50, nodes_mapping={0: 1}) @@ -137,6 +224,10 @@ def test_erroneous_arguments_existing_mapping_but_no_state_factory(self) -> None with self.assertRaises(ValueError): start_and_shutdown(nodes_mapping={0: 1}) + def test_start_and_shutdown(self) -> None: + """Start Simulation Engine Fleet and terminate it.""" + start_and_shutdown(num_supernodes=50) + # pylint: disable=too-many-locals def test_start_and_shutdown_with_tasks_in_state(self) -> None: """Run Simulation Engine with some TasksIns in State. @@ -151,54 +242,25 @@ def test_start_and_shutdown_with_tasks_in_state(self) -> None: # Register a state and a run_id in it run_id = 1234 state_factory = StateFactory(":flwr-in-memory-state:") - state: InMemoryState = state_factory.state() # type: ignore - state.run_ids.add(run_id) # Register a few nodes nodes_mapping = _register_nodes( num_nodes=num_nodes, state_factory=state_factory ) - # Artificially add TaskIns to state so they can be processed - # by the Simulation Engine logic - nodes_cycle = cycle( - nodes_mapping.keys() - ) # we have more messages than supernodes - task_ids: Set[UUID] = set() # so we can retrieve them later - expected_results = {} - for i in range(num_messages): - dst_node_id = next(nodes_cycle) - # Construct a Message - mult_factor = 2024 + i - getproperties_ins = GetPropertiesIns(config={"factor": mult_factor}) - recordset = getpropertiesins_to_recordset(getproperties_ins) - message = Message( - content=recordset, - metadata=Metadata( - run_id=run_id, - message_id="", - group_id="", - src_node_id=0, - dst_node_id=dst_node_id, # indicate destination node - reply_to_message="", - ttl="", - message_type=MESSAGE_TYPE_GET_PROPERTIES, - ), - ) - # Convert Message to TaskIns - taskins = message_to_taskins(message) - # Instert in state - task_id = state.store_task_ins(taskins) - if task_id: - # Add to UUID set - task_ids.add(task_id) - # Store expected output for check later on - 
expected_results[task_id] = mult_factor * pi + expected_results = register_messages_into_state( + state_factory=state_factory, + nodes_mapping=nodes_mapping, + run_id=run_id, + num_messages=num_messages, + ) # Run start_and_shutdown(state_factory=state_factory, nodes_mapping=nodes_mapping) # Get all TaskRes + state = state_factory.state() + task_ids = set(expected_results.keys()) task_res_list = state.get_task_res(task_ids=task_ids, limit=len(task_ids)) # Check results by first converting to Message From 5cd047eaf5a26b7c93b34815f47aacb4311992c3 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 11:42:59 +0000 Subject: [PATCH 055/103] minor update --- .../flwr/server/superlink/fleet/vce/backend/raybackend.py | 2 +- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 5c81501d62d..7494ea7c285 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -138,7 +138,7 @@ async def process_message( Return output message and updated context. 
""" - node_id = message.metadata.dst_node_id + node_id = message.metadata.partition_id # Submite a task to the pool future = await self.pool.submit( diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index c7f94a4c554..79d41a3de77 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -21,7 +21,7 @@ from logging import DEBUG, ERROR, INFO from typing import Callable, Dict, Optional -from flwr.client.clientapp import ClientApp, LoadClientAppError, load_client_app +from flwr.client.clientapp import ClientApp, load_client_app, LoadClientAppError from flwr.client.node_state import NodeState from flwr.common.logger import log from flwr.common.serde import message_from_taskins, message_to_taskres @@ -89,12 +89,12 @@ async def worker( except asyncio.CancelledError as e: log(DEBUG, "Async worker: %s", e) break - + except LoadClientAppError as app_ex: log(ERROR, "Async worker: %s", app_ex) log(ERROR, traceback.format_exc()) raise - + except Exception as ex: # pylint: disable=broad-exception-caught log(ERROR, ex) log(ERROR, traceback.format_exc()) From 4be09c29d4b6f15a968d0ce8d04b557d5812a533 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 11:44:51 +0000 Subject: [PATCH 056/103] handle loading of non-existing ClientApp --- .../superlink/fleet/vce/backend/raybackend.py | 42 ++++++--- .../fleet/vce/backend/raybackend_test.py | 85 ++++++++++++------- .../simulation/ray_transport/ray_actor.py | 5 +- 3 files changed, 87 insertions(+), 45 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index b29d76b239e..709680bdba0 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -15,10 +15,12 @@ """Ray backend for the Fleet API using the Simulation Engine.""" 
import pathlib -from logging import INFO +from logging import ERROR, INFO from typing import Callable, Dict, List, Tuple, Union -from flwr.client.clientapp import ClientApp +import ray + +from flwr.client.clientapp import ClientApp, LoadClientAppError from flwr.common.context import Context from flwr.common.logger import log from flwr.common.message import Message @@ -138,22 +140,34 @@ async def process_message( """ node_id = message.metadata.dst_node_id - # Submite a task to the pool - future = await self.pool.submit( - lambda a, a_fn, mssg, cid, state: a.run.remote(a_fn, mssg, cid, state), - (app, message, str(node_id), context), - ) + try: + # Submite a task to the pool + future = await self.pool.submit( + lambda a, a_fn, mssg, cid, state: a.run.remote(a_fn, mssg, cid, state), + (app, message, str(node_id), context), + ) - await future + await future - # Fetch result - ( - out_mssg, - updated_context, - ) = await self.pool.fetch_result_and_return_actor_to_pool(future) + # Fetch result + ( + out_mssg, + updated_context, + ) = await self.pool.fetch_result_and_return_actor_to_pool(future) - return out_mssg, updated_context + return out_mssg, updated_context + + except LoadClientAppError as load_ex: + log( + ERROR, + "An exception was raised when processing a message. 
Terminating %s", + self.__class__.__name__, + ) + await self.terminate() + raise load_ex async def terminate(self) -> None: """Terminate all actors in actor pool.""" await self.pool.terminate_all_actors() + ray.shutdown() + log(INFO, "Terminated %s", self.__class__.__name__) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index f0cca527ab9..3a9c7cd529b 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -19,10 +19,8 @@ from typing import Callable, Dict, Optional, Tuple, Union from unittest import IsolatedAsyncioTestCase -import ray - from flwr.client import Client, NumPyClient -from flwr.client.clientapp import ClientApp +from flwr.client.clientapp import ClientApp, LoadClientAppError, load_client_app from flwr.common import ( Config, ConfigsRecord, @@ -60,6 +58,14 @@ def _load_app() -> ClientApp: return ClientApp(client_fn=get_dummy_client) +def _load_from_module(client_app_module_name: str) -> Callable[[], ClientApp]: + def _load_app() -> ClientApp: + app: ClientApp = load_client_app(client_app_module_name) + return app + + return _load_app + + async def backend_build_process_and_termination( backend: RayBackend, process_args: Optional[Tuple[Callable[[], ClientApp], Message, Context]] = None, @@ -73,11 +79,38 @@ async def backend_build_process_and_termination( await backend.terminate() - ray.shutdown() - return to_return +def _create_message_and_context() -> Tuple[Message, Context, float]: + + # Construct a Message + mult_factor = 2024 + getproperties_ins = GetPropertiesIns(config={"factor": mult_factor}) + recordset = getpropertiesins_to_recordset(getproperties_ins) + message = Message( + content=recordset, + metadata=Metadata( + run_id=0, + message_id="", + group_id="", + src_node_id=0, + dst_node_id=0, + reply_to_message="", + ttl="", + 
message_type=MESSAGE_TYPE_GET_PROPERTIES, + ), + ) + + # Construct emtpy Context + context = Context(state=RecordSet()) + + # Expected output + expected_output = pi * mult_factor + + return message, context, expected_output + + class AsyncTestRayBackend(IsolatedAsyncioTestCase): """A basic class that allows runnig multliple asyncio tests.""" @@ -88,33 +121,16 @@ def test_backend_creation_and_termination(self) -> None: backend_build_process_and_termination(backend=backend, process_args=None) ) - def test_backend_creation_submit_and_termination(self) -> None: - """Test submit.""" + def test_backend_creation_submit_and_termination( + self, client_app_loader: Callable[[], ClientApp] = _load_app + ) -> None: + """Test submitting a message to a given ClientApp.""" backend = RayBackend(backend_config={}, work_dir="") # Define ClientApp - client_app_callable = _load_app - - # Construct a Message - mult_factor = 2024 - getproperties_ins = GetPropertiesIns(config={"factor": mult_factor}) - recordset = getpropertiesins_to_recordset(getproperties_ins) - message = Message( - content=recordset, - metadata=Metadata( - run_id=0, - message_id="", - group_id="", - src_node_id=0, - dst_node_id=0, - reply_to_message="", - ttl="", - message_type=MESSAGE_TYPE_GET_PROPERTIES, - ), - ) + client_app_callable = client_app_loader - # Construct emtpy Context - context = Context(state=RecordSet()) + message, context, expected_output = _create_message_and_context() res = asyncio.run( backend_build_process_and_termination( @@ -131,11 +147,20 @@ def test_backend_creation_submit_and_termination(self) -> None: content = out_mssg.content assert ( content.configs_records["getpropertiesres.properties"]["result"] - == pi * mult_factor + == expected_output ) # Verify context is correct obtained_result_in_context = updated_context.state.configs_records["result"][ "result" ] - assert obtained_result_in_context == pi * mult_factor + assert obtained_result_in_context == expected_output + + def 
test_backend_creation_submit_and_termination_non_existent_client_app( + self, + ) -> None: + """Testing with ClientApp module that does not exist.""" + with self.assertRaises(LoadClientAppError): + self.test_backend_creation_submit_and_termination( + client_app_loader=_load_from_module("a_non_existing_module:app") + ) diff --git a/src/py/flwr/simulation/ray_transport/ray_actor.py b/src/py/flwr/simulation/ray_transport/ray_actor.py index 5ac0b2c2748..482506df94b 100644 --- a/src/py/flwr/simulation/ray_transport/ray_actor.py +++ b/src/py/flwr/simulation/ray_transport/ray_actor.py @@ -25,7 +25,7 @@ from ray import ObjectRef from ray.util.actor_pool import ActorPool -from flwr.client.clientapp import ClientApp +from flwr.client.clientapp import ClientApp, LoadClientAppError from flwr.common import Context, Message from flwr.common.logger import log @@ -67,6 +67,9 @@ def run( # Handle task message out_message = app(message=message, context=context) + except LoadClientAppError as load_ex: + raise load_ex + except Exception as ex: client_trace = traceback.format_exc() mssg = ( From 3c616e9df491c7e64feb72392257f2b314dee104 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 13:25:29 +0000 Subject: [PATCH 057/103] better tests; reorg --- .../superlink/fleet/vce/backend/raybackend.py | 3 + .../fleet/vce/backend/raybackend_test.py | 74 ++++++++++--------- 2 files changed, 41 insertions(+), 36 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 709680bdba0..7f885e2cfa7 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -48,6 +48,9 @@ def __init__( log(INFO, "Initialising: %s", self.__class__.__name__) log(INFO, "Backend config: %s", backend_config) + if not pathlib.Path(work_dir).exists(): + raise ValueError(f"Specified work_dir {work_dir} does not exist.") + # Init ray 
and append working dir if needed runtime_env = ( self._configure_runtime_env(work_dir=work_dir) if work_dir else None diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index 3a9c7cd529b..92ca60db230 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -16,46 +16,17 @@ import asyncio from math import pi -from typing import Callable, Dict, Optional, Tuple, Union +from pathlib import Path +from typing import Callable, Optional, Tuple, Union from unittest import IsolatedAsyncioTestCase -from flwr.client import Client, NumPyClient from flwr.client.clientapp import ClientApp, LoadClientAppError, load_client_app -from flwr.common import ( - Config, - ConfigsRecord, - Context, - GetPropertiesIns, - Message, - Metadata, - RecordSet, - Scalar, -) +from flwr.common import Context, GetPropertiesIns, Message, Metadata, RecordSet from flwr.common.constant import MESSAGE_TYPE_GET_PROPERTIES from flwr.common.recordset_compat import getpropertiesins_to_recordset from .raybackend import RayBackend - - -class DummyClient(NumPyClient): - """A dummy NumPyClient for tests.""" - - def get_properties(self, config: Config) -> Dict[str, Scalar]: - """Return properties by doing a simple calculation.""" - result = float(config["factor"]) * pi - - # store something in context - self.context.state.configs_records["result"] = ConfigsRecord({"result": result}) - return {"result": result} - - -def get_dummy_client(cid: str) -> Client: # pylint: disable=unused-argument - """Return a DummyClient converted to Client type.""" - return DummyClient().to_client() - - -def _load_app() -> ClientApp: - return ClientApp(client_fn=get_dummy_client) +from .test.client import _load_app def _load_from_module(client_app_module_name: str) -> Callable[[], ClientApp]: @@ -122,10 +93,12 @@ def 
test_backend_creation_and_termination(self) -> None: ) def test_backend_creation_submit_and_termination( - self, client_app_loader: Callable[[], ClientApp] = _load_app + self, + client_app_loader: Callable[[], ClientApp] = _load_app, + workdir: str = "", ) -> None: """Test submitting a message to a given ClientApp.""" - backend = RayBackend(backend_config={}, work_dir="") + backend = RayBackend(backend_config={}, work_dir=workdir) # Define ClientApp client_app_callable = client_app_loader @@ -156,7 +129,7 @@ def test_backend_creation_submit_and_termination( ] assert obtained_result_in_context == expected_output - def test_backend_creation_submit_and_termination_non_existent_client_app( + def test_backend_creation_submit_and_termination_non_existing_client_app( self, ) -> None: """Testing with ClientApp module that does not exist.""" @@ -164,3 +137,32 @@ def test_backend_creation_submit_and_termination_non_existent_client_app( self.test_backend_creation_submit_and_termination( client_app_loader=_load_from_module("a_non_existing_module:app") ) + + def test_backend_creation_submit_and_termination_existing_client_app( + self, + ) -> None: + """Testing with ClientApp module that exist.""" + # Resolve what should be the workdir to pass upon Backend initialisation + file_path = Path(__file__) + print(f"{file_path = }") + working_dir = Path.cwd() + print(f"{working_dir = }") + rel_workdir = file_path.relative_to(working_dir) + + # Susbtract lats element and append "test" (to make it point ot .test dir) + rel_workdir_str = str(rel_workdir.parent / "test") + + self.test_backend_creation_submit_and_termination( + client_app_loader=_load_from_module("client:client_app"), + workdir=rel_workdir_str, + ) + + def test_backend_creation_submit_and_termination_existing_client_app_unsetworkdir( + self, + ) -> None: + """Testing with ClientApp module that exist but the passed workdir does not.""" + with self.assertRaises(ValueError): + self.test_backend_creation_submit_and_termination( 
+ client_app_loader=_load_from_module("test.client:client_app"), + workdir="/?&%$^#%@$!", + ) From aed442041ac40c221daae98cc81bdeacb59eeba4 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 14:36:14 +0000 Subject: [PATCH 058/103] update --- .../superlink/fleet/vce/vce_api_test.py | 55 +++++++------------ 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py index c71c33c1a96..26ea5d52905 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -20,21 +20,14 @@ from itertools import cycle from json import JSONDecodeError from math import pi +from pathlib import Path from time import sleep from typing import Dict, Optional, Set from unittest import IsolatedAsyncioTestCase from uuid import UUID -from flwr.client import Client, NumPyClient -from flwr.client.clientapp import ClientApp, LoadClientAppError -from flwr.common import ( - Config, - ConfigsRecord, - GetPropertiesIns, - Message, - Metadata, - Scalar, -) +from flwr.client.clientapp import LoadClientAppError +from flwr.common import GetPropertiesIns, Message, Metadata from flwr.common.constant import MESSAGE_TYPE_GET_PROPERTIES from flwr.common.recordset_compat import getpropertiesins_to_recordset from flwr.common.serde import message_from_taskres, message_to_taskins @@ -46,28 +39,6 @@ from flwr.server.superlink.state import InMemoryState, StateFactory -class DummyClient(NumPyClient): - """A dummy NumPyClient for tests.""" - - def get_properties(self, config: Config) -> Dict[str, Scalar]: - """Return properties by doing a simple calculation.""" - result = float(config["factor"]) * pi - - # store something in context - self.context.state.configs_records["result"] = ConfigsRecord({"result": result}) - return {"result": result} - - -def get_dummy_client(cid: str) -> Client: # pylint: disable=unused-argument - 
"""Return a DummyClient converted to Client type.""" - return DummyClient().to_client() - - -client_app = ClientApp( - client_fn=get_dummy_client, -) - - def terminate_simulation(f_stop: asyncio.Event, sleep_duration: int) -> None: """Set event to terminate Simulation Engine after `sleep_duration` seconds.""" sleep(sleep_duration) @@ -121,10 +92,21 @@ def register_messages_into_state( return expected_results +def _autoresolve_working_dir(rel_client_app_dir: str = "backend/test") -> str: + """Correctly resolve working directory.""" + file_path = Path(__file__) + working_dir = Path.cwd() + rel_workdir = file_path.relative_to(working_dir) + + # Susbtract lats element and append "backend/test" (wher the client module is.) + return str(rel_workdir.parent / rel_client_app_dir) + + # pylint: disable=too-many-arguments def start_and_shutdown( backend: str = "ray", - clientapp_module: str = "vce_api_test:client_app", + clientapp_module: str = "client:client_app", + working_dir: str = "", num_supernodes: Optional[int] = None, state_factory: Optional[StateFactory] = None, nodes_mapping: Optional[NodeToPartitionMapping] = None, @@ -141,13 +123,18 @@ def start_and_shutdown( ) termination_th.start() + # Resolve working directory if not passed + if not working_dir: + working_dir = _autoresolve_working_dir() + print(f"---> {working_dir = }") + start_vce( num_supernodes=num_supernodes, client_app_module_name=clientapp_module, backend_name=backend, backend_config_json_stream=backend_config, state_factory=state_factory, - working_dir="", + working_dir=working_dir, f_stop=f_stop, existing_nodes_mapping=nodes_mapping, ) From 96519dc164fddd18fb93dff90ed4c4f32988cb90 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 14:36:56 +0000 Subject: [PATCH 059/103] w/ previous --- .../fleet/vce/backend/raybackend_test.py | 2 - .../fleet/vce/backend/test/__init__.py | 15 ++++++ .../fleet/vce/backend/test/client.py | 48 +++++++++++++++++++ 3 files changed, 63 insertions(+), 2 
deletions(-) create mode 100644 src/py/flwr/server/superlink/fleet/vce/backend/test/__init__.py create mode 100644 src/py/flwr/server/superlink/fleet/vce/backend/test/client.py diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index 92ca60db230..24dfb0fd120 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -144,9 +144,7 @@ def test_backend_creation_submit_and_termination_existing_client_app( """Testing with ClientApp module that exist.""" # Resolve what should be the workdir to pass upon Backend initialisation file_path = Path(__file__) - print(f"{file_path = }") working_dir = Path.cwd() - print(f"{working_dir = }") rel_workdir = file_path.relative_to(working_dir) # Susbtract lats element and append "test" (to make it point ot .test dir) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/test/__init__.py b/src/py/flwr/server/superlink/fleet/vce/backend/test/__init__.py new file mode 100644 index 00000000000..96bab3a5c6f --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/backend/test/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Components for Simulation Engine tests.""" diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/test/client.py b/src/py/flwr/server/superlink/fleet/vce/backend/test/client.py new file mode 100644 index 00000000000..4d0cdf6e2a7 --- /dev/null +++ b/src/py/flwr/server/superlink/fleet/vce/backend/test/client.py @@ -0,0 +1,48 @@ +# Copyright 2024 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""A ClientApp for Backend tests.""" + +from math import pi +from typing import Dict + +from flwr.client import Client, NumPyClient +from flwr.client.clientapp import ClientApp +from flwr.common import Config, ConfigsRecord, Scalar + + +class DummyClient(NumPyClient): + """A dummy NumPyClient for tests.""" + + def get_properties(self, config: Config) -> Dict[str, Scalar]: + """Return properties by doing a simple calculation.""" + result = float(config["factor"]) * pi + + # store something in context + self.context.state.configs_records["result"] = ConfigsRecord({"result": result}) + return {"result": result} + + +def get_dummy_client(cid: str) -> Client: # pylint: disable=unused-argument + """Return a DummyClient converted to Client type.""" + return DummyClient().to_client() + + +def _load_app() -> ClientApp: + return ClientApp(client_fn=get_dummy_client) + + +client_app = ClientApp( + client_fn=get_dummy_client, +) From 1aa3b364e19eeb8f924698b153eddd3efe31a8b8 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 15:02:50 +0000 Subject: [PATCH 060/103] post merge update --- src/py/flwr/simulation/run_simulation.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index e15807adeb3..7e9af626f86 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -48,15 +48,15 @@ def run_simulation() -> None: f_stop = asyncio.Event() superlink_th = threading.Thread( target=start_vce, - args=( - args.num_supernodes, - args.client_app, - args.backend, - args.backend_config, - state_factory, - args.dir, - f_stop, - ), + kwargs={ + "num_supernodes": args.num_supernodes, + "client_app_module_name": args.client_app, + "backend_name": args.backend, + "backend_config_json_stream": args.backend_config, + "working_dir": args.dir, + 
"state_factory": state_factory, + "f_stop": f_stop, + }, daemon=False, ) From 21f03a93fcbca2f9c6cde9f5368ae1292389bcb6 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 15:05:06 +0000 Subject: [PATCH 061/103] fix --- src/py/flwr/cli/new/new_test.py | 34 +++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/src/py/flwr/cli/new/new_test.py b/src/py/flwr/cli/new/new_test.py index 39717bc67ab..7a4832013b0 100644 --- a/src/py/flwr/cli/new/new_test.py +++ b/src/py/flwr/cli/new/new_test.py @@ -77,17 +77,23 @@ def test_new(tmp_path: str) -> None: "client.py", } - # Change into the temprorary directory - os.chdir(tmp_path) - - # Execute - new(project_name=project_name, framework=framework) - - # Assert - file_list = os.listdir(os.path.join(tmp_path, project_name.lower())) - assert set(file_list) == expected_files_top_level - - file_list = os.listdir( - os.path.join(tmp_path, project_name.lower(), project_name.lower()) - ) - assert set(file_list) == expected_files_module + # Current directory + origin = os.getcwd() + + try: + # Change into the temprorary directory + os.chdir(tmp_path) + + # Execute + new(project_name=project_name, framework=framework) + + # Assert + file_list = os.listdir(os.path.join(tmp_path, project_name.lower())) + assert set(file_list) == expected_files_top_level + + file_list = os.listdir( + os.path.join(tmp_path, project_name.lower(), project_name.lower()) + ) + assert set(file_list) == expected_files_module + finally: + os.chdir(origin) From 4d8ee734c3f99a4b05fb13941237d437a46286ff Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 15:57:55 +0000 Subject: [PATCH 062/103] minor tweak --- src/py/flwr/simulation/run_simulation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 7e9af626f86..ee886877c09 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ 
b/src/py/flwr/simulation/run_simulation.py @@ -38,8 +38,9 @@ def run_simulation() -> None: state_factory = StateFactory(":flwr-in-memory-state:") # Start Driver API + driver_address = "0.0.0.0:9098" driver_server: grpc.Server = _run_driver_api_grpc( - address="0.0.0.0:9091", + address=driver_address, state_factory=state_factory, certificates=None, ) @@ -65,7 +66,7 @@ def run_simulation() -> None: # Initialize Driver driver = Driver( - driver_service_address="0.0.0.0:9091", + driver_service_address=driver_address, root_certificates=None, ) From 1ac8e2b99708927cf30e3ebcf08fba1b84140a13 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 16:43:21 +0000 Subject: [PATCH 063/103] runs passing server-app; client-app modules --- examples/simulation-pytorch/sim.py | 52 ++++++++++--------- src/py/flwr/server/run_serverapp.py | 14 ++++- .../server/superlink/fleet/vce/vce_api.py | 13 ++++- src/py/flwr/simulation/run_simulation.py | 43 +++++++++++---- 4 files changed, 85 insertions(+), 37 deletions(-) diff --git a/examples/simulation-pytorch/sim.py b/examples/simulation-pytorch/sim.py index 84a00e3f092..9bad47bb9b0 100644 --- a/examples/simulation-pytorch/sim.py +++ b/examples/simulation-pytorch/sim.py @@ -193,28 +193,30 @@ def evaluate( ) -def main(): - # Parse input arguments - args = parser.parse_args() - - # Resources to be assigned to each virtual client - client_resources = { - "num_cpus": args.num_cpus, - "num_gpus": args.num_gpus, - } - - # Start simulation - fl.simulation.start_simulation( - client_fn=get_client_fn(mnist_fds), - num_clients=NUM_CLIENTS, - client_resources=client_resources, - config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), - strategy=strategy, - actor_kwargs={ - "on_actor_init_fn": disable_progress_bar # disable tqdm on each actor/process spawning virtual clients - }, - ) - - -if __name__ == "__main__": - main() +fl.simulation.run_simulation(server_app=server_app, client_app=client_app, num_supernodes=NUM_CLIENTS) + +# def main(): +# # 
Parse input arguments +# args = parser.parse_args() + +# # Resources to be assigned to each virtual client +# client_resources = { +# "num_cpus": args.num_cpus, +# "num_gpus": args.num_gpus, +# } + +# # Start simulation +# fl.simulation.start_simulation( +# client_fn=get_client_fn(mnist_fds), +# num_clients=NUM_CLIENTS, +# client_resources=client_resources, +# config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), +# strategy=strategy, +# actor_kwargs={ +# "on_actor_init_fn": disable_progress_bar # disable tqdm on each actor/process spawning virtual clients +# }, +# ) + + +# if __name__ == "__main__": +# main() diff --git a/src/py/flwr/server/run_serverapp.py b/src/py/flwr/server/run_serverapp.py index e7205ebd144..c6710918448 100644 --- a/src/py/flwr/server/run_serverapp.py +++ b/src/py/flwr/server/run_serverapp.py @@ -19,6 +19,7 @@ import sys from logging import DEBUG, WARN from pathlib import Path +from typing import Optional from flwr.common import Context, EventType, RecordSet, event from flwr.common.logger import log @@ -27,13 +28,22 @@ from .server_app import ServerApp, load_server_app -def run(server_app_attr: str, driver: Driver, server_app_dir: str) -> None: +def run( + server_app_attr: str, + driver: Driver, + server_app_dir: str, + loaded_server_app: Optional[ServerApp] = None, +) -> None: """Run ServerApp with a given Driver.""" if server_app_dir is not None: sys.path.insert(0, server_app_dir) def _load() -> ServerApp: - server_app: ServerApp = load_server_app(server_app_attr) + server_app: ServerApp = ( + load_server_app(server_app_attr) + if loaded_server_app is None + else loaded_server_app + ) return server_app server_app = _load() diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index c7f94a4c554..11eeab542f1 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -167,11 +167,18 @@ def start_vce( 
backend_config_json_stream: str, working_dir: str, f_stop: asyncio.Event, + client_app: Optional[ClientApp] = None, num_supernodes: Optional[int] = None, state_factory: Optional[StateFactory] = None, existing_nodes_mapping: Optional[NodeToPartitionMapping] = None, ) -> None: """Start Fleet API with the Simulation Engine.""" + if client_app_module_name is not None and client_app is not None: + raise ValueError( + "Both `client_app_module_name` and `client_app` are provided, " + "but only one is allowed." + ) + if num_supernodes is not None and existing_nodes_mapping is not None: raise ValueError( "Both `num_supernodes` and `existing_nodes_mapping` are provided, " @@ -234,7 +241,11 @@ def start_vce( log(INFO, "client_app_module_name = %s", client_app_module_name) def _load() -> ClientApp: - app: ClientApp = load_client_app(client_app_module_name) + app: ClientApp = ( + load_client_app(client_app_module_name) + if client_app is None + else client_app + ) return app app = _load diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index ee886877c09..8b5eb50e8af 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -17,23 +17,47 @@ import argparse import asyncio import threading +from typing import Optional import grpc +from flwr.client import ClientApp from flwr.common import EventType, event from flwr.server.driver.driver import Driver from flwr.server.run_serverapp import run +from flwr.server.server_app import ServerApp from flwr.server.superlink.state import StateFactory -def run_simulation() -> None: +def run_from_cli() -> None: + """.""" + args = _parse_args_run_simulation().parse_args() + + run_simulation( + num_supernodes=args.num_supernodes, + client_app_module_name=args.client_app, + backend_name=args.backend, + backend_config_json_stream=args.backend_config, + working_dir=args.dir, + server_app_module_name=args.server_app, + ) + + +def run_simulation( + 
num_supernodes: int, + server_app: Optional[ServerApp] = None, + client_app: Optional[ClientApp] = None, + backend_name: str = "ray", + backend_config: str = "{}", + client_app_module_name: Optional[str] = None, + server_app_module_name: Optional[str] = None, + working_dir: str = "", +) -> None: """.""" # TODO: below create circular imports from flwr.server.app import _register_exit_handlers, _run_driver_api_grpc from flwr.server.superlink.fleet.vce import start_vce - args = _parse_args_run_simulation().parse_args() - # Initialize StateFactory state_factory = StateFactory(":flwr-in-memory-state:") @@ -50,11 +74,12 @@ def run_simulation() -> None: superlink_th = threading.Thread( target=start_vce, kwargs={ - "num_supernodes": args.num_supernodes, - "client_app_module_name": args.client_app, - "backend_name": args.backend, - "backend_config_json_stream": args.backend_config, - "working_dir": args.dir, + "num_supernodes": num_supernodes, + "client_app_module_name": client_app_module_name, + "client_app": client_app, + "backend_name": backend_name, + "backend_config_json_stream": backend_config, + "working_dir": working_dir, "state_factory": state_factory, "f_stop": f_stop, }, @@ -71,7 +96,7 @@ def run_simulation() -> None: ) # Launch server app - run(args.server_app, driver, args.dir) + run(server_app_module_name, driver, working_dir, loaded_server_app=server_app) del driver From c36660427469850d43eeb2f88c48f2798703bd51 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 16:45:11 +0000 Subject: [PATCH 064/103] w/ previous --- src/py/flwr/simulation/__init__.py | 4 ++-- src/py/flwr/simulation/run_simulation.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/py/flwr/simulation/__init__.py b/src/py/flwr/simulation/__init__.py index b283de70c58..764127465cb 100644 --- a/src/py/flwr/simulation/__init__.py +++ b/src/py/flwr/simulation/__init__.py @@ -17,7 +17,7 @@ import importlib -from flwr.simulation.run_simulation import 
run_simulation +from flwr.simulation.run_simulation import run_simulation_from_cli is_ray_installed = importlib.util.find_spec("ray") is not None @@ -38,5 +38,5 @@ def start_simulation(*args, **kwargs): # type: ignore __all__ = [ "start_simulation", - "run_simulation", + "run_simulation_from_cli", ] diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 8b5eb50e8af..319c85e7798 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -29,7 +29,7 @@ from flwr.server.superlink.state import StateFactory -def run_from_cli() -> None: +def run_simulation_from_cli() -> None: """.""" args = _parse_args_run_simulation().parse_args() @@ -37,7 +37,7 @@ def run_from_cli() -> None: num_supernodes=args.num_supernodes, client_app_module_name=args.client_app, backend_name=args.backend, - backend_config_json_stream=args.backend_config, + backend_config=args.backend_config, working_dir=args.dir, server_app_module_name=args.server_app, ) From a62a0d13f39f1a05a32e03e92bf577967896d5f1 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 16:55:51 +0000 Subject: [PATCH 065/103] wip --- examples/simulation-pytorch/sim.ipynb | 51 ++++++++++++++---------- pyproject.toml | 2 +- src/py/flwr/simulation/__init__.py | 3 +- src/py/flwr/simulation/run_simulation.py | 2 +- 4 files changed, 35 insertions(+), 23 deletions(-) diff --git a/examples/simulation-pytorch/sim.ipynb b/examples/simulation-pytorch/sim.ipynb index 93a79d2f0e0..b2c2d9c0580 100644 --- a/examples/simulation-pytorch/sim.ipynb +++ b/examples/simulation-pytorch/sim.ipynb @@ -511,10 +511,7 @@ " # Create and return client\n", " return FlowerClient(trainloader, valloader).to_client()\n", "\n", - " return client_fn\n", - "\n", - "\n", - "client_fn_callback = get_client_fn(mnist_fds)" + " return client_fn" ] }, { @@ -536,25 +533,27 @@ }, "outputs": [], "source": [ - "# With a dictionary, you tell Flower's VirtualClientEngine that 
each\n", - "# client needs exclusive access to these many resources in order to run\n", - "client_resources = {\"num_cpus\": 1, \"num_gpus\": 0.0}\n", - "\n", - "# Let's disable tqdm progress bar in the main thread (used by the server)\n", - "disable_progress_bar()\n", - "\n", - "history = fl.simulation.start_simulation(\n", - " client_fn=client_fn_callback, # a callback to construct a client\n", - " num_clients=NUM_CLIENTS, # total number of clients in the experiment\n", - " config=fl.server.ServerConfig(num_rounds=10), # let's run for 10 rounds\n", - " strategy=strategy, # the strategy that will orchestrate the whole FL pipeline\n", - " client_resources=client_resources,\n", - " actor_kwargs={\n", - " \"on_actor_init_fn\": disable_progress_bar # disable tqdm on each actor/process spawning virtual clients\n", - " },\n", + "# ClientApp for Flower-Next\n", + "client_app = fl.client.ClientApp(\n", + " client_fn=get_client_fn(mnist_fds),\n", + ")\n", + "\n", + "# ServerApp for Flower-Next\n", + "server_app = fl.server.ServerApp(\n", + " config=fl.server.ServerConfig(num_rounds=10),\n", + " strategy=strategy,\n", ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fl.simulation.run_simulation(server_app=server_app, client_app=client_app, num_supernodes=NUM_CLIENTS)" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -622,6 +621,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index 743670c6419..b45f960063d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,7 @@ flower-fleet-api = "flwr.server:run_fleet_api" flower-superlink = 
"flwr.server:run_superlink" flower-client-app = "flwr.client:run_client_app" flower-server-app = "flwr.server:run_server_app" -flower-simulation = "flwr.simulation:run_simulation" +flower-simulation = "flwr.simulation:run_simulation_from_cli" [tool.poetry.dependencies] python = "^3.8" diff --git a/src/py/flwr/simulation/__init__.py b/src/py/flwr/simulation/__init__.py index 764127465cb..af87232f15d 100644 --- a/src/py/flwr/simulation/__init__.py +++ b/src/py/flwr/simulation/__init__.py @@ -17,7 +17,7 @@ import importlib -from flwr.simulation.run_simulation import run_simulation_from_cli +from flwr.simulation.run_simulation import run_simulation_from_cli, run_simulation is_ray_installed = importlib.util.find_spec("ray") is not None @@ -39,4 +39,5 @@ def start_simulation(*args, **kwargs): # type: ignore __all__ = [ "start_simulation", "run_simulation_from_cli", + "run_simulation" ] diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 319c85e7798..6568e05e5d4 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -62,7 +62,7 @@ def run_simulation( state_factory = StateFactory(":flwr-in-memory-state:") # Start Driver API - driver_address = "0.0.0.0:9098" + driver_address = "0.0.0.0:9091" driver_server: grpc.Server = _run_driver_api_grpc( address=driver_address, state_factory=state_factory, From c45c4afabd519ff588013b914e796270b0cdeef9 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 19:18:52 +0000 Subject: [PATCH 066/103] no need for separate test/ dir --- .../fleet/vce/backend/raybackend_test.py | 50 ++++++++++++++++--- .../fleet/vce/backend/test/__init__.py | 15 ------ .../fleet/vce/backend/test/client.py | 48 ------------------ 3 files changed, 42 insertions(+), 71 deletions(-) delete mode 100644 src/py/flwr/server/superlink/fleet/vce/backend/test/__init__.py delete mode 100644 src/py/flwr/server/superlink/fleet/vce/backend/test/client.py diff --git 
a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index 24dfb0fd120..8ac9df35d45 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -17,16 +17,50 @@ import asyncio from math import pi from pathlib import Path -from typing import Callable, Optional, Tuple, Union +from typing import Callable, Dict, Optional, Tuple, Union from unittest import IsolatedAsyncioTestCase +from flwr.client import Client, NumPyClient from flwr.client.clientapp import ClientApp, LoadClientAppError, load_client_app -from flwr.common import Context, GetPropertiesIns, Message, Metadata, RecordSet +from flwr.common import ( + Config, + ConfigsRecord, + Context, + GetPropertiesIns, + Message, + Metadata, + RecordSet, + Scalar, +) from flwr.common.constant import MESSAGE_TYPE_GET_PROPERTIES from flwr.common.recordset_compat import getpropertiesins_to_recordset +from flwr.server.superlink.fleet.vce.backend.raybackend import RayBackend -from .raybackend import RayBackend -from .test.client import _load_app + +class DummyClient(NumPyClient): + """A dummy NumPyClient for tests.""" + + def get_properties(self, config: Config) -> Dict[str, Scalar]: + """Return properties by doing a simple calculation.""" + result = float(config["factor"]) * pi + + # store something in context + self.context.state.configs_records["result"] = ConfigsRecord({"result": result}) + return {"result": result} + + +def get_dummy_client(cid: str) -> Client: # pylint: disable=unused-argument + """Return a DummyClient converted to Client type.""" + return DummyClient().to_client() + + +def _load_app() -> ClientApp: + return ClientApp(client_fn=get_dummy_client) + + +client_app = ClientApp( + client_fn=get_dummy_client, +) def _load_from_module(client_app_module_name: str) -> Callable[[], ClientApp]: @@ -147,11 +181,11 @@ def 
test_backend_creation_submit_and_termination_existing_client_app( working_dir = Path.cwd() rel_workdir = file_path.relative_to(working_dir) - # Susbtract lats element and append "test" (to make it point ot .test dir) - rel_workdir_str = str(rel_workdir.parent / "test") + # Susbtract last element + rel_workdir_str = str(rel_workdir.parent) self.test_backend_creation_submit_and_termination( - client_app_loader=_load_from_module("client:client_app"), + client_app_loader=_load_from_module("raybackend_test:client_app"), workdir=rel_workdir_str, ) @@ -161,6 +195,6 @@ def test_backend_creation_submit_and_termination_existing_client_app_unsetworkdi """Testing with ClientApp module that exist but the passed workdir does not.""" with self.assertRaises(ValueError): self.test_backend_creation_submit_and_termination( - client_app_loader=_load_from_module("test.client:client_app"), + client_app_loader=_load_from_module("raybackend_test:client_app"), workdir="/?&%$^#%@$!", ) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/test/__init__.py b/src/py/flwr/server/superlink/fleet/vce/backend/test/__init__.py deleted file mode 100644 index 96bab3a5c6f..00000000000 --- a/src/py/flwr/server/superlink/fleet/vce/backend/test/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright 2024 Flower Labs GmbH. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Components for Simulation Engine tests.""" diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/test/client.py b/src/py/flwr/server/superlink/fleet/vce/backend/test/client.py deleted file mode 100644 index 4d0cdf6e2a7..00000000000 --- a/src/py/flwr/server/superlink/fleet/vce/backend/test/client.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2024 Flower Labs GmbH. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""A ClientApp for Backend tests.""" - -from math import pi -from typing import Dict - -from flwr.client import Client, NumPyClient -from flwr.client.clientapp import ClientApp -from flwr.common import Config, ConfigsRecord, Scalar - - -class DummyClient(NumPyClient): - """A dummy NumPyClient for tests.""" - - def get_properties(self, config: Config) -> Dict[str, Scalar]: - """Return properties by doing a simple calculation.""" - result = float(config["factor"]) * pi - - # store something in context - self.context.state.configs_records["result"] = ConfigsRecord({"result": result}) - return {"result": result} - - -def get_dummy_client(cid: str) -> Client: # pylint: disable=unused-argument - """Return a DummyClient converted to Client type.""" - return DummyClient().to_client() - - -def _load_app() -> ClientApp: - return ClientApp(client_fn=get_dummy_client) - - -client_app = ClientApp( - client_fn=get_dummy_client, -) From c9492f067d4acc507748c78a03e08b2bf99cf143 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 19:36:31 +0000 Subject: [PATCH 067/103] update --- src/py/flwr/server/superlink/fleet/vce/vce_api_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py index 26ea5d52905..d345cf7bb7e 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -92,7 +92,7 @@ def register_messages_into_state( return expected_results -def _autoresolve_working_dir(rel_client_app_dir: str = "backend/test") -> str: +def _autoresolve_working_dir(rel_client_app_dir: str = "backend") -> str: """Correctly resolve working directory.""" file_path = Path(__file__) working_dir = Path.cwd() @@ -105,7 +105,7 @@ def _autoresolve_working_dir(rel_client_app_dir: str = "backend/test") -> 
str: # pylint: disable=too-many-arguments def start_and_shutdown( backend: str = "ray", - clientapp_module: str = "client:client_app", + clientapp_module: str = "raybackend_test:client_app", working_dir: str = "", num_supernodes: Optional[int] = None, state_factory: Optional[StateFactory] = None, @@ -126,7 +126,6 @@ def start_and_shutdown( # Resolve working directory if not passed if not working_dir: working_dir = _autoresolve_working_dir() - print(f"---> {working_dir = }") start_vce( num_supernodes=num_supernodes, @@ -220,7 +219,8 @@ def test_start_and_shutdown_with_tasks_in_state(self) -> None: This test creates a few nodes and submits a few messages that need to be executed by the Backend. In order for that to happen the asyncio - producer/consumer logic must function. + producer/consumer logic must function. This also severs to evaluate + a valid ClientApp. """ num_messages = 113 num_nodes = 59 From 82878f6df700de1030d0e6285c88b7e53bb46b23 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 19:59:20 +0000 Subject: [PATCH 068/103] updates --- src/py/flwr/simulation/run_simulation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index ee886877c09..6006fa57c82 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -21,15 +21,17 @@ import grpc from flwr.common import EventType, event +from flwr.common.exit_handlers import register_exit_handlers from flwr.server.driver.driver import Driver from flwr.server.run_serverapp import run + from flwr.server.superlink.state import StateFactory def run_simulation() -> None: """.""" # TODO: below create circular imports - from flwr.server.app import _register_exit_handlers, _run_driver_api_grpc + from flwr.server.app import _run_driver_api_grpc from flwr.server.superlink.fleet.vce import start_vce args = _parse_args_run_simulation().parse_args() @@ -78,7 
+80,7 @@ def run_simulation() -> None: # Trigger stop event f_stop.set() - _register_exit_handlers( + register_exit_handlers( grpc_servers=[driver_server], bckg_threads=[superlink_th], event_type=EventType.RUN_SUPERLINK_LEAVE, From b3d397b731efc6c0d629814fa261c8e506445a47 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 22:06:04 +0000 Subject: [PATCH 069/103] better handling of exceptions in vce's ; adjust test for --- .../superlink/fleet/vce/backend/raybackend.py | 3 +- .../fleet/vce/backend/raybackend_test.py | 9 ++ .../server/superlink/fleet/vce/vce_api.py | 100 +++++++++++++----- .../superlink/fleet/vce/vce_api_test.py | 9 +- 4 files changed, 89 insertions(+), 32 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 7eb21e3b20d..06a6fc72975 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -163,10 +163,9 @@ async def process_message( except LoadClientAppError as load_ex: log( ERROR, - "An exception was raised when processing a message. 
Terminating %s", + "An exception was raised when processing a message by %s", self.__class__.__name__, ) - await self.terminate() raise load_ex async def terminate(self) -> None: diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py index fd246b5fc2a..e14c466e7b8 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend_test.py @@ -20,6 +20,8 @@ from typing import Callable, Dict, Optional, Tuple, Union from unittest import IsolatedAsyncioTestCase +import ray + from flwr.client import Client, NumPyClient from flwr.client.client_app import ClientApp, LoadClientAppError, load_client_app from flwr.common import ( @@ -119,6 +121,11 @@ def _create_message_and_context() -> Tuple[Message, Context, float]: class AsyncTestRayBackend(IsolatedAsyncioTestCase): """A basic class that allows runnig multliple asyncio tests.""" + async def on_cleanup(self) -> None: + """Ensure Ray has shutdown.""" + if ray.is_initialized(): + ray.shutdown() + def test_backend_creation_and_termination(self) -> None: """Test creation of RayBackend and its termination.""" backend = RayBackend(backend_config={}, work_dir="") @@ -171,6 +178,7 @@ def test_backend_creation_submit_and_termination_non_existing_client_app( self.test_backend_creation_submit_and_termination( client_app_loader=_load_from_module("a_non_existing_module:app") ) + self.addAsyncCleanup(self.on_cleanup) def test_backend_creation_submit_and_termination_existing_client_app( self, @@ -198,3 +206,4 @@ def test_backend_creation_submit_and_termination_existing_client_app_unsetworkdi client_app_loader=_load_from_module("raybackend_test:client_app"), workdir="/?&%$^#%@$!", ) + self.addAsyncCleanup(self.on_cleanup) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 82dda285158..761712875cc 100644 
--- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -18,8 +18,8 @@ import asyncio import json import traceback -from logging import DEBUG, ERROR, INFO -from typing import Callable, Dict, Optional +from logging import DEBUG, ERROR, INFO, WARN +from typing import Callable, Dict, List, Optional from flwr.client.client_app import ClientApp, LoadClientAppError, load_client_app from flwr.client.node_state import NodeState @@ -101,21 +101,50 @@ async def worker( break -async def generate_pull_requests( +async def add_taskins_to_queue( queue: "asyncio.Queue[TaskIns]", state_factory: StateFactory, nodes_mapping: NodeToPartitionMapping, + backend: Backend, + consumers: List["asyncio.Task[None]"], f_stop: asyncio.Event, ) -> None: """Retrieve TaskIns and add it to the queue.""" state = state_factory.state() + num_initial_consumers = len(consumers) while not f_stop.is_set(): for node_id in nodes_mapping.keys(): task_ins = state.get_task_ins(node_id=node_id, limit=1) if task_ins: await queue.put(task_ins[0]) - log(DEBUG, "TaskIns in queue: %i", queue.qsize()) + # Count consumers that are running + num_active = sum(not (cc.done()) for cc in consumers) + + # Alert if number of consumers decreased by half + if num_active < num_initial_consumers // 2: + log( + WARN, + "Number of active workers has more than halved: (%i/%i active)", + num_active, + num_initial_consumers, + ) + + # Break if consumers died + if num_active == 0: + raise RuntimeError("All workers have died. 
Ending Simulation.") + + # Log some stats + log( + DEBUG, + "Simulation Engine stats: " + "(Active workers: (%i/%i) | %s (%i workers) | Tasks in queue: %i)", + num_active, + num_initial_consumers, + backend.__class__.__name__, + backend.num_workers, + queue.qsize(), + ) await asyncio.sleep(1.0) log(DEBUG, "Async producer: Stopped pulling from StateFactory.") @@ -132,32 +161,55 @@ async def run( # pylint: disable=fixme queue: "asyncio.Queue[TaskIns]" = asyncio.Queue(128) - # Build backend - await backend.build() - worker_tasks = [ - asyncio.create_task( - worker(app, queue, node_states, state_factory, nodes_mapping, backend) + try: + # Build backend + await backend.build() + + # Add workers (they submit Messages to Backend) + worker_tasks = [ + asyncio.create_task( + worker(app, queue, node_states, state_factory, nodes_mapping, backend) + ) + for _ in range(backend.num_workers) + ] + # Create producer (adds TaskIns into Queue) + producer = asyncio.create_task( + add_taskins_to_queue( + queue, state_factory, nodes_mapping, backend, worker_tasks, f_stop + ) ) - for _ in range(backend.num_workers) - ] - producer = asyncio.create_task( - generate_pull_requests(queue, state_factory, nodes_mapping, f_stop) - ) - await asyncio.gather(producer) + # Wait for producer to finish + # The producer runs forever until f_stop is set or until + # all worker (consumer) coroutines are completed. Workers + # also run forever and only end if an exception is raised. + await asyncio.gather(producer) + + except Exception as ex: + + log(ERROR, "An exception occured!! 
%s", ex) + log(ERROR, traceback.format_exc()) + log(WARN, "Stopping Simulation Engine.") + + # Manually trigger stopping event + f_stop.set() + + # Raise exception + raise RuntimeError("Simulation Engine crashed.") from ex - # Produced task terminated, now cancel worker tasks - for w_t in worker_tasks: - _ = w_t.cancel() + finally: + # Produced task terminated, now cancel worker tasks + for w_t in worker_tasks: + _ = w_t.cancel() - while not all(w_t.done() for w_t in worker_tasks): - log(DEBUG, "Terminating async workers...") - await asyncio.sleep(0.5) + while not all(w_t.done() for w_t in worker_tasks): + log(DEBUG, "Terminating async workers...") + await asyncio.sleep(0.5) - await asyncio.gather(*worker_tasks) + await asyncio.gather(*[w_t for w_t in worker_tasks if not w_t.done()]) - # Terminate backend - await backend.terminate() + # Terminate backend + await backend.terminate() # pylint: disable=too-many-arguments,unused-argument,too-many-locals diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py index 3dc7b57aa35..5bcff233759 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -26,7 +26,6 @@ from unittest import IsolatedAsyncioTestCase from uuid import UUID -from flwr.client.client_app import LoadClientAppError from flwr.common import GetPropertiesIns, Message, Metadata from flwr.common.constant import MESSAGE_TYPE_GET_PROPERTIES from flwr.common.recordset_compat import getpropertiesins_to_recordset @@ -138,9 +137,6 @@ def start_and_shutdown( existing_nodes_mapping=nodes_mapping, ) - # Trigger stop event - f_stop.set() - termination_th.join() @@ -172,11 +168,12 @@ def test_erroneous_clientapp_module_name(self) -> None: run_id=run_id, num_messages=num_messages, ) - with self.assertRaises(LoadClientAppError): + with self.assertRaises(RuntimeError): start_and_shutdown( 
clientapp_module="totally_fictitious_app:client", state_factory=state_factory, nodes_mapping=nodes_mapping, + duration=10, ) def test_erroneous_backend_config(self) -> None: @@ -222,7 +219,7 @@ def test_start_and_shutdown_with_tasks_in_state(self) -> None: producer/consumer logic must function. This also severs to evaluate a valid ClientApp. """ - num_messages = 113 + num_messages = 229 num_nodes = 59 # Register a state and a run_id in it From bd7b1aa26a48ed0e171be0f227fb38f538f21016 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 23:08:57 +0000 Subject: [PATCH 070/103] completed tests. --- .../superlink/fleet/vce/vce_api_test.py | 117 +++++++++++------- 1 file changed, 75 insertions(+), 42 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py index 5bcff233759..ea2de2e636b 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -22,7 +22,7 @@ from math import pi from pathlib import Path from time import sleep -from typing import Dict, Optional, Set +from typing import Dict, Optional, Set, Tuple from unittest import IsolatedAsyncioTestCase from uuid import UUID @@ -44,12 +44,36 @@ def terminate_simulation(f_stop: asyncio.Event, sleep_duration: int) -> None: f_stop.set() +def init_state_factory_nodes_mapping( + num_nodes: int, + num_messages: int, + erroneous_message: Optional[bool] = False, +) -> Tuple[StateFactory, NodeToPartitionMapping, Dict[UUID, float]]: + """Instatiate StateFactory, register nodes and pre-insert messages in the state.""" + # Register a state and a run_id in it + run_id = 1234 + state_factory = StateFactory(":flwr-in-memory-state:") + + # Register a few nodes + nodes_mapping = _register_nodes(num_nodes=num_nodes, state_factory=state_factory) + + expected_results = register_messages_into_state( + state_factory=state_factory, + nodes_mapping=nodes_mapping, + run_id=run_id, 
+ num_messages=num_messages, + erroneous_message=erroneous_message, + ) + return state_factory, nodes_mapping, expected_results + + # pylint: disable=too-many-locals def register_messages_into_state( state_factory: StateFactory, nodes_mapping: NodeToPartitionMapping, run_id: int, num_messages: int, + erroneous_message: Optional[bool] = False, ) -> Dict[UUID, float]: """Register `num_messages` into the state factory.""" state: InMemoryState = state_factory.state() # type: ignore @@ -75,7 +99,11 @@ def register_messages_into_state( dst_node_id=dst_node_id, # indicate destination node reply_to_message="", ttl="", - message_type=MESSAGE_TYPE_GET_PROPERTIES, + message_type=( + "a bad message" + if erroneous_message + else MESSAGE_TYPE_GET_PROPERTIES + ), ), ) # Convert Message to TaskIns @@ -109,18 +137,24 @@ def start_and_shutdown( num_supernodes: Optional[int] = None, state_factory: Optional[StateFactory] = None, nodes_mapping: Optional[NodeToPartitionMapping] = None, - duration: int = 10, + duration: int = 0, backend_config: str = "{}", ) -> None: - """Start Simulation Engine and terminate after specified number of seconds.""" + """Start Simulation Engine and terminate after specified number of seconds. + + Some tests need to be terminated by triggering externally an asyncio.Event. This + is enabled whtn passing `duration`>0. + """ f_stop = asyncio.Event() - # Setup thread that will set the f_stop event, triggering the termination of all - # asyncio logic in the Simulation Engine. It will also terminate the Backend. - termination_th = threading.Thread( - target=terminate_simulation, args=(f_stop, duration) - ) - termination_th.start() + if duration: + + # Setup thread that will set the f_stop event, triggering the termination of all + # asyncio logic in the Simulation Engine. It will also terminate the Backend. 
+ termination_th = threading.Thread( + target=terminate_simulation, args=(f_stop, duration) + ) + termination_th.start() # Resolve working directory if not passed if not working_dir: @@ -137,7 +171,8 @@ def start_and_shutdown( existing_nodes_mapping=nodes_mapping, ) - termination_th.join() + if duration: + termination_th.join() class AsyncTestFleetSimulationEngineRayBackend(IsolatedAsyncioTestCase): @@ -146,34 +181,40 @@ class AsyncTestFleetSimulationEngineRayBackend(IsolatedAsyncioTestCase): def test_erroneous_no_supernodes_client_mapping(self) -> None: """Test with unset arguments.""" with self.assertRaises(ValueError): - start_and_shutdown() + start_and_shutdown(duration=2) def test_erroneous_clientapp_module_name(self) -> None: """Tests attempt to load a ClientApp that can't be found.""" num_messages = 7 num_nodes = 59 - # Register a state and a run_id in it - run_id = 1234 - state_factory = StateFactory(":flwr-in-memory-state:") - - # Register a few nodes - nodes_mapping = _register_nodes( - num_nodes=num_nodes, state_factory=state_factory + state_factory, nodes_mapping, _ = init_state_factory_nodes_mapping( + num_nodes=num_nodes, num_messages=num_messages ) + with self.assertRaises(RuntimeError): + start_and_shutdown( + clientapp_module="totally_fictitious_app:client", + state_factory=state_factory, + nodes_mapping=nodes_mapping, + ) + + def test_erroneous_messages(self) -> None: + """Test handling of error in async worker (consumer). + + We register messages which will trigger an error when handling, triggering an + error. 
+ """ + num_messages = 100 + num_nodes = 59 - _ = register_messages_into_state( - state_factory=state_factory, - nodes_mapping=nodes_mapping, - run_id=run_id, - num_messages=num_messages, + state_factory, nodes_mapping, _ = init_state_factory_nodes_mapping( + num_nodes=num_nodes, num_messages=num_messages, erroneous_message=True ) + with self.assertRaises(RuntimeError): start_and_shutdown( - clientapp_module="totally_fictitious_app:client", state_factory=state_factory, nodes_mapping=nodes_mapping, - duration=10, ) def test_erroneous_backend_config(self) -> None: @@ -208,7 +249,7 @@ def test_erroneous_arguments_existing_mapping_but_no_state_factory(self) -> None def test_start_and_shutdown(self) -> None: """Start Simulation Engine Fleet and terminate it.""" - start_and_shutdown(num_supernodes=50) + start_and_shutdown(num_supernodes=50, duration=10) # pylint: disable=too-many-locals def test_start_and_shutdown_with_tasks_in_state(self) -> None: @@ -222,24 +263,16 @@ def test_start_and_shutdown_with_tasks_in_state(self) -> None: num_messages = 229 num_nodes = 59 - # Register a state and a run_id in it - run_id = 1234 - state_factory = StateFactory(":flwr-in-memory-state:") - - # Register a few nodes - nodes_mapping = _register_nodes( - num_nodes=num_nodes, state_factory=state_factory - ) - - expected_results = register_messages_into_state( - state_factory=state_factory, - nodes_mapping=nodes_mapping, - run_id=run_id, - num_messages=num_messages, + state_factory, nodes_mapping, expected_results = ( + init_state_factory_nodes_mapping( + num_nodes=num_nodes, num_messages=num_messages + ) ) # Run - start_and_shutdown(state_factory=state_factory, nodes_mapping=nodes_mapping) + start_and_shutdown( + state_factory=state_factory, nodes_mapping=nodes_mapping, duration=10 + ) # Get all TaskRes state = state_factory.state() From 0dce992c5982f0cde4760ac1feb6cbf2cd400e59 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 27 Feb 2024 23:30:43 +0000 Subject: [PATCH 071/103] update 
import --- src/py/flwr/simulation/run_simulation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 6006fa57c82..b61951c0b3f 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -24,7 +24,7 @@ from flwr.common.exit_handlers import register_exit_handlers from flwr.server.driver.driver import Driver from flwr.server.run_serverapp import run - +from flwr.server.superlink.fleet import vce from flwr.server.superlink.state import StateFactory @@ -32,7 +32,6 @@ def run_simulation() -> None: """.""" # TODO: below create circular imports from flwr.server.app import _run_driver_api_grpc - from flwr.server.superlink.fleet.vce import start_vce args = _parse_args_run_simulation().parse_args() @@ -50,7 +49,7 @@ def run_simulation() -> None: # Superlink with Simulation Engine f_stop = asyncio.Event() superlink_th = threading.Thread( - target=start_vce, + target=vce.start_vce, kwargs={ "num_supernodes": args.num_supernodes, "client_app_module_name": args.client_app, From 19366315aefa5b9c168d9b73f95cd1f877793c18 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 28 Feb 2024 00:05:12 +0000 Subject: [PATCH 072/103] wip --- src/py/flwr/simulation/run_simulation.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index b61951c0b3f..03baae738c2 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -39,9 +39,8 @@ def run_simulation() -> None: state_factory = StateFactory(":flwr-in-memory-state:") # Start Driver API - driver_address = "0.0.0.0:9098" driver_server: grpc.Server = _run_driver_api_grpc( - address=driver_address, + address=args.driver_api_address, state_factory=state_factory, certificates=None, ) @@ -67,7 +66,7 @@ def run_simulation() -> None: # 
Initialize Driver driver = Driver( - driver_service_address=driver_address, + driver_service_address=args.driver_api_address, root_certificates=None, ) @@ -102,6 +101,12 @@ def _parse_args_run_simulation() -> argparse.ArgumentParser: required=True, help="For example: `server:app` or `project.package.module:wrapper.app`", ) + parser.add_argument( + "--driver-api-address", + default="0.0.0.0:9091", + type=str, + help="For example: `server:app` or `project.package.module:wrapper.app`", + ) parser.add_argument( "--num-supernodes", type=int, From 6e3271b30f843aee79a7305238e75d6eebdd3c7a Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 28 Feb 2024 08:34:39 +0000 Subject: [PATCH 073/103] minior formatting --- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 761712875cc..7583506e221 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -138,7 +138,7 @@ async def add_taskins_to_queue( log( DEBUG, "Simulation Engine stats: " - "(Active workers: (%i/%i) | %s (%i workers) | Tasks in queue: %i)", + "Active workers: (%i/%i) | %s (%i workers) | Tasks in queue: %i)", num_active, num_initial_consumers, backend.__class__.__name__, From 67777c5d93076f9fabcae5e0a25b839cb251b429 Mon Sep 17 00:00:00 2001 From: Javier Date: Wed, 28 Feb 2024 12:54:12 +0000 Subject: [PATCH 074/103] Apply suggestions from code review Co-authored-by: Daniel J. 
Beutel --- src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py | 2 +- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 06a6fc72975..4a729f22436 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -141,7 +141,7 @@ async def process_message( Return output message and updated context. """ - node_id = message.metadata.partition_id + partition_id = message.metadata.partition_id try: # Submite a task to the pool diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 7583506e221..1aad6aa95f9 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -68,7 +68,7 @@ async def worker( # Convert TaskIns to Message message = message_from_taskins(task_ins) - # Replace node ID with data partition ID + # Set partition_id message.metadata.partition_id = nodes_mapping[node_id] # Let backend process message @@ -239,7 +239,7 @@ def start_vce( if existing_nodes_mapping: if state_factory is None: raise ValueError( - "You passed `existing_nodes_mapping` but no `state_factory` was passed." + "`existing_nodes_mapping` was passed, but no `state_factory` was passed." ) log(INFO, "Using exiting NodeToPartitionMapping and StateFactory.") # Use mapping constructed externally. 
This also means nodes From 46eac84ade3063b12c8add3335d178da480ed362 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 28 Feb 2024 12:56:37 +0000 Subject: [PATCH 075/103] fixes post review --- .../superlink/fleet/vce/backend/raybackend.py | 2 +- .../flwr/server/superlink/fleet/vce/vce_api.py | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py index 4a729f22436..8ef0d54622a 100644 --- a/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py +++ b/src/py/flwr/server/superlink/fleet/vce/backend/raybackend.py @@ -147,7 +147,7 @@ async def process_message( # Submite a task to the pool future = await self.pool.submit( lambda a, a_fn, mssg, cid, state: a.run.remote(a_fn, mssg, cid, state), - (app, message, str(node_id), context), + (app, message, str(partition_id), context), ) await future diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 1aad6aa95f9..5cc62911dd5 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -48,7 +48,7 @@ def _register_nodes( # pylint: disable=too-many-arguments,too-many-locals async def worker( - app: Callable[[], ClientApp], + app_fn: Callable[[], ClientApp], queue: "asyncio.Queue[TaskIns]", node_states: Dict[int, NodeState], state_factory: StateFactory, @@ -73,7 +73,7 @@ async def worker( # Let backend process message out_mssg, updated_context = await backend.process_message( - app, message, context + app_fn, message, context ) # Update Context @@ -150,7 +150,7 @@ async def add_taskins_to_queue( async def run( - app: Callable[[], ClientApp], + app_fn: Callable[[], ClientApp], backend: Backend, nodes_mapping: NodeToPartitionMapping, state_factory: StateFactory, @@ -168,7 +168,9 @@ async def run( # Add workers (they submit Messages to Backend) 
worker_tasks = [ asyncio.create_task( - worker(app, queue, node_states, state_factory, nodes_mapping, backend) + worker( + app_fn, queue, node_states, state_factory, nodes_mapping, backend + ) ) for _ in range(backend.num_workers) ] @@ -239,7 +241,8 @@ def start_vce( if existing_nodes_mapping: if state_factory is None: raise ValueError( - "`existing_nodes_mapping` was passed, but no `state_factory` was passed." + "`existing_nodes_mapping` was passed, but no `state_factory` was " + "passed." ) log(INFO, "Using exiting NodeToPartitionMapping and StateFactory.") # Use mapping constructed externally. This also means nodes @@ -289,11 +292,11 @@ def _load() -> ClientApp: app: ClientApp = load_client_app(client_app_module_name) return app - app = _load + app_fn = _load asyncio.run( run( - app, + app_fn, backend, nodes_mapping, state_factory, From cc6a1451fa0ca51c9b39244e14cfa3c2d3fc55c1 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 28 Feb 2024 16:16:37 +0000 Subject: [PATCH 076/103] instantiating backend in asyncio event loop --- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 5cc62911dd5..ad858cbb997 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -151,17 +151,20 @@ async def add_taskins_to_queue( async def run( app_fn: Callable[[], ClientApp], - backend: Backend, + backend_fn: Callable[[], Backend], nodes_mapping: NodeToPartitionMapping, state_factory: StateFactory, node_states: Dict[int, NodeState], f_stop: asyncio.Event, ) -> None: """Run the VCE async.""" - # pylint: disable=fixme queue: "asyncio.Queue[TaskIns]" = asyncio.Queue(128) try: + + # Instantiate backend + backend = backend_fn() + # Build backend await backend.build() @@ -272,7 +275,6 @@ def start_vce( try: backend_type = 
supported_backends[backend_name] - backend = backend_type(backend_config, work_dir=working_dir) except KeyError as ex: log( ERROR, @@ -286,6 +288,10 @@ def start_vce( raise ex + def backend_fn() -> Backend: + """Instantiate a Backend.""" + return backend_type(backend_config, work_dir=working_dir) + log(INFO, "client_app_module_name = %s", client_app_module_name) def _load() -> ClientApp: @@ -297,7 +303,7 @@ def _load() -> ClientApp: asyncio.run( run( app_fn, - backend, + backend_fn, nodes_mapping, state_factory, node_states, From 662579eae1a2bcdc7d2f99dce5ff644084dd4e0f Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 28 Feb 2024 16:23:49 +0000 Subject: [PATCH 077/103] minor --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 03baae738c2..8c343bb23e6 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -61,8 +61,8 @@ def run_simulation() -> None: daemon=False, ) - event(EventType.RUN_SUPERLINK_ENTER) superlink_th.start() + event(EventType.RUN_SUPERLINK_ENTER) # Initialize Driver driver = Driver( From 6dd034b67c77eb769e8822dc7adbc0b7dad7cc0d Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 28 Feb 2024 16:31:45 +0000 Subject: [PATCH 078/103] updated TF notebook --- examples/simulation-tensorflow/sim.ipynb | 53 +++++++++++++++++------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/examples/simulation-tensorflow/sim.ipynb b/examples/simulation-tensorflow/sim.ipynb index 9acfba99237..21639877be2 100644 --- a/examples/simulation-tensorflow/sim.ipynb +++ b/examples/simulation-tensorflow/sim.ipynb @@ -17,8 +17,8 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -q flwr[\"simulation\"] tensorflow\n", - "!pip install -q flwr_datasets[\"vision\"]" + "!pip install -q \"flwr[simulation]\" tensorflow\n", + "!pip install -q 
\"flwr_datasets[vision]\"" ] }, { @@ -34,7 +34,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install matplotlib" + "!pip install -q matplotlib" ] }, { @@ -265,20 +265,33 @@ " evaluate_fn=get_evaluate_fn(centralized_testset), # global evaluation function\n", ")\n", "\n", - "# With a dictionary, you tell Flower's VirtualClientEngine that each\n", - "# client needs exclusive access to these many resources in order to run\n", - "client_resources = {\"num_cpus\": 1, \"num_gpus\": 0.0}\n", - "\n", - "# Start simulation\n", - "history = fl.simulation.start_simulation(\n", + "# ClientApp for Flower-Next\n", + "client_app = fl.client.ClientApp(\n", " client_fn=get_client_fn(mnist_fds),\n", - " num_clients=NUM_CLIENTS,\n", + ")\n", + "\n", + "# ServerApp for Flower-Next\n", + "server_app = fl.server.ServerApp(\n", " config=fl.server.ServerConfig(num_rounds=10),\n", " strategy=strategy,\n", - " client_resources=client_resources,\n", - " actor_kwargs={\n", - " \"on_actor_init_fn\": enable_tf_gpu_growth # Enable GPU growth upon actor init.\n", - " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's lauch the simulation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fl.simulation.run_simulation(\n", + " server_app=server_app, client_app=client_app, num_supernodes=NUM_CLIENTS\n", ")" ] }, @@ -340,6 +353,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" } }, "nbformat": 4, From 2aba8954f569e8263fd1f2f3be745497506a82b2 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 28 Feb 2024 18:03:34 +0000 Subject: [PATCH 079/103] moved `run_driver_api_grpc()` --- 
src/py/flwr/server/app.py | 33 ++---------- .../server/superlink/driver/driver_grpc.py | 54 +++++++++++++++++++ 2 files changed, 57 insertions(+), 30 deletions(-) create mode 100644 src/py/flwr/server/superlink/driver/driver_grpc.py diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index cf6b716bd18..788ebeb8a45 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -36,9 +36,6 @@ ) from flwr.common.exit_handlers import register_exit_handlers from flwr.common.logger import log -from flwr.proto.driver_pb2_grpc import ( # pylint: disable=E0611 - add_DriverServicer_to_server, -) from flwr.proto.fleet_pb2_grpc import ( # pylint: disable=E0611 add_FleetServicer_to_server, ) @@ -48,7 +45,7 @@ from .server import Server, init_defaults, run_fl from .server_config import ServerConfig from .strategy import Strategy -from .superlink.driver.driver_servicer import DriverServicer +from .superlink.driver.driver_grpc import run_driver_api_grpc from .superlink.fleet.grpc_bidi.grpc_server import ( generic_create_grpc_server, start_grpc_server, @@ -204,7 +201,7 @@ def run_driver_api() -> None: state_factory = StateFactory(args.database) # Start server - grpc_server: grpc.Server = _run_driver_api_grpc( + grpc_server: grpc.Server = run_driver_api_grpc( address=address, state_factory=state_factory, certificates=certificates, @@ -313,7 +310,7 @@ def run_superlink() -> None: state_factory = StateFactory(args.database) # Start Driver API - driver_server: grpc.Server = _run_driver_api_grpc( + driver_server: grpc.Server = run_driver_api_grpc( address=address, state_factory=state_factory, certificates=certificates, @@ -414,30 +411,6 @@ def _try_obtain_certificates( return certificates -def _run_driver_api_grpc( - address: str, - state_factory: StateFactory, - certificates: Optional[Tuple[bytes, bytes, bytes]], -) -> grpc.Server: - """Run Driver API (gRPC, request-response).""" - # Create Driver API gRPC server - driver_servicer: grpc.Server = 
DriverServicer( - state_factory=state_factory, - ) - driver_add_servicer_to_server_fn = add_DriverServicer_to_server - driver_grpc_server = generic_create_grpc_server( - servicer_and_add_fn=(driver_servicer, driver_add_servicer_to_server_fn), - server_address=address, - max_message_length=GRPC_MAX_MESSAGE_LENGTH, - certificates=certificates, - ) - - log(INFO, "Flower ECE: Starting Driver API (gRPC-rere) on %s", address) - driver_grpc_server.start() - - return driver_grpc_server - - def _run_fleet_api_grpc_rere( address: str, state_factory: StateFactory, diff --git a/src/py/flwr/server/superlink/driver/driver_grpc.py b/src/py/flwr/server/superlink/driver/driver_grpc.py new file mode 100644 index 00000000000..f74000bc59c --- /dev/null +++ b/src/py/flwr/server/superlink/driver/driver_grpc.py @@ -0,0 +1,54 @@ +# Copyright 2020 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Driver gRPC API.""" + +from logging import INFO +from typing import Optional, Tuple + +import grpc + +from flwr.common import GRPC_MAX_MESSAGE_LENGTH +from flwr.common.logger import log +from flwr.proto.driver_pb2_grpc import ( # pylint: disable=E0611 + add_DriverServicer_to_server, +) +from flwr.server.superlink.state import StateFactory + +from ..fleet.grpc_bidi.grpc_server import generic_create_grpc_server +from .driver_servicer import DriverServicer + + +def run_driver_api_grpc( + address: str, + state_factory: StateFactory, + certificates: Optional[Tuple[bytes, bytes, bytes]], +) -> grpc.Server: + """Run Driver API (gRPC, request-response).""" + # Create Driver API gRPC server + driver_servicer: grpc.Server = DriverServicer( + state_factory=state_factory, + ) + driver_add_servicer_to_server_fn = add_DriverServicer_to_server + driver_grpc_server = generic_create_grpc_server( + servicer_and_add_fn=(driver_servicer, driver_add_servicer_to_server_fn), + server_address=address, + max_message_length=GRPC_MAX_MESSAGE_LENGTH, + certificates=certificates, + ) + + log(INFO, "Flower ECE: Starting Driver API (gRPC-rere) on %s", address) + driver_grpc_server.start() + + return driver_grpc_server From f3d2c639339f2deabaae7666156a5a2687c30915 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Wed, 28 Feb 2024 18:10:27 +0000 Subject: [PATCH 080/103] update and format --- src/py/flwr/simulation/run_simulation.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 8c343bb23e6..1e7c13a0ba6 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -24,22 +24,20 @@ from flwr.common.exit_handlers import register_exit_handlers from flwr.server.driver.driver import Driver from flwr.server.run_serverapp import run +from 
flwr.server.superlink.driver.driver_grpc import run_driver_api_grpc from flwr.server.superlink.fleet import vce from flwr.server.superlink.state import StateFactory def run_simulation() -> None: - """.""" - # TODO: below create circular imports - from flwr.server.app import _run_driver_api_grpc - + """Run Simulation Engine.""" args = _parse_args_run_simulation().parse_args() # Initialize StateFactory state_factory = StateFactory(":flwr-in-memory-state:") # Start Driver API - driver_server: grpc.Server = _run_driver_api_grpc( + driver_server: grpc.Server = run_driver_api_grpc( address=args.driver_api_address, state_factory=state_factory, certificates=None, From c2a4dc8f36a47d59aeb9bf1e6f8db2d45c051507 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 29 Feb 2024 00:13:57 +0000 Subject: [PATCH 081/103] better exception handling; updated simp.py examples --- examples/simulation-pytorch/README.md | 21 +++---- examples/simulation-pytorch/sim.py | 4 +- examples/simulation-tensorflow/README.md | 23 +++---- examples/simulation-tensorflow/sim.py | 6 +- src/py/flwr/simulation/run_simulation.py | 77 ++++++++++++++++++------ 5 files changed, 82 insertions(+), 49 deletions(-) diff --git a/examples/simulation-pytorch/README.md b/examples/simulation-pytorch/README.md index 963e77bc568..339cae67320 100644 --- a/examples/simulation-pytorch/README.md +++ b/examples/simulation-pytorch/README.md @@ -75,26 +75,23 @@ python sim.py --num_cpus=2 python sim.py --num_cpus=2 --num_gpus=0.25 ``` -### Run with Flower-Next (`super-link` and `server-app`) +### Run with Flower-Next -Ensure you have activated your environment, then: +Ensure you have activated your environment, then execute the command below. All `ClientApp` instances will run on CPU but the `ServerApp` will run on the GPU if one is available. Note that this is the case because the `Simulation Engine` only exposes certain resources to the `ClientApp` (based on the `client_resources` in `--backend-config`). 
-``` -flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_app - -# on a different terminal -flower-server-app sim:server_app --insecure +```bash +# Run with the default backend-config. +# `--server-app` points to the `server` object in the sim.py file in this example. +# `--client-app` points to the `client` object in the sim.py file in this example. +flower-simulation --client-app=sim:client --server-app=sim:server --num-supernodes=100 ``` -You can change the default resources assigned to each `ClientApp` by means the `--backend-config` argument: +You can change the default resources assigned to each `ClientApp` by means of the `--backend-config` argument: ```bash # Tells the VCE to resever 2x CPUs and 25% of available VRAM for each ClientApp -flower-superlink --insecure --vce --num-supernodes 100 \ - --client-app sim:client_app \ +flower-simulation --client-app=sim:client --server-app=sim:server --num-supernodes=100 \ --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}}' - -# Then you can launch the `flower-server-app` command as shown earlier. ``` Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. 
diff --git a/examples/simulation-pytorch/sim.py b/examples/simulation-pytorch/sim.py index 84a00e3f092..ca9e6f0e836 100644 --- a/examples/simulation-pytorch/sim.py +++ b/examples/simulation-pytorch/sim.py @@ -182,12 +182,12 @@ def evaluate( ) # ClientApp for Flower-Next -client_app = fl.client.ClientApp( +client = fl.client.ClientApp( client_fn=get_client_fn(mnist_fds), ) # ServerApp for Flower-Next -server_app = fl.server.ServerApp( +server = fl.server.ServerApp( config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), strategy=strategy, ) diff --git a/examples/simulation-tensorflow/README.md b/examples/simulation-tensorflow/README.md index f6f0a22fdd7..a49fda9c0b3 100644 --- a/examples/simulation-tensorflow/README.md +++ b/examples/simulation-tensorflow/README.md @@ -76,26 +76,23 @@ python sim.py --num_cpus=2 --num_gpus=0.25 Because TensorFlow by default maps all the available VRAM, we need to [enable GPU memory growth](https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth), see how it is done in the example (`sim.py`) for both the "main" process (where the server/strategy runs) and for the clients (using the `actor_kwargs`) -### Run with Flower-Next (`super-link` and `server-app`) +### Run with Flower-Next -Ensure you have activated your environment, then: +Ensure you have activated your environment, then execute the command below. All `ClientApp` instances will run on CPU but the `ServerApp` will run on the GPU if one is available. Note that this is the case because the `Simulation Engine` only exposes certain resources to the `ClientApp` (based on the `client_resources` in `--backend-config`). For TensorFlow simulations, it is desirable to make use of TF's [memory growth](https://www.tensorflow.org/api_docs/python/tf/config/experimental/set_memory_growth) feature. You can enable that easily with the `--enable-tf-gpu-growth` flag. 
-``` -flower-superlink --insecure --vce --num-supernodes 100 --client-app sim:client_app - -# on a different terminal -flower-server-app sim:server_app --insecure +```bash +# Run with the default backend-config. +# `--server-app` points to the `server` object in the sim.py file in this example. +# `--client-app` points to the `client` object in the sim.py file in this example. +flower-simulation --client-app=sim:client --server-app=sim:server --num-supernodes=100 --enable-tf-gpu-growth ``` -You can change the default resources assigned to each `ClientApp` using the `--backend-config` argument. Note that we need to flag that the backend is going to use `TensorFlow`. In this way, it will enable GPU memory growth. +You can change the default resources assigned to each `ClientApp` using the `--backend-config` argument. ```bash # Tells the VCE to resever 2x CPUs and 25% of available VRAM for each ClientApp -flower-superlink --insecure --vce --num-supernodes 100 \ - --client-app sim:client_app \ - --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}, "tensorflow": 1}' - -# Then you can launch the `flower-server-app` command as shown earlier. +flower-simulation --client-app=sim:client --server-app=sim:server --num-supernodes=100 \ + --backend-config='{"client_resources": {"num_cpus":2, "num_gpus":0.25}}' --enable-tf-gpu-growth ``` Take a look at the [Documentation](https://flower.ai/docs/framework/how-to-run-simulations.html) for more details on how you can customise your simulation. 
diff --git a/examples/simulation-tensorflow/sim.py b/examples/simulation-tensorflow/sim.py index dbba71ac2cf..2a19e131fe7 100644 --- a/examples/simulation-tensorflow/sim.py +++ b/examples/simulation-tensorflow/sim.py @@ -1,5 +1,4 @@ import os -import math import argparse from typing import Dict, List, Tuple @@ -147,13 +146,12 @@ def evaluate( # ClientApp for Flower-Next -client_app = fl.client.ClientApp( +client = fl.client.ClientApp( client_fn=get_client_fn(mnist_fds), ) # ServerApp for Flower-Next -# TODO: Unclear how to enable GPU growth for the ServerApp -server_app = fl.server.ServerApp( +server = fl.server.ServerApp( config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), strategy=strategy, ) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 1e7c13a0ba6..ffdaad01a11 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -16,23 +16,43 @@ import argparse import asyncio +import json import threading +import traceback +from logging import ERROR, INFO, WARNING import grpc -from flwr.common import EventType, event +from flwr.common import EventType, event, log from flwr.common.exit_handlers import register_exit_handlers from flwr.server.driver.driver import Driver from flwr.server.run_serverapp import run from flwr.server.superlink.driver.driver_grpc import run_driver_api_grpc from flwr.server.superlink.fleet import vce from flwr.server.superlink.state import StateFactory +from flwr.simulation.ray_transport.utils import enable_tf_gpu_growth def run_simulation() -> None: """Run Simulation Engine.""" args = _parse_args_run_simulation().parse_args() + # Load JSON config + backend_config_dict = json.loads(args.backend_config) + + # Enable GPU memory growth (relevant only for TF) + if args.enable_tf_gpu_growth: + log(INFO, "Enabling GPU growth for Tensorflow on the main thread.") + enable_tf_gpu_growth() + # Check that Backend config has also enabled using GPU growth + 
use_tf = backend_config_dict.get("tensorflow", False) + if not use_tf: + log(WARNING, "Enabling GPU growth for your backend.") + backend_config_dict["tensorflow"] = True + + # Convert back to JSON stream + backend_config = json.dumps(backend_config_dict) + # Initialize StateFactory state_factory = StateFactory(":flwr-in-memory-state:") @@ -51,7 +71,7 @@ def run_simulation() -> None: "num_supernodes": args.num_supernodes, "client_app_module_name": args.client_app, "backend_name": args.backend, - "backend_config_json_stream": args.backend_config, + "backend_config_json_stream": backend_config, "working_dir": args.dir, "state_factory": state_factory, "f_stop": f_stop, @@ -62,26 +82,37 @@ def run_simulation() -> None: superlink_th.start() event(EventType.RUN_SUPERLINK_ENTER) - # Initialize Driver - driver = Driver( - driver_service_address=args.driver_api_address, - root_certificates=None, - ) + try: + # Initialize Driver + driver = Driver( + driver_service_address=args.driver_api_address, + root_certificates=None, + ) - # Launch server app - run(args.server_app, driver, args.dir) + # Launch server app + run(args.server_app, driver, args.dir) - del driver + except Exception as ex: - # Trigger stop event - f_stop.set() + log(ERROR, "An exception occured !! %s", ex) + log(ERROR, traceback.format_exc()) + raise RuntimeError( + "An error was encountered by the Simulation Engine. Ending Simulation." 
+ ) from ex - register_exit_handlers( - grpc_servers=[driver_server], - bckg_threads=[superlink_th], - event_type=EventType.RUN_SUPERLINK_LEAVE, - ) - superlink_th.join() + finally: + + del driver + + # Trigger stop event + f_stop.set() + + register_exit_handlers( + grpc_servers=[driver_server], + bckg_threads=[superlink_th], + event_type=EventType.RUN_SUPERLINK_LEAVE, + ) + superlink_th.join() def _parse_args_run_simulation() -> argparse.ArgumentParser: @@ -117,6 +148,16 @@ def _parse_args_run_simulation() -> argparse.ArgumentParser: type=str, help="Simulation backend that executes the ClientApp.", ) + parser.add_argument( + "--enable-tf-gpu-growth", + action="store_true", + help="Enables GPU growth on the main thread. This is desirable if you make " + "use of a TensorFlow model on your `ServerApp` while having your `ClientApp` " + "running on the same GPU. Without enabling this, you might encounter an " + "out-of-memory error becasue TensorFlow by default allocates all GPU memory." + "Read mor about how `tf.config.experimental.set_memory_growth()` works in " + "the TensorFlow documentation: https://www.tensorflow.org/api/stable.", + ) parser.add_argument( "--backend-config", type=str, From 94c264984c0af850b232ee291e663f60d59a699d Mon Sep 17 00:00:00 2001 From: jafermarq Date: Thu, 29 Feb 2024 09:24:29 +0000 Subject: [PATCH 082/103] updates --- examples/simulation-pytorch/sim.ipynb | 4 --- examples/simulation-tensorflow/sim.ipynb | 5 +++- src/py/flwr/server/run_serverapp.py | 28 +++++++++++------- src/py/flwr/simulation/run_simulation.py | 36 ++++++++++++++++-------- 4 files changed, 46 insertions(+), 27 deletions(-) diff --git a/examples/simulation-pytorch/sim.ipynb b/examples/simulation-pytorch/sim.ipynb index 85fb67f6602..e351228a19e 100644 --- a/examples/simulation-pytorch/sim.ipynb +++ b/examples/simulation-pytorch/sim.ipynb @@ -629,10 +629,6 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" - }, - "language_info": { - "name": 
"python", - "version": "3.8.16" } }, "nbformat": 4, diff --git a/examples/simulation-tensorflow/sim.ipynb b/examples/simulation-tensorflow/sim.ipynb index c506c505855..6c08666b6e4 100644 --- a/examples/simulation-tensorflow/sim.ipynb +++ b/examples/simulation-tensorflow/sim.ipynb @@ -291,7 +291,10 @@ "outputs": [], "source": [ "fl.simulation.run_simulation(\n", - " server_app=server_app, client_app=client_app, num_supernodes=NUM_CLIENTS, enable_tf_gpu_growth=True\n", + " server_app=server_app,\n", + " client_app=client_app,\n", + " num_supernodes=NUM_CLIENTS,\n", + " enable_tf_gpu_growth=True,\n", ")" ] }, diff --git a/src/py/flwr/server/run_serverapp.py b/src/py/flwr/server/run_serverapp.py index 4ff3ede06a2..4de966bfc87 100644 --- a/src/py/flwr/server/run_serverapp.py +++ b/src/py/flwr/server/run_serverapp.py @@ -29,24 +29,32 @@ def run( - server_app_attr: str, driver: Driver, server_app_dir: str, + server_app_attr: Optional[str] = None, loaded_server_app: Optional[ServerApp] = None, ) -> None: """Run ServerApp with a given Driver.""" + if not (server_app_attr is None) ^ (loaded_server_app is None): + raise ValueError( + "Either `server_app_attr` should `loaded_server_app` be set " + "but not both. 
" + ) + if server_app_dir is not None: sys.path.insert(0, server_app_dir) - def _load() -> ServerApp: - server_app: ServerApp = ( - load_server_app(server_app_attr) - if loaded_server_app is None - else loaded_server_app - ) - return server_app + # Load ServerApp if needed + if server_app_attr: + + def _load() -> ServerApp: + server_app: ServerApp = load_server_app(server_app_attr) + return server_app + + server_app = _load() - server_app = _load() + if loaded_server_app: + server_app = loaded_server_app # Initialize Context context = Context(state=RecordSet()) @@ -114,7 +122,7 @@ def run_server_app() -> None: ) # Run the Server App with the Driver - run(server_app_attr, driver, server_app_dir) + run(driver=driver, server_app_dir=server_app_dir, server_app_attr=server_app_attr) # Clean up driver.__del__() # pylint: disable=unnecessary-dunder-call diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index dbd05e9e86f..d1aa6746908 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -20,13 +20,14 @@ import threading import traceback from logging import ERROR, INFO, WARNING -from typing import Optional +from typing import Dict, Optional import grpc from flwr.client import ClientApp from flwr.common import EventType, event, log from flwr.common.exit_handlers import register_exit_handlers +from flwr.common.typing import ConfigsRecordValues from flwr.server.driver.driver import Driver from flwr.server.run_serverapp import run from flwr.server.server_app import ServerApp @@ -42,12 +43,15 @@ def run_simulation_from_cli() -> None: """Run Simulation Engine from the CLI.""" args = _parse_args_run_simulation().parse_args() + # Load JSON config + backend_config_dict = json.loads(args.backend_config) + run_simulation( num_supernodes=args.num_supernodes, client_app_module_name=args.client_app, server_app_module_name=args.server_app, backend_name=args.backend, - 
backend_config=args.backend_config, + backend_config=backend_config_dict, working_dir=args.dir, driver_api_address=args.driver_api_address, enable_tf_gpu_growth=args.enable_tf_gpu_growth, @@ -60,7 +64,7 @@ def run_simulation( client_app: Optional[ClientApp] = None, server_app: Optional[ServerApp] = None, backend_name: str = "ray", - backend_config: str = "{}", + backend_config: Optional[Dict[str, ConfigsRecordValues]] = None, client_app_module_name: Optional[str] = None, server_app_module_name: Optional[str] = None, working_dir: str = "", @@ -86,9 +90,9 @@ def run_simulation( backend_name : str (default: ray) A simulation backend that runs `ClientApp`s. - backend_config : str - 'A JSON formatted stream, e.g \'{"":, "":}\' to - configure a backend. Values supported in are those included by + backend_config : Optional[Dict[str, ConfigsRecordValues]] + 'A dictionary, e.g {"":, "":} to configure a + backend. Values supported in are those included by `flwr.common.typing.ConfigsRecordValues`. client_app_module_name : str @@ -114,18 +118,21 @@ def run_simulation( all GPU memory. Read mor about how `tf.config.experimental.set_memory_growth()` works in the TensorFlow documentation: https://www.tensorflow.org/api/stable. 
""" - # Load JSON config - backend_config_dict = json.loads(backend_config) + if backend_config is None: + backend_config = {} # Enable GPU memory growth (relevant only for TF) if enable_tf_gpu_growth: log(INFO, "Enabling GPU growth for Tensorflow on the main thread.") enable_gpu_growth() # Check that Backend config has also enabled using GPU growth - use_tf = backend_config_dict.get("tensorflow", False) + use_tf = backend_config.get("tensorflow", False) if not use_tf: log(WARNING, "Enabling GPU growth for your backend.") - backend_config_dict["tensorflow"] = True + backend_config["tensorflow"] = True + + # Convert config to original JSON-stream format + backend_config_stream = json.dumps(backend_config) # Initialize StateFactory state_factory = StateFactory(":flwr-in-memory-state:") @@ -146,7 +153,7 @@ def run_simulation( "client_app_module_name": client_app_module_name, "client_app": client_app, "backend_name": backend_name, - "backend_config_json_stream": backend_config, + "backend_config_json_stream": backend_config_stream, "working_dir": working_dir, "state_factory": state_factory, "f_stop": f_stop, @@ -165,7 +172,12 @@ def run_simulation( ) # Launch server app - run(server_app_module_name, driver, working_dir, loaded_server_app=server_app) + run( + driver=driver, + server_app_dir=working_dir, + server_app_attr=server_app_module_name, + loaded_server_app=server_app, + ) except Exception as ex: From b00e8405287fed9e786344e8bec2569b235d31ed Mon Sep 17 00:00:00 2001 From: jafermarq Date: Fri, 1 Mar 2024 16:42:03 +0000 Subject: [PATCH 083/103] better --- src/py/flwr/server/run_serverapp.py | 15 +++- src/py/flwr/simulation/run_simulation.py | 102 ++++++++++++++++------- 2 files changed, 85 insertions(+), 32 deletions(-) diff --git a/src/py/flwr/server/run_serverapp.py b/src/py/flwr/server/run_serverapp.py index 19fd16fb0c1..d4f21cbf20d 100644 --- a/src/py/flwr/server/run_serverapp.py +++ b/src/py/flwr/server/run_serverapp.py @@ -16,9 +16,11 @@ import argparse 
+import asyncio import sys from logging import DEBUG, WARN from pathlib import Path +from typing import Optional from flwr.common import Context, EventType, RecordSet, event from flwr.common.logger import log @@ -27,7 +29,12 @@ from .server_app import ServerApp, load_server_app -def run(server_app_attr: str, driver: Driver, server_app_dir: str) -> None: +def run( + server_app_attr: str, + driver: Driver, + server_app_dir: str, + stop_event: Optional[asyncio.Event] = None, +) -> None: """Run ServerApp with a given Driver.""" if server_app_dir is not None: sys.path.insert(0, server_app_dir) @@ -44,6 +51,12 @@ def _load() -> ServerApp: # Call ServerApp server_app(driver=driver, context=context) + log(DEBUG, "ServerApp finished running.") + # Upon completion, trigger stop event if one was passed + if stop_event is not None: + log(DEBUG, "Triggering stop event.") + stop_event.set() + def run_server_app() -> None: """Run Flower server app.""" diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index ebe76944e77..5459506335e 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -20,11 +20,12 @@ import threading import traceback from logging import ERROR, INFO, WARNING +from time import sleep +from typing import Any, Callable import grpc from flwr.common import EventType, event, log -from flwr.common.exit_handlers import register_exit_handlers from flwr.server.driver.driver import Driver from flwr.server.run_serverapp import run from flwr.server.superlink.driver.driver_grpc import run_driver_api_grpc @@ -33,6 +34,49 @@ from flwr.simulation.ray_transport.utils import enable_tf_gpu_growth +def run_serverapp_th( + server_app_attr: str, + driver: Driver, + server_app_dir: str, + f_stop: asyncio.Event, + delay_launch: int = 3, +) -> threading.Thread: + """Run SeverApp in a thread.""" + serverapp_th = threading.Thread( + target=run, + kwargs={ + "server_app_attr": server_app_attr, + 
"driver": driver, + "server_app_dir": server_app_dir, + "stop_event": f_stop, # will be set when `run()` finishes + # will trigger the shutdown of the Simulation Engine + }, + ) + sleep(delay_launch) + serverapp_th.start() + return serverapp_th + + +def get_thread_exception_hook(stop_event: asyncio.Event) -> Callable[[Any], None]: + """Return a callback for when the serverapp thread raises an exception.""" + + def execepthook(args: Any) -> None: + """Upon exception raised, log exception and trigger stop event.""" + # log + log( + ERROR, + "The ServerApp thread triggered exception (%s): %s", + args.exc_type, + args.exc_value, + ) + log(ERROR, traceback.format_exc()) + # Set stop event + stop_event.set() + log(WARNING, "Triggered stop event for Simulation Engine.") + + return execepthook + + def run_simulation() -> None: """Run Simulation Engine.""" args = _parse_args_run_simulation().parse_args() @@ -63,56 +107,52 @@ def run_simulation() -> None: certificates=None, ) - # SuperLink with Simulation Engine f_stop = asyncio.Event() - superlink_th = threading.Thread( - target=vce.start_vce, - kwargs={ - "num_supernodes": args.num_supernodes, - "client_app_module_name": args.client_app, - "backend_name": args.backend, - "backend_config_json_stream": backend_config, - "working_dir": args.dir, - "state_factory": state_factory, - "f_stop": f_stop, - }, - daemon=False, - ) - - superlink_th.start() - event(EventType.RUN_SUPERLINK_ENTER) - + serverapp_th = None try: + # Initialize Driver driver = Driver( driver_service_address=args.driver_api_address, root_certificates=None, ) - # Launch server app - run(args.server_app, driver, args.dir) + # Get and run ServerApp thread + serverapp_th = run_serverapp_th(args.server_app, driver, args.dir, f_stop) + # Setup an exception hook + threading.excepthook = get_thread_exception_hook(f_stop) + + # SuperLink with Simulation Engine + event(EventType.RUN_SUPERLINK_ENTER) + vce.start_vce( + num_supernodes=args.num_supernodes, + 
client_app_module_name=args.client_app, + backend_name=args.backend, + backend_config_json_stream=backend_config, + working_dir=args.dir, + state_factory=state_factory, + f_stop=f_stop, + ) except Exception as ex: log(ERROR, "An exception occurred: %s", ex) log(ERROR, traceback.format_exc()) - raise RuntimeError( - "An error was encountered by the Simulation Engine. Ending simulation." - ) from ex + raise RuntimeError("An error was encountered. Ending simulation.") from ex finally: + # Stop Driver + driver_server.stop(grace=0) del driver - # Trigger stop event f_stop.set() - register_exit_handlers( - grpc_servers=[driver_server], - bckg_threads=[superlink_th], - event_type=EventType.RUN_SUPERLINK_LEAVE, - ) - superlink_th.join() + event(EventType.RUN_SUPERLINK_LEAVE) + if serverapp_th: + serverapp_th.join() + + log(INFO, "Stopping Simulation Engine now.") def _parse_args_run_simulation() -> argparse.ArgumentParser: From 6f9bd9e6af2c5bf4968ae6a9438e93c805f3ea71 Mon Sep 17 00:00:00 2001 From: "Daniel J. Beutel" Date: Fri, 1 Mar 2024 17:48:29 +0100 Subject: [PATCH 084/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 5459506335e..11d68beefc7 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -58,7 +58,7 @@ def run_serverapp_th( def get_thread_exception_hook(stop_event: asyncio.Event) -> Callable[[Any], None]: - """Return a callback for when the serverapp thread raises an exception.""" + """Return a callback for when the ServerApp thread raises an exception.""" def execepthook(args: Any) -> None: """Upon exception raised, log exception and trigger stop event.""" From 32d8b331ddbcbbca53c9afe179082812aa820583 Mon Sep 17 00:00:00 2001 From: "Daniel J. 
Beutel" Date: Fri, 1 Mar 2024 17:49:31 +0100 Subject: [PATCH 085/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 11d68beefc7..71cd45fdd0a 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -110,7 +110,6 @@ def run_simulation() -> None: f_stop = asyncio.Event() serverapp_th = None try: - # Initialize Driver driver = Driver( driver_service_address=args.driver_api_address, From 57a84e585fff92fd4570ce8b7a5a9cc64090e180 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Fri, 1 Mar 2024 17:36:00 +0000 Subject: [PATCH 086/103] pyling, mypy fixes --- src/py/flwr/server/run_serverapp.py | 14 ++++++-------- src/py/flwr/server/superlink/fleet/vce/vce_api.py | 13 +++++++------ src/py/flwr/simulation/run_simulation.py | 5 +++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/py/flwr/server/run_serverapp.py b/src/py/flwr/server/run_serverapp.py index fb9eebb3c4c..a1608fd15ec 100644 --- a/src/py/flwr/server/run_serverapp.py +++ b/src/py/flwr/server/run_serverapp.py @@ -47,16 +47,14 @@ def run( sys.path.insert(0, server_app_dir) # Load ServerApp if needed - if server_app_attr: - - def _load() -> ServerApp: + def _load() -> ServerApp: + if server_app_attr: server_app: ServerApp = load_server_app(server_app_attr) - return server_app - - server_app = _load() + if loaded_server_app: + server_app = loaded_server_app + return server_app - if loaded_server_app: - server_app = loaded_server_app + server_app = _load() # Initialize Context context = Context(state=RecordSet()) diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index 4226e0109be..c03b57ddbb5 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py 
@@ -219,12 +219,12 @@ async def run( # pylint: disable=too-many-arguments,unused-argument,too-many-locals def start_vce( - client_app_module_name: str, backend_name: str, backend_config_json_stream: str, working_dir: str, f_stop: asyncio.Event, client_app: Optional[ClientApp] = None, + client_app_module_name: Optional[str] = None, num_supernodes: Optional[int] = None, state_factory: Optional[StateFactory] = None, existing_nodes_mapping: Optional[NodeToPartitionMapping] = None, @@ -301,12 +301,13 @@ def backend_fn() -> Backend: log(INFO, "client_app_module_name = %s", client_app_module_name) + # Load ClientApp if needed def _load() -> ClientApp: - app: ClientApp = ( - load_client_app(client_app_module_name) - if client_app is None - else client_app - ) + + if client_app_module_name: + app: ClientApp = load_client_app(client_app_module_name) + if client_app: + app = client_app return app app_fn = _load diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 9a357726bdd..b911784c1f6 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -58,9 +58,10 @@ def run_simulation_from_cli() -> None: ) +# pylint: disable=too-many-arguments def run_serverapp_th( - server_app_attr: str, - server_app: ServerApp, + server_app_attr: Optional[str], + server_app: Optional[ServerApp], driver: Driver, server_app_dir: str, f_stop: asyncio.Event, From 5ef65ee657a7af15e4bec73f3a9d340af946263c Mon Sep 17 00:00:00 2001 From: jafermarq Date: Fri, 1 Mar 2024 21:34:06 +0000 Subject: [PATCH 087/103] handling asyncio event loop running by default in colab/jupyter --- src/py/flwr/simulation/run_simulation.py | 170 +++++++++++++++-------- 1 file changed, 112 insertions(+), 58 deletions(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index b911784c1f6..e87e4b48881 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ 
b/src/py/flwr/simulation/run_simulation.py @@ -19,7 +19,7 @@ import json import threading import traceback -from logging import ERROR, INFO, WARNING +from logging import DEBUG, ERROR, INFO, WARNING from time import sleep from typing import Any, Callable, Dict, Optional @@ -40,7 +40,7 @@ def run_simulation_from_cli() -> None: - """Run Simulation Engine from the CLI.""" + """Start Simulation Engine from the CLI.""" args = _parse_args_run_simulation().parse_args() # Load JSON config @@ -104,6 +104,87 @@ def execepthook(args: Any) -> None: return execepthook +def _main_loop( + num_supernodes: int, + backend_name: str, + backend_config_stream: str, + driver_api_address: str, + working_dir: str, + client_app: Optional[ClientApp] = None, + client_app_module_name: Optional[str] = None, + server_app: Optional[ServerApp] = None, + server_app_module_name: Optional[str] = None, +) -> None: + """Launch SuperLink with Simulation Engine, then ServerApp on a separate thread. + + Everything runs on the main thread or a separate one, depening on whether the main + thread already contains a running Asyncio event loop. This is the case if running + the Simulation Engine on a Jupyter/Colab notebook. 
+ """ + # Initialize StateFactory + state_factory = StateFactory(":flwr-in-memory-state:") + + # Start Driver API + driver_server: grpc.Server = run_driver_api_grpc( + address=driver_api_address, + state_factory=state_factory, + certificates=None, + ) + + f_stop = asyncio.Event() + serverapp_th = None + try: + # Initialize Driver + driver = Driver( + driver_service_address=driver_api_address, + root_certificates=None, + ) + + # Get and run ServerApp thread + serverapp_th = run_serverapp_th( + server_app_attr=server_app_module_name, + server_app=server_app, + driver=driver, + server_app_dir=working_dir, + f_stop=f_stop, + ) + # Setup an exception hook + threading.excepthook = get_thread_exception_hook(f_stop) + + # SuperLink with Simulation Engine + event(EventType.RUN_SUPERLINK_ENTER) + vce.start_vce( + num_supernodes=num_supernodes, + client_app_module_name=client_app_module_name, + client_app=client_app, + backend_name=backend_name, + backend_config_json_stream=backend_config_stream, + working_dir=working_dir, + state_factory=state_factory, + f_stop=f_stop, + ) + + except Exception as ex: + + log(ERROR, "An exception occured !! %s", ex) + log(ERROR, traceback.format_exc()) + raise RuntimeError("An error was encountered. 
Ending simulation.") from ex + + finally: + + # Stop Driver + driver_server.stop(grace=0) + del driver + # Trigger stop event + f_stop.set() + + event(EventType.RUN_SUPERLINK_LEAVE) + if serverapp_th: + serverapp_th.join() + + log(INFO, "Stopping Simulation Engine now.") + + # pylint: disable=too-many-arguments,too-many-locals def run_simulation( num_supernodes: int, @@ -180,68 +261,41 @@ def run_simulation( # Convert config to original JSON-stream format backend_config_stream = json.dumps(backend_config) - # Initialize StateFactory - state_factory = StateFactory(":flwr-in-memory-state:") - - # Start Driver API - driver_server: grpc.Server = run_driver_api_grpc( - address=driver_api_address, - state_factory=state_factory, - certificates=None, + simulation_engine_th = None + args = ( + num_supernodes, + backend_name, + backend_config_stream, + driver_api_address, + working_dir, + client_app, + client_app_module_name, + server_app, + server_app_module_name, ) - - f_stop = asyncio.Event() - serverapp_th = None + # Detect if there is an Asyncio event loop already running. + # If yes, run everything on a separate thread. In environmnets + # like Jupyter/Colab notebooks, there is an event loop present. 
+ run_in_thread = False try: - # Initialize Driver - driver = Driver( - driver_service_address=driver_api_address, - root_certificates=None, - ) - - # Get and run ServerApp thread - serverapp_th = run_serverapp_th( - server_app_attr=server_app_module_name, - server_app=server_app, - driver=driver, - server_app_dir=working_dir, - f_stop=f_stop, - ) - # Setup an exception hook - threading.excepthook = get_thread_exception_hook(f_stop) - - # SuperLink with Simulation Engine - event(EventType.RUN_SUPERLINK_ENTER) - vce.start_vce( - num_supernodes=num_supernodes, - client_app_module_name=client_app_module_name, - client_app=client_app, - backend_name=backend_name, - backend_config_json_stream=backend_config_stream, - working_dir=working_dir, - state_factory=state_factory, - f_stop=f_stop, - ) + _ = ( + asyncio.get_running_loop() + ) # Raises RuntimeError if no event loop is present + log(DEBUG, "Asyncio event loop already running.") - except Exception as ex: + run_in_thread = True - log(ERROR, "An exception occured !! %s", ex) - log(ERROR, traceback.format_exc()) - raise RuntimeError("An error was encountered. 
Ending simulation.") from ex + except RuntimeError: + log(DEBUG, "No asyncio event loop runnig") finally: - - # Stop Driver - driver_server.stop(grace=0) - del driver - # Trigger stop event - f_stop.set() - - event(EventType.RUN_SUPERLINK_LEAVE) - if serverapp_th: - serverapp_th.join() - - log(INFO, "Stopping Simulation Engine now.") + if run_in_thread: + log(DEBUG, "Starting Simulation Engine on a new thread.") + simulation_engine_th = threading.Thread(target=_main_loop, args=args) + simulation_engine_th.start() + else: + log(DEBUG, "Starting Simulation Engine on the main thread.") + _main_loop(*args) def _parse_args_run_simulation() -> argparse.ArgumentParser: From 5f16beecbfc14ab10214d4410fb9e8efddd34e3c Mon Sep 17 00:00:00 2001 From: jafermarq Date: Fri, 1 Mar 2024 21:57:19 +0000 Subject: [PATCH 088/103] join thread, else bad things happen --- src/py/flwr/simulation/run_simulation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index e87e4b48881..54e8c861029 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -293,6 +293,7 @@ def run_simulation( log(DEBUG, "Starting Simulation Engine on a new thread.") simulation_engine_th = threading.Thread(target=_main_loop, args=args) simulation_engine_th.start() + simulation_engine_th.join() else: log(DEBUG, "Starting Simulation Engine on the main thread.") _main_loop(*args) From 232a82b4f2cf9444cd9af8a2e8e8bacaf3767352 Mon Sep 17 00:00:00 2001 From: "Daniel J. 
Beutel" Date: Sat, 2 Mar 2024 17:37:14 +0100 Subject: [PATCH 089/103] Update src/py/flwr/server/run_serverapp.py --- src/py/flwr/server/run_serverapp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/server/run_serverapp.py b/src/py/flwr/server/run_serverapp.py index a1608fd15ec..c57a4a30c8a 100644 --- a/src/py/flwr/server/run_serverapp.py +++ b/src/py/flwr/server/run_serverapp.py @@ -39,7 +39,7 @@ def run( """Run ServerApp with a given Driver.""" if not (server_app_attr is None) ^ (loaded_server_app is None): raise ValueError( - "Either `server_app_attr` should `loaded_server_app` be set " + "Either `server_app_attr` or `loaded_server_app` should be set " "but not both. " ) From 7a569284196a7d0c69c9a8e6a026dc092437f076 Mon Sep 17 00:00:00 2001 From: "Daniel J. Beutel" Date: Sat, 2 Mar 2024 17:44:08 +0100 Subject: [PATCH 090/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 54e8c861029..a0b5be6f643 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -40,7 +40,7 @@ def run_simulation_from_cli() -> None: - """Start Simulation Engine from the CLI.""" + """Run Simulation Engine from the CLI.""" args = _parse_args_run_simulation().parse_args() # Load JSON config From 2859e3e27b54f9818bb1574439ce46a574586927 Mon Sep 17 00:00:00 2001 From: "Daniel J. 
Beutel" Date: Sat, 2 Mar 2024 17:53:49 +0100 Subject: [PATCH 091/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index a0b5be6f643..a6f91957837 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -166,7 +166,7 @@ def _main_loop( except Exception as ex: - log(ERROR, "An exception occured !! %s", ex) + log(ERROR, "An exception occurred !! %s", ex) log(ERROR, traceback.format_exc()) raise RuntimeError("An error was encountered. Ending simulation.") from ex From 8e0f95cfc8efd21a17402491476034c2bd96a03f Mon Sep 17 00:00:00 2001 From: jafermarq Date: Sat, 2 Mar 2024 23:11:17 +0000 Subject: [PATCH 092/103] exposing relevant args to entry point through python env / notebook; unifying names --- src/py/flwr/server/app.py | 6 +- .../server/superlink/fleet/vce/vce_api.py | 12 +-- .../superlink/fleet/vce/vce_api_test.py | 8 +- src/py/flwr/simulation/run_simulation.py | 81 +++++++++++++++---- 4 files changed, 80 insertions(+), 27 deletions(-) diff --git a/src/py/flwr/server/app.py b/src/py/flwr/server/app.py index 788ebeb8a45..01b1f622212 100644 --- a/src/py/flwr/server/app.py +++ b/src/py/flwr/server/app.py @@ -362,7 +362,7 @@ def run_superlink() -> None: f_stop = asyncio.Event() # Does nothing _run_fleet_api_vce( num_supernodes=args.num_supernodes, - client_app_module_name=args.client_app, + client_app_attr=args.client_app, backend_name=args.backend, backend_config_json_stream=args.backend_config, working_dir=args.dir, @@ -438,7 +438,7 @@ def _run_fleet_api_grpc_rere( # pylint: disable=too-many-arguments def _run_fleet_api_vce( num_supernodes: int, - client_app_module_name: str, + client_app_attr: str, backend_name: str, backend_config_json_stream: str, working_dir: str, @@ -449,7 +449,7 @@ def _run_fleet_api_vce( start_vce( 
num_supernodes=num_supernodes, - client_app_module_name=client_app_module_name, + client_app_attr=client_app_attr, backend_name=backend_name, backend_config_json_stream=backend_config_json_stream, state_factory=state_factory, diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api.py b/src/py/flwr/server/superlink/fleet/vce/vce_api.py index c03b57ddbb5..d42379960a6 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api.py @@ -224,15 +224,15 @@ def start_vce( working_dir: str, f_stop: asyncio.Event, client_app: Optional[ClientApp] = None, - client_app_module_name: Optional[str] = None, + client_app_attr: Optional[str] = None, num_supernodes: Optional[int] = None, state_factory: Optional[StateFactory] = None, existing_nodes_mapping: Optional[NodeToPartitionMapping] = None, ) -> None: """Start Fleet API with the Simulation Engine.""" - if client_app_module_name is not None and client_app is not None: + if client_app_attr is not None and client_app is not None: raise ValueError( - "Both `client_app_module_name` and `client_app` are provided, " + "Both `client_app_attr` and `client_app` are provided, " "but only one is allowed." 
) @@ -299,13 +299,13 @@ def backend_fn() -> Backend: """Instantiate a Backend.""" return backend_type(backend_config, work_dir=working_dir) - log(INFO, "client_app_module_name = %s", client_app_module_name) + log(INFO, "client_app_attr = %s", client_app_attr) # Load ClientApp if needed def _load() -> ClientApp: - if client_app_module_name: - app: ClientApp = load_client_app(client_app_module_name) + if client_app_attr: + app: ClientApp = load_client_app(client_app_attr) if client_app: app = client_app return app diff --git a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py index ea2de2e636b..16cb45c1262 100644 --- a/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py +++ b/src/py/flwr/server/superlink/fleet/vce/vce_api_test.py @@ -132,7 +132,7 @@ def _autoresolve_working_dir(rel_client_app_dir: str = "backend") -> str: # pylint: disable=too-many-arguments def start_and_shutdown( backend: str = "ray", - clientapp_module: str = "raybackend_test:client_app", + client_app_attr: str = "raybackend_test:client_app", working_dir: str = "", num_supernodes: Optional[int] = None, state_factory: Optional[StateFactory] = None, @@ -162,7 +162,7 @@ def start_and_shutdown( start_vce( num_supernodes=num_supernodes, - client_app_module_name=clientapp_module, + client_app_attr=client_app_attr, backend_name=backend, backend_config_json_stream=backend_config, state_factory=state_factory, @@ -183,7 +183,7 @@ def test_erroneous_no_supernodes_client_mapping(self) -> None: with self.assertRaises(ValueError): start_and_shutdown(duration=2) - def test_erroneous_clientapp_module_name(self) -> None: + def test_erroneous_client_app_attr(self) -> None: """Tests attempt to load a ClientApp that can't be found.""" num_messages = 7 num_nodes = 59 @@ -193,7 +193,7 @@ def test_erroneous_clientapp_module_name(self) -> None: ) with self.assertRaises(RuntimeError): start_and_shutdown( - clientapp_module="totally_fictitious_app:client", + 
client_app_attr="totally_fictitious_app:client", state_factory=state_factory, nodes_mapping=nodes_mapping, ) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index a6f91957837..9eb8012161c 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -39,6 +39,7 @@ ) +# Entry point from CLI def run_simulation_from_cli() -> None: """Run Simulation Engine from the CLI.""" args = _parse_args_run_simulation().parse_args() @@ -46,10 +47,10 @@ def run_simulation_from_cli() -> None: # Load JSON config backend_config_dict = json.loads(args.backend_config) - run_simulation( + _run_simulation( num_supernodes=args.num_supernodes, - client_app_module_name=args.client_app, - server_app_module_name=args.server_app, + client_app_attr=args.client_app, + server_app_attr=args.server_app, backend_name=args.backend, backend_config=backend_config_dict, working_dir=args.dir, @@ -58,6 +59,58 @@ def run_simulation_from_cli() -> None: ) +# Entry point from Python session (script or notebook) +# pylint: disable=too-many-arguments +def run_simulation( + num_supernodes: int, + client_app: ClientApp, + server_app: ServerApp, + backend_name: str = "ray", + backend_config: Optional[Dict[str, ConfigsRecordValues]] = None, + enable_tf_gpu_growth: bool = False, +) -> None: + r"""Launch the Simulation Engine. + + Parameters + ---------- + num_supernodes : int + Number of nodes that run a ClientApp. They can be sampled by a + Driver in the ServerApp and receive a Message describing what the ClientApp + should perform. + + client_app : ClientApp + The `ClientApp` to be executed by each of the `SuperNodes`. It will receive + messages sent by the `ServerApp`. + + server_app : ServerApp + The `ServerApp` to be executed. + + backend_name : str (default: ray) + A simulation backend that runs `ClientApp`s. 
+ + backend_config : Optional[Dict[str, ConfigsRecordValues]] + 'A dictionary, e.g {"":, "":} to configure a + backend. Values supported in are those included by + `flwr.common.typing.ConfigsRecordValues`. + + enable_tf_gpu_growth : bool (default: False) + A boolean to indicate whether to enable GPU growth on the main thread. This is + desirable if you make use of a TensorFlow model on your `ServerApp` while + having your `ClientApp` running on the same GPU. Without enabling this, you + might encounter an out-of-memory error becasue TensorFlow by default allocates + all GPU memory. Read mor about how `tf.config.experimental.set_memory_growth()` + works in the TensorFlow documentation: https://www.tensorflow.org/api/stable. + """ + _run_simulation( + num_supernodes=num_supernodes, + client_app=client_app, + server_app=server_app, + backend_name=backend_name, + backend_config=backend_config, + enable_tf_gpu_growth=enable_tf_gpu_growth, + ) + + # pylint: disable=too-many-arguments def run_serverapp_th( server_app_attr: Optional[str], @@ -111,9 +164,9 @@ def _main_loop( driver_api_address: str, working_dir: str, client_app: Optional[ClientApp] = None, - client_app_module_name: Optional[str] = None, + client_app_attr: Optional[str] = None, server_app: Optional[ServerApp] = None, - server_app_module_name: Optional[str] = None, + server_app_attr: Optional[str] = None, ) -> None: """Launch SuperLink with Simulation Engine, then ServerApp on a separate thread. 
@@ -142,7 +195,7 @@ def _main_loop( # Get and run ServerApp thread serverapp_th = run_serverapp_th( - server_app_attr=server_app_module_name, + server_app_attr=server_app_attr, server_app=server_app, driver=driver, server_app_dir=working_dir, @@ -155,7 +208,7 @@ def _main_loop( event(EventType.RUN_SUPERLINK_ENTER) vce.start_vce( num_supernodes=num_supernodes, - client_app_module_name=client_app_module_name, + client_app_attr=client_app_attr, client_app=client_app, backend_name=backend_name, backend_config_json_stream=backend_config_stream, @@ -186,14 +239,14 @@ def _main_loop( # pylint: disable=too-many-arguments,too-many-locals -def run_simulation( +def _run_simulation( num_supernodes: int, client_app: Optional[ClientApp] = None, server_app: Optional[ServerApp] = None, backend_name: str = "ray", backend_config: Optional[Dict[str, ConfigsRecordValues]] = None, - client_app_module_name: Optional[str] = None, - server_app_module_name: Optional[str] = None, + client_app_attr: Optional[str] = None, + server_app_attr: Optional[str] = None, working_dir: str = "", driver_api_address: str = "0.0.0.0:9091", enable_tf_gpu_growth: bool = False, @@ -222,11 +275,11 @@ def run_simulation( backend. Values supported in are those included by `flwr.common.typing.ConfigsRecordValues`. - client_app_module_name : str + client_app_attr : str A path to a `ClientApp` module to be loaded: For example: `client:app` or `project.package.module:wrapper.app`." - server_app_module_name : str + server_app_attr : str A path to a `ServerApp` module to be loaded: For example: `server:app` or `project.package.module:wrapper.app`." @@ -269,9 +322,9 @@ def run_simulation( driver_api_address, working_dir, client_app, - client_app_module_name, + client_app_attr, server_app, - server_app_module_name, + server_app_attr, ) # Detect if there is an Asyncio event loop already running. # If yes, run everything on a separate thread. 
In environmnets From 86f3761bc0625b2a49c5ad063e73ccf99b41fd9d Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 4 Mar 2024 20:24:58 +0000 Subject: [PATCH 093/103] moved input args; fix enable GPU growth in ServerApp thread; other minor --- src/py/flwr/server/run_serverapp.py | 6 --- src/py/flwr/simulation/run_simulation.py | 49 +++++++++++++++++------- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/src/py/flwr/server/run_serverapp.py b/src/py/flwr/server/run_serverapp.py index c57a4a30c8a..4431397a28e 100644 --- a/src/py/flwr/server/run_serverapp.py +++ b/src/py/flwr/server/run_serverapp.py @@ -16,7 +16,6 @@ import argparse -import asyncio import sys from logging import DEBUG, WARN from pathlib import Path @@ -34,7 +33,6 @@ def run( server_app_dir: str, server_app_attr: Optional[str] = None, loaded_server_app: Optional[ServerApp] = None, - stop_event: Optional[asyncio.Event] = None, ) -> None: """Run ServerApp with a given Driver.""" if not (server_app_attr is None) ^ (loaded_server_app is None): @@ -63,10 +61,6 @@ def _load() -> ServerApp: server_app(driver=driver, context=context) log(DEBUG, "ServerApp finished running.") - # Upon completion, trigger stop event if one was passed - if stop_event is not None: - log(DEBUG, "Triggering stop event.") - stop_event.set() def run_server_app() -> None: diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 44f50706af1..8f9bea29100 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -62,9 +62,9 @@ def run_simulation_from_cli() -> None: # Entry point from Python session (script or notebook) # pylint: disable=too-many-arguments def run_simulation( - num_supernodes: int, - client_app: ClientApp, server_app: ServerApp, + client_app: ClientApp, + num_supernodes: int, backend_name: str = "ray", backend_config: Optional[Dict[str, ConfigsRecordValues]] = None, enable_tf_gpu_growth: bool = False, @@ -73,17 +73,17 
@@ def run_simulation( Parameters ---------- - num_supernodes : int - Number of nodes that run a ClientApp. They can be sampled by a - Driver in the ServerApp and receive a Message describing what the ClientApp - should perform. + server_app : ServerApp + The `ServerApp` to be executed. client_app : ClientApp The `ClientApp` to be executed by each of the `SuperNodes`. It will receive messages sent by the `ServerApp`. - server_app : ServerApp - The `ServerApp` to be executed. + num_supernodes : int + Number of nodes that run a ClientApp. They can be sampled by a + Driver in the ServerApp and receive a Message describing what the ClientApp + should perform. backend_name : str (default: ray) A simulation backend that runs `ClientApp`s. @@ -118,18 +118,38 @@ def run_serverapp_th( driver: Driver, server_app_dir: str, f_stop: asyncio.Event, + enable_tf_gpu_growth: bool, delay_launch: int = 3, ) -> threading.Thread: """Run SeverApp in a thread.""" + + def server_th_with_start_checks( # type: ignore + tf_gpu_growth: bool, stop_event: asyncio.Event, **kwargs + ) -> None: + """Run SeverApp, after check if GPU memory grouwth has to be set.""" + try: + if tf_gpu_growth: + log(INFO, "Enabling GPU growth for Tensorflow on the main thread.") + enable_gpu_growth() + run(**kwargs) + except Exception as ex: # pylint: disable=broad-exception-caught + log(ERROR, "ServerApp thread raised an exception: %s", ex) + log(ERROR, traceback.format_exc()) + finally: + log(DEBUG, "ServerApp finished running.") + # Upon completion, trigger stop event if one was passed + if stop_event is not None: + log(DEBUG, "Triggering stop event.") + stop_event.set() + serverapp_th = threading.Thread( - target=run, + target=server_th_with_start_checks, + args=(enable_tf_gpu_growth, f_stop), kwargs={ "server_app_attr": server_app_attr, "loaded_server_app": server_app, "driver": driver, "server_app_dir": server_app_dir, - "stop_event": f_stop, # will be set when `run()` finishes - # will trigger the shutdown of 
the Simulation Engine }, ) sleep(delay_launch) @@ -157,12 +177,14 @@ def execepthook(args: Any) -> None: return execepthook +# pylint: disable=too-many-locals def _main_loop( num_supernodes: int, backend_name: str, backend_config_stream: str, driver_api_address: str, working_dir: str, + enable_tf_gpu_growth: bool, client_app: Optional[ClientApp] = None, client_app_attr: Optional[str] = None, server_app: Optional[ServerApp] = None, @@ -200,6 +222,7 @@ def _main_loop( driver=driver, server_app_dir=working_dir, f_stop=f_stop, + enable_tf_gpu_growth=enable_tf_gpu_growth, ) # Setup an exception hook threading.excepthook = get_thread_exception_hook(f_stop) @@ -301,10 +324,7 @@ def _run_simulation( if backend_config is None: backend_config = {} - # Enable GPU memory growth (relevant only for TF) if enable_tf_gpu_growth: - log(INFO, "Enabling GPU growth for Tensorflow on the main thread.") - enable_gpu_growth() # Check that Backend config has also enabled using GPU growth use_tf = backend_config.get("tensorflow", False) if not use_tf: @@ -321,6 +341,7 @@ def _run_simulation( backend_config_stream, driver_api_address, working_dir, + enable_tf_gpu_growth, client_app, client_app_attr, server_app, From f3ed0c9f8efe24b268d1c8de6102742837df8415 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 4 Mar 2024 21:07:41 +0000 Subject: [PATCH 094/103] simplifications; option `--verbose` --- src/py/flwr/simulation/run_simulation.py | 64 +++++++++++++----------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 8f9bea29100..2ac8d55f7c8 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -17,11 +17,12 @@ import argparse import asyncio import json +import logging import threading import traceback from logging import DEBUG, ERROR, INFO, WARNING from time import sleep -from typing import Any, Callable, Dict, Optional +from typing import 
Dict, Optional import grpc @@ -48,14 +49,15 @@ def run_simulation_from_cli() -> None: backend_config_dict = json.loads(args.backend_config) _run_simulation( - num_supernodes=args.num_supernodes, - client_app_attr=args.client_app, server_app_attr=args.server_app, + client_app_attr=args.client_app, + num_supernodes=args.num_supernodes, backend_name=args.backend, backend_config=backend_config_dict, working_dir=args.dir, driver_api_address=args.driver_api_address, enable_tf_gpu_growth=args.enable_tf_gpu_growth, + verbose_logging=args.verbose, ) @@ -68,6 +70,7 @@ def run_simulation( backend_name: str = "ray", backend_config: Optional[Dict[str, ConfigsRecordValues]] = None, enable_tf_gpu_growth: bool = False, + verbose_logging: bool = False, ) -> None: r"""Launch the Simulation Engine. @@ -100,6 +103,10 @@ def run_simulation( might encounter an out-of-memory error becasue TensorFlow by default allocates all GPU memory. Read mor about how `tf.config.experimental.set_memory_growth()` works in the TensorFlow documentation: https://www.tensorflow.org/api/stable. + + verbose_logging : bool (default: False) + When diabled, only INFO, WARNING and ERROR log messages will be shown. If + enabled, DEBUG-level logs will be displayed. """ _run_simulation( num_supernodes=num_supernodes, @@ -108,6 +115,7 @@ def run_simulation( backend_name=backend_name, backend_config=backend_config, enable_tf_gpu_growth=enable_tf_gpu_growth, + verbose_logging=verbose_logging, ) @@ -126,11 +134,16 @@ def run_serverapp_th( def server_th_with_start_checks( # type: ignore tf_gpu_growth: bool, stop_event: asyncio.Event, **kwargs ) -> None: - """Run SeverApp, after check if GPU memory grouwth has to be set.""" + """Run SeverApp, after check if GPU memory grouwth has to be set. + + Upon exception, trigger stop event for Simulation Engine. 
+ """ try: if tf_gpu_growth: log(INFO, "Enabling GPU growth for Tensorflow on the main thread.") enable_gpu_growth() + + # Run ServerApp run(**kwargs) except Exception as ex: # pylint: disable=broad-exception-caught log(ERROR, "ServerApp thread raised an exception: %s", ex) @@ -139,8 +152,8 @@ def server_th_with_start_checks( # type: ignore log(DEBUG, "ServerApp finished running.") # Upon completion, trigger stop event if one was passed if stop_event is not None: - log(DEBUG, "Triggering stop event.") stop_event.set() + log(WARNING, "Triggered stop event for Simulation Engine.") serverapp_th = threading.Thread( target=server_th_with_start_checks, @@ -157,26 +170,6 @@ def server_th_with_start_checks( # type: ignore return serverapp_th -def get_thread_exception_hook(stop_event: asyncio.Event) -> Callable[[Any], None]: - """Return a callback for when the ServerApp thread raises an exception.""" - - def execepthook(args: Any) -> None: - """Upon exception raised, log exception and trigger stop event.""" - # log - log( - ERROR, - "The ServerApp thread triggered exception (%s): %s", - args.exc_type, - args.exc_value, - ) - log(ERROR, traceback.format_exc()) - # Set stop event - stop_event.set() - log(WARNING, "Triggered stop event for Simulation Engine.") - - return execepthook - - # pylint: disable=too-many-locals def _main_loop( num_supernodes: int, @@ -224,8 +217,6 @@ def _main_loop( f_stop=f_stop, enable_tf_gpu_growth=enable_tf_gpu_growth, ) - # Setup an exception hook - threading.excepthook = get_thread_exception_hook(f_stop) # SuperLink with Simulation Engine event(EventType.RUN_SUPERLINK_ENTER) @@ -241,13 +232,11 @@ def _main_loop( ) except Exception as ex: - log(ERROR, "An exception occurred !! %s", ex) log(ERROR, traceback.format_exc()) raise RuntimeError("An error was encountered. 
Ending simulation.") from ex finally: - # Stop Driver driver_server.stop(grace=0) del driver @@ -273,6 +262,7 @@ def _run_simulation( working_dir: str = "", driver_api_address: str = "0.0.0.0:9091", enable_tf_gpu_growth: bool = False, + verbose_logging: bool = False, ) -> None: r"""Launch the Simulation Engine. @@ -320,7 +310,16 @@ def _run_simulation( might encounter an out-of-memory error becasue TensorFlow by default allocates all GPU memory. Read mor about how `tf.config.experimental.set_memory_growth()` works in the TensorFlow documentation: https://www.tensorflow.org/api/stable. + + verbose_logging : bool (default: False) + When diabled, only INFO, WARNING and ERROR log messages will be shown. If + enabled, DEBUG-level logs will be displayed. """ + # Set logging level + if not verbose_logging: + logger = logging.getLogger("flwr") + logger.setLevel(INFO) + if backend_config is None: backend_config = {} @@ -432,4 +431,11 @@ def _parse_args_run_simulation() -> argparse.ArgumentParser: " Default: current working directory.", ) + parser.add_argument( + "--verbose", + action="store_true", + help="When unset, only INFO, WARNING and ERROR log messages will be shown. " + "If set, DEBUG-level logs will be displayed. 
", + ) + return parser From 43bd2e18b6c28dbac0607d90c117bda7952f683c Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 5 Mar 2024 09:48:55 +0000 Subject: [PATCH 095/103] discarded changes to notebooks --- examples/simulation-pytorch/sim.ipynb | 55 +++++++++++------------- examples/simulation-tensorflow/sim.ipynb | 44 ++++++------------- 2 files changed, 38 insertions(+), 61 deletions(-) diff --git a/examples/simulation-pytorch/sim.ipynb b/examples/simulation-pytorch/sim.ipynb index 762911fdc5c..e27721a7fa5 100644 --- a/examples/simulation-pytorch/sim.ipynb +++ b/examples/simulation-pytorch/sim.ipynb @@ -20,8 +20,9 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -q \"flwr[simulation]\"\n", - "!pip install -q \"flwr_datasets[vision]\"" + "# depending on your shell, you might need to add `\\` before `[` and `]`.\n", + "!pip install -q flwr[simulation]\n", + "!pip install flwr_datasets[vision]" ] }, { @@ -62,7 +63,7 @@ }, "outputs": [], "source": [ - "!pip install -q matplotlib" + "!pip install matplotlib" ] }, { @@ -510,7 +511,10 @@ " # Create and return client\n", " return FlowerClient(trainloader, valloader).to_client()\n", "\n", - " return client_fn" + " return client_fn\n", + "\n", + "\n", + "client_fn_callback = get_client_fn(mnist_fds)" ] }, { @@ -532,33 +536,22 @@ }, "outputs": [], "source": [ - "# ClientApp for Flower-Next\n", - "client_app = fl.client.ClientApp(\n", - " client_fn=get_client_fn(mnist_fds),\n", - ")\n", - "\n", - "# ServerApp for Flower-Next\n", - "server_app = fl.server.ServerApp(\n", - " config=fl.server.ServerConfig(num_rounds=10),\n", - " strategy=strategy,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, launch the simulation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fl.simulation.run_simulation(\n", - " server_app=server_app, client_app=client_app, num_supernodes=NUM_CLIENTS\n", + "# With a dictionary, you 
tell Flower's VirtualClientEngine that each\n", + "# client needs exclusive access to these many resources in order to run\n", + "client_resources = {\"num_cpus\": 1, \"num_gpus\": 0.0}\n", + "\n", + "# Let's disable tqdm progress bar in the main thread (used by the server)\n", + "disable_progress_bar()\n", + "\n", + "history = fl.simulation.start_simulation(\n", + " client_fn=client_fn_callback, # a callback to construct a client\n", + " num_clients=NUM_CLIENTS, # total number of clients in the experiment\n", + " config=fl.server.ServerConfig(num_rounds=10), # let's run for 10 rounds\n", + " strategy=strategy, # the strategy that will orchestrate the whole FL pipeline\n", + " client_resources=client_resources,\n", + " actor_kwargs={\n", + " \"on_actor_init_fn\": disable_progress_bar # disable tqdm on each actor/process spawning virtual clients\n", + " },\n", ")" ] }, diff --git a/examples/simulation-tensorflow/sim.ipynb b/examples/simulation-tensorflow/sim.ipynb index 6c08666b6e4..9acfba99237 100644 --- a/examples/simulation-tensorflow/sim.ipynb +++ b/examples/simulation-tensorflow/sim.ipynb @@ -17,8 +17,8 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -q \"flwr[simulation]\" tensorflow\n", - "!pip install -q \"flwr_datasets[vision]\"" + "!pip install -q flwr[\"simulation\"] tensorflow\n", + "!pip install -q flwr_datasets[\"vision\"]" ] }, { @@ -34,7 +34,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -q matplotlib" + "!pip install matplotlib" ] }, { @@ -265,36 +265,20 @@ " evaluate_fn=get_evaluate_fn(centralized_testset), # global evaluation function\n", ")\n", "\n", - "# ClientApp for Flower-Next\n", - "client_app = fl.client.ClientApp(\n", - " client_fn=get_client_fn(mnist_fds),\n", - ")\n", + "# With a dictionary, you tell Flower's VirtualClientEngine that each\n", + "# client needs exclusive access to these many resources in order to run\n", + "client_resources = {\"num_cpus\": 1, \"num_gpus\": 0.0}\n", "\n", - "# ServerApp for 
Flower-Next\n", - "server_app = fl.server.ServerApp(\n", + "# Start simulation\n", + "history = fl.simulation.start_simulation(\n", + " client_fn=get_client_fn(mnist_fds),\n", + " num_clients=NUM_CLIENTS,\n", " config=fl.server.ServerConfig(num_rounds=10),\n", " strategy=strategy,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's lauch the simulation:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fl.simulation.run_simulation(\n", - " server_app=server_app,\n", - " client_app=client_app,\n", - " num_supernodes=NUM_CLIENTS,\n", - " enable_tf_gpu_growth=True,\n", + " client_resources=client_resources,\n", + " actor_kwargs={\n", + " \"on_actor_init_fn\": enable_tf_gpu_growth # Enable GPU growth upon actor init.\n", + " },\n", ")" ] }, From 91683fe5c13ec3eab92bbbaf44be8bee190c3cae Mon Sep 17 00:00:00 2001 From: "Daniel J. Beutel" Date: Tue, 5 Mar 2024 10:53:17 +0100 Subject: [PATCH 096/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 2ac8d55f7c8..b214cdb2da7 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -72,7 +72,7 @@ def run_simulation( enable_tf_gpu_growth: bool = False, verbose_logging: bool = False, ) -> None: - r"""Launch the Simulation Engine. + r"""Run a Flower App using the Simulation Engine. Parameters ---------- From 8531ab6e6cb524de8f5b7236ff9f22c9993d87b5 Mon Sep 17 00:00:00 2001 From: "Daniel J. 
Beutel" Date: Tue, 5 Mar 2024 10:54:28 +0100 Subject: [PATCH 097/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index b214cdb2da7..7a5a8dc0c5e 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -77,7 +77,7 @@ def run_simulation( Parameters ---------- server_app : ServerApp - The `ServerApp` to be executed. + The `ServerApp` to be executed. It will send messages to different `ClientApp` instances. client_app : ClientApp The `ClientApp` to be executed by each of the `SuperNodes`. It will receive From 0de86ad155b1fb608452558724c53a2970eac0fc Mon Sep 17 00:00:00 2001 From: "Daniel J. Beutel" Date: Tue, 5 Mar 2024 10:55:22 +0100 Subject: [PATCH 098/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 7a5a8dc0c5e..94950e68f92 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -77,7 +77,8 @@ def run_simulation( Parameters ---------- server_app : ServerApp - The `ServerApp` to be executed. It will send messages to different `ClientApp` instances. + The `ServerApp` to be executed. It will send messages to different `ClientApp` + instances. client_app : ClientApp The `ClientApp` to be executed by each of the `SuperNodes`. It will receive From a7e875a5f557fdd320fb3a5306c042939534cb69 Mon Sep 17 00:00:00 2001 From: "Daniel J. 
Beutel" Date: Tue, 5 Mar 2024 11:00:08 +0100 Subject: [PATCH 099/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 94950e68f92..3cd40643210 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -78,7 +78,7 @@ def run_simulation( ---------- server_app : ServerApp The `ServerApp` to be executed. It will send messages to different `ClientApp` - instances. + instances running on different (virtual) SuperNodes. client_app : ClientApp The `ClientApp` to be executed by each of the `SuperNodes`. It will receive From 6a18a1e22624b9499a73caf63b9a8bcbc6630872 Mon Sep 17 00:00:00 2001 From: "Daniel J. Beutel" Date: Tue, 5 Mar 2024 11:01:00 +0100 Subject: [PATCH 100/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 3cd40643210..85a87c4bede 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -81,7 +81,7 @@ def run_simulation( instances running on different (virtual) SuperNodes. client_app : ClientApp - The `ClientApp` to be executed by each of the `SuperNodes`. It will receive + The `ClientApp` to be executed by each of the SuperNodes. It will receive messages sent by the `ServerApp`. num_supernodes : int From bceabb06d1950ec3c5dbbca27c75ea121161ad8f Mon Sep 17 00:00:00 2001 From: "Daniel J. 
Beutel" Date: Tue, 5 Mar 2024 11:02:01 +0100 Subject: [PATCH 101/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 85a87c4bede..a48fb833edd 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -93,7 +93,7 @@ def run_simulation( A simulation backend that runs `ClientApp`s. backend_config : Optional[Dict[str, ConfigsRecordValues]] - 'A dictionary, e.g {"":, "":} to configure a + 'A dictionary, e.g {"": , "": } to configure a backend. Values supported in are those included by `flwr.common.typing.ConfigsRecordValues`. From 037eda43b5898add84b4131836c289f93e8c2fe5 Mon Sep 17 00:00:00 2001 From: "Daniel J. Beutel" Date: Tue, 5 Mar 2024 11:03:47 +0100 Subject: [PATCH 102/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index a48fb833edd..2cc3b21af4c 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -101,7 +101,7 @@ def run_simulation( A boolean to indicate whether to enable GPU growth on the main thread. This is desirable if you make use of a TensorFlow model on your `ServerApp` while having your `ClientApp` running on the same GPU. Without enabling this, you - might encounter an out-of-memory error becasue TensorFlow by default allocates + might encounter an out-of-memory error because TensorFlow, by default, allocates all GPU memory. Read mor about how `tf.config.experimental.set_memory_growth()` works in the TensorFlow documentation: https://www.tensorflow.org/api/stable. From af9760975a79bac5a69a9ca0f69f87358cf4f06a Mon Sep 17 00:00:00 2001 From: "Daniel J. 
Beutel" Date: Tue, 5 Mar 2024 11:04:13 +0100 Subject: [PATCH 103/103] Update src/py/flwr/simulation/run_simulation.py --- src/py/flwr/simulation/run_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/flwr/simulation/run_simulation.py b/src/py/flwr/simulation/run_simulation.py index 2cc3b21af4c..cb68221ea58 100644 --- a/src/py/flwr/simulation/run_simulation.py +++ b/src/py/flwr/simulation/run_simulation.py @@ -102,7 +102,7 @@ def run_simulation( desirable if you make use of a TensorFlow model on your `ServerApp` while having your `ClientApp` running on the same GPU. Without enabling this, you might encounter an out-of-memory error because TensorFlow, by default, allocates - all GPU memory. Read mor about how `tf.config.experimental.set_memory_growth()` + all GPU memory. Read more about how `tf.config.experimental.set_memory_growth()` works in the TensorFlow documentation: https://www.tensorflow.org/api/stable. verbose_logging : bool (default: False)