Dev/v0.7.0 #1500

Open · wants to merge 3 commits into base: dev/v0.7.0
18 changes: 18 additions & 0 deletions python/examples/simulation/mpi_torch_hierarchical_fl/README.md
@@ -0,0 +1,18 @@
# Install FedML and Prepare the Distributed Environment
```
pip install fedml
```


# Run the example

## MPI hierarchical FL
```
sh run_step_by_step_example.sh 5 config/mnist_lr/fedml_config.yaml
```

## MPI hierarchical FL with a custom topology (e.g., 2d_torus, star, complete, isolated, balanced_tree, or random)
```
sh run_step_by_step_example.sh 5 config/mnist_lr/fedml_config_topo.yaml
```
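
The topology names above correspond to the `generate_custom_topology` API that this PR adds to `SymmetricTopologyManager` (see the diff further down). Below is a minimal sketch of how a topology could be generated and inspected outside of an MPI run; the import path and the shape of the `args` object are assumptions based on that diff rather than documented behavior:
```
from types import SimpleNamespace

# Assumed import path; the class is defined in FedML's topology module (see the diff below).
from fedml.core.distributed.topology.symmetric_topology_manager import SymmetricTopologyManager

# generate_custom_topology() reads topo_name from an args-like object
# (plus topo_edge_probability when topo_name == "random").
args = SimpleNamespace(topo_name="star", topo_edge_probability=1.0)

tpmgr = SymmetricTopologyManager(5, 2)  # 5 nodes; neighbor_num is used by the ring and balanced_tree topologies
tpmgr.generate_custom_topology(args)
print(tpmgr.topology)  # weight/adjacency matrix describing the star overlay
```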

Empty file.
47 changes: 47 additions & 0 deletions python/examples/simulation/mpi_torch_hierarchical_fl/batch_run.sh
@@ -0,0 +1,47 @@
#!/usr/bin/env bash

GROUP_NUM=5
GROUP_METHOD="hetero"
COMM_ROUND=62 #250
GROUP_COMM_ROUND=4 # 1
TOPO_NAME="star"
CONFIG_PATH=config/mnist_lr/fedml_config_topo.yaml

group_alpha_list=(0.01 0.1 1.0)

WORKER_NUM=$(($GROUP_NUM+1))
hostname > mpi_host_file
mkdir -p batch_log
# we need to install yq (https://github.com/mikefarah/yq)
# wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && chmod +x /usr/bin/yq

yq -i ".device_args.worker_num = ${WORKER_NUM}" $CONFIG_PATH
yq -i ".device_args.gpu_mapping_key = \"mapping_config1_${WORKER_NUM}\"" $CONFIG_PATH
yq -i ".train_args.group_num = ${GROUP_NUM}" $CONFIG_PATH
yq -i ".train_args.comm_round = ${COMM_ROUND}" $CONFIG_PATH
yq -i ".train_args.group_comm_round = ${GROUP_COMM_ROUND}" $CONFIG_PATH
yq -i ".train_args.group_method = \"${GROUP_METHOD}\"" $CONFIG_PATH
yq -i ".train_args.topo_name = \"${TOPO_NAME}\"" $CONFIG_PATH

if [ "${GROUP_METHOD}" = "random" ]; then
yq -i ".train_args.group_alpha = 0" $CONFIG_PATH
fi

if [ "${TOPO_NAME}" != "random" ]; then
yq -i ".train_args.topo_edge_probability = 1.0" $CONFIG_PATH
fi


for group_alpha in "${group_alpha_list[@]}";
do
  echo "group_alpha=$group_alpha"
  yq -i ".train_args.group_alpha = ${group_alpha}" $CONFIG_PATH

  nohup mpirun -np $WORKER_NUM \
    -hostfile mpi_host_file \
    python torch_step_by_step_example.py --cf $CONFIG_PATH \
    > batch_log/"group_alpha=$group_alpha.log" 2>&1 & echo $! >> batch_log/group_alpha.pid
  sleep 30
done

echo "Finished!"
@@ -0,0 +1,49 @@
common_args:
  training_type: "simulation"
  random_seed: 0

data_args:
  dataset: "mnist"
  data_cache_dir: ~/fedml_data
  partition_method: "hetero"
  partition_alpha: 0.5

model_args:
  model: "lr"

train_args:
  federated_optimizer: "HierarchicalFL"
  client_id_list: "[]"
  client_num_in_total: 1000
  client_num_per_round: 20
  comm_round: 20
  epochs: 1
  batch_size: 10
  client_optimizer: sgd
  learning_rate: 0.03
  weight_decay: 0.001
  group_method: "random"
  group_num: 4
  group_comm_round: 5

validation_args:
  frequency_of_the_test: 5

device_args:
  worker_num: 5
  using_gpu: true
  gpu_mapping_file: config/mnist_lr/gpu_mapping.yaml
  gpu_mapping_key: mapping_config1_5

comm_args:
  backend: "MPI"
  is_mobile: 0

tracking_args:
  # When running on the MLOps platform (open.fedml.ai), the default log paths are ~/fedml-client/fedml/logs/ and ~/fedml-server/fedml/logs/
  enable_wandb: true
  wandb_key: ee0b5f53d949c84cee7decbe7a629e63fb2f8408
  wandb_project: fedml
  run_name: mpi_hierarchical_fl_mnist_lr
  wandb_only_server: true
@@ -0,0 +1,52 @@
common_args:
  training_type: "simulation"
  random_seed: 0

data_args:
  dataset: "mnist"
  data_cache_dir: ~/fedml_data
  partition_method: "hetero"
  partition_alpha: 0.5

model_args:
  model: "lr"

train_args:
  federated_optimizer: "HierarchicalFL"
  client_id_list: "[]"
  client_num_in_total: 1000
  client_num_per_round: 20
  comm_round: 20
  epochs: 1
  batch_size: 10
  client_optimizer: sgd
  learning_rate: 0.03
  weight_decay: 0.001
  group_method: "hetero"
  group_alpha: 0.5
  group_num: 4
  group_comm_round: 5
  topo_name: "ring"
  topo_edge_probability: 0.5

validation_args:
  frequency_of_the_test: 5

device_args:
  worker_num: 5
  using_gpu: true
  gpu_mapping_file: config/mnist_lr/gpu_mapping.yaml
  gpu_mapping_key: mapping_config1_5

comm_args:
  backend: "MPI"
  is_mobile: 0

tracking_args:
  # When running on the MLOps platform (open.fedml.ai), the default log paths are ~/fedml-client/fedml/logs/ and ~/fedml-server/fedml/logs/
  enable_wandb: true
  wandb_key: ee0b5f53d949c84cee7decbe7a629e63fb2f8408
  wandb_project: fedml
  run_name: mpi_hierarchical_fl_mnist_lr
  wandb_only_server: true
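
Note: FedML flattens these YAML sections onto the `args` object returned by `fedml.init()`, which is consistent with how the topology manager below reads `args.topo_name` and `args.topo_edge_probability`. A rough sketch of a quick config check, assuming `fedml.init()` can be invoked outside of `mpirun` and passing the config via `--cf config/mnist_lr/fedml_config_topo.yaml`:
```
import fedml

if __name__ == "__main__":
    # fedml.init() parses --cf <config>.yaml and exposes the config keys as attributes.
    args = fedml.init()
    print(args.topo_name)              # "ring" with the config above
    print(args.group_comm_round)       # 5
    print(args.topo_edge_probability)  # 0.5
```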
@@ -0,0 +1,70 @@
# You can define a cluster containing multiple GPUs within multiple machines by defining `gpu_mapping.yaml` as follows:

# config_cluster0:
# host_name_node0: [num_of_processes_on_GPU0, num_of_processes_on_GPU1, num_of_processes_on_GPU2, num_of_processes_on_GPU3, ..., num_of_processes_on_GPU_n]
# host_name_node1: [num_of_processes_on_GPU0, num_of_processes_on_GPU1, num_of_processes_on_GPU2, num_of_processes_on_GPU3, ..., num_of_processes_on_GPU_n]
# host_name_node_m: [num_of_processes_on_GPU0, num_of_processes_on_GPU1, num_of_processes_on_GPU2, num_of_processes_on_GPU3, ..., num_of_processes_on_GPU_n]


# this is used for 10 clients and 1 server training within a single machine which has 4 GPUs
mapping_default:
  ChaoyangHe-GPU-RTX2080Tix4: [3, 3, 3, 2]

mapping_config1_2:
  host1: [1, 1]

mapping_config1_3:
  host1: [1, 1, 1]

# this is used for 4 clients and 1 server training within a single machine which has 4 GPUs
mapping_config1_5:
  host1: [2, 1, 1, 1]

# this is used for 5 clients and 1 server training within a single machine which has 4 GPUs
mapping_config1_6:
  host1: [2, 2, 1, 1]

# this is used for 10 clients and 1 server training within a single machine which has 4 GPUs
mapping_config2_11:
  host1: [3, 3, 3, 2]

# this is used for 10 clients and 1 server training within a single machine which has 8 GPUs
mapping_config3_11:
  host1: [2, 2, 2, 1, 1, 1, 1, 1]

# this is used for 4 clients and 1 server training within a single machine which has 8 GPUs, but you hope to skip some GPU device IDs.
mapping_config4_5:
  host1: [1, 0, 0, 1, 1, 0, 1, 1]

# this is used for 4 clients and 1 server training using 5 machines, each machine has 2 GPUs inside, but you hope to use only the second GPU.
mapping_config5_6:
  host1: [0, 1]
  host2: [0, 1]
  host3: [0, 1]
  host4: [0, 1]
  host5: [0, 1]

# this is used for 4 clients and 1 server training using 2 machines, each machine has 2 GPUs inside.
mapping_config5_2:
  gpu-worker2: [1, 1]
  gpu-worker1: [2, 1]

# this is used for 10 clients and 1 server training using 4 machines, each machine has 2 GPUs inside.
mapping_config5_4:
  gpu-worker2: [1, 1]
  gpu-worker1: [2, 1]
  gpu-worker3: [3, 1]
  gpu-worker4: [1, 1]

# for grpc GPU mapping
mapping_FedML_gRPC:
  hostname_node_server: [1]
  hostname_node_1: [1, 0, 0, 0]
  hostname_node_2: [1, 0, 0, 0]

# for torch RPC GPU mapping
mapping_FedML_tRPC:
  lambda-server1: [0, 0, 0, 0, 2, 2, 1, 1]
  lambda-server2: [2, 1, 1, 1, 0, 0, 0, 0]

#mapping_FedML_tRPC:
#  lambda-server1: [0, 0, 0, 0, 3, 3, 3, 2]
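
As the comments in this file suggest, the per-GPU process counts under a mapping key should add up to `device_args.worker_num` for the run that uses it (for example, `mapping_config1_5` sums to 5, matching `worker_num: 5` in the configs above). A tiny illustrative check, with the mapping hard-coded rather than parsed from the YAML:
```
# Illustrative only: each MPI process (server + clients) needs one slot in the mapping.
mapping_config1_5 = {"host1": [2, 1, 1, 1]}
worker_num = 5  # device_args.worker_num in fedml_config.yaml

total_slots = sum(sum(gpu_counts) for gpu_counts in mapping_config1_5.values())
assert total_slots == worker_num, (total_slots, worker_num)
```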
@@ -0,0 +1 @@
liuxuezheng3
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

WORKER_NUM=$1
CONFIG_PATH=$2

hostname > mpi_host_file

mpirun -np $WORKER_NUM \
-hostfile mpi_host_file \
python torch_step_by_step_example.py --cf $CONFIG_PATH
@@ -0,0 +1 @@
import fedml
from fedml import FedMLRunner

if __name__ == "__main__":
    # init FedML framework
    args = fedml.init()

    # init device
    device = fedml.device.get_device(args)

    # load data
    dataset, output_dim = fedml.data.load(args)

    # load model
    model = fedml.model.create(args, output_dim)

    # start training
    fedml_runner = FedMLRunner(args, device, dataset, model)
    fedml_runner.run()
@@ -2,6 +2,7 @@
import numpy as np

from .base_topology_manager import BaseTopologyManager
from .topo_utils import *


class SymmetricTopologyManager(BaseTopologyManager):
@@ -18,6 +19,27 @@ def __init__(self, n, neighbor_num=2):
        self.neighbor_num = neighbor_num
        self.topology = []

    def generate_custom_topology(self, args):
        topo_name = args.topo_name
        if topo_name == 'ring':
            self.neighbor_num = 2
            self.generate_topology()
        elif topo_name == '2d_torus':
            self.topology = get_2d_torus_overlay(self.n)
        elif topo_name == 'star':
            self.topology = get_star_overlay(self.n)
        elif topo_name == 'complete':
            self.topology = get_complete_overlay(self.n)
        elif topo_name == 'isolated':
            self.topology = get_isolated_overlay(self.n)
        elif topo_name == 'balanced_tree':
            self.topology = get_balanced_tree_overlay(self.n, self.neighbor_num)
        elif topo_name == 'random':
            probability = args.topo_edge_probability  # probability for edge creation
            self.topology = get_random_overlay(self.n, probability)
        else:
            raise ValueError(f"unknown topo_name: {topo_name}")

    def generate_topology(self):
        # first generate a ring topology
        topology_ring = np.array(
@@ -84,8 +106,9 @@ def get_out_neighbor_idx_list(self, node_index):

if __name__ == "__main__":
    # generate a ring topology
    tpmgr = SymmetricTopologyManager(6, 2)
    tpmgr.generate_topology()
    tpmgr = SymmetricTopologyManager(9, 2)
    # tpmgr.generate_topology()
    # generate_custom_topology() expects an args-like object carrying topo_name
    # (and topo_edge_probability for the random overlay), not a plain string.
    class _TopoArgs:
        topo_name = "random"
        topo_edge_probability = 0.3

    tpmgr.generate_custom_topology(_TopoArgs())
    print("tpmgr.topology = " + str(tpmgr.topology))

    # get the OUT neighbor weights for node 1