Dev/v0.7.0 #1500

Open · wants to merge 3 commits into base: dev/v0.7.0
18 changes: 18 additions & 0 deletions python/examples/simulation/mpi_torch_hierarchical_fl/README.md
@@ -0,0 +1,18 @@
# Install FedML and Prepare the Distributed Environment
```
pip install fedml
```


# Run the example

## MPI hierarchical FL
```
sh run_step_by_step_example.sh 5 config/mnist_lr/fedml_config.yaml
```

## MPI hierarchical FL with a custom topology (e.g., 2d_torus, star, complete, isolated, balanced_tree, or random)
```
sh run_step_by_step_example.sh 5 config/mnist_lr/fedml_config_topo.yaml
```
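
The topology names above correspond to the `generate_custom_topology` API that this PR adds to `SymmetricTopologyManager` (see the diff further down). Below is a minimal sketch of how a topology could be generated and inspected outside of an MPI run; the import path and the shape of the `args` object are assumptions based on that diff rather than documented behavior:
```
from types import SimpleNamespace

# Assumed import path; the class is defined in FedML's topology module (see the diff below).
from fedml.core.distributed.topology.symmetric_topology_manager import SymmetricTopologyManager

# generate_custom_topology() reads topo_name from an args-like object
# (plus topo_edge_probability when topo_name == "random").
args = SimpleNamespace(topo_name="star", topo_edge_probability=1.0)

tpmgr = SymmetricTopologyManager(5, 2)  # 5 nodes; neighbor_num is used by the ring and balanced_tree topologies
tpmgr.generate_custom_topology(args)
print(tpmgr.topology)  # weight/adjacency matrix describing the star overlay
```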

Empty file.
47 changes: 47 additions & 0 deletions python/examples/simulation/mpi_torch_hierarchical_fl/batch_run.sh
@@ -0,0 +1,47 @@
#!/usr/bin/env bash

GROUP_NUM=5
GROUP_METHOD="hetero"
COMM_ROUND=62 #250
GROUP_COMM_ROUND=4 # 1
TOPO_NAME="star"
CONFIG_PATH=config/mnist_lr/fedml_config_topo.yaml

group_alpha_list=(0.01 0.1 1.0)

WORKER_NUM=$(($GROUP_NUM+1))
hostname > mpi_host_file
mkdir -p batch_log
# we need to install yq (https://github.com/mikefarah/yq)
# wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && chmod +x /usr/bin/yq

yq -i ".device_args.worker_num = ${WORKER_NUM}" $CONFIG_PATH
yq -i ".device_args.gpu_mapping_key = \"mapping_config1_${WORKER_NUM}\"" $CONFIG_PATH
yq -i ".train_args.group_num = ${GROUP_NUM}" $CONFIG_PATH
yq -i ".train_args.comm_round = ${COMM_ROUND}" $CONFIG_PATH
yq -i ".train_args.group_comm_round = ${GROUP_COMM_ROUND}" $CONFIG_PATH
yq -i ".train_args.group_method = \"${GROUP_METHOD}\"" $CONFIG_PATH
yq -i ".train_args.topo_name = \"${TOPO_NAME}\"" $CONFIG_PATH

if [ "${GROUP_METHOD}" = "random" ]; then
yq -i ".train_args.group_alpha = 0" $CONFIG_PATH
fi

if [ "${TOPO_NAME}" != "random" ]; then
yq -i ".train_args.topo_edge_probability = 1.0" $CONFIG_PATH
fi


for group_alpha in "${group_alpha_list[@]}";
do
  echo "group_alpha=$group_alpha"
  yq -i ".train_args.group_alpha = ${group_alpha}" $CONFIG_PATH

  nohup mpirun -np $WORKER_NUM \
    -hostfile mpi_host_file \
    python torch_step_by_step_example.py --cf $CONFIG_PATH \
    > batch_log/"group_alpha=$group_alpha.log" 2>&1 & echo $! >> batch_log/group_alpha.pid
  sleep 30
done

echo "Finished!"
@@ -0,0 +1,49 @@
common_args:
  training_type: "simulation"
  random_seed: 0

data_args:
  dataset: "mnist"
  data_cache_dir: ~/fedml_data
  partition_method: "hetero"
  partition_alpha: 0.5

model_args:
  model: "lr"

train_args:
  federated_optimizer: "HierarchicalFL"
  client_id_list: "[]"
  client_num_in_total: 1000
  client_num_per_round: 20
  comm_round: 20
  epochs: 1
  batch_size: 10
  client_optimizer: sgd
  learning_rate: 0.03
  weight_decay: 0.001
  group_method: "random"
  group_num: 4
  group_comm_round: 5

validation_args:
  frequency_of_the_test: 5

device_args:
  worker_num: 5
  using_gpu: true
  gpu_mapping_file: config/mnist_lr/gpu_mapping.yaml
  gpu_mapping_key: mapping_config1_5

comm_args:
  backend: "MPI"
  is_mobile: 0

tracking_args:
  # When running on the MLOps platform (open.fedml.ai), the default log paths are ~/fedml-client/fedml/logs/ and ~/fedml-server/fedml/logs/
  enable_wandb: true
  wandb_key: ee0b5f53d949c84cee7decbe7a629e63fb2f8408
  wandb_project: fedml
  run_name: mpi_hierarchical_fl_mnist_lr
  wandb_only_server: true
@@ -0,0 +1,52 @@
common_args:
  training_type: "simulation"
  random_seed: 0

data_args:
  dataset: "mnist"
  data_cache_dir: ~/fedml_data
  partition_method: "hetero"
  partition_alpha: 0.5

model_args:
  model: "lr"

train_args:
  federated_optimizer: "HierarchicalFL"
  client_id_list: "[]"
  client_num_in_total: 1000
  client_num_per_round: 20
  comm_round: 20
  epochs: 1
  batch_size: 10
  client_optimizer: sgd
  learning_rate: 0.03
  weight_decay: 0.001
  group_method: "hetero"
  group_alpha: 0.5
  group_num: 4
  group_comm_round: 5
  topo_name: "ring"
  topo_edge_probability: 0.5

validation_args:
  frequency_of_the_test: 5

device_args:
  worker_num: 5
  using_gpu: true
  gpu_mapping_file: config/mnist_lr/gpu_mapping.yaml
  gpu_mapping_key: mapping_config1_5

comm_args:
  backend: "MPI"
  is_mobile: 0

tracking_args:
  # When running on the MLOps platform (open.fedml.ai), the default log paths are ~/fedml-client/fedml/logs/ and ~/fedml-server/fedml/logs/
  enable_wandb: true
  wandb_key: ee0b5f53d949c84cee7decbe7a629e63fb2f8408
  wandb_project: fedml
  run_name: mpi_hierarchical_fl_mnist_lr
  wandb_only_server: true
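
Note: FedML flattens these YAML sections onto the `args` object returned by `fedml.init()`, which is consistent with how the topology manager below reads `args.topo_name` and `args.topo_edge_probability`. A rough sketch of a quick config check, assuming `fedml.init()` can be invoked outside of `mpirun` and passing the config via `--cf config/mnist_lr/fedml_config_topo.yaml`:
```
import fedml

if __name__ == "__main__":
    # fedml.init() parses --cf <config>.yaml and exposes the config keys as attributes.
    args = fedml.init()
    print(args.topo_name)              # "ring" with the config above
    print(args.group_comm_round)       # 5
    print(args.topo_edge_probability)  # 0.5
```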
@@ -0,0 +1,70 @@
# You can define a cluster containing multiple GPUs within multiple machines by defining `gpu_mapping.yaml` as follows:

# config_cluster0:
# host_name_node0: [num_of_processes_on_GPU0, num_of_processes_on_GPU1, num_of_processes_on_GPU2, num_of_processes_on_GPU3, ..., num_of_processes_on_GPU_n]
# host_name_node1: [num_of_processes_on_GPU0, num_of_processes_on_GPU1, num_of_processes_on_GPU2, num_of_processes_on_GPU3, ..., num_of_processes_on_GPU_n]
# host_name_node_m: [num_of_processes_on_GPU0, num_of_processes_on_GPU1, num_of_processes_on_GPU2, num_of_processes_on_GPU3, ..., num_of_processes_on_GPU_n]


# this is used for 10 clients and 1 server training within a single machine which has 4 GPUs
mapping_default:
  ChaoyangHe-GPU-RTX2080Tix4: [3, 3, 3, 2]

mapping_config1_2:
  host1: [1, 1]

mapping_config1_3:
  host1: [1, 1, 1]

# this is used for 4 clients and 1 server training within a single machine which has 4 GPUs
mapping_config1_5:
  host1: [2, 1, 1, 1]

# this is used for 5 clients and 1 server training within a single machine which has 4 GPUs
mapping_config1_6:
  host1: [2, 2, 1, 1]

# this is used for 10 clients and 1 server training within a single machine which has 4 GPUs
mapping_config2_11:
  host1: [3, 3, 3, 2]

# this is used for 10 clients and 1 server training within a single machine which has 8 GPUs
mapping_config3_11:
  host1: [2, 2, 2, 1, 1, 1, 1, 1]

# this is used for 4 clients and 1 server training within a single machine which has 8 GPUs, but you hope to skip some GPU device IDs.
mapping_config4_5:
  host1: [1, 0, 0, 1, 1, 0, 1, 1]

# this is used for 4 clients and 1 server training using 5 machines, each machine has 2 GPUs inside, but you hope to use only the second GPU.
mapping_config5_6:
  host1: [0, 1]
  host2: [0, 1]
  host3: [0, 1]
  host4: [0, 1]
  host5: [0, 1]

# this is used for 4 clients and 1 server training using 2 machines, each machine has 2 GPUs inside.
mapping_config5_2:
  gpu-worker2: [1, 1]
  gpu-worker1: [2, 1]

# this is used for 10 clients and 1 server training using 4 machines, each machine has 2 GPUs inside.
mapping_config5_4:
  gpu-worker2: [1, 1]
  gpu-worker1: [2, 1]
  gpu-worker3: [3, 1]
  gpu-worker4: [1, 1]

# for grpc GPU mapping
mapping_FedML_gRPC:
  hostname_node_server: [1]
  hostname_node_1: [1, 0, 0, 0]
  hostname_node_2: [1, 0, 0, 0]

# for torch RPC GPU mapping
mapping_FedML_tRPC:
  lambda-server1: [0, 0, 0, 0, 2, 2, 1, 1]
  lambda-server2: [2, 1, 1, 1, 0, 0, 0, 0]

#mapping_FedML_tRPC:
#  lambda-server1: [0, 0, 0, 0, 3, 3, 3, 2]
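
As the comments in this file suggest, the per-GPU process counts under a mapping key should add up to `device_args.worker_num` for the run that uses it (for example, `mapping_config1_5` sums to 5, matching `worker_num: 5` in the configs above). A tiny illustrative check, with the mapping hard-coded rather than parsed from the YAML:
```
# Illustrative only: each MPI process (server + clients) needs one slot in the mapping.
mapping_config1_5 = {"host1": [2, 1, 1, 1]}
worker_num = 5  # device_args.worker_num in fedml_config.yaml

total_slots = sum(sum(gpu_counts) for gpu_counts in mapping_config1_5.values())
assert total_slots == worker_num, (total_slots, worker_num)
```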
@@ -0,0 +1 @@
liuxuezheng3
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

WORKER_NUM=$1
CONFIG_PATH=$2

hostname > mpi_host_file

mpirun -np $WORKER_NUM \
-hostfile mpi_host_file \
python torch_step_by_step_example.py --cf $CONFIG_PATH
@@ -0,0 +1 @@
import fedml
from fedml import FedMLRunner

if __name__ == "__main__":
    # init FedML framework
    args = fedml.init()

    # init device
    device = fedml.device.get_device(args)

    # load data
    dataset, output_dim = fedml.data.load(args)

    # load model
    model = fedml.model.create(args, output_dim)

    # start training
    fedml_runner = FedMLRunner(args, device, dataset, model)
    fedml_runner.run()
@@ -2,6 +2,7 @@
import numpy as np

from .base_topology_manager import BaseTopologyManager
from .topo_utils import *


class SymmetricTopologyManager(BaseTopologyManager):
@@ -18,6 +19,27 @@ def __init__(self, n, neighbor_num=2):
        self.neighbor_num = neighbor_num
        self.topology = []

    def generate_custom_topology(self, args):
        topo_name = args.topo_name
        if topo_name == 'ring':
            self.neighbor_num = 2
            self.generate_topology()
        elif topo_name == '2d_torus':
            self.topology = get_2d_torus_overlay(self.n)
        elif topo_name == 'star':
            self.topology = get_star_overlay(self.n)
        elif topo_name == 'complete':
            self.topology = get_complete_overlay(self.n)
        elif topo_name == 'isolated':
            self.topology = get_isolated_overlay(self.n)
        elif topo_name == 'balanced_tree':
            self.topology = get_balanced_tree_overlay(self.n, self.neighbor_num)
        elif topo_name == 'random':
            probability = args.topo_edge_probability  # probability for edge creation
            self.topology = get_random_overlay(self.n, probability)
        else:
            raise ValueError(f"unknown topo_name: {topo_name}")

    def generate_topology(self):
        # first generate a ring topology
        topology_ring = np.array(
@@ -84,8 +106,9 @@ def get_out_neighbor_idx_list(self, node_index):

if __name__ == "__main__":
    # generate a ring topology
    tpmgr = SymmetricTopologyManager(6, 2)
    tpmgr.generate_topology()
    tpmgr = SymmetricTopologyManager(9, 2)
    # tpmgr.generate_topology()
    # generate_custom_topology() expects an args-like object carrying topo_name
    # (and topo_edge_probability for the random overlay), not a plain string.
    class _TopoArgs:
        topo_name = "random"
        topo_edge_probability = 0.3

    tpmgr.generate_custom_topology(_TopoArgs())
    print("tpmgr.topology = " + str(tpmgr.topology))

    # get the OUT neighbor weights for node 1