import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"  # set the ID(s) of the GPU device(s) you want to use
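# NOTE: CUDA_VISIBLE_DEVICES must be set before `torch` initializes CUDA,
# which is why it is set here, above the `torch` import below.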
import sys
import torch
import argparse
import traceback
import time
from importlib import import_module
from torch.utils.tensorboard import SummaryWriter
from src import Range, set_logger, TensorBoardRunner, check_args, set_seed, load_dataset, load_model
import gc
import copy
def main(args, writer):
"""Main program to run federated learning.
Args:
args: user input arguments parsed by argparser
writer: `torch.utils.tensorboard.SummaryWriter` instance for TensorBoard tracking
"""
# set seed for reproducibility
set_seed(args.seed)
server_dataset, client_datasets = load_dataset(args)
# adjust device selection
if 'cuda' in args.device:
assert torch.cuda.is_available(), 'Please check if your CUDA GPU is available!'
args.device = 'cuda' if not args.device_ids else f'cuda:{args.device_ids[0]}'
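# NOTE: device ids are relative to CUDA_VISIBLE_DEVICES set at the top of this file
# (e.g., with CUDA_VISIBLE_DEVICES="2", physical GPU 2 is visible here as `cuda:0`)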
elif 'mps' in args.device:
assert torch.backends.mps.is_available(), 'MPS device (Mac GPU) is not available. Ensure you have a supported macOS and device.'
args.device = 'mps'
else:
args.device = 'cpu'
# setting up device in PyTorch
device = torch.device(args.device)
print(f"Using device: {device}")
# get model
model, args = load_model(args)
# Multi GPU
# if n_gpus > 1:
# model = torch.nn.DataParallel(model)
# check all args before FL
args = check_args(args)
# create central server
server_class = import_module(f'src.server.{args.algorithm}server').__dict__[f'{args.algorithm.title()}server']
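# e.g., `--algorithm fedavg` loads the module `src.server.fedavgserver` and looks up
# the class `Fedavgserver` (`str.title()` capitalizes only the first letter), so server
# modules and classes must follow this naming convention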
server = server_class(args=args, writer=writer, server_dataset=server_dataset, client_datasets=client_datasets, model=model)
# federated learning
for curr_round in range(1, args.R + 1):
## update round indicator
server.round = curr_round
## sample clients randomly and update the global model; returns the ids of the sampled clients
selected_ids = server.update()
## evaluate on clients not sampled (for measuring generalization performance)
if curr_round % args.eval_every == 0 or (curr_round == args.R):
server.evaluate(excluded_ids=selected_ids)
# torch.cuda.empty_cache() #check
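# NOTE: the `else` below belongs to the `for` loop above (for-else):
# it runs once all rounds finish without `break`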
else:
## wrap-up
server.finalize()
if __name__ == "__main__":
# parse user inputs as arguments
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
#####################
# Default arguments #
#####################
parser.add_argument('--exp_name', help='experiment name', type=str, required=False, default='FedLex')
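# the default seed is the current Unix timestamp, so runs differ unless `--seed` is passed explicitly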
seed = int(time.time())
parser.add_argument('--seed', help='global random seed', type=int, default=seed)
parser.add_argument('--device', help='device to use; one of `cpu`, `cuda`, `cuda:GPU_NUMBER`, or `mps`', type=str, default='mps')
parser.add_argument('--device_ids', nargs='+', type=int, help='GPU device ids for multi-GPU training (use all available GPUs if no number is passed)', default=[])
parser.add_argument('--data_path', help='path to read data from', type=str, default='./data')
parser.add_argument('--log_path', help='path to store logs', type=str, default='./log')
parser.add_argument('--result_path', help='path to save results', type=str, default='./result_paper')
parser.add_argument('--use_tb', help='use TensorBoard to track logs (if passed)', action='store_true')
parser.add_argument('--tb_port', help='TensorBoard port number (valid only if `use_tb`)', type=int, default=6006)
parser.add_argument('--tb_host', help='TensorBoard host address (valid only if `use_tb`)', type=str, default='0.0.0.0')
#####################
# Dataset arguments #
#####################
## dataset
parser.add_argument('--dataset', help='''name of dataset to use for an experiment
* NOTE: case sensitive *
- image classification datasets in `torchvision.datasets`,
- text classification datasets in `torchtext.datasets`,
- LEAF benchmarks [ FEMNIST | Sent140 | Shakespeare | CelebA | Reddit ],
- among [ TinyImageNet | CINIC10 | BeerReviewsA | BeerReviewsL | Heart | Adult | Cover | GLEAM ]
''', type=str, required=False, default='CIFAR10')
parser.add_argument('--test_fraction', help='fraction of local hold-out dataset for evaluation', type=float, choices=[Range(0., 9e-1)], default=0.2)
parser.add_argument('--rawsmpl', help='fraction of raw data to be used (only used when one of `LEAF` datasets is used)', type=float, choices=[Range(0., 1.)], default=1.0)
## data augmentation arguments
parser.add_argument('--resize', help='resize input images (using `torchvision.transforms.Resize`)', type=int, default=24) #24
parser.add_argument('--imnorm', help='normalize channels using ImageNet pre-trained mean & standard deviation (using `torchvision.transforms.Normalize`)', action='store_true')
parser.add_argument('--randrot', help='randomly rotate input (using `torchvision.transforms.RandomRotation`)', type=int, default=None)
parser.add_argument('--randhf', help='randomly flip input horizontally (using `torchvision.transforms.RandomHorizontalFlip`)', type=float, choices=[Range(0., 1.)], default=0.5)
parser.add_argument('--randvf', help='randomly flip input vertically (using `torchvision.transforms.RandomVerticalFlip`)', type=float, choices=[Range(0., 1.)], default=None)
parser.add_argument('--randjit', help='randomly jitter input color (using `torchvision.transforms.ColorJitter`)', type=float, default=0.5)
## statistical heterogeneity simulation arguments
parser.add_argument('--split_type', help='''type of data split scenario
- `iid`: statistically homogeneous setting,
- `unbalanced`: unbalance in sample counts across clients,
- `patho`: pathological non-IID split scenario proposed in (McMahan et al., 2016),
- `diri`: Dirichlet distribution-based split scenario proposed in (Hsu et al., 2019),
- `pre`: pre-defined data split scenario
''', type=str, choices=['iid', 'unbalanced', 'patho', 'diri', 'pre'], required=False, default='patho')
parser.add_argument('--mincls', help='the minimum number of distinct classes per client (only used when `split_type` is `patho`)', type=int, default=2)
parser.add_argument('--cncntrtn', help='a concentration parameter for Dirichlet distribution (only used when `split_type` is `diri`)', type=float, default=0.2)
# Additional arguments for Continual Learning
parser.add_argument('--num_tasks', help='number of tasks for task-based split', type=int, default=5)
parser.add_argument('--lambda_', help='modulation factor for Guidance Matrix', type=float, default=0.01)
parser.add_argument('--alpha', help='exponential moving average factor for Guidance Matrix', type=float, default=0.9)
parser.add_argument('--buffer_size', help='size of replay buffer for experience replay', type=int, default=100)
###################
# Model arguments #
###################
## model
parser.add_argument('--model_name', help='a model to be used (note that it is case sensitive)', type=str,
choices=[
'TwoNN', 'TwoCNN', 'FEMNISTCNN',
'LeNet', 'MobileNet', 'SqueezeNet',
'VGG9', 'VGG9BN', 'VGG11', 'VGG11BN', 'VGG13', 'VGG13BN',
'ResNet10', 'ResNet18', 'ResNet34',
'MobileNeXt', 'SqueezeNeXt', 'MobileViT',
'NextCharLSTM', 'NextWordLSTM',
'DistilBert', 'SqueezeBert', 'MobileBert',
'LogReg', 'GRUClassifier','twocnn_GTL', 'BIGCNN', 'WideResNet'
],
required=False, default='TwoCNN'
)
parser.add_argument('--hidden_size', help='hidden channel size for vision models, or hidden dimension of language models', type=int, default=32) #32
parser.add_argument('--dropout', help='dropout rate', type=float, choices=[Range(0., 1.)], default=0)
parser.add_argument('--use_model_tokenizer', help='use a model-specific tokenizer (if passed)', action='store_true')
parser.add_argument('--use_pt_model', help='use a pre-trained model weights for fine-tuning (if passed)', action='store_true')
parser.add_argument('--seq_len', help='maximum sequence length (used for `torchtext.datasets`)', type=int, default=512)
parser.add_argument('--num_layers', help='number of layers in recurrent cells', type=int, default=4) #4
parser.add_argument('--num_embeddings', help='size of embedding dictionary', type=int, default=1000)
parser.add_argument('--embedding_size', help='embedding dimension of language models', type=int, default=512)
parser.add_argument('--init_type', help='weight initialization method', type=str, default='xavier', choices=['normal', 'xavier', 'kaiming', 'orthogonal'])
parser.add_argument('--init_gain', type=float, default=1.0, help='magnitude of variance used for weight initialization')
######################
# Learning arguments #
######################
## federated learning settings
parser.add_argument('--algorithm', help='type of federated aggregation algorithm to be used', type=str,
choices=['fedavg', 'fedlex','gtfedprox'],
required=False, default='fedlex'
)
parser.add_argument('--eval_type', help='''type of evaluation for a model trained with the FL algorithm
- `local`: evaluate personalized models on each client\'s local hold-out dataset
- `global`: evaluate the global model on a separate hold-out dataset located at the server
- `both`: combination of the `local` and `global` settings
''', type=str,
choices=['local', 'global', 'both'],
required=False, default='both'
)
parser.add_argument('--eval_fraction', help='fraction of randomly selected non-participating clients used for evaluation (when `eval_type` is `local` or `both`)', type=float, choices=[Range(1e-8, 1.)], default=1.)
parser.add_argument('--eval_every', help='frequency of evaluation (i.e., evaluate performance of a model every `eval_every` rounds)', type=int, default=10)
parser.add_argument('--eval_metrics', help='metric(s) used for evaluation', type=str,
choices=[
'acc1', 'acc5', 'auroc', 'auprc', 'youdenj', 'f1', 'precision', 'recall',
'seqacc', 'mse', 'mae', 'mape', 'rmse', 'r2', 'd2'
], nargs='+', required=False, default=['acc1','acc5','f1','precision','recall']
)
parser.add_argument('--K', help='number of total clients participating in federated training', type=int, default=20)
parser.add_argument('--R', help='number of total rounds', type=int, default=200)
parser.add_argument('--C', help='sampling fraction of clients per round (full participation when zero is passed)', type=float, choices=[Range(0., 1.)], default=0.05)
parser.add_argument('--E', help='number of local epochs', type=int, default=5)
parser.add_argument('--B', help='batch size for local update in each client (full-batch training when zero is passed)', type=int, default=350) #64
parser.add_argument('--beta', help='global momentum factor for an update of a global model when aggregated at the server', type=float, choices=[Range(0., 1.)], default=0.9) #0.9
# optimization arguments
parser.add_argument('--no_shuffle', help='do not shuffle data (if passed)', action='store_true')
parser.add_argument('--optimizer', help='type of optimization method (should be a sub-module of `torch.optim`)', type=str, default='Adam')
parser.add_argument('--max_grad_norm', help='maximum norm used for gradient clipping', type=float, choices=[Range(0., float('inf'))], default=0.)
parser.add_argument('--weight_decay', help='weight decay (L2 penalty)', type=float, choices=[Range(0., 1.)], default=0)
parser.add_argument('--momentum', help='momentum factor', type=float, choices=[Range(0., 1.)], default=0.)
parser.add_argument('--lr', help='learning rate for local updates in each client', type=float, choices=[Range(0., 100.)], default=0.0003)
parser.add_argument('--lr_server', help='learning rate for the server optimizer', type=float, choices=[Range(0., 100.)], default=0.0003)
parser.add_argument('--lr_decay', help='rate of learning rate decay applied per round', type=float, choices=[Range(0., 1.)], default=0.99)
parser.add_argument('--lr_decay_step', help='number of rounds between learning rate decay steps', type=int, default=1)
parser.add_argument('--criterion', help='type of criterion for objective function (should be a submodule of `torch.nn`)', type=str, default='CrossEntropyLoss')
parser.add_argument('--mu', help='constant for the proximity regularization term (used by `fedprox`-style algorithms)', type=float, choices=[Range(0., 100)], default=0.01)
##### Arguments for exploration phase #####
parser.add_argument('--Patience_mask', help='early-stopping patience for the scouting process', type=int, default=40)
parser.add_argument('--epoochs_mask', help='number of epochs for the exploration process', type=int, default=1)
parser.add_argument('--perc_clients_for_mask', help='fraction of clients for which to create a mask', type=float, choices=[Range(0., 1.)], default=0.1)
parser.add_argument('--guidence_normalization', help='normalization type used to create the global Guidance mask', type=str, default='MinMax')
parser.add_argument('--mask_pruining', help='enable mask pruning (parsed as the string `True` or `False`)', type=str, default='True')
parser.add_argument('--treeshold_pruining', help='pruning threshold, as a percentile', type=float, default=0.5)
parser.add_argument('--initial_value_threshold', help='initial value of the pruning threshold', type=float, default=0.5)
##### Ablation studies #####
# parser.add_argument('--sf', help='scaling factor', type=float, choices=[Range(0., 1.)], default=1)
##### Arguments for federated optimization algorithms (Reddi et al., 2020) (https://arxiv.org/abs/2003.00295) #####
parser.add_argument('--beta1', help='server momentum factor', type=float, choices=[Range(0., 1.)], default=0.9)
parser.add_argument('--tau', help='server adaptivity constant (controls the degree of adaptivity; see Reddi et al., 2020)', type=float, choices=[Range(0., 1.)], default=0.001)
parser.add_argument('--plot', help='plot results (if passed)', action='store_true')
parser.add_argument('--CEXP2', help='N', type=int, default=1)
parser.add_argument('--random_pruning', help='use random pruning (if passed)', action='store_true')
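# Example invocation (illustrative values only; every flag used here is defined above):
#   python main.py --exp_name FedLex --dataset CIFAR10 --model_name TwoCNN \
#       --algorithm fedlex --K 20 --R 200 --C 0.05 --E 5 --B 64 --device cuda --use_tb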
# parse arguments
args = parser.parse_args()
# make path for saving losses & metrics & models
curr_time = time.strftime("%y%m%d_%H%M%S", time.localtime())
args.exp_name = f'{args.exp_name}_{args.seed}_{args.dataset.lower()}_{args.model_name.lower()}_{curr_time}'
args.result_path = os.path.join(args.result_path, f'{args.exp_name}')
if not os.path.exists(args.result_path):
os.makedirs(args.result_path)
# make path for saving logs
if not os.path.exists(args.log_path):
os.makedirs(args.log_path)
# initialize logger
set_logger(f'{args.log_path}/{args.exp_name}.log', args)
# check TensorBoard execution
if args.use_tb: # run TensorBoard for tracking losses and metrics
tb = TensorBoardRunner(os.path.join(args.log_path, f'{args.exp_name}'), args.tb_host, args.tb_port)
else:
tb = None
# define writer
writer = SummaryWriter(
log_dir=os.path.join(args.log_path, f'{args.exp_name}'),
filename_suffix=f'_{curr_time}'
)
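# NOTE: the writer logs to the same directory that the TensorBoard runner (if enabled)
# watches, so tracked scalars appear in the dashboard as they are written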
# run main program
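# NOTE: anomaly detection helps debug NaN/inf gradients but noticeably slows training;
# consider disabling it for long runs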
torch.autograd.set_detect_anomaly(True)
try:
main(args, writer)
if args.use_tb:
tb.finalize()
sys.exit(0)
except Exception:
traceback.print_exc()
if args.use_tb:
tb.interrupt()
sys.exit(1)