From fcec4e5ccaf818205be622b8abc6c34796bdd9f9 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Tue, 31 Dec 2019 16:35:10 -0500 Subject: [PATCH 1/6] Support for multi-gpu or cpu-only training --- checkpoint.py | 4 ++++ eval.py | 23 +++++++++++++++++++--- parse_args.py | 54 ++++++++++++++++++++++++++------------------------- train.py | 25 ++++++++++++++++++++---- 4 files changed, 73 insertions(+), 33 deletions(-) diff --git a/checkpoint.py b/checkpoint.py index 455ef69..edaa16b 100644 --- a/checkpoint.py +++ b/checkpoint.py @@ -34,4 +34,8 @@ def load_checkpoint(name, key_name='state_dict'): Selected element from loaded checkpoint pickle file """ checkpoint = torch.load(name) + + if key_name not in checkpoint: + return checkpoint + return checkpoint[key_name] diff --git a/eval.py b/eval.py index 43148c7..1e9cd77 100644 --- a/eval.py +++ b/eval.py @@ -56,13 +56,23 @@ def eval(**args): writer = SummaryWriter(log_dir) # Check if GPU is available (CUDA) - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + num_gpus = torch.cuda.device_count() if args['num_gpus'] == -1 else args['num_gpus'] + device = torch.device("cuda:0" if num_gpus > 0 and torch.cuda.is_available() else "cpu") + print('Using {}'.format(device.type)) # Load Network model = create_model_object(**args).to(device) + model_obj = model + + if device.type == 'cuda' and num_gpus > 1: + device_ids = list(range(num_gpus)) #number of GPUs specified + model = nn.DataParallel(model, device_ids=device_ids) + model_obj = model.module #Model from DataParallel object has to be accessed through module + + print('GPUs Device IDs: {}'.format(device_ids)) # Load Data - loader = data_loader(**args, model_obj=model) + loader = data_loader(**args, model_obj=model_obj) if args['load_type'] == 'train_val': eval_loader = loader['valid'] @@ -80,7 +90,14 @@ def eval(**args): if isinstance(args['pretrained'], str): ckpt = load_checkpoint(args['pretrained']) - model.load_state_dict(ckpt) + 
model_obj.load_state_dict(ckpt) + + ckpt_keys = list(ckpt.keys()) + if ckpt_keys[0].startswith('module.'): #if checkpoint weights are from DataParallel object + for key in ckpt_keys: + ckpt[key[7:]] = ckpt.pop(key) + + model_obj.load_state_dict(ckpt) # Training Setup params = [p for p in model.parameters() if p.requires_grad] diff --git a/parse_args.py b/parse_args.py index 3e7c8fe..58a58ac 100644 --- a/parse_args.py +++ b/parse_args.py @@ -17,6 +17,7 @@ def __init__(self): parser.add_argument('--dataset', type=str, help='Name of dataset') parser.add_argument('--batch_size', type=int, help='Numbers of videos in a mini-batch') parser.add_argument('--pseudo_batch_loop', type=int, help='Number of loops for mini-batch') + parser.add_argument('--num_gpus', type=int, help='Number of GPUs to use, default: -1 (all available GPUs). 0 (use CPU), >1 (number of GPUs to use)') parser.add_argument('--num_workers', type=int, help='Number of subprocesses for dataloading') parser.add_argument('--load_type', type=str, help='Environment selection, to include only training/training and validation/testing dataset (train, train_val, test)') parser.add_argument('--model', type=str, help='Name of model to be loaded') @@ -56,32 +57,33 @@ def __init__(self): # Default dict, anything not present is required to exist as an argument or in yaml file self.defaults = dict( - rerun = 5, - batch_size = 1, - pseudo_batch_loop= 1, - num_workers = 1, - acc_metric = None, - opt = 'sgd', - lr = 0.001, - momentum = 0.9, - weight_decay = 0.0005, - milestones = [5], - gamma = 0.1, - epoch = 10, - save_dir = './results', - exp = 'exp', - preprocess = 'default', - pretrained = 0, - subtract_mean = '', - clip_offset = 0, - random_offset = 0, - clip_stride = 0, - crop_type = None, - num_clips = 1, - debug = 0, - seed = 0, - scale = [1,1], - resume = 0) + rerun = 5, + batch_size = 1, + pseudo_batch_loop = 1, + num_gpus = -1, + num_workers = 1, + acc_metric = None, + opt = 'sgd', + lr = 0.001, + momentum = 0.9, 
+ weight_decay = 0.0005, + milestones = [5], + gamma = 0.1, + epoch = 10, + save_dir = './results', + exp = 'exp', + preprocess = 'default', + pretrained = 0, + subtract_mean = '', + clip_offset = 0, + random_offset = 0, + clip_stride = 0, + crop_type = None, + num_clips = 1, + debug = 0, + seed = 0, + scale = [1,1], + resume = 0) diff --git a/train.py b/train.py index d889d84..d1ff75c 100644 --- a/train.py +++ b/train.py @@ -68,13 +68,23 @@ def train(**args): writer = SummaryWriter(log_dir) # Check if GPU is available (CUDA) - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - + num_gpus = torch.cuda.device_count() if args['num_gpus'] == -1 else args['num_gpus'] + device = torch.device("cuda:0" if num_gpus > 0 and torch.cuda.is_available() else "cpu") + print('Using {}'.format(device.type)) + # Load Network model = create_model_object(**args).to(device) + model_obj = model + + if device.type == 'cuda' and num_gpus > 1: + device_ids = list(range(num_gpus)) #number of GPUs specified + model = nn.DataParallel(model, device_ids=device_ids) + model_obj = model.module #Model from DataParallel object has to be accessed through module + print('GPUs Device IDs: {}'.format(device_ids)) + # Load Data - loader = data_loader(model_obj=model, **args) + loader = data_loader(model_obj=model_obj, **args) if args['load_type'] == 'train': train_loader = loader['train'] @@ -107,7 +117,14 @@ def train(**args): if isinstance(args['pretrained'], str): ckpt = load_checkpoint(args['pretrained']) - model.load_state_dict(ckpt) + model_obj.load_state_dict(ckpt) + + ckpt_keys = list(ckpt.keys()) + if ckpt_keys[0].startswith('module.'): #if checkpoint weights are from DataParallel object + for key in ckpt_keys: + ckpt[key[7:]] = ckpt.pop(key) + + model_obj.load_state_dict(ckpt) if args['resume']: start_epoch = load_checkpoint(args['pretrained'], key_name='epoch') + 1 From e74e53db6e56c42333f162285f5c161902c50352 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Tue, 31 
Dec 2019 17:03:19 -0500 Subject: [PATCH 2/6] Removed one line mistake --- eval.py | 1 - train.py | 1 - 2 files changed, 2 deletions(-) diff --git a/eval.py b/eval.py index 1e9cd77..4c6de17 100644 --- a/eval.py +++ b/eval.py @@ -90,7 +90,6 @@ def eval(**args): if isinstance(args['pretrained'], str): ckpt = load_checkpoint(args['pretrained']) - model_obj.load_state_dict(ckpt) ckpt_keys = list(ckpt.keys()) if ckpt_keys[0].startswith('module.'): #if checkpoint weights are from DataParallel object diff --git a/train.py b/train.py index d1ff75c..95ec7f7 100644 --- a/train.py +++ b/train.py @@ -117,7 +117,6 @@ def train(**args): if isinstance(args['pretrained'], str): ckpt = load_checkpoint(args['pretrained']) - model_obj.load_state_dict(ckpt) ckpt_keys = list(ckpt.keys()) if ckpt_keys[0].startswith('module.'): #if checkpoint weights are from DataParallel object From 4a58ca249d19bc904928417e9f7c4a331c211f46 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Tue, 31 Dec 2019 17:40:30 -0500 Subject: [PATCH 3/6] moved num_gpu setting to parse args --- eval.py | 2 +- parse_args.py | 8 ++++++-- train.py | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/eval.py b/eval.py index 4c6de17..6b6bc25 100644 --- a/eval.py +++ b/eval.py @@ -56,7 +56,7 @@ def eval(**args): writer = SummaryWriter(log_dir) # Check if GPU is available (CUDA) - num_gpus = torch.cuda.device_count() if args['num_gpus'] == -1 else args['num_gpus'] + num_gpus = args['num_gpus'] device = torch.device("cuda:0" if num_gpus > 0 and torch.cuda.is_available() else "cpu") print('Using {}'.format(device.type)) diff --git a/parse_args.py b/parse_args.py index 58a58ac..751a31a 100644 --- a/parse_args.py +++ b/parse_args.py @@ -1,5 +1,6 @@ import argparse import yaml +import torch class Parse(): @@ -15,7 +16,7 @@ def __init__(self): #Command-line arguments will override any config file arguments parser.add_argument('--rerun', type=int, help='Number of trials to repeat an experiment') 
parser.add_argument('--dataset', type=str, help='Name of dataset') - parser.add_argument('--batch_size', type=int, help='Numbers of videos in a mini-batch') + parser.add_argument('--batch_size', type=int, help='Numbers of videos in a mini-batch (per GPU)') parser.add_argument('--pseudo_batch_loop', type=int, help='Number of loops for mini-batch') parser.add_argument('--num_gpus', type=int, help='Number of GPUs to use, default: -1 (all available GPUs). 0 (use CPU), >1 (number of GPUs to use)') parser.add_argument('--num_workers', type=int, help='Number of subprocesses for dataloading') @@ -122,6 +123,9 @@ def get_args(self): if self.cfg_args['clip_stride'] < 1: self.cfg_args['clip_stride'] = 1 - + #Set number of GPUs. Assertion error later if num requested > num available + #Important to know if task not running on 4 gpus if 4 were requested. + num_gpus = torch.cuda.device_count() if self.cfg_args['num_gpus'] == -1 else self.cfg_args['num_gpus'] + self.cfg_args['num_gpus'] = num_gpus return self.cfg_args diff --git a/train.py b/train.py index 95ec7f7..8c5dcc5 100644 --- a/train.py +++ b/train.py @@ -68,7 +68,7 @@ def train(**args): writer = SummaryWriter(log_dir) # Check if GPU is available (CUDA) - num_gpus = torch.cuda.device_count() if args['num_gpus'] == -1 else args['num_gpus'] + num_gpus = args['num_gpus'] device = torch.device("cuda:0" if num_gpus > 0 and torch.cuda.is_available() else "cpu") print('Using {}'.format(device.type)) From 9c1868b145307cb27f968a0fcca460562bce99e4 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Tue, 31 Dec 2019 17:43:46 -0500 Subject: [PATCH 4/6] Multiply batch size by number of GPUs --- datasets/loading_function.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datasets/loading_function.py b/datasets/loading_function.py index e24e7ae..4290463 100644 --- a/datasets/loading_function.py +++ b/datasets/loading_function.py @@ -52,20 +52,20 @@ def data_loader(**kwargs): val_data = 
create_dataset_object(**kwargs) kwargs['load_type'] = load_type - trainloader = torch.utils.data.DataLoader(dataset=train_data, batch_size=kwargs['batch_size'], shuffle=True, num_workers=kwargs['num_workers']) - valloader = torch.utils.data.DataLoader(dataset=val_data, batch_size=kwargs['batch_size'], shuffle=False, num_workers=kwargs['num_workers']) + trainloader = torch.utils.data.DataLoader(dataset=train_data, batch_size=kwargs['batch_size']*kwargs['num_gpus'], shuffle=True, num_workers=kwargs['num_workers']) + valloader = torch.utils.data.DataLoader(dataset=val_data, batch_size=kwargs['batch_size']*kwargs['num_gpus'], shuffle=False, num_workers=kwargs['num_workers']) ret_dict = dict(train=trainloader, valid=valloader) elif load_type == 'train': data = create_dataset_object(**kwargs) - loader = torch.utils.data.DataLoader(dataset=data, batch_size=kwargs['batch_size'], shuffle=True, num_workers=kwargs['num_workers']) + loader = torch.utils.data.DataLoader(dataset=data, batch_size=kwargs['batch_size']*kwargs['num_gpus'], shuffle=True, num_workers=kwargs['num_workers']) ret_dict = dict(train=loader) else: data = create_dataset_object(**kwargs) - loader = torch.utils.data.DataLoader(dataset=data, batch_size=kwargs['batch_size'], shuffle=False, num_workers=kwargs['num_workers']) + loader = torch.utils.data.DataLoader(dataset=data, batch_size=kwargs['batch_size']*kwargs['num_gpus'], shuffle=False, num_workers=kwargs['num_workers']) ret_dict = dict(test=loader) From 097f30a10c6521bad4d65db5c3b978897794ea41 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Wed, 8 Jan 2020 15:28:47 -0500 Subject: [PATCH 5/6] small fix. 
in case num_gpus=0 or -1 --- datasets/loading_function.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/datasets/loading_function.py b/datasets/loading_function.py index 4290463..3d0a3d1 100644 --- a/datasets/loading_function.py +++ b/datasets/loading_function.py @@ -45,6 +45,7 @@ def data_loader(**kwargs): """ load_type = kwargs['load_type'] + num_nodes = max(kwargs['num_gpus'], 1) if load_type == 'train_val': kwargs['load_type'] = 'train' train_data = create_dataset_object(**kwargs) @@ -52,20 +53,20 @@ def data_loader(**kwargs): val_data = create_dataset_object(**kwargs) kwargs['load_type'] = load_type - trainloader = torch.utils.data.DataLoader(dataset=train_data, batch_size=kwargs['batch_size']*kwargs['num_gpus'], shuffle=True, num_workers=kwargs['num_workers']) - valloader = torch.utils.data.DataLoader(dataset=val_data, batch_size=kwargs['batch_size']*kwargs['num_gpus'], shuffle=False, num_workers=kwargs['num_workers']) + trainloader = torch.utils.data.DataLoader(dataset=train_data, batch_size=kwargs['batch_size']*num_nodes, shuffle=True, num_workers=kwargs['num_workers']) + valloader = torch.utils.data.DataLoader(dataset=val_data, batch_size=kwargs['batch_size']*num_nodes, shuffle=False, num_workers=kwargs['num_workers']) ret_dict = dict(train=trainloader, valid=valloader) elif load_type == 'train': data = create_dataset_object(**kwargs) - loader = torch.utils.data.DataLoader(dataset=data, batch_size=kwargs['batch_size']*kwargs['num_gpus'], shuffle=True, num_workers=kwargs['num_workers']) + loader = torch.utils.data.DataLoader(dataset=data, batch_size=kwargs['batch_size']*num_nodes, shuffle=True, num_workers=kwargs['num_workers']) ret_dict = dict(train=loader) else: data = create_dataset_object(**kwargs) - loader = torch.utils.data.DataLoader(dataset=data, batch_size=kwargs['batch_size']*kwargs['num_gpus'], shuffle=False, num_workers=kwargs['num_workers']) + loader = torch.utils.data.DataLoader(dataset=data, 
batch_size=kwargs['batch_size']*num_nodes, shuffle=False, num_workers=kwargs['num_workers']) ret_dict = dict(test=loader) From bde60005aec87f591e83e04c8bc5e58540afcba3 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Wed, 8 Jan 2020 15:34:40 -0500 Subject: [PATCH 6/6] default to select between minimum of available and requested GPUs --- parse_args.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parse_args.py b/parse_args.py index 751a31a..3330ac5 100644 --- a/parse_args.py +++ b/parse_args.py @@ -123,9 +123,9 @@ def get_args(self): if self.cfg_args['clip_stride'] < 1: self.cfg_args['clip_stride'] = 1 - #Set number of GPUs. Assertion error later if num requested > num available - #Important to know if task not running on 4 gpus if 4 were requested. - num_gpus = torch.cuda.device_count() if self.cfg_args['num_gpus'] == -1 else self.cfg_args['num_gpus'] + #Use all available GPUs if num_gpus = -1 + #Else select the minimum between available GPUS and requested GPUs + num_gpus = torch.cuda.device_count() if self.cfg_args['num_gpus'] == -1 else min(torch.cuda.device_count(), self.cfg_args['num_gpus']) self.cfg_args['num_gpus'] = num_gpus return self.cfg_args