diff --git a/.circleci/config.yml b/.circleci/config.yml
index b011910643..9aa8e5e91f 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -106,7 +106,7 @@ jobs:
             pip install .
             classy-project my-project
             pushd my-project
-            ./classy_train.py --device cpu --config configs/template_config.json
+            ./classy_train.py --config configs/template_config.json
             popd
             rm -rf my-project
diff --git a/classy_train.py b/classy_train.py
index 852627be2c..9a148870c7 100755
--- a/classy_train.py
+++ b/classy_train.py
@@ -93,18 +93,13 @@ def main(args, config):
     # Configure hooks to do tensorboard logging, checkpoints and so on
     task.set_hooks(configure_hooks(args, config))
 
-    use_gpu = None
-    if args.device is not None:
-        use_gpu = args.device == "gpu"
-        assert torch.cuda.is_available() or not use_gpu, "CUDA is unavailable"
-
     # LocalTrainer is used for a single node. DistributedTrainer will setup
     # training to use PyTorch's DistributedDataParallel.
     trainer_class = {"none": LocalTrainer, "ddp": DistributedTrainer}[
         args.distributed_backend
     ]
 
-    trainer = trainer_class(use_gpu=use_gpu, num_dataloader_workers=args.num_workers)
+    trainer = trainer_class(num_dataloader_workers=args.num_workers)
 
     logging.info(
         f"Starting training on rank {get_rank()} worker. "
diff --git a/classy_vision/generic/opts.py b/classy_vision/generic/opts.py
index 5a0cd3083d..0520766f45 100644
--- a/classy_vision/generic/opts.py
+++ b/classy_vision/generic/opts.py
@@ -18,12 +18,6 @@ def add_generic_args(parser):
     parser.add_argument(
         "--config_file", type=str, help="path to config file for model", required=True
     )
-    parser.add_argument(
-        "--device",
-        default=None,
-        type=str,
-        help="device to use: either 'cpu' or 'gpu'. If unspecified, will use GPU when available and CPU otherwise.",
-    )
     parser.add_argument(
         "--num_workers",
         default=4,
@@ -145,13 +139,6 @@ def check_generic_args(args):
     # check types and values:
     assert is_pos_int(args.num_workers), "incorrect number of workers"
     assert is_pos_int(args.visdom_port), "incorrect visdom port"
-    assert (
-        args.device is None or args.device == "cpu" or args.device == "gpu"
-    ), "unknown device"
-
-    # check that CUDA is available:
-    if args.device == "gpu":
-        assert torch.cuda.is_available(), "CUDA required to train on GPUs"
 
     # create checkpoint folder if it does not exist:
     if args.checkpoint_folder != "" and not os.path.exists(args.checkpoint_folder):
diff --git a/classy_vision/tasks/classification_task.py b/classy_vision/tasks/classification_task.py
index 8c8c089aa7..bd546f0257 100644
--- a/classy_vision/tasks/classification_task.py
+++ b/classy_vision/tasks/classification_task.py
@@ -142,6 +142,16 @@ def __init__(self):
         self.perf_log = []
         self.last_batch = None
         self.batch_norm_sync_mode = BatchNormSyncMode.DISABLED
+        self.use_gpu = torch.cuda.is_available()
+
+    def set_use_gpu(self, use_gpu: bool):
+        self.use_gpu = use_gpu
+
+        assert (
+            not self.use_gpu or torch.cuda.is_available()
+        ), "CUDA required to train on GPUs"
+
+        return self
 
     def set_checkpoint(self, checkpoint):
         """Sets checkpoint on task.
@@ -359,6 +369,10 @@ def from_config(cls, config: Dict[str, Any]) -> "ClassificationTask":
             .set_hooks(hooks)
         )
 
+        use_gpu = config.get("use_gpu")
+        if use_gpu is not None:
+            task.set_use_gpu(use_gpu)
+
         for phase_type in phase_types:
             task.set_dataset(datasets[phase_type], phase_type)
 
@@ -508,24 +522,19 @@ def build_dataloaders(
             for phase_type in self.datasets.keys()
         }
 
-    def prepare(
-        self,
-        num_dataloader_workers=0,
-        pin_memory=False,
-        use_gpu=False,
-        dataloader_mp_context=None,
-    ):
+    def prepare(self, num_dataloader_workers=0, dataloader_mp_context=None):
         """Prepares task for training, populates all derived attributes
 
         Args:
             num_dataloader_workers: Number of dataloading processes. If 0,
                 dataloading is done on main process
-            pin_memory: if true pin memory on GPU
-            use_gpu: if true, load model, optimizer, loss, etc on GPU
             dataloader_mp_context: Determines how processes are spawned.
                 Value must be one of None, "spawn", "fork", "forkserver".
                 If None, then context is inherited from parent process
         """
+
+        pin_memory = self.use_gpu and torch.cuda.device_count() > 1
+
         self.phases = self._build_phases()
         self.dataloaders = self.build_dataloaders(
             num_workers=num_dataloader_workers,
@@ -539,7 +548,7 @@ def prepare(
             self.base_model = apex.parallel.convert_syncbn_model(self.base_model)
 
         # move the model and loss to the right device
-        if use_gpu:
+        if self.use_gpu:
             self.base_model, self.loss = copy_model_to_gpu(self.base_model, self.loss)
         else:
             self.loss.cpu()
@@ -686,7 +695,7 @@ def set_classy_state(self, state):
         # Set up pytorch module in train vs eval mode, update optimizer.
         self._set_model_train_mode()
 
-    def eval_step(self, use_gpu):
+    def eval_step(self):
         self.last_batch = None
 
         # Process next sample
@@ -699,7 +708,7 @@ def eval_step(self, use_gpu):
 
         # Copy sample to GPU
         target = sample["target"]
-        if use_gpu:
+        if self.use_gpu:
             for key, value in sample.items():
                 sample[key] = recursive_copy_to_gpu(value, non_blocking=True)
 
@@ -726,12 +735,8 @@ def check_inf_nan(self, loss):
         if loss == float("inf") or loss == float("-inf") or loss != loss:
             raise FloatingPointError(f"Loss is infinity or NaN: {loss}")
 
-    def train_step(self, use_gpu):
-        """Train step to be executed in train loop
-
-        Args:
-            use_gpu: if true, execute training on GPU
-        """
+    def train_step(self):
+        """Train step to be executed in train loop."""
 
         self.last_batch = None
 
@@ -745,7 +750,7 @@ def train_step(self, use_gpu):
 
         # Copy sample to GPU
         target = sample["target"]
-        if use_gpu:
+        if self.use_gpu:
             for key, value in sample.items():
                 sample[key] = recursive_copy_to_gpu(value, non_blocking=True)
 
diff --git a/classy_vision/tasks/classy_task.py b/classy_vision/tasks/classy_task.py
index d0d2fe462a..23517198e3 100644
--- a/classy_vision/tasks/classy_task.py
+++ b/classy_vision/tasks/classy_task.py
@@ -86,11 +86,7 @@ def set_classy_state(self, state):
 
     @abstractmethod
     def prepare(
-        self,
-        num_dataloader_workers=0,
-        pin_memory=False,
-        use_gpu=False,
-        dataloader_mp_context=None,
+        self, num_dataloader_workers=0, dataloader_mp_context=None
     ) -> None:
         """
         Prepares the task for training.
@@ -102,19 +98,15 @@ def prepare(
             num_dataloader_workers: Number of workers to create for the dataloaders
             pin_memory: Whether the dataloaders should copy the Tensors
                 into CUDA pinned memory (default False)
-            use_gpu: True if training on GPUs, False otherwise
         """
         pass
 
     @abstractmethod
-    def train_step(self, use_gpu) -> None:
+    def train_step(self) -> None:
         """
         Run a train step. This corresponds to training over one batch of data
         from the dataloaders.
-
-        Args:
-            use_gpu: True if training on GPUs, False otherwise
         """
         pass
 
@@ -155,24 +147,21 @@ def on_end(self):
         pass
 
     @abstractmethod
-    def eval_step(self, use_gpu) -> None:
+    def eval_step(self) -> None:
         """
         Run an evaluation step. This corresponds to evaluating the model over
         one batch of data.
-
-        Args:
-            use_gpu: True if training on GPUs, False otherwise
         """
         pass
 
-    def step(self, use_gpu) -> None:
+    def step(self) -> None:
         from classy_vision.hooks import ClassyHookFunctions
 
         if self.train:
-            self.train_step(use_gpu)
+            self.train_step()
         else:
-            self.eval_step(use_gpu)
+            self.eval_step()
 
         for hook in self.hooks:
             hook.on_step(self)
diff --git a/classy_vision/tasks/fine_tuning_task.py b/classy_vision/tasks/fine_tuning_task.py
index 93a795fcba..5da2b382ce 100644
--- a/classy_vision/tasks/fine_tuning_task.py
+++ b/classy_vision/tasks/fine_tuning_task.py
@@ -67,18 +67,12 @@ def _set_model_train_mode(self):
         self.base_model.train(phase["train"])
 
     def prepare(
-        self,
-        num_dataloader_workers: int = 0,
-        pin_memory: bool = False,
-        use_gpu: bool = False,
-        dataloader_mp_context=None,
+        self, num_dataloader_workers: int = 0, dataloader_mp_context=None
     ) -> None:
         assert (
             self.pretrained_checkpoint is not None
         ), "Need a pretrained checkpoint for fine tuning"
-        super().prepare(
-            num_dataloader_workers, pin_memory, use_gpu, dataloader_mp_context
-        )
+        super().prepare(num_dataloader_workers, dataloader_mp_context)
         if self.checkpoint is None:
             # no checkpoint exists, load the model's state from the pretrained
             # checkpoint
diff --git a/classy_vision/trainer/classy_trainer.py b/classy_vision/trainer/classy_trainer.py
index 043ba1b361..bca12b1464 100644
--- a/classy_vision/trainer/classy_trainer.py
+++ b/classy_vision/trainer/classy_trainer.py
@@ -27,25 +27,18 @@ class ClassyTrainer:
 
     def __init__(
         self,
-        use_gpu: Optional[bool] = None,
         num_dataloader_workers: int = 0,
         dataloader_mp_context: Optional[str] = None,
     ):
         """Constructor for ClassyTrainer.
 
         Args:
-            use_gpu: If true, then use GPUs for training.
-                If None, then check if we have GPUs available, if we do
-                then use GPU for training.
             num_dataloader_workers: Number of CPU processes doing dataloading
                 per GPU. If 0, then dataloading is done on main thread.
             dataloader_mp_context: Determines how to launch
                 new processes for dataloading. Must be one of "fork", "forkserver",
                 "spawn". If None, process launching is inherited from parent.
         """
-        if use_gpu is None:
-            use_gpu = torch.cuda.is_available()
-        self.use_gpu = use_gpu
         self.num_dataloader_workers = num_dataloader_workers
         self.dataloader_mp_context = dataloader_mp_context
@@ -57,11 +50,8 @@ def train(self, task: ClassyTask):
             everything that is needed for training
         """
 
-        pin_memory = self.use_gpu and torch.cuda.device_count() > 1
         task.prepare(
             num_dataloader_workers=self.num_dataloader_workers,
-            pin_memory=pin_memory,
-            use_gpu=self.use_gpu,
             dataloader_mp_context=self.dataloader_mp_context,
         )
         assert isinstance(task, ClassyTask)
@@ -75,7 +65,7 @@ def train(self, task: ClassyTask):
         task.on_phase_start()
         while True:
             try:
-                task.step(self.use_gpu)
+                task.step()
             except StopIteration:
                 break
         task.on_phase_end()
diff --git a/classy_vision/trainer/distributed_trainer.py b/classy_vision/trainer/distributed_trainer.py
index e4d02098e3..339fa3171f 100644
--- a/classy_vision/trainer/distributed_trainer.py
+++ b/classy_vision/trainer/distributed_trainer.py
@@ -56,39 +56,19 @@ class DistributedTrainer(ClassyTrainer):
     """Distributed trainer for using multiple training processes
     """
 
-    def __init__(
-        self,
-        use_gpu: Optional[bool] = None,
-        num_dataloader_workers: int = 0,
-        dataloader_mp_context: Optional[str] = None,
-    ):
-        """Constructor for DistributedTrainer.
-
-        Args:
-            use_gpu: If true, then use GPU 0 for training.
-                If None, then check if we have GPUs available, if we do
-                then use GPU for training.
-            num_dataloader_workers: Number of CPU processes doing dataloading
-                per GPU. If 0, then dataloading is done on main thread.
-            dataloader_mp_context: Determines how to launch
-                new processes for dataloading. Must be one of "fork", "forkserver",
-                "spawn". If None, process launching is inherited from parent.
-        """
-        super().__init__(
-            use_gpu=use_gpu,
-            num_dataloader_workers=num_dataloader_workers,
-            dataloader_mp_context=dataloader_mp_context,
-        )
+    def train(self, task):
         _init_env_vars()
-        _init_distributed(self.use_gpu)
+        _init_distributed(task.use_gpu)
         logging.info(
             f"Done setting up distributed process_group with rank {get_rank()}"
             + f", world_size {get_world_size()}"
         )
         local_rank = int(os.environ["LOCAL_RANK"])
-        if self.use_gpu:
+        if task.use_gpu:
             logging.info("Using GPU, CUDA device index: {}".format(local_rank))
             set_cuda_device_index(local_rank)
         else:
             logging.info("Using CPU")
             set_cpu_device()
+
+        super().train(task)
diff --git a/classy_vision/trainer/local_trainer.py b/classy_vision/trainer/local_trainer.py
index 4435381ffb..05be158943 100644
--- a/classy_vision/trainer/local_trainer.py
+++ b/classy_vision/trainer/local_trainer.py
@@ -16,32 +16,12 @@ class LocalTrainer(ClassyTrainer):
     """Trainer to be used if you want want use only a single training process.
     """
 
-    def __init__(
-        self,
-        use_gpu: Optional[bool] = None,
-        num_dataloader_workers: int = 0,
-        dataloader_mp_context: Optional[str] = None,
-    ):
-        """Constructor for LocalTrainer.
-
-        Args:
-            use_gpu: If true, then use GPU 0 for training.
-                If None, then check if we have GPUs available, if we do
-                then use GPU for training.
-            num_dataloader_workers: Number of CPU processes doing dataloading
-                per GPU. If 0, then dataloading is done on main thread.
-            dataloader_mp_context: Determines how to launch
-                new processes for dataloading. Must be one of "fork", "forkserver",
-                "spawn". If None, process launching is inherited from parent.
-        """
-        super().__init__(
-            use_gpu=use_gpu,
-            num_dataloader_workers=num_dataloader_workers,
-            dataloader_mp_context=dataloader_mp_context,
-        )
-        if self.use_gpu:
+    def train(self, task):
+        if task.use_gpu:
             logging.info("Using GPU, CUDA device index: {}".format(0))
             set_cuda_device_index(0)
         else:
             logging.info("Using CPU")
             set_cpu_device()
+
+        super().train(task)
diff --git a/test/generic_util_test.py b/test/generic_util_test.py
index dca4d76e75..a43d6b41ea 100644
--- a/test/generic_util_test.py
+++ b/test/generic_util_test.py
@@ -437,7 +437,7 @@ def test_update_classy_state(self):
         task = build_task(config)
         task_2 = build_task(config)
         task_2.prepare()
-        trainer = LocalTrainer(use_gpu=False)
+        trainer = LocalTrainer()
         trainer.train(task)
         update_classy_state(task_2, task.get_classy_state(deep_copy=True))
         self._compare_states(task.get_classy_state(), task_2.get_classy_state())
@@ -449,13 +449,12 @@ def test_update_classy_model(self):
         """
         config = get_fast_test_task_config()
         task = build_task(config)
-        use_gpu = torch.cuda.is_available()
-        trainer = LocalTrainer(use_gpu=use_gpu)
+        trainer = LocalTrainer()
         trainer.train(task)
         for reset_heads in [False, True]:
             task_2 = build_task(config)
             # prepare task_2 for the right device
-            task_2.prepare(use_gpu=use_gpu)
+            task_2.prepare()
             update_classy_model(
                 task_2.model, task.model.get_classy_state(deep_copy=True), reset_heads
             )
diff --git a/test/hooks_checkpoint_hook_test.py b/test/hooks_checkpoint_hook_test.py
index 828bd998ef..7996f2a221 100644
--- a/test/hooks_checkpoint_hook_test.py
+++ b/test/hooks_checkpoint_hook_test.py
@@ -155,7 +155,7 @@ def test_checkpointing(self):
         cuda_available = torch.cuda.is_available()
 
         task = build_task(config)
-        task.prepare(use_gpu=cuda_available)
+        task.prepare()
 
         # create a checkpoint hook
         checkpoint_hook = CheckpointHook(checkpoint_folder, {}, phase_types=["train"])
@@ -175,8 +175,8 @@
         # set the checkpoint
         task.set_checkpoint(checkpoint)
 
-        task.prepare(use_gpu=use_gpu)
+        task.set_use_gpu(use_gpu)
 
         # we should be able to run the trainer using the checkpoint
-        trainer = LocalTrainer(use_gpu=use_gpu)
+        trainer = LocalTrainer()
         trainer.train(task)
diff --git a/test/manual/tasks_classification_task_amp_test.py b/test/manual/tasks_classification_task_amp_test.py
index 80edc14b8b..ea245e8a6e 100644
--- a/test/manual/tasks_classification_task_amp_test.py
+++ b/test/manual/tasks_classification_task_amp_test.py
@@ -32,5 +32,6 @@ def test_training(self):
         config = get_fast_test_task_config()
         config["amp_args"] = {"opt_level": "O2"}
         task = build_task(config)
-        trainer = LocalTrainer(use_gpu=True)
+        task.set_use_gpu(True)
+        trainer = LocalTrainer()
         trainer.train(task)
diff --git a/test/tasks_classification_task_test.py b/test/tasks_classification_task_test.py
index 7580189e5f..8a90b366d9 100644
--- a/test/tasks_classification_task_test.py
+++ b/test/tasks_classification_task_test.py
@@ -65,10 +65,10 @@ def test_get_state(self):
             dataset = build_dataset(config["dataset"][phase_type])
             task.set_dataset(dataset, phase_type)
 
-        task.prepare(num_dataloader_workers=1, pin_memory=False)
+        task.prepare(num_dataloader_workers=1)
 
         task = build_task(config)
-        task.prepare(num_dataloader_workers=1, pin_memory=False)
+        task.prepare(num_dataloader_workers=1)
 
     def test_checkpointing(self):
         """
@@ -79,10 +79,10 @@ def test_checkpointing(self):
         task = build_task(config).set_hooks([LossLrMeterLoggingHook()])
         task_2 = build_task(config).set_hooks([LossLrMeterLoggingHook()])
 
-        use_gpu = torch.cuda.is_available()
+        task.set_use_gpu(torch.cuda.is_available())
 
         # prepare the tasks for the right device
-        task.prepare(use_gpu=use_gpu)
+        task.prepare()
 
         # test in both train and test mode
         for _ in range(2):
@@ -90,7 +90,7 @@
 
             # set task's state as task_2's checkpoint
             task_2.set_checkpoint(get_checkpoint_dict(task, {}, deep_copy=True))
-            task_2.prepare(use_gpu=use_gpu)
+            task_2.prepare()
 
             # task 2 should have the same state
             self._compare_states(task.get_classy_state(), task_2.get_classy_state())
@@ -102,8 +102,8 @@
 
             # test that the train step runs the same way on both states
             # and the loss remains the same
-            task.train_step(use_gpu)
-            task_2.train_step(use_gpu)
+            task.train_step()
+            task_2.train_step()
             self._compare_states(task.get_classy_state(), task_2.get_classy_state())
 
     def test_final_train_checkpoint(self):
@@ -115,9 +115,9 @@ def test_final_train_checkpoint(self):
         )
         task_2 = build_task(config)
 
-        use_gpu = torch.cuda.is_available()
+        task.set_use_gpu(torch.cuda.is_available())
 
-        trainer = LocalTrainer(use_gpu=use_gpu)
+        trainer = LocalTrainer()
         trainer.train(task)
 
         # load the final train checkpoint
@@ -130,7 +130,7 @@ def test_final_train_checkpoint(self):
 
         # set task_2's state as task's final train checkpoint
         task_2.set_checkpoint(checkpoint)
-        task_2.prepare(use_gpu=use_gpu)
+        task_2.prepare()
 
         # we should be able to train the task
         trainer.train(task_2)
@@ -148,20 +148,18 @@ def test_test_only_checkpointing(self):
         train_task = build_task(train_config).set_hooks([LossLrMeterLoggingHook()])
         test_only_task = build_task(test_config).set_hooks([LossLrMeterLoggingHook()])
 
-        use_gpu = torch.cuda.is_available()
-
         # prepare the tasks for the right device
-        train_task.prepare(use_gpu=use_gpu)
+        train_task.prepare()
 
         # test in both train and test mode
-        trainer = LocalTrainer(use_gpu=use_gpu)
+        trainer = LocalTrainer()
         trainer.train(train_task)
 
         # set task's state as task_2's checkpoint
         test_only_task.set_checkpoint(
             get_checkpoint_dict(train_task, {}, deep_copy=True)
         )
-        test_only_task.prepare(use_gpu=use_gpu)
+        test_only_task.prepare()
         test_state = test_only_task.get_classy_state()
 
         # We expect the phase idx to be different for a test only task
@@ -177,7 +175,7 @@ def test_test_only_checkpointing(self):
         self.assertEqual(test_state["train_phase_idx"], -1)
 
         # Verify task will run
-        trainer = LocalTrainer(use_gpu=use_gpu)
+        trainer = LocalTrainer()
         trainer.train(test_only_task)
 
     @unittest.skipUnless(torch.cuda.is_available(), "This test needs a gpu to run")
@@ -187,11 +185,13 @@ def test_checkpointing_different_device(self):
         task_2 = build_task(config)
 
         for use_gpu in [True, False]:
-            task.prepare(use_gpu=use_gpu)
+            task.set_use_gpu(use_gpu)
+            task.prepare()
 
             # set task's state as task_2's checkpoint
             task_2.set_checkpoint(get_checkpoint_dict(task, {}, deep_copy=True))
 
             # we should be able to run the trainer using state from a different device
-            trainer = LocalTrainer(use_gpu=not use_gpu)
+            trainer = LocalTrainer()
+            task_2.set_use_gpu(not use_gpu)
             trainer.train(task_2)
diff --git a/test/trainer_distributed_trainer_test.py b/test/trainer_distributed_trainer_test.py
index 675a3c32e2..fb8727dfc2 100644
--- a/test/trainer_distributed_trainer_test.py
+++ b/test/trainer_distributed_trainer_test.py
@@ -44,7 +44,6 @@ def test_training(self):
        """Checks we can train a small MLP model."""
 
         num_processes = 2
-        device = "gpu" if torch.cuda.is_available() else "cpu"
 
         for config_key, expected_success in [
             ("invalid_config", False),
@@ -57,7 +56,6 @@
                 --master_port=29500 \
                 --use_env \
                 {self.path}/../classy_train.py \
-                --device={device} \
                 --config={self.config_files[config_key]} \
                 --num_workers=4 \
                 --log_freq=100 \
@@ -72,7 +70,6 @@ def test_sync_batch_norm(self):
        """Test that sync batch norm training doesn't hang."""
 
         num_processes = 2
-        device = "gpu"
 
         cmd = f"""{sys.executable} -m torch.distributed.launch \
             --nnodes=1 \
@@ -81,7 +78,6 @@
             --master_port=29500 \
             --use_env \
             {self.path}/../classy_train.py \
-            --device={device} \
             --config={self.config_files["sync_bn_config"]} \
             --num_workers=4 \
             --log_freq=100 \