diff --git a/pytorch_lightning/callbacks/gpu_stats_monitor.py b/pytorch_lightning/callbacks/gpu_stats_monitor.py
index 4657056aaaeeb..60d54156265bb 100644
--- a/pytorch_lightning/callbacks/gpu_stats_monitor.py
+++ b/pytorch_lightning/callbacks/gpu_stats_monitor.py
@@ -133,7 +133,7 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_
     def on_train_batch_end(self, trainer, pl_module, batch, batch_idx, dataloader_idx):
         gpu_stat_keys = self._get_gpu_stat_keys() + self._get_gpu_device_stat_keys()
         gpu_stats = self._get_gpu_stats(gpu_stat_keys)
-
+
         if self._log_stats.inter_step_time:
             self._snap_inter_step_time = time.time()

diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
index f883f8b080e86..df6e8b8213c36 100644
--- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py
+++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
@@ -191,62 +191,6 @@ def restore_hpc_weights_if_needed(self, model: LightningModule):
                 did_restore = True
         return did_restore

-    def restore_training_state(self, checkpoint):
-        """
-        Restore trainer state.
-        Model will get its change to update
-        :param checkpoint:
-        :return:
-        """
-        if 'optimizer_states' not in checkpoint or 'lr_schedulers' not in checkpoint:
-            raise KeyError(
-                'Trying to restore training state but checkpoint contains only the model.'
-                ' This is probably due to `ModelCheckpoint.save_weights_only` being set to `True`.'
-            )
-
-        if any([key in checkpoint for key in DEPRECATED_CHECKPOINT_KEYS]):
-            raise ValueError(
-                "The checkpoint you're attempting to load follows an"
-                " outdated schema. You can upgrade to the current schema by running"
-                " `python -m pytorch_lightning.utilities.upgrade_checkpoint --file model.ckpt`"
-                " where `model.ckpt` is your checkpoint file."
-            )
-
-        # load callback states
-        self.trainer.on_load_checkpoint(checkpoint)
-
-        self.trainer.global_step = checkpoint['global_step']
-        self.trainer.current_epoch = checkpoint['epoch']
-
-        # Division deals with global step stepping once per accumulated batch
-        # Inequality deals with different global step for odd vs even num_training_batches
-        n_accum = 1 if self.trainer.accumulate_grad_batches is None else self.trainer.accumulate_grad_batches
-        expected_steps = self.trainer.num_training_batches / n_accum
-        if self.trainer.num_training_batches != 0 and self.trainer.global_step % expected_steps > 1:
-            rank_zero_warn(
-                "You're resuming from a checkpoint that ended mid-epoch. "
-                "This can cause unreliable results if further training is done, "
-                "consider using an end of epoch checkpoint. "
-            )
-
-        # restore the optimizers
-        optimizer_states = checkpoint['optimizer_states']
-        for optimizer, opt_state in zip(self.trainer.optimizers, optimizer_states):
-            optimizer.load_state_dict(opt_state)
-
-            # move optimizer to GPU 1 weight at a time
-            # avoids OOM
-            if self.trainer.root_gpu is not None:
-                for state in optimizer.state.values():
-                    for k, v in state.items():
-                        if isinstance(v, torch.Tensor):
-                            state[k] = v.cuda(self.trainer.root_gpu)
-
-        # restore the lr schedulers
-        lr_schedulers = checkpoint['lr_schedulers']
-        for scheduler, lrs_state in zip(self.trainer.lr_schedulers, lr_schedulers):
-            scheduler['scheduler'].load_state_dict(lrs_state)
-
     # ----------------------------------
     # PRIVATE OPS
     # ----------------------------------
diff --git a/pytorch_lightning/tuner/lr_finder.py b/pytorch_lightning/tuner/lr_finder.py
index 37a7d2f6b7780..71756678af9c5 100644
--- a/pytorch_lightning/tuner/lr_finder.py
+++ b/pytorch_lightning/tuner/lr_finder.py
@@ -327,75 +327,6 @@ def suggestion(self, skip_begin: int = 10, skip_end: int = 1):
             self._optimal_idx = None


-class _LRCallback(Callback):
-    """ Special callback used by the learning rate finder. This callbacks log
-    the learning rate before each batch and log the corresponding loss after
-    each batch.
-
-    Args:
-        num_training: number of iterations done by the learning rate finder
-        early_stop_threshold: threshold for stopping the search. If the
-            loss at any point is larger than ``early_stop_threshold*best_loss``
-            then the search is stopped. To disable, set to ``None``.
-        progress_bar_refresh_rate: rate to refresh the progress bar for
-            the learning rate finder
-        beta: smoothing value, the loss being logged is a running average of
-            loss values logged until now. ``beta`` controls the forget rate i.e.
-            if ``beta=0`` all past information is ignored.
-
-    """
-    def __init__(self, num_training: int,
-                 early_stop_threshold: float = 4.0,
-                 progress_bar_refresh_rate: int = 0,
-                 beta: float = 0.98):
-        self.num_training = num_training
-        self.early_stop_threshold = early_stop_threshold
-        self.beta = beta
-        self.losses = []
-        self.lrs = []
-        self.avg_loss = 0.0
-        self.best_loss = 0.0
-        self.progress_bar_refresh_rate = progress_bar_refresh_rate
-        self.progress_bar = None
-
-    def on_batch_start(self, trainer, pl_module):
-        """ Called before each training batch, logs the lr that will be used """
-        if (trainer.batch_idx + 1) % trainer.accumulate_grad_batches != 0:
-            return
-
-        if self.progress_bar_refresh_rate and self.progress_bar is None:
-            self.progress_bar = tqdm(desc='Finding best initial lr', total=self.num_training)
-
-        self.lrs.append(trainer.lr_schedulers[0]['scheduler'].lr[0])
-
-    def on_train_batch_end(self, trainer, pl_module, batch, batch_idx, dataloader_idx):
-        """ Called when the training batch ends, logs the calculated loss """
-        if (trainer.batch_idx + 1) % trainer.accumulate_grad_batches != 0:
-            return
-
-        if self.progress_bar:
-            self.progress_bar.update()
-
-        current_loss = trainer.train_loop.running_loss.last().item()
-        current_step = trainer.global_step + 1  # remove the +1 in 1.0
-
-        # Avg loss (loss with momentum) + smoothing
-        self.avg_loss = self.beta * self.avg_loss + (1 - self.beta) * current_loss
-        smoothed_loss = self.avg_loss / (1 - self.beta**current_step)
-
-        # Check if we diverging
-        if self.early_stop_threshold is not None:
-            if current_step > 1 and smoothed_loss > self.early_stop_threshold * self.best_loss:
-                trainer.max_steps = current_step  # stop signal
-                if self.progress_bar:
-                    self.progress_bar.close()
-
-        # Save best loss for diverging checking
-        if smoothed_loss < self.best_loss or current_step == 1:
-            self.best_loss = smoothed_loss
-
-        self.losses.append(smoothed_loss)
-
 class _LRCallback(Callback):
     """ Special callback used by the learning rate finder. This callbacks log
     the learning rate before each batch and log the corresponding loss after
diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py
index 6f7bbfa6bfdbc..0f601949443f5 100644
--- a/tests/loggers/test_all.py
+++ b/tests/loggers/test_all.py
@@ -235,11 +235,9 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_
 @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows")
 @pytest.mark.parametrize("logger_class", [
     TensorBoardLogger,
-    CometLogger,
     MLFlowLogger,
     NeptuneLogger,
     TestTubeLogger,
-    WandbLogger,
 ])
 def test_logger_created_on_rank_zero_only(tmpdir, monkeypatch, logger_class):
     """ Test that loggers get replaced by dummy logges on global rank > 0"""
diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py
index 80e587f34a041..b1d92cf8e694e 100644
--- a/tests/models/test_amp.py
+++ b/tests/models/test_amp.py
@@ -13,36 +13,6 @@
 from tests.base import EvalModelTemplate


-@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-def test_multi_gpu_wandb_ddp_spawn(tmpdir):
-    """
-    Test ddp + wb
-    """
-    from pytorch_lightning.loggers import WandbLogger
-    tutils.set_random_master_port()
-
-    model = EvalModelTemplate()
-
-    wandb.run = MagicMock()
-    wandb.init(name='name', project='project')
-
-    logger = WandbLogger(name='name', offline=True)
-    trainer_options = dict(
-        default_root_dir=tmpdir,
-        max_epochs=1,
-        gpus=2,
-        distributed_backend='ddp_spawn',
-        precision=16,
-        logger=logger,
-
-    )
-    # tutils.run_model_test(trainer_options, model)
-    trainer = Trainer(**trainer_options)
-    result = trainer.fit(model)
-    assert result
-    trainer.test(model)
-
-
 @pytest.mark.skip(reason='dp + amp not supported currently')  # TODO
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
 def test_amp_single_gpu_dp(tmpdir):
diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index 85b8241e2b5ac..9618502887039 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -1065,46 +1065,3 @@ def test_dataloaders_load_only_once_passed_loaders(tmpdir):
     ]
     for call, expected in zip(calls, expected_sequence):
         assert call['name'] == expected
-
-
-def test_dataloaders_load_every_epoch(tmpdir):
-    os.environ['PL_DEV_DEBUG'] = '1'
-
-    model = EvalModelTemplate()
-    train_loader = model.train_dataloader()
-    model.train_dataloader = None
-    val_loader = model.val_dataloader()
-    model.val_dataloader = None
-    test_loader = model.test_dataloader()
-    model.test_dataloader = None
-
-    # logger file to get meta
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        limit_train_batches=0.3,
-        limit_val_batches=0.3,
-        reload_dataloaders_every_epoch=True,
-        max_epochs=3,
-    )
-    result = trainer.fit(model, train_loader, val_loader)
-
-    trainer.test(test_dataloaders=test_loader)
-
-    assert len(trainer.dev_debugger.val_dataloader_calls) == 4
-    assert len(trainer.dev_debugger.train_dataloader_calls) == 3
-    assert len(trainer.dev_debugger.test_dataloader_calls) == 1
-
-    # verify the sequence
-    calls = trainer.dev_debugger.dataloader_sequence_calls
-    expected_sequence = [
-        'val_dataloader',
-        'train_dataloader',
-        'val_dataloader',
-        'train_dataloader',
-        'val_dataloader',
-        'train_dataloader',
-        'val_dataloader',
-        'test_dataloader'
-    ]
-    for call, expected in zip(calls, expected_sequence):
-        assert call['name'] == expected