Lightning-AI · williamFalcon · Sep 14, 2020 · Sep 14, 2020 · Sep 14, 2020 · Sep 14, 2020
@@ -133,7 +133,7 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_
     def on_train_batch_end(self, trainer, pl_module, batch, batch_idx, dataloader_idx):
         gpu_stat_keys = self._get_gpu_stat_keys() + self._get_gpu_device_stat_keys()
         gpu_stats = self._get_gpu_stats(gpu_stat_keys)
-        
+
         if self._log_stats.inter_step_time:
             self._snap_inter_step_time = time.time()
 

@@ -191,62 +191,6 @@ def restore_hpc_weights_if_needed(self, model: LightningModule):
                 did_restore = True
         return did_restore
 
-    def restore_training_state(self, checkpoint):
-        """
-        Restore trainer state.
-        Model will get its change to update
-        :param checkpoint:
-        :return:
-        """
-        if 'optimizer_states' not in checkpoint or 'lr_schedulers' not in checkpoint:
-            raise KeyError(
-                'Trying to restore training state but checkpoint contains only the model.'
-                ' This is probably due to `ModelCheckpoint.save_weights_only` being set to `True`.'
-            )
-
-        if any([key in checkpoint for key in DEPRECATED_CHECKPOINT_KEYS]):
-            raise ValueError(
-                "The checkpoint you're attempting to load follows an"
-                " outdated schema. You can upgrade to the current schema by running"
-                " `python -m pytorch_lightning.utilities.upgrade_checkpoint --file model.ckpt`"
-                " where `model.ckpt` is your checkpoint file."
-            )
-
-        # load callback states
-        self.trainer.on_load_checkpoint(checkpoint)
-
-        self.trainer.global_step = checkpoint['global_step']
-        self.trainer.current_epoch = checkpoint['epoch']
-
-        # Division deals with global step stepping once per accumulated batch
-        # Inequality deals with different global step for odd vs even num_training_batches
-        n_accum = 1 if self.trainer.accumulate_grad_batches is None else self.trainer.accumulate_grad_batches
-        expected_steps = self.trainer.num_training_batches / n_accum
-        if self.trainer.num_training_batches != 0 and self.trainer.global_step % expected_steps > 1:
-            rank_zero_warn(
-                "You're resuming from a checkpoint that ended mid-epoch. "
-                "This can cause unreliable results if further training is done, "
-                "consider using an end of epoch checkpoint. "
-            )
-
-        # restore the optimizers
-        optimizer_states = checkpoint['optimizer_states']
-        for optimizer, opt_state in zip(self.trainer.optimizers, optimizer_states):
-            optimizer.load_state_dict(opt_state)
-
-            # move optimizer to GPU 1 weight at a time
-            # avoids OOM
-            if self.trainer.root_gpu is not None:
-                for state in optimizer.state.values():
-                    for k, v in state.items():
-                        if isinstance(v, torch.Tensor):
-                            state[k] = v.cuda(self.trainer.root_gpu)
-
-        # restore the lr schedulers
-        lr_schedulers = checkpoint['lr_schedulers']
-        for scheduler, lrs_state in zip(self.trainer.lr_schedulers, lr_schedulers):
-            scheduler['scheduler'].load_state_dict(lrs_state)
-
     # ----------------------------------
     # PRIVATE OPS
     # ----------------------------------

@@ -327,75 +327,6 @@ def suggestion(self, skip_begin: int = 10, skip_end: int = 1):
             self._optimal_idx = None
 
 
-class _LRCallback(Callback):
-    """ Special callback used by the learning rate finder. This callbacks log
-    the learning rate before each batch and log the corresponding loss after
-    each batch.
-
-    Args:
-        num_training: number of iterations done by the learning rate finder
-        early_stop_threshold: threshold for stopping the search. If the
-            loss at any point is larger than ``early_stop_threshold*best_loss``
-            then the search is stopped. To disable, set to ``None``.
-        progress_bar_refresh_rate: rate to refresh the progress bar for
-            the learning rate finder
-        beta: smoothing value, the loss being logged is a running average of
-            loss values logged until now. ``beta`` controls the forget rate i.e.
-            if ``beta=0`` all past information is ignored.
-
-    """
-    def __init__(self, num_training: int,
-                 early_stop_threshold: float = 4.0,
-                 progress_bar_refresh_rate: int = 0,
-                 beta: float = 0.98):
-        self.num_training = num_training
-        self.early_stop_threshold = early_stop_threshold
-        self.beta = beta
-        self.losses = []
-        self.lrs = []
-        self.avg_loss = 0.0
-        self.best_loss = 0.0
-        self.progress_bar_refresh_rate = progress_bar_refresh_rate
-        self.progress_bar = None
-
-    def on_batch_start(self, trainer, pl_module):
-        """ Called before each training batch, logs the lr that will be used """
-        if (trainer.batch_idx + 1) % trainer.accumulate_grad_batches != 0:
-            return
-
-        if self.progress_bar_refresh_rate and self.progress_bar is None:
-            self.progress_bar = tqdm(desc='Finding best initial lr', total=self.num_training)
-
-        self.lrs.append(trainer.lr_schedulers[0]['scheduler'].lr[0])
-
-    def on_train_batch_end(self, trainer, pl_module, batch, batch_idx, dataloader_idx):
-        """ Called when the training batch ends, logs the calculated loss """
-        if (trainer.batch_idx + 1) % trainer.accumulate_grad_batches != 0:
-            return
-
-        if self.progress_bar:
-            self.progress_bar.update()
-
-        current_loss = trainer.train_loop.running_loss.last().item()
-        current_step = trainer.global_step + 1  # remove the +1 in 1.0
-
-        # Avg loss (loss with momentum) + smoothing
-        self.avg_loss = self.beta * self.avg_loss + (1 - self.beta) * current_loss
-        smoothed_loss = self.avg_loss / (1 - self.beta**current_step)
-
-        # Check if we diverging
-        if self.early_stop_threshold is not None:
-            if current_step > 1 and smoothed_loss > self.early_stop_threshold * self.best_loss:
-                trainer.max_steps = current_step  # stop signal
-                if self.progress_bar:
-                    self.progress_bar.close()
-
-        # Save best loss for diverging checking
-        if smoothed_loss < self.best_loss or current_step == 1:
-            self.best_loss = smoothed_loss
-
-        self.losses.append(smoothed_loss)
-
 class _LRCallback(Callback):
     """ Special callback used by the learning rate finder. This callbacks log
     the learning rate before each batch and log the corresponding loss after

@@ -235,11 +235,9 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_
 @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows")
 @pytest.mark.parametrize("logger_class", [
     TensorBoardLogger,
-    CometLogger,
     MLFlowLogger,
     NeptuneLogger,
     TestTubeLogger,
-    WandbLogger,
 ])
 def test_logger_created_on_rank_zero_only(tmpdir, monkeypatch, logger_class):
     """ Test that loggers get replaced by dummy logges on global rank > 0"""

@@ -13,36 +13,6 @@
 from tests.base import EvalModelTemplate
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-def test_multi_gpu_wandb_ddp_spawn(tmpdir):
-    """
-    Test ddp + wb
-    """
-    from pytorch_lightning.loggers import WandbLogger
-    tutils.set_random_master_port()
-
-    model = EvalModelTemplate()
-
-    wandb.run = MagicMock()
-    wandb.init(name='name', project='project')
-
-    logger = WandbLogger(name='name', offline=True)
-    trainer_options = dict(
-        default_root_dir=tmpdir,
-        max_epochs=1,
-        gpus=2,
-        distributed_backend='ddp_spawn',
-        precision=16,
-        logger=logger,
-
-    )
-    # tutils.run_model_test(trainer_options, model)
-    trainer = Trainer(**trainer_options)
-    result = trainer.fit(model)
-    assert result
-    trainer.test(model)
-
-
 @pytest.mark.skip(reason='dp + amp not supported currently')  # TODO
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
 def test_amp_single_gpu_dp(tmpdir):

@@ -1065,46 +1065,3 @@ def test_dataloaders_load_only_once_passed_loaders(tmpdir):
     ]
     for call, expected in zip(calls, expected_sequence):
         assert call['name'] == expected
-
-
-def test_dataloaders_load_every_epoch(tmpdir):
-    os.environ['PL_DEV_DEBUG'] = '1'
-
-    model = EvalModelTemplate()
-    train_loader = model.train_dataloader()
-    model.train_dataloader = None
-    val_loader = model.val_dataloader()
-    model.val_dataloader = None
-    test_loader = model.test_dataloader()
-    model.test_dataloader = None
-
-    # logger file to get meta
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        limit_train_batches=0.3,
-        limit_val_batches=0.3,
-        reload_dataloaders_every_epoch=True,
-        max_epochs=3,
-    )
-    result = trainer.fit(model, train_loader, val_loader)
-
-    trainer.test(test_dataloaders=test_loader)
-
-    assert len(trainer.dev_debugger.val_dataloader_calls) == 4
-    assert len(trainer.dev_debugger.train_dataloader_calls) == 3
-    assert len(trainer.dev_debugger.test_dataloader_calls) == 1
-
-    # verify the sequence
-    calls = trainer.dev_debugger.dataloader_sequence_calls
-    expected_sequence = [
-        'val_dataloader',
-        'train_dataloader',
-        'val_dataloader',
-        'train_dataloader',
-        'val_dataloader',
-        'train_dataloader',
-        'val_dataloader',
-        'test_dataloader'
-    ]
-    for call, expected in zip(calls, expected_sequence):
-        assert call['name'] == expected