Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cleaning up stale logger tests + flake8 #3490

Merged
merged 6 commits into from
Sep 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pytorch_lightning/callbacks/gpu_stats_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_
def on_train_batch_end(self, trainer, pl_module, batch, batch_idx, dataloader_idx):
gpu_stat_keys = self._get_gpu_stat_keys() + self._get_gpu_device_stat_keys()
gpu_stats = self._get_gpu_stats(gpu_stat_keys)

if self._log_stats.inter_step_time:
self._snap_inter_step_time = time.time()

Expand Down
56 changes: 0 additions & 56 deletions pytorch_lightning/trainer/connectors/checkpoint_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,62 +191,6 @@ def restore_hpc_weights_if_needed(self, model: LightningModule):
did_restore = True
return did_restore

def restore_training_state(self, checkpoint):
"""
Restore trainer state.
Model will get its change to update
:param checkpoint:
:return:
"""
if 'optimizer_states' not in checkpoint or 'lr_schedulers' not in checkpoint:
raise KeyError(
'Trying to restore training state but checkpoint contains only the model.'
' This is probably due to `ModelCheckpoint.save_weights_only` being set to `True`.'
)

if any([key in checkpoint for key in DEPRECATED_CHECKPOINT_KEYS]):
raise ValueError(
"The checkpoint you're attempting to load follows an"
" outdated schema. You can upgrade to the current schema by running"
" `python -m pytorch_lightning.utilities.upgrade_checkpoint --file model.ckpt`"
" where `model.ckpt` is your checkpoint file."
)

# load callback states
self.trainer.on_load_checkpoint(checkpoint)

self.trainer.global_step = checkpoint['global_step']
self.trainer.current_epoch = checkpoint['epoch']

# Division deals with global step stepping once per accumulated batch
# Inequality deals with different global step for odd vs even num_training_batches
n_accum = 1 if self.trainer.accumulate_grad_batches is None else self.trainer.accumulate_grad_batches
expected_steps = self.trainer.num_training_batches / n_accum
if self.trainer.num_training_batches != 0 and self.trainer.global_step % expected_steps > 1:
rank_zero_warn(
"You're resuming from a checkpoint that ended mid-epoch. "
"This can cause unreliable results if further training is done, "
"consider using an end of epoch checkpoint. "
)

# restore the optimizers
optimizer_states = checkpoint['optimizer_states']
for optimizer, opt_state in zip(self.trainer.optimizers, optimizer_states):
optimizer.load_state_dict(opt_state)

# move optimizer to GPU 1 weight at a time
# avoids OOM
if self.trainer.root_gpu is not None:
for state in optimizer.state.values():
for k, v in state.items():
if isinstance(v, torch.Tensor):
state[k] = v.cuda(self.trainer.root_gpu)

# restore the lr schedulers
lr_schedulers = checkpoint['lr_schedulers']
for scheduler, lrs_state in zip(self.trainer.lr_schedulers, lr_schedulers):
scheduler['scheduler'].load_state_dict(lrs_state)

# ----------------------------------
# PRIVATE OPS
# ----------------------------------
Expand Down
69 changes: 0 additions & 69 deletions pytorch_lightning/tuner/lr_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,75 +327,6 @@ def suggestion(self, skip_begin: int = 10, skip_end: int = 1):
self._optimal_idx = None


class _LRCallback(Callback):
""" Special callback used by the learning rate finder. This callbacks log
the learning rate before each batch and log the corresponding loss after
each batch.

Args:
num_training: number of iterations done by the learning rate finder
early_stop_threshold: threshold for stopping the search. If the
loss at any point is larger than ``early_stop_threshold*best_loss``
then the search is stopped. To disable, set to ``None``.
progress_bar_refresh_rate: rate to refresh the progress bar for
the learning rate finder
beta: smoothing value, the loss being logged is a running average of
loss values logged until now. ``beta`` controls the forget rate i.e.
if ``beta=0`` all past information is ignored.

"""
def __init__(self, num_training: int,
early_stop_threshold: float = 4.0,
progress_bar_refresh_rate: int = 0,
beta: float = 0.98):
self.num_training = num_training
self.early_stop_threshold = early_stop_threshold
self.beta = beta
self.losses = []
self.lrs = []
self.avg_loss = 0.0
self.best_loss = 0.0
self.progress_bar_refresh_rate = progress_bar_refresh_rate
self.progress_bar = None

def on_batch_start(self, trainer, pl_module):
""" Called before each training batch, logs the lr that will be used """
if (trainer.batch_idx + 1) % trainer.accumulate_grad_batches != 0:
return

if self.progress_bar_refresh_rate and self.progress_bar is None:
self.progress_bar = tqdm(desc='Finding best initial lr', total=self.num_training)

self.lrs.append(trainer.lr_schedulers[0]['scheduler'].lr[0])

def on_train_batch_end(self, trainer, pl_module, batch, batch_idx, dataloader_idx):
""" Called when the training batch ends, logs the calculated loss """
if (trainer.batch_idx + 1) % trainer.accumulate_grad_batches != 0:
return

if self.progress_bar:
self.progress_bar.update()

current_loss = trainer.train_loop.running_loss.last().item()
current_step = trainer.global_step + 1 # remove the +1 in 1.0

# Avg loss (loss with momentum) + smoothing
self.avg_loss = self.beta * self.avg_loss + (1 - self.beta) * current_loss
smoothed_loss = self.avg_loss / (1 - self.beta**current_step)

# Check if we diverging
if self.early_stop_threshold is not None:
if current_step > 1 and smoothed_loss > self.early_stop_threshold * self.best_loss:
trainer.max_steps = current_step # stop signal
if self.progress_bar:
self.progress_bar.close()

# Save best loss for diverging checking
if smoothed_loss < self.best_loss or current_step == 1:
self.best_loss = smoothed_loss

self.losses.append(smoothed_loss)

class _LRCallback(Callback):
""" Special callback used by the learning rate finder. This callbacks log
the learning rate before each batch and log the corresponding loss after
Expand Down
2 changes: 0 additions & 2 deletions tests/loggers/test_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,11 +235,9 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_
@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows")
@pytest.mark.parametrize("logger_class", [
TensorBoardLogger,
CometLogger,
MLFlowLogger,
NeptuneLogger,
TestTubeLogger,
WandbLogger,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@williamFalcon are these tests causing problems?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes... these loggers are having issues where they freeze and such. I’m honestly not sure it’s worth running some of these tests anymore if the loggers aren’t properly designed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh, that's new. I use wandb logger every day non stop and no issues with PL. Can you give me some pointer, I'd like to help fix the root issue.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in the mean time, can we comment the lines and add a todo note?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure. i’ll ping you on slack

])
def test_logger_created_on_rank_zero_only(tmpdir, monkeypatch, logger_class):
""" Test that loggers get replaced by dummy logges on global rank > 0"""
Expand Down
30 changes: 0 additions & 30 deletions tests/models/test_amp.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,36 +13,6 @@
from tests.base import EvalModelTemplate


@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_multi_gpu_wandb_ddp_spawn(tmpdir):
"""
Test ddp + wb
"""
from pytorch_lightning.loggers import WandbLogger
tutils.set_random_master_port()

model = EvalModelTemplate()

wandb.run = MagicMock()
wandb.init(name='name', project='project')

logger = WandbLogger(name='name', offline=True)
trainer_options = dict(
default_root_dir=tmpdir,
max_epochs=1,
gpus=2,
distributed_backend='ddp_spawn',
precision=16,
logger=logger,

)
# tutils.run_model_test(trainer_options, model)
trainer = Trainer(**trainer_options)
result = trainer.fit(model)
assert result
trainer.test(model)


@pytest.mark.skip(reason='dp + amp not supported currently') # TODO
@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
def test_amp_single_gpu_dp(tmpdir):
Expand Down
43 changes: 0 additions & 43 deletions tests/trainer/test_dataloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -1065,46 +1065,3 @@ def test_dataloaders_load_only_once_passed_loaders(tmpdir):
]
for call, expected in zip(calls, expected_sequence):
assert call['name'] == expected


def test_dataloaders_load_every_epoch(tmpdir):
os.environ['PL_DEV_DEBUG'] = '1'

model = EvalModelTemplate()
train_loader = model.train_dataloader()
model.train_dataloader = None
val_loader = model.val_dataloader()
model.val_dataloader = None
test_loader = model.test_dataloader()
model.test_dataloader = None

# logger file to get meta
trainer = Trainer(
default_root_dir=tmpdir,
limit_train_batches=0.3,
limit_val_batches=0.3,
reload_dataloaders_every_epoch=True,
max_epochs=3,
)
result = trainer.fit(model, train_loader, val_loader)

trainer.test(test_dataloaders=test_loader)

assert len(trainer.dev_debugger.val_dataloader_calls) == 4
assert len(trainer.dev_debugger.train_dataloader_calls) == 3
assert len(trainer.dev_debugger.test_dataloader_calls) == 1

# verify the sequence
calls = trainer.dev_debugger.dataloader_sequence_calls
expected_sequence = [
'val_dataloader',
'train_dataloader',
'val_dataloader',
'train_dataloader',
'val_dataloader',
'train_dataloader',
'val_dataloader',
'test_dataloader'
]
for call, expected in zip(calls, expected_sequence):
assert call['name'] == expected