Weekly Patch Release v1.3.3 [full merge, no squash] #7712

Merged
merged 11 commits on May 26, 2021
18 changes: 17 additions & 1 deletion CHANGELOG.md
@@ -5,6 +5,22 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).


## [1.3.3] - 2021-05-27

### Changed

- Moved the call to `untoggle_optimizer(opt_idx)` out of the closure function ([#7563](https://github.com/PyTorchLightning/pytorch-lightning/pull/7563))

### Fixed

- Fixed `ProgressBar` pickling after calling `trainer.predict` ([#7608](https://github.com/PyTorchLightning/pytorch-lightning/pull/7608))
- Fixed broadcasting in multi-node, multi-gpu DDP using torch 1.7 ([#7592](https://github.com/PyTorchLightning/pytorch-lightning/pull/7592))
- Fixed dataloaders not being reset when tuning the model ([#7566](https://github.com/PyTorchLightning/pytorch-lightning/pull/7566))
- Fixed print errors in `ProgressBar` when `trainer.fit` is not called ([#7674](https://github.com/PyTorchLightning/pytorch-lightning/pull/7674))
- Fixed global step update when the epoch is skipped ([#7677](https://github.com/PyTorchLightning/pytorch-lightning/pull/7677))
- Fixed training loop total batch counter when accumulate grad batches was enabled ([#7692](https://github.com/PyTorchLightning/pytorch-lightning/pull/7692))


## [1.3.2] - 2021-05-18

### Changed
@@ -18,9 +34,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed setting correct `DistribType` for `ddp_cpu` (spawn) backend ([#7492](https://github.com/PyTorchLightning/pytorch-lightning/pull/7492))
- Fixed incorrect number of calls to LR scheduler when `check_val_every_n_epoch > 1` ([#7032](https://github.com/PyTorchLightning/pytorch-lightning/pull/7032))


## [1.3.1] - 2021-05-11


Reviewer comment (Contributor): (not blocking) extra whitespace; the suggested change removes the extra blank line here.

### Fixed

- Fixed DeepSpeed with IterableDatasets ([#7362](https://github.com/PyTorchLightning/pytorch-lightning/pull/7362))
2 changes: 1 addition & 1 deletion dockers/base-xla/Dockerfile
@@ -14,7 +14,7 @@

FROM google/cloud-sdk:slim

MAINTAINER PyTorchLightning <https://github.com/PyTorchLightning>
LABEL maintainer="PyTorchLightning <https://github.com/PyTorchLightning>"

# CALL: docker image build -t pytorch-lightning:XLA-extras-py3.6 -f dockers/base-xla/Dockerfile . --build-arg PYTHON_VERSION=3.6
# This Dockerfile installs pytorch/xla 3.7 wheels. There are also 3.6 wheels available; see below.
2 changes: 1 addition & 1 deletion dockers/nvidia/Dockerfile
@@ -15,7 +15,7 @@
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes
FROM nvcr.io/nvidia/pytorch:21.04-py3

MAINTAINER PyTorchLightning <https://github.com/PyTorchLightning>
LABEL maintainer="PyTorchLightning <https://github.com/PyTorchLightning>"

ARG LIGHTNING_VERSION=""

2 changes: 1 addition & 1 deletion dockers/release/Dockerfile
@@ -17,7 +17,7 @@ ARG PYTORCH_VERSION=1.5

FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}

MAINTAINER PyTorchLightning <https://github.com/PyTorchLightning>
LABEL maintainer="PyTorchLightning <https://github.com/PyTorchLightning>"

ARG LIGHTNING_VERSION=""

2 changes: 1 addition & 1 deletion dockers/tpu-tests/Dockerfile
@@ -17,7 +17,7 @@ ARG PYTORCH_VERSION=1.6

FROM pytorchlightning/pytorch_lightning:base-xla-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}

MAINTAINER PyTorchLightning <https://github.com/PyTorchLightning>
LABEL maintainer="PyTorchLightning <https://github.com/PyTorchLightning>"

#SHELL ["/bin/bash", "-c"]

2 changes: 1 addition & 1 deletion pytorch_lightning/__about__.py
@@ -1,7 +1,7 @@
import time

_this_year = time.strftime("%Y")
__version__ = '1.3.2'
__version__ = '1.3.3'
__author__ = 'William Falcon et al.'
__author_email__ = 'waf2107@columbia.edu'
__license__ = 'Apache-2.0'
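For readers verifying the upgrade, a minimal sketch (assuming `pytorch_lightning` is importable in the target environment) that checks the installed patch release matches the bump above:

```python
# Minimal sketch: confirm the environment picked up the 1.3.3 patch release.
import pytorch_lightning as pl

if pl.__version__ != "1.3.3":
    print(f"Expected pytorch-lightning 1.3.3, found {pl.__version__}")
```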
10 changes: 7 additions & 3 deletions pytorch_lightning/callbacks/progress.py
@@ -283,13 +283,15 @@ def __init__(self, refresh_rate: int = 1, process_position: int = 0):
self.main_progress_bar = None
self.val_progress_bar = None
self.test_progress_bar = None
self.predict_progress_bar = None

def __getstate__(self):
# can't pickle the tqdm objects
state = self.__dict__.copy()
state['main_progress_bar'] = None
state['val_progress_bar'] = None
state['test_progress_bar'] = None
state['predict_progress_bar'] = None
return state

@property
@@ -471,12 +473,14 @@ def print(
):
active_progress_bar = None

if not self.main_progress_bar.disable:
if self.main_progress_bar is not None and not self.main_progress_bar.disable:
active_progress_bar = self.main_progress_bar
elif not self.val_progress_bar.disable:
elif self.val_progress_bar is not None and not self.val_progress_bar.disable:
active_progress_bar = self.val_progress_bar
elif not self.test_progress_bar.disable:
elif self.test_progress_bar is not None and not self.test_progress_bar.disable:
active_progress_bar = self.test_progress_bar
elif self.predict_progress_bar is not None and not self.predict_progress_bar.disable:
active_progress_bar = self.predict_progress_bar

if active_progress_bar is not None:
s = sep.join(map(str, args))
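Context for the change above (#7608, #7674): tqdm handles cannot be pickled, so `__getstate__` nulls them out, and `print` now checks each bar for `None` before reading `.disable`, which avoids failures when only `trainer.predict` has run or `trainer.fit` was never called. Below is a minimal, self-contained sketch of the same pattern; `DemoBar` is an illustrative stand-in, not the actual `ProgressBar` class.

```python
import pickle
from tqdm import tqdm


class DemoBar:
    """Illustrative stand-in showing the pickle-safe progress bar pattern."""

    def __init__(self):
        self.main_progress_bar = None
        self.predict_progress_bar = None

    def __getstate__(self):
        # tqdm objects cannot be pickled, so drop them from the state dict
        state = self.__dict__.copy()
        state['main_progress_bar'] = None
        state['predict_progress_bar'] = None
        return state

    def print(self, *args, sep=' '):
        # guard every bar with a None check before touching `.disable`
        active = None
        if self.main_progress_bar is not None and not self.main_progress_bar.disable:
            active = self.main_progress_bar
        elif self.predict_progress_bar is not None and not self.predict_progress_bar.disable:
            active = self.predict_progress_bar
        if active is not None:
            active.write(sep.join(map(str, args)))


bar = DemoBar()
bar.predict_progress_bar = tqdm(total=4)
bar.print("predicting...")   # routed through the active tqdm bar
pickle.dumps(bar)            # succeeds because __getstate__ strips the tqdm handles
```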
9 changes: 4 additions & 5 deletions pytorch_lightning/core/decorators.py
@@ -71,12 +71,11 @@ def auto_transfer_args(self, *args, **kwargs):

def parameter_validation(fn: Callable) -> Callable:
"""
Decorator for :meth:`~pytorch_lightning.core.LightningModule.to` method.
Validates that the module parameter lengths match after moving to the device. It is useful
when tying weights on TPU's.

Args:
fn: ``.to`` method
fn: ``model_to_device`` method

Note:
TPU's require weights to be tied/shared after moving the module to the device.
@@ -90,10 +89,10 @@ def parameter_validation(fn: Callable) -> Callable:

@wraps(fn)
def inner_fn(self, *args, **kwargs):
pre_layer_count = len(list(self.parameters()))
pre_layer_count = len(list(self.model.parameters()))
module = fn(self, *args, **kwargs)
self.on_post_move_to_device()
post_layer_count = len(list(self.parameters()))
self.model.on_post_move_to_device()
post_layer_count = len(list(self.model.parameters()))

if not pre_layer_count == post_layer_count:
rank_zero_warn(
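The hunk above retargets `parameter_validation` from `LightningModule.to` to the plugins' `model_to_device`, counting parameters on `self.model` and calling `self.model.on_post_move_to_device()` so modules can re-tie weights after the move. A hedged sketch of the same idea with illustrative names, not Lightning internals:

```python
import warnings
from functools import wraps
from typing import Callable

import torch.nn as nn


def validate_param_count(fn: Callable) -> Callable:
    """Warn if the parameter count changes across a device move (e.g. lost weight tying)."""

    @wraps(fn)
    def inner(self, *args, **kwargs):
        pre = len(list(self.model.parameters()))
        result = fn(self, *args, **kwargs)
        # the real decorator also calls self.model.on_post_move_to_device() at this point
        post = len(list(self.model.parameters()))
        if pre != post:
            warnings.warn("The model layers do not match after moving to the target device.")
        return result

    return inner


class ToyPlugin:
    """Illustrative stand-in for a training-type plugin."""

    def __init__(self, model: nn.Module):
        self.model = model

    @validate_param_count
    def model_to_device(self) -> None:
        self.model = self.model.to("cpu")  # stand-in for the real device move


ToyPlugin(nn.Linear(4, 4)).model_to_device()  # no warning: parameter count unchanged
```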
4 changes: 2 additions & 2 deletions pytorch_lightning/overrides/torch_distributed.py
@@ -3,7 +3,7 @@

import torch

from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7
from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8

log = logging.getLogger(__name__)

@@ -88,7 +88,7 @@ def _broadcast_object_list(object_list, src=0, group=None):
object_list[i] = _tensor_to_object(obj_view, obj_size)


if _TORCH_GREATER_EQUAL_1_7 and torch.distributed.is_available():
if _TORCH_GREATER_EQUAL_1_8 and torch.distributed.is_available():
from torch.distributed.distributed_c10d import broadcast_object_list
else:
broadcast_object_list = _broadcast_object_list
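The gate above now uses the native `torch.distributed.broadcast_object_list` only on torch >= 1.8 and keeps the local backport for 1.7, addressing the multi-node DDP broadcasting issue (#7592). A minimal sketch of this version-gated fallback, assuming the `packaging` package is available; `_broadcast_object_list_backport` is a placeholder stub, not the real backport:

```python
import torch

# assumption: `packaging` is installed (commonly pulled in via pip/setuptools)
from packaging.version import Version

_TORCH_GREATER_EQUAL_1_8 = Version(torch.__version__.split("+")[0]) >= Version("1.8.0")


def _broadcast_object_list_backport(object_list, src=0, group=None):
    # placeholder for the pickle-to-tensor backport shown in the file above
    raise NotImplementedError("illustrative stub only")


if _TORCH_GREATER_EQUAL_1_8 and torch.distributed.is_available():
    # native implementation is reliable from 1.8 onwards
    from torch.distributed.distributed_c10d import broadcast_object_list
else:
    broadcast_object_list = _broadcast_object_list_backport
```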
2 changes: 2 additions & 0 deletions pytorch_lightning/plugins/training_type/single_tpu.py
@@ -15,6 +15,7 @@

import torch

from pytorch_lightning.core.decorators import parameter_validation
from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin
from pytorch_lightning.utilities import _TPU_AVAILABLE
from pytorch_lightning.utilities.apply_func import move_data_to_device
@@ -43,6 +44,7 @@ def on_tpu(self) -> bool:
def is_distributed(self) -> bool:
return False

@parameter_validation
def model_to_device(self) -> None:
self.model.to(self.root_device)

2 changes: 2 additions & 0 deletions pytorch_lightning/plugins/training_type/tpu_spawn.py
@@ -23,6 +23,7 @@
from torch.utils.data import DataLoader

import pytorch_lightning as pl
from pytorch_lightning.core.decorators import parameter_validation
from pytorch_lightning.overrides import LightningDistributedModule
from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin
from pytorch_lightning.trainer.connectors.data_connector import _PatchDataLoader
@@ -171,6 +172,7 @@ def new_process(self, process_idx: int, trainer, mp_queue) -> None:
if self.global_rank == 0:
time.sleep(2)

@parameter_validation
def model_to_device(self) -> None:
self.device = xm.xla_device()
self.model = self.wrapped_model.to(self.device)
17 changes: 7 additions & 10 deletions pytorch_lightning/trainer/training_loop.py
@@ -526,6 +526,8 @@ def run_training_epoch(self):
self.update_train_loop_lr_schedulers(monitor_metrics=monitor_metrics)
self.trainer.checkpoint_connector.has_trained = True

self.trainer.total_batch_idx += 1

# max steps reached, end training
if (
self.trainer.max_steps is not None and self.trainer.max_steps <= self.trainer.global_step + 1
@@ -539,8 +541,6 @@
if self.trainer.should_stop:
break

self.trainer.total_batch_idx += 1

# stop epoch if we limited the number of training batches
if self._num_training_batches_reached(is_last_batch):
break
@@ -574,9 +574,8 @@ def run_training_epoch(self):
self.trainer.run_evaluation(on_epoch=True)
self.trainer.training = True

# increment the global step once
# progress global step according to grads progress
self.increment_accumulated_grad_global_step()
if batch_output.signal != -1:
self.increment_accumulated_grad_global_step()

def on_train_epoch_end(self, epoch_output: List[List[List[Result]]]) -> None:
# inform logger the batch loop has finished
@@ -727,7 +726,9 @@ def train_step_and_backward_closure():

# optimizer step
self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)

if len(self.trainer.optimizers) > 1:
# revert back to previous state
self.trainer.lightning_module.untoggle_optimizer(opt_idx)
else:
self._curr_step_result = self.training_step(
split_batch, batch_idx, opt_idx, self.trainer.hiddens
@@ -838,10 +839,6 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer,
"training_step returned None. If this was on purpose, ignore this warning..."
)

if len(self.trainer.optimizers) > 1:
# revert back to previous state
self.trainer.lightning_module.untoggle_optimizer(opt_idx)

return result

def _check_finite(self, loss: torch.Tensor) -> None:
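Three behavioural changes land in this file: the total batch counter is incremented before the early-exit checks (#7692), the accumulated-grad global step is only advanced when `batch_output.signal != -1` (#7677), and `untoggle_optimizer(opt_idx)` now runs once after `optimizer_step` rather than inside the closure (#7563). The last point matters because `optimizer.step(closure)` can evaluate the closure several times. A minimal sketch of that ordering, with illustrative names and plain `requires_grad` flags standing in for Lightning's toggle logic:

```python
import torch
import torch.nn as nn

net_a, net_b = nn.Linear(2, 2), nn.Linear(2, 2)
opt_a = torch.optim.LBFGS(net_a.parameters(), max_iter=5)


def toggle(frozen: nn.Module) -> None:
    # stand-in for LightningModule.toggle_optimizer: freeze the other optimizer's params
    for p in frozen.parameters():
        p.requires_grad = False


def untoggle(frozen: nn.Module) -> None:
    # stand-in for LightningModule.untoggle_optimizer: restore the frozen params
    for p in frozen.parameters():
        p.requires_grad = True


toggle(net_b)


def closure():
    opt_a.zero_grad()
    loss = net_a(torch.randn(4, 2)).sum()
    loss.backward()
    return loss


# LBFGS re-evaluates the closure internally, so net_b must stay frozen for the
# whole step; untoggling inside the closure would unfreeze it too early.
opt_a.step(closure)
untoggle(net_b)  # revert once, after the step completes
```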
10 changes: 8 additions & 2 deletions pytorch_lightning/tuner/batch_size_scaling.py
@@ -160,7 +160,10 @@ def _run_power_scaling(
else:
raise # some other error not memory related

if not changed:
if changed:
# Force the train dataloader to reset as the batch size has changed
trainer.reset_train_dataloader(model)
else:
break
return new_size

@@ -192,7 +195,10 @@ def _run_binsearch_scaling(
else:
new_size, changed = _adjust_batch_size(trainer, batch_arg_name, factor=2.0, desc='succeeded')

if not changed:
if changed:
# Force the train dataloader to reset as the batch size has changed
trainer.reset_train_dataloader(model)
else:
break

except RuntimeError as exception:
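Both scaling loops above now call `trainer.reset_train_dataloader(model)` whenever `_adjust_batch_size` reports a change (#7566), because a `DataLoader` captures its batch size at construction time. A short, Lightning-independent sketch of why the rebuild is needed:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.randn(64, 3))

batch_size = 8
loader = DataLoader(dataset, batch_size=batch_size)

batch_size = 32                     # the tuner bumps the hyperparameter...
print(loader.batch_size)            # ...but the existing loader still yields batches of 8

loader = DataLoader(dataset, batch_size=batch_size)  # rebuild, as reset_train_dataloader does
print(next(iter(loader))[0].shape)  # torch.Size([32, 3])
```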
6 changes: 0 additions & 6 deletions pytorch_lightning/utilities/device_dtype_mixin.py
@@ -17,8 +17,6 @@
import torch
from torch.nn import Module

from pytorch_lightning.core.decorators import parameter_validation


class DeviceDtypeModuleMixin(Module):
__jit_unused_properties__ = ['device', 'dtype']
@@ -47,7 +45,6 @@ def device(self) -> Union[str, torch.device]:

return device

@parameter_validation
def to(self, *args, **kwargs) -> Module:
"""Moves and/or casts the parameters and buffers.

@@ -84,9 +81,6 @@ def to(self, *args, **kwargs) -> Module:
... def __init__(self, weight: torch.Tensor):
... super().__init__()
... self.register_buffer('weight', weight)
...
... def on_post_move_to_device(self):
... pass
>>> _ = torch.manual_seed(0)
>>> module = ExampleModule(torch.rand(3, 4))
>>> module.weight #doctest: +ELLIPSIS
10 changes: 6 additions & 4 deletions pytorch_lightning/utilities/types.py
@@ -1,12 +1,14 @@
from typing import Any, Dict, Iterator, List, Union

import torch
from torchmetrics import Metric
"""
Convention:
- Do not include any `_TYPE` suffix
- Types used in public hooks (as those in the `LightningModule` and `Callback`) should be public (no trailing `_`)
"""

from typing import Any, Dict, Iterator, List, Union

import torch
from torchmetrics import Metric

_METRIC = Union[Metric, torch.Tensor, int, float]
STEP_OUTPUT = Union[torch.Tensor, Dict[str, Any]]
EPOCH_OUTPUT = List[STEP_OUTPUT]
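The module above now carries the naming convention in its docstring and keeps the imports beneath it. A quick sketch of how the public aliases are meant to appear in hook signatures; the hook bodies here are illustrative only, assuming pytorch-lightning 1.3.x is installed:

```python
from typing import Any

import torch

from pytorch_lightning.utilities.types import EPOCH_OUTPUT, STEP_OUTPUT


def training_step(batch: Any, batch_idx: int) -> STEP_OUTPUT:
    # a step may return a bare tensor or a dict containing the loss
    return {"loss": torch.tensor(0.0, requires_grad=True)}


def training_epoch_end(outputs: EPOCH_OUTPUT) -> None:
    print(f"collected {len(outputs)} step outputs")
```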
32 changes: 14 additions & 18 deletions tests/accelerators/test_tpu_backend.py
@@ -95,25 +95,21 @@ def test_weight_tying_warning(tmpdir, capsys=None):
trainer.fit(model)


# @RunIf(tpu=True)
# @pl_multi_process_test
# def test_if_weights_tied(tmpdir, capsys=None):
# """
# Test if weights are properly tied on `on_post_move_to_device`.
# Ensure no warning for parameter mismatch is thrown.
# """

# # TODO (kaushikb11): Add `parameter_validation` specific to TPU Accelerators
# class Model(WeightSharingModule):
@RunIf(tpu=True)
@pl_multi_process_test
def test_if_weights_tied(tmpdir, capsys=None):
"""
Test if weights are properly tied on `on_post_move_to_device`.
Ensure no warning for parameter mismatch is thrown.
"""

# def on_post_move_to_device(self):
# self.layer_3.weight = self.layer_1.weight
class Model(WeightSharingModule):

# model = Model()
# trainer = Trainer(checkpoint_callback=True, max_epochs=1, tpu_cores=1)
def on_post_move_to_device(self):
self.layer_3.weight = self.layer_1.weight

# with pytest.warns(UserWarning) as warnings:
# trainer.fit(model)
model = Model()
trainer = Trainer(checkpoint_callback=True, max_epochs=1, tpu_cores=1)

# assert not list(filter(lambda x: 'The model layers do not match' in str(x), warnings.list))
# assert len(trainer.test(model)) == 1
with pytest.warns(UserWarning, match="The model layers do not match"):
trainer.fit(model)
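The previously commented-out TPU weight-tying test is re-enabled above now that `parameter_validation` wraps the TPU plugins' `model_to_device`. A standalone sketch of the weight-sharing pattern it exercises; `TinyWeightSharing` is an illustrative stand-in for the tests' `WeightSharingModule`:

```python
import torch.nn as nn


class TinyWeightSharing(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer_1 = nn.Linear(8, 8, bias=False)
        self.layer_2 = nn.Linear(8, 8, bias=False)
        self.layer_3 = nn.Linear(8, 8, bias=False)
        self.layer_3.weight = self.layer_1.weight  # tie weights

    def on_post_move_to_device(self):
        # on XLA, moving the module can break the tie, so re-apply it after .to(device)
        self.layer_3.weight = self.layer_1.weight


module = TinyWeightSharing()
assert module.layer_3.weight is module.layer_1.weight
# tied parameters are deduplicated, so only two unique weight tensors are reported
assert len(list(module.parameters())) == 2
```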