
[feat] Add better support for predict + ddp 2/3 #7215

Merged
50 commits merged on Apr 27, 2021

Changes from 42 commits

Commits (50)
2e9b932
wip
tchaton Apr 20, 2021
7effee2
update
tchaton Apr 20, 2021
9321a73
update
tchaton Apr 20, 2021
128cc45
update
tchaton Apr 20, 2021
c7e49e9
update
tchaton Apr 20, 2021
9f82f7a
update
tchaton Apr 20, 2021
ce85174
typo
tchaton Apr 20, 2021
d3f9f30
update on comments
tchaton Apr 21, 2021
e1ccd1a
update
tchaton Apr 21, 2021
2a994db
update
tchaton Apr 21, 2021
69b6d77
update
tchaton Apr 21, 2021
bcf3c2b
update
tchaton Apr 22, 2021
643c8e5
update changelog
tchaton Apr 22, 2021
7109c16
update
tchaton Apr 22, 2021
fea8294
Merge branch 'master' into predict_loop_1
carmocca Apr 22, 2021
ce2656d
Fix merge
carmocca Apr 22, 2021
4ba47ed
Fix merge
carmocca Apr 22, 2021
0705ca7
Merge branch 'master' into predict_loop_1
tchaton Apr 22, 2021
54a5008
Merge branch 'predict_loop_1' of https://github.com/PyTorchLightning/…
tchaton Apr 22, 2021
1bf0325
move code
tchaton Apr 22, 2021
5243c91
resolve test
tchaton Apr 22, 2021
550d3f3
add extra test
tchaton Apr 22, 2021
0169e9e
add an extra test
tchaton Apr 22, 2021
4962459
update on comments
tchaton Apr 23, 2021
a371c5c
add typing
tchaton Apr 23, 2021
a163c2d
resolve flake8
tchaton Apr 23, 2021
63551ca
Refactor and Docs
carmocca Apr 23, 2021
0937e73
Fix tests
carmocca Apr 23, 2021
d4f523e
Fix tests
carmocca Apr 23, 2021
9a44529
Fix tests
carmocca Apr 23, 2021
d66d704
Duplicate
carmocca Apr 23, 2021
71685f2
Fix tests
carmocca Apr 23, 2021
89b281e
resolve bug
tchaton Apr 26, 2021
4416fa5
update
tchaton Apr 26, 2021
b627ed0
update on comments
tchaton Apr 26, 2021
ca64408
update
tchaton Apr 26, 2021
689bde2
update changelog
tchaton Apr 26, 2021
c40c4fa
update
tchaton Apr 26, 2021
174e50c
update
tchaton Apr 26, 2021
f6b6ae0
remove tpu
tchaton Apr 26, 2021
666a526
resolve flake8
tchaton Apr 26, 2021
26ba61e
update on comments
tchaton Apr 26, 2021
13405a8
update on comments
tchaton Apr 26, 2021
48a100a
update on comment
tchaton Apr 27, 2021
775c5c5
resolve flake8
tchaton Apr 27, 2021
b00d903
add a cpu test for predict
tchaton Apr 27, 2021
6c481af
add None test
tchaton Apr 27, 2021
1654030
update
tchaton Apr 27, 2021
db9eda8
Update CHANGELOG.md
tchaton Apr 27, 2021
2171f77
resolve tests
tchaton Apr 27, 2021
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -111,6 +111,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added new `EarlyStopping` parameters `stopping_threshold` and `divergence_threshold` ([#6868](https://github.com/PyTorchLightning/pytorch-lightning/pull/6868))


- Added new `UnrepeatedDistributedSampler` and `IndexBatchSamplerWrapper` for tracking distributed predictions ([#7215](https://github.com/PyTorchLightning/pytorch-lightning/pull/7215))

- Added `trainer.predict(return_predictions=None|False|True)` ([#7215](https://github.com/PyTorchLightning/pytorch-lightning/pull/7215))


### Changed

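For context on the two changelog entries above, a minimal usage sketch follows. The `LitModel` module, the toy dataset, and the Trainer arguments (`gpus=2, accelerator="ddp"`, Lightning 1.3-era API) are illustrative assumptions; only `return_predictions` and its rough semantics come from this PR.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

import pytorch_lightning as pl


class LitModel(pl.LightningModule):
    """Tiny illustrative module, not part of the PR."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 2)

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        (x,) = batch
        return self.layer(x)


dataloader = DataLoader(TensorDataset(torch.randn(10, 4)), batch_size=4)
trainer = pl.Trainer(gpus=2, accelerator="ddp")  # assumes 2 GPUs are available

# True: gather and return the outputs of predict_step.
# False: run prediction for its side effects only, without keeping the outputs.
# None (default): let Lightning decide based on the training type plugin, which
#                 is where the new `use_spawn` property comes in.
predictions = trainer.predict(LitModel(), dataloaders=dataloader, return_predictions=True)
```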
2 changes: 1 addition & 1 deletion pytorch_lightning/core/datamodule.py
@@ -26,7 +26,7 @@

class _DataModuleWrapper(type):

def __init__(self, *args, **kwargs):
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self.__has_added_checks = False

2 changes: 1 addition & 1 deletion pytorch_lightning/core/lightning.py
@@ -74,7 +74,7 @@ class LightningModule(
"model_size",
] + DeviceDtypeModuleMixin.__jit_unused_properties__

def __init__(self, *args, **kwargs):
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)

# see (https://github.com/pytorch/pytorch/blob/3e6bb5233f9ca2c5aa55d9cda22a7ee85439aa6e/
70 changes: 69 additions & 1 deletion pytorch_lightning/overrides/distributed.py
@@ -12,10 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
from typing import Any
from typing import Any, Iterator, List, Optional

import torch
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import BatchSampler, DistributedSampler, Sampler

from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.overrides.base import _LightningModuleWrapperBase
@@ -75,3 +76,70 @@ def prepare_for_backward(model: DistributedDataParallel, output: Any):
model.reducer.prepare_for_backward([])
else:
model.require_forward_param_sync = False


class UnrepeatedDistributedSampler(DistributedSampler):
Review comment (Contributor): Is `NoPaddingDistributedSampler` clearer?
"""
A fork of the pytorch DistributedSampler that doesn't repeat data, instead
allowing the number of batches per process to be off-by-one from each other.
This makes this sampler usable for predictions (it's deterministic and
doesn't require shuffling). It is potentially unsafe to use this sampler for
training, because during training the DistributedDataParallel syncs buffers
on each forward pass, so it could freeze if one of the processes runs one
fewer batch. During prediction, buffers are only synced on the first batch,
so this is safe to use as long as each process runs at least one batch. We
verify this in an assert.

Taken from https://github.com/jpuigcerver/PyLaia/blob/v1.0.0/laia/data/unpadded_distributed_sampler.py
and https://github.com/pytorch/pytorch/issues/25162#issuecomment-634146002
"""

def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self.num_samples = len(range(self.rank, len(self.dataset), self.num_replicas))
self.total_size = len(self.dataset)
# If any process has at least one batch, every other process needs to
# have at least one batch, or the DistributedDataParallel could lock up.
assert self.num_samples >= 1 or self.total_size == 0

def __iter__(self) -> Iterator[List[int]]:
if self.shuffle:
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch)
indices = torch.randperm(len(self.dataset), generator=g).tolist()
else:
indices = list(range(len(self.dataset)))

assert len(indices) == self.total_size

# subsample
indices = indices[self.rank:self.total_size:self.num_replicas]
assert len(indices) == self.num_samples

return iter(indices)


class IndexBatchSamplerWrapper:
"""This class is used to wrap a :class:`torch.utils.data.BatchSampler` and capture its indices."""

def __init__(self, sampler: BatchSampler) -> None:
self.batch_sampler = sampler
self.batch_indices: Optional[List[int]] = None

def __iter__(self) -> Iterator[List[int]]:
for batch in self.batch_sampler:
self.batch_indices = batch
Review comment (Member): probably shall be protected...
Suggested change: `self.batch_indices = batch` → `self._indices = batch`
Reply (Contributor Author): Same as before, let's keep it public.
Reply (Member): Above, in #7215 (comment), you say private, so what do you mean?
yield batch

@property
def drop_last(self) -> bool:
return self.batch_sampler.drop_last

@property
def batch_size(self) -> int:
return self.batch_sampler.batch_size

@property
def sampler(self) -> Sampler:
return self.batch_sampler.sampler
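A short, self-contained sketch of how the two classes above behave together; the toy dataset size, `num_replicas=2`, and `rank=0` are illustrative assumptions, not values from the PR.

```python
from torch.utils.data import BatchSampler

from pytorch_lightning.overrides.distributed import (
    IndexBatchSamplerWrapper,
    UnrepeatedDistributedSampler,
)

# Pretend we are rank 0 of 2 processes predicting over 5 samples: rank 0 gets
# indices [0, 2, 4] and rank 1 gets [1, 3]. Batch counts may differ by one
# across ranks, but no sample is padded/repeated as DistributedSampler would do.
sampler = UnrepeatedDistributedSampler(list(range(5)), num_replicas=2, rank=0, shuffle=False)
batch_sampler = IndexBatchSamplerWrapper(BatchSampler(sampler, batch_size=2, drop_last=False))

for batch in batch_sampler:
    # `batch_indices` mirrors the last yielded batch, which lets the Trainer map
    # each prediction back to its original dataset position.
    assert batch_sampler.batch_indices == batch

print(batch_sampler.batch_indices)  # [4] -- the final, shorter batch on rank 0
```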
4 changes: 4 additions & 0 deletions pytorch_lightning/plugins/training_type/ddp_spawn.py
@@ -94,6 +94,10 @@ def distributed_sampler_kwargs(self):
distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank)
return distributed_sampler_kwargs

@property
def use_spawn(self) -> bool:
return True

@property
def _is_single_process_single_device(self):
return True
4 changes: 4 additions & 0 deletions pytorch_lightning/plugins/training_type/parallel.py
@@ -47,6 +47,10 @@ def root_device(self):
def on_gpu(self):
return self.root_device.type == "cuda" and torch.cuda.is_available()

@property
def use_spawn(self) -> bool:
return False
Review comment (Contributor): I would avoid this. Can we do an isinstance check?
Reply (Contributor Author): Yes, but I find it less clean.
Reply (Contributor): The problem is that this style of adding properties to identify each plugin will not scale long term, and at the finest granularity it is the same as an instance check. It could be part of the accelerator connector refactor one day; `use_spawn` could be handled by the connector.

@property
def lightning_module(self):
return unwrap_lightning_module(self._model)
4 changes: 4 additions & 0 deletions pytorch_lightning/plugins/training_type/single_device.py
@@ -36,6 +36,10 @@ def on_tpu(self) -> bool:
def on_gpu(self) -> bool:
return self.device.type == "cuda" and torch.cuda.is_available()

@property
def use_spawn(self) -> bool:
return False

def reduce(self, tensor: Union[Any, torch.Tensor], *args: Any, **kwargs: Any) -> Union[Any, torch.Tensor]:
"""
Reduces a tensor from several distributed processes to one aggregated tensor.
pytorch_lightning/plugins/training_type/training_type_plugin.py
@@ -73,6 +73,11 @@ def model_to_device(self) -> None:
def is_global_zero(self) -> bool:
"""Whether the current process is the rank zero process not only on the local node, but for all nodes."""

@property
@abstractmethod
def use_spawn(self) -> bool:
"""Whether the current processes are being spawned"""

@abstractmethod
def reduce(self, tensor: Union[torch.Tensor, Any], *args: Any, **kwargs: Any) -> Union[torch.Tensor, Any]:
"""
8 changes: 4 additions & 4 deletions pytorch_lightning/trainer/connectors/callback_connector.py
@@ -13,7 +13,7 @@
# limitations under the License.
import os
from datetime import timedelta
from typing import List, Union, Optional, Dict
from typing import Dict, List, Optional, Union

from pytorch_lightning.callbacks import Callback, ModelCheckpoint, ProgressBar, ProgressBarBase
from pytorch_lightning.callbacks.timer import Timer
@@ -58,6 +58,8 @@ def on_trainer_init(
# configure swa callback
self._configure_swa_callbacks()

# configure the timer callback.
# responsible to stop the training when max_time is reached.
self._configure_timer_callback(max_time)

# init progress bar
Expand Down Expand Up @@ -115,9 +117,7 @@ def _configure_timer_callback(self, max_time: Optional[Union[str, timedelta, Dic
if max_time is None:
return
if any(isinstance(cb, Timer) for cb in self.trainer.callbacks):
rank_zero_info(
"Ignoring `Trainer(max_time=...)`, callbacks list already contains a Timer."
)
rank_zero_info("Ignoring `Trainer(max_time=...)`, callbacks list already contains a Timer.")
return
timer = Timer(duration=max_time, interval="step")
self.trainer.callbacks.append(timer)
38 changes: 25 additions & 13 deletions pytorch_lightning/trainer/data_loading.py
@@ -17,14 +17,16 @@
from abc import ABC
from copy import deepcopy
from functools import partial
from typing import Iterable, List, Optional, Tuple, Union
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from pytorch_lightning.accelerators import Accelerator
from pytorch_lightning.core import LightningModule
from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper, UnrepeatedDistributedSampler
from pytorch_lightning.trainer.connectors.accelerator_connector import AcceleratorConnector
from pytorch_lightning.trainer.states import RunningStage
from pytorch_lightning.trainer.supporters import CombinedLoader
from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_6, rank_zero_warn
from pytorch_lightning.utilities.apply_func import apply_to_collection
@@ -107,7 +109,9 @@ def auto_add_worker_init_fn(self, dataloader: DataLoader) -> None:
if int(os.environ.get("PL_SEED_WORKERS", 0)) and dataloader.worker_init_fn is None:
dataloader.worker_init_fn = partial(pl_worker_init_function, rank=self.global_rank)

def auto_add_sampler(self, dataloader: DataLoader, shuffle: bool) -> DataLoader:
def auto_add_sampler(
self, dataloader: DataLoader, shuffle: bool, mode: Optional[RunningStage] = None
) -> DataLoader:
# don't do anything if it's not a dataloader
is_dataloader = isinstance(dataloader, DataLoader)
# don't manipulate iterable datasets
@@ -133,20 +137,23 @@ def auto_add_sampler(self, dataloader: DataLoader, shuffle: bool) -> DataLoader:
)

# replace with distributed sampler
sampler = self._get_distributed_sampler(dataloader, shuffle)
dataloader = self.replace_sampler(dataloader, sampler)
sampler = self._get_distributed_sampler(dataloader, shuffle, mode=mode)
dataloader = self.replace_sampler(dataloader, sampler, mode=mode)

return dataloader

@staticmethod
def _resolve_batch_sampler(dl_args, dataloader, sampler):
def _resolve_batch_sampler(dl_args, dataloader, sampler, mode: Optional[RunningStage] = None) -> Dict[str, Any]:
batch_sampler = getattr(dataloader, "batch_sampler")
if batch_sampler is not None and type(batch_sampler) is not BatchSampler:
is_predicting = mode == RunningStage.PREDICTING
if (batch_sampler is not None and type(batch_sampler) is not BatchSampler) or is_predicting:
batch_sampler = type(batch_sampler)(
sampler,
batch_size=batch_sampler.batch_size,
drop_last=batch_sampler.drop_last,
drop_last=False if is_predicting else batch_sampler.drop_last,
)
if is_predicting:
batch_sampler = IndexBatchSamplerWrapper(batch_sampler)
dl_args['batch_sampler'] = batch_sampler
dl_args['batch_size'] = 1
dl_args['shuffle'] = False
Expand All @@ -159,7 +166,7 @@ def _resolve_batch_sampler(dl_args, dataloader, sampler):

return dl_args

def replace_sampler(self, dataloader, sampler):
def replace_sampler(self, dataloader: DataLoader, sampler, mode: Optional[RunningStage] = None) -> DataLoader:
skip_keys = ('sampler', 'batch_sampler', 'dataset_kind')
skip_signature_keys = ('args', 'kwargs', 'self')

@@ -174,7 +181,7 @@ def replace_sampler(self, dataloader, sampler):

dl_args = {name: attrs[name] for name in params if name in attrs and name not in skip_keys}

dl_args = self._resolve_batch_sampler(dl_args, dataloader, sampler)
dl_args = self._resolve_batch_sampler(dl_args, dataloader, sampler, mode=mode)

multiprocessing_context = dataloader.multiprocessing_context
dl_args['multiprocessing_context'] = multiprocessing_context
@@ -205,12 +212,15 @@ def __init__(self, num_features, dataset, *args, **kwargs):
dataloader.multiprocessing_context = multiprocessing_context
return dataloader

def _get_distributed_sampler(self, dataloader, shuffle):
def _get_distributed_sampler(
self, dataloader: DataLoader, shuffle: bool, mode: Optional[RunningStage] = None
) -> DistributedSampler:
kwargs = self.distributed_sampler_kwargs
kwargs["shuffle"] = shuffle and not self.overfit_batches
if _TORCH_GREATER_EQUAL_1_6:
kwargs.setdefault("seed", int(os.getenv("PL_GLOBAL_SEED", 0)))
sampler = DistributedSampler(dataloader.dataset, **kwargs)
cls = UnrepeatedDistributedSampler if mode == RunningStage.PREDICTING else DistributedSampler
sampler = cls(dataloader.dataset, **kwargs)
return sampler

def reset_train_dataloader(self, model: LightningModule) -> None:
@@ -296,7 +306,7 @@ def _reset_eval_dataloader(

Args:
model: The current `LightningModule`
mode: Either `'val'` or `'test'`
mode: Either `'val'`, `'test'` or `'predict'`

Returns:
Tuple (num_batches, dataloaders)
@@ -342,7 +352,9 @@ def _reset_eval_dataloader(
rank_zero_warn("One of given dataloaders is None and it will be skipped.")

# add samplers
dataloaders = [self.auto_add_sampler(dl, shuffle=False) for dl in dataloaders if dl is not None]
dataloaders = [
self.auto_add_sampler(dl, shuffle=False, mode=self._running_stage) for dl in dataloaders if dl is not None
]

# add worker_init_fn for correct seeding in worker processes
apply_to_collection(dataloaders, dtype=DataLoader, function=self.auto_add_worker_init_fn)
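To make the predicting branch of `_resolve_batch_sampler` above concrete, here is a hedged standalone sketch of the same rebuilding logic outside the Trainer; the `SequentialSampler` over ten samples and the batch size are illustrative.

```python
from torch.utils.data import BatchSampler, SequentialSampler

from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper

is_predicting = True  # i.e. mode == RunningStage.PREDICTING
old = BatchSampler(SequentialSampler(range(10)), batch_size=4, drop_last=True)

# Rebuild the batch sampler: never drop the last (possibly smaller) batch while
# predicting, so every sample receives a prediction.
new = type(old)(
    old.sampler,
    batch_size=old.batch_size,
    drop_last=False if is_predicting else old.drop_last,
)
if is_predicting:
    # Wrap it so the indices of every batch can be captured alongside the outputs.
    new = IndexBatchSamplerWrapper(new)

print(list(new))  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
```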