
[feat] Add better support for predict + ddp 2/3 #7215

Merged: 50 commits, Apr 27, 2021

Commits (50)
2e9b932  wip (tchaton, Apr 20, 2021)
7effee2  update (tchaton, Apr 20, 2021)
9321a73  update (tchaton, Apr 20, 2021)
128cc45  update (tchaton, Apr 20, 2021)
c7e49e9  update (tchaton, Apr 20, 2021)
9f82f7a  update (tchaton, Apr 20, 2021)
ce85174  typo (tchaton, Apr 20, 2021)
d3f9f30  update on comments (tchaton, Apr 21, 2021)
e1ccd1a  update (tchaton, Apr 21, 2021)
2a994db  update (tchaton, Apr 21, 2021)
69b6d77  update (tchaton, Apr 21, 2021)
bcf3c2b  update (tchaton, Apr 22, 2021)
643c8e5  update changelog (tchaton, Apr 22, 2021)
7109c16  update (tchaton, Apr 22, 2021)
fea8294  Merge branch 'master' into predict_loop_1 (carmocca, Apr 22, 2021)
ce2656d  Fix merge (carmocca, Apr 22, 2021)
4ba47ed  Fix merge (carmocca, Apr 22, 2021)
0705ca7  Merge branch 'master' into predict_loop_1 (tchaton, Apr 22, 2021)
54a5008  Merge branch 'predict_loop_1' of https://github.com/PyTorchLightning/… (tchaton, Apr 22, 2021)
1bf0325  move code (tchaton, Apr 22, 2021)
5243c91  resolve test (tchaton, Apr 22, 2021)
550d3f3  add extra test (tchaton, Apr 22, 2021)
0169e9e  add an extra test (tchaton, Apr 22, 2021)
4962459  update on comments (tchaton, Apr 23, 2021)
a371c5c  add typing (tchaton, Apr 23, 2021)
a163c2d  resolve flake8 (tchaton, Apr 23, 2021)
63551ca  Refactor and Docs (carmocca, Apr 23, 2021)
0937e73  Fix tests (carmocca, Apr 23, 2021)
d4f523e  Fix tests (carmocca, Apr 23, 2021)
9a44529  Fix tests (carmocca, Apr 23, 2021)
d66d704  Duplicate (carmocca, Apr 23, 2021)
71685f2  Fix tests (carmocca, Apr 23, 2021)
89b281e  resolve bug (tchaton, Apr 26, 2021)
4416fa5  update (tchaton, Apr 26, 2021)
b627ed0  update on comments (tchaton, Apr 26, 2021)
ca64408  update (tchaton, Apr 26, 2021)
689bde2  update changelog (tchaton, Apr 26, 2021)
c40c4fa  update (tchaton, Apr 26, 2021)
174e50c  update (tchaton, Apr 26, 2021)
f6b6ae0  remove tpu (tchaton, Apr 26, 2021)
666a526  resolve flake8 (tchaton, Apr 26, 2021)
26ba61e  update on comments (tchaton, Apr 26, 2021)
13405a8  update on comments (tchaton, Apr 26, 2021)
48a100a  update on comment (tchaton, Apr 27, 2021)
775c5c5  resolve flake8 (tchaton, Apr 27, 2021)
b00d903  add a cpu test for predict (tchaton, Apr 27, 2021)
6c481af  add None test (tchaton, Apr 27, 2021)
1654030  update (tchaton, Apr 27, 2021)
db9eda8  Update CHANGELOG.md (tchaton, Apr 27, 2021)
2171f77  resolve tests (tchaton, Apr 27, 2021)
Files changed
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -111,6 +111,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added new `EarlyStopping` parameters `stopping_threshold` and `divergence_threshold` ([#6868](https://github.com/PyTorchLightning/pytorch-lightning/pull/6868))


- Added new `UnrepeatedDistributedSampler` and `IndexBatchSamplerWrapper` for tracking distributed predictions ([#7215](https://github.com/PyTorchLightning/pytorch-lightning/pull/7215))


- Added `trainer.predict(return_predictions=None|False|True)` ([#7215](https://github.com/PyTorchLightning/pytorch-lightning/pull/7215))


### Changed

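To put the two CHANGELOG entries above in context, here is a minimal usage sketch (not part of this diff); the model, the dummy data, and the 1.3-era `accelerator="ddp"` / `gpus=2` arguments are illustrative assumptions, as is the `predict_step` hook name.

```python
# Hypothetical usage sketch for `return_predictions` (added by this PR); names below are
# placeholders, and running under DDP assumes a machine with two GPUs.
import torch
from torch.utils.data import DataLoader
from pytorch_lightning import LightningModule, Trainer


class LinearModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        return self.layer(batch)


dataloader = DataLoader(torch.randn(64, 32), batch_size=8)
trainer = Trainer(accelerator="ddp", gpus=2)

# True (the default for non-spawn plugins) collects and returns the per-batch outputs;
# False skips collection, e.g. when a callback writes predictions to disk instead.
predictions = trainer.predict(LinearModel(), dataloader, return_predictions=True)
```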
2 changes: 1 addition & 1 deletion pytorch_lightning/core/datamodule.py
@@ -26,7 +26,7 @@

class _DataModuleWrapper(type):

def __init__(self, *args, **kwargs):
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self.__has_added_checks = False

2 changes: 1 addition & 1 deletion pytorch_lightning/core/lightning.py
@@ -74,7 +74,7 @@ class LightningModule(
"model_size",
] + DeviceDtypeModuleMixin.__jit_unused_properties__

def __init__(self, *args, **kwargs):
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)

# see (https://github.com/pytorch/pytorch/blob/3e6bb5233f9ca2c5aa55d9cda22a7ee85439aa6e/
70 changes: 69 additions & 1 deletion pytorch_lightning/overrides/distributed.py
@@ -12,10 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
from typing import Any
from typing import Any, Iterator, List, Optional

import torch
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import BatchSampler, DistributedSampler, Sampler

from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.overrides.base import _LightningModuleWrapperBase
@@ -75,3 +76,70 @@ def prepare_for_backward(model: DistributedDataParallel, output: Any):
model.reducer.prepare_for_backward([])
else:
model.require_forward_param_sync = False


class UnrepeatedDistributedSampler(DistributedSampler):
[Review comment (resolved), from a Contributor: Is `NoPaddingDistributedSampler` clearer?]
"""
A fork of the pytorch DistributedSampler that doesn't repeat data, instead
allowing the number of batches per process to be off-by-one from each other.
This makes this sampler usable for predictions (it's deterministic and
doesn't require shuffling). It is potentially unsafe to use this sampler for
training, because during training the DistributedDataParallel syncs buffers
on each forward pass, so it could freeze if one of the processes runs one
fewer batch. During prediction, buffers are only synced on the first batch,
so this is safe to use as long as each process runs at least one batch. We
verify this in an assert.

Taken from https://github.com/jpuigcerver/PyLaia/blob/v1.0.0/laia/data/unpadded_distributed_sampler.py
and https://github.com/pytorch/pytorch/issues/25162#issuecomment-634146002
"""

def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self.num_samples = len(range(self.rank, len(self.dataset), self.num_replicas))
self.total_size = len(self.dataset)
# If any process has at least one batch, every other process needs to
# have at least one batch, or the DistributedDataParallel could lock up.
assert self.num_samples >= 1 or self.total_size == 0

def __iter__(self) -> Iterator[List[int]]:
if self.shuffle:
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch)
indices = torch.randperm(len(self.dataset), generator=g).tolist()
else:
indices = list(range(len(self.dataset)))

assert len(indices) == self.total_size

# subsample
indices = indices[self.rank:self.total_size:self.num_replicas]
assert len(indices) == self.num_samples

return iter(indices)


class IndexBatchSamplerWrapper:
"""This class is used to wrap a :class:`torch.utils.data.BatchSampler` and capture its indices."""

def __init__(self, sampler: BatchSampler) -> None:
self._sampler = sampler
self.batch_indices: Optional[List[int]] = None

def __iter__(self) -> Iterator[List[int]]:
for batch in self._sampler:
self.batch_indices = batch
[Review thread:
  Member: This should probably be protected. Suggested change: `self.batch_indices = batch` -> `self._indices = batch`
  Contributor (author): Same as before, let's keep it public.
  Member: Above in #7215 (comment) you said private, so which do you mean?]
yield batch

@property
def drop_last(self) -> bool:
return self._sampler.drop_last

@property
def batch_size(self) -> int:
return self._sampler.batch_size

@property
def sampler(self) -> Sampler:
return self._sampler.sampler
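Because the behavioral difference between the stock sampler and the new one is easy to miss, here is a CPU-only sketch (not part of the diff) comparing the indices each rank receives. Passing `num_replicas`/`rank` explicitly avoids needing an initialized process group, and the import path assumes the module layout added above.

```python
# Compare index assignment for a dataset of length 10 split across 4 replicas:
# DistributedSampler pads by repeating indices so every rank gets the same count,
# while UnrepeatedDistributedSampler lets ranks differ by one batch and never repeats.
from torch.utils.data import DistributedSampler

from pytorch_lightning.overrides.distributed import UnrepeatedDistributedSampler

dataset = list(range(10))

for rank in range(4):
    padded = DistributedSampler(dataset, num_replicas=4, rank=rank, shuffle=False)
    unrepeated = UnrepeatedDistributedSampler(dataset, num_replicas=4, rank=rank, shuffle=False)
    print(rank, list(padded), list(unrepeated))

# Expected indices per rank (padded vs. unrepeated):
# 0  [0, 4, 8]  [0, 4, 8]
# 1  [1, 5, 9]  [1, 5, 9]
# 2  [2, 6, 0]  [2, 6]      the padded sampler repeats sample 0
# 3  [3, 7, 1]  [3, 7]      the padded sampler repeats sample 1
```

The repeated samples are exactly why the padded variant is a poor fit for prediction: they would show up as duplicate predictions that have to be filtered out afterwards.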
8 changes: 4 additions & 4 deletions pytorch_lightning/trainer/connectors/callback_connector.py
@@ -13,7 +13,7 @@
# limitations under the License.
import os
from datetime import timedelta
from typing import List, Union, Optional, Dict
from typing import Dict, List, Optional, Union

from pytorch_lightning.callbacks import Callback, ModelCheckpoint, ProgressBar, ProgressBarBase
from pytorch_lightning.callbacks.timer import Timer
@@ -58,6 +58,8 @@ def on_trainer_init(
# configure swa callback
self._configure_swa_callbacks()

# configure the timer callback.
# responsible to stop the training when max_time is reached.
self._configure_timer_callback(max_time)

# init progress bar
@@ -115,9 +117,7 @@ def _configure_timer_callback(self, max_time: Optional[Union[str, timedelta, Dic
if max_time is None:
return
if any(isinstance(cb, Timer) for cb in self.trainer.callbacks):
rank_zero_info(
"Ignoring `Trainer(max_time=...)`, callbacks list already contains a Timer."
)
rank_zero_info("Ignoring `Trainer(max_time=...)`, callbacks list already contains a Timer.")
return
timer = Timer(duration=max_time, interval="step")
self.trainer.callbacks.append(timer)
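The comment added above only documents existing behavior, but for completeness here is a hedged sketch of the `max_time` forms implied by the type hint in this hunk; the exact `"DD:HH:MM:SS"` string format is an assumption about the 1.3-era Timer callback, not something stated in this diff.

```python
# Sketch: the Timer callback configured above stops training once `max_time` elapses.
# The accepted forms follow the `Optional[Union[str, timedelta, Dict]]` hint in the diff.
from datetime import timedelta
from pytorch_lightning import Trainer

Trainer(max_time="00:12:00:00")             # string, assumed DD:HH:MM:SS form
Trainer(max_time=timedelta(hours=12))       # datetime.timedelta
Trainer(max_time={"days": 0, "hours": 12})  # dict of timedelta keyword arguments
```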
39 changes: 26 additions & 13 deletions pytorch_lightning/trainer/data_loading.py
@@ -17,14 +17,16 @@
from abc import ABC
from copy import deepcopy
from functools import partial
from typing import Iterable, List, Optional, Tuple, Union
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from pytorch_lightning.accelerators import Accelerator
from pytorch_lightning.core import LightningModule
from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper, UnrepeatedDistributedSampler
from pytorch_lightning.trainer.connectors.accelerator_connector import AcceleratorConnector
from pytorch_lightning.trainer.states import RunningStage
from pytorch_lightning.trainer.supporters import CombinedLoader
from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_6, rank_zero_warn
from pytorch_lightning.utilities.apply_func import apply_to_collection
@@ -107,7 +109,9 @@ def auto_add_worker_init_fn(self, dataloader: DataLoader) -> None:
if int(os.environ.get("PL_SEED_WORKERS", 0)) and dataloader.worker_init_fn is None:
dataloader.worker_init_fn = partial(pl_worker_init_function, rank=self.global_rank)

def auto_add_sampler(self, dataloader: DataLoader, shuffle: bool) -> DataLoader:
def auto_add_sampler(
self, dataloader: DataLoader, shuffle: bool, mode: Optional[RunningStage] = None
) -> DataLoader:
# don't do anything if it's not a dataloader
is_dataloader = isinstance(dataloader, DataLoader)
# don't manipulate iterable datasets
@@ -133,20 +137,24 @@ def auto_add_sampler(self, dataloader: DataLoader, shuffle: bool) -> DataLoader:
)

# replace with distributed sampler
sampler = self._get_distributed_sampler(dataloader, shuffle)
dataloader = self.replace_sampler(dataloader, sampler)
sampler = self._get_distributed_sampler(dataloader, shuffle, mode=mode)
dataloader = self.replace_sampler(dataloader, sampler, mode=mode)

return dataloader

@staticmethod
def _resolve_batch_sampler(dl_args, dataloader, sampler):
def _resolve_batch_sampler(dl_args, dataloader, sampler, mode: Optional[RunningStage] = None) -> Dict[str, Any]:
batch_sampler = getattr(dataloader, "batch_sampler")
if batch_sampler is not None and type(batch_sampler) is not BatchSampler:
is_predicting = mode == RunningStage.PREDICTING
# checking the batch sampler type is different than PyTorch default.
if (batch_sampler is not None and type(batch_sampler) is not BatchSampler) or is_predicting:
batch_sampler = type(batch_sampler)(
sampler,
batch_size=batch_sampler.batch_size,
drop_last=batch_sampler.drop_last,
drop_last=(False if is_predicting else batch_sampler.drop_last),
)
if is_predicting:
batch_sampler = IndexBatchSamplerWrapper(batch_sampler)
dl_args['batch_sampler'] = batch_sampler
dl_args['batch_size'] = 1
dl_args['shuffle'] = False
@@ -159,7 +167,7 @@ def _resolve_batch_sampler(dl_args, dataloader, sampler):

return dl_args

def replace_sampler(self, dataloader, sampler):
def replace_sampler(self, dataloader: DataLoader, sampler, mode: Optional[RunningStage] = None) -> DataLoader:
skip_keys = ('sampler', 'batch_sampler', 'dataset_kind')
skip_signature_keys = ('args', 'kwargs', 'self')

@@ -174,7 +182,7 @@ def replace_sampler(self, dataloader, sampler):

dl_args = {name: attrs[name] for name in params if name in attrs and name not in skip_keys}

dl_args = self._resolve_batch_sampler(dl_args, dataloader, sampler)
dl_args = self._resolve_batch_sampler(dl_args, dataloader, sampler, mode=mode)

multiprocessing_context = dataloader.multiprocessing_context
dl_args['multiprocessing_context'] = multiprocessing_context
@@ -205,12 +213,15 @@ def __init__(self, num_features, dataset, *args, **kwargs):
dataloader.multiprocessing_context = multiprocessing_context
return dataloader

def _get_distributed_sampler(self, dataloader, shuffle):
def _get_distributed_sampler(
self, dataloader: DataLoader, shuffle: bool, mode: Optional[RunningStage] = None
) -> DistributedSampler:
kwargs = self.distributed_sampler_kwargs
kwargs["shuffle"] = shuffle and not self.overfit_batches
if _TORCH_GREATER_EQUAL_1_6:
kwargs.setdefault("seed", int(os.getenv("PL_GLOBAL_SEED", 0)))
sampler = DistributedSampler(dataloader.dataset, **kwargs)
cls = UnrepeatedDistributedSampler if mode == RunningStage.PREDICTING else DistributedSampler
sampler = cls(dataloader.dataset, **kwargs)
return sampler

def reset_train_dataloader(self, model: LightningModule) -> None:
@@ -296,7 +307,7 @@ def _reset_eval_dataloader(

Args:
model: The current `LightningModule`
mode: Either `'val'` or `'test'`
mode: Either `'val'`, `'test'` or `'predict'`

Returns:
Tuple (num_batches, dataloaders)
Expand Down Expand Up @@ -342,7 +353,9 @@ def _reset_eval_dataloader(
rank_zero_warn("One of given dataloaders is None and it will be skipped.")

# add samplers
dataloaders = [self.auto_add_sampler(dl, shuffle=False) for dl in dataloaders if dl is not None]
dataloaders = [
self.auto_add_sampler(dl, shuffle=False, mode=self._running_stage) for dl in dataloaders if dl is not None
]

# add worker_init_fn for correct seeding in worker processes
apply_to_collection(dataloaders, dtype=DataLoader, function=self.auto_add_worker_init_fn)
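For the predict path wired up above (`mode == RunningStage.PREDICTING`), the batch sampler ends up wrapped so that the loop can recover which dataset indices each batch came from. A standalone sketch of the wrapper's behavior (not part of the diff), assuming the import path introduced in this PR:

```python
# Sketch: the wrapper records the indices of the batch it just yielded, which the
# predict loop later reads to map distributed predictions back to dataset positions.
from torch.utils.data import BatchSampler, SequentialSampler

from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper

batch_sampler = BatchSampler(SequentialSampler(range(10)), batch_size=4, drop_last=False)
wrapped = IndexBatchSamplerWrapper(batch_sampler)

for batch in wrapped:
    # `batch_indices` always refers to the most recently yielded batch
    print(batch, wrapped.batch_indices)

# [0, 1, 2, 3] [0, 1, 2, 3]
# [4, 5, 6, 7] [4, 5, 6, 7]
# [8, 9] [8, 9]
```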
78 changes: 67 additions & 11 deletions pytorch_lightning/trainer/predict_loop.py
@@ -11,9 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, List, Optional, Union

import torch
from torch.utils.data.dataloader import DataLoader

from pytorch_lightning.utilities.apply_func import apply_to_collection
from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper
from pytorch_lightning.plugins import DDPSpawnPlugin
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.warnings import WarningCache


@@ -24,6 +29,27 @@ def __init__(self, trainer):
self.max_batches = None
self.num_dataloaders = None
self.warning_cache = WarningCache()
self.batch_indices: Optional[List[int]] = None
self.epoch_batch_indices: Optional[List[List[int]]] = None
# `DDPSpawnPlugin` plugins and derivate don't support return predictions.
self._return_predictions: Optional[bool] = None
self._previous_grad_status: Optional[bool] = None

@property
def return_predictions(self) -> bool:
return self._return_predictions

@return_predictions.setter
def return_predictions(self, return_predictions: Optional[bool] = None) -> None:
[Review thread:
  Member: What is the use case for passing None?
  Contributor (author): None is the default; it means the training type plugin sets its own default, i.e. `return_predictions = not training_type_plugin.use_spawn`.]
# ``DDPSpawnPlugin`` plugins and derivate don't support return predictions.
is_ddp_spawn = isinstance(self.trainer.training_type_plugin, DDPSpawnPlugin)
if return_predictions and is_ddp_spawn:
raise MisconfigurationException(
"`return_predictions` should be set to `False` when using the `DDPSpawnPlugin` or children class. "
f"Found {return_predictions} with training_type_plugin {type(self.trainer.training_type_plugin)}."
)
# For non ``DDPSpawnPlugin`` plugin, the `return_predictions` is True by default unless user decide otherwise.
self._return_predictions = not is_ddp_spawn if return_predictions is None else return_predictions

def on_trainer_init(self):
self.trainer.num_predict_batches = []
@@ -54,22 +80,26 @@ def setup(self, model, max_batches, dataloaders):

self.max_batches = max_batches
self.num_dataloaders = self._get_num_dataloaders(dataloaders)
self._predictions = [[] for _ in range(self.num_dataloaders)]
self.predictions = [[] for _ in range(self.num_dataloaders)]
self.epoch_batch_indices = [[] for _ in range(self.num_dataloaders)]

def _get_num_dataloaders(self, dataloaders):
def _get_num_dataloaders(self, dataloaders: List[DataLoader]) -> int:
# case where user does:
# return dl1, dl2
length = len(dataloaders)
if len(dataloaders) > 0 and isinstance(dataloaders[0], (list, tuple)):
length = len(dataloaders[0])
return length

def predict_step(self, batch, batch_idx, dataloader_idx):
def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None:
# configure args
args = [batch, batch_idx]
if self.num_dataloaders:
args.append(dataloader_idx)

# extract batch_indices and store them
self._store_batch_indices(dataloader_idx)

model_ref = self.trainer.lightning_module

self.trainer.call_hook("on_predict_batch_start", batch, batch_idx, dataloader_idx)
@@ -82,18 +112,44 @@ def predict_step(self, batch, batch_idx, dataloader_idx):

self.trainer.call_hook("on_predict_batch_end", predictions, batch, batch_idx, dataloader_idx)

self._predictions[dataloader_idx].append(predictions)
if self.return_predictions:
self.predictions[dataloader_idx].append(predictions)

def _store_batch_indices(self, dataloader_idx: int) -> None:
batch_sampler = self.trainer.predict_dataloaders[dataloader_idx].batch_sampler
if isinstance(batch_sampler, IndexBatchSamplerWrapper):
self.batch_indices = batch_sampler.batch_indices
if self.return_predictions:
self.epoch_batch_indices[dataloader_idx].append(batch_sampler.batch_indices)

def on_predict_epoch_end(self):
def on_predict_start(self) -> None:
# enable eval mode + no grads
self.on_predict_model_eval()
self.trainer.lightning_module.zero_grad()
self._previous_grad_status = torch.is_grad_enabled()
torch.set_grad_enabled(False)

# hook
self.trainer.call_hook("on_predict_start")
self.trainer.call_hook("on_predict_epoch_start")

def on_predict_epoch_end(self) -> Optional[Union[List[Any], List[List[Any]]]]:
self.trainer.profiler.describe()

results = self._predictions
results: List[List[Any]] = self.predictions

self.trainer.call_hook("on_predict_epoch_end", results)

def _convert_to_numpy(v):
return v.cpu().numpy()
if self.return_predictions:
return results[0] if self.num_dataloaders == 1 else results

def on_predict_end(self):
# clear memory. the predictions are extracted in `on_predict_epoch_end`.
self.predictions = None
self.batch_indices = None

results = apply_to_collection(results, torch.Tensor, _convert_to_numpy)
# reset grad to its previous status.
torch.set_grad_enabled(self._previous_grad_status)

return results[0] if len(results) == 1 else results
# hook
self.trainer.call_hook("on_predict_end")