
[Feat] 2/n Add Fault Tolerant Training to LightningFetcher #8891

Merged (29 commits, Aug 17, 2021)
Changes from all commits (29 commits)
1488879
add LightningFetcher
tchaton Aug 13, 2021
9a5037a
add lightning fetcher
tchaton Aug 13, 2021
6e6e93c
update changelog
tchaton Aug 13, 2021
f4c99a8
typying
tchaton Aug 13, 2021
4412855
add fault tolerant
tchaton Aug 13, 2021
be899aa
Merge branch 'master' into add_lightning_prefetcher_2_n
tchaton Aug 16, 2021
5c54e95
bad merge
tchaton Aug 16, 2021
29c7938
remove prints
tchaton Aug 16, 2021
d1789c8
update
tchaton Aug 16, 2021
3d81454
remove random code
tchaton Aug 16, 2021
64ad33d
fix docstrings and typing
awaelchli Aug 16, 2021
9e1c8a6
resolve todo, rename metadata collate function
awaelchli Aug 16, 2021
91bd840
general cleanup
awaelchli Aug 16, 2021
3ae2a43
fix typo in comment
awaelchli Aug 17, 2021
3ad3afc
update changelog
awaelchli Aug 17, 2021
dd7fc13
remove unused code in apply_to_collection
awaelchli Aug 17, 2021
9579432
Merge branch 'master' into thomas/add_lightning_prefetcher_2_n
awaelchli Aug 17, 2021
302e39d
remove state_dict set to None
awaelchli Aug 17, 2021
7af4273
revert type hint Any -> int
awaelchli Aug 17, 2021
7e76efc
rename lastest -> latest
awaelchli Aug 17, 2021
2041809
reword exception message
awaelchli Aug 17, 2021
ff6d5ca
remove my own todo
awaelchli Aug 17, 2021
a1f84b9
remove unused methods (introduced in follow up PR)
awaelchli Aug 17, 2021
08f2503
rename CollectionIteratorState -> MergedIteratorState
awaelchli Aug 17, 2021
c5c7021
update docs
awaelchli Aug 17, 2021
cf6384a
load_state_dict -> from_state_dict
awaelchli Aug 17, 2021
1d6770c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 17, 2021
b336628
add comment
tchaton Aug 17, 2021
0254043
Merge branch 'add_lightning_prefetcher_2_n' of https://github.com/PyT…
tchaton Aug 17, 2021
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -43,7 +43,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
* Added `FastForwardSampler` and `CaptureIterableDataset` injection to data loading utilities ([#8366](https://github.com/PyTorchLightning/pytorch-lightning/pull/8366))
* Added `LightningDataFetcher` to control fetching flow ([#8890](https://github.com/PyTorchLightning/pytorch-lightning/pull/8890))
* Added `SharedCycleIteratorState` to prevent infinite loop ([#8889](https://github.com/PyTorchLightning/pytorch-lightning/pull/8889))

* Added `CaptureMapDataset` for state management in map-style datasets ([#8891](https://github.com/PyTorchLightning/pytorch-lightning/pull/8891))
* Added Fault Tolerant Training to LightningFetcher ([#8891](https://github.com/PyTorchLightning/pytorch-lightning/pull/8891))

- Added `CheckpointIO` to expose checkpoint IO from training type plugin ([#8743](https://github.com/PyTorchLightning/pytorch-lightning/pull/8743))

4 changes: 2 additions & 2 deletions pytorch_lightning/trainer/data_loading.py
@@ -32,7 +32,7 @@
from pytorch_lightning.utilities import rank_zero_warn
from pytorch_lightning.utilities.apply_func import apply_to_collection
from pytorch_lightning.utilities.auto_restart import (
_sampler_metadata_collate,
_capture_metadata_collate,
CaptureIterableDataset,
FastForwardSampler,
)
@@ -529,5 +529,5 @@ def _add_sampler_metadata_collate(dataloader: DataLoader) -> None:
Wrap default collate function to retrieve ``FastForwardSampler`` state dict when fault tolerant is enabled.
"""
dataloader.collate_fn = partial(
_sampler_metadata_collate, dataset=dataloader.dataset, default_collate=dataloader.collate_fn
_capture_metadata_collate, dataset=dataloader.dataset, default_collate=dataloader.collate_fn
)
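
To see how this collate wrapping behaves in isolation, here is a minimal, self-contained sketch. It is not the Lightning implementation: `metadata_collate` and `attach_metadata_collate` are hypothetical stand-ins for `_capture_metadata_collate` and `_add_sampler_metadata_collate`, and the metadata payload is a toy value; only the `functools.partial` binding pattern mirrors the change above.

```python
from functools import partial

from torch.utils.data import DataLoader, Dataset


class ToyDataset(Dataset):
    def __len__(self):
        return 4

    def __getitem__(self, idx):
        return idx


def metadata_collate(samples, dataset, default_collate):
    # Run the original collate function, then attach extra (toy) metadata.
    return {"data": default_collate(samples), "meta": {"dataset_len": len(dataset)}}


def attach_metadata_collate(dataloader: DataLoader) -> None:
    # Bind the dataset and the previous collate_fn, since the DataLoader
    # only ever calls collate_fn(samples).
    dataloader.collate_fn = partial(
        metadata_collate, dataset=dataloader.dataset, default_collate=dataloader.collate_fn
    )


loader = DataLoader(ToyDataset(), batch_size=2)
attach_metadata_collate(loader)
for batch in loader:
    print(batch["data"], batch["meta"])  # e.g. tensor([0, 1]) {'dataset_len': 4}
```

Because `partial` captures the previous `collate_fn`, the wrapper composes with whatever custom collate function the user had already configured.
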
232 changes: 203 additions & 29 deletions pytorch_lightning/utilities/auto_restart.py
@@ -14,7 +14,9 @@

from collections.abc import Mapping
from copy import deepcopy
from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Union
from dataclasses import dataclass, field
from functools import partial, wraps
from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Tuple, Union

from torch.utils.data import Dataset, get_worker_info, Sampler
from torch.utils.data.dataloader import _MultiProcessingDataLoaderIter, DataLoader, IterableDataset
@@ -49,9 +51,8 @@ def __getattr__(self, key: str) -> Any:
return getattr(self._sampler, key, None)

def setup(self, dataloader_batch_size: Optional[int] = None) -> None:
"""
Setup the ``FastForwardSampler``.
This is required only when the provided dataset subclassed :class:`torch.utils.data.Dataset`.
"""Setup the ``FastForwardSampler``. This is required only when the provided dataset subclassed
:class:`torch.utils.data.Dataset`.
"""
self._dataloader_batch_size = dataloader_batch_size

@@ -61,9 +62,10 @@ def worker_id(self) -> int:
return worker_info.id if worker_info else 0

def __iter__(self) -> Iterator[Any]:
# the `state dict` was cached as workers were unavailable before
# reload it now
self._load_cached_state()
self._current_iteration = 0
# the `state dict` was cached as workers were unavailable before.
if self._cached_state_dict is not None:
self._load_non_random_state(self._cached_state_dict)

i = 0
sampler_iter = iter(self._sampler)
@@ -72,6 +74,10 @@ def __iter__(self) -> Iterator[Any]:
i += 1

# here: i == self._current_iteration
if self._cached_state_dict is not None:
self._cached_state_dict = None

# recreate iterator to be sure loading is reflected there as well
while True:
self._current_iteration += 1
try:
@@ -80,6 +86,7 @@
break

self._current_iteration = 0
self._cached_state_dict = None
self.restarting = False

def __len__(self) -> int:
@@ -116,13 +123,111 @@ def _compute_current_iteration(self, num_batches_processed: Optional[int] = None

return current_iteration

def _load_cached_state(self):
if self._cached_state_dict is None or self.worker_id not in self._cached_state_dict:
return
self._current_iteration = self._cached_state_dict[self.worker_id]["current_iteration"]
# delete cached state, prevent reloading every time iter() is called
def _load_non_random_state(self, state_dict: Dict[int, Dict[str, Any]]) -> None:
self._current_iteration = state_dict[self.worker_id]["current_iteration"]
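
For intuition about the restart semantics assumed by the `__iter__` changes above, here is a stripped-down sketch of the fast-forward idea. `ToyFastForwardSampler` is illustrative only, not the real `FastForwardSampler`: it caches a loaded state dict, applies it lazily on the next `iter()`, skips the indices that were already consumed before the failure, and then resumes counting.

```python
from typing import Iterator, Optional


class ToyFastForwardSampler:
    """Wraps an index sampler and skips already-consumed indices after a restart."""

    def __init__(self, sampler) -> None:
        self._sampler = sampler
        self._current_iteration = 0
        self._cached_state_dict: Optional[dict] = None

    def state_dict(self) -> dict:
        return {"current_iteration": self._current_iteration}

    def load_state_dict(self, state_dict: dict) -> None:
        # Cache the state; it is applied lazily when iteration restarts.
        self._cached_state_dict = state_dict

    def __iter__(self) -> Iterator[int]:
        if self._cached_state_dict is not None:
            self._current_iteration = self._cached_state_dict["current_iteration"]
            self._cached_state_dict = None

        sampler_iter = iter(self._sampler)
        # Fast-forward past everything that was consumed before the failure.
        for _ in range(self._current_iteration):
            next(sampler_iter)

        for index in sampler_iter:
            self._current_iteration += 1
            yield index

        self._current_iteration = 0


sampler = ToyFastForwardSampler(range(6))
it = iter(sampler)
_ = [next(it) for _ in range(2)]           # consume 0 and 1, then "crash"
restored = ToyFastForwardSampler(range(6))
restored.load_state_dict(sampler.state_dict())
print(list(restored))                      # [2, 3, 4, 5]
```
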


@dataclass(frozen=True, unsafe_hash=True)
class IteratorState:
"""The state of an iterator in a single worker process."""

dataset_state: Dict[int, Any] = field(default_factory=dict)
sampler_state: Dict[int, Any] = field(default_factory=dict)
worker_id: int = 0
num_workers: int = 0
num_batches_fetched: int = 0
name: Optional[str] = None

@classmethod
def from_state_dict(cls, state_dict) -> "IteratorState":
return cls(**state_dict)


@dataclass
class MergedIteratorState:
"""This class is used to hold the current iterator state and lives on the iterator. It holds the current merged
states from all worker processes. Once an iterator advances, it can store updates of the worker states in this
merged iterator state."""

state: Union[Dict[Union[int, str], Union[Dict[str, IteratorState], IteratorState]]] = field(default_factory=dict)
latest_worker_id: int = 0
represent_map_dataset: Optional[bool] = None

def update(self, generator_name: Optional[str], new_state: IteratorState) -> None:
# a map based dataset doesn't own a generator and therefore `generator_name` should be None.
self.represent_map_dataset = generator_name is None
if self.represent_map_dataset:
state = self.state
else:
if generator_name not in self.state:
self.state[generator_name] = {}
state = self.state[generator_name]

latest_worker_id = new_state.worker_id
state[latest_worker_id] = new_state
self.latest_worker_id = latest_worker_id

@classmethod
def from_state_dict(cls, state_dict) -> "MergedIteratorState":
if state_dict["represent_map_dataset"]:
state_dict["state"] = {
worker_id: IteratorState.from_state_dict(state) for worker_id, state in state_dict["state"].items()
}
else:
state_dict["state"] = {
sampler_name: {
worker_id: IteratorState.from_state_dict(state) for worker_id, state in worker_state.items()
}
for sampler_name, worker_state in state_dict["state"].items()
}
return cls(**state_dict)

def __len__(self) -> int:
return len(self.state)
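
A short usage sketch of how this merged state is intended to grow, assuming the `IteratorState` and `MergedIteratorState` definitions introduced in this diff are importable from `pytorch_lightning.utilities.auto_restart` at this revision; the worker ids and batch counts below are made up:

```python
from pytorch_lightning.utilities.auto_restart import IteratorState, MergedIteratorState

# Map-style dataset: no generator name, states are keyed by worker id directly.
merged = MergedIteratorState()
merged.update(None, IteratorState(worker_id=0, num_workers=2, num_batches_fetched=3))
merged.update(None, IteratorState(worker_id=1, num_workers=2, num_batches_fetched=4))
assert merged.latest_worker_id == 1 and len(merged) == 2

# Iterable dataset: states are grouped under the sampler/generator name first.
merged_iterable = MergedIteratorState()
merged_iterable.update("sampler0", IteratorState(worker_id=0, num_workers=1, num_batches_fetched=1))
assert list(merged_iterable.state["sampler0"]) == [0]

# Round-trip through a plain dict, e.g. as it would appear inside a checkpoint.
restored = MergedIteratorState.from_state_dict(
    {
        "state": {
            0: {
                "dataset_state": {},
                "sampler_state": {},
                "worker_id": 0,
                "num_workers": 2,
                "num_batches_fetched": 3,
                "name": None,
            }
        },
        "latest_worker_id": 0,
        "represent_map_dataset": True,
    }
)
assert isinstance(restored.state[0], IteratorState)
```
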


class CaptureMapDataset(Dataset):
"""This class is used to capture the state from the map-based state dataset."""

def __init__(self, dataset: Dataset) -> None:
self.dataset = dataset
self._cached_state_dict = None

@property
def worker_id(self) -> int:
worker_info = get_worker_info()
return worker_info.id if worker_info else 0

def __getitem__(self, item) -> Tuple[Any, Dict[int, Dict]]:
if self._cached_state_dict is not None:
if self.worker_id in self._cached_state_dict:
# TODO: reset random states
pass
self._cached_state_dict = None

data = self.dataset[item]
state_dict = self._state_dict()
return data, state_dict

def __len__(self) -> int:
return len(self.dataset)

def load_state_dict(self, state_dict: Dict[int, Any], latest_worker_id: int, num_workers: int) -> None:
# as workers aren't available, the ``state_dict`` is cached until workers are made available.
state_dict = deepcopy(state_dict)

if num_workers > 0:
# remap states to worker ids starting at 0
next_worker_id = latest_worker_id + 1
old_to_new_worker_id_map = [((next_worker_id + i) % num_workers, i) for i in range(num_workers)]
state_dict = {
new_id: state_dict[old_id] for old_id, new_id in old_to_new_worker_id_map if old_id in state_dict
}
self._cached_state_dict = state_dict

def _state_dict(self) -> Dict[int, Dict[str, Any]]:
return {self.worker_id: {"rng_states": {}}}
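
The worker-id remapping in `load_state_dict` can be worked through with a small example. `remap_worker_states` below is an illustrative helper that reproduces the list comprehension above and is not part of Lightning: with three workers and the latest batch having come from worker 1, the saved state of worker 2 becomes the state of new worker 0, since worker 2 would have produced the next batch after a restart.

```python
def remap_worker_states(state_dict, latest_worker_id, num_workers):
    # Worker `latest_worker_id` produced the last fetched batch, so after a restart
    # the "next" worker in line becomes the new worker 0, the one after it worker 1, etc.
    next_worker_id = latest_worker_id + 1
    old_to_new = [((next_worker_id + i) % num_workers, i) for i in range(num_workers)]
    return {new_id: state_dict[old_id] for old_id, new_id in old_to_new if old_id in state_dict}


# With 3 workers and worker 1 having fetched the latest batch:
states = {0: "state-0", 1: "state-1", 2: "state-2"}
print(remap_worker_states(states, latest_worker_id=1, num_workers=3))
# {0: 'state-2', 1: 'state-0', 2: 'state-1'}
```
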


class CaptureIterableDataset(IterableDataset):
"""
@@ -136,8 +241,9 @@ class CaptureIterableDataset(IterableDataset):
def __init__(self, dataset: IterableDataset) -> None:
super().__init__()
self.dataset = deepcopy(dataset)
self._state_dict: Optional[Dict[int, Any]] = None
self.samplers: Optional[Dict[str, FastForwardSampler]] = None
self._state_dict: Optional[Dict[int, Any]] = None
self._has_wrapped: bool = False

@property
def sampler(self) -> Sampler:
@@ -188,22 +294,29 @@ def _wrap_generator_samplers(self) -> None:
# if `CaptureIterableDataset` was available, the sampler should reload its own state.
if self._state_dict is not None:
sampler.load_state_dict(self._state_dict[generator_attr_name])

# store the samplers
self.samplers[generator_attr_name] = sampler

# replace generator with the generator from the `FastForwardSampler`.
dataset_dict[generator_attr_name] = iter(sampler)

def reset_on_epoch(self) -> None:
self.reset_on_epoch()

def reset_on_epoch(self):
self._state_dict = None

def __iter__(self) -> Iterator:
# create a generator from the wrapped Iterative Dataset
# if the dataset contained samplers, they will be transformers into generators
# if the dataset contained samplers, they will be transformed into generators
self.iter_data = iter(self.dataset)

# wrap any generator associated to a Sampler into a `FastForwardSampler`.
if isinstance(self.iter_data, Generator):
raise MisconfigurationException(
"PyTorch Lightning Fault-Tolerant feature does not support `__iter__` returning a generator."
" Please use the `__next__` function to fetch the next batch and use a sampler for"
" doing your iterations."
)
self._wrap_generator_samplers()
return self
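
Since `__iter__` now rejects generator-style iterable datasets, here is a hedged sketch of a dataset shape this mechanism is designed around: the dataset keeps its sampler as an attribute and yields samples through `__next__`, so the sampler's iterator can later be swapped for a fast-forwarding one. `RangeIterableDataset` is illustrative only; the exact attribute conventions the wrapper looks for are defined in `_wrap_generator_samplers`, which is only partially shown in this diff.

```python
import torch
from torch.utils.data import IterableDataset, RandomSampler


class RangeIterableDataset(IterableDataset):
    """Keeps its sampler as an attribute and yields via ``__next__`` instead of a generator ``__iter__``."""

    def __init__(self, data):
        self.data = list(data)
        self.sampler = RandomSampler(self.data, generator=torch.Generator().manual_seed(0))

    def __iter__(self):
        # Returning ``self`` (not a generator) keeps the sampler iterator on the instance,
        # which is what lets a wrapper replace it with a fast-forwarding iterator later.
        self.iter_sampler = iter(self.sampler)
        return self

    def __next__(self):
        return self.data[next(self.iter_sampler)]


print(list(RangeIterableDataset(range(5))))  # a permutation of [0, 1, 2, 3, 4]
```
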

@@ -214,7 +327,6 @@ def __next__(self) -> Dict[str, Any]:
def store_samplers_state_dict(iterator: Iterator, sampler_state_dict: List) -> None:
"""
This function is used to store and update sampler state dict on its associated iterator.

In Lightning, as the iterator is wrapped into a prefetching function,
we needed to introduce a cache to delay updating the ``sampler_state_dict``.
"""
@@ -241,7 +353,7 @@ def _sanitize_batch_from_sampler_state(data: Any, state_dicts: List):

{
"batch": ..., # data returned by DataLoader
"__pl_samplers": {
"__pl_restart_meta": {
"sampler0": {
0: {"current_iteration": ...},
1: {"current_iteration": ...},
@@ -251,14 +363,14 @@
}

Each sampler in the worker process tracks the current iteration. We return all of them to the main process
as part of the sample and then a special collate function :func:`_sampler_metadata_collate`
as part of the sample and then a special collate function :func:`_capture_metadata_collate`
will extract the current iteration as part of the metadata returned by a custom batch.
"""

def _sanitize(data: Mapping):
out = []
for k, v in data.items():
if k == AutoRestartBatchKeys.PL_SAMPLERS:
if k == AutoRestartBatchKeys.PL_RESTART_META:
state_dicts.append(v)
return data["data"]
out.append((k, CaptureIterableDataset._sanitize_batch_from_sampler_state(v, state_dicts)))
@@ -376,20 +488,82 @@ def _find_current_worker(iterator: Iterator) -> Dict[str, Optional[int]]:
return {"num_workers": num_workers, "previous_worker": previous_worker}


def _sampler_metadata_collate(samples: List, dataset: Dataset, default_collate: Callable) -> Dict:
"""
A collate function that adds the state dict of all samplers used in the worker processes.

def _capture_metadata_collate(samples: List, dataset: Dataset, default_collate: Callable) -> Dict:
"""A collate function that adds the state dict of a :class:`CaptureIterableDataset` or :class:`CaptureMapDataset`
used in the worker processes. This function gets executed within the worker processes.
The structure will be:

.. code-block:: python

{
"data": ..., # data returned by Dataset
"__pl_samplers": {"sampler_name0": state_dict0, "sampler_name1": state_dict1},
"__pl_restart_meta": {"sampler_name0": state_dict0, "sampler_name1": state_dict1},
}
"""
batch = default_collate(samples)
if not isinstance(dataset, CaptureIterableDataset):
return batch
return {"data": batch, AutoRestartBatchKeys.PL_SAMPLERS: dataset.state_dict()}
if isinstance(dataset, CaptureIterableDataset):
data = default_collate(samples)
metadata = dataset.state_dict()

elif isinstance(dataset, CaptureMapDataset):
samples, states = zip(*samples)
data = default_collate(samples)
metadata = states[-1]
else:
return default_collate(samples)

return {"data": data, AutoRestartBatchKeys.PL_RESTART_META: metadata}


def patch_dataloader_iterator(
dataloader: DataLoader, iterator: Iterator, prefetcher, num_batches_fetched: int = 0
) -> None:
assert isinstance(dataloader.dataset, (CaptureMapDataset, CaptureIterableDataset))

def _next_data_wrapper(fn, it, dl, num_batches_fetched) -> Callable:
@wraps(fn)
def wrapper():
nonlocal num_batches_fetched
nonlocal it
nonlocal dl

dataset = dl.dataset
combined_batch = fn()

batch, state = combined_batch["data"], combined_batch[AutoRestartBatchKeys.PL_RESTART_META]
num_batches_fetched += 1

if isinstance(dataset, CaptureIterableDataset):
state = [
IteratorState(
num_workers=dataloader.num_workers,
sampler_state=iterator_state,
num_batches_fetched=num_batches_fetched,
worker_id=list(iterator_state.keys())[0],
name=sampler_iter_name,
)
for sampler_iter_name, iterator_state in state.items()
]
elif isinstance(dataset, CaptureMapDataset):
ff_sampler = _find_fast_forward_samplers(dl)
state = [
IteratorState(
num_workers=dataloader.num_workers,
sampler_state=ff_sampler.state_dict(num_batches_fetched),
dataset_state=state,
worker_id=list(state.keys())[0],
num_batches_fetched=num_batches_fetched,
)
]
prefetcher._store_dataloader_iter_state(it, state)
return batch

return wrapper

iterator._next_data = _next_data_wrapper(iterator._next_data, iterator, dataloader, num_batches_fetched)
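
The patching above follows a general pattern: replace a bound method on the iterator with a closure that calls the original, siphons off the metadata as a side effect, and hands back only the payload. Below is a toy version of that pattern with no DataLoader involved; `ToyIterator`, `patch_iterator`, and the `"meta"` key are all made up for illustration.

```python
from functools import wraps


class ToyIterator:
    """Stand-in for an iterator whose _next_data returns a dict holding payload and metadata."""

    def __init__(self):
        self._count = 0

    def _next_data(self):
        self._count += 1
        return {"data": self._count, "meta": {"batches_fetched": self._count}}


def patch_iterator(iterator, store):
    def wrapper_factory(fn):
        @wraps(fn)
        def wrapper():
            combined = fn()                 # call the original bound method
            store.append(combined["meta"])  # capture the metadata as a side effect
            return combined["data"]         # callers only ever see the payload
        return wrapper

    iterator._next_data = wrapper_factory(iterator._next_data)


captured = []
it = ToyIterator()
patch_iterator(it, captured)
print(it._next_data(), it._next_data())  # 1 2
print(captured)                          # [{'batches_fetched': 1}, {'batches_fetched': 2}]
```
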


def _add_capture_metadata_collate(dataloader: DataLoader) -> None:
"""Wrap default collate function to retrive captured dataset state dict when fault tolerant is enabled."""
dataloader.collate_fn = partial(
_capture_metadata_collate, dataset=dataloader.dataset, default_collate=dataloader.collate_fn
)
6 changes: 2 additions & 4 deletions pytorch_lightning/utilities/enums.py
@@ -121,8 +121,6 @@ class GradClipAlgorithmType(LightningEnum):


class AutoRestartBatchKeys(LightningEnum):
"""
Defines special dictionary keys used to track sampler progress with multiple workers.
"""
"""Defines special dictionary keys used to track captured dataset state with multiple workers."""

PL_SAMPLERS = "__pl_samplers"
PL_RESTART_META = "__pl_restart_meta"