Horovod: fixed early stopping and added metrics aggregation (#3775)
* Fixed early stopping for Horovod

* Refactored to sync_dist_if_available

* Bump min Horovod version to support hvd.is_initialized

* Changelog

* Added back change for Horovod

* Removed redundant checks for initialization

* Implement metrics gathering for Horovod

* Added test for EvalResult

* Renamed ddp_sync_on_step -> dist_sync_on_step

* Added metric test for Horovod

* Added option to pass a callable allgather function to the metric base class

* Added dist_sync_fn

* Fixed calls to private _sync_dist

* Fixed Horovod test

* Added sync_tensor to the distributed backend

* Skip Windows

* Insert test path

* Removed redundant import

* Updated drone

* Unset HOROVOD_GPU_ALLREDUCE

* Unset

* No cache dir

* No uninstall

* Unset variables

* Uninstall Horovod during initialization

* Replaced more references to ddp_sync_on_step

* Fixed imports

* Fixed attribute

* Added back default

* Lint

* Added back docstring

* Made gather_all_tensors default

* Added whitespace

* Update tests/models/test_horovod.py

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>

* Update pytorch_lightning/metrics/metric.py

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>

* Update CHANGELOG.md

Co-authored-by: Teddy Koker <teddy.koker@gmail.com>
Co-authored-by: Sean Naren <sean.narenthiran@gmail.com>
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>

(cherry picked from commit 51cc7a8)
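
For orientation only (not part of this commit message): a minimal sketch of how the renamed `dist_sync_on_step` argument and the new `dist_sync_fn` hook on the metric base class are intended to be used. The constructor signature is assumed from the bullet points above, and `SumMetric` is a hypothetical example metric.

import torch
from pytorch_lightning.metrics import Metric

class SumMetric(Metric):
    # Toy metric, used only to illustrate the new keyword arguments.
    def __init__(self, dist_sync_on_step: bool = False, dist_sync_fn=None):
        # dist_sync_fn lets a backend such as Horovod inject its own allgather;
        # when None, the default gather_all_tensors path is used.
        super().__init__(dist_sync_on_step=dist_sync_on_step, dist_sync_fn=dist_sync_fn)
        self.add_state("total", default=torch.tensor(0.0), dist_reduce_fx="sum")

    def update(self, x: torch.Tensor):
        self.total = self.total + x.sum()

    def compute(self):
        return self.total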
tgaddair authored and SeanNaren committed Nov 11, 2020
1 parent 84bea0f commit d643123
Showing 22 changed files with 365 additions and 67 deletions.
44 changes: 44 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,50 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).


## [unreleased] - YYYY-MM-DD

### Added

- Added "monitor" key to saved `ModelCheckpoints` ([#4383](https://github.com/PyTorchLightning/pytorch-lightning/pull/4383))


- Added `ConfusionMatrix` class interface ([#4348](https://github.com/PyTorchLightning/pytorch-lightning/pull/4348))


- Added multiclass AUROC metric ([#4236](https://github.com/PyTorchLightning/pytorch-lightning/pull/4236))


- Added global step indexing to the checkpoint name for a better sub-epoch checkpointing experience ([#3807](https://github.com/PyTorchLightning/pytorch-lightning/pull/3807))


- Added optimizer hooks in callbacks ([#4379](https://github.com/PyTorchLightning/pytorch-lightning/pull/4379))


- Added option to log momentum ([#4384](https://github.com/PyTorchLightning/pytorch-lightning/pull/4384))


- Added `fsspec` to tuner ([#4458](https://github.com/PyTorchLightning/pytorch-lightning/pull/4458))


- Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775))


### Changed



### Deprecated



### Removed



### Fixed



## [1.0.5] - 2020-11-03

### Added
24 changes: 23 additions & 1 deletion pytorch_lightning/accelerators/accelerator.py
@@ -14,7 +14,7 @@
import os
import math
from enum import Enum
from typing import Any, Optional
from typing import Any, Optional, Union

import torch

@@ -30,6 +30,12 @@
except ImportError:
    amp = None

if torch.distributed.is_available():
    from torch.distributed import ReduceOp
else:
    class ReduceOp:
        SUM = None

EPSILON = 1e-6
EPSILON_FP16 = 1e-5

@@ -209,6 +215,22 @@ def init_ddp_connection(
            torch_backend, rank=global_rank, world_size=world_size
        )

    def sync_tensor(self,
                    tensor: Union[torch.Tensor],
                    group: Optional[Any] = None,
                    reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
        """
        Function to reduce a tensor from several distributed processes to one aggregated tensor.

        Args:
            tensor: the tensor to sync and reduce
            group: the process group to gather results from. Defaults to all processes (world)
            reduce_op: the reduction operation. Defaults to sum.
                Can also be a string of 'avg', 'mean' to calculate the mean during reduction.

        Return:
            reduced value
        """
        raise NotImplementedError()

    def __getstate__(self):
        return {
            'trainer': self.trainer,
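
Not part of the diff: a minimal sketch of how a backend without inter-process communication could satisfy the new `sync_tensor` contract. `SingleProcessAccelerator` is a hypothetical name used only for illustration.

from typing import Any, Optional, Union

import torch

from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp


class SingleProcessAccelerator(Accelerator):
    # Hypothetical subclass: with a single process there is nothing to reduce,
    # so the "aggregated" tensor is simply the local tensor.
    def sync_tensor(self,
                    tensor: torch.Tensor,
                    group: Optional[Any] = None,
                    reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
        return tensor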
11 changes: 9 additions & 2 deletions pytorch_lightning/accelerators/ddp_accelerator.py
@@ -18,17 +18,18 @@
import sys
from os.path import abspath
from time import sleep
from typing import Optional, List
from typing import Any, Optional, List, Union

import numpy as np

from pytorch_lightning import _logger as log
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.distributed.dist import LightningDistributed
from pytorch_lightning.utilities import AMPType
from pytorch_lightning.utilities.distributed import find_free_network_port
from pytorch_lightning.utilities.distributed import rank_zero_only
from pytorch_lightning.utilities.distributed import sync_ddp_if_available
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.seed import seed_everything
from torch.nn.parallel import DistributedDataParallel
@@ -298,3 +299,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None)

        return model

    def sync_tensor(self,
                    tensor: Union[torch.Tensor],
                    group: Optional[Any] = None,
                    reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
        return sync_ddp_if_available(tensor, group, reduce_op)
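
For readers who do not know the helper: `sync_ddp_if_available` from `pytorch_lightning.utilities.distributed` behaves roughly as sketched below, a no-op when no process group is initialized, otherwise an all-reduce that divides by the world size when a mean is requested. This is a paraphrase for orientation, not the exact upstream implementation.

import torch
import torch.distributed as torch_distrib

def sync_ddp_if_available_sketch(tensor, group=None, reduce_op=None):
    # Rough behaviour only; see pytorch_lightning/utilities/distributed.py for the real code.
    if not (torch_distrib.is_available() and torch_distrib.is_initialized()):
        return tensor  # no initialized process group: nothing to reduce

    if group is None:
        group = torch_distrib.group.WORLD

    divide_by_world_size = isinstance(reduce_op, str) and reduce_op.lower() in ("avg", "mean")
    if reduce_op is None or divide_by_world_size or (isinstance(reduce_op, str) and reduce_op.lower() == "sum"):
        op = torch_distrib.ReduceOp.SUM
    else:
        op = reduce_op  # assume a torch.distributed.ReduceOp was passed in

    torch_distrib.barrier(group=group)
    torch_distrib.all_reduce(tensor, op=op, group=group)

    if divide_by_world_size:
        tensor = tensor / torch_distrib.get_world_size(group)
    return tensor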
11 changes: 9 additions & 2 deletions pytorch_lightning/accelerators/ddp_cpu_slurm_accelerator.py
@@ -12,18 +12,19 @@
# See the License for the specific language governing permissions and
# limitations under the License
import os
from typing import List, Optional
from typing import Any, List, Optional, Union

import torch
import torch.distributed as torch_distrib
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

from pytorch_lightning import _logger as log
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.utilities import AMPType
from pytorch_lightning.utilities.distributed import rank_zero_only
from pytorch_lightning.utilities.distributed import sync_ddp_if_available
from pytorch_lightning.distributed.dist import LightningDistributed


@@ -199,3 +200,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None)

        return model

    def sync_tensor(self,
                    tensor: Union[torch.Tensor],
                    group: Optional[Any] = None,
                    reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
        return sync_ddp_if_available(tensor, group, reduce_op)
12 changes: 9 additions & 3 deletions pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License
import os
from typing import List, Optional
from typing import Any, List, Optional, Union

import torch
import torch.distributed as torch_distrib
@@ -21,11 +21,11 @@
from torch.nn.parallel import DistributedDataParallel

from pytorch_lightning import _logger as log
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.utilities import AMPType
from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_warn
from pytorch_lightning.utilities.distributed import find_free_network_port
from pytorch_lightning.utilities.distributed import find_free_network_port, sync_ddp_if_available
from pytorch_lightning.distributed.dist import LightningDistributed

try:
@@ -229,3 +229,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None)

        return model

    def sync_tensor(self,
                    tensor: Union[torch.Tensor],
                    group: Optional[Any] = None,
                    reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
        return sync_ddp_if_available(tensor, group, reduce_op)
@@ -12,19 +12,20 @@
# See the License for the specific language governing permissions and
# limitations under the License
import os
from typing import List, Optional
from typing import Any, List, Optional, Union

import torch
import torch.distributed as torch_distrib
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

from pytorch_lightning import _logger as log
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.distributed.dist import LightningDistributed
from pytorch_lightning.utilities import AMPType
from pytorch_lightning.utilities.distributed import rank_zero_only
from pytorch_lightning.utilities.distributed import sync_ddp_if_available

try:
    from hydra.utils import to_absolute_path, get_original_cwd
@@ -198,3 +199,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None)

        return model

    def sync_tensor(self,
                    tensor: Union[torch.Tensor],
                    group: Optional[Any] = None,
                    reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
        return sync_ddp_if_available(tensor, group, reduce_op)
12 changes: 9 additions & 3 deletions pytorch_lightning/accelerators/ddp_slurm_accelerator.py
@@ -12,19 +12,19 @@
# See the License for the specific language governing permissions and
# limitations under the License
import os
from typing import List
from typing import Any, List, Optional, Union

import torch
import torch.distributed as torch_distrib
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

from pytorch_lightning import _logger as log
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.distributed.dist import LightningDistributed
from pytorch_lightning.utilities import AMPType
from pytorch_lightning.utilities.distributed import rank_zero_only
from pytorch_lightning.utilities.distributed import rank_zero_only, sync_ddp_if_available
from pytorch_lightning.utilities.seed import seed_everything

try:
@@ -205,3 +205,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None)

        return model

    def sync_tensor(self,
                    tensor: Union[torch.Tensor],
                    group: Optional[Any] = None,
                    reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
        return sync_ddp_if_available(tensor, group, reduce_op)
11 changes: 9 additions & 2 deletions pytorch_lightning/accelerators/ddp_spawn_accelerator.py
@@ -13,7 +13,7 @@
# limitations under the License
import os
import re
from typing import List, Optional
from typing import Any, List, Optional, Union

import torch
import torch.multiprocessing as mp
@@ -22,11 +22,12 @@
from torch.nn.parallel import DistributedDataParallel

from pytorch_lightning import _logger as log
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.utilities import AMPType
from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load
from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_warn, find_free_network_port
from pytorch_lightning.utilities.distributed import sync_ddp_if_available
from pytorch_lightning.utilities.seed import seed_everything
from pytorch_lightning.distributed.dist import LightningDistributed

@@ -254,3 +255,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None)

        return model

    def sync_tensor(self,
                    tensor: Union[torch.Tensor],
                    group: Optional[Any] = None,
                    reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
        return sync_ddp_if_available(tensor, group, reduce_op)
11 changes: 9 additions & 2 deletions pytorch_lightning/accelerators/ddp_torchelastic_accelerator.py
@@ -12,19 +12,20 @@
# See the License for the specific language governing permissions and
# limitations under the License
import os
from typing import List, Optional
from typing import Any, List, Optional, Union

import torch
import torch.distributed as torch_distrib
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

from pytorch_lightning import _logger as log
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.distributed.dist import LightningDistributed
from pytorch_lightning.utilities import AMPType
from pytorch_lightning.utilities.distributed import rank_zero_only
from pytorch_lightning.utilities.distributed import sync_ddp_if_available


try:
@@ -201,3 +202,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None)

        return model

    def sync_tensor(self,
                    tensor: Union[torch.Tensor],
                    group: Optional[Any] = None,
                    reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
        return sync_ddp_if_available(tensor, group, reduce_op)
42 changes: 40 additions & 2 deletions pytorch_lightning/accelerators/horovod_accelerator.py
@@ -12,12 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from contextlib import ExitStack
from typing import Optional
from typing import Any, Optional, Union

import torch
from torch.optim.lr_scheduler import _LRScheduler

from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
from pytorch_lightning.utilities import AMPType
from pytorch_lightning.utilities.distributed import rank_zero_only

@@ -161,3 +161,41 @@ def barrier(self, name: Optional[str] = None):
    def broadcast(self, obj, src=0):
        obj = hvd.broadcast_object(obj, src)
        return obj

    def gather_all_tensors(self, result: Union[torch.Tensor], group: Optional[Any] = None):
        if group is not None:
            raise ValueError(
                "Horovod does not support allgather using a subcommunicator at this time. "
                "Unset `group`."
            )

        if len(result.shape) == 0:
            # Convert scalars to single dimension tensors
            result = result.reshape(1)

        # sync and gather all
        hvd.join()
        gathered = hvd.allgather(result)
        gathered_result = list(gathered.split(1, dim=0))
        return gathered_result

    def sync_tensor(self,
                    tensor: Union[torch.Tensor],
                    group: Optional[Any] = None,
                    reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
        if group is not None:
            raise ValueError(
                "Horovod does not support allreduce using a subcommunicator at this time. "
                "Unset `group`."
            )

        if reduce_op is None or reduce_op == "sum":
            reduce_op = hvd.Sum
        elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"):
            reduce_op = hvd.Average
        else:
            raise ValueError(f"unrecognized `reduce_op`: {reduce_op}")

        # sync all processes before reduction
        hvd.join()
        return hvd.allreduce(tensor, op=reduce_op)
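
A usage sketch (not part of the diff) of the two Horovod calls used above, assuming the script is launched with several workers, for example `horovodrun -np 4 python script.py`; the values in the comments are illustrative.

import horovod.torch as hvd
import torch

hvd.init()
local_value = torch.tensor(float(hvd.rank()))

# Averaging allreduce, as sync_tensor does for reduce_op in ("avg", "mean"):
mean_value = hvd.allreduce(local_value, op=hvd.Average)  # 1.5 with 4 workers

# Allgather, as gather_all_tensors does: every worker receives one entry per process.
gathered = hvd.allgather(local_value.reshape(1))  # tensor([0., 1., 2., 3.])
per_worker = list(gathered.split(1, dim=0))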
5 changes: 3 additions & 2 deletions pytorch_lightning/core/lightning.py
@@ -243,7 +243,7 @@ def log(
        # set the default depending on the fx_name
        on_step = self.__auto_choose_log_on_step(on_step)
        on_epoch = self.__auto_choose_log_on_epoch(on_epoch)

        accelerator = self.trainer.accelerator_backend
        self._results.log(
            name,
            value,
@@ -257,7 +257,8 @@
            enable_graph,
            sync_dist,
            sync_dist_op,
            sync_dist_group
            sync_dist_group,
            accelerator.sync_tensor
        )

    def log_dict(
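
Finally, a hedged example of the user-facing effect of the change to `log()` above: `self.log(..., sync_dist=True)` now routes the reduction through the active accelerator's `sync_tensor`, so the same call is expected to aggregate across workers under both DDP and Horovod. The model below is a toy example; the `sync_dist` arguments follow the 1.0.x `log()` signature.

import torch
import pytorch_lightning as pl


class LitModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = torch.nn.functional.mse_loss(self.layer(x), y)
        # With this commit, the cross-worker mean below is computed via
        # accelerator.sync_tensor (sync_ddp_if_available under DDP, hvd.allreduce under Horovod).
        self.log("train_loss", loss, sync_dist=True, sync_dist_op="mean")
        return loss

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)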