From d74710ec0586dc9be73dd5f1515de60e30b7ceda Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 23 Jun 2022 15:31:10 +0100 Subject: [PATCH 1/4] Remove endpoint after collaborate app/dht CLI --- .../strategies/collaborative.py | 230 ++---------------- .../strategies/test_collaborative.py | 65 +---- 2 files changed, 30 insertions(+), 265 deletions(-) diff --git a/src/pytorch_lightning/strategies/collaborative.py b/src/pytorch_lightning/strategies/collaborative.py index 715f0d58e9c07..2c3ca588b7ab4 100644 --- a/src/pytorch_lightning/strategies/collaborative.py +++ b/src/pytorch_lightning/strategies/collaborative.py @@ -1,22 +1,15 @@ -import http import ipaddress import logging import os import platform -import re -import threading -import time -import warnings -from http.server import BaseHTTPRequestHandler from typing import Any, Callable, Dict, List, Optional, Union -import requests import torch from torch import Tensor import pytorch_lightning as pl from pytorch_lightning.strategies.strategy import Strategy, TBroadcast -from pytorch_lightning.utilities import rank_zero_only, rank_zero_warn +from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.data import extract_batch_size from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -33,6 +26,8 @@ class CollaborativeStrategy(Strategy): + INITIAL_PEERS_ENV: str = "PL_INITIAL_PEERS" + def __init__( self, target_batch_size: int, @@ -50,13 +45,6 @@ def __init__( averager_opts: Optional[Dict] = None, host_maddrs: Optional[List] = None, initial_peers: Optional[Union[str, List]] = None, - endpoint: Optional[bool] = None, - peer_endpoint: Optional[str] = None, - persistent: bool = True, - host: Optional[str] = None, - port: Optional[int] = None, - retry_endpoint_attempts: int = 5, - retry_endpoint_sleep_duration: int = 5, **optimizer_kwargs: Any, ): """Provides capabilities to train using the Hivemind Library, training collaboratively across the internet @@ -147,17 +135,7 @@ def __init__( ) super().__init__() - self.dht_manager = DHTManager( - persistent=persistent, - endpoint=endpoint, - peer_endpoint=peer_endpoint, - host=host, - port=port, - host_maddrs=host_maddrs, - initial_peers=initial_peers, - retry_endpoint_attempts=retry_endpoint_attempts, - retry_endpoint_sleep_duration=retry_endpoint_sleep_duration, - ) + self._initial_peers = initial_peers self._target_batch_size = target_batch_size self._batch_size = batch_size self._scheduler_fn = scheduler_fn @@ -179,28 +157,38 @@ def __init__( **optimizer_kwargs, ) - # a bit of a hack to only log from the stable server - if self.dht_manager.disable_logging_checkpointing: - warnings.warn( - "This machine is not a persistent machine. 
Checkpointing/Logging has been disabled.", UserWarning + self._parse_env_initial_peers() + + self.dht = hivemind.DHT( + start=True, + initial_peers=initial_peers, + host_maddrs=host_maddrs if host_maddrs is not None else ["/ip4/0.0.0.0/tcp/0", "/ip4/0.0.0.0/udp/0/quic"], + ) + + visible_addresses = [ + str(a) for a in self.dht.get_visible_maddrs() if not ipaddress.ip_address(a.values()[0]).is_loopback + ] + + if initial_peers is None: + log.info( + "\nOther machines can connect running the same command:\n" + f"INITIAL_PEERS={','.join(visible_addresses)} python ...\n" + "or passing the peers to the strategy:\n" + f"CollaborativeStrategy(initial_peers='{','.join(visible_addresses)}')" ) - rank_zero_only.rank = 1 if self.dht_manager.disable_logging_checkpointing else 0 + self._hivemind_initialized = False + def _parse_env_initial_peers(self) -> None: + initial_peers = os.environ.get(self.INITIAL_PEERS_ENV, self._initial_peers) + self._initial_peers = initial_peers.split(",") if isinstance(initial_peers, str) else self._initial_peers + @property def num_peers(self) -> int: if self._opt: return self._opt.tracker.global_progress.num_peers return 1 - @property - def dht(self) -> "hivemind.DHT": - """Hivemind Distributed Hash Table which stores values across all peers. - - See documentation for more details: `https://learning-at-home.readthedocs.io/en/latest/modules/dht.html` - """ - return self.dht_manager.dht - @property def root_device(self) -> torch.device: from pytorch_lightning.accelerators.cpu import CPUAccelerator @@ -361,167 +349,3 @@ def load_state_dict(self, state_dict: Dict) -> None: def state_dict(self) -> Dict: return self.scheduler.state_dict() - - -class DHTManager: - ENDPOINT_ENV: str = "PL_ENDPOINT" - PEER_ENDPOINT_ENV: str = "PL_PEER_ENDPOINT" - INITIAL_PEERS_ENV: str = "PL_INITIAL_PEERS" - HOST_ENV: str = "PL_HOST" - PORT_ENV: str = "PL_PORT" - DEFAULT_HOST: str = "0.0.0.0" - DEFAULT_PORT: int = 1440 - - def __init__( - self, - host_maddrs: Optional[List], - initial_peers: Optional[Union[str, List]], - persistent: bool, - endpoint: Optional[bool], - peer_endpoint: Optional[str], - host: Optional[str], - port: Optional[int], - retry_endpoint_attempts: int = 5, - retry_endpoint_sleep_duration: int = 5, - ) -> None: - """Manages the `hivemind.DHT` connection and provides a side-car endpoint server for initial peer access. 
- - Arguments: - - host_maddrs: :paramref:`~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.host_maddrs` - - initial_peers: :paramref:`~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.initial_peers` - - persistent: :paramref:`~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.persistent` - - endpoint: :paramref:`~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.endpoint` - - peer_endpoint: :paramref:`~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.peer_endpoint` - - host: :paramref:`~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.host` - - port: :paramref:`~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.port` - - retry_endpoint_attempts: - :paramref:`~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.retry_endpoint_attempts` - - retry_endpoint_sleep_duration: - :paramref: - `~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.retry_endpoint_sleep_duration` - """ - self._persistent = persistent - self._endpoint = endpoint - self._initial_peers = initial_peers - self._peer_endpoint = peer_endpoint - self._host = host - self._port = port - - self._parse_env_vars() - - if self._peer_endpoint and self._initial_peers is None: - self._initial_peers = self._get_initial_peers_from_endpoint( - retry_initial_peers=retry_endpoint_attempts, retry_peer_sleep_duration=retry_endpoint_sleep_duration - ) - - self.dht = hivemind.DHT( - start=True, - initial_peers=self._initial_peers, - host_maddrs=host_maddrs if host_maddrs is not None else ["/ip4/0.0.0.0/tcp/0", "/ip4/0.0.0.0/udp/0/quic"], - ) - - visible_addresses = [ - str(a) for a in self.dht.get_visible_maddrs() if not ipaddress.ip_address(a.values()[0]).is_loopback - ] - - if self._endpoint: - self._host = self._host if self._host is not None else self.DEFAULT_HOST - self._port = self._port if self._port is not None else self.DEFAULT_PORT - self._start_server_process(self._host, self._port) - self._log_endpoint_helper_message(visible_addresses) - elif self._peer_endpoint: - log.info("Machine received initial peers from endpoint.") - elif self._initial_peers is None: - log.info( - "\nOther machines can connect running the same command:\n" - f"INITIAL_PEERS={','.join(visible_addresses)} python ...\n" - "or passing the peers to the strategy:\n" - f"CollaborativeStrategy(initial_peers='{','.join(visible_addresses)}')" - ) - - def _log_endpoint_helper_message(self, visible_addresses: List[str]) -> None: - assert self._host is not None - resolved_host = self._host - if "0.0.0.0" in self._host: - # use the visible multi-addresses to figure out the IP that has been exposed - # todo (sean): this is pretty hacky, worth investigating. - p = re.compile(r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+") - # todo (sean): we select one address from here, could we have multiple? 
- resolved_host = {p.findall(maddr)[0] for maddr in visible_addresses}.pop() - log.info( - "\nSidecar endpoint enabled to serve peers.\n" - "Other peers can connect via:\n" - f"PEER_ENDPOINT={resolved_host}:{self._port} python ...\n" - "or pass the peer endpoint address to the strategy:\n" - f"CollaborativeStrategy(peer_endpoint='{resolved_host}:{self._port}')" - ) - - def _start_server_process(self, host: str, port: int) -> None: - dht = self.dht - - class DHTHandler(BaseHTTPRequestHandler): - def do_GET(self) -> None: - """Respond to a GET request.""" - self.send_response(200) - self.send_header("Content-type", "text/html") - self.end_headers() - - visible_peers = [ - str(a) for a in dht.get_visible_maddrs() if not ipaddress.ip_address(a.values()[0]).is_loopback - ] - - self.wfile.write("\n".join(visible_peers).encode()) - - server = http.server.ThreadingHTTPServer((host, int(port)), DHTHandler) - thread = threading.Thread(target=server.serve_forever) - thread.daemon = True - thread.start() - - def _get_initial_peers_from_endpoint(self, retry_initial_peers: int, retry_peer_sleep_duration: int) -> List: - peers = None - for _ in range(retry_initial_peers): - try: - peers = self._get_peers() - break - except requests.exceptions.RequestException: - log.info(f"Failed to get peers, retrying in {retry_peer_sleep_duration} seconds...") - time.sleep(retry_peer_sleep_duration) - if peers is None: - raise MisconfigurationException( - f"Unable to get peers. Tried {retry_initial_peers} times waiting {retry_peer_sleep_duration}s." - f"These parameters can be extended by passing " - "to the strategy (CollaborativeStrategy(retry_connection=x, retry_sleep_duration=y))." - ) - log.info(f"Received initial peers from collaborative server: {peers}") - return peers - - def _get_peers(self) -> List[str]: - assert self._peer_endpoint is not None - url = f"http://{self._peer_endpoint}" if not self._peer_endpoint.startswith("http://") else self._peer_endpoint - r = requests.get(url) - return r.text.split(",") - - def _parse_env_vars(self) -> None: - endpoint = os.environ.get(self.ENDPOINT_ENV, self._endpoint) - self._endpoint = endpoint == "1" if isinstance(endpoint, str) else endpoint - self._peer_endpoint = os.environ.get(self.PEER_ENDPOINT_ENV, self._peer_endpoint) - initial_peers = os.environ.get(self.INITIAL_PEERS_ENV, self._initial_peers) - self._initial_peers = initial_peers.split(",") if isinstance(initial_peers, str) else initial_peers - - port = os.environ.get(self.PORT_ENV, self._port) - self._port = int(port) if isinstance(port, str) else port - self._host = os.environ.get(self.HOST_ENV, self._host) - - @property - def disable_logging_checkpointing(self) -> bool: - # if this node is a peer, we do not log/checkpoint in persistent mode. 
- return self._persistent and (self._initial_peers is not None or self._peer_endpoint is not None) diff --git a/tests/tests_pytorch/strategies/test_collaborative.py b/tests/tests_pytorch/strategies/test_collaborative.py index fefb5c13e0db1..5741681f81289 100644 --- a/tests/tests_pytorch/strategies/test_collaborative.py +++ b/tests/tests_pytorch/strategies/test_collaborative.py @@ -6,13 +6,11 @@ from unittest.mock import PropertyMock import pytest -import requests import torch from torch.optim import Optimizer import pytorch_lightning as pl from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.plugins.environments.lightning_environment import find_free_network_port from pytorch_lightning.strategies import CollaborativeStrategy from pytorch_lightning.strategies.collaborative import HiveMindScheduler from pytorch_lightning.utilities import _HIVEMIND_AVAILABLE @@ -39,37 +37,6 @@ def test_strategy(mock_dht): assert trainer.strategy == strategy -@RunIf(hivemind=True) -@mock.patch("hivemind.DHT", autospec=True) -@mock.patch("pytorch_lightning.strategies.collaborative.DHTManager._get_peers", autospec=True) -@pytest.mark.parametrize( - "initial_peers,peer_endpoint", - [(["TEST"], None), (None, "localhost:153")], -) -def test_logging_disabled_when_second_peer(mock_dht, mock_http, initial_peers, peer_endpoint): - """Test when we are a second peer (passing initial peers or peer endpoint) we warn the user that - logging/checkpointing will be disabled.""" - with pytest.warns(UserWarning, match="This machine is not a persistent machine"): - CollaborativeStrategy(target_batch_size=1, initial_peers=initial_peers, peer_endpoint=peer_endpoint) - - -@RunIf(hivemind=True) -@mock.patch.dict( - os.environ, - {"HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor", "PL_PORT": str(find_free_network_port())}, - clear=True, -) -@pytest.mark.parametrize( - "endpoint,expected_message", - [(False, "INITIAL_PEERS"), (True, "Sidecar endpoint enabled to serve peers.")], -) -def test_initial_peer_message(caplog, endpoint, expected_message): - model = BoringModel() - trainer = pl.Trainer(strategy=CollaborativeStrategy(target_batch_size=1, endpoint=endpoint), fast_dev_run=True) - trainer.fit(model) - assert expected_message in caplog.text - - @RunIf(hivemind=True) @mock.patch.dict(os.environ, {"HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor"}, clear=True) def test_optimizer_wrapped(): @@ -109,24 +76,14 @@ def configure_optimizers(self): { "HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor", "PL_INITIAL_PEERS": "TEST_PEERS", - "PL_HOST": "TEST_HOST", - "PL_PORT": "1300", - "PL_ENDPOINT": "1", - "PL_PEER_ENDPOINT": "TEST_PEER_ENDPOINT", }, clear=True, ) @mock.patch("hivemind.DHT", autospec=True) -@mock.patch("pytorch_lightning.strategies.collaborative.DHTManager._get_peers", autospec=True) -@mock.patch("http.server.ThreadingHTTPServer", autospec=True) -def test_env_variables_parsed(mock_dht, mock_peers, mock_server): +def test_env_variables_parsed(mock_dht): """Test that env variables are parsed correctly.""" strategy = CollaborativeStrategy(target_batch_size=1) - assert strategy.dht_manager._initial_peers == ["TEST_PEERS"] - assert strategy.dht_manager._host == "TEST_HOST" - assert strategy.dht_manager._port == 1300 - assert strategy.dht_manager._endpoint - assert strategy.dht_manager._peer_endpoint == "TEST_PEER_ENDPOINT" + assert strategy._initial_peers == ["TEST_PEERS"] @RunIf(hivemind=True) @@ -206,9 +163,8 @@ def test_warn_if_argument_passed(delay_grad_averaging, 
delay_state_averaging, de @RunIf(hivemind=True) @mock.patch.dict(os.environ, {"HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor"}, clear=True) -@mock.patch("http.server.ThreadingHTTPServer", autospec=True) @mock.patch("pytorch_lightning.strategies.collaborative.CollaborativeStrategy.num_peers", new_callable=PropertyMock) -def test_args_passed_to_optimizer(mock_peers, mock_server): +def test_args_passed_to_optimizer(mock_peers): """Test to ensure arguments are correctly passed to the hivemind optimizer wrapper.""" mock_peers.return_value = 1 compression = hivemind.ScaledFloat16Compression() @@ -355,18 +311,3 @@ def on_fit_start(self) -> None: ) with pytest.raises(SystemExit): trainer.fit(model) - - -@RunIf(hivemind=True) -def test_raise_when_peer_endpoint_unsuccessful(caplog): - port = find_free_network_port() - with pytest.raises(MisconfigurationException, match="Unable to get peers"): - with mock.patch("requests.get", wraps=requests.get) as requests_mock: - CollaborativeStrategy( - target_batch_size=1, - peer_endpoint=f"localhost:{port}", - retry_endpoint_attempts=10, - retry_endpoint_sleep_duration=0, - ) - assert "Failed to get peers, retrying" in caplog.text - assert requests_mock.call_count == 10 From 89735d2201fd6cecf54d752ae01ccee015b4f342 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 23 Jun 2022 16:54:45 +0100 Subject: [PATCH 2/4] Fix references, change filename --- src/pytorch_lightning/strategies/__init__.py | 2 +- .../{collaborative.py => hivemind.py} | 26 +++---------------- ...test_collaborative.py => test_hivemind.py} | 6 ++--- 3 files changed, 7 insertions(+), 27 deletions(-) rename src/pytorch_lightning/strategies/{collaborative.py => hivemind.py} (91%) rename tests/tests_pytorch/strategies/{test_collaborative.py => test_hivemind.py} (97%) diff --git a/src/pytorch_lightning/strategies/__init__.py b/src/pytorch_lightning/strategies/__init__.py index f59d976edf439..ab79bd4fd70d9 100644 --- a/src/pytorch_lightning/strategies/__init__.py +++ b/src/pytorch_lightning/strategies/__init__.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from pytorch_lightning.strategies.bagua import BaguaStrategy # noqa: F401 -from pytorch_lightning.strategies.collaborative import HivemindStrategy # noqa: F401 from pytorch_lightning.strategies.ddp import DDPStrategy # noqa: F401 from pytorch_lightning.strategies.ddp2 import DDP2Strategy # noqa: F401 from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy # noqa: F401 @@ -20,6 +19,7 @@ from pytorch_lightning.strategies.dp import DataParallelStrategy # noqa: F401 from pytorch_lightning.strategies.fully_sharded import DDPFullyShardedStrategy # noqa: F401 from pytorch_lightning.strategies.fully_sharded_native import DDPFullyShardedNativeStrategy # noqa: F401 +from pytorch_lightning.strategies.hivemind import HivemindStrategy # noqa: F401 from pytorch_lightning.strategies.horovod import HorovodStrategy # noqa: F401 from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy # noqa: F401 from pytorch_lightning.strategies.ipu import IPUStrategy # noqa: F401 diff --git a/src/pytorch_lightning/strategies/collaborative.py b/src/pytorch_lightning/strategies/hivemind.py similarity index 91% rename from src/pytorch_lightning/strategies/collaborative.py rename to src/pytorch_lightning/strategies/hivemind.py index ed9067591d912..1dca8001489bb 100644 --- a/src/pytorch_lightning/strategies/collaborative.py +++ b/src/pytorch_lightning/strategies/hivemind.py @@ -69,11 +69,11 @@ def __init__( corresponding :meth:`hivemind.Optimizer.step` call. delay_optimizer_step: Run optimizer in background, apply results in future .step. requires - :paramref:`~pytorch_lightning.strategies.collaborative.HivemindStrategy.offload_optimizer`. + :paramref:`~pytorch_lightning.strategies.hivemind.HivemindStrategy.offload_optimizer`. delay_grad_averaging: Average gradients in background; requires - :paramref:`~pytorch_lightning.strategies.collaborative.HivemindStrategy.offload_optimizer` and - :paramref:`~pytorch_lightning.strategies.collaborative.HivemindStrategy.delay_optimizer_step`. + :paramref:`~pytorch_lightning.strategies.hivemind.HivemindStrategy.offload_optimizer` and + :paramref:`~pytorch_lightning.strategies.hivemind.HivemindStrategy.delay_optimizer_step`. offload_optimizer: Offload the optimizer to host memory, saving GPU memory for parameters and gradients. @@ -106,26 +106,6 @@ def __init__( initial_peers: If connecting to a running process, a list of initial peers needs to be passed in. This can also be set via the env variable ``INITIAL_PEERS``. - endpoint: Enable if a side-car endpoint server is required on the process to server initial peers. - This is useful when using some form of orchestration such as torchelastic. - - peer_endpoint: The endpoint to request initial peers from. - - persistent: When using an endpoint, this controls whether other processes that are not the endpoint - server log/checkpoint. If ``persistent`` is True, we do not log/checkpoint from other processes. - - host: When creating the endpoint, the host IP to use. - - port: When creating the endpoint, the host port to use. - - retry_endpoint_attempts: When connecting to the - :paramref:`~pytorch_lightning.strategies.collaborative.HivemindStrategy.peer_endpoint`, - how many time to retry before raising an exception. - - retry_endpoint_sleep_duration: When connecting to the - :paramref:`~pytorch_lightning.strategies.collaborative.HivemindStrategy.peer_endpoint`, - how long to wait between retries. - **optimizer_kwargs: kwargs are passed to the :class:`hivemind.Optimizer` class. 
""" if not _HIVEMIND_AVAILABLE or platform.system() != "Linux": diff --git a/tests/tests_pytorch/strategies/test_collaborative.py b/tests/tests_pytorch/strategies/test_hivemind.py similarity index 97% rename from tests/tests_pytorch/strategies/test_collaborative.py rename to tests/tests_pytorch/strategies/test_hivemind.py index 40eca70da44d2..58ae523858d53 100644 --- a/tests/tests_pytorch/strategies/test_collaborative.py +++ b/tests/tests_pytorch/strategies/test_hivemind.py @@ -12,7 +12,7 @@ import pytorch_lightning as pl from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.strategies import HivemindStrategy -from pytorch_lightning.strategies.collaborative import HiveMindScheduler +from pytorch_lightning.strategies.hivemind import HiveMindScheduler from pytorch_lightning.utilities import _HIVEMIND_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import STEP_OUTPUT @@ -22,7 +22,7 @@ import hivemind -@mock.patch("pytorch_lightning.strategies.collaborative._HIVEMIND_AVAILABLE", False) +@mock.patch("pytorch_lightning.strategies.hivemind._HIVEMIND_AVAILABLE", False) def test_raise_exception_if_hivemind_unavailable(): """Test that we raise an exception when Hivemind is not available.""" with pytest.raises(MisconfigurationException, match="you must have Hivemind installed"): @@ -161,7 +161,7 @@ def test_warn_if_argument_passed(delay_grad_averaging, delay_state_averaging, de @RunIf(hivemind=True) @mock.patch.dict(os.environ, {"HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor"}, clear=True) -@mock.patch("pytorch_lightning.strategies.collaborative.HivemindStrategy.num_peers", new_callable=PropertyMock) +@mock.patch("pytorch_lightning.strategies.hivemind.HivemindStrategy.num_peers", new_callable=PropertyMock) def test_args_passed_to_optimizer(mock_peers): """Test to ensure arguments are correctly passed to the hivemind optimizer wrapper.""" mock_peers.return_value = 1 From 1470fa156ad7143ca489c6777597c744d8b66e4d Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 23 Jun 2022 16:58:48 +0100 Subject: [PATCH 3/4] Add CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fd8118986132f..8cfed7defa4e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Hivemind Strategy * Added `CollaborativeStrategy` ([#12842](https://github.com/PyTorchLightning/pytorch-lightning/pull/12842)) * Renamed `CollaborativeStrategy` to `HivemindStrategy` ([#13388](https://github.com/PyTorchLightning/pytorch-lightning/pull/13388)) + * Remove unnecessary endpoint logic, rename `collaborative` to `hivemind` ([#13392](https://github.com/PyTorchLightning/pytorch-lightning/pull/13392)) - Include a version suffix for new "last" checkpoints of later runs in the same directory ([#12902](https://github.com/PyTorchLightning/pytorch-lightning/pull/12902)) From 546a812015557ff3369f19b1244dc07228540181 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 24 Jun 2022 11:09:46 +0100 Subject: [PATCH 4/4] Address review --- CHANGELOG.md | 2 +- docs/source-pytorch/common_usecases.rst | 2 +- docs/source-pytorch/extensions/strategy.rst | 2 +- docs/source-pytorch/index.rst | 4 ++-- .../{collaborative_training.rst => hivemind.rst} | 8 ++++---- ...ollaborative_training_basic.rst => hivemind_basic.rst} | 2 +- ...laborative_training_expert.rst => hivemind_expert.rst} | 2 +- ...raining_intermediate.rst => hivemind_intermediate.rst} | 2 +- src/pytorch_lightning/strategies/hivemind.py | 2 +- 9 files changed, 13 insertions(+), 13 deletions(-) rename docs/source-pytorch/strategies/{collaborative_training.rst => hivemind.rst} (86%) rename docs/source-pytorch/strategies/{collaborative_training_basic.rst => hivemind_basic.rst} (98%) rename docs/source-pytorch/strategies/{collaborative_training_expert.rst => hivemind_expert.rst} (98%) rename docs/source-pytorch/strategies/{collaborative_training_intermediate.rst => hivemind_intermediate.rst} (98%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8cfed7defa4e1..22ee454f89ec2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,7 +44,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Hivemind Strategy * Added `CollaborativeStrategy` ([#12842](https://github.com/PyTorchLightning/pytorch-lightning/pull/12842)) * Renamed `CollaborativeStrategy` to `HivemindStrategy` ([#13388](https://github.com/PyTorchLightning/pytorch-lightning/pull/13388)) - * Remove unnecessary endpoint logic, rename `collaborative` to `hivemind` ([#13392](https://github.com/PyTorchLightning/pytorch-lightning/pull/13392)) + * Removed unnecessary endpoint logic, renamed `collaborative` to `hivemind` ([#13392](https://github.com/PyTorchLightning/pytorch-lightning/pull/13392)) - Include a version suffix for new "last" checkpoints of later runs in the same directory ([#12902](https://github.com/PyTorchLightning/pytorch-lightning/pull/12902)) diff --git a/docs/source-pytorch/common_usecases.rst b/docs/source-pytorch/common_usecases.rst index 93646296d2cf8..307a32f03be41 100644 --- a/docs/source-pytorch/common_usecases.rst +++ b/docs/source-pytorch/common_usecases.rst @@ -127,7 +127,7 @@ Customize and extend Lightning for things like custom hardware or distributed st :header: Train on multiple machines over the internet :description: Train on local machines or unreliable GPUs across the internet. :col_css: col-md-12 - :button_link: strategies/collaborative_training + :button_link: strategies/hivemind :height: 100 .. 
displayitem:: diff --git a/docs/source-pytorch/extensions/strategy.rst b/docs/source-pytorch/extensions/strategy.rst index 95c48e09496e6..0cc426225ca36 100644 --- a/docs/source-pytorch/extensions/strategy.rst +++ b/docs/source-pytorch/extensions/strategy.rst @@ -77,7 +77,7 @@ The below table lists all relevant strategies available in Lightning with their - Strategy for training using the Bagua library, with advanced distributed training algorithms and system optimizations. :ref:`Learn more. ` * - collaborative - :class:`~pytorch_lightning.strategies.HivemindStrategy` - - Strategy for training collaboratively on local machines or unreliable GPUs across the internet. :ref:`Learn more. ` + - Strategy for training collaboratively on local machines or unreliable GPUs across the internet. :ref:`Learn more. ` * - fsdp - :class:`~pytorch_lightning.strategies.DDPFullyShardedStrategy` - Strategy for Fully Sharded Data Parallel provided by FairScale. :ref:`Learn more. ` diff --git a/docs/source-pytorch/index.rst b/docs/source-pytorch/index.rst index fad7cb006079d..a61990102ab2e 100644 --- a/docs/source-pytorch/index.rst +++ b/docs/source-pytorch/index.rst @@ -203,7 +203,7 @@ Current Lightning Users clouds/cluster Save and load model progress Save memory with half-precision - Training over the internet + Training over the internet advanced/model_parallel clouds/cloud_training Train on single or multiple GPUs @@ -247,7 +247,7 @@ Current Lightning Users Metrics Model Model Parallel - Collaborative Training + Collaborative Training Plugins Progress bar Production diff --git a/docs/source-pytorch/strategies/collaborative_training.rst b/docs/source-pytorch/strategies/hivemind.rst similarity index 86% rename from docs/source-pytorch/strategies/collaborative_training.rst rename to docs/source-pytorch/strategies/hivemind.rst index 72e9d13f9133a..5695f5695fcaf 100644 --- a/docs/source-pytorch/strategies/collaborative_training.rst +++ b/docs/source-pytorch/strategies/hivemind.rst @@ -1,4 +1,4 @@ -.. _collaborative_training: +.. _hivemind: ##################################################### Training on unreliable mixed GPUs across the internet @@ -17,7 +17,7 @@ Training on unreliable mixed GPUs across the internet :header: 1: Training across multiple machines over the internet :description: Quick setup to start training on multiple machines. :col_css: col-md-4 - :button_link: collaborative_training_basic.html + :button_link: hivemind_basic.html :height: 200 :tag: basic @@ -25,7 +25,7 @@ Training on unreliable mixed GPUs across the internet :header: 2: Speed up training by enabling under-the-hood optimizations :description: Learn which flags to use with the HivemindStrategy to speed up training. :col_css: col-md-4 - :button_link: collaborative_training_intermediate.html + :button_link: hivemind_intermediate.html :height: 200 :tag: intermediate @@ -33,7 +33,7 @@ Training on unreliable mixed GPUs across the internet :header: 3: Optimize Memory and Communication using compression hooks :description: Enable gradient buffer optimizations and communication improvements to reduce bottlenecks in communication. 
:col_css: col-md-4 - :button_link: collaborative_training_expert.html + :button_link: hivemind_expert.html :height: 200 :tag: expert diff --git a/docs/source-pytorch/strategies/collaborative_training_basic.rst b/docs/source-pytorch/strategies/hivemind_basic.rst similarity index 98% rename from docs/source-pytorch/strategies/collaborative_training_basic.rst rename to docs/source-pytorch/strategies/hivemind_basic.rst index 108f6197fdd09..98e90cbfe94cd 100644 --- a/docs/source-pytorch/strategies/collaborative_training_basic.rst +++ b/docs/source-pytorch/strategies/hivemind_basic.rst @@ -1,6 +1,6 @@ :orphan: -.. _collaborative_training_basic: +.. _hivemind_basic: Training on unreliable mixed GPUs across the internet (Basic) ============================================================= diff --git a/docs/source-pytorch/strategies/collaborative_training_expert.rst b/docs/source-pytorch/strategies/hivemind_expert.rst similarity index 98% rename from docs/source-pytorch/strategies/collaborative_training_expert.rst rename to docs/source-pytorch/strategies/hivemind_expert.rst index 5b8a5e8b4c49e..3fa55afb132fd 100644 --- a/docs/source-pytorch/strategies/collaborative_training_expert.rst +++ b/docs/source-pytorch/strategies/hivemind_expert.rst @@ -1,6 +1,6 @@ :orphan: -.. _collaborative_training_expert: +.. _hivemind_expert: Training on unreliable mixed GPUs across the internet (Expert) ============================================================== diff --git a/docs/source-pytorch/strategies/collaborative_training_intermediate.rst b/docs/source-pytorch/strategies/hivemind_intermediate.rst similarity index 98% rename from docs/source-pytorch/strategies/collaborative_training_intermediate.rst rename to docs/source-pytorch/strategies/hivemind_intermediate.rst index 38d6c6a3421b6..cec004219f5d5 100644 --- a/docs/source-pytorch/strategies/collaborative_training_intermediate.rst +++ b/docs/source-pytorch/strategies/hivemind_intermediate.rst @@ -1,6 +1,6 @@ :orphan: -.. _collaborative_training_intermediate: +.. _hivemind_intermediate: Training on unreliable mixed GPUs across the internet (Intermediate) ==================================================================== diff --git a/src/pytorch_lightning/strategies/hivemind.py b/src/pytorch_lightning/strategies/hivemind.py index 1dca8001489bb..34e2f40b2ec40 100644 --- a/src/pytorch_lightning/strategies/hivemind.py +++ b/src/pytorch_lightning/strategies/hivemind.py @@ -49,7 +49,7 @@ def __init__( ): """Provides capabilities to train using the Hivemind Library, training collaboratively across the internet with unreliable machines. For more information, `refer to the docs `__. + lightning.readthedocs.io/en/latest/strategies/hivemind.html>`__. .. warning:: ``HivemindStrategy`` is experimental and subject to change.
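
For illustration, a minimal sketch of the peer workflow these patches leave in place, assuming hivemind is installed on a Linux machine; the batch size and the multiaddress shown are placeholder values and are not part of the patch itself:

    import pytorch_lightning as pl
    from pytorch_lightning.demos.boring_classes import BoringModel
    from pytorch_lightning.strategies import HivemindStrategy

    # First peer: no initial_peers given, so the strategy starts a fresh DHT and
    # logs the visible multiaddresses other machines can use to join.
    # target_batch_size is required; the value here is arbitrary for illustration.
    trainer = pl.Trainer(strategy=HivemindStrategy(target_batch_size=8192))
    trainer.fit(BoringModel())

    # Any additional peer joins by passing the logged addresses to the strategy,
    # as suggested in the log message above ...
    # trainer = pl.Trainer(
    #     strategy=HivemindStrategy(target_batch_size=8192, initial_peers="/ip4/<host>/tcp/<port>/p2p/<id>")
    # )
    # ... or by exporting them before launch, which _parse_env_initial_peers() picks up:
    # PL_INITIAL_PEERS=<comma-separated multiaddresses> python ...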