From d74710ec0586dc9be73dd5f1515de60e30b7ceda Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 23 Jun 2022 15:31:10 +0100 Subject: [PATCH 1/4] Remove endpoint after collaborate app/dht CLI --- .../strategies/collaborative.py | 230 ++---------------- .../strategies/test_collaborative.py | 65 +---- 2 files changed, 30 insertions(+), 265 deletions(-) diff --git a/src/pytorch_lightning/strategies/collaborative.py b/src/pytorch_lightning/strategies/collaborative.py index 715f0d58e9c07..2c3ca588b7ab4 100644 --- a/src/pytorch_lightning/strategies/collaborative.py +++ b/src/pytorch_lightning/strategies/collaborative.py @@ -1,22 +1,15 @@ -import http import ipaddress import logging import os import platform -import re -import threading -import time -import warnings -from http.server import BaseHTTPRequestHandler from typing import Any, Callable, Dict, List, Optional, Union -import requests import torch from torch import Tensor import pytorch_lightning as pl from pytorch_lightning.strategies.strategy import Strategy, TBroadcast -from pytorch_lightning.utilities import rank_zero_only, rank_zero_warn +from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.data import extract_batch_size from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -33,6 +26,8 @@ class CollaborativeStrategy(Strategy): + INITIAL_PEERS_ENV: str = "PL_INITIAL_PEERS" + def __init__( self, target_batch_size: int, @@ -50,13 +45,6 @@ def __init__( averager_opts: Optional[Dict] = None, host_maddrs: Optional[List] = None, initial_peers: Optional[Union[str, List]] = None, - endpoint: Optional[bool] = None, - peer_endpoint: Optional[str] = None, - persistent: bool = True, - host: Optional[str] = None, - port: Optional[int] = None, - retry_endpoint_attempts: int = 5, - retry_endpoint_sleep_duration: int = 5, **optimizer_kwargs: Any, ): """Provides capabilities to train using the Hivemind Library, training collaboratively across the internet @@ -147,17 +135,7 @@ def __init__( ) super().__init__() - self.dht_manager = DHTManager( - persistent=persistent, - endpoint=endpoint, - peer_endpoint=peer_endpoint, - host=host, - port=port, - host_maddrs=host_maddrs, - initial_peers=initial_peers, - retry_endpoint_attempts=retry_endpoint_attempts, - retry_endpoint_sleep_duration=retry_endpoint_sleep_duration, - ) + self._initial_peers = initial_peers self._target_batch_size = target_batch_size self._batch_size = batch_size self._scheduler_fn = scheduler_fn @@ -179,28 +157,38 @@ def __init__( **optimizer_kwargs, ) - # a bit of a hack to only log from the stable server - if self.dht_manager.disable_logging_checkpointing: - warnings.warn( - "This machine is not a persistent machine. 
Checkpointing/Logging has been disabled.", UserWarning + self._parse_env_initial_peers() + + self.dht = hivemind.DHT( + start=True, + initial_peers=initial_peers, + host_maddrs=host_maddrs if host_maddrs is not None else ["/ip4/0.0.0.0/tcp/0", "/ip4/0.0.0.0/udp/0/quic"], + ) + + visible_addresses = [ + str(a) for a in self.dht.get_visible_maddrs() if not ipaddress.ip_address(a.values()[0]).is_loopback + ] + + if initial_peers is None: + log.info( + "\nOther machines can connect running the same command:\n" + f"INITIAL_PEERS={','.join(visible_addresses)} python ...\n" + "or passing the peers to the strategy:\n" + f"CollaborativeStrategy(initial_peers='{','.join(visible_addresses)}')" ) - rank_zero_only.rank = 1 if self.dht_manager.disable_logging_checkpointing else 0 + self._hivemind_initialized = False + def _parse_env_initial_peers(self) -> None: + initial_peers = os.environ.get(self.INITIAL_PEERS_ENV, self._initial_peers) + self._initial_peers = initial_peers.split(",") if isinstance(initial_peers, str) else self._initial_peers + @property def num_peers(self) -> int: if self._opt: return self._opt.tracker.global_progress.num_peers return 1 - @property - def dht(self) -> "hivemind.DHT": - """Hivemind Distributed Hash Table which stores values across all peers. - - See documentation for more details: `https://learning-at-home.readthedocs.io/en/latest/modules/dht.html` - """ - return self.dht_manager.dht - @property def root_device(self) -> torch.device: from pytorch_lightning.accelerators.cpu import CPUAccelerator @@ -361,167 +349,3 @@ def load_state_dict(self, state_dict: Dict) -> None: def state_dict(self) -> Dict: return self.scheduler.state_dict() - - -class DHTManager: - ENDPOINT_ENV: str = "PL_ENDPOINT" - PEER_ENDPOINT_ENV: str = "PL_PEER_ENDPOINT" - INITIAL_PEERS_ENV: str = "PL_INITIAL_PEERS" - HOST_ENV: str = "PL_HOST" - PORT_ENV: str = "PL_PORT" - DEFAULT_HOST: str = "0.0.0.0" - DEFAULT_PORT: int = 1440 - - def __init__( - self, - host_maddrs: Optional[List], - initial_peers: Optional[Union[str, List]], - persistent: bool, - endpoint: Optional[bool], - peer_endpoint: Optional[str], - host: Optional[str], - port: Optional[int], - retry_endpoint_attempts: int = 5, - retry_endpoint_sleep_duration: int = 5, - ) -> None: - """Manages the `hivemind.DHT` connection and provides a side-car endpoint server for initial peer access. 
- - Arguments: - - host_maddrs: :paramref:`~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.host_maddrs` - - initial_peers: :paramref:`~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.initial_peers` - - persistent: :paramref:`~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.persistent` - - endpoint: :paramref:`~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.endpoint` - - peer_endpoint: :paramref:`~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.peer_endpoint` - - host: :paramref:`~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.host` - - port: :paramref:`~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.port` - - retry_endpoint_attempts: - :paramref:`~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.retry_endpoint_attempts` - - retry_endpoint_sleep_duration: - :paramref: - `~pytorch_lightning.strategies.collaborative.CollaborativeStrategy.retry_endpoint_sleep_duration` - """ - self._persistent = persistent - self._endpoint = endpoint - self._initial_peers = initial_peers - self._peer_endpoint = peer_endpoint - self._host = host - self._port = port - - self._parse_env_vars() - - if self._peer_endpoint and self._initial_peers is None: - self._initial_peers = self._get_initial_peers_from_endpoint( - retry_initial_peers=retry_endpoint_attempts, retry_peer_sleep_duration=retry_endpoint_sleep_duration - ) - - self.dht = hivemind.DHT( - start=True, - initial_peers=self._initial_peers, - host_maddrs=host_maddrs if host_maddrs is not None else ["/ip4/0.0.0.0/tcp/0", "/ip4/0.0.0.0/udp/0/quic"], - ) - - visible_addresses = [ - str(a) for a in self.dht.get_visible_maddrs() if not ipaddress.ip_address(a.values()[0]).is_loopback - ] - - if self._endpoint: - self._host = self._host if self._host is not None else self.DEFAULT_HOST - self._port = self._port if self._port is not None else self.DEFAULT_PORT - self._start_server_process(self._host, self._port) - self._log_endpoint_helper_message(visible_addresses) - elif self._peer_endpoint: - log.info("Machine received initial peers from endpoint.") - elif self._initial_peers is None: - log.info( - "\nOther machines can connect running the same command:\n" - f"INITIAL_PEERS={','.join(visible_addresses)} python ...\n" - "or passing the peers to the strategy:\n" - f"CollaborativeStrategy(initial_peers='{','.join(visible_addresses)}')" - ) - - def _log_endpoint_helper_message(self, visible_addresses: List[str]) -> None: - assert self._host is not None - resolved_host = self._host - if "0.0.0.0" in self._host: - # use the visible multi-addresses to figure out the IP that has been exposed - # todo (sean): this is pretty hacky, worth investigating. - p = re.compile(r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+") - # todo (sean): we select one address from here, could we have multiple? 
- resolved_host = {p.findall(maddr)[0] for maddr in visible_addresses}.pop() - log.info( - "\nSidecar endpoint enabled to serve peers.\n" - "Other peers can connect via:\n" - f"PEER_ENDPOINT={resolved_host}:{self._port} python ...\n" - "or pass the peer endpoint address to the strategy:\n" - f"CollaborativeStrategy(peer_endpoint='{resolved_host}:{self._port}')" - ) - - def _start_server_process(self, host: str, port: int) -> None: - dht = self.dht - - class DHTHandler(BaseHTTPRequestHandler): - def do_GET(self) -> None: - """Respond to a GET request.""" - self.send_response(200) - self.send_header("Content-type", "text/html") - self.end_headers() - - visible_peers = [ - str(a) for a in dht.get_visible_maddrs() if not ipaddress.ip_address(a.values()[0]).is_loopback - ] - - self.wfile.write("\n".join(visible_peers).encode()) - - server = http.server.ThreadingHTTPServer((host, int(port)), DHTHandler) - thread = threading.Thread(target=server.serve_forever) - thread.daemon = True - thread.start() - - def _get_initial_peers_from_endpoint(self, retry_initial_peers: int, retry_peer_sleep_duration: int) -> List: - peers = None - for _ in range(retry_initial_peers): - try: - peers = self._get_peers() - break - except requests.exceptions.RequestException: - log.info(f"Failed to get peers, retrying in {retry_peer_sleep_duration} seconds...") - time.sleep(retry_peer_sleep_duration) - if peers is None: - raise MisconfigurationException( - f"Unable to get peers. Tried {retry_initial_peers} times waiting {retry_peer_sleep_duration}s." - f"These parameters can be extended by passing " - "to the strategy (CollaborativeStrategy(retry_connection=x, retry_sleep_duration=y))." - ) - log.info(f"Received initial peers from collaborative server: {peers}") - return peers - - def _get_peers(self) -> List[str]: - assert self._peer_endpoint is not None - url = f"http://{self._peer_endpoint}" if not self._peer_endpoint.startswith("http://") else self._peer_endpoint - r = requests.get(url) - return r.text.split(",") - - def _parse_env_vars(self) -> None: - endpoint = os.environ.get(self.ENDPOINT_ENV, self._endpoint) - self._endpoint = endpoint == "1" if isinstance(endpoint, str) else endpoint - self._peer_endpoint = os.environ.get(self.PEER_ENDPOINT_ENV, self._peer_endpoint) - initial_peers = os.environ.get(self.INITIAL_PEERS_ENV, self._initial_peers) - self._initial_peers = initial_peers.split(",") if isinstance(initial_peers, str) else initial_peers - - port = os.environ.get(self.PORT_ENV, self._port) - self._port = int(port) if isinstance(port, str) else port - self._host = os.environ.get(self.HOST_ENV, self._host) - - @property - def disable_logging_checkpointing(self) -> bool: - # if this node is a peer, we do not log/checkpoint in persistent mode. 
- return self._persistent and (self._initial_peers is not None or self._peer_endpoint is not None) diff --git a/tests/tests_pytorch/strategies/test_collaborative.py b/tests/tests_pytorch/strategies/test_collaborative.py index fefb5c13e0db1..5741681f81289 100644 --- a/tests/tests_pytorch/strategies/test_collaborative.py +++ b/tests/tests_pytorch/strategies/test_collaborative.py @@ -6,13 +6,11 @@ from unittest.mock import PropertyMock import pytest -import requests import torch from torch.optim import Optimizer import pytorch_lightning as pl from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.plugins.environments.lightning_environment import find_free_network_port from pytorch_lightning.strategies import CollaborativeStrategy from pytorch_lightning.strategies.collaborative import HiveMindScheduler from pytorch_lightning.utilities import _HIVEMIND_AVAILABLE @@ -39,37 +37,6 @@ def test_strategy(mock_dht): assert trainer.strategy == strategy -@RunIf(hivemind=True) -@mock.patch("hivemind.DHT", autospec=True) -@mock.patch("pytorch_lightning.strategies.collaborative.DHTManager._get_peers", autospec=True) -@pytest.mark.parametrize( - "initial_peers,peer_endpoint", - [(["TEST"], None), (None, "localhost:153")], -) -def test_logging_disabled_when_second_peer(mock_dht, mock_http, initial_peers, peer_endpoint): - """Test when we are a second peer (passing initial peers or peer endpoint) we warn the user that - logging/checkpointing will be disabled.""" - with pytest.warns(UserWarning, match="This machine is not a persistent machine"): - CollaborativeStrategy(target_batch_size=1, initial_peers=initial_peers, peer_endpoint=peer_endpoint) - - -@RunIf(hivemind=True) -@mock.patch.dict( - os.environ, - {"HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor", "PL_PORT": str(find_free_network_port())}, - clear=True, -) -@pytest.mark.parametrize( - "endpoint,expected_message", - [(False, "INITIAL_PEERS"), (True, "Sidecar endpoint enabled to serve peers.")], -) -def test_initial_peer_message(caplog, endpoint, expected_message): - model = BoringModel() - trainer = pl.Trainer(strategy=CollaborativeStrategy(target_batch_size=1, endpoint=endpoint), fast_dev_run=True) - trainer.fit(model) - assert expected_message in caplog.text - - @RunIf(hivemind=True) @mock.patch.dict(os.environ, {"HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor"}, clear=True) def test_optimizer_wrapped(): @@ -109,24 +76,14 @@ def configure_optimizers(self): { "HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor", "PL_INITIAL_PEERS": "TEST_PEERS", - "PL_HOST": "TEST_HOST", - "PL_PORT": "1300", - "PL_ENDPOINT": "1", - "PL_PEER_ENDPOINT": "TEST_PEER_ENDPOINT", }, clear=True, ) @mock.patch("hivemind.DHT", autospec=True) -@mock.patch("pytorch_lightning.strategies.collaborative.DHTManager._get_peers", autospec=True) -@mock.patch("http.server.ThreadingHTTPServer", autospec=True) -def test_env_variables_parsed(mock_dht, mock_peers, mock_server): +def test_env_variables_parsed(mock_dht): """Test that env variables are parsed correctly.""" strategy = CollaborativeStrategy(target_batch_size=1) - assert strategy.dht_manager._initial_peers == ["TEST_PEERS"] - assert strategy.dht_manager._host == "TEST_HOST" - assert strategy.dht_manager._port == 1300 - assert strategy.dht_manager._endpoint - assert strategy.dht_manager._peer_endpoint == "TEST_PEER_ENDPOINT" + assert strategy._initial_peers == ["TEST_PEERS"] @RunIf(hivemind=True) @@ -206,9 +163,8 @@ def test_warn_if_argument_passed(delay_grad_averaging, 
delay_state_averaging, de @RunIf(hivemind=True) @mock.patch.dict(os.environ, {"HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor"}, clear=True) -@mock.patch("http.server.ThreadingHTTPServer", autospec=True) @mock.patch("pytorch_lightning.strategies.collaborative.CollaborativeStrategy.num_peers", new_callable=PropertyMock) -def test_args_passed_to_optimizer(mock_peers, mock_server): +def test_args_passed_to_optimizer(mock_peers): """Test to ensure arguments are correctly passed to the hivemind optimizer wrapper.""" mock_peers.return_value = 1 compression = hivemind.ScaledFloat16Compression() @@ -355,18 +311,3 @@ def on_fit_start(self) -> None: ) with pytest.raises(SystemExit): trainer.fit(model) - - -@RunIf(hivemind=True) -def test_raise_when_peer_endpoint_unsuccessful(caplog): - port = find_free_network_port() - with pytest.raises(MisconfigurationException, match="Unable to get peers"): - with mock.patch("requests.get", wraps=requests.get) as requests_mock: - CollaborativeStrategy( - target_batch_size=1, - peer_endpoint=f"localhost:{port}", - retry_endpoint_attempts=10, - retry_endpoint_sleep_duration=0, - ) - assert "Failed to get peers, retrying" in caplog.text - assert requests_mock.call_count == 10 From 89735d2201fd6cecf54d752ae01ccee015b4f342 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 23 Jun 2022 16:54:45 +0100 Subject: [PATCH 2/4] Fix references, change filename --- src/pytorch_lightning/strategies/__init__.py | 2 +- .../{collaborative.py => hivemind.py} | 26 +++---------------- ...test_collaborative.py => test_hivemind.py} | 6 ++--- 3 files changed, 7 insertions(+), 27 deletions(-) rename src/pytorch_lightning/strategies/{collaborative.py => hivemind.py} (91%) rename tests/tests_pytorch/strategies/{test_collaborative.py => test_hivemind.py} (97%) diff --git a/src/pytorch_lightning/strategies/__init__.py b/src/pytorch_lightning/strategies/__init__.py index f59d976edf439..ab79bd4fd70d9 100644 --- a/src/pytorch_lightning/strategies/__init__.py +++ b/src/pytorch_lightning/strategies/__init__.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from pytorch_lightning.strategies.bagua import BaguaStrategy # noqa: F401 -from pytorch_lightning.strategies.collaborative import HivemindStrategy # noqa: F401 from pytorch_lightning.strategies.ddp import DDPStrategy # noqa: F401 from pytorch_lightning.strategies.ddp2 import DDP2Strategy # noqa: F401 from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy # noqa: F401 @@ -20,6 +19,7 @@ from pytorch_lightning.strategies.dp import DataParallelStrategy # noqa: F401 from pytorch_lightning.strategies.fully_sharded import DDPFullyShardedStrategy # noqa: F401 from pytorch_lightning.strategies.fully_sharded_native import DDPFullyShardedNativeStrategy # noqa: F401 +from pytorch_lightning.strategies.hivemind import HivemindStrategy # noqa: F401 from pytorch_lightning.strategies.horovod import HorovodStrategy # noqa: F401 from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy # noqa: F401 from pytorch_lightning.strategies.ipu import IPUStrategy # noqa: F401 diff --git a/src/pytorch_lightning/strategies/collaborative.py b/src/pytorch_lightning/strategies/hivemind.py similarity index 91% rename from src/pytorch_lightning/strategies/collaborative.py rename to src/pytorch_lightning/strategies/hivemind.py index ed9067591d912..1dca8001489bb 100644 --- a/src/pytorch_lightning/strategies/collaborative.py +++ b/src/pytorch_lightning/strategies/hivemind.py @@ -69,11 +69,11 @@ def __init__( corresponding :meth:`hivemind.Optimizer.step` call. delay_optimizer_step: Run optimizer in background, apply results in future .step. requires - :paramref:`~pytorch_lightning.strategies.collaborative.HivemindStrategy.offload_optimizer`. + :paramref:`~pytorch_lightning.strategies.hivemind.HivemindStrategy.offload_optimizer`. delay_grad_averaging: Average gradients in background; requires - :paramref:`~pytorch_lightning.strategies.collaborative.HivemindStrategy.offload_optimizer` and - :paramref:`~pytorch_lightning.strategies.collaborative.HivemindStrategy.delay_optimizer_step`. + :paramref:`~pytorch_lightning.strategies.hivemind.HivemindStrategy.offload_optimizer` and + :paramref:`~pytorch_lightning.strategies.hivemind.HivemindStrategy.delay_optimizer_step`. offload_optimizer: Offload the optimizer to host memory, saving GPU memory for parameters and gradients. @@ -106,26 +106,6 @@ def __init__( initial_peers: If connecting to a running process, a list of initial peers needs to be passed in. This can also be set via the env variable ``INITIAL_PEERS``. - endpoint: Enable if a side-car endpoint server is required on the process to server initial peers. - This is useful when using some form of orchestration such as torchelastic. - - peer_endpoint: The endpoint to request initial peers from. - - persistent: When using an endpoint, this controls whether other processes that are not the endpoint - server log/checkpoint. If ``persistent`` is True, we do not log/checkpoint from other processes. - - host: When creating the endpoint, the host IP to use. - - port: When creating the endpoint, the host port to use. - - retry_endpoint_attempts: When connecting to the - :paramref:`~pytorch_lightning.strategies.collaborative.HivemindStrategy.peer_endpoint`, - how many time to retry before raising an exception. - - retry_endpoint_sleep_duration: When connecting to the - :paramref:`~pytorch_lightning.strategies.collaborative.HivemindStrategy.peer_endpoint`, - how long to wait between retries. - **optimizer_kwargs: kwargs are passed to the :class:`hivemind.Optimizer` class. 
""" if not _HIVEMIND_AVAILABLE or platform.system() != "Linux": diff --git a/tests/tests_pytorch/strategies/test_collaborative.py b/tests/tests_pytorch/strategies/test_hivemind.py similarity index 97% rename from tests/tests_pytorch/strategies/test_collaborative.py rename to tests/tests_pytorch/strategies/test_hivemind.py index 40eca70da44d2..58ae523858d53 100644 --- a/tests/tests_pytorch/strategies/test_collaborative.py +++ b/tests/tests_pytorch/strategies/test_hivemind.py @@ -12,7 +12,7 @@ import pytorch_lightning as pl from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.strategies import HivemindStrategy -from pytorch_lightning.strategies.collaborative import HiveMindScheduler +from pytorch_lightning.strategies.hivemind import HiveMindScheduler from pytorch_lightning.utilities import _HIVEMIND_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import STEP_OUTPUT @@ -22,7 +22,7 @@ import hivemind -@mock.patch("pytorch_lightning.strategies.collaborative._HIVEMIND_AVAILABLE", False) +@mock.patch("pytorch_lightning.strategies.hivemind._HIVEMIND_AVAILABLE", False) def test_raise_exception_if_hivemind_unavailable(): """Test that we raise an exception when Hivemind is not available.""" with pytest.raises(MisconfigurationException, match="you must have Hivemind installed"): @@ -161,7 +161,7 @@ def test_warn_if_argument_passed(delay_grad_averaging, delay_state_averaging, de @RunIf(hivemind=True) @mock.patch.dict(os.environ, {"HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor"}, clear=True) -@mock.patch("pytorch_lightning.strategies.collaborative.HivemindStrategy.num_peers", new_callable=PropertyMock) +@mock.patch("pytorch_lightning.strategies.hivemind.HivemindStrategy.num_peers", new_callable=PropertyMock) def test_args_passed_to_optimizer(mock_peers): """Test to ensure arguments are correctly passed to the hivemind optimizer wrapper.""" mock_peers.return_value = 1 From 1470fa156ad7143ca489c6777597c744d8b66e4d Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 23 Jun 2022 16:58:48 +0100 Subject: [PATCH 3/4] Add CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fd8118986132f..8cfed7defa4e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Hivemind Strategy * Added `CollaborativeStrategy` ([#12842](https://github.com/PyTorchLightning/pytorch-lightning/pull/12842)) * Renamed `CollaborativeStrategy` to `HivemindStrategy` ([#13388](https://github.com/PyTorchLightning/pytorch-lightning/pull/13388)) + * Remove unnecessary endpoint logic, rename `collaborative` to `hivemind` ([#13392](https://github.com/PyTorchLightning/pytorch-lightning/pull/13392)) - Include a version suffix for new "last" checkpoints of later runs in the same directory ([#12902](https://github.com/PyTorchLightning/pytorch-lightning/pull/12902)) From 546a812015557ff3369f19b1244dc07228540181 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 24 Jun 2022 11:09:46 +0100 Subject: [PATCH 4/4] Address review --- CHANGELOG.md | 2 +- docs/source-pytorch/common_usecases.rst | 2 +- docs/source-pytorch/extensions/strategy.rst | 2 +- docs/source-pytorch/index.rst | 4 ++-- .../{collaborative_training.rst => hivemind.rst} | 8 ++++---- ...ollaborative_training_basic.rst => hivemind_basic.rst} | 2 +- ...laborative_training_expert.rst => hivemind_expert.rst} | 2 +- ...raining_intermediate.rst => hivemind_intermediate.rst} | 2 +- src/pytorch_lightning/strategies/hivemind.py | 2 +- 9 files changed, 13 insertions(+), 13 deletions(-) rename docs/source-pytorch/strategies/{collaborative_training.rst => hivemind.rst} (86%) rename docs/source-pytorch/strategies/{collaborative_training_basic.rst => hivemind_basic.rst} (98%) rename docs/source-pytorch/strategies/{collaborative_training_expert.rst => hivemind_expert.rst} (98%) rename docs/source-pytorch/strategies/{collaborative_training_intermediate.rst => hivemind_intermediate.rst} (98%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8cfed7defa4e1..22ee454f89ec2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,7 +44,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Hivemind Strategy * Added `CollaborativeStrategy` ([#12842](https://github.com/PyTorchLightning/pytorch-lightning/pull/12842)) * Renamed `CollaborativeStrategy` to `HivemindStrategy` ([#13388](https://github.com/PyTorchLightning/pytorch-lightning/pull/13388)) - * Remove unnecessary endpoint logic, rename `collaborative` to `hivemind` ([#13392](https://github.com/PyTorchLightning/pytorch-lightning/pull/13392)) + * Removed unnecessary endpoint logic, renamed `collaborative` to `hivemind` ([#13392](https://github.com/PyTorchLightning/pytorch-lightning/pull/13392)) - Include a version suffix for new "last" checkpoints of later runs in the same directory ([#12902](https://github.com/PyTorchLightning/pytorch-lightning/pull/12902)) diff --git a/docs/source-pytorch/common_usecases.rst b/docs/source-pytorch/common_usecases.rst index 93646296d2cf8..307a32f03be41 100644 --- a/docs/source-pytorch/common_usecases.rst +++ b/docs/source-pytorch/common_usecases.rst @@ -127,7 +127,7 @@ Customize and extend Lightning for things like custom hardware or distributed st :header: Train on multiple machines over the internet :description: Train on local machines or unreliable GPUs across the internet. :col_css: col-md-12 - :button_link: strategies/collaborative_training + :button_link: strategies/hivemind :height: 100 .. 
displayitem:: diff --git a/docs/source-pytorch/extensions/strategy.rst b/docs/source-pytorch/extensions/strategy.rst index 95c48e09496e6..0cc426225ca36 100644 --- a/docs/source-pytorch/extensions/strategy.rst +++ b/docs/source-pytorch/extensions/strategy.rst @@ -77,7 +77,7 @@ The below table lists all relevant strategies available in Lightning with their - Strategy for training using the Bagua library, with advanced distributed training algorithms and system optimizations. :ref:`Learn more. ` * - collaborative - :class:`~pytorch_lightning.strategies.HivemindStrategy` - - Strategy for training collaboratively on local machines or unreliable GPUs across the internet. :ref:`Learn more. ` + - Strategy for training collaboratively on local machines or unreliable GPUs across the internet. :ref:`Learn more. ` * - fsdp - :class:`~pytorch_lightning.strategies.DDPFullyShardedStrategy` - Strategy for Fully Sharded Data Parallel provided by FairScale. :ref:`Learn more. ` diff --git a/docs/source-pytorch/index.rst b/docs/source-pytorch/index.rst index fad7cb006079d..a61990102ab2e 100644 --- a/docs/source-pytorch/index.rst +++ b/docs/source-pytorch/index.rst @@ -203,7 +203,7 @@ Current Lightning Users clouds/cluster Save and load model progress Save memory with half-precision - Training over the internet + Training over the internet advanced/model_parallel clouds/cloud_training Train on single or multiple GPUs @@ -247,7 +247,7 @@ Current Lightning Users Metrics Model Model Parallel - Collaborative Training + Collaborative Training Plugins Progress bar Production diff --git a/docs/source-pytorch/strategies/collaborative_training.rst b/docs/source-pytorch/strategies/hivemind.rst similarity index 86% rename from docs/source-pytorch/strategies/collaborative_training.rst rename to docs/source-pytorch/strategies/hivemind.rst index 72e9d13f9133a..5695f5695fcaf 100644 --- a/docs/source-pytorch/strategies/collaborative_training.rst +++ b/docs/source-pytorch/strategies/hivemind.rst @@ -1,4 +1,4 @@ -.. _collaborative_training: +.. _hivemind: ##################################################### Training on unreliable mixed GPUs across the internet @@ -17,7 +17,7 @@ Training on unreliable mixed GPUs across the internet :header: 1: Training across multiple machines over the internet :description: Quick setup to start training on multiple machines. :col_css: col-md-4 - :button_link: collaborative_training_basic.html + :button_link: hivemind_basic.html :height: 200 :tag: basic @@ -25,7 +25,7 @@ Training on unreliable mixed GPUs across the internet :header: 2: Speed up training by enabling under-the-hood optimizations :description: Learn which flags to use with the HivemindStrategy to speed up training. :col_css: col-md-4 - :button_link: collaborative_training_intermediate.html + :button_link: hivemind_intermediate.html :height: 200 :tag: intermediate @@ -33,7 +33,7 @@ Training on unreliable mixed GPUs across the internet :header: 3: Optimize Memory and Communication using compression hooks :description: Enable gradient buffer optimizations and communication improvements to reduce bottlenecks in communication. 
:col_css: col-md-4 - :button_link: collaborative_training_expert.html + :button_link: hivemind_expert.html :height: 200 :tag: expert diff --git a/docs/source-pytorch/strategies/collaborative_training_basic.rst b/docs/source-pytorch/strategies/hivemind_basic.rst similarity index 98% rename from docs/source-pytorch/strategies/collaborative_training_basic.rst rename to docs/source-pytorch/strategies/hivemind_basic.rst index 108f6197fdd09..98e90cbfe94cd 100644 --- a/docs/source-pytorch/strategies/collaborative_training_basic.rst +++ b/docs/source-pytorch/strategies/hivemind_basic.rst @@ -1,6 +1,6 @@ :orphan: -.. _collaborative_training_basic: +.. _hivemind_basic: Training on unreliable mixed GPUs across the internet (Basic) ============================================================= diff --git a/docs/source-pytorch/strategies/collaborative_training_expert.rst b/docs/source-pytorch/strategies/hivemind_expert.rst similarity index 98% rename from docs/source-pytorch/strategies/collaborative_training_expert.rst rename to docs/source-pytorch/strategies/hivemind_expert.rst index 5b8a5e8b4c49e..3fa55afb132fd 100644 --- a/docs/source-pytorch/strategies/collaborative_training_expert.rst +++ b/docs/source-pytorch/strategies/hivemind_expert.rst @@ -1,6 +1,6 @@ :orphan: -.. _collaborative_training_expert: +.. _hivemind_expert: Training on unreliable mixed GPUs across the internet (Expert) ============================================================== diff --git a/docs/source-pytorch/strategies/collaborative_training_intermediate.rst b/docs/source-pytorch/strategies/hivemind_intermediate.rst similarity index 98% rename from docs/source-pytorch/strategies/collaborative_training_intermediate.rst rename to docs/source-pytorch/strategies/hivemind_intermediate.rst index 38d6c6a3421b6..cec004219f5d5 100644 --- a/docs/source-pytorch/strategies/collaborative_training_intermediate.rst +++ b/docs/source-pytorch/strategies/hivemind_intermediate.rst @@ -1,6 +1,6 @@ :orphan: -.. _collaborative_training_intermediate: +.. _hivemind_intermediate: Training on unreliable mixed GPUs across the internet (Intermediate) ==================================================================== diff --git a/src/pytorch_lightning/strategies/hivemind.py b/src/pytorch_lightning/strategies/hivemind.py index 1dca8001489bb..34e2f40b2ec40 100644 --- a/src/pytorch_lightning/strategies/hivemind.py +++ b/src/pytorch_lightning/strategies/hivemind.py @@ -49,7 +49,7 @@ def __init__( ): """Provides capabilities to train using the Hivemind Library, training collaboratively across the internet with unreliable machines. For more information, `refer to the docs `__. + lightning.readthedocs.io/en/latest/strategies/hivemind.html>`__. .. warning:: ``HivemindStrategy`` is experimental and subject to change.
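
For illustration, a minimal sketch of the peer workflow these patches leave in place, assuming hivemind is installed on a Linux machine; the batch size and the multiaddress shown are placeholder values and are not part of the patch itself:

    import pytorch_lightning as pl
    from pytorch_lightning.demos.boring_classes import BoringModel
    from pytorch_lightning.strategies import HivemindStrategy

    # First peer: no initial_peers given, so the strategy starts a fresh DHT and
    # logs the visible multiaddresses other machines can use to join.
    # target_batch_size is required; the value here is arbitrary for illustration.
    trainer = pl.Trainer(strategy=HivemindStrategy(target_batch_size=8192))
    trainer.fit(BoringModel())

    # Any additional peer joins by passing the logged addresses to the strategy,
    # as suggested in the log message above ...
    # trainer = pl.Trainer(
    #     strategy=HivemindStrategy(target_batch_size=8192, initial_peers="/ip4/<host>/tcp/<port>/p2p/<id>")
    # )
    # ... or by exporting them before launch, which _parse_env_initial_peers() picks up:
    # PL_INITIAL_PEERS=<comma-separated multiaddresses> python ...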