New fabric parity tests #16899

Merged — 98 commits merged into master from fabric/framework-overhead on Mar 6, 2023

Changes from 68 commits

Commits (98)
64ba17e  experimental (awaelchli, Feb 27, 2023)
98cd00d  wip (awaelchli, Feb 27, 2023)
0974d89  wip (awaelchli, Feb 27, 2023)
ba307f6  wip (awaelchli, Feb 27, 2023)
ed453d0  wip (awaelchli, Feb 28, 2023)
f6273db  fix (awaelchli, Feb 28, 2023)
164c994  update (awaelchli, Feb 28, 2023)
5fd9f5c  update (awaelchli, Feb 28, 2023)
c2ec0d7  update (awaelchli, Feb 28, 2023)
52c0f3f  update (awaelchli, Feb 28, 2023)
49313f0  update (awaelchli, Feb 28, 2023)
c08e16c  update (awaelchli, Feb 28, 2023)
d2f6184  update (awaelchli, Feb 28, 2023)
0cf71fb  update (awaelchli, Feb 28, 2023)
c713106  update (awaelchli, Feb 28, 2023)
b04d381  update (awaelchli, Feb 28, 2023)
2b47e9c  update (awaelchli, Feb 28, 2023)
bdc3055  update (awaelchli, Feb 28, 2023)
8747031  update (awaelchli, Feb 28, 2023)
14bb8d9  update (awaelchli, Feb 28, 2023)
2445026  update (awaelchli, Feb 28, 2023)
da23916  update (awaelchli, Feb 28, 2023)
0ea1496  update (awaelchli, Feb 28, 2023)
ba84ba7  update (awaelchli, Feb 28, 2023)
caa7c03  update (awaelchli, Feb 28, 2023)
2b57493  update (awaelchli, Feb 28, 2023)
c14e2c4  update (awaelchli, Feb 28, 2023)
43b17e9  refactor (awaelchli, Feb 28, 2023)
436a5e6  debug (awaelchli, Feb 28, 2023)
826be20  Revert "debug" (awaelchli, Feb 28, 2023)
c9d5f19  Revert "refactor" (awaelchli, Feb 28, 2023)
ddbb113  update (awaelchli, Feb 28, 2023)
e8b79a5  update (awaelchli, Feb 28, 2023)
c1dff21  update (awaelchli, Feb 28, 2023)
ad369b8  update (awaelchli, Feb 28, 2023)
0b35ef8  update (awaelchli, Feb 28, 2023)
68b4888  update (awaelchli, Feb 28, 2023)
41e7a25  update (awaelchli, Feb 28, 2023)
929d604  update (awaelchli, Feb 28, 2023)
71c77ca  update (awaelchli, Feb 28, 2023)
727515f  update (awaelchli, Feb 28, 2023)
1771443  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Feb 28, 2023)
bc37136  delete (awaelchli, Feb 28, 2023)
3d8ad31  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Mar 1, 2023)
76b1676  update (awaelchli, Mar 1, 2023)
130880f  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Mar 1, 2023)
11d5099  benchmark (awaelchli, Mar 1, 2023)
667b174  Merge remote-tracking branch 'origin/fabric/framework-overhead' into … (awaelchli, Mar 1, 2023)
20c6672  update (awaelchli, Mar 1, 2023)
c5fa2bc  tuning (awaelchli, Mar 1, 2023)
2d85b0d  run on gpu (awaelchli, Mar 1, 2023)
72faa64  memory (awaelchli, Mar 1, 2023)
0de9ba2  tolerance (awaelchli, Mar 1, 2023)
905c5d6  memory (awaelchli, Mar 1, 2023)
719088b  refactor (awaelchli, Mar 1, 2023)
0bbe2cd  refactor (awaelchli, Mar 1, 2023)
6f41053  safer check (awaelchli, Mar 1, 2023)
1f6e987  reset peak (awaelchli, Mar 1, 2023)
33d7c01  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Mar 1, 2023)
8ade03a  Merge branch 'master' into fabric/framework-overhead (awaelchli, Mar 1, 2023)
5462c24  empty cache (awaelchli, Mar 1, 2023)
d7d3739  Merge remote-tracking branch 'origin/fabric/framework-overhead' into … (awaelchli, Mar 1, 2023)
cbf24c1  Update tests/tests_fabric/parity/test_parity_simple.py (awaelchli, Mar 1, 2023)
42facb7  Update tests/tests_fabric/parity/test_parity_simple.py (awaelchli, Mar 1, 2023)
d6e5227  cuda (awaelchli, Mar 1, 2023)
80d5919  Experiment with tracking mode by @carmocca (awaelchli, Mar 1, 2023)
a703991  Revert "Experiment with tracking mode by @carmocca" (awaelchli, Mar 1, 2023)
af7a7e4  move assertions top (awaelchli, Mar 1, 2023)
3162108  reset cuda memory stats before test (awaelchli, Mar 1, 2023)
24c9f1f  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Mar 1, 2023)
b59f51d  assertions across all devices (awaelchli, Mar 1, 2023)
94e2777  Merge remote-tracking branch 'origin/fabric/framework-overhead' into … (awaelchli, Mar 1, 2023)
d42ab34  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Mar 1, 2023)
726d462  slow cpu (awaelchli, Mar 1, 2023)
b7a82b5  add requirement (awaelchli, Mar 1, 2023)
1697904  tolerance (awaelchli, Mar 1, 2023)
0c75321  bf16 skip windows (awaelchli, Mar 1, 2023)
3a8b048  Merge branch 'master' into fabric/framework-overhead (awaelchli, Mar 1, 2023)
603ec15  Merge branch 'master' into fabric/framework-overhead (awaelchli, Mar 3, 2023)
e5c836a  parity on cpu (awaelchli, Mar 3, 2023)
ea67af7  Merge branch 'master' into fabric/framework-overhead (awaelchli, Mar 3, 2023)
8201fed  Merge branch 'master' into fabric/framework-overhead (Borda, Mar 3, 2023)
a52061f  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Mar 3, 2023)
1dd94b6  Merge branch 'master' into fabric/framework-overhead (awaelchli, Mar 4, 2023)
5970e8c  update (awaelchli, Mar 4, 2023)
c7f865b  Update tests/tests_fabric/parity/test_parity_ddp.py (awaelchli, Mar 6, 2023)
1b82ed5  Update tests/tests_fabric/conftest.py (awaelchli, Mar 6, 2023)
6221b28  Update tests/tests_fabric/parity/test_parity_ddp.py (awaelchli, Mar 6, 2023)
f38c95d  parametrize backend (awaelchli, Mar 6, 2023)
c6a45c8  use equality (awaelchli, Mar 6, 2023)
62a28de  add barrier (awaelchli, Mar 6, 2023)
f2ce109  comment about reusing processes (awaelchli, Mar 6, 2023)
f0edeba  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Mar 6, 2023)
71342dc  Merge remote-tracking branch 'origin/fabric/framework-overhead' into … (awaelchli, Mar 6, 2023)
14bfc37  Merge branch 'master' into fabric/framework-overhead (awaelchli, Mar 6, 2023)
0a42768  Merge branch 'master' into fabric/framework-overhead (awaelchli, Mar 6, 2023)
4964d75  use the utility to clear cuda cache (awaelchli, Mar 6, 2023)
be6eda7  guard (awaelchli, Mar 6, 2023)
10 changes: 10 additions & 0 deletions tests/tests_fabric/conftest.py
@@ -55,6 +55,7 @@ def restore_env_variables():
"POPLAR_ENGINE_OPTIONS", # set by IPUStrategy
"CUDA_MODULE_LOADING", # leaked since PyTorch 1.13
"CRC32C_SW_MODE", # set by tensorboardX
"CUBLAS_WORKSPACE_CONFIG", # handled by the `reset_deterministic_algorithm` fixture below
}
leaked_vars.difference_update(allowlist)
assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}"
@@ -72,9 +73,18 @@ def teardown_process_group():
def reset_deterministic_algorithm():
"""Ensures that torch determinism settings are reset before the next test runs."""
yield
os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None)
torch.use_deterministic_algorithms(False)


@pytest.fixture
def reset_cudnn_benchmark():
"""Ensures that the `torch.backends.cudnn.benchmark` setting gets reset before the next test runs."""
benchmark = torch.backends.cudnn.benchmark
yield
torch.backends.cudnn.benchmark = benchmark


def mock_xla_available(monkeypatch: pytest.MonkeyPatch, value: bool = True) -> None:
monkeypatch.setattr(lightning.fabric.accelerators.tpu, "_XLA_AVAILABLE", value)
monkeypatch.setattr(lightning.fabric.plugins.environments.xla, "_XLA_AVAILABLE", value)
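For reference, a test opts into these cleanup fixtures explicitly via `pytest.mark.usefixtures`, as the parity tests in this PR do. A minimal sketch under that assumption (the test name and body below are illustrative only, not part of this diff):

import pytest
import torch


@pytest.mark.usefixtures("reset_deterministic_algorithm", "reset_cudnn_benchmark")
def test_uses_global_torch_state():  # hypothetical test name
    # The test is free to toggle these global settings; the fixtures above
    # restore them afterwards so that later tests are not affected.
    torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.benchmark = True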
Empty file.
83 changes: 83 additions & 0 deletions tests/tests_fabric/parity/models.py
@@ -0,0 +1,83 @@
# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABC, abstractmethod
from typing import Callable

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Optimizer
from torch.utils.data import DataLoader, TensorDataset


class ParityModel(ABC, nn.Module):
"""Defines the interface for a model in a Fabric-PyTorch parity test."""

# Benchmarking parameters that should be model-specific
batch_size = 1
num_steps = 1

@abstractmethod
def get_optimizer(self, *args, **kwargs) -> Optimizer:
pass

@abstractmethod
def get_dataloader(self, *args, **kwargs) -> DataLoader:
pass

@abstractmethod
def get_loss_function(self) -> Callable:
pass


class ConvNet(ParityModel):
batch_size = 4
num_steps = 1000

def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(3, 6, 5)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(16 * 5 * 5, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)

def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = torch.flatten(x, 1) # flatten all dimensions except batch
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x

def get_optimizer(self):
return torch.optim.SGD(self.parameters(), lr=0.0001)

def get_dataloader(self):
# multiply * 8 just in case world size is larger than 1
dataset_size = self.num_steps * self.batch_size * 8
inputs = torch.rand(dataset_size, 3, 32, 32)
labels = torch.randint(0, 10, (dataset_size,))
dataset = TensorDataset(inputs, labels)
dataloader = DataLoader(
dataset,
batch_size=self.batch_size,
num_workers=2,
)
return dataloader

def get_loss_function(self):
return F.cross_entropy
157 changes: 157 additions & 0 deletions tests/tests_fabric/parity/test_parity_ddp.py
@@ -0,0 +1,157 @@
# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
from copy import deepcopy

import pytest
import torch
import torch.distributed
import torch.nn.functional
from tests_fabric.helpers.runif import RunIf
from tests_fabric.parity.models import ConvNet
from tests_fabric.parity.utils import is_cuda_memory_close, is_state_dict_equal, is_timing_close, make_deterministic
from torch.nn.parallel.distributed import DistributedDataParallel
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

from lightning.fabric.fabric import Fabric


def train_torch_ddp(
rank,
world_size,
device=torch.device("cpu"),
):
make_deterministic()
memory_stats = {}

os.environ["LOCAL_RANK"] = str(rank)
if torch.distributed.is_available() and not torch.distributed.is_initialized():
torch.distributed.init_process_group("gloo", rank=rank, world_size=world_size)

model = ConvNet().to(device)
initial_state_dict = deepcopy(model.state_dict())

ddp_model = DistributedDataParallel(model.to(device), device_ids=([rank] if device.type == "cuda" else None))

dataloader = model.get_dataloader()
sampler = DistributedSampler(dataloader.dataset, rank=rank, num_replicas=world_size, drop_last=False, shuffle=False)
dataloader = DataLoader(dataloader.dataset, sampler=sampler, batch_size=model.batch_size)
optimizer = model.get_optimizer()
loss_fn = model.get_loss_function()

memory_stats["start"] = torch.cuda.memory_stats()

ddp_model.train()
iteration_timings = []
iterator = iter(dataloader)
for _ in range(model.num_steps):
t0 = time.perf_counter()

inputs, labels = next(iterator)
inputs, labels = inputs.to(device), labels.to(device)
optimizer.zero_grad()
outputs = ddp_model(inputs)
loss = loss_fn(outputs, labels)
loss.backward()
optimizer.step()

t1 = time.perf_counter()
iteration_timings.append(t1 - t0)

memory_stats["end"] = torch.cuda.memory_stats()

# check that the model has changed
assert not is_state_dict_equal(initial_state_dict, ddp_model.module.state_dict())

return ddp_model.module.state_dict(), torch.tensor(iteration_timings), memory_stats


def train_fabric_ddp(fabric):
make_deterministic()
memory_stats = {}

model = ConvNet()
initial_state_dict = deepcopy(model.state_dict())

optimizer = model.get_optimizer()
model, optimizer = fabric.setup(model, optimizer)

dataloader = model.get_dataloader()
dataloader = fabric.setup_dataloaders(dataloader)
loss_fn = model.get_loss_function()

memory_stats["start"] = torch.cuda.memory_stats()

model.train()
iteration_timings = []
iterator = iter(dataloader)
for _ in range(model.num_steps):
t0 = time.perf_counter()

inputs, labels = next(iterator)
optimizer.zero_grad()
outputs = model(inputs)
loss = loss_fn(outputs, labels)
fabric.backward(loss)
optimizer.step()

t1 = time.perf_counter()
iteration_timings.append(t1 - t0)

memory_stats["end"] = torch.cuda.memory_stats()

# check that the model has changed
assert not is_state_dict_equal(initial_state_dict, model.state_dict())

return model.state_dict(), torch.tensor(iteration_timings), memory_stats


@RunIf(standalone=True)
@pytest.mark.usefixtures("reset_deterministic_algorithm", "reset_cudnn_benchmark")
@pytest.mark.parametrize(
"accelerator, devices",
[
("cpu", 2),
pytest.param("cuda", 2, marks=RunIf(min_cuda_gpus=2)),
],
)
def test_parity_ddp(accelerator, devices):
# Train with Fabric
fabric = Fabric(accelerator=accelerator, strategy="ddp", devices=devices)
fabric.launch()
state_dict_fabric, timings_fabric, memory_fabric = train_fabric_ddp(fabric)

if accelerator == "cuda":
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

# Train with raw PyTorch
state_dict_torch, timings_torch, memory_torch = train_torch_ddp(
rank=fabric.global_rank,
world_size=fabric.world_size,
device=fabric.device,
)

# Compare the final weights
assert is_state_dict_equal(state_dict_torch, state_dict_fabric)

# Compare the time per iteration
assert is_timing_close(timings_torch, timings_fabric, rtol=1e-3, atol=1e-3)

# Compare memory usage
if accelerator == "cuda":
assert is_cuda_memory_close(memory_torch["start"], memory_fabric["start"])
assert is_cuda_memory_close(memory_torch["end"], memory_fabric["end"])
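Note: the helpers imported from `tests_fabric.parity.utils` (`make_deterministic`, `is_state_dict_equal`, `is_timing_close`, `is_cuda_memory_close`) are not shown in the hunks above. Purely for orientation, comparison utilities of this kind could look roughly like the following sketch; the actual implementations in the PR may differ:

import os
import torch


def make_deterministic():
    # Fix seeds and force deterministic kernels so both training loops see identical data and math.
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(1)


def is_state_dict_equal(state0, state1):
    # Element-wise equality across all parameters in the two state dicts.
    return all(torch.equal(w0.cpu(), w1.cpu()) for w0, w1 in zip(state0.values(), state1.values()))


def is_timing_close(timings_torch, timings_fabric, rtol=1e-3, atol=1e-3):
    # Compare a robust statistic (median iteration time) within the given tolerances.
    return bool(torch.isclose(torch.median(timings_torch), torch.median(timings_fabric), rtol=rtol, atol=atol))


def is_cuda_memory_close(memory_stats_torch, memory_stats_fabric):
    # Compare peak allocated bytes reported by torch.cuda.memory_stats().
    return memory_stats_torch.get("allocated_bytes.all.peak", 0) == memory_stats_fabric.get("allocated_bytes.all.peak", 0)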