Correctly calculate distributed loss average (#269)
Co-authored-by: Abhishek Das <das.abhshk@gmail.com>
Johannes Klicpera and abhshkdz authored Sep 18, 2021
1 parent d6529b8 commit ef98b27
Showing 2 changed files with 27 additions and 1 deletion.
ocpmodels/modules/loss.py (24 additions, 0 deletions)
@@ -1,6 +1,8 @@
 import torch
 from torch import nn
 
+from ocpmodels.common import distutils
+
 
 class L2MAELoss(nn.Module):
     def __init__(self, reduction="mean"):
@@ -14,3 +16,25 @@ def forward(self, input: torch.Tensor, target: torch.Tensor):
             return torch.mean(dists)
         elif self.reduction == "sum":
             return torch.sum(dists)
+
+
+class DDPLoss(nn.Module):
+    def __init__(self, loss_fn, reduction="mean"):
+        super().__init__()
+        self.loss_fn = loss_fn
+        self.loss_fn.reduction = "sum"
+        self.reduction = reduction
+        assert reduction in ["mean", "sum"]
+
+    def forward(self, input: torch.Tensor, target: torch.Tensor):
+        loss = self.loss_fn(input, target)
+        if self.reduction == "mean":
+            num_samples = input.shape[0]
+            num_samples = distutils.all_reduce(
+                num_samples, device=input.device
+            )
+            # Multiply by world size since gradients are averaged
+            # across DDP replicas
+            return loss * distutils.get_world_size() / num_samples
+        else:
+            return loss
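
Why this is correct: DDP averages gradients across replicas, which is equivalent to dividing the summed per-rank results by world_size. Returning sum * world_size / num_samples from each rank therefore recovers the true global mean even when ranks hold unequal batch sizes, which a naive per-rank mean does not. A minimal sketch of the arithmetic (plain Python, no process group; the two-rank numbers below are made up for illustration):

    # Hypothetical per-rank summed losses and batch sizes.
    rank_loss_sums = [6.0, 4.0]  # sum of per-sample losses on each rank
    rank_batch_sizes = [3, 1]    # ranks may see different numbers of samples

    world_size = len(rank_loss_sums)
    num_samples = sum(rank_batch_sizes)  # what the all_reduce computes globally

    # What each rank returns from DDPLoss.forward with reduction="mean":
    per_rank = [s * world_size / num_samples for s in rank_loss_sums]

    # DDP's gradient averaging then divides the result by world_size:
    ddp_result = sum(per_rank) / world_size       # 2.5
    true_mean = sum(rank_loss_sums) / num_samples # 10 / 4 = 2.5
    assert ddp_result == true_mean

    # A naive per-rank mean is biased whenever batch sizes differ:
    naive = sum(s / n for s, n in zip(rank_loss_sums, rank_batch_sizes)) / world_size
    assert naive == 3.0  # != 2.5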
ocpmodels/trainers/base_trainer.py (3 additions, 1 deletion)
@@ -40,7 +40,7 @@
 from ocpmodels.modules.exponential_moving_average import (
     ExponentialMovingAverage,
 )
-from ocpmodels.modules.loss import L2MAELoss
+from ocpmodels.modules.loss import DDPLoss, L2MAELoss
 from ocpmodels.modules.normalizer import Normalizer
 from ocpmodels.modules.scheduler import LRScheduler
 
@@ -366,6 +366,8 @@ def load_loss(self):
                 raise NotImplementedError(
                     f"Unknown loss function name: {loss_name}"
                 )
+            if distutils.initialized():
+                self.loss_fn[loss] = DDPLoss(self.loss_fn[loss])
 
     def load_optimizer(self):
         optimizer = self.config["optim"].get("optimizer", "AdamW")
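
For illustration, after this change a trainer running under an initialized process group uses the wrapped criterion transparently. A hedged usage sketch (nn.L1Loss stands in for whatever loss load_loss selected from the config; a distributed process group is assumed to be initialized so the all_reduce inside DDPLoss spans all ranks):

    import torch
    from torch import nn

    from ocpmodels.modules.loss import DDPLoss

    # DDPLoss forces the wrapped criterion to reduction="sum" and applies
    # the world_size / num_samples correction itself in forward().
    criterion = DDPLoss(nn.L1Loss())

    pred = torch.randn(8, 3, requires_grad=True)  # e.g. force predictions
    target = torch.randn(8, 3)
    loss = criterion(pred, target)  # global mean over samples on all replicas
    loss.backward()

Guarding the wrapping with distutils.initialized() keeps single-process runs on the unmodified loss, so only distributed training pays for the extra all_reduce.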
