From c7675607f9f3ed609f1af58da6eb19a175504ab5 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Thu, 22 Oct 2020 22:14:21 +0200 Subject: [PATCH 1/9] detach on buffer --- docs/source/metrics.rst | 27 +++++++++++++++++++-------- pytorch_lightning/metrics/metric.py | 3 ++- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index 731a818687d25..6c175a0361f28 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -31,14 +31,14 @@ The example below shows how to use a metric in your ``LightningModule``: def __init__(self): ... self.accuracy = pl.metrics.Accuracy() - + def training_step(self, batch, batch_idx): logits = self(x) ... # log step metric self.log('train_acc_step', self.accuracy(logits, y)) ... - + def training_epoch_end(self, outs): # log epoch metric self.log('train_acc_epoch', self.accuracy.compute()) @@ -57,7 +57,7 @@ If ``on_epoch`` is True, the logger automatically logs the end of epoch metric v This however is only true for metrics that inherit the base class ``Metric``, and thus the functional metric API provides no support for in-built distributed synchronization or reduction functions. - + .. code-block:: python @@ -65,7 +65,7 @@ If ``on_epoch`` is True, the logger automatically logs the end of epoch metric v ... self.train_acc = pl.metrics.Accuracy() self.valid_acc = pl.metrics.Accuracy() - + def training_step(self, batch, batch_idx): logits = self(x) ... @@ -91,17 +91,17 @@ This metrics API is independent of PyTorch Lightning. Metrics can directly be us for epoch in range(epochs): for x, y in train_data: y_hat = model(x) - + # training step accuracy batch_acc = train_accuracy(y_hat, y) - + for x, y in valid_data: y_hat = model(x) valid_accuracy(y_hat, y) - + # total accuracy over all training batches total_train_accuracy = train_accuracy.compute() - + # total accuracy over all validation batches total_valid_accuracy = valid_accuracy.compute() @@ -144,6 +144,17 @@ Example implementation: def compute(self): return self.correct.float() / self.total +Metrics support backpropergation, if all computations involved in the metric calculation +are differentiable. However, note that cashed buffer is detached from the computational +graph and cannot be backpropergated. Not due this would mean storing the computational +graph for each update call. Concreatly, this means that: + +.. code-block:: python + metric = MyMetric() + val = metric(pred, target) # this value can be backpropergated + val = metric.compute() # this value cannot be backpropergated + + ********** Metric API ********** diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py index 8911341466b36..84d7cfe5d44f6 100644 --- a/pytorch_lightning/metrics/metric.py +++ b/pytorch_lightning/metrics/metric.py @@ -142,7 +142,8 @@ def forward(self, *args, **kwargs): Automatically calls ``update()``. Returns the metric value over inputs if ``compute_on_step`` is True. 
""" # add current step - self.update(*args, **kwargs) + with torch.no_grad(): + self.update(*args, **kwargs) self._forward_cache = None if self.compute_on_step: From 158dc3904a03ead8c1f2cbe738c70eb45c0ac0a0 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Thu, 22 Oct 2020 22:18:51 +0200 Subject: [PATCH 2/9] doc update --- docs/source/metrics.rst | 7 +-- testscript.py | 115 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 3 deletions(-) create mode 100644 testscript.py diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index 6c175a0361f28..00809e27ba526 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -145,9 +145,10 @@ Example implementation: return self.correct.float() / self.total Metrics support backpropergation, if all computations involved in the metric calculation -are differentiable. However, note that cashed buffer is detached from the computational -graph and cannot be backpropergated. Not due this would mean storing the computational -graph for each update call. Concreatly, this means that: +are differentiable. However, note that the cashed state is detached from the computational +graph and cannot be backpropergated. Not doing this would mean storing the computational +graph for each update call, which can lead to out-of-memory errors. +In practise this means that: .. code-block:: python metric = MyMetric() diff --git a/testscript.py b/testscript.py new file mode 100644 index 0000000000000..ebf5a5a392efb --- /dev/null +++ b/testscript.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Oct 22 22:14:58 2020 + +@author: nsde +""" + +import os + +import torch +from torch import nn +from torch.nn import functional as F +from torch.utils.data import DataLoader, random_split, Dataset +from torchvision.datasets import MNIST +from torchvision import transforms +import pytorch_lightning as pl +from pytorch_lightning.metrics.functional import accuracy +tmpdir = os.getcwd() + +# some other options for random data +from pl_bolts.datasets import RandomDataset, DummyDataset, RandomDictDataset + +class RandomDataset(Dataset): + def __init__(self, size, num_samples): + self.len = num_samples + self.data = torch.randn(num_samples, size) + + def __getitem__(self, index): + return self.data[index] + + def __len__(self): + return self.len + + +num_samples = 10000 + +train = RandomDataset(32, num_samples) +train = DataLoader(train, batch_size=32) + +val = RandomDataset(32, num_samples) +val = DataLoader(val, batch_size=32) + +test = RandomDataset(32, num_samples) +test = DataLoader(test, batch_size=32) + +import torch +from pytorch_lightning import LightningModule +from torch.utils.data import Dataset + +class BoringModel(LightningModule): + + def __init__(self): + super().__init__() + self.layer = torch.nn.Linear(32, 1) + self.train_metric = pl.metrics.MeanSquaredError() + + def forward(self, x): + return self.layer(x) + + def loss(self, batch, prediction): + # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls + return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) + + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.train_metric(output, torch.ones_like(output)) + self.log('train_metric', self.train_metric, on_step=True, on_epoch=False) + return {"loss": loss} + + def training_step_end(self, training_step_outputs): + return training_step_outputs + + def training_epoch_end(self, outputs) -> None: + 
torch.stack([x["loss"] for x in outputs]).mean() + + def validation_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + return {"x": loss} + + def validation_epoch_end(self, outputs) -> None: + torch.stack([x['x'] for x in outputs]).mean() + + def test_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.log('fake_test_acc', loss) + return {"y": loss} + + def test_epoch_end(self, outputs) -> None: + torch.stack([x["y"] for x in outputs]).mean() + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + return [optimizer], [lr_scheduler] + +def test_x(tmpdir): + # init model + model = BoringModel() + + # Initialize a trainer + trainer = pl.Trainer( + gpus=1, + max_epochs=2, + progress_bar_refresh_rate=0 + ) + + # Train the model ⚡ + trainer.fit(model, train, val) + + trainer.test(test_dataloaders=test) + +test_x(tmpdir) \ No newline at end of file From c174170a7eabba9814bb359bc7d1ab2934c86fae Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Thu, 22 Oct 2020 22:23:59 +0200 Subject: [PATCH 3/9] remove file --- testscript.py | 115 -------------------------------------------------- 1 file changed, 115 deletions(-) delete mode 100644 testscript.py diff --git a/testscript.py b/testscript.py deleted file mode 100644 index ebf5a5a392efb..0000000000000 --- a/testscript.py +++ /dev/null @@ -1,115 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Thu Oct 22 22:14:58 2020 - -@author: nsde -""" - -import os - -import torch -from torch import nn -from torch.nn import functional as F -from torch.utils.data import DataLoader, random_split, Dataset -from torchvision.datasets import MNIST -from torchvision import transforms -import pytorch_lightning as pl -from pytorch_lightning.metrics.functional import accuracy -tmpdir = os.getcwd() - -# some other options for random data -from pl_bolts.datasets import RandomDataset, DummyDataset, RandomDictDataset - -class RandomDataset(Dataset): - def __init__(self, size, num_samples): - self.len = num_samples - self.data = torch.randn(num_samples, size) - - def __getitem__(self, index): - return self.data[index] - - def __len__(self): - return self.len - - -num_samples = 10000 - -train = RandomDataset(32, num_samples) -train = DataLoader(train, batch_size=32) - -val = RandomDataset(32, num_samples) -val = DataLoader(val, batch_size=32) - -test = RandomDataset(32, num_samples) -test = DataLoader(test, batch_size=32) - -import torch -from pytorch_lightning import LightningModule -from torch.utils.data import Dataset - -class BoringModel(LightningModule): - - def __init__(self): - super().__init__() - self.layer = torch.nn.Linear(32, 1) - self.train_metric = pl.metrics.MeanSquaredError() - - def forward(self, x): - return self.layer(x) - - def loss(self, batch, prediction): - # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls - return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) - - def training_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - self.train_metric(output, torch.ones_like(output)) - self.log('train_metric', self.train_metric, on_step=True, on_epoch=False) - return {"loss": loss} - - def training_step_end(self, training_step_outputs): - return training_step_outputs - - def training_epoch_end(self, outputs) -> None: - torch.stack([x["loss"] for x in 
outputs]).mean() - - def validation_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"x": loss} - - def validation_epoch_end(self, outputs) -> None: - torch.stack([x['x'] for x in outputs]).mean() - - def test_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - self.log('fake_test_acc', loss) - return {"y": loss} - - def test_epoch_end(self, outputs) -> None: - torch.stack([x["y"] for x in outputs]).mean() - - def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) - return [optimizer], [lr_scheduler] - -def test_x(tmpdir): - # init model - model = BoringModel() - - # Initialize a trainer - trainer = pl.Trainer( - gpus=1, - max_epochs=2, - progress_bar_refresh_rate=0 - ) - - # Train the model ⚡ - trainer.fit(model, train, val) - - trainer.test(test_dataloaders=test) - -test_x(tmpdir) \ No newline at end of file From 81c1a26ff42108bad7583b98fca12c28c4044207 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Thu, 22 Oct 2020 22:35:35 +0200 Subject: [PATCH 4/9] changelog --- CHANGELOG.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 68a5a1d0f434c..6dcd1c260148a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Fixed that metrics do not store computational graph for all seen data ([#4313](https://github.com/PyTorchLightning/pytorch-lightning/pull/4313)) + ## [1.0.2] - 2020-10-15 @@ -97,7 +99,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed - Fixed `current_epoch` property update to reflect true epoch number inside `LightningDataModule`, when `reload_dataloaders_every_epoch=True`. ([#3974](https://github.com/PyTorchLightning/pytorch-lightning/pull/3974)) -- Fixed to print scaler value in progress bar ([#4053](https://github.com/PyTorchLightning/pytorch-lightning/pull/4053)) +- Fixed to print scaler value in progress bar ([#4053](https://github.com/PyTorchLightning/pytorch-lightning/pull/4053)) - Fixed mismatch between docstring and code regarding when `on_load_checkpoint` hook is called ([#3996](https://github.com/PyTorchLightning/pytorch-lightning/pull/3996)) @@ -442,7 +444,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed adding val step argument to metrics ([#2986](https://github.com/PyTorchLightning/pytorch-lightning/pull/2986)) - Fixed an issue that caused `Trainer.test()` to stall in ddp mode ([#2997](https://github.com/PyTorchLightning/pytorch-lightning/pull/2997)) - Fixed gathering of results with tensors of varying shape ([#3020](https://github.com/PyTorchLightning/pytorch-lightning/pull/3020)) -- Fixed batch size auto-scaling feature to set the new value on the correct model attribute ([#3043](https://github.com/PyTorchLightning/pytorch-lightning/pull/3043)) +- Fixed batch size auto-scaling feature to set the new value on the correct model attribute ([#3043](https://github.com/PyTorchLightning/pytorch-lightning/pull/3043)) - Fixed automatic batch scaling not working with half precision ([#3045](https://github.com/PyTorchLightning/pytorch-lightning/pull/3045)) - Fixed setting device to root gpu ([#3042](https://github.com/PyTorchLightning/pytorch-lightning/pull/3042)) From f313bd2923009059e884ab9ad017413460a6b89d Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Thu, 22 Oct 2020 22:38:25 +0200 Subject: [PATCH 5/9] suggestions --- docs/source/metrics.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index 00809e27ba526..7fa68f9a2ef00 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -144,16 +144,16 @@ Example implementation: def compute(self): return self.correct.float() / self.total -Metrics support backpropergation, if all computations involved in the metric calculation +Metrics support backpropagation, if all computations involved in the metric calculation are differentiable. However, note that the cashed state is detached from the computational -graph and cannot be backpropergated. Not doing this would mean storing the computational +graph and cannot be backpropagated. Not doing this would mean storing the computational graph for each update call, which can lead to out-of-memory errors. In practise this means that: .. code-block:: python metric = MyMetric() - val = metric(pred, target) # this value can be backpropergated - val = metric.compute() # this value cannot be backpropergated + val = metric(pred, target) # this value can be backpropagated + val = metric.compute() # this value cannot be backpropagated ********** From fea7a12710e6bc52997e235ae9946cd0fae2b2e7 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Fri, 23 Oct 2020 11:59:01 +0200 Subject: [PATCH 6/9] Update docs/source/metrics.rst Co-authored-by: Teddy Koker --- docs/source/metrics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index 7fa68f9a2ef00..22f520f340fd1 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -151,6 +151,7 @@ graph for each update call, which can lead to out-of-memory errors. In practise this means that: .. code-block:: python + metric = MyMetric() val = metric(pred, target) # this value can be backpropagated val = metric.compute() # this value cannot be backpropagated @@ -432,4 +433,3 @@ embedding_similarity [func] .. 
autofunction:: pytorch_lightning.metrics.functional.self_supervised.embedding_similarity :noindex: - From b1539670feb1cfd2ca2ad349a7fd316390abd784 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Fri, 23 Oct 2020 13:26:45 +0200 Subject: [PATCH 7/9] fix for 4266 --- pytorch_lightning/core/step_result.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 650c1876d0cd0..a8224f45e3829 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -259,7 +259,7 @@ def get_batch_log_metrics(self, include_forked_originals=True) -> dict: if options['logger'] and options['on_step']: if isinstance(self[k], Metric): - result[k] = self[k]._forward_cache + result[k] = self[k]._forward_cache.detach() else: result[k] = self[k] @@ -281,7 +281,7 @@ def get_epoch_log_metrics(self) -> dict: if options['logger'] and options['on_epoch']: if isinstance(self[k], Metric): - result[k] = self[k].compute() + result[k] = self[k].compute().detach() else: result[k] = self[k] @@ -307,7 +307,7 @@ def get_epoch_pbar_metrics(self): if options['prog_bar'] and options['on_epoch']: if isinstance(self[k], Metric): - result[k] = self[k].compute() + result[k] = self[k].compute().detach() else: result[k] = self[k] From 74470dff1caa8d8df98b25ae596ab3c47d1c7a61 Mon Sep 17 00:00:00 2001 From: chaton Date: Fri, 30 Oct 2020 09:33:18 +0000 Subject: [PATCH 8/9] Update docs/source/metrics.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- docs/source/metrics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index 5bac5f20754d2..3ec13b9c69bc8 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -145,7 +145,7 @@ Example implementation: return self.correct.float() / self.total Metrics support backpropagation, if all computations involved in the metric calculation -are differentiable. However, note that the cashed state is detached from the computational +are differentiable. However, note that the cached state is detached from the computational graph and cannot be backpropagated. Not doing this would mean storing the computational graph for each update call, which can lead to out-of-memory errors. In practise this means that: From c88b511665af8a96cf64a7010bddad98ad1c1890 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Mon, 2 Nov 2020 15:44:05 +0100 Subject: [PATCH 9/9] Update CHANGELOG.md --- CHANGELOG.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 367f56ffa2a58..62f28c6436b51 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -73,8 +73,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/PyTorchLightning/pytorch-lightning/pull/4209)) - - Fixed setting device ids in DDP ([#4297](https://github.com/PyTorchLightning/pytorch-lightning/pull/4297)) - Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323))
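The behaviour this series documents and enforces can be demonstrated end to end. The sketch below is illustrative only and is not part of any patch above: it assumes the ``pytorch_lightning.metrics.Metric`` API shown in the ``docs/source/metrics.rst`` hunks (``add_state``, ``update``, ``compute``), and the ``DifferentiableMSE`` class, the linear model, and the random tensors are made-up names for the example. With ``update()`` wrapped in ``torch.no_grad()`` (patch 1), the per-batch value returned by calling the metric stays attached to the computational graph, while the accumulated state, and therefore ``compute()``, is detached.

.. code-block:: python

    import torch
    from torch import nn
    from pytorch_lightning.metrics import Metric


    class DifferentiableMSE(Metric):
        """Toy mean-squared-error metric; every op in update/compute is differentiable."""

        def __init__(self):
            super().__init__()
            self.add_state("sum_squared_error", default=torch.tensor(0.0), dist_reduce_fx="sum")
            self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum")

        def update(self, preds: torch.Tensor, target: torch.Tensor):
            # differentiable w.r.t. preds, so the per-batch value can be backpropagated
            self.sum_squared_error += torch.sum((preds - target) ** 2)
            self.total += target.numel()

        def compute(self):
            return self.sum_squared_error / self.total


    model = nn.Linear(8, 1)
    metric = DifferentiableMSE()

    x, y = torch.randn(4, 8), torch.randn(4, 1)
    preds = model(x)

    batch_val = metric(preds, y)    # forward(): per-batch value, still attached to the graph
    batch_val.backward()            # gradients flow back into ``model``

    epoch_val = metric.compute()    # accumulated state was updated under no_grad, so it is detached
    print(epoch_val.requires_grad)  # False: calling backward() here would raise an error

The trade-off is deliberate: keeping the accumulated state attached would retain one computational graph per ``update()`` call for the whole epoch, which is what the docs change warns can lead to out-of-memory errors.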