diff --git a/.drone.yml b/.drone.yml
index bb4d8a74b28f53..84e97150752e72 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -20,7 +20,7 @@ name: torch-GPU

 steps:
 - name: testing
-  image: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.5
+  image: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6

   environment:
     CODECOV_TOKEN:
diff --git a/README.md b/README.md
index 33ee5448029141..30079df9317590 100644
--- a/README.md
+++ b/README.md
@@ -92,7 +92,7 @@ Lightning can automatically export to ONNX or TorchScript for those cases.
 | System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 | 1.7 (latest) |
 | :---: | :---: | :---: | :---: | :---: | :---: |
 | Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) |
-| Linux py3.7 [GPUs**] | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - |
+| Linux py3.7 [GPUs**] | - | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - |
 | Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - |
 | Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
 | OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py
index b016b6c5d24fba..98bc8dfc87d25a 100644
--- a/pytorch_lightning/plugins/native_amp.py
+++ b/pytorch_lightning/plugins/native_amp.py
@@ -29,8 +29,10 @@ def connect(self, model, optimizers):
     def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs):
         closure_loss = self.trainer.scaler.scale(closure_loss)

+        automatic_optimization = self.trainer.train_loop.automatic_optimization
+
         # do backward pass
-        if self.trainer.train_loop.automatic_optimization:
+        if automatic_optimization:
             model = self.trainer.get_model()
             model.backward(closure_loss, optimizer, opt_idx)
         else:
@@ -40,7 +42,7 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs):
         closure_loss = closure_loss.detach()

         # unscale gradient to allow analyze within `on_after_backward`
-        if not self.trainer.train_loop.should_accumulate():
+        if not self.trainer.train_loop.should_accumulate() and automatic_optimization:
             self.trainer.scaler.unscale_(optimizer)

         return closure_loss
diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index 449bc8c712ddb4..d09d9387ea485f 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -25,6 +25,7 @@
 import tests.base.develop_pipelines as tpipes
 import tests.base.develop_utils as tutils
 from pytorch_lightning import Trainer
+from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVALAIBLE
 from tests.base import EvalModelTemplate
 from tests.base.models import BasicGAN

@@ -126,8 +127,33 @@ def test_horovod_multi_gpu(tmpdir):
 @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
 @pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support")
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+@pytest.mark.skipif(not APEX_AVAILABLE, reason="test requires apex")
+def test_horovod_apex(tmpdir):
+    """Test Horovod with multi-GPU support using apex amp."""
+    trainer_options = dict(
+        default_root_dir=str(tmpdir),
+        weights_save_path=str(tmpdir),
+        gradient_clip_val=1.0,
+        progress_bar_refresh_rate=0,
+        max_epochs=1,
+        limit_train_batches=0.4,
+        limit_val_batches=0.2,
+        gpus=2,
+        deterministic=True,
+        distributed_backend='horovod',
+        amp_backend='apex',
+        precision=16,
+    )
+    _run_horovod(trainer_options, on_gpu=True)
+
+
+@pytest.mark.skip(reason="Skip till Horovod fixes integration with Native torch.cuda.amp")
+@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
+@pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support")
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+@pytest.mark.skipif(not NATIVE_AMP_AVALAIBLE, reason="test requires torch.cuda.amp")
 def test_horovod_amp(tmpdir):
-    """Test Horovod with multi-GPU support."""
+    """Test Horovod with multi-GPU support using native amp."""
     trainer_options = dict(
         default_root_dir=str(tmpdir),
         weights_save_path=str(tmpdir),
@@ -139,6 +165,7 @@ def test_horovod_amp(tmpdir):
         gpus=2,
         deterministic=True,
         distributed_backend='horovod',
+        amp_backend='native',
         precision=16,
     )
     _run_horovod(trainer_options, on_gpu=True)
diff --git a/tests/plugins/test_amp_plugin.py b/tests/plugins/test_amp_plugin.py
index 6fd000b61d97f2..6d7f0252d94c3e 100644
--- a/tests/plugins/test_amp_plugin.py
+++ b/tests/plugins/test_amp_plugin.py
@@ -86,20 +86,19 @@ def on_fit_start(self, trainer, pl_module):
     trainer.fit(model)


+class GradientUnscaleBoringModel(BoringModel):
+    def on_after_backward(self):
+        norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2)
+        if not (torch.isinf(norm) or torch.isnan(norm)):
+            assert norm.item() < 15.
+
+
 @pytest.mark.skipif(
     LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Minimal PT version is set to 1.6")
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_amp_gradient_unscale(tmpdir):
-
-    class ExtendedBoringModel(BoringModel):
-
-        def on_after_backward(self):
-            norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2)
-            if not (torch.isinf(norm) or torch.isnan(norm)):
-                assert norm.item() < 15.
-
-    model = ExtendedBoringModel()
+    model = GradientUnscaleBoringModel()

     trainer = Trainer(
         max_epochs=2,
@@ -117,19 +116,19 @@ def on_after_backward(self):
     trainer.fit(model)


+class UnscaleAccumulateGradBatchesBoringModel(BoringModel):
+
+    def on_after_backward(self):
+        norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2)
+        if not (torch.isinf(norm) or torch.isnan(norm)):
+            assert norm.item() < 15.
+
+
 @pytest.mark.skipif(
     LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Minimal PT version is set to 1.6")
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_amp_gradient_unscale_accumulate_grad_batches(tmpdir):
-
-    class ExtendedBoringModel(BoringModel):
-
-        def on_after_backward(self):
-            norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2)
-            if not (torch.isinf(norm) or torch.isnan(norm)):
-                assert norm.item() < 15.
-
-    model = ExtendedBoringModel()
+    model = UnscaleAccumulateGradBatchesBoringModel()

     trainer = Trainer(
         max_epochs=2,
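
Note on the `native_amp.py` change above: with manual optimization the user drives `optimizer.step()` from inside `training_step`, and `torch.cuda.amp.GradScaler.unscale_()` may only be called once per optimizer between `backward()` and `step()`, so the plugin now leaves unscaling to the user when `automatic_optimization` is off. Below is a minimal, hypothetical sketch of that raw `torch.cuda.amp` contract; the model, data, and optimizer are placeholders and are not part of this PR.

```python
# Hypothetical sketch of the GradScaler contract; not code from this PR.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.nn.Linear(8, 1).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# enabled=False turns the scaler into a no-op, so the sketch also runs on CPU
scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))

for _ in range(3):
    x = torch.randn(4, 8, device=device)
    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=(device == "cuda")):
        loss = model(x).pow(2).mean()
    scaler.scale(loss).backward()   # backward runs on the scaled loss
    scaler.unscale_(optimizer)      # unscale at most once per step, e.g. before clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)
    scaler.step(optimizer)          # skips the update if inf/nan grads are found
    scaler.update()
```

Calling `unscale_` a second time before `step()` raises a `RuntimeError`, which is why the plugin cannot unscale eagerly when it does not control the subsequent `step()` call.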
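
On the `test_amp_plugin.py` refactor: the `on_after_backward` assertion models are hoisted out of the test functions to module level, most likely so they can be pickled by spawn-based multi-process launchers, since classes defined inside a function cannot be pickled by the standard library. A small, hypothetical illustration of that constraint (not part of this PR):

```python
# Hypothetical illustration; not code from this PR.
import pickle


class TopLevelModel:
    """Importable by name from the module, so instances pickle fine."""


def make_local_model():
    class LocalModel:
        """Defined inside a function, so pickle cannot locate it by name."""
    return LocalModel()


pickle.dumps(TopLevelModel())          # works

try:
    pickle.dumps(make_local_model())   # raises: the class is not module-level
except (AttributeError, pickle.PicklingError) as err:
    print(f"pickling failed as expected: {err}")
```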