From 39e54aef44268ea3adf61b05d0a363c2dcce8b0c Mon Sep 17 00:00:00 2001
From: Mannat Singh
Date: Wed, 11 Mar 2020 11:08:06 -0700
Subject: [PATCH] Add support for Sync BN (#423)

Summary:
Pull Request resolved: https://github.com/facebookresearch/ClassyVision/pull/423

Added support for synchronized batch normalization, using either PyTorch's implementation or Apex's.

Also plugged the model complexity hook into `classy_train.py`; it helps test a bug I encountered and fixed, which needs both the profiler and sync batch norm to reproduce.

Reviewed By: vreis

Differential Revision: D20307435

fbshipit-source-id: 30556effe0149a51e2f53970ffb422a6a8ef15be
---
 classy_train.py                               |  3 +-
 classy_vision/tasks/classification_task.py    | 43 ++++++++++++++++++-
 test/generic/config_utils.py                  |  5 ++-
 test/hooks_loss_lr_meter_logging_hook_test.py |  2 +-
 .../hooks_tensorboard_plot_hook_test.py       |  4 +-
 test/trainer_distributed_trainer_test.py      | 26 +++++++++++
 6 files changed, 75 insertions(+), 8 deletions(-)

diff --git a/classy_train.py b/classy_train.py
index 26e2eb53b8..852627be2c 100755
--- a/classy_train.py
+++ b/classy_train.py
@@ -50,6 +50,7 @@ from classy_vision.hooks import (
     CheckpointHook,
     LossLrMeterLoggingHook,
+    ModelComplexityHook,
     ProfilerHook,
     ProgressBarHook,
     TensorboardPlotHook,
@@ -118,7 +119,7 @@ def main(args, config):


 def configure_hooks(args, config):
-    hooks = [LossLrMeterLoggingHook(args.log_freq)]
+    hooks = [LossLrMeterLoggingHook(args.log_freq), ModelComplexityHook()]

     # Make a folder to store checkpoints and tensorboard logging outputs
     suffix = datetime.now().isoformat()
diff --git a/classy_vision/tasks/classification_task.py b/classy_vision/tasks/classification_task.py
index 24670b1bb9..7f7cff2596 100644
--- a/classy_vision/tasks/classification_task.py
+++ b/classy_vision/tasks/classification_task.py
@@ -11,6 +11,7 @@ from typing import Any, Dict, List, NamedTuple, Optional, Union

 import torch
+import torch.nn as nn
 from classy_vision.dataset import ClassyDataset, build_dataset
 from classy_vision.generic.distributed_util import (
     all_reduce_mean,
@@ -53,6 +54,12 @@ class BroadcastBuffersMode(enum.Enum):
     BEFORE_EVAL = enum.auto()


+class BatchNormSyncMode(enum.Enum):
+    DISABLED = enum.auto()  # No Synchronized Batch Normalization
+    PYTORCH = enum.auto()  # Use torch.nn.SyncBatchNorm
+    APEX = enum.auto()  # Use apex.parallel.SyncBatchNorm, needs apex to be installed
+
+
 class LastBatchInfo(NamedTuple):
     loss: torch.Tensor
     output: torch.Tensor
@@ -133,6 +140,7 @@ def __init__(self):
         self.amp_opt_level = None
         self.perf_log = []
         self.last_batch = None
+        self.batch_norm_sync_mode = BatchNormSyncMode.DISABLED

     def set_checkpoint(self, checkpoint):
         """Sets checkpoint on task.
@@ -204,14 +212,35 @@ def set_meters(self, meters: List["ClassyMeter"]):
         self.meters = meters
         return self

-    def set_distributed_options(self, broadcast_buffers_mode: BroadcastBuffersMode):
+    def set_distributed_options(
+        self,
+        broadcast_buffers_mode: BroadcastBuffersMode = BroadcastBuffersMode.DISABLED,
+        batch_norm_sync_mode: BatchNormSyncMode = BatchNormSyncMode.DISABLED,
+    ):
         """Set distributed options.

         Args:
             broadcast_buffers_mode: Broadcast buffers mode. See
                 :class:`BroadcastBuffersMode` for options.
+            batch_norm_sync_mode: Batch normalization synchronization mode. See
+                :class:`BatchNormSyncMode` for options.
+
+        Raises:
+            RuntimeError: If batch_norm_sync_mode is `BatchNormSyncMode.APEX` and apex
+                is not installed.
""" self.broadcast_buffers_mode = broadcast_buffers_mode + + if batch_norm_sync_mode == BatchNormSyncMode.DISABLED: + logging.info("Synchronized Batch Normalization is disabled") + else: + if batch_norm_sync_mode == BatchNormSyncMode.APEX and not apex_available: + raise RuntimeError("apex is not installed") + logging.info( + f"Using Synchronized Batch Normalization using {batch_norm_sync_mode}" + ) + self.batch_norm_sync_mode = batch_norm_sync_mode + return self def set_hooks(self, hooks: List["ClassyHook"]): @@ -317,7 +346,12 @@ def from_config(cls, config: Dict[str, Any]) -> "ClassificationTask": .set_meters(meters) .set_amp_opt_level(amp_opt_level) .set_distributed_options( - BroadcastBuffersMode[config.get("broadcast_buffers", "DISABLED")] + broadcast_buffers_mode=BroadcastBuffersMode[ + config.get("broadcast_buffers", "disabled").upper() + ], + batch_norm_sync_mode=BatchNormSyncMode[ + config.get("batch_norm_sync_mode", "disabled").upper() + ], ) ) for phase_type in phase_types: @@ -494,6 +528,11 @@ def prepare( multiprocessing_context=dataloader_mp_context, ) + if self.batch_norm_sync_mode == BatchNormSyncMode.PYTORCH: + self.base_model = nn.SyncBatchNorm.convert_sync_batchnorm(self.base_model) + elif self.batch_norm_sync_mode == BatchNormSyncMode.APEX: + self.base_model = apex.parallel.convert_syncbn_model(self.base_model) + # move the model and loss to the right device if use_gpu: self.base_model, self.loss = copy_model_to_gpu(self.base_model, self.loss) diff --git a/test/generic/config_utils.py b/test/generic/config_utils.py index 2458f3e853..b5f53f423c 100644 --- a/test/generic/config_utils.py +++ b/test/generic/config_utils.py @@ -176,9 +176,9 @@ def get_test_mlp_task_config(): "num_classes": 2, "crop_size": 20, "class_ratio": 0.5, - "num_samples": 10, + "num_samples": 20, "seed": 0, - "batchsize_per_replica": 3, + "batchsize_per_replica": 6, "use_augmentation": False, "use_shuffle": True, "transforms": [ @@ -228,6 +228,7 @@ def get_test_mlp_task_config(): "input_dim": 1200, "output_dim": 1000, "hidden_dims": [10], + "use_batchnorm": True, # used for testing sync batchnorm }, "meters": {"accuracy": {"topk": [1]}}, "optimizer": { diff --git a/test/hooks_loss_lr_meter_logging_hook_test.py b/test/hooks_loss_lr_meter_logging_hook_test.py index daa207f18b..ba4a417bc1 100644 --- a/test/hooks_loss_lr_meter_logging_hook_test.py +++ b/test/hooks_loss_lr_meter_logging_hook_test.py @@ -83,7 +83,7 @@ def scheduler_mock(where): mock_lr_scheduler.update_interval = UpdateInterval.STEP config = get_test_mlp_task_config() config["num_epochs"] = 3 - config["dataset"]["train"]["batchsize_per_replica"] = 5 + config["dataset"]["train"]["batchsize_per_replica"] = 10 config["dataset"]["test"]["batchsize_per_replica"] = 5 task = build_task(config) task.optimizer.param_schedulers["lr"] = mock_lr_scheduler diff --git a/test/manual/hooks_tensorboard_plot_hook_test.py b/test/manual/hooks_tensorboard_plot_hook_test.py index 0dc5e2206a..d1a80ccc92 100644 --- a/test/manual/hooks_tensorboard_plot_hook_test.py +++ b/test/manual/hooks_tensorboard_plot_hook_test.py @@ -140,7 +140,7 @@ def flush(self): config = get_test_mlp_task_config() config["num_epochs"] = 3 - config["dataset"]["train"]["batchsize_per_replica"] = 5 + config["dataset"]["train"]["batchsize_per_replica"] = 10 config["dataset"]["test"]["batchsize_per_replica"] = 5 task = build_task(config) @@ -152,7 +152,7 @@ def flush(self): trainer = LocalTrainer() trainer.train(task) - # We have 10 samples, batch size is 5. Each epoch is done in two steps. 
+        # We have 20 samples, batch size is 10. Each epoch is done in two steps.
         self.assertEqual(
             writer.scalar_logs["train_learning_rate_updates"],
             [0, 1 / 6, 2 / 6, 3 / 6, 4 / 6, 5 / 6],
diff --git a/test/trainer_distributed_trainer_test.py b/test/trainer_distributed_trainer_test.py
index e3950d74b3..dd5c1124f2 100644
--- a/test/trainer_distributed_trainer_test.py
+++ b/test/trainer_distributed_trainer_test.py
@@ -22,10 +22,13 @@ def setUp(self):
         config = get_test_mlp_task_config()
         invalid_config = copy.deepcopy(config)
         invalid_config["name"] = "invalid_task"
+        sync_bn_config = copy.deepcopy(config)
+        sync_bn_config["batch_norm_sync_mode"] = "pytorch"
         self.config_files = {}
         for config_key, config in [
             ("config", config),
             ("invalid_config", invalid_config),
+            ("sync_bn_config", sync_bn_config),
         ]:
             with tempfile.NamedTemporaryFile(mode="w", delete=False) as f:
                 json.dump(config, f)
@@ -63,3 +66,26 @@ def test_training(self):
         result = subprocess.run(cmd, shell=True)
         success = result.returncode == 0
         self.assertEqual(success, expected_success)
+
+    @unittest.skipUnless(torch.cuda.is_available(), "This test needs a gpu to run")
+    def test_sync_batch_norm(self):
+        """Test that sync batch norm training doesn't hang."""
+
+        num_processes = 2
+        device = "gpu"
+
+        cmd = f"""{sys.executable} -m torch.distributed.launch \
+            --nnodes=1 \
+            --nproc_per_node={num_processes} \
+            --master_addr=localhost \
+            --master_port=29500 \
+            --use_env \
+            {self.path}/../classy_train.py \
+            --device={device} \
+            --config={self.config_files["sync_bn_config"]} \
+            --num_workers=4 \
+            --log_freq=100 \
+            --distributed_backend=ddp
+        """
+        result = subprocess.run(cmd, shell=True)
+        self.assertEqual(result.returncode, 0)
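
For reference, a minimal sketch (not part of the patch) of how the new option is turned on from a task config, based on the keys that `ClassificationTask.from_config` reads above. The variable name and the surrounding structure are illustrative only; the dataset, model, loss, and optimizer sections of a real config are omitted:

# Hypothetical task-config fragment; merge into a full classification_task config.
# The keys match what from_config reads in this patch; values are case-insensitive
# because from_config upper-cases them before indexing the enums.
sync_bn_task_config_fragment = {
    "broadcast_buffers": "disabled",       # any BroadcastBuffersMode name
    "batch_norm_sync_mode": "pytorch",     # "disabled" | "pytorch" | "apex"
}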
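
And a minimal standalone sketch of what `prepare()` now does for `BatchNormSyncMode.PYTORCH`: BatchNorm layers are converted to `torch.nn.SyncBatchNorm` before the model is wrapped for distributed training. The toy model and `local_rank` below are stand-ins, not ClassyVision code, and the DDP wrapping is left as a comment because it assumes an already-initialized process group (e.g. via `torch.distributed.launch`, as in the new test):

import torch.nn as nn

# Toy model with a BatchNorm layer (stand-in for task.base_model).
model = nn.Sequential(nn.Linear(8, 8), nn.BatchNorm1d(8), nn.ReLU())

# Same conversion the task performs when batch_norm_sync_mode == PYTORCH:
# every BatchNorm*d module is replaced by an nn.SyncBatchNorm equivalent.
model = nn.SyncBatchNorm.convert_sync_batchnorm(model)

# With a process group initialized and one GPU per process, the converted model
# would then be wrapped for distributed training, e.g.:
# model = nn.parallel.DistributedDataParallel(model.cuda(), device_ids=[local_rank])

The Apex path in `prepare()` is analogous, using `apex.parallel.convert_syncbn_model` instead of the PyTorch conversion.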