Feature/sg 128 kd recipe resnet50 (#213)

* add kd train with resnet50 * add kd train with resnet50 * wip * update acc and s3 path * wip * split train_from_recipe * change import * load new resnet50 weights * wip * changes
Deci-AI · Jun 8, 2022 · adf5dca · adf5dca
1 parent bd56ade
commit adf5dca
Show file tree

Hide file tree

Showing 14 changed files with 192 additions and 11 deletions.
diff --git a/src/super_gradients/__init__.py b/src/super_gradients/__init__.py
@@ -1,8 +1,9 @@
 from super_gradients.training import ARCHITECTURES, losses, utils, datasets_utils, DataAugmentation, \
-    TestDatasetInterface, SegmentationTestDatasetInterface, DetectionTestDatasetInterface, ClassificationTestDatasetInterface, SgModel
+    TestDatasetInterface, SegmentationTestDatasetInterface, DetectionTestDatasetInterface, ClassificationTestDatasetInterface, SgModel, KDModel
 from super_gradients.common import init_trainer, is_distributed
 from super_gradients.examples.train_from_recipe_example import train_from_recipe
+from super_gradients.examples.train_from_kd_recipe_example import train_from_kd_recipe
 
 __all__ = ['ARCHITECTURES', 'losses', 'utils', 'datasets_utils', 'DataAugmentation',
-           'TestDatasetInterface', 'SgModel', 'SegmentationTestDatasetInterface', 'DetectionTestDatasetInterface',
-           'ClassificationTestDatasetInterface', 'init_trainer', 'is_distributed', 'train_from_recipe']
+           'TestDatasetInterface', 'SgModel', 'KDModel', 'SegmentationTestDatasetInterface', 'DetectionTestDatasetInterface',
+           'ClassificationTestDatasetInterface', 'init_trainer', 'is_distributed', 'train_from_recipe', 'train_from_kd_recipe']
diff --git a/src/super_gradients/examples/train_from_kd_recipe_example/__init__.py b/src/super_gradients/examples/train_from_kd_recipe_example/__init__.py
diff --git a/src/super_gradients/examples/train_from_kd_recipe_example/train_from_kd_recipe.py b/src/super_gradients/examples/train_from_kd_recipe_example/train_from_kd_recipe.py
@@ -0,0 +1,22 @@
+"""
+Example code for running SuperGradients' knowledge distillation (KD) recipes.
+
+General use: python train_from_kd_recipe.py --config-name="DESIRED_RECIPE".
+For recipe's specific instructions and details refer to the recipe's configuration file in the recipes directory.
+"""
+
+import super_gradients
+from omegaconf import DictConfig
+import hydra
+import pkg_resources
+from super_gradients.training.kd_trainer import KDTrainer
+
+
+@hydra.main(config_path=pkg_resources.resource_filename("super_gradients.recipes", ""))
+def main(cfg: DictConfig) -> None:
+    """
+    Entry point wrapped by Hydra: hands the composed recipe configuration to KDTrainer.train.
+
+    The Hydra config_path is the directory of the super_gradients.recipes package, so the recipe named with
+    --config-name on the command line is looked up there.
+
+    :param cfg: The recipe configuration supplied by Hydra; passed unchanged to KDTrainer.train.
+    """
+    KDTrainer.train(cfg)
+
+
+if __name__ == "__main__":
+    super_gradients.init_trainer()
+    main()
diff --git a/src/super_gradients/recipes/imagenet_resnet50_kd.yaml b/src/super_gradients/recipes/imagenet_resnet50_kd.yaml
@@ -0,0 +1,88 @@
+#  ResNet50 ImageNet classification training with knowledge distillation (teacher: beit_base_patch16_224):
+#  This example trains with batch_size = 192 * 8 GPUs, total 1536.
+#  Training time on 8 x GeForce RTX A5000 is 9min / epoch.
+#  Reach => 81.91 Top1 accuracy.
+#
+#  Log and tensorboard at s3://deci-pretrained-models/KD_ResNet50_Beit_Base_ImageNet/average_model.pth
+
+# Instructions:
+# When running from the command line, first set the PYTHONPATH environment variable (replace "YOUR_LOCAL_PATH" with the path to the downloaded repo):
+#   export PYTHONPATH="YOUR_LOCAL_PATH"/super_gradients/:"YOUR_LOCAL_PATH"/super_gradients/src/
+# Then:
+#   python train_from_kd_recipe_example/train_from_kd_recipe.py --config-name=imagenet_resnet50_kd
+
+defaults:
+  - training_hyperparams: imagenet_resnet50_kd_train_params
+  - dataset_params: imagenet_dataset_params
+  - arch_params: default_arch_params
+  - checkpoint_params: default_checkpoint_params
+
+training_hyperparams:
+  loss: kd_loss
+  criterion_params:
+    distillation_loss_coeff: 0.8
+    task_loss_fn:
+      _target_: super_gradients.training.losses.label_smoothing_cross_entropy_loss.LabelSmoothingCrossEntropyLoss
+
+arch_params:
+  teacher_input_adapter:
+    _target_: super_gradients.training.utils.kd_model_utils.NormalizationAdapter
+    mean_original: [0.485, 0.456, 0.406]
+    std_original: [0.229, 0.224, 0.225]
+    mean_required: [0.5, 0.5, 0.5]
+    std_required: [0.5, 0.5, 0.5]
+
+student_arch_params:
+  num_classes: 1000
+
+teacher_arch_params:
+  num_classes: 1000
+  image_size: [224, 224]
+  patch_size: [16, 16]
+
+dataset_params:
+  batch_size: 192
+  val_batch_size: 256
+  random_erase_prob: 0
+  random_erase_value: random
+  train_interpolation: random
+  rand_augment_config_string: rand-m7-mstd0.5
+  cutmix: True
+  cutmix_params:
+    mixup_alpha: 0.2
+    cutmix_alpha: 1.0
+    label_smoothing: 0.1
+  aug_repeat_count: 3
+
+dataset_interface:
+  imagenet:
+    dataset_params: ${dataset_params}
+
+data_loader_num_workers: 8
+
+model_checkpoints_location: local
+load_checkpoint: False
+checkpoint_params:
+  load_checkpoint: ${load_checkpoint}
+  teacher_pretrained_weights: imagenet
+
+run_teacher_on_eval: True
+
+experiment_name: resnet50_imagenet_KD_Model
+
+ckpt_root_dir:
+
+multi_gpu:
+  _target_: super_gradients.training.sg_model.MultiGPUMode
+  value: DDP
+
+sg_model:
+  _target_: super_gradients.KDModel
+  experiment_name: ${experiment_name}
+  model_checkpoints_location: ${model_checkpoints_location}
+  ckpt_root_dir: ${ckpt_root_dir}
+  multi_gpu: ${multi_gpu}
+
+architecture: kd_module
+student_architecture: resnet50
+teacher_architecture: beit_base_patch16_224
diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_kd_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_kd_train_params.yaml
@@ -0,0 +1,24 @@
+defaults:
+  - default_train_params
+
+max_epochs: 610
+initial_lr: 5e-3
+lr_mode: cosine
+lr_warmup_epochs: 5
+lr_cooldown_epochs: 10
+ema: True
+mixed_precision: True
+zero_weight_decay_on_bias_and_bn: True
+optimizer: Lamb
+optimizer_params:
+  weight_decay: 0.02
+loss: cross_entropy
+train_metrics_list:                               # metrics computed during training
+  - _target_: super_gradients.training.metrics.Accuracy
+  - _target_: super_gradients.training.metrics.Top5
+valid_metrics_list:                               # metrics for evaluation
+  - _target_: super_gradients.training.metrics.Accuracy
+  - _target_: super_gradients.training.metrics.Top5
+loss_logging_items_names: ["Loss", "Task Loss", "Distillation Loss"]
+
+_convert_: all
diff --git a/src/super_gradients/training/__init__.py b/src/super_gradients/training/__init__.py
@@ -4,6 +4,7 @@
 from super_gradients.training.models import ARCHITECTURES
 from super_gradients.training.sg_model import SgModel, \
     MultiGPUMode, StrictLoad
+from super_gradients.training.kd_model import KDModel
 
 __all__ = ['distributed_training_utils', 'datasets_utils', 'DataAugmentation', 'DetectionDataSet', 'TestDatasetInterface',
-           'ARCHITECTURES', 'SgModel', 'MultiGPUMode', 'TestDatasetInterface', 'SegmentationTestDatasetInterface', 'DetectionTestDatasetInterface', 'ClassificationTestDatasetInterface', 'StrictLoad']
+           'ARCHITECTURES', 'SgModel', 'KDModel', 'MultiGPUMode', 'TestDatasetInterface', 'SegmentationTestDatasetInterface', 'DetectionTestDatasetInterface', 'ClassificationTestDatasetInterface', 'StrictLoad']
diff --git a/src/super_gradients/training/kd_model/__init__.py b/src/super_gradients/training/kd_model/__init__.py
@@ -0,0 +1,5 @@
+# PACKAGE IMPORTS FOR EXTERNAL USAGE
+
+from super_gradients.training.kd_model.kd_model import KDModel
+
+__all__ = ['KDModel']
diff --git a/src/super_gradients/training/kd_trainer.py b/src/super_gradients/training/kd_trainer.py
@@ -0,0 +1,16 @@
+from super_gradients.training.trainer import Trainer
+
+
+class KDTrainer(Trainer):
+    """
+    Class for running SuperGradients' recipes for KD Models.
+    See the train_from_kd_recipe example in the examples directory for a demonstration of its usage.
+    """
+
+    @classmethod
+    def build_model(cls, cfg):
+        """
+        Builds the network by calling cfg.sg_model.build_model with the student/teacher settings of the recipe.
+
+        :param cfg: Recipe configuration. The fields read here are sg_model, student_architecture,
+                    teacher_architecture, arch_params, student_arch_params, teacher_arch_params,
+                    checkpoint_params and run_teacher_on_eval.
+        """
+        cfg.sg_model.build_model(student_architecture=cfg.student_architecture,
+                                 teacher_architecture=cfg.teacher_architecture,
+                                 arch_params=cfg.arch_params, student_arch_params=cfg.student_arch_params,
+                                 teacher_arch_params=cfg.teacher_arch_params,
+                                 checkpoint_params=cfg.checkpoint_params, run_teacher_on_eval=cfg.run_teacher_on_eval)
diff --git a/src/super_gradients/training/losses/__init__.py b/src/super_gradients/training/losses/__init__.py
@@ -1,4 +1,5 @@
 from super_gradients.training.losses.focal_loss import FocalLoss
+from super_gradients.training.losses.kd_losses import KDLogitsLoss
 from super_gradients.training.losses.label_smoothing_cross_entropy_loss import LabelSmoothingCrossEntropyLoss
 from super_gradients.training.losses.r_squared_loss import RSquaredLoss
 from super_gradients.training.losses.shelfnet_ohem_loss import ShelfNetOHEMLoss
@@ -10,4 +11,4 @@
 from super_gradients.training.losses.all_losses import LOSSES
 
 __all__ = ['FocalLoss', 'LabelSmoothingCrossEntropyLoss', 'ShelfNetOHEMLoss', 'ShelfNetSemanticEncodingLoss',
-           'YoLoV3DetectionLoss', 'YoLoV5DetectionLoss', 'RSquaredLoss', 'SSDLoss', 'LOSSES', 'BCEDiceLoss']
+           'YoLoV3DetectionLoss', 'YoLoV5DetectionLoss', 'RSquaredLoss', 'SSDLoss', 'LOSSES', 'BCEDiceLoss', 'KDLogitsLoss']
diff --git a/src/super_gradients/training/losses/all_losses.py b/src/super_gradients/training/losses/all_losses.py
@@ -2,6 +2,7 @@
 
 from super_gradients.training.losses import LabelSmoothingCrossEntropyLoss, YoLoV3DetectionLoss, ShelfNetOHEMLoss, \
     ShelfNetSemanticEncodingLoss, RSquaredLoss, YoLoV5DetectionLoss, SSDLoss, BCEDiceLoss
+from super_gradients.training.losses.kd_losses import KDLogitsLoss
 from super_gradients.training.losses.stdc_loss import STDCLoss
 
 LOSSES = {"cross_entropy": LabelSmoothingCrossEntropyLoss,
@@ -13,5 +14,6 @@
           "yolo_v5_loss": YoLoV5DetectionLoss,
           "ssd_loss": SSDLoss,
           "stdc_loss": STDCLoss,
-          "bce_dice_loss": BCEDiceLoss
+          "bce_dice_loss": BCEDiceLoss,
+          "kd_loss": KDLogitsLoss
           }
diff --git a/src/super_gradients/training/pretrained_models.py b/src/super_gradients/training/pretrained_models.py
@@ -3,7 +3,7 @@
               "regnetY400_imagenet": "https://deci-pretrained-models.s3.amazonaws.com/RegnetY400/average_model_regnety400.pth",
               "regnetY200_imagenet": "https://deci-pretrained-models.s3.amazonaws.com/RegnetY200/average_model_regnety200.pth",
 
-              "resnet50_imagenet": "https://deci-pretrained-models.s3.amazonaws.com/ResNet50_ImageNet/average_model.pth",
+              "resnet50_imagenet": "https://deci-pretrained-models.s3.amazonaws.com/KD_ResNet50_Beit_Base_ImageNet/resnet.pth",
               "resnet34_imagenet": "https://deci-pretrained-models.s3.amazonaws.com/resent_34/average_model.pth",
               "resnet18_imagenet": "https://deci-pretrained-models.s3.amazonaws.com/resnet18/average_model.pth",
 

diff --git a/src/super_gradients/training/trainer.py b/src/super_gradients/training/trainer.py
@@ -84,8 +84,8 @@ def scale_params_for_yolov5(cfg):
         logger.info(log_msg)
         return cfg
 
-    @staticmethod
-    def train(cfg: DictConfig) -> None:
+    @classmethod
+    def train(cls, cfg: DictConfig) -> None:
         """
         Trains according to cfg recipe configuration.
 
@@ -99,11 +99,15 @@ def train(cfg: DictConfig) -> None:
         cfg.sg_model.connect_dataset_interface(cfg.dataset_interface, data_loader_num_workers=cfg.data_loader_num_workers)
 
         # BUILD NETWORK
-        cfg.sg_model.build_model(cfg.architecture, arch_params=cfg.arch_params, checkpoint_params=cfg.checkpoint_params)
+        cls.build_model(cfg)
 
         # FIXME: REMOVE PARAMETER MANIPULATION SPECIFIC FOR YOLO
         if str(cfg.architecture).startswith("yolo_v5"):
             cfg = Trainer.scale_params_for_yolov5(cfg)
 
         # TRAIN
         cfg.sg_model.train(training_params=cfg.training_hyperparams)
+
+    @classmethod
+    def build_model(cls, cfg):
+        """
+        Builds the network described by the recipe.
+
+        train() invokes this as cls.build_model(cfg), so a subclass that overrides it changes how the model is built.
+
+        :param cfg: Recipe configuration; cfg.architecture, cfg.arch_params and cfg.checkpoint_params are passed
+                    to cfg.sg_model.build_model.
+        """
+        cfg.sg_model.build_model(cfg.architecture, arch_params=cfg.arch_params, checkpoint_params=cfg.checkpoint_params)
diff --git a/src/super_gradients/training/utils/kd_model_utils.py b/src/super_gradients/training/utils/kd_model_utils.py
@@ -0,0 +1,17 @@
+import torch
+
+
+class NormalizationAdapter(torch.nn.Module):
+    """
+    Re-normalizes an input from one (mean, std) pair to another.
+
+    forward() returns (x + additive) * multiplier, where
+        additive   = (mean_original - mean_required) / std_original
+        multiplier = std_original / std_required
+    which is algebraically equal to ((x * std_original + mean_original) - mean_required) / std_required,
+    i.e. it undoes a normalization by (mean_original, std_original) and applies one by (mean_required, std_required).
+
+    :param mean_original: Values passed to torch.tensor; the mean of the normalization being undone.
+    :param std_original:  Values passed to torch.tensor; the std of the normalization being undone.
+    :param mean_required: Values passed to torch.tensor; the mean of the normalization being applied.
+    :param std_required:  Values passed to torch.tensor; the std of the normalization being applied.
+    """
+    def __init__(self, mean_original, std_original, mean_required, std_required):
+        super(NormalizationAdapter, self).__init__()
+        # Each statistic gets two trailing singleton dimensions (a length-C list becomes shape (C, 1, 1)),
+        # presumably so it broadcasts over (..., C, H, W) inputs - TODO confirm the expected input layout.
+        mean_original = torch.tensor(mean_original).unsqueeze(-1).unsqueeze(-1)
+        std_original = torch.tensor(std_original).unsqueeze(-1).unsqueeze(-1)
+        mean_required = torch.tensor(mean_required).unsqueeze(-1).unsqueeze(-1)
+        std_required = torch.tensor(std_required).unsqueeze(-1).unsqueeze(-1)
+
+        # NOTE(review): both constants are wrapped in torch.nn.Parameter, so they are returned by parameters()
+        # and require grad by default; if they are meant to stay fixed, register_buffer may fit better - confirm.
+        self.additive = torch.nn.Parameter((mean_original - mean_required) / std_original)
+        self.multiplier = torch.nn.Parameter(std_original / std_required)
+
+    def forward(self, x):
+        # Shift first, then scale; see the class docstring for the equivalent two-step form.
+        x = (x + self.additive) * self.multiplier
+        return x
diff --git a/tests/integration_tests/pretrained_models_test.py b/tests/integration_tests/pretrained_models_test.py
@@ -35,7 +35,7 @@ def setUp(self) -> None:
 
         self.imagenet21k_pretrained_ckpt_params = {"pretrained_weights": "imagenet21k"}
 
-        self.imagenet_pretrained_accuracies = {"resnet50": 0.7947,
+        self.imagenet_pretrained_accuracies = {"resnet50": 0.8191,
                                                "resnet34": 0.7413,
                                                "resnet18": 0.706,
                                                "repvgg_a0": 0.7205,