open-mmlab · hellock · Aug 31, 2020 · Nov 19, 2019 · Nov 20, 2019 · Dec 2, 2019
diff --git a/configs/yolo/README.md b/configs/yolo/README.md
@@ -0,0 +1,24 @@
+# YOLOv3
+
+## Introduction
+```
+@misc{redmon2018yolov3,
+    title={YOLOv3: An Incremental Improvement},
+    author={Joseph Redmon and Ali Farhadi},
+    year={2018},
+    eprint={1804.02767},
+    archivePrefix={arXiv},
+    primaryClass={cs.CV}
+}
+```
+
+## Results and Models
+
+|    Backbone     | Train Scale  | Lr schd | Mem (GB) | Eval Scale | Inf time (fps) | box AP | Download |
+| :-------------: | :----------: | :-----: | :------: | :--------: | :------------: | :----: |:--------:|
+|   DarkNet-53    | Multi-Scale  |  273e   |   1.8    | 608 * 608  |      44        |**37.6**| [model](https://drive.google.com/file/d/1Ca27fP4hlBFduMCv5b_f-0J9EdfxCgPb/view?usp=sharing) &#124; [log](https://github.com/open-mmlab/mmdetection/files/4910982/log.zip) |
+|        -        |      -       |     -   |  -       | 416 * 416  |      **64**    |  34.8  | - |
+
+
+## Credit
+This implementation originates from the project of Haoyu Wu (@wuhy08) at Western Digital.
diff --git a/configs/yolo/yolov3_d53_yolo_mstrain_273e_coco.py b/configs/yolo/yolov3_d53_yolo_mstrain_273e_coco.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2019 Western Digital Corporation or its affiliates.
+_base_ = [
+    '../_base_/default_runtime.py',
+]
+# model settings: Darknet backbone -> YOLOV3Neck -> YOLOV3Head
+model = dict(
+    type='YOLOV3',
+    pretrained='./work_dirs/darknet_state_dict_only.pth',  # local path
+    backbone=dict(
+        type='Darknet',
+        depth=53,
+        out_indices=(3, 4, 5),
+    ),
+    neck=dict(
+        type='YOLOV3Neck',
+        num_scales=3,
+        in_channels=[1024, 512, 256],
+        out_channels=[512, 256, 128],
+    ),
+    bbox_head=dict(
+        type='YOLOV3Head',
+        num_classes=80,
+        num_scales=3,
+        num_anchors_per_scale=3,
+        in_channels=[512, 256, 128],
+        out_channels=[1024, 512, 256],
+        strides=[32, 16, 8],
+        anchor_base_sizes=[
+            [(116, 90), (156, 198), (373, 326)],
+            [(30, 61), (62, 45), (59, 119)],
+            [(10, 13), (16, 30), (33, 23)],
+        ],
+    ))
+# training and testing settings
+train_cfg = dict(
+    one_hot_smoother=0., ignore_config=0.5, xy_use_logit=False, debug=False)
+test_cfg = dict(
+    nms_pre=1000,
+    min_bbox_size=0,
+    score_thr=0.05,
+    conf_thr=0.005,
+    nms=dict(type='nms', iou_thr=0.45),
+    max_per_img=100)
+# dataset settings (COCO 2017; val2017 serves as both val and test)
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='PhotoMetricDistortion'),
+    dict(
+        type='Expand',
+        mean=img_norm_cfg['mean'],
+        to_rgb=img_norm_cfg['to_rgb'],
+        ratio_range=(1, 2)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', img_scale=[(320, 320), (608, 608)], keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(608, 608),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=train_pipeline,
+    ),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline,
+    ),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline,
+    ))
+# optimizer
+optimizer = dict(type='SGD', lr=5e-4, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy: linear warmup, then step decay
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=2000,  # same as burn-in in darknet
+    warmup_ratio=0.1,
+    step=[218, 246])
+# runtime settings
+total_epochs = 273
+work_dir = './work_dirs/yolo_pretrained'
+evaluation = dict(interval=1, metric=['bbox'])
+find_unused_parameters = True
diff --git a/mmdet/models/backbones/__init__.py b/mmdet/models/backbones/__init__.py
@@ -1,3 +1,4 @@
+from .darknet import Darknet
 from .detectors_resnet import DetectoRS_ResNet
 from .detectors_resnext import DetectoRS_ResNeXt
 from .hourglass import HourglassNet
@@ -10,5 +11,5 @@
 
 __all__ = [
     'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet', 'Res2Net',
-    'HourglassNet', 'DetectoRS_ResNet', 'DetectoRS_ResNeXt'
+    'HourglassNet', 'DetectoRS_ResNet', 'DetectoRS_ResNeXt', 'Darknet'
 ]
diff --git a/mmdet/models/backbones/darknet.py b/mmdet/models/backbones/darknet.py
@@ -0,0 +1,190 @@
+# Copyright (c) 2019 Western Digital Corporation or its affiliates.
+
+import logging
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule, constant_init, kaiming_init
+from mmcv.runner import load_checkpoint
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from ..builder import BACKBONES
+
+
+class ResBlock(nn.Module):
+    """Basic residual block used in YOLOv3.
+
+    The input passes through a 1x1 ConvModule that halves the channels and a
+    3x3 ConvModule (padding 1) that restores them; the block input is then
+    added to the result. With the default configs each ConvModule is
+    Conv + BN + LeakyReLU.
+
+    Args:
+        in_channels (int): The input channels. Must be even.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: dict(type='BN', requires_grad=True)
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='LeakyReLU', negative_slope=0.1).
+    """
+
+    def __init__(self,
+                 in_channels,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 act_cfg=dict(type='LeakyReLU', negative_slope=0.1)):
+        super(ResBlock, self).__init__()
+        assert in_channels % 2 == 0  # ensure the in_channels is even
+        half_in_channels = in_channels // 2
+
+        # kwargs shared by both ConvModules
+        cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+
+        self.conv1 = ConvModule(in_channels, half_in_channels, 1, **cfg)
+        self.conv2 = ConvModule(
+            half_in_channels, in_channels, 3, padding=1, **cfg)
+
+    def forward(self, x):
+        """Return ``conv2(conv1(x)) + x``."""
+        residual = x
+        out = self.conv1(x)
+        out = self.conv2(out)
+        out += residual
+        return out
+
+
+def make_conv_and_res_block(in_channels,
+                            out_channels,
+                            res_repeat,
+                            conv_cfg=None,
+                            norm_cfg=dict(type='BN', requires_grad=True),
+                            act_cfg=dict(type='LeakyReLU',
+                                         negative_slope=0.1)):
+    """Build one Darknet stage: a 3x3 stride-2 ConvModule, then ResBlocks.
+
+    Returns an ``nn.Sequential`` with children ``conv`` (``in_channels`` ->
+    ``out_channels``, padding 1) and ``res0`` ... ``res{res_repeat - 1}``.
+
+    Args:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        res_repeat (int): The number of ResBlocks.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: dict(type='BN', requires_grad=True)
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='LeakyReLU', negative_slope=0.1).
+    """
+
+    cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+
+    model = nn.Sequential()
+    model.add_module(
+        'conv',
+        ConvModule(in_channels, out_channels, 3, stride=2, padding=1, **cfg))
+    for idx in range(res_repeat):
+        model.add_module('res{}'.format(idx), ResBlock(out_channels, **cfg))
+    return model
+
+
+@BACKBONES.register_module()
+class Darknet(nn.Module):
+    """Darknet backbone.
+
+    Args:
+        depth (int): Depth of Darknet. Currently only 53 is supported.
+        out_indices (Sequence[int]): Indices of stages to output (0 = conv1).
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: dict(type='BN', requires_grad=True)
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='LeakyReLU', negative_slope=0.1).
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Default: True.
+
+    Example:
+        >>> from mmdet.models import Darknet
+        >>> import torch
+        >>> self = Darknet(depth=53)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 416, 416)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        ...
+        (1, 256, 52, 52)
+        (1, 512, 26, 26)
+        (1, 1024, 13, 13)
+    """
+
+    # depth -> (ResBlocks per stage, (in, out) channels per stage)
+    arch_settings = {
+        53: ((1, 2, 8, 8, 4), ((32, 64), (64, 128), (128, 256), (256, 512),
+                               (512, 1024)))
+    }
+
+    def __init__(self,
+                 depth=53,
+                 out_indices=(3, 4, 5),
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 act_cfg=dict(type='LeakyReLU', negative_slope=0.1),
+                 norm_eval=True):
+        super(Darknet, self).__init__()
+        if depth not in self.arch_settings:
+            raise KeyError(f'invalid depth {depth} for darknet')
+        self.depth = depth
+        self.out_indices = out_indices
+        self.layers, self.channels = self.arch_settings[depth]
+
+        cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+
+        self.conv1 = ConvModule(3, 32, 3, padding=1, **cfg)
+        # stage names in forward order; out_indices index into this list
+        self.cr_blocks = ['conv1']
+        for i, n_layers in enumerate(self.layers):
+            layer_name = f'cr_block{i + 1}'
+            in_c, out_c = self.channels[i]
+            self.add_module(
+                layer_name,
+                make_conv_and_res_block(in_c, out_c, n_layers, **cfg))
+            self.cr_blocks.append(layer_name)
+
+        self.norm_eval = norm_eval
+
+    def forward(self, x):
+        """Run all stages in order; return outputs picked by out_indices."""
+        outs = []
+        for i, layer_name in enumerate(self.cr_blocks):
+            cr_block = getattr(self, layer_name)
+            x = cr_block(x)
+            if i in self.out_indices:
+                outs.append(x)
+        return tuple(outs)
+
+    def init_weights(self, pretrained=None):
+        """Load a checkpoint path, or initialize from scratch when None."""
+        if isinstance(pretrained, str):
+            logger = logging.getLogger()
+            load_checkpoint(self, pretrained, strict=False, logger=logger)
+        elif pretrained is None:
+            for m in self.modules():
+                if isinstance(m, nn.Conv2d):
+                    kaiming_init(m)
+                elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
+                    constant_init(m, 1)
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+    def _freeze_stages(self):
+        for param in self.parameters():  # NOTE(review): all params, intended?
+            param.requires_grad = False
+
+    def train(self, mode=True):
+        super(Darknet, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # trick: eval() only has an effect on BatchNorm layers
+                if isinstance(m, _BatchNorm):
+                    m.eval()
diff --git a/mmdet/models/dense_heads/__init__.py b/mmdet/models/dense_heads/__init__.py
@@ -17,11 +17,12 @@
 from .retina_sepbn_head import RetinaSepBNHead
 from .rpn_head import RPNHead
 from .ssd_head import SSDHead
+from .yolo_head import YOLOV3Head
 
 __all__ = [
     'AnchorFreeHead', 'AnchorHead', 'GuidedAnchorHead', 'FeatureAdaption',
     'RPNHead', 'GARPNHead', 'RetinaHead', 'RetinaSepBNHead', 'GARetinaHead',
     'SSDHead', 'FCOSHead', 'RepPointsHead', 'FoveaHead',
     'FreeAnchorRetinaHead', 'ATSSHead', 'FSAFHead', 'NASFCOSHead',
-    'PISARetinaHead', 'PISASSDHead', 'GFLHead'
+    'PISARetinaHead', 'PISASSDHead', 'GFLHead', 'YOLOV3Head'
 ]