update tutorials

grantmerz · Aug 30, 2024 · b11c476 · b11c476
1 parent 12cb4db
commit b11c476
Show file tree

Hide file tree

Showing 18 changed files with 2,731 additions and 146 deletions.
diff --git a/configs/common/data/coco.py b/configs/common/data/coco.py
@@ -0,0 +1,48 @@
+from omegaconf import OmegaConf
+
+import detectron2.data.transforms as T
+from detectron2.config import LazyCall as L
+from detectron2.data import (
+    DatasetMapper,
+    build_detection_test_loader,
+    build_detection_train_loader,
+    get_detection_dataset_dicts,
+)
+from detectron2.evaluation import COCOEvaluator
+
+dataloader = OmegaConf.create()  # empty config; train / test / evaluator entries are attached below
+
+dataloader.train = L(build_detection_train_loader)(  # training loader over "coco_2017_train"
+    dataset=L(get_detection_dataset_dicts)(names="coco_2017_train"),
+    mapper=L(DatasetMapper)(
+        is_train=True,
+        augmentations=[
+            L(T.ResizeShortestEdge)(
+                short_edge_length=(640, 672, 704, 736, 768, 800),
+                sample_style="choice",  # presumably picks one of the listed lengths; confirm
+                max_size=1333,
+            ),
+            L(T.RandomFlip)(horizontal=True),
+        ],
+        image_format="BGR",
+        use_instance_mask=True,
+    ),
+    total_batch_size=16,
+    num_workers=4,
+)
+
+dataloader.test = L(build_detection_test_loader)(  # evaluation loader over "coco_2017_val"
+    dataset=L(get_detection_dataset_dicts)(names="coco_2017_val", filter_empty=False),
+    mapper=L(DatasetMapper)(
+        is_train=False,
+        augmentations=[
+            L(T.ResizeShortestEdge)(short_edge_length=800, max_size=1333),
+        ],
+        image_format="${...train.mapper.image_format}",  # relative interpolation: reuse train mapper's format
+    ),
+    num_workers=4,
+)
+
+dataloader.evaluator = L(COCOEvaluator)(
+    dataset_name="${..test.dataset.names}",  # relative interpolation: follows test.dataset.names
+)
diff --git a/configs/common/data/coco_keypoint.py b/configs/common/data/coco_keypoint.py
@@ -0,0 +1,13 @@
+from detectron2.data.detection_utils import create_keypoint_hflip_indices
+
+from .coco import dataloader  # base COCO dataloader config; overridden in place below
+
+dataloader.train.dataset.min_keypoints = 1
+dataloader.train.dataset.names = "keypoints_coco_2017_train"
+dataloader.test.dataset.names = "keypoints_coco_2017_val"
+
+dataloader.train.mapper.update(  # masks off, keypoints on; hflip indices come from a direct (non-lazy) call
+    use_instance_mask=False,
+    use_keypoint=True,
+    keypoint_hflip_indices=create_keypoint_hflip_indices(dataloader.train.dataset.names),
+)
diff --git a/configs/common/data/coco_panoptic_separated.py b/configs/common/data/coco_panoptic_separated.py
@@ -0,0 +1,26 @@
+from detectron2.config import LazyCall as L
+from detectron2.evaluation import (
+    COCOEvaluator,
+    COCOPanopticEvaluator,
+    DatasetEvaluators,
+    SemSegEvaluator,
+)
+
+from .coco import dataloader  # base COCO dataloader config; overridden in place below
+
+dataloader.train.dataset.names = "coco_2017_train_panoptic_separated"
+dataloader.train.dataset.filter_empty = False
+dataloader.test.dataset.names = "coco_2017_val_panoptic_separated"
+
+
+dataloader.evaluator = [  # overwrites the evaluator entry with a list of three evaluators
+    L(COCOEvaluator)(
+        dataset_name="${...test.dataset.names}",  # three dots here; presumably one extra for the list level, confirm
+    ),
+    L(SemSegEvaluator)(
+        dataset_name="${...test.dataset.names}",
+    ),
+    L(COCOPanopticEvaluator)(
+        dataset_name="${...test.dataset.names}",
+    ),
+]
diff --git a/configs/common/data/constants.py b/configs/common/data/constants.py
@@ -0,0 +1,9 @@
+constants = dict(  # pixel mean/std constants, in RGB and BGR channel order
+    imagenet_rgb256_mean=[123.675, 116.28, 103.53],
+    imagenet_rgb256_std=[58.395, 57.12, 57.375],
+    imagenet_bgr256_mean=[103.530, 116.280, 123.675],  # the RGB mean values in reversed order
+    # When using pre-trained models in Detectron1 or any MSRA models,
+    # std has been absorbed into its conv1 weights, so the std needs to be set to 1.
+    # Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std)
+    imagenet_bgr256_std=[1.0, 1.0, 1.0],
+)
diff --git a/configs/common/models/cascade_rcnn.py b/configs/common/models/cascade_rcnn.py
@@ -0,0 +1,36 @@
+from detectron2.config import LazyCall as L
+from detectron2.layers import ShapeSpec
+from detectron2.modeling.box_regression import Box2BoxTransform
+from detectron2.modeling.matcher import Matcher
+from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads
+
+from .mask_rcnn_fpn import model
+
+# arguments that don't exist for Cascade R-CNN
+[model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]]
+
+model.roi_heads.update(
+    _target_=CascadeROIHeads,
+    box_heads=[
+        L(FastRCNNConvFCHead)(
+            input_shape=ShapeSpec(channels=256, height=7, width=7),
+            conv_dims=[],
+            fc_dims=[1024, 1024],
+        )
+        for k in range(3)
+    ],
+    box_predictors=[
+        L(FastRCNNOutputLayers)(
+            input_shape=ShapeSpec(channels=1024),
+            test_score_thresh=0.05,
+            box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)),
+            cls_agnostic_bbox_reg=True,
+            num_classes="${...num_classes}",
+        )
+        for (w1, w2) in [(10, 5), (20, 10), (30, 15)]
+    ],
+    proposal_matchers=[
+        L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False)
+        for th in [0.5, 0.6, 0.7]
+    ],
+)
diff --git a/configs/common/models/fcos.py b/configs/common/models/fcos.py
@@ -0,0 +1,23 @@
+from detectron2.modeling.meta_arch.fcos import FCOS, FCOSHead
+
+from .retinanet import model  # base RetinaNet config (module not shown here); mutated in place below
+
+model._target_ = FCOS  # retarget the config at the FCOS class
+
+del model.anchor_generator  # these four entries are removed; presumably FCOS does not accept them, confirm
+del model.box2box_transform
+del model.anchor_matcher
+del model.input_format
+
+# Use P5 instead of C5 to compute P6/P7
+# (Sec 2.2 of https://arxiv.org/abs/2006.09214)
+model.backbone.top_block.in_feature = "p5"
+model.backbone.top_block.in_channels = 256
+
+# New score threshold determined based on sqrt(cls_score * centerness)
+model.test_score_thresh = 0.2
+model.test_nms_thresh = 0.6
+
+model.head._target_ = FCOSHead  # retarget the head config at FCOSHead
+del model.head.num_anchors
+model.head.norm = "GN"
diff --git a/configs/common/models/keypoint_rcnn_fpn.py b/configs/common/models/keypoint_rcnn_fpn.py
@@ -0,0 +1,33 @@
+from detectron2.config import LazyCall as L
+from detectron2.layers import ShapeSpec
+from detectron2.modeling.poolers import ROIPooler
+from detectron2.modeling.roi_heads import KRCNNConvDeconvUpsampleHead
+
+from .mask_rcnn_fpn import model
+
+[model.roi_heads.pop(x) for x in ["mask_in_features", "mask_pooler", "mask_head"]]
+
+model.roi_heads.update(
+    num_classes=1,
+    keypoint_in_features=["p2", "p3", "p4", "p5"],
+    keypoint_pooler=L(ROIPooler)(
+        output_size=14,
+        scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32),
+        sampling_ratio=0,
+        pooler_type="ROIAlignV2",
+    ),
+    keypoint_head=L(KRCNNConvDeconvUpsampleHead)(
+        input_shape=ShapeSpec(channels=256, width=14, height=14),
+        num_keypoints=17,
+        conv_dims=[512] * 8,
+        loss_normalizer="visible",
+    ),
+)
+
+# Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2.
+# 1000 proposals per-image is found to hurt box AP.
+# Therefore we increase it to 1500 per-image.
+model.proposal_generator.post_nms_topk = (1500, 1000)
+
+# Keypoint AP degrades (though box AP improves) when using plain L1 loss
+model.roi_heads.box_predictor.smooth_l1_beta = 0.5
diff --git a/configs/common/models/mask_rcnn_c4.py b/configs/common/models/mask_rcnn_c4.py
@@ -0,0 +1,90 @@
+from detectron2.config import LazyCall as L
+from detectron2.layers import ShapeSpec
+from detectron2.modeling.meta_arch import GeneralizedRCNN
+from detectron2.modeling.anchor_generator import DefaultAnchorGenerator
+from detectron2.modeling.backbone import BasicStem, BottleneckBlock, ResNet
+from detectron2.modeling.box_regression import Box2BoxTransform
+from detectron2.modeling.matcher import Matcher
+from detectron2.modeling.poolers import ROIPooler
+from detectron2.modeling.proposal_generator import RPN, StandardRPNHead
+from detectron2.modeling.roi_heads import (
+    FastRCNNOutputLayers,
+    MaskRCNNConvUpsampleHead,
+    Res5ROIHeads,
+)
+
+from ..data.constants import constants
+
+model = L(GeneralizedRCNN)(  # Mask R-CNN config with a single-feature ("res4") backbone and Res5ROIHeads
+    backbone=L(ResNet)(
+        stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"),
+        stages=L(ResNet.make_default_stages)(
+            depth=50,
+            stride_in_1x1=True,
+            norm="FrozenBN",
+        ),
+        out_features=["res4"],
+    ),
+    proposal_generator=L(RPN)(
+        in_features=["res4"],
+        head=L(StandardRPNHead)(in_channels=1024, num_anchors=15),  # 15 matches 5 sizes x 3 aspect ratios below
+        anchor_generator=L(DefaultAnchorGenerator)(
+            sizes=[[32, 64, 128, 256, 512]],
+            aspect_ratios=[0.5, 1.0, 2.0],
+            strides=[16],
+            offset=0.0,
+        ),
+        anchor_matcher=L(Matcher)(
+            thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=True
+        ),
+        box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]),
+        batch_size_per_image=256,
+        positive_fraction=0.5,
+        pre_nms_topk=(12000, 6000),  # presumably (train, test); confirm against RPN
+        post_nms_topk=(2000, 1000),
+        nms_thresh=0.7,
+    ),
+    roi_heads=L(Res5ROIHeads)(
+        num_classes=80,
+        batch_size_per_image=512,
+        positive_fraction=0.25,
+        proposal_matcher=L(Matcher)(
+            thresholds=[0.5], labels=[0, 1], allow_low_quality_matches=False
+        ),
+        in_features=["res4"],
+        pooler=L(ROIPooler)(
+            output_size=14,
+            scales=(1.0 / 16,),
+            sampling_ratio=0,
+            pooler_type="ROIAlignV2",
+        ),
+        res5=L(ResNet.make_stage)(
+            block_class=BottleneckBlock,
+            num_blocks=3,
+            stride_per_block=[2, 1, 1],
+            in_channels=1024,
+            bottleneck_channels=512,
+            out_channels=2048,
+            norm="FrozenBN",
+            stride_in_1x1=True,
+        ),
+        box_predictor=L(FastRCNNOutputLayers)(
+            input_shape=L(ShapeSpec)(channels="${...res5.out_channels}", height=1, width=1),
+            test_score_thresh=0.05,
+            box2box_transform=L(Box2BoxTransform)(weights=(10, 10, 5, 5)),
+            num_classes="${..num_classes}",  # relative interpolation: follows roi_heads.num_classes
+        ),
+        mask_head=L(MaskRCNNConvUpsampleHead)(
+            input_shape=L(ShapeSpec)(
+                channels="${...res5.out_channels}",  # relative interpolation: res5.out_channels above (2048)
+                width="${...pooler.output_size}",  # relative interpolation: pooler.output_size above (14)
+                height="${...pooler.output_size}",
+            ),
+            num_classes="${..num_classes}",
+            conv_dims=[256],
+        ),
+    ),
+    pixel_mean=constants.imagenet_bgr256_mean,  # NOTE(review): attribute access on a dict(...) value; presumably wrapped by the config loader, confirm
+    pixel_std=constants.imagenet_bgr256_std,
+    input_format="BGR",
+)
diff --git a/configs/common/models/mask_rcnn_fpn.py b/configs/common/models/mask_rcnn_fpn.py
@@ -0,0 +1,95 @@
+from detectron2.config import LazyCall as L
+from detectron2.layers import ShapeSpec
+from detectron2.modeling.meta_arch import GeneralizedRCNN
+from detectron2.modeling.anchor_generator import DefaultAnchorGenerator
+from detectron2.modeling.backbone.fpn import LastLevelMaxPool
+from detectron2.modeling.backbone import BasicStem, FPN, ResNet
+from detectron2.modeling.box_regression import Box2BoxTransform
+from detectron2.modeling.matcher import Matcher
+from detectron2.modeling.poolers import ROIPooler
+from detectron2.modeling.proposal_generator import RPN, StandardRPNHead
+from detectron2.modeling.roi_heads import (
+    StandardROIHeads,
+    FastRCNNOutputLayers,
+    MaskRCNNConvUpsampleHead,
+    FastRCNNConvFCHead,
+)
+
+from ..data.constants import constants
+
+model = L(GeneralizedRCNN)(  # Mask R-CNN config with an FPN-over-ResNet backbone and StandardROIHeads
+    backbone=L(FPN)(
+        bottom_up=L(ResNet)(
+            stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"),
+            stages=L(ResNet.make_default_stages)(
+                depth=50,
+                stride_in_1x1=True,
+                norm="FrozenBN",
+            ),
+            out_features=["res2", "res3", "res4", "res5"],
+        ),
+        in_features="${.bottom_up.out_features}",  # relative interpolation: sibling bottom_up's out_features
+        out_channels=256,
+        top_block=L(LastLevelMaxPool)(),
+    ),
+    proposal_generator=L(RPN)(
+        in_features=["p2", "p3", "p4", "p5", "p6"],
+        head=L(StandardRPNHead)(in_channels=256, num_anchors=3),  # 3 matches 1 size x 3 aspect ratios per level below
+        anchor_generator=L(DefaultAnchorGenerator)(
+            sizes=[[32], [64], [128], [256], [512]],
+            aspect_ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64],
+            offset=0.0,
+        ),
+        anchor_matcher=L(Matcher)(
+            thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=True
+        ),
+        box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]),
+        batch_size_per_image=256,
+        positive_fraction=0.5,
+        pre_nms_topk=(2000, 1000),  # presumably (train, test); confirm against RPN
+        post_nms_topk=(1000, 1000),
+        nms_thresh=0.7,
+    ),
+    roi_heads=L(StandardROIHeads)(
+        num_classes=80,
+        batch_size_per_image=512,
+        positive_fraction=0.25,
+        proposal_matcher=L(Matcher)(
+            thresholds=[0.5], labels=[0, 1], allow_low_quality_matches=False
+        ),
+        box_in_features=["p2", "p3", "p4", "p5"],
+        box_pooler=L(ROIPooler)(
+            output_size=7,
+            scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32),
+            sampling_ratio=0,
+            pooler_type="ROIAlignV2",
+        ),
+        box_head=L(FastRCNNConvFCHead)(
+            input_shape=ShapeSpec(channels=256, height=7, width=7),  # 7x7 matches box_pooler.output_size above
+            conv_dims=[],
+            fc_dims=[1024, 1024],
+        ),
+        box_predictor=L(FastRCNNOutputLayers)(
+            input_shape=ShapeSpec(channels=1024),  # 1024 matches the last fc_dims entry of box_head
+            test_score_thresh=0.05,
+            box2box_transform=L(Box2BoxTransform)(weights=(10, 10, 5, 5)),
+            num_classes="${..num_classes}",  # relative interpolation: follows roi_heads.num_classes
+        ),
+        mask_in_features=["p2", "p3", "p4", "p5"],
+        mask_pooler=L(ROIPooler)(
+            output_size=14,
+            scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32),
+            sampling_ratio=0,
+            pooler_type="ROIAlignV2",
+        ),
+        mask_head=L(MaskRCNNConvUpsampleHead)(
+            input_shape=ShapeSpec(channels=256, width=14, height=14),  # 14x14 matches mask_pooler.output_size above
+            num_classes="${..num_classes}",
+            conv_dims=[256, 256, 256, 256, 256],
+        ),
+    ),
+    pixel_mean=constants.imagenet_bgr256_mean,  # NOTE(review): attribute access on a dict(...) value; presumably wrapped by the config loader, confirm
+    pixel_std=constants.imagenet_bgr256_std,
+    input_format="BGR",
+)