[Feature] Support VOT2018 dataset #305

Merged · 19 commits · Nov 12, 2021
21 changes: 21 additions & 0 deletions configs/sot/siamese_rpn/siamese_rpn_r50_1x_vot2018.py
@@ -0,0 +1,21 @@
_base_ = ['./siamese_rpn_r50_1x_lasot.py']

# model settings
model = dict(
test_cfg=dict(
rpn=dict(penalty_k=0.04, window_influence=0.44, lr=0.33),
test_mode='VOT'))

data_root = 'data/'
# dataset settings
data = dict(
val=dict(
type='VOTDataset',
ann_file=data_root + 'vot2018/annotations/vot2018.json',
img_prefix=data_root + 'vot2018/data'),
test=dict(
type='VOTDataset',
ann_file=data_root + 'vot2018/annotations/vot2018.json',
img_prefix=data_root + 'vot2018/data'))
evaluation = dict(
metric=['track'], interval=1, start=10, rule='greater', save_best='eao')
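
For reference, a minimal sketch (not part of this diff) of how the merged config can be inspected, assuming mmcv is installed and the repo root is the working directory:

from mmcv import Config

cfg = Config.fromfile(
    'configs/sot/siamese_rpn/siamese_rpn_r50_1x_vot2018.py')
# The _base_ config is loaded first; the dicts above override it.
print(cfg.model.test_cfg.rpn.penalty_k)  # 0.04
print(cfg.data.test.type)  # 'VOTDataset'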
1 change: 1 addition & 0 deletions mmtrack/core/__init__.py
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .anchor import * # noqa: F401, F403
from .bbox import * # noqa: F401, F403
from .evaluation import * # noqa: F401, F403
from .motion import * # noqa: F401, F403
from .optimizer import * # noqa: F401, F403
4 changes: 4 additions & 0 deletions mmtrack/core/bbox/__init__.py
@@ -0,0 +1,4 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .transforms import bbox_cxcywh_to_x1y1wh, bbox_xyxy_to_x1y1wh, quad2bbox

__all__ = ['quad2bbox', 'bbox_cxcywh_to_x1y1wh', 'bbox_xyxy_to_x1y1wh']
66 changes: 66 additions & 0 deletions mmtrack/core/bbox/transforms.py
@@ -0,0 +1,66 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmdet.core.bbox.transforms import bbox_xyxy_to_cxcywh


def quad2bbox(quad):
"""Convert quadrilateral to axis aligned box in [cx, cy, w, h] format.

Args:
quad (Tensor): of shape (N, 8), (8, ), (N, 4) or (4, ). The
coordinates are in [x1, y1, x2, y2, x3, y3, x4, y4] or
[tl_x, tl_y, br_x, br_y] format.
Returns:
Tensor: in [cx, cy, w, h] format.
"""
if len(quad.shape) == 1:
quad = quad.unsqueeze(0)
length = quad.shape[1]
if length == 8:
cx = torch.mean(quad[:, 0::2], dim=-1)
cy = torch.mean(quad[:, 1::2], dim=-1)
x1 = torch.min(quad[:, 0::2], dim=-1)[0]
x2 = torch.max(quad[:, 0::2], dim=-1)[0]
y1 = torch.min(quad[:, 1::2], dim=-1)[0]
y2 = torch.max(quad[:, 1::2], dim=-1)[0]
area1 = torch.norm(quad[:, 0:2] - quad[:, 2:4], dim=1) * \
torch.norm(quad[:, 2:4] - quad[:, 4:6], dim=1)
area2 = (x2 - x1) * (y2 - y1)
scale_factor = torch.sqrt(area1 / area2)
w = scale_factor * (x2 - x1)
h = scale_factor * (y2 - y1)
bbox = torch.stack((cx, cy, w, h), dim=-1).squeeze(0)
elif length == 4:
bbox = bbox_xyxy_to_cxcywh(quad).squeeze(0)
    else:
        raise NotImplementedError(
            f'The length of quadrilateral: {length} is not supported')
return bbox
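
A quick sanity check (illustrative, not part of this diff): an axis-aligned square passes through unchanged because area1 equals area2 and the scale factor is 1:

import torch

quad = torch.tensor([0., 0., 2., 0., 2., 2., 0., 2.])
print(quad2bbox(quad))  # tensor([1., 1., 2., 2.]) in (cx, cy, w, h)
# For a rotated rectangle, area1 < area2, so the enclosing box is shrunk
# by sqrt(area1 / area2) to better approximate the target extent.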


def bbox_cxcywh_to_x1y1wh(bbox):
"""Convert bbox coordinates from (cx, cy, w, h) to (x1, y1, w, h).

Args:
bbox (Tensor): Shape (n, 4) or (4, ) for bboxes.

Returns:
Tensor: Converted bboxes.
"""
cx, cy, w, h = bbox.split((1, 1, 1, 1), dim=-1)
bbox_new = [(cx - 0.5 * w), (cy - 0.5 * h), w, h]
return torch.cat(bbox_new, dim=-1)


def bbox_xyxy_to_x1y1wh(bbox):
"""Convert bbox coordinates from (x1, y1, x2, y2) to (x1, y1, w, h).

Args:
bbox (Tensor): Shape (n, 4) or (4, ) for bboxes.

Returns:
Tensor: Converted bboxes.
"""
x1, y1, x2, y2 = bbox.split((1, 1, 1, 1), dim=-1)
bbox_new = [x1, y1, (x2 - x1), (y2 - y1)]
return torch.cat(bbox_new, dim=-1)
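
A small illustrative check of the two converters (not part of this diff); both target the (x1, y1, w, h) layout expected by the VOT evaluation code:

import torch

print(bbox_xyxy_to_x1y1wh(torch.tensor([10., 20., 50., 80.])))
# tensor([10., 20., 40., 60.])
print(bbox_cxcywh_to_x1y1wh(torch.tensor([30., 50., 40., 60.])))
# tensor([10., 20., 40., 60.]), i.e. the same box in both input formats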
7 changes: 6 additions & 1 deletion mmtrack/core/evaluation/__init__.py
@@ -2,5 +2,10 @@
from .eval_hooks import DistEvalHook, EvalHook
from .eval_mot import eval_mot
from .eval_sot_ope import eval_sot_ope
from .eval_sot_vot import (bbox2region, eval_sot_accuracy_robustness,
eval_sot_eao)

__all__ = ['EvalHook', 'DistEvalHook', 'eval_mot', 'eval_sot_ope']
__all__ = [
'EvalHook', 'DistEvalHook', 'eval_mot', 'eval_sot_ope', 'bbox2region',
'eval_sot_eao', 'eval_sot_accuracy_robustness'
]
281 changes: 281 additions & 0 deletions mmtrack/core/evaluation/eval_sot_vot.py
@@ -0,0 +1,281 @@
import numpy as np
from vot.analysis import is_special
from vot.region import Polygon, Rectangle, Special
from vot.region import calculate_overlaps as calculate_region_overlaps


def bbox2region(bbox):
"""Convert bbox to Rectangle or Polygon Class object.

Args:
bbox (ndarray): the format of rectangle bbox is (x1, y1, w, h);
the format of polygon is (x1, y1, x2, y2, ...).

Returns:
Rectangle or Polygon Class object.
"""
if len(bbox) == 1:
return Special(bbox[0])
elif len(bbox) == 4:
return Rectangle(bbox[0], bbox[1], bbox[2], bbox[3])
elif len(bbox) % 2 == 0 and len(bbox) > 4:
return Polygon([(x_, y_) for x_, y_ in zip(bbox[::2], bbox[1::2])])
else:
raise NotImplementedError(
f'The length of bbox is {len(bbox)}, which is not supported')
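
The dispatch can be exercised directly (illustrative; requires the vot-toolkit package providing vot.region, imported at the top of this file):

import numpy as np

assert isinstance(bbox2region(np.array([1.])), Special)  # special state
assert isinstance(bbox2region(np.array([10., 20., 30., 40.])), Rectangle)
assert isinstance(
    bbox2region(np.array([0., 0., 2., 0., 2., 2., 0., 2.])), Polygon)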


def trajectory2region(trajectory):
"""Convert bbox trajectory to Rectangle or Polygon Class object trajectory.

Args:
trajectory (list[ndarray]): The outer list contains bbox of
each frame in a video. The bbox is a ndarray.

Returns:
List: contains the Region Class object of each frame in a
trajectory.
"""
traj_region = []
for bbox in trajectory:
traj_region.append(bbox2region(bbox))
return traj_region


def locate_failures_inits(trajectory):
"""locate the failure frame and initialized frame in a trajectory.

Args:
trajectory (list[ndarray]): list of tracking results.

Returns:
        fail_inds (list): indices of the failed frames in a trajectory.
        init_inds (list): indices of the initialization frames in a
            trajectory.
"""
fail_inds = []
init_inds = []
for i, bbox in enumerate(trajectory):
if len(bbox) == 1:
if bbox[0] == 1.:
init_inds.append(i)
elif bbox[0] == 2.:
fail_inds.append(i)
return fail_inds, init_inds


def count_failures(trajectory):
"""count the number of failed frame in a trajectory.

Args:
trajectory (list[ndarray]): list of tracking results.

Returns:
        int: the number of failed frames in a trajectory.
"""
num_fails = 0
for bbox in trajectory:
if len(bbox) == 1 and bbox[0] == 2.:
num_fails += 1
return num_fails
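
An illustrative trajectory (not part of this diff) showing the special-state encoding shared by both helpers above:

import numpy as np

traj = [
    np.array([1.]),                # frame 0: initialization
    np.array([10., 10., 5., 5.]),  # frame 1: normal bbox in (x1, y1, w, h)
    np.array([2.]),                # frame 2: tracking failure
    np.array([0.]),                # frame 3: unknown (skipped after failure)
    np.array([1.]),                # frame 4: re-initialization
]
print(locate_failures_inits(traj))  # ([2], [0, 4])
print(count_failures(traj))  # 1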


def calc_accuracy(gt_trajectory,
pred_trajectory,
burnin=10,
ignore_unknown=True,
video_wh=None):
"""Calculate accuracy over the sequence.

Args:
gt_trajectory (list[list]): list of bboxes
pred_trajectory (list[ndarray]): The outer list contains the
tracking results of each frame in one video. The ndarray has two
cases:
- bbox: denotes the normal tracking box in [x1, y1, w, h]
format.
- special tracking state: [0] denotes the unknown state,
namely the skipping frame after failure, [1] denotes the
initialized state, and [2] denotes the failed state.
        burnin (int): number of frames that have to be ignored after
            re-initialization when calculating accuracy. Default is 10.
        ignore_unknown (bool): whether to ignore the skipped frames after
            failures when calculating accuracy. Default is True.
        video_wh (tuple): bounding region as (width, height). Default is None.

    Returns:
        float: accuracy over the sequence.
"""
pred_traj_region = trajectory2region(pred_trajectory)
gt_traj_region = trajectory2region(gt_trajectory)
overlaps = np.array(
calculate_region_overlaps(pred_traj_region, gt_traj_region, video_wh))
mask = np.ones(len(overlaps), dtype=bool)

for i, region in enumerate(pred_traj_region):
if is_special(region, Special.UNKNOWN) and ignore_unknown:
mask[i] = False
elif is_special(region, Special.INITIALIZATION):
for j in range(i, min(len(pred_traj_region), i + burnin)):
mask[j] = False
elif is_special(region, Special.FAILURE):
mask[i] = False
return np.mean(overlaps[mask]) if any(mask) else 0.
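
A sketch of the masking on a 5-frame video (illustrative; the exact overlaps depend on the vot-toolkit overlap implementation): with burnin=2, the initialization at frame 0 masks out frames 0-1 and the failure frame is masked out too, so only frames 2-3 contribute:

import numpy as np

gt = [np.array([0., 0., 10., 10.])] * 5
pred = [
    np.array([1.]),                # init; frames 0-1 masked by burnin
    np.array([0., 0., 10., 10.]),  # masked by burnin
    np.array([0., 0., 10., 10.]),  # IoU 1.0
    np.array([5., 5., 10., 10.]),  # IoU 25 / 175
    np.array([2.]),                # failure, masked
]
print(calc_accuracy(gt, pred, burnin=2, video_wh=(100, 100)))
# ~ (1.0 + 25 / 175) / 2 ~ 0.57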


def eval_sot_accuracy_robustness(results,
annotations,
burnin=10,
ignore_unknown=True,
videos_wh=None):
"""Calculate accuracy and robustness over all tracking sequences.

Args:
results (list[list[ndarray]]): The first list contains the
tracking results of each video. The second list contains the
            tracking results of each frame in one video. The ndarray has two
cases:
- bbox: denotes the normal tracking box in [x1, y1, w, h]
format.
- special tracking state: [0] denotes the unknown state,
namely the skipping frame after failure, [1] denotes the
initialized state, and [2] denotes the failed state.
annotations (list[list[dict]]): The first list contains the
gt_bboxes of each video. The second list contains the
gt_bbox of each frame in one video. The dict contains the
annotation information of one frame.
        burnin (int): number of frames that have to be ignored after
            re-initialization when calculating accuracy. Default is 10.
        ignore_unknown (bool): whether to ignore the skipped frames after
            failures when calculating accuracy. Default is True.
videos_wh (list[tuple(width, height), ...]): The list contains the
width and height of each video. Default is None.

    Returns:
        dict[str, float]: accuracy and robustness in EAO evaluation metric.
"""
accuracy = 0
num_fails = 0
weight = 0
for i, (gt_traj, pred_traj) in enumerate(zip(annotations, results)):
gt_traj = np.stack([ann['bboxes'] for ann in gt_traj])
assert len(gt_traj) == len(pred_traj)
assert len(pred_traj[0]) == 1 and pred_traj[0][0] == 1
num_fails += count_failures(pred_traj)
accuracy += calc_accuracy(
gt_traj,
pred_traj,
burnin=burnin,
ignore_unknown=ignore_unknown,
video_wh=videos_wh[i]) * len(pred_traj)
weight += len(pred_traj)

accuracy /= weight
robustness = num_fails / weight * 100
return dict(accuracy=accuracy, robustness=robustness, num_fails=num_fails)
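
A worked example of the weighting (illustrative numbers): two videos of 100 and 300 frames with per-video accuracies 0.6 and 0.5 and 2 failures in total give accuracy = (0.6 * 100 + 0.5 * 300) / 400 = 0.525 and robustness = 2 / 400 * 100 = 0.5, i.e. robustness is reported as the number of failures per 100 frames.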


def calc_eao_curve(overlaps, successes):
"""Calculate EAO curve over all tracking sequences.

Args:
overlaps (list[list]): The outer list contains the overlaps of each
video. The inner list contains the overlap of each frame in one
video.
        successes (list): the tracking state of the last frame in each
            fragment, i.e. False if the fragment ended with a failure and
            True if the tracker reached the end of the sequence.

    Returns:
        ndarray: the N-th element denotes the average overlap from frame 1
            to N over all fragments.
"""
    max_length = max(len(overlap) for overlap in overlaps)
total_runs = len(overlaps)

overlaps_array = np.zeros((total_runs, max_length), dtype=np.float32)
    # mask out frames which are not considered in EAO calculation. Initial
    # values are zero, meaning ignored.
mask = np.zeros((total_runs, max_length), dtype=np.float32)
for i, (overlap, success) in enumerate(zip(overlaps, successes)):
overlaps_array[i, :len(overlap)] = np.array(overlap)
if not success:
# tracker has failed during this sequence - consider all of
# 'overlaps_array' and use the default padding from the end of
# sequence to max length.
mask[i, :] = 1
else:
# tracker has successfully tracked to the end - consider only this
# part of the true sequence, and ignore the padding from the end of
# sequence to max length.
mask[i, :len(overlap)] = 1

overlaps_array_sum = overlaps_array.copy()
    # overlaps_array_sum[i, j] is the mean overlap of frames 1 to j in the
    # i-th fragment; frame 0 (the initialization frame) is skipped.
for j in range(1, overlaps_array_sum.shape[1]):
overlaps_array_sum[:, j] = np.mean(overlaps_array[:, 1:j + 1], axis=1)

return np.sum(overlaps_array_sum * mask, axis=0) / np.sum(mask, axis=0)
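
A toy check of the curve (pure numpy, not part of this diff): one fragment that failed after three frames and one that reached the end of its sequence. The failed fragment is zero-padded to the maximum length and those zeros count towards its running mean:

curve = calc_eao_curve(
    [[1.0, 0.8, 0.6], [1.0, 0.9, 0.7, 0.5]], [False, True])
print(curve)
# ~ [1.0, 0.85, 0.75, 0.58]; index 0 holds the raw overlap of the
# initialization frame, which is excluded from the running means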


def eval_sot_eao(results, annotations, interval=[100, 356], videos_wh=None):
"""Calculate EAO socre over all tracking sequences.

Args:
results (list[list[ndarray]]): The first list contains the
tracking results of each video. The second list contains the
            tracking results of each frame in one video. The ndarray has two
cases:
- bbox: denotes the normal tracking box in [x1, y1, w, h]
format.
- special tracking state: [0] denotes the unknown state,
namely the skipping frame after failure, [1] denotes the
initialized state, and [2] denotes the failed state.
annotations (list[list[dict]]): The first list contains the
gt_bboxes of each video. The second list contains the
gt_bbox of each frame in one video. The dict contains the
annotation information of one frame.
        interval (list[int]): the specified interval of the EAO curve used to
            calculate the EAO score. Different VOT challenges use different
            settings. Default is the VOT2018 setting: [100, 356].
videos_wh (list[tuple(width, height), ...]): The list contains the
width and height of each video. Default is None.

Return:
dict[str, float]: EAO score in EAO evaluation metric.
"""
if videos_wh is None:
videos_wh = [None] * len(annotations)

all_overlaps = []
all_successes = []

for i, (gt_traj, pred_traj) in enumerate(zip(annotations, results)):
gt_traj = np.stack([ann['bboxes'] for ann in gt_traj])
assert len(gt_traj) == len(pred_traj)
# initialized bbox annotation is [1]
assert len(pred_traj[0]) == 1 and pred_traj[0][0] == 1
fail_inds, init_inds = locate_failures_inits(pred_traj)

pred_traj = trajectory2region(pred_traj)
gt_traj = trajectory2region(gt_traj)
overlaps = calculate_region_overlaps(pred_traj, gt_traj, videos_wh[i])

        if len(fail_inds) > 0:
            # use a separate index so the outer loop variable `i`, which
            # still indexes `videos_wh`, is not shadowed
            for k in range(len(fail_inds)):
                all_overlaps.append(overlaps[init_inds[k]:fail_inds[k]])
                all_successes.append(False)

# handle last initialization
if len(init_inds) > len(fail_inds):
# tracker was initialized, but it has not failed until the end
# of the sequence
all_overlaps.append(overlaps[init_inds[-1]:])
all_successes.append(True)
else:
all_overlaps.append(overlaps)
all_successes.append(True)

eao_curve = calc_eao_curve(all_overlaps, all_successes)
eao_score = np.mean(eao_curve[interval[0]:interval[1] + 1])
eao = dict(eao=eao_score)
return eao
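
End to end, the metric can be smoke-tested on a synthetic one-video case (a sketch; requires vot-toolkit, and the toy interval replaces the VOT2018 default of [100, 356]):

import numpy as np

annotations = [[dict(bboxes=np.array([0., 0., 10., 10.]))] * 3]
results = [[
    np.array([1.]),  # the first frame must be the initialization state
    np.array([0., 0., 10., 10.]),
    np.array([0., 0., 10., 10.]),
]]
print(eval_sot_eao(results, annotations,
                   interval=[1, 2], videos_wh=[(50, 50)]))
# ~ {'eao': 1.0}, since the tracker matches the gt on all scored frames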