[Feature] Support VOT2018 dataset #305

Merged · 19 commits · Nov 12, 2021
21 changes: 21 additions & 0 deletions configs/sot/siamese_rpn/siamese_rpn_r50_1x_vot2018.py
@@ -0,0 +1,21 @@
_base_ = ['./siamese_rpn_r50_1x_lasot.py']

# model settings
model = dict(
test_cfg=dict(
rpn=dict(penalty_k=0.04, window_influence=0.44, lr=0.33),
test_mode='VOT'))

data_root = 'data/'
# dataset settings
data = dict(
val=dict(
type='VOTDataset',
ann_file=data_root + 'vot2018/annotations/vot2018.json',
img_prefix=data_root + 'vot2018/data'),
test=dict(
type='VOTDataset',
ann_file=data_root + 'vot2018/annotations/vot2018.json',
img_prefix=data_root + 'vot2018/data'))
evaluation = dict(
metric=['track'], interval=1, start=10, rule='greater', save_best='eao')
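
For reference, a minimal sketch (not part of this diff) of how the merged config can be inspected, assuming mmcv is installed and the repo root is the working directory:

from mmcv import Config

cfg = Config.fromfile(
    'configs/sot/siamese_rpn/siamese_rpn_r50_1x_vot2018.py')
# The _base_ config is loaded first; the dicts above override it.
print(cfg.model.test_cfg.rpn.penalty_k)  # 0.04
print(cfg.data.test.type)  # 'VOTDataset'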
1 change: 1 addition & 0 deletions mmtrack/core/__init__.py
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .anchor import * # noqa: F401, F403
from .bbox import * # noqa: F401, F403
from .evaluation import * # noqa: F401, F403
from .motion import * # noqa: F401, F403
from .optimizer import * # noqa: F401, F403
4 changes: 4 additions & 0 deletions mmtrack/core/bbox/__init__.py
@@ -0,0 +1,4 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .transforms import bbox_cxcywh_to_x1y1wh, bbox_xyxy_to_x1y1wh, quad2bbox

__all__ = ['quad2bbox', 'bbox_cxcywh_to_x1y1wh', 'bbox_xyxy_to_x1y1wh']
66 changes: 66 additions & 0 deletions mmtrack/core/bbox/transforms.py
@@ -0,0 +1,66 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmdet.core.bbox.transforms import bbox_xyxy_to_cxcywh


def quad2bbox(quad):
"""Convert quadrilateral to axis aligned box in [cx, cy, w, h] format.

Args:
quad (Tensor): of shape (N, 8), (8, ), (N, 4) or (4, ). The
coordinates are in [x1, y1, x2, y2, x3, y3, x4, y4] or
[tl_x, tl_y, br_x, br_y] format.
Returns:
Tensor: in [cx, cy, w, h] format.
"""
if len(quad.shape) == 1:
quad = quad.unsqueeze(0)
length = quad.shape[1]
if length == 8:
cx = torch.mean(quad[:, 0::2], dim=-1)
cy = torch.mean(quad[:, 1::2], dim=-1)
x1 = torch.min(quad[:, 0::2], dim=-1)[0]
x2 = torch.max(quad[:, 0::2], dim=-1)[0]
y1 = torch.min(quad[:, 1::2], dim=-1)[0]
y2 = torch.max(quad[:, 1::2], dim=-1)[0]
area1 = torch.norm(quad[:, 0:2] - quad[:, 2:4], dim=1) * \
torch.norm(quad[:, 2:4] - quad[:, 4:6], dim=1)
area2 = (x2 - x1) * (y2 - y1)
scale_factor = torch.sqrt(area1 / area2)
w = scale_factor * (x2 - x1)
h = scale_factor * (y2 - y1)
bbox = torch.stack((cx, cy, w, h), dim=-1).squeeze(0)
elif length == 4:
bbox = bbox_xyxy_to_cxcywh(quad).squeeze(0)
    else:
        raise NotImplementedError(
            f'The length of quadrilateral: {length} is not supported')
return bbox
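
A quick sanity check (illustrative, not part of this diff): an axis-aligned square passes through unchanged because area1 equals area2 and the scale factor is 1:

import torch

quad = torch.tensor([0., 0., 2., 0., 2., 2., 0., 2.])
print(quad2bbox(quad))  # tensor([1., 1., 2., 2.]) in (cx, cy, w, h)
# For a rotated rectangle, area1 < area2, so the enclosing box is shrunk
# by sqrt(area1 / area2) to better approximate the target extent.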


def bbox_cxcywh_to_x1y1wh(bbox):
"""Convert bbox coordinates from (cx, cy, w, h) to (x1, y1, w, h).

Args:
bbox (Tensor): Shape (n, 4) or (4, ) for bboxes.

Returns:
Tensor: Converted bboxes.
"""
cx, cy, w, h = bbox.split((1, 1, 1, 1), dim=-1)
bbox_new = [(cx - 0.5 * w), (cy - 0.5 * h), w, h]
return torch.cat(bbox_new, dim=-1)


def bbox_xyxy_to_x1y1wh(bbox):
"""Convert bbox coordinates from (x1, y1, x2, y2) to (x1, y1, w, h).

Args:
bbox (Tensor): Shape (n, 4) or (4, ) for bboxes.

Returns:
Tensor: Converted bboxes.
"""
x1, y1, x2, y2 = bbox.split((1, 1, 1, 1), dim=-1)
bbox_new = [x1, y1, (x2 - x1), (y2 - y1)]
return torch.cat(bbox_new, dim=-1)
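
A small illustrative check of the two converters (not part of this diff); both target the (x1, y1, w, h) layout expected by the VOT evaluation code:

import torch

print(bbox_xyxy_to_x1y1wh(torch.tensor([10., 20., 50., 80.])))
# tensor([10., 20., 40., 60.])
print(bbox_cxcywh_to_x1y1wh(torch.tensor([30., 50., 40., 60.])))
# tensor([10., 20., 40., 60.]), i.e. the same box in both input formats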
7 changes: 6 additions & 1 deletion mmtrack/core/evaluation/__init__.py
@@ -2,5 +2,10 @@
from .eval_hooks import DistEvalHook, EvalHook
from .eval_mot import eval_mot
from .eval_sot_ope import eval_sot_ope
from .eval_sot_vot import (bbox2region, eval_sot_accuracy_robustness,
eval_sot_eao)

__all__ = ['EvalHook', 'DistEvalHook', 'eval_mot', 'eval_sot_ope']
__all__ = [
'EvalHook', 'DistEvalHook', 'eval_mot', 'eval_sot_ope', 'bbox2region',
'eval_sot_eao', 'eval_sot_accuracy_robustness'
]
281 changes: 281 additions & 0 deletions mmtrack/core/evaluation/eval_sot_vot.py
@@ -0,0 +1,281 @@
import numpy as np
from vot.analysis import is_special
from vot.region import Polygon, Rectangle, Special
from vot.region import calculate_overlaps as calculate_region_overlaps


def bbox2region(bbox):
"""Convert bbox to Rectangle or Polygon Class object.

Args:
bbox (ndarray): the format of rectangle bbox is (x1, y1, w, h);
the format of polygon is (x1, y1, x2, y2, ...).

Returns:
Rectangle or Polygon Class object.
"""
if len(bbox) == 1:
return Special(bbox[0])
elif len(bbox) == 4:
return Rectangle(bbox[0], bbox[1], bbox[2], bbox[3])
elif len(bbox) % 2 == 0 and len(bbox) > 4:
return Polygon([(x_, y_) for x_, y_ in zip(bbox[::2], bbox[1::2])])
else:
raise NotImplementedError(
f'The length of bbox is {len(bbox)}, which is not supported')
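
The dispatch can be exercised directly (illustrative; requires the vot-toolkit package providing vot.region, imported at the top of this file):

import numpy as np

assert isinstance(bbox2region(np.array([1.])), Special)  # special state
assert isinstance(bbox2region(np.array([10., 20., 30., 40.])), Rectangle)
assert isinstance(
    bbox2region(np.array([0., 0., 2., 0., 2., 2., 0., 2.])), Polygon)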


def trajectory2region(trajectory):
"""Convert bbox trajectory to Rectangle or Polygon Class object trajectory.

Args:
trajectory (list[ndarray]): The outer list contains bbox of
each frame in a video. The bbox is a ndarray.

Returns:
List: contains the Region Class object of each frame in a
trajectory.
"""
traj_region = []
for bbox in trajectory:
traj_region.append(bbox2region(bbox))
return traj_region


def locate_failures_inits(trajectory):
"""locate the failure frame and initialized frame in a trajectory.

Args:
trajectory (list[ndarray]): list of tracking results.

Returns:
        fail_inds (list): indices of the failed frames in a trajectory.
        init_inds (list): indices of the initialization frames in a
            trajectory.
"""
fail_inds = []
init_inds = []
for i, bbox in enumerate(trajectory):
if len(bbox) == 1:
if bbox[0] == 1.:
init_inds.append(i)
elif bbox[0] == 2.:
fail_inds.append(i)
return fail_inds, init_inds


def count_failures(trajectory):
"""count the number of failed frame in a trajectory.

Args:
trajectory (list[ndarray]): list of tracking results.

Returns:
        int: the number of failed frames in a trajectory.
"""
num_fails = 0
for bbox in trajectory:
if len(bbox) == 1 and bbox[0] == 2.:
num_fails += 1
return num_fails
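
An illustrative trajectory (not part of this diff) showing the special-state encoding shared by both helpers above:

import numpy as np

traj = [
    np.array([1.]),                # frame 0: initialization
    np.array([10., 10., 5., 5.]),  # frame 1: normal bbox in (x1, y1, w, h)
    np.array([2.]),                # frame 2: tracking failure
    np.array([0.]),                # frame 3: unknown (skipped after failure)
    np.array([1.]),                # frame 4: re-initialization
]
print(locate_failures_inits(traj))  # ([2], [0, 4])
print(count_failures(traj))  # 1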


def calc_accuracy(gt_trajectory,
pred_trajectory,
burnin=10,
ignore_unknown=True,
video_wh=None):
"""Calculate accuracy over the sequence.

Args:
gt_trajectory (list[list]): list of bboxes
pred_trajectory (list[ndarray]): The outer list contains the
tracking results of each frame in one video. The ndarray has two
cases:
- bbox: denotes the normal tracking box in [x1, y1, w, h]
format.
- special tracking state: [0] denotes the unknown state,
namely the skipping frame after failure, [1] denotes the
initialized state, and [2] denotes the failed state.
        burnin (int): number of frames that have to be ignored after
            re-initialization when calculating accuracy. Default is 10.
        ignore_unknown (bool): whether to ignore the skipped frames after
            failures when calculating accuracy. Default is True.
        video_wh (tuple): bounding region as (width, height). Default is None.

    Returns:
        float: accuracy over the sequence.
"""
pred_traj_region = trajectory2region(pred_trajectory)
gt_traj_region = trajectory2region(gt_trajectory)
overlaps = np.array(
calculate_region_overlaps(pred_traj_region, gt_traj_region, video_wh))
mask = np.ones(len(overlaps), dtype=bool)

for i, region in enumerate(pred_traj_region):
if is_special(region, Special.UNKNOWN) and ignore_unknown:
mask[i] = False
elif is_special(region, Special.INITIALIZATION):
for j in range(i, min(len(pred_traj_region), i + burnin)):
mask[j] = False
elif is_special(region, Special.FAILURE):
mask[i] = False
return np.mean(overlaps[mask]) if any(mask) else 0.
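
A sketch of the masking on a 5-frame video (illustrative; the exact overlaps depend on the vot-toolkit overlap implementation): with burnin=2, the initialization at frame 0 masks out frames 0-1 and the failure frame is masked out too, so only frames 2-3 contribute:

import numpy as np

gt = [np.array([0., 0., 10., 10.])] * 5
pred = [
    np.array([1.]),                # init; frames 0-1 masked by burnin
    np.array([0., 0., 10., 10.]),  # masked by burnin
    np.array([0., 0., 10., 10.]),  # IoU 1.0
    np.array([5., 5., 10., 10.]),  # IoU 25 / 175
    np.array([2.]),                # failure, masked
]
print(calc_accuracy(gt, pred, burnin=2, video_wh=(100, 100)))
# ~ (1.0 + 25 / 175) / 2 ~ 0.57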


def eval_sot_accuracy_robustness(results,
annotations,
burnin=10,
ignore_unknown=True,
videos_wh=None):
"""Calculate accuracy and robustness over all tracking sequences.

Args:
results (list[list[ndarray]]): The first list contains the
tracking results of each video. The second list contains the
            tracking results of each frame in one video. The ndarray has two
cases:
- bbox: denotes the normal tracking box in [x1, y1, w, h]
format.
- special tracking state: [0] denotes the unknown state,
namely the skipping frame after failure, [1] denotes the
initialized state, and [2] denotes the failed state.
annotations (list[list[dict]]): The first list contains the
gt_bboxes of each video. The second list contains the
gt_bbox of each frame in one video. The dict contains the
annotation information of one frame.
        burnin (int): number of frames that have to be ignored after
            re-initialization when calculating accuracy. Default is 10.
        ignore_unknown (bool): whether to ignore the skipped frames after
            failures when calculating accuracy. Default is True.
videos_wh (list[tuple(width, height), ...]): The list contains the
width and height of each video. Default is None.

    Returns:
        dict[str, float]: accuracy and robustness in EAO evaluation metric.
"""
accuracy = 0
num_fails = 0
weight = 0
for i, (gt_traj, pred_traj) in enumerate(zip(annotations, results)):
gt_traj = np.stack([ann['bboxes'] for ann in gt_traj])
assert len(gt_traj) == len(pred_traj)
assert len(pred_traj[0]) == 1 and pred_traj[0][0] == 1
num_fails += count_failures(pred_traj)
accuracy += calc_accuracy(
gt_traj,
pred_traj,
burnin=burnin,
ignore_unknown=ignore_unknown,
video_wh=videos_wh[i]) * len(pred_traj)
weight += len(pred_traj)

accuracy /= weight
robustness = num_fails / weight * 100
return dict(accuracy=accuracy, robustness=robustness, num_fails=num_fails)
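
A worked example of the weighting (illustrative numbers): two videos of 100 and 300 frames with per-video accuracies 0.6 and 0.5 and 2 failures in total give accuracy = (0.6 * 100 + 0.5 * 300) / 400 = 0.525 and robustness = 2 / 400 * 100 = 0.5, i.e. robustness is reported as the number of failures per 100 frames.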


def calc_eao_curve(overlaps, successes):
"""Calculate EAO curve over all tracking sequences.

Args:
overlaps (list[list]): The outer list contains the overlaps of each
video. The inner list contains the overlap of each frame in one
video.
        successes (list): the tracking state of the last frame in each
            fragment, i.e. False if the fragment ended with a failure and
            True if the tracker reached the end of the sequence.

    Returns:
        ndarray: the N-th element denotes the average overlap from frame 1
            to N over all fragments.
"""
    max_length = max(len(overlap) for overlap in overlaps)
total_runs = len(overlaps)

overlaps_array = np.zeros((total_runs, max_length), dtype=np.float32)
    # mask out frames which are not considered in EAO calculation. Initial
    # values are zero, meaning ignored.
mask = np.zeros((total_runs, max_length), dtype=np.float32)
for i, (overlap, success) in enumerate(zip(overlaps, successes)):
overlaps_array[i, :len(overlap)] = np.array(overlap)
if not success:
# tracker has failed during this sequence - consider all of
# 'overlaps_array' and use the default padding from the end of
# sequence to max length.
mask[i, :] = 1
else:
# tracker has successfully tracked to the end - consider only this
# part of the true sequence, and ignore the padding from the end of
# sequence to max length.
mask[i, :len(overlap)] = 1

overlaps_array_sum = overlaps_array.copy()
    # overlaps_array_sum[i, j] is the mean overlap of frames 1 to j in the
    # i-th fragment; frame 0 (the initialization frame) is skipped.
for j in range(1, overlaps_array_sum.shape[1]):
overlaps_array_sum[:, j] = np.mean(overlaps_array[:, 1:j + 1], axis=1)

return np.sum(overlaps_array_sum * mask, axis=0) / np.sum(mask, axis=0)
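
A toy check of the curve (pure numpy, not part of this diff): one fragment that failed after three frames and one that reached the end of its sequence. The failed fragment is zero-padded to the maximum length and those zeros count towards its running mean:

curve = calc_eao_curve(
    [[1.0, 0.8, 0.6], [1.0, 0.9, 0.7, 0.5]], [False, True])
print(curve)
# ~ [1.0, 0.85, 0.75, 0.58]; index 0 holds the raw overlap of the
# initialization frame, which is excluded from the running means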


def eval_sot_eao(results, annotations, interval=[100, 356], videos_wh=None):
"""Calculate EAO socre over all tracking sequences.

Args:
results (list[list[ndarray]]): The first list contains the
tracking results of each video. The second list contains the
            tracking results of each frame in one video. The ndarray has two
cases:
- bbox: denotes the normal tracking box in [x1, y1, w, h]
format.
- special tracking state: [0] denotes the unknown state,
namely the skipping frame after failure, [1] denotes the
initialized state, and [2] denotes the failed state.
annotations (list[list[dict]]): The first list contains the
gt_bboxes of each video. The second list contains the
gt_bbox of each frame in one video. The dict contains the
annotation information of one frame.
        interval (list[int]): the specified interval of the EAO curve used to
            calculate the EAO score. Different VOT challenges use different
            settings. Default is the VOT2018 setting: [100, 356].
videos_wh (list[tuple(width, height), ...]): The list contains the
width and height of each video. Default is None.

Return:
dict[str, float]: EAO score in EAO evaluation metric.
"""
if videos_wh is None:
videos_wh = [None] * len(annotations)

all_overlaps = []
all_successes = []

for i, (gt_traj, pred_traj) in enumerate(zip(annotations, results)):
gt_traj = np.stack([ann['bboxes'] for ann in gt_traj])
assert len(gt_traj) == len(pred_traj)
# initialized bbox annotation is [1]
assert len(pred_traj[0]) == 1 and pred_traj[0][0] == 1
fail_inds, init_inds = locate_failures_inits(pred_traj)

pred_traj = trajectory2region(pred_traj)
gt_traj = trajectory2region(gt_traj)
overlaps = calculate_region_overlaps(pred_traj, gt_traj, videos_wh[i])

        if len(fail_inds) > 0:
            # use a separate index so the outer loop variable `i`, which
            # still indexes `videos_wh`, is not shadowed
            for k in range(len(fail_inds)):
                all_overlaps.append(overlaps[init_inds[k]:fail_inds[k]])
                all_successes.append(False)

# handle last initialization
if len(init_inds) > len(fail_inds):
# tracker was initialized, but it has not failed until the end
# of the sequence
all_overlaps.append(overlaps[init_inds[-1]:])
all_successes.append(True)
else:
all_overlaps.append(overlaps)
all_successes.append(True)

eao_curve = calc_eao_curve(all_overlaps, all_successes)
eao_score = np.mean(eao_curve[interval[0]:interval[1] + 1])
eao = dict(eao=eao_score)
return eao
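
End to end, the metric can be smoke-tested on a synthetic one-video case (a sketch; requires vot-toolkit, and the toy interval replaces the VOT2018 default of [100, 356]):

import numpy as np

annotations = [[dict(bboxes=np.array([0., 0., 10., 10.]))] * 3]
results = [[
    np.array([1.]),  # the first frame must be the initialization state
    np.array([0., 0., 10., 10.]),
    np.array([0., 0., 10., 10.]),
]]
print(eval_sot_eao(results, annotations,
                   interval=[1, 2], videos_wh=[(50, 50)]))
# ~ {'eao': 1.0}, since the tracker matches the gt on all scored frames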