[Enhancement] Refactor VOT, UAV123, OTB100 and LaSOT dataset class based on BaseSOTDataset #401

Merged: 18 commits, Jan 29, 2022
12 changes: 4 additions & 8 deletions configs/sot/siamese_rpn/siamese_rpn_r50_1x_lasot.py
@@ -71,7 +71,7 @@
data_root = 'data/'
train_pipeline = [
dict(type='LoadMultiImagesFromFile', to_float32=True),
dict(type='SeqLoadAnnotations', with_bbox=True),
dict(type='SeqLoadAnnotations', with_bbox=True, with_label=False),
dict(
type='SeqCropLikeSiamFC',
context_amount=0.5,
@@ -90,7 +90,7 @@
]
test_pipeline = [
dict(type='LoadImageFromFile', to_float32=True),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='LoadAnnotations', with_bbox=True, with_label=False),
dict(
type='MultiScaleFlipAug',
scale_factor=1,
@@ -146,19 +146,15 @@
],
val=dict(
type='LaSOTDataset',
test_load_ann=True,
ann_file=data_root + 'lasot/annotations/lasot_test.json',
img_prefix=data_root + 'lasot/LaSOTBenchmark',
pipeline=test_pipeline,
ref_img_sampler=None,
split='test',
test_mode=True),
test=dict(
type='LaSOTDataset',
test_load_ann=True,
ann_file=data_root + 'lasot/annotations/lasot_test.json',
img_prefix=data_root + 'lasot/LaSOTBenchmark',
pipeline=test_pipeline,
ref_img_sampler=None,
split='test',
test_mode=True))
# optimizer
optimizer = dict(
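As a sanity check of the refactored configs, here is a minimal sketch of building the new-style LaSOT test dataset directly. It assumes mmtrack's standard `build_dataset` entry point; the `ann_file` path and the empty pipeline are placeholders, not values taken from this PR:

from mmtrack.datasets import build_dataset

# Placeholder ann_file and pipeline; real configs pass test_pipeline and
# the converter-generated annotation file.
lasot_test = build_dataset(
    dict(
        type='LaSOTDataset',
        ann_file='tools/convert_datasets/lasot/testing_set.txt',  # hypothetical
        img_prefix='data/lasot/LaSOTBenchmark',
        pipeline=[],
        ref_img_sampler=None,
        split='test',
        test_mode=True))
print(len(lasot_test))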
6 changes: 3 additions & 3 deletions configs/sot/siamese_rpn/siamese_rpn_r50_1x_otb100.py
@@ -11,7 +11,7 @@
data_root = 'data/'
train_pipeline = [
dict(type='LoadMultiImagesFromFile', to_float32=True),
dict(type='SeqLoadAnnotations', with_bbox=True),
dict(type='SeqLoadAnnotations', with_bbox=True, with_label=False),
dict(
type='SeqCropLikeSiamFC',
context_amount=0.5,
@@ -74,9 +74,9 @@
],
val=dict(
type='OTB100Dataset',
ann_file=data_root + 'otb100/annotations/otb100.json',
ann_file='tools/convert_datasets/otb100/otb100_infos.txt',
img_prefix=data_root + 'otb100/data'),
test=dict(
type='OTB100Dataset',
ann_file=data_root + 'otb100/annotations/otb100.json',
ann_file='tools/convert_datasets/otb100/otb100_infos.txt',
img_prefix=data_root + 'otb100/data'))
8 changes: 4 additions & 4 deletions configs/sot/siamese_rpn/siamese_rpn_r50_1x_uav123.py
@@ -9,9 +9,9 @@
data = dict(
val=dict(
type='UAV123Dataset',
ann_file=data_root + 'UAV123/annotations/uav123.json',
img_prefix=data_root + 'UAV123/data_seq/UAV123'),
ann_file='tools/convert_datasets/uav123/uav123_infos.txt',
img_prefix=data_root + 'UAV123'),
test=dict(
type='UAV123Dataset',
ann_file=data_root + 'UAV123/annotations/uav123.json',
img_prefix=data_root + 'UAV123/data_seq/UAV123'))
ann_file='tools/convert_datasets/uav123/uav123_infos.txt',
img_prefix=data_root + 'UAV123'))
4 changes: 2 additions & 2 deletions configs/sot/siamese_rpn/siamese_rpn_r50_1x_vot2018.py
@@ -11,11 +11,11 @@
data = dict(
val=dict(
type='VOTDataset',
ann_file=data_root + 'vot2018/annotations/vot2018.json',
challenge_year=2018,
img_prefix=data_root + 'vot2018/data'),
test=dict(
type='VOTDataset',
ann_file=data_root + 'vot2018/annotations/vot2018.json',
challenge_year=2018,
img_prefix=data_root + 'vot2018/data'))
evaluation = dict(
metric=['track'], interval=1, start=10, rule='greater', save_best='eao')
19 changes: 10 additions & 9 deletions mmtrack/core/evaluation/eval_sot_ope.py
@@ -63,10 +63,10 @@ def eval_sot_ope(results, annotations):
results of each video. The second list contains the tracking
results of each frame in one video. The ndarray denotes the
tracking box in [tl_x, tl_y, br_x, br_y] format.
annotations (list[list[dict]]): The first list contains the annotations
of each video. The second list contains the annotations of each
frame in one video. The dict contains the annotation information
of one frame.
annotations (list[dict]): The list contains the annotations
of each video. Each dict holds the annotation information
of one video, in the format {'bboxes': ndarray in (N, 4) shape,
'visible': ndarray, ...}. The bboxes are in (x1, y1, x2, y2) format.

Returns:
dict[str, float]: OPE style evaluation metric (i.e. success,
@@ -76,14 +76,15 @@ def eval_sot_ope(results, annotations):
precision_results = []
norm_precision_results = []
for single_video_results, single_video_anns in zip(results, annotations):
gt_bboxes = np.stack([ann['bboxes'] for ann in single_video_anns])
gt_bboxes = single_video_anns['bboxes']
pred_bboxes = np.stack(single_video_results)
assert len(pred_bboxes) == len(gt_bboxes)
video_length = len(single_video_results)

if 'ignore' in single_video_anns[0]:
gt_ignore = np.stack([ann['ignore'] for ann in single_video_anns])
gt_bboxes = gt_bboxes[gt_ignore == 0]
pred_bboxes = pred_bboxes[gt_ignore == 0]
if 'visible' in single_video_anns:
gt_valid = single_video_anns['visible']
gt_bboxes = gt_bboxes[gt_valid]
pred_bboxes = pred_bboxes[gt_valid]

# eval success based on iou
iou_th = np.arange(0, 1.05, 0.05)
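To make the new per-video annotation format concrete, here is a toy example under the format stated in the docstring (all values invented; `annotations` and `results` mirror the argument names above):

import numpy as np

# One video, two frames; 'visible' masks out frames that are fully
# occluded or out of view, matching the new filtering logic above.
annotations = [dict(
    bboxes=np.array([[10., 10., 50., 50.],
                     [12., 12., 52., 52.]]),
    visible=np.array([True, False]))]
results = [[np.array([11., 11., 51., 51.]),
            np.array([0., 0., 1., 1.])]]

gt_bboxes = annotations[0]['bboxes']
pred_bboxes = np.stack(results[0])
gt_valid = annotations[0]['visible']
print(gt_bboxes[gt_valid], pred_bboxes[gt_valid])  # only frame 0 is kept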
12 changes: 6 additions & 6 deletions mmtrack/core/evaluation/eval_sot_vot.py
@@ -152,7 +152,7 @@ def eval_sot_accuracy_robustness(results,
- special tracking state: [0] denotes the unknown state,
namely the skipping frame after failure, [1] denotes the
initialized state, and [2] denotes the failed state.
annotations (list[list[dict]]): The first list contains the
annotations (list[list[ndarray]]): The first list contains the
gt_bboxes of each video. The second list contains the
gt_bbox of each frame in one video. The ndarray denotes the
gt_bbox of one frame.
@@ -176,7 +176,6 @@
num_fails = 0
weight = 0
for i, (gt_traj, pred_traj) in enumerate(zip(annotations, results)):
gt_traj = np.stack([ann['bboxes'] for ann in gt_traj])
assert len(gt_traj) == len(pred_traj)
assert len(pred_traj[0]) == 1 and pred_traj[0][0] == 1
num_fails += count_failures(pred_traj)
@@ -249,7 +248,7 @@ def eval_sot_eao(results, annotations, interval=[100, 356], videos_wh=None):
- special tracking state: [0] denotes the unknown state,
namely the skipping frame after failure, [1] denotes the
initialized state, and [2] denotes the failed state.
annotations (list[list[dict]]): The first list contains the
annotations (list[list[ndarray]]): The first list contains the
gt_bboxes of each video. The second list contains the
gt_bbox of each frame in one video. The ndarray denotes the
gt_bbox of one frame.
@@ -275,10 +274,11 @@ def eval_sot_eao(results, annotations, interval=[100, 356], videos_wh=None):
all_successes = []

for i, (gt_traj, pred_traj) in enumerate(zip(annotations, results)):
gt_traj = np.stack([ann['bboxes'] for ann in gt_traj])
assert len(gt_traj) == len(pred_traj)
assert len(gt_traj) == len(
pred_traj), f'{len(gt_traj)} == {len(pred_traj)}'
# initialized bbox annotation is [1]
assert len(pred_traj[0]) == 1 and pred_traj[0][0] == 1
assert len(pred_traj[0]) == 1 and pred_traj[0][
0] == 1, f'{len(pred_traj[0])} == 1 and {pred_traj[0][0]} == 1'
fail_inds, init_inds = locate_failures_inits(pred_traj)

pred_traj = trajectory2region(pred_traj)
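For readers unfamiliar with the VOT protocol, here is an illustration of the special tracking states described in the docstrings above (values invented). It is consistent with the assertion that the first frame must be the length-1 list [1]:

# Per the docstring: a length-1 per-frame result encodes a special state,
# anything else is a predicted region.
pred_traj = [
    [1],                       # frame 0: tracker initialized
    [15.0, 20.0, 40.0, 60.0],  # frame 1: a normal tracking bbox
    [2],                       # frame 2: tracking failure detected
    [0],                       # frame 3: unknown state, skipped after failure
    [1],                       # frame 4: tracker re-initialized
]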
18 changes: 10 additions & 8 deletions mmtrack/datasets/base_sot_dataset.py
@@ -117,9 +117,9 @@ def get_bboxes_from_video(self, video_ind):
start_frame_id = self.data_infos[video_ind]['start_frame_id']

if not self.test_mode:
assert len(bboxes) == (end_frame_id - start_frame_id +
1), f'{len(bboxes)} is not equal to'
'{end_frame_id}-{start_frame_id}+1'
assert len(bboxes) == (
end_frame_id - start_frame_id + 1
), f'{len(bboxes)} is not equal to {end_frame_id}-{start_frame_id}+1' # noqa
return bboxes

def get_len_per_video(self, video_ind):
@@ -263,21 +263,23 @@ def evaluate(self, results, metric=['track'], logger=None):
# get all test annotations
annotations = []
for video_ind in range(len(self.data_infos)):
bboxes = self.get_ann_infos_from_video(video_ind)['bboxes']
annotations.append(bboxes)
video_anns = self.get_ann_infos_from_video(video_ind)
annotations.append(video_anns)

# tracking_bboxes converting code
eval_results = dict()
if 'track' in metrics:
assert len(self) == len(results['track_bboxes'])
assert len(self) == len(
results['track_bboxes']
), f"{len(self)} == {len(results['track_bboxes'])}"
print_log('Evaluate OPE Benchmark...', logger=logger)
track_bboxes = []
start_ind = end_ind = 0
for num in self.num_frames_per_video:
end_ind += num
track_bboxes.append(
list(
map(lambda x: x[:4],
map(lambda x: x[:-1],
results['track_bboxes'][start_ind:end_ind])))
start_ind += num

@@ -289,4 +291,4 @@ def evaluate(self, results, metric=['track'], logger=None):
for k, v in eval_results.items():
if isinstance(v, float):
eval_results[k] = float(f'{(v):.3f}')
print_log(eval_results, logger=logger)
return eval_results
100 changes: 81 additions & 19 deletions mmtrack/datasets/lasot_dataset.py
@@ -1,34 +1,96 @@
# Copyright (c) OpenMMLab. All rights reserved.
import glob
import os.path as osp
import time

import numpy as np
from mmdet.datasets import DATASETS

from .sot_test_dataset import SOTTestDataset
from .base_sot_dataset import BaseSOTDataset


@DATASETS.register_module()
class LaSOTDataset(SOTTestDataset):
"""LaSOT dataset for the testing of single object tracking.
class LaSOTDataset(BaseSOTDataset):
"""LaSOT dataset of single object tracking.

The dataset doesn't support training mode.
The dataset supports both training and testing modes.
"""

def _parse_ann_info(self, img_info, ann_info):
"""Parse bbox annotations.
def __init__(self, ann_file, *args, **kwargs):
"""Initialization of SOT dataset class.

Args:
ann_file (str): The file containing testing video names. It will
be loaded in the `self.load_data_infos` function.
"""
self.ann_file = ann_file
super(LaSOTDataset, self).__init__(*args, **kwargs)

def load_data_infos(self, split='test'):
"""Load dataset information.

Args:
img_info (dict): image information.
ann_info (list[dict]): Annotation information of an image. Each
image only has one bbox annotation.
split (str, optional): Dataset split. Defaults to 'test'.

Returns:
dict: A dict containing the following keys: bboxes, labels,
ignore. labels are not useful in SOT.
list[dict]: The length of the list is the number of videos. The
inner dict is in the following format:
{
'video_path': the video path
'ann_path': the annotation path
'start_frame_id': the starting frame number contained
in the image name
'end_frame_id': the ending frame number contained in
the image name
'framename_template': the template of image name
}
"""
gt_bboxes = np.array(ann_info[0]['bbox'], dtype=np.float32)
# convert [x1, y1, w, h] to [x1, y1, x2, y2]
gt_bboxes[2] += gt_bboxes[0]
gt_bboxes[3] += gt_bboxes[1]
gt_labels = np.array(self.cat2label[ann_info[0]['category_id']])
ignore = ann_info[0]['full_occlusion'] or ann_info[0]['out_of_view']
ann = dict(bboxes=gt_bboxes, labels=gt_labels, ignore=ignore)
return ann
print('Loading LaSOT dataset...')
start_time = time.time()
assert split in ['train', 'test']
data_infos = []

test_videos_list = np.loadtxt(self.ann_file, dtype=np.str_)
if self.test_mode:
videos_list = test_videos_list.tolist()
else:
# match video directories like 'airplane-1' ... 'airplane-20';
# the character class '[1-20]' would only match a single 0, 1 or 2
all_videos_list = glob.glob(self.img_prefix + '/*/*-[0-9]*')
test_videos = set(test_videos_list)
videos_list = []
for x in all_videos_list:
x = osp.basename(x)
if x not in test_videos:
videos_list.append(x)

videos_list = sorted(videos_list)
for video_name in videos_list:
video_name = osp.join(video_name.split('-')[0], video_name)
video_path = osp.join(video_name, 'img')
ann_path = osp.join(video_name, 'groundtruth.txt')
img_names = glob.glob(
osp.join(self.img_prefix, video_name, 'img', '*.jpg'))
end_frame_name = max(
img_names, key=lambda x: int(osp.basename(x).split('.')[0]))
end_frame_id = int(osp.basename(end_frame_name).split('.')[0])
data_infos.append(
dict(
video_path=video_path,
ann_path=ann_path,
start_frame_id=1,
end_frame_id=end_frame_id,
framename_template='%08d.jpg'))
print(f'LaSOT dataset loaded! ({time.time()-start_time:.2f} s)')
return data_infos

def get_visibility_from_video(self, video_ind):
"""Get the visible information of instance in a video."""
video_path = osp.dirname(self.data_infos[video_ind]['video_path'])
full_occlusion_file = osp.join(self.img_prefix, video_path,
'full_occlusion.txt')
out_of_view_file = osp.join(self.img_prefix, video_path,
'out_of_view.txt')
full_occlusion = np.loadtxt(
full_occlusion_file, dtype=bool, delimiter=',')
out_of_view = np.loadtxt(out_of_view_file, dtype=bool, delimiter=',')
visible = ~(full_occlusion | out_of_view)
return dict(visible=visible)
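A toy check of the visibility logic above; LaSOT's `full_occlusion.txt` and `out_of_view.txt` hold comma-separated 0/1 flags, one per frame (sample values invented):

import numpy as np

full_occlusion = np.array([0, 1, 0, 0], dtype=bool)
out_of_view = np.array([0, 0, 1, 0], dtype=bool)
visible = ~(full_occlusion | out_of_view)
print(visible)  # [ True False False  True]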