diff --git a/configs/sot/siamese_rpn/siamese_rpn_r50_1x_lasot.py b/configs/sot/siamese_rpn/siamese_rpn_r50_1x_lasot.py
index 655ee52cc..edc2b723e 100644
--- a/configs/sot/siamese_rpn/siamese_rpn_r50_1x_lasot.py
+++ b/configs/sot/siamese_rpn/siamese_rpn_r50_1x_lasot.py
@@ -71,7 +71,7 @@
 data_root = 'data/'
 train_pipeline = [
     dict(type='LoadMultiImagesFromFile', to_float32=True),
-    dict(type='SeqLoadAnnotations', with_bbox=True),
+    dict(type='SeqLoadAnnotations', with_bbox=True, with_label=False),
     dict(
         type='SeqCropLikeSiamFC',
         context_amount=0.5,
@@ -146,20 +146,20 @@
     ],
     val=dict(
         type='LaSOTDataset',
-        test_load_ann=True,
-        ann_file=data_root + 'lasot/annotations/lasot_test.json',
+        ann_file='tools/convert_datasets/lasot/testing_set.txt',
         img_prefix=data_root + 'lasot/LaSOTBenchmark',
         pipeline=test_pipeline,
-        ref_img_sampler=None,
-        test_mode=True),
+        split='test',
+        test_mode=True,
+        only_eval_visible=True),
     test=dict(
         type='LaSOTDataset',
-        test_load_ann=True,
-        ann_file=data_root + 'lasot/annotations/lasot_test.json',
+        ann_file='tools/convert_datasets/lasot/testing_set.txt',
         img_prefix=data_root + 'lasot/LaSOTBenchmark',
         pipeline=test_pipeline,
-        ref_img_sampler=None,
-        test_mode=True))
+        split='test',
+        test_mode=True,
+        only_eval_visible=True))
 # optimizer
 optimizer = dict(
     type='SGD',
diff --git a/configs/sot/siamese_rpn/siamese_rpn_r50_1x_otb100.py b/configs/sot/siamese_rpn/siamese_rpn_r50_1x_otb100.py
index c41f4b81f..2deb64268 100644
--- a/configs/sot/siamese_rpn/siamese_rpn_r50_1x_otb100.py
+++ b/configs/sot/siamese_rpn/siamese_rpn_r50_1x_otb100.py
@@ -11,7 +11,7 @@
 data_root = 'data/'
 train_pipeline = [
     dict(type='LoadMultiImagesFromFile', to_float32=True),
-    dict(type='SeqLoadAnnotations', with_bbox=True),
+    dict(type='SeqLoadAnnotations', with_bbox=True, with_label=False),
     dict(
         type='SeqCropLikeSiamFC',
         context_amount=0.5,
@@ -74,9 +74,11 @@
     ],
     val=dict(
         type='OTB100Dataset',
-        ann_file=data_root + 'otb100/annotations/otb100.json',
-        img_prefix=data_root + 'otb100/data'),
+        ann_file='tools/convert_datasets/otb100/otb100_infos.txt',
+        img_prefix=data_root + 'otb100/data',
+        only_eval_visible=False),
     test=dict(
         type='OTB100Dataset',
-        ann_file=data_root + 'otb100/annotations/otb100.json',
-        img_prefix=data_root + 'otb100/data'))
+        ann_file='tools/convert_datasets/otb100/otb100_infos.txt',
+        img_prefix=data_root + 'otb100/data',
+        only_eval_visible=False))
diff --git a/configs/sot/siamese_rpn/siamese_rpn_r50_1x_trackingnet.py b/configs/sot/siamese_rpn/siamese_rpn_r50_1x_trackingnet.py
index e9b50157a..66d77b121 100644
--- a/configs/sot/siamese_rpn/siamese_rpn_r50_1x_trackingnet.py
+++ b/configs/sot/siamese_rpn/siamese_rpn_r50_1x_trackingnet.py
@@ -3,7 +3,4 @@
 data_root = 'data/'
 # dataset settings
 data = dict(
-    test=dict(
-        type='TrackingNetDataset',
-        img_prefix=data_root + 'trackingnet',
-        split='test'))
+    test=dict(type='TrackingNetDataset', img_prefix=data_root + 'trackingnet'))
diff --git a/configs/sot/siamese_rpn/siamese_rpn_r50_1x_uav123.py b/configs/sot/siamese_rpn/siamese_rpn_r50_1x_uav123.py
index a2b3857a1..c7bfed7f9 100644
--- a/configs/sot/siamese_rpn/siamese_rpn_r50_1x_uav123.py
+++ b/configs/sot/siamese_rpn/siamese_rpn_r50_1x_uav123.py
@@ -9,9 +9,11 @@
 data = dict(
     val=dict(
         type='UAV123Dataset',
-        ann_file=data_root + 'UAV123/annotations/uav123.json',
-        img_prefix=data_root + 'UAV123/data_seq/UAV123'),
+        ann_file='tools/convert_datasets/uav123/uav123_infos.txt',
+        img_prefix=data_root + 'UAV123',
+        only_eval_visible=False),
     test=dict(
         type='UAV123Dataset',
-        ann_file=data_root + 'UAV123/annotations/uav123.json',
-        img_prefix=data_root + 'UAV123/data_seq/UAV123'))
+        ann_file='tools/convert_datasets/uav123/uav123_infos.txt',
+        img_prefix=data_root + 'UAV123',
+        only_eval_visible=False))
diff --git a/configs/sot/siamese_rpn/siamese_rpn_r50_1x_vot2018.py b/configs/sot/siamese_rpn/siamese_rpn_r50_1x_vot2018.py
index 4d8768f1b..6e5989dbd 100644
--- a/configs/sot/siamese_rpn/siamese_rpn_r50_1x_vot2018.py
+++ b/configs/sot/siamese_rpn/siamese_rpn_r50_1x_vot2018.py
@@ -11,11 +11,11 @@
 data = dict(
     val=dict(
         type='VOTDataset',
-        ann_file=data_root + 'vot2018/annotations/vot2018.json',
+        dataset_type='vot2018',
         img_prefix=data_root + 'vot2018/data'),
     test=dict(
         type='VOTDataset',
-        ann_file=data_root + 'vot2018/annotations/vot2018.json',
+        dataset_type='vot2018',
         img_prefix=data_root + 'vot2018/data'))
 evaluation = dict(
     metric=['track'], interval=1, start=10, rule='greater', save_best='eao')
diff --git a/mmtrack/apis/train.py b/mmtrack/apis/train.py
index b112546cd..abb15ca42 100644
--- a/mmtrack/apis/train.py
+++ b/mmtrack/apis/train.py
@@ -108,8 +108,7 @@ def train_model(model,
             broadcast_buffers=False,
             find_unused_parameters=find_unused_parameters)
     else:
-        model = MMDataParallel(
-            model, device_ids=cfg.gpu_ids)
+        model = MMDataParallel(model, device_ids=cfg.gpu_ids)
 
     # build runner
     optimizer = build_optimizer(model, cfg.optimizer)
diff --git a/mmtrack/core/evaluation/eval_sot_ope.py b/mmtrack/core/evaluation/eval_sot_ope.py
index 2071fa070..b379daa36 100644
--- a/mmtrack/core/evaluation/eval_sot_ope.py
+++ b/mmtrack/core/evaluation/eval_sot_ope.py
@@ -55,7 +55,7 @@ def success_error(gt_bboxes_center, pred_bboxes_center, pixel_offset_th,
     return success
 
 
-def eval_sot_ope(results, annotations):
+def eval_sot_ope(results, annotations, visible_infos=None):
     """Evaluation in OPE protocol.
 
     Args:
@@ -63,10 +63,13 @@ def eval_sot_ope(results, annotations):
             results of each video. The second list contains the tracking
             results of each frame in one video. The ndarray denotes the
             tracking box in [tl_x, tl_y, br_x, br_y] format.
-        annotations (list[list[dict]]): The first list contains the annotations
-            of each video. The second list contains the annotations of each
-            frame in one video. The dict contains the annotation information
-            of one frame.
+        annotations (list[ndarray]): The list contains the bbox
+            annotations of each video. The ndarray is gt_bboxes of one video.
+            It's in (N, 4) shape. Each bbox is in (x1, y1, x2, y2) format.
+        visible_infos (list[ndarray] | None): If not None, the list
+            contains the visible information of each video. The ndarray is
+            visibility (with bool type) of object in one video. It's in (N,)
+            shape. Default to None.
 
     Returns:
         dict[str, float]: OPE style evaluation metric (i.e. success,
@@ -75,15 +78,16 @@ def eval_sot_ope(results, annotations):
     success_results = []
     precision_results = []
     norm_precision_results = []
-    for single_video_results, single_video_anns in zip(results, annotations):
-        gt_bboxes = np.stack([ann['bboxes'] for ann in single_video_anns])
+    if visible_infos is None:
+        visible_infos = [np.array([True] * len(_)) for _ in annotations]
+    for single_video_results, single_video_gt_bboxes, single_video_visible in zip(  # noqa
+            results, annotations, visible_infos):
         pred_bboxes = np.stack(single_video_results)
+        assert len(pred_bboxes) == len(single_video_gt_bboxes)
         video_length = len(single_video_results)
 
-        if 'ignore' in single_video_anns[0]:
-            gt_ignore = np.stack([ann['ignore'] for ann in single_video_anns])
-            gt_bboxes = gt_bboxes[gt_ignore == 0]
-            pred_bboxes = pred_bboxes[gt_ignore == 0]
+        gt_bboxes = single_video_gt_bboxes[single_video_visible]
+        pred_bboxes = pred_bboxes[single_video_visible]
 
         # eval success based on iou
         iou_th = np.arange(0, 1.05, 0.05)
diff --git a/mmtrack/core/evaluation/eval_sot_vot.py b/mmtrack/core/evaluation/eval_sot_vot.py
index c89a76b13..850e8e9a1 100644
--- a/mmtrack/core/evaluation/eval_sot_vot.py
+++ b/mmtrack/core/evaluation/eval_sot_vot.py
@@ -152,10 +152,9 @@ def eval_sot_accuracy_robustness(results,
                 - special tracking state: [0] denotes the unknown state,
                     namely the skipping frame after failure, [1] denotes the
                     initialized state, and [2] denotes the failed state.
-        annotations (list[list[dict]]): The first list contains the
-            gt_bboxes of each video. The second list contains the
-            gt_bbox of each frame in one video. The dict contains the
-            annotation information of one frame.
+        annotations (list[ndarray]): The list contains the gt_bboxes of each
+            video. The ndarray is gt_bboxes of one video. It's in (N, 4) shape.
+            Each bbox is in (x1, y1, w, h) format.
         burnin: number of frames that have to be ignored after the
             re-initialization when calculating accuracy. Default is 10.
         ignore_unknown (bool): whether ignore the skipping frames after
@@ -176,7 +175,6 @@ def eval_sot_accuracy_robustness(results,
     num_fails = 0
     weight = 0
     for i, (gt_traj, pred_traj) in enumerate(zip(annotations, results)):
-        gt_traj = np.stack([ann['bboxes'] for ann in gt_traj])
         assert len(gt_traj) == len(pred_traj)
         assert len(pred_traj[0]) == 1 and pred_traj[0][0] == 1
         num_fails += count_failures(pred_traj)
@@ -249,10 +247,9 @@ def eval_sot_eao(results, annotations, interval=[100, 356], videos_wh=None):
                 - special tracking state: [0] denotes the unknown state,
                     namely the skipping frame after failure, [1] denotes the
                     initialized state, and [2] denotes the failed state.
-        annotations (list[list[dict]]): The first list contains the
-            gt_bboxes of each video. The second list contains the
-            gt_bbox of each frame in one video. The dict contains the
-            annotation information of one frame.
+        annotations (list[ndarray]): The list contains the gt_bboxes of each
+            video. The ndarray is gt_bboxes of one video. It's in (N, 4) shape.
+            Each bbox is in (x1, y1, w, h) format.
         interval: an specified interval in EAO curve used to calculate the EAO
             score. There are different settings in different VOT challenge.
             Default is VOT2018 setting: [100, 356].
@@ -275,10 +272,11 @@ def eval_sot_eao(results, annotations, interval=[100, 356], videos_wh=None):
     all_successes = []
 
     for i, (gt_traj, pred_traj) in enumerate(zip(annotations, results)):
-        gt_traj = np.stack([ann['bboxes'] for ann in gt_traj])
-        assert len(gt_traj) == len(pred_traj)
+        assert len(gt_traj) == len(
+            pred_traj), f'{len(gt_traj)} == {len(pred_traj)}'
         # initialized bbox annotation is [1]
-        assert len(pred_traj[0]) == 1 and pred_traj[0][0] == 1
+        assert len(pred_traj[0]) == 1 and pred_traj[0][
+            0] == 1, f'{len(pred_traj[0])} == 1 and {pred_traj[0][0]} == 1'
         fail_inds, init_inds = locate_failures_inits(pred_traj)
 
         pred_traj = trajectory2region(pred_traj)
diff --git a/mmtrack/datasets/base_sot_dataset.py b/mmtrack/datasets/base_sot_dataset.py
index 1bec84fcf..1970a3ff8 100644
--- a/mmtrack/datasets/base_sot_dataset.py
+++ b/mmtrack/datasets/base_sot_dataset.py
@@ -23,6 +23,8 @@ class BaseSOTDataset(Dataset, metaclass=ABCMeta):
         test_mode (bool, optional): Default to False.
         bbox_min_size (int, optional): Only bounding boxes whose sizes are
             larger than `bbox_min_size` can be regarded as valid. Default to 0.
+        only_eval_visible (bool, optional): Whether to only evaluate frames
+            where the object is visible. Default to False.
     """
 
     # Compatible with MOT and VID Dataset class. The 'CLASSES' attribute will
@@ -35,12 +37,14 @@ def __init__(self,
                  split,
                  test_mode=False,
                  bbox_min_size=0,
+                 only_eval_visible=False,
                  **kwargs):
         self.img_prefix = img_prefix
         self.split = split
         self.pipeline = Compose(pipeline)
         self.test_mode = test_mode
         self.bbox_min_size = bbox_min_size
+        self.only_eval_visible = only_eval_visible
         # 'self.load_as_video' must be set to True in order to using
         # distributed video sampler to load dataset when testing.
         self.load_as_video = True
@@ -98,9 +102,9 @@ def get_bboxes_from_video(self, video_ind):
         start_frame_id = self.data_infos[video_ind]['start_frame_id']
 
         if not self.test_mode:
-            assert len(bboxes) == (end_frame_id - start_frame_id +
-                                   1), f'{len(bboxes)} is not equal to'
-            '{end_frame_id}-{start_frame_id}+1'
+            assert len(bboxes) == (
+                end_frame_id - start_frame_id + 1
+            ), f'{len(bboxes)} is not equal to {end_frame_id}-{start_frame_id}+1'  # noqa
         return bboxes
 
     def get_len_per_video(self, video_ind):
@@ -249,15 +253,19 @@ def evaluate(self, results, metric=['track'], logger=None):
                 raise KeyError(f'metric {metric} is not supported.')
 
         # get all test annotations
-        annotations = []
+        gt_bboxes = []
+        visible_infos = []
         for video_ind in range(len(self.data_infos)):
-            bboxes = self.get_ann_infos_from_video(video_ind)['bboxes']
-            annotations.append(bboxes)
+            video_anns = self.get_ann_infos_from_video(video_ind)
+            gt_bboxes.append(video_anns['bboxes'])
+            visible_infos.append(video_anns['visible'])
 
         # tracking_bboxes converting code
         eval_results = dict()
         if 'track' in metrics:
-            assert len(self) == len(results['track_bboxes'])
+            assert len(self) == len(
+                results['track_bboxes']
+            ), f"{len(self)} == {len(results['track_bboxes'])}"
             print_log('Evaluate OPE Benchmark...', logger=logger)
             track_bboxes = []
             start_ind = end_ind = 0
@@ -265,16 +273,21 @@ def evaluate(self, results, metric=['track'], logger=None):
                 end_ind += num
                 track_bboxes.append(
                     list(
-                        map(lambda x: x[:4],
+                        map(lambda x: x[:-1],
                             results['track_bboxes'][start_ind:end_ind])))
                 start_ind += num
 
+            if not self.only_eval_visible:
+                visible_infos = None
             # evaluation
             track_eval_results = eval_sot_ope(
-                results=track_bboxes, annotations=annotations)
+                results=track_bboxes,
+                annotations=gt_bboxes,
+                visible_infos=visible_infos)
             eval_results.update(track_eval_results)
 
             for k, v in eval_results.items():
                 if isinstance(v, float):
                     eval_results[k] = float(f'{(v):.3f}')
             print_log(eval_results, logger=logger)
+        return eval_results
diff --git a/mmtrack/datasets/lasot_dataset.py b/mmtrack/datasets/lasot_dataset.py
index 6547d87ce..de57b3d4f 100644
--- a/mmtrack/datasets/lasot_dataset.py
+++ b/mmtrack/datasets/lasot_dataset.py
@@ -1,34 +1,96 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import glob
+import os.path as osp
+import time
+
 import numpy as np
 from mmdet.datasets import DATASETS
 
-from .sot_test_dataset import SOTTestDataset
+from .base_sot_dataset import BaseSOTDataset
 
 
 @DATASETS.register_module()
-class LaSOTDataset(SOTTestDataset):
-    """LaSOT dataset for the testing of single object tracking.
+class LaSOTDataset(BaseSOTDataset):
+    """LaSOT dataset of single object tracking.
 
-    The dataset doesn't support training mode.
+    The dataset can both support training and testing mode.
     """
 
-    def _parse_ann_info(self, img_info, ann_info):
-        """Parse bbox annotations.
+    def __init__(self, ann_file, *args, **kwargs):
+        """Store the annotation file path, then build the base SOT dataset.
+
+        Args:
+            ann_file (str): Text file listing the testing video names. It is
+                read by `self.load_data_infos`.
+        """
+        self.ann_file = ann_file
+        super().__init__(*args, **kwargs)
+
+    def load_data_infos(self, split='test'):
+        """Load dataset information.
 
         Args:
-            img_info (dict): image information.
-            ann_info (list[dict]): Annotation information of an image. Each
-                image only has one bbox annotation.
+            split (str, optional): 'train' or 'test'. Defaults to 'test'.
 
         Returns:
-            dict: A dict containing the following keys: bboxes, labels,
-            ignore. labels are not useful in SOT.
+            list[dict]: The length of the list is the number of videos. The
+                inner dict is in the following format:
+                    {
+                        'video_path': the video path
+                        'ann_path': the annotation path
+                        'start_frame_id': the starting frame number contained
+                            in the image name
+                        'end_frame_id': the ending frame number contained in
+                            the image name
+                        'framename_template': the template of image name
+                    }
         """
-        gt_bboxes = np.array(ann_info[0]['bbox'], dtype=np.float32)
-        # convert [x1, y1, w, h] to [x1, y1, x2, y2]
-        gt_bboxes[2] += gt_bboxes[0]
-        gt_bboxes[3] += gt_bboxes[1]
-        gt_labels = np.array(self.cat2label[ann_info[0]['category_id']])
-        ignore = ann_info[0]['full_occlusion'] or ann_info[0]['out_of_view']
-        ann = dict(bboxes=gt_bboxes, labels=gt_labels, ignore=ignore)
-        return ann
+        print('Loading LaSOT dataset...')
+        start_time = time.time()
+        assert split in ['train', 'test']
+        data_infos = []
+
+        test_videos_list = np.loadtxt(self.ann_file, dtype=np.str_)
+        if self.test_mode:
+            videos_list = test_videos_list.tolist()
+        else:
+            # `[0-9]*` accepts every sequence id (`[1-20]` only matched 0/1/2).
+            all_videos_list = glob.glob(self.img_prefix + '/*/*-[0-9]*')
+            test_videos = set(test_videos_list)
+            videos_list = [
+                name for name in map(osp.basename, all_videos_list)
+                if name not in test_videos
+            ]
+
+        videos_list = sorted(videos_list)
+        for video_name in videos_list:
+            video_name = osp.join(video_name.split('-')[0], video_name)
+            video_path = osp.join(video_name, 'img')
+            ann_path = osp.join(video_name, 'groundtruth.txt')
+            img_names = glob.glob(
+                osp.join(self.img_prefix, video_name, 'img', '*.jpg'))
+            end_frame_name = max(
+                img_names, key=lambda x: int(osp.basename(x).split('.')[0]))
+            end_frame_id = int(osp.basename(end_frame_name).split('.')[0])
+            data_infos.append(
+                dict(
+                    video_path=video_path,
+                    ann_path=ann_path,
+                    start_frame_id=1,
+                    end_frame_id=end_frame_id,
+                    framename_template='%08d.jpg'))
+        print(f'LaSOT dataset loaded! ({time.time()-start_time:.2f} s)')
+        return data_infos
+
+    def get_visibility_from_video(self, video_ind):
+        """Return a dict whose `visible` entry flags the visible frames."""
+        video_dir = osp.join(
+            self.img_prefix,
+            osp.dirname(self.data_infos[video_ind]['video_path']))
+        flags = [
+            np.loadtxt(osp.join(video_dir, name), dtype=bool, delimiter=',')
+            for name in ('full_occlusion.txt', 'out_of_view.txt')
+        ]
+        # a frame is visible when it is neither fully occluded nor out of view
+        visible = ~(flags[0] | flags[1])
+        return dict(visible=visible)
diff --git a/mmtrack/datasets/otb_dataset.py b/mmtrack/datasets/otb_dataset.py
index babfe2194..0a7f5f932 100644
--- a/mmtrack/datasets/otb_dataset.py
+++ b/mmtrack/datasets/otb_dataset.py
@@ -1,11 +1,106 @@
-from mmtrack.datasets import DATASETS
-from .sot_test_dataset import SOTTestDataset
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import re
+import time
+
+import numpy as np
+from mmdet.datasets import DATASETS
+
+from .base_sot_dataset import BaseSOTDataset
 
 
 @DATASETS.register_module()
-class OTB100Dataset(SOTTestDataset):
-    """OTB100 dataset for the testing of single object tracking.
+class OTB100Dataset(BaseSOTDataset):
+    """OTB100 dataset of single object tracking.
 
-    The dataset doesn't support training mode.
+    The dataset is only used to test.
     """
-    pass
+
+    def __init__(self, ann_file, *args, **kwargs):
+        """Store the annotation file path, then build the base SOT dataset.
+
+        Args:
+            ann_file (str): Text file with one comma-separated record per
+                video; it is parsed in `self.load_data_infos`.
+        """
+        self.ann_file = ann_file
+        super().__init__(*args, **kwargs)
+
+    def load_data_infos(self, split='test'):
+        """Parse the annotation file into one info dict per video.
+
+        Args:
+            split (str, optional): Dataset split; not used by this loader.
+                Defaults to 'test'.
+
+        Returns:
+            list[dict]: One entry per video, each with the keys:
+                    {
+                        'video_path': the video path
+                        'ann_path': the annotation path
+                        'start_frame_id': the starting frame number contained
+                            in the image name
+                        'end_frame_id': the ending frame number contained in
+                            the image name
+                        'framename_template': the template of image name
+                        'init_skip_num': (optional) the number of skipped
+                            frames when initializing tracker
+                    }
+        """
+        print('Loading OTB100 dataset...')
+        start_time = time.time()
+        with open(self.ann_file, 'r') as f:
+            # the first line of annotation file is dataset comment.
+            records = [row.strip().split(',') for row in f.readlines()[1:]]
+        data_infos = []
+        # fields: video path, annotation path, first frame id, last frame id
+        for fields in records:
+            video_name = fields[0].split('/')[0]
+            # only the `Board` video uses five-digit frame names
+            data_info = dict(
+                video_path=fields[0],
+                ann_path=fields[1],
+                start_frame_id=int(fields[2]),
+                end_frame_id=int(fields[3]),
+                framename_template='%05d.jpg'
+                if video_name == 'Board' else '%04d.jpg')
+            # Tracker initialization in `Tiger1` video will skip the first
+            # 5 frames. Details can be seen in the official file
+            # `tracker_benchmark_v1.0/initOmit/tiger1.txt`.
+            # Annotation loading will refer to this information.
+            if video_name == 'Tiger1':
+                data_info['init_skip_num'] = 5
+            data_infos.append(data_info)
+        print(f'OTB100 dataset loaded! ({time.time()-start_time:.2f} s)')
+        return data_infos
+
+    def get_bboxes_from_video(self, video_ind):
+        """Read the ground-truth boxes of one video from its annotation file.
+
+        Args:
+            video_ind (int): video index
+
+        Returns:
+            ndarray: in [N, 4] shape. The N is the bbox number and the bbox
+                is in (x, y, w, h) format.
+        """
+        data_info = self.data_infos[video_ind]
+        bboxes_file = osp.join(self.img_prefix, data_info['ann_path'])
+        with open(bboxes_file, 'r') as f:
+            # regex parsing tolerates any delimiter between the integers
+            bboxes = [[int(num) for num in re.findall(r'-?\d+', row)]
+                      for row in f.readlines()]
+        bboxes = np.array(bboxes, dtype=float)
+
+        # drop the frames skipped during tracker initialization, if any
+        if 'init_skip_num' in data_info:
+            bboxes = bboxes[data_info['init_skip_num']:]
+
+        end_frame_id = data_info['end_frame_id']
+        start_frame_id = data_info['start_frame_id']
+        # frame ids are inclusive at both ends
+        assert len(bboxes) == (
+            end_frame_id - start_frame_id + 1
+        ), f'{len(bboxes)} is not equal to {end_frame_id}-{start_frame_id}+1'
+        assert bboxes.shape[1] == 4
+        return bboxes
diff --git a/mmtrack/datasets/uav123_dataset.py b/mmtrack/datasets/uav123_dataset.py
index 8e41cac4a..e622173f1 100644
--- a/mmtrack/datasets/uav123_dataset.py
+++ b/mmtrack/datasets/uav123_dataset.py
@@ -1,13 +1,60 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import time
+
 from mmdet.datasets import DATASETS
 
-from .sot_test_dataset import SOTTestDataset
+from .base_sot_dataset import BaseSOTDataset
 
 
 @DATASETS.register_module()
-class UAV123Dataset(SOTTestDataset):
-    """UAV123 dataset for the testing of single object tracking.
+class UAV123Dataset(BaseSOTDataset):
+    """UAV123 dataset of single object tracking.
 
-    The dataset doesn't support training mode.
+    The dataset is only used to test.
     """
-    pass
+
+    def __init__(self, ann_file, *args, **kwargs):
+        """Store the annotation file path, then build the base SOT dataset.
+
+        Args:
+            ann_file (str): Text file with one comma-separated record per
+                video; it is parsed in `self.load_data_infos`.
+        """
+        self.ann_file = ann_file
+        super().__init__(*args, **kwargs)
+
+    def load_data_infos(self, split='test'):
+        """Parse the annotation file into one info dict per video.
+
+        Args:
+            split (str, optional): Dataset split; not used by this loader.
+                Defaults to 'test'.
+
+        Returns:
+            list[dict]: One entry per video, each with the keys:
+                    {
+                        'video_path': the video path
+                        'ann_path': the annotation path
+                        'start_frame_id': the starting frame number contained
+                            in the image name
+                        'end_frame_id': the ending frame number contained in
+                            the image name
+                        'framename_template': the template of image name
+                    }
+        """
+        print('Loading UAV123 dataset...')
+        start_time = time.time()
+        with open(self.ann_file, 'r') as f:
+            # skip the leading line, which is only a dataset comment
+            rows = [row.strip().split(',') for row in f.readlines()[1:]]
+        # each row holds: video path, annotation path, first and last frame id
+        data_infos = [
+            dict(
+                video_path=row[0],
+                ann_path=row[1],
+                start_frame_id=int(row[2]),
+                end_frame_id=int(row[3]),
+                framename_template='%06d.jpg') for row in rows
+        ]
+        print(f'UAV123 dataset loaded! ({time.time()-start_time:.2f} s)')
+        return data_infos
diff --git a/mmtrack/datasets/vot_dataset.py b/mmtrack/datasets/vot_dataset.py
index a3aaa8330..548f79f6c 100644
--- a/mmtrack/datasets/vot_dataset.py
+++ b/mmtrack/datasets/vot_dataset.py
@@ -1,27 +1,37 @@
+import glob
 import os.path as osp
+import time
 
+import mmcv
 import numpy as np
 from mmcv.utils import print_log
 from mmdet.datasets import DATASETS
 
 from mmtrack.core.evaluation import eval_sot_accuracy_robustness, eval_sot_eao
-from .sot_test_dataset import SOTTestDataset
+from .base_sot_dataset import BaseSOTDataset
 
 
 @DATASETS.register_module()
-class VOTDataset(SOTTestDataset):
-    """VOT dataset for the testing of single object tracking.
+class VOTDataset(BaseSOTDataset):
+    """VOT dataset of single object tracking.
 
-    The dataset doesn't support training mode.
-
-    Note: The vot datasets using the mask annotation, such as VOT2020, is not
-    supported now.
+    The dataset is only used to test.
     """
-    CLASSES = (0, )
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, dataset_type='vot2018', *args, **kwargs):
+        """Initialization of SOT dataset class.
+
+        Args:
+            dataset_type (str, optional): The type of VOT challenge. The
+                optional values are in ['vot2018', 'vot2018_lt',
+                'vot2019', 'vot2019_lt', 'vot2020', 'vot2021']
+        """
+        assert dataset_type in [
+            'vot2018', 'vot2018_lt', 'vot2019', 'vot2019_lt', 'vot2020',
+            'vot2021'
+        ]
+        self.dataset_type = dataset_type
         super().__init__(*args, **kwargs)
-        self.dataset_name = osp.basename(self.ann_file).rstrip('.json')
         # parameter, used for EAO evaluation, may vary by different vot
         # challenges.
         self.INTERVAL = dict(
@@ -30,30 +40,79 @@ def __init__(self, *args, **kwargs):
             vot2020=[115, 755],
             vot2021=[115, 755])
 
-    def _parse_ann_info(self, img_info, ann_info):
-        """Parse bbox annotations.
+    def load_data_infos(self, split='test'):
+        """Build one info dict per video listed in `list.txt`.
+
+        Args:
+            split (str, optional): Dataset split; not used by this loader.
+                Defaults to 'test'.
+
+        Returns:
+            list[dict]: One entry per video, each with the keys:
+                    {
+                        'video_path': the video path
+                        'ann_path': the annotation path
+                        'start_frame_id': the starting frame number contained
+                            in the image name
+                        'end_frame_id': the ending frame number contained in
+                            the image name
+                        'framename_template': the template of image name
+                    }
+        """
+        print('Loading VOT dataset...')
+        start_time = time.time()
+        # `list.txt` under the image root lists the video names
+        videos_list = np.loadtxt(
+            osp.join(self.img_prefix, 'list.txt'), dtype=np.str_)
+        data_infos = []
+        for video_name in videos_list:
+            video_path = osp.join(video_name, 'color')
+            img_names = glob.glob(
+                osp.join(self.img_prefix, video_path + '/*.jpg'))
+            # frames are named by number, so the largest stem is the last id
+            frame_ids = [int(osp.basename(x).split('.')[0]) for x in img_names]
+            data_infos.append(
+                dict(
+                    video_path=video_path,
+                    ann_path=osp.join(video_name, 'groundtruth.txt'),
+                    start_frame_id=1,
+                    end_frame_id=max(frame_ids),
+                    framename_template='%08d.jpg'))
+        print(f'VOT dataset loaded! ({time.time()-start_time:.2f} s)')
+        return data_infos
+
+    def get_ann_infos_from_video(self, video_ind):
+        """Get the annotation infos of the instance in a video.
 
         Args:
-            img_info (dict): image information.
-            ann_info (list[dict]): Annotation information of an image. Each
-                image only has one bbox annotation.
+            video_ind (int): video index
+
         Returns:
-            dict: A dict containing the following keys: bboxes, labels.
-            labels are not useful in SOT.
+            dict: With keys `bboxes` (expected [N, 8] ndarray of corner points
+                x1, y1, ..., x4, y4), `bboxes_isvalid` and visibility info.
         """
-        # The shape of gt_bboxes is (8, ), in [x1, y1, x2, y2, x3, y3, x4, y4]
-        # format
-        gt_bboxes = np.array(ann_info[0]['bbox'], dtype=np.float32)
-        gt_labels = np.array(self.cat2label[ann_info[0]['category_id']])
-        ann = dict(bboxes=gt_bboxes, labels=gt_labels)
-        return ann
+        bboxes = self.get_bboxes_from_video(video_ind)
+        if bboxes.shape[1] == 4:
+            # expand (x, y, w, h) rectangles into their four corner points
+            left, top = bboxes[:, 0], bboxes[:, 1]
+            right, bottom = left + bboxes[:, 2], top + bboxes[:, 3]
+            bboxes = np.stack(
+                (left, top, right, top, right, bottom, left, bottom), axis=-1)
+
+        visible_info = self.get_visibility_from_video(video_ind)
+        # bboxes in VOT datasets are all valid
+        bboxes_isvalid = np.ones(len(bboxes), dtype=np.bool_)
+        ann_infos = dict(
+            bboxes=bboxes, bboxes_isvalid=bboxes_isvalid, **visible_info)
+        return ann_infos
 
     # TODO support multirun test
     def evaluate(self, results, metric=['track'], logger=None, interval=None):
         """Evaluation in VOT protocol.
 
         Args:
-            results (dict): Testing results of the dataset.
+            results (dict): Testing results of the dataset. The tracking bboxes
+                are in (tl_x, tl_y, br_x, br_y) format.
             metric (str | list[str]): Metrics to be evaluated. Options are
                 'track'.
             logger (logging.Logger | str | None): Logger used for printing
@@ -75,40 +134,53 @@ def evaluate(self, results, metric=['track'], logger=None, interval=None):
             if metric not in allowed_metrics:
                 raise KeyError(f'metric {metric} is not supported.')
 
+        # get all test annotations
+        # annotations are in list[ndarray] format
+        annotations = []
+        for video_ind in range(len(self.data_infos)):
+            bboxes = self.get_ann_infos_from_video(video_ind)['bboxes']
+            annotations.append(bboxes)
+
+        # split tracking bboxes per video and convert them for evaluation
         eval_results = dict()
         if 'track' in metrics:
-            assert len(self.data_infos) == len(results['track_bboxes'])
+            assert len(self) == len(
+                results['track_bboxes']
+            ), f"{len(self)} == {len(results['track_bboxes'])}"
             print_log('Evaluate VOT Benchmark...', logger=logger)
-            inds = []
+            track_bboxes = []
+            start_ind = end_ind = 0
             videos_wh = []
-            ann_infos = []
-            for i, info in enumerate(self.data_infos):
-                if info['frame_id'] == 0:
-                    inds.append(i)
-                    videos_wh.append((info['width'], info['height']))
+            for data_info in self.data_infos:
+                num = data_info['end_frame_id'] - data_info[
+                    'start_frame_id'] + 1
+                end_ind += num
 
-                ann_infos.append(self.get_ann_info(info))
-
-            num_vids = len(inds)
-            inds.append(len(self.data_infos))
-            track_bboxes = []
-            annotations = []
-            for i in range(num_vids):
                 bboxes_per_video = []
-                for bbox in results['track_bboxes'][inds[i]:inds[i + 1]]:
+                # results are in dict(track_bboxes=list[ndarray]) format
+                # track_bboxes are in list[list[ndarray]] format
+                for bbox in results['track_bboxes'][start_ind:end_ind]:
                     # the last element of `bbox` is score.
                     if len(bbox) != 2:
                         # convert bbox format from (tl_x, tl_y, br_x, br_y) to
                         # (x1, y1, w, h)
                         bbox[2] -= bbox[0]
                         bbox[3] -= bbox[1]
+
                     bboxes_per_video.append(bbox[:-1])
+
                 track_bboxes.append(bboxes_per_video)
-                annotations.append(ann_infos[inds[i]:inds[i + 1]])
+                start_ind += num
 
-            interval = self.INTERVAL[self.dataset_name] if interval is None \
+                # read one image in the video to get video width and height
+                filename = osp.join(self.img_prefix, data_info['video_path'],
+                                    data_info['framename_template'] % 1)
+                img = mmcv.imread(filename)
+                videos_wh.append((img.shape[1], img.shape[0]))
+
+            interval = self.INTERVAL[self.dataset_type] if interval is None \
                 else interval
-            # anno_info is list[list[dict]]
+
             eao_score = eval_sot_eao(
                 results=track_bboxes,
                 annotations=annotations,
diff --git a/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/full_occlusion.txt b/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/full_occlusion.txt
new file mode 100755
index 000000000..15794e007
--- /dev/null
+++ b/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/full_occlusion.txt
@@ -0,0 +1 @@
+0,0
diff --git a/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/groundtruth.txt b/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/groundtruth.txt
new file mode 100644
index 000000000..bf06a6169
--- /dev/null
+++ b/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/groundtruth.txt
@@ -0,0 +1,2 @@
+1,100,1,100
+1,100,1,100
diff --git a/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/gt_for_eval.txt b/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/gt_for_eval.txt
new file mode 100644
index 000000000..496a7042e
--- /dev/null
+++ b/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/gt_for_eval.txt
@@ -0,0 +1,25 @@
+367,101,41,16
+366,103,45,16
+364,107,45,15
+362,109,46,16
+362,111,46,18
+362,113,46,18
+364,116,46,17
+366,118,45,17
+362,119,48,17
+359,119,45,17
+358,119,46,17
+360,121,46,17
+360,124,46,17
+359,124,47,17
+360,126,46,17
+356,127,46,18
+354,127,46,17
+352,127,46,17
+352,126,44,17
+349,126,46,17
+347,126,46,17
+346,125,46,17
+345,124,47,17
+345,124,46,17
+344,124,47,17
diff --git a/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/img/00000001.jpg b/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/img/00000001.jpg
new file mode 100644
index 000000000..81e94785b
Binary files /dev/null and b/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/img/00000001.jpg differ
diff --git a/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/img/00000002.jpg b/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/img/00000002.jpg
new file mode 100644
index 000000000..81e94785b
Binary files /dev/null and b/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/img/00000002.jpg differ
diff --git a/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/out_of_view.txt b/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/out_of_view.txt
new file mode 100755
index 000000000..15794e007
--- /dev/null
+++ b/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/out_of_view.txt
@@ -0,0 +1 @@
+0,0
diff --git a/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/track_results.txt b/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/track_results.txt
new file mode 100644
index 000000000..7648a02f5
--- /dev/null
+++ b/tests/data/demo_sot_data/lasot_full/airplane/airplane-1/track_results.txt
@@ -0,0 +1,25 @@
+367,101,408,117
+367,102,410,118
+363,105,406,121
+362,109,407,124
+361,112,407,128
+362,114,408,130
+364,116,410,132
+364,118,411,134
+360,120,408,136
+356,119,404,135
+356,119,404,135
+359,121,407,137
+359,124,407,141
+359,125,407,141
+358,126,406,143
+354,127,402,144
+351,127,400,144
+350,127,398,143
+349,127,397,143
+346,126,394,142
+344,126,392,143
+343,125,392,142
+343,123,392,140
+343,124,392,141
+341,124,392,141
diff --git a/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/full_occlusion.txt b/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/full_occlusion.txt
new file mode 100755
index 000000000..15794e007
--- /dev/null
+++ b/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/full_occlusion.txt
@@ -0,0 +1 @@
+0,0
diff --git a/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/groundtruth.txt b/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/groundtruth.txt
new file mode 100644
index 000000000..bf06a6169
--- /dev/null
+++ b/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/groundtruth.txt
@@ -0,0 +1,2 @@
+1,100,1,100
+1,100,1,100
diff --git a/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/gt_for_eval.txt b/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/gt_for_eval.txt
new file mode 100644
index 000000000..2580b2402
--- /dev/null
+++ b/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/gt_for_eval.txt
@@ -0,0 +1,25 @@
+76,74,367,151
+75,76,369,150
+78,76,368,150
+81,77,366,149
+82,76,367,150
+81,74,370,151
+81,74,370,152
+84,77,370,151
+89,79,371,149
+88,78,372,149
+88,78,372,150
+90,79,374,149
+90,80,374,149
+89,81,374,150
+92,81,375,150
+94,80,378,150
+95,80,379,150
+96,79,376,151
+96,79,375,152
+100,81,377,150
+102,81,377,150
+99,79,376,152
+99,82,379,150
+104,82,375,150
+100,81,379,152
diff --git a/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/img/00000001.jpg b/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/img/00000001.jpg
new file mode 100644
index 000000000..81e94785b
Binary files /dev/null and b/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/img/00000001.jpg differ
diff --git a/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/img/00000002.jpg b/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/img/00000002.jpg
new file mode 100644
index 000000000..81e94785b
Binary files /dev/null and b/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/img/00000002.jpg differ
diff --git a/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/out_of_view.txt b/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/out_of_view.txt
new file mode 100755
index 000000000..15794e007
--- /dev/null
+++ b/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/out_of_view.txt
@@ -0,0 +1 @@
+0,0
diff --git a/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/track_results.txt b/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/track_results.txt
new file mode 100644
index 000000000..2716acd33
--- /dev/null
+++ b/tests/data/demo_sot_data/lasot_full/basketball/basketball-2/track_results.txt
@@ -0,0 +1,25 @@
+15,123,544,267
+18,130,545,274
+23,106,553,252
+20,117,547,264
+17,122,545,267
+13,129,540,273
+24,104,551,249
+29,110,559,255
+34,113,566,258
+31,122,557,266
+32,127,552,271
+30,135,548,276
+37,110,554,254
+31,112,558,258
+31,119,560,264
+21,124,547,268
+48,132,578,277
+22,102,553,249
+11,105,544,253
+19,110,551,257
+22,113,557,257
+32,112,567,255
+30,115,566,258
+34,116,570,261
+28,120,556,265
diff --git a/tests/data/demo_sot_data/lasot_full/testing_set.txt b/tests/data/demo_sot_data/lasot_full/testing_set.txt
new file mode 100644
index 000000000..b9db8f6f9
--- /dev/null
+++ b/tests/data/demo_sot_data/lasot_full/testing_set.txt
@@ -0,0 +1,2 @@
+airplane-1
+airplane-1
diff --git a/tests/data/demo_sot_data/vot2018/ants1/color/00000001.jpg b/tests/data/demo_sot_data/vot2018/ants1/color/00000001.jpg
new file mode 100644
index 000000000..81e94785b
Binary files /dev/null and b/tests/data/demo_sot_data/vot2018/ants1/color/00000001.jpg differ
diff --git a/tests/data/demo_sot_data/vot2018/ants1/color/00000002.jpg b/tests/data/demo_sot_data/vot2018/ants1/color/00000002.jpg
new file mode 100644
index 000000000..81e94785b
Binary files /dev/null and b/tests/data/demo_sot_data/vot2018/ants1/color/00000002.jpg differ
diff --git a/tests/data/demo_sot_data/vot2018/ants1/groundtruth.txt b/tests/data/demo_sot_data/vot2018/ants1/groundtruth.txt
new file mode 100644
index 000000000..7e32c84af
--- /dev/null
+++ b/tests/data/demo_sot_data/vot2018/ants1/groundtruth.txt
@@ -0,0 +1,2 @@
+1,1,100,1,100,100,1,100
+1,1,100,1,100,100,1,100
diff --git a/tests/data/demo_sot_data/vot2018/ants1/gt_for_eval.txt b/tests/data/demo_sot_data/vot2018/ants1/gt_for_eval.txt
new file mode 100644
index 000000000..68f7c1d1b
--- /dev/null
+++ b/tests/data/demo_sot_data/vot2018/ants1/gt_for_eval.txt
@@ -0,0 +1,25 @@
+367,101,408,101,408,117,367,117
+366,103,411,103,411,119,366,119
+364,107,409,107,409,122,364,122
+362,109,408,109,408,125,362,125
+362,111,408,111,408,129,362,129
+362,113,408,113,408,131,362,131
+364,116,410,116,410,133,364,133
+366,118,411,118,411,135,366,135
+362,119,410,119,410,136,362,136
+359,119,404,119,404,136,359,136
+358,119,404,119,404,136,358,136
+360,121,406,121,406,138,360,138
+360,124,406,124,406,141,360,141
+359,124,406,124,406,141,359,141
+360,126,406,126,406,143,360,143
+356,127,402,127,402,145,356,145
+354,127,400,127,400,144,354,144
+352,127,398,127,398,144,352,144
+352,126,396,126,396,143,352,143
+349,126,395,126,395,143,349,143
+347,126,393,126,393,143,347,143
+346,125,392,125,392,142,346,142
+345,124,392,124,392,141,345,141
+345,124,391,124,391,141,345,141
+344,124,391,124,391,141,344,141
diff --git a/tests/data/demo_sot_data/lasot/airplane-1/vot_track_results.txt b/tests/data/demo_sot_data/vot2018/ants1/track_results.txt
similarity index 100%
rename from tests/data/demo_sot_data/lasot/airplane-1/vot_track_results.txt
rename to tests/data/demo_sot_data/vot2018/ants1/track_results.txt
diff --git a/tests/data/demo_sot_data/vot2018/ants3/color/00000001.jpg b/tests/data/demo_sot_data/vot2018/ants3/color/00000001.jpg
new file mode 100644
index 000000000..81e94785b
Binary files /dev/null and b/tests/data/demo_sot_data/vot2018/ants3/color/00000001.jpg differ
diff --git a/tests/data/demo_sot_data/vot2018/ants3/color/00000002.jpg b/tests/data/demo_sot_data/vot2018/ants3/color/00000002.jpg
new file mode 100644
index 000000000..81e94785b
Binary files /dev/null and b/tests/data/demo_sot_data/vot2018/ants3/color/00000002.jpg differ
diff --git a/tests/data/demo_sot_data/vot2018/ants3/groundtruth.txt b/tests/data/demo_sot_data/vot2018/ants3/groundtruth.txt
new file mode 100644
index 000000000..7e32c84af
--- /dev/null
+++ b/tests/data/demo_sot_data/vot2018/ants3/groundtruth.txt
@@ -0,0 +1,2 @@
+1,1,100,1,100,100,1,100
+1,1,100,1,100,100,1,100
diff --git a/tests/data/demo_sot_data/vot2018/ants3/gt_for_eval.txt b/tests/data/demo_sot_data/vot2018/ants3/gt_for_eval.txt
new file mode 100644
index 000000000..8814fef9e
--- /dev/null
+++ b/tests/data/demo_sot_data/vot2018/ants3/gt_for_eval.txt
@@ -0,0 +1,25 @@
+76,74,443,74,443,225,76,225
+75,76,444,76,444,226,75,226
+78,76,446,76,446,226,78,226
+81,77,447,77,447,226,81,226
+82,76,449,76,449,226,82,226
+81,74,451,74,451,225,81,225
+81,74,451,74,451,226,81,226
+84,77,454,77,454,228,84,228
+89,79,460,79,460,228,89,228
+88,78,460,78,460,227,88,227
+88,78,460,78,460,228,88,228
+90,79,464,79,464,228,90,228
+90,80,464,80,464,229,90,229
+89,81,463,81,463,231,89,231
+92,81,467,81,467,231,92,231
+94,80,472,80,472,230,94,230
+95,80,474,80,474,230,95,230
+96,79,472,79,472,230,96,230
+96,79,471,79,471,231,96,231
+100,81,477,81,477,231,100,231
+102,81,479,81,479,231,102,231
+99,79,475,79,475,231,99,231
+99,82,478,82,478,232,99,232
+104,82,479,82,479,232,104,232
+100,81,479,81,479,233,100,233
diff --git a/tests/data/demo_sot_data/lasot/airplane-2/vot_track_results.txt b/tests/data/demo_sot_data/vot2018/ants3/track_results.txt
similarity index 100%
rename from tests/data/demo_sot_data/lasot/airplane-2/vot_track_results.txt
rename to tests/data/demo_sot_data/vot2018/ants3/track_results.txt
diff --git a/tests/data/demo_sot_data/vot2018/list.txt b/tests/data/demo_sot_data/vot2018/list.txt
new file mode 100644
index 000000000..9c6f04f85
--- /dev/null
+++ b/tests/data/demo_sot_data/vot2018/list.txt
@@ -0,0 +1,2 @@
+ants1
+ants3
diff --git a/tests/test_data/test_datasets/test_sot_dataset.py b/tests/test_data/test_datasets/test_sot_dataset.py
index 9adeb4ee6..b87c485b3 100644
--- a/tests/test_data/test_datasets/test_sot_dataset.py
+++ b/tests/test_data/test_datasets/test_sot_dataset.py
@@ -12,6 +12,18 @@
 SOT_DATA_PREFIX = f'{PREFIX}/demo_sot_data'
 DATASET_INFOS = dict(
     GOT10kDataset=dict(img_prefix=osp.join(SOT_DATA_PREFIX, 'got10k')),
+    VOTDataset=dict(
+        dataset_type='vot2018',
+        img_prefix=osp.join(SOT_DATA_PREFIX, 'vot2018')),
+    OTB100Dataset=dict(
+        ann_file='tools/convert_datasets/otb100/otb100_infos.txt',
+        img_prefix=osp.join(SOT_DATA_PREFIX, 'otb100')),
+    UAV123Dataset=dict(
+        ann_file='tools/convert_datasets/uav123/uav123_infos.txt',
+        img_prefix=osp.join(SOT_DATA_PREFIX, 'uav123')),
+    LaSOTDataset=dict(
+        ann_file=osp.join(SOT_DATA_PREFIX, 'lasot_full', 'testing_set.txt'),
+        img_prefix=osp.join(SOT_DATA_PREFIX, 'lasot_full')),
     TrackingNetDataset=dict(
         chunks_list=[0], img_prefix=osp.join(SOT_DATA_PREFIX, 'trackingnet')),
     SOTCocoDataset=dict(
@@ -23,7 +35,8 @@
 
 
 @pytest.mark.parametrize('dataset', [
-    'GOT10kDataset', 'TrackingNetDataset', 'SOTImageNetVIDDataset',
+    'GOT10kDataset', 'VOTDataset', 'OTB100Dataset', 'UAV123Dataset',
+    'LaSOTDataset', 'TrackingNetDataset', 'SOTImageNetVIDDataset',
     'SOTCocoDataset'
 ])
 def test_load_data_infos(dataset):
@@ -34,8 +47,8 @@ def test_load_data_infos(dataset):
 
 
 @pytest.mark.parametrize('dataset', [
-    'GOT10kDataset', 'TrackingNetDataset', 'SOTImageNetVIDDataset',
-    'SOTCocoDataset'
+    'GOT10kDataset', 'VOTDataset', 'LaSOTDataset', 'TrackingNetDataset',
+    'SOTImageNetVIDDataset', 'SOTCocoDataset'
 ])
 def test_get_bboxes_from_video(dataset):
     dataset_class = DATASETS.get(dataset)
@@ -52,8 +65,8 @@ def test_get_bboxes_from_video(dataset):
 
 
 @pytest.mark.parametrize('dataset', [
-    'GOT10kDataset', 'TrackingNetDataset', 'SOTImageNetVIDDataset',
-    'SOTCocoDataset'
+    'GOT10kDataset', 'VOTDataset', 'LaSOTDataset', 'TrackingNetDataset',
+    'SOTImageNetVIDDataset', 'SOTCocoDataset'
 ])
 def test_get_visibility_from_video(dataset):
     dataset_class = DATASETS.get(dataset)
@@ -66,7 +79,7 @@ def test_get_visibility_from_video(dataset):
 
 @pytest.mark.parametrize('dataset', [
     'GOT10kDataset', 'TrackingNetDataset', 'SOTImageNetVIDDataset',
-    'SOTCocoDataset'
+    'SOTCocoDataset', 'VOTDataset', 'LaSOTDataset'
 ])
 def test_get_ann_infos_from_video(dataset):
     dataset_class = DATASETS.get(dataset)
@@ -78,7 +91,7 @@ def test_get_ann_infos_from_video(dataset):
 
 @pytest.mark.parametrize('dataset', [
     'GOT10kDataset', 'TrackingNetDataset', 'SOTImageNetVIDDataset',
-    'SOTCocoDataset'
+    'SOTCocoDataset', 'VOTDataset', 'LaSOTDataset'
 ])
 def test_get_img_infos_from_video(dataset):
     dataset_class = DATASETS.get(dataset)
@@ -88,7 +101,9 @@ def test_get_img_infos_from_video(dataset):
     dataset_object.get_img_infos_from_video(0)
 
 
-@pytest.mark.parametrize('dataset', ['GOT10kDataset', 'TrackingNetDataset'])
+@pytest.mark.parametrize(
+    'dataset',
+    ['GOT10kDataset', 'VOTDataset', 'LaSOTDataset', 'TrackingNetDataset'])
 def test_prepare_test_data(dataset):
     dataset_class = DATASETS.get(dataset)
 
@@ -99,7 +114,7 @@ def test_prepare_test_data(dataset):
 
 @pytest.mark.parametrize('dataset', [
     'GOT10kDataset', 'TrackingNetDataset', 'SOTImageNetVIDDataset',
-    'SOTCocoDataset'
+    'SOTCocoDataset', 'LaSOTDataset'
 ])
 def test_prepare_train_data(dataset):
     dataset_class = DATASETS.get(dataset)
@@ -138,3 +153,85 @@ def test_format_results(dataset):
     dataset_object.format_results(track_results, resfile_path=tmp_dir.name)
     if osp.isdir(tmp_dir.name):
         tmp_dir.cleanup()
+
+
+def test_sot_ope_evaluation():  # OPE metrics on the lasot_full fixtures
+    dataset_class = DATASETS.get('UAV123Dataset')
+    dataset_object = dataset_class(
+        **DATASET_INFOS['UAV123Dataset'],
+        pipeline=[],
+        split='test',
+        test_mode=True)
+    # Build results and video infos by hand from two 25-frame fixtures.
+    dataset_object.num_frames_per_video = [25, 25]
+    results = []
+    data_infos = []
+    lasot_root = osp.join(SOT_DATA_PREFIX, 'lasot_full')
+    for video_name in ['airplane/airplane-1', 'basketball/basketball-2']:
+        bboxes = np.loadtxt(
+            osp.join(lasot_root, video_name, 'track_results.txt'),
+            delimiter=',')
+        scores = np.zeros((len(bboxes), 1))  # all-zero score column
+        bboxes = np.concatenate((bboxes, scores), axis=-1)
+        results.extend(bboxes)
+        data_infos.append(
+            dict(
+                video_path=osp.join(lasot_root, video_name, 'img'),
+                ann_path=osp.join(lasot_root, video_name, 'gt_for_eval.txt'),
+                start_frame_id=1,
+                end_frame_id=25,
+                framename_template='%06d.jpg'))
+    # Override the loaded video infos with the fixture entries built above.
+    dataset_object.data_infos = data_infos
+    track_results = dict(track_bboxes=results)
+    eval_results = dataset_object.evaluate(track_results, metric=['track'])
+    assert eval_results['success'] == 67.524
+    assert eval_results['norm_precision'] == 70.0
+    assert eval_results['precision'] == 50.0
+
+
+def test_sot_vot_evaluation():  # VOT metrics on the vot2018 fixtures
+    dataset_class = DATASETS.get('VOTDataset')
+    dataset_object = dataset_class(
+        **DATASET_INFOS['VOTDataset'],
+        pipeline=[],
+        split='test',
+        test_mode=True)
+    # Collect raw result lines and hand-built infos for two fixture videos.
+    dataset_object.num_frames_per_video = [25, 25]
+    data_infos = []
+    results = []
+    vot_root = osp.join(SOT_DATA_PREFIX, 'vot2018')
+    for video_name in ['ants1', 'ants3']:
+        results.extend(
+            mmcv.list_from_file(
+                osp.join(vot_root, video_name, 'track_results.txt')))
+        data_infos.append(
+            dict(
+                video_path=osp.join(vot_root, video_name, 'color'),
+                ann_path=osp.join(vot_root, video_name, 'gt_for_eval.txt'),
+                start_frame_id=1,
+                end_frame_id=25,
+                framename_template='%08d.jpg'))
+    dataset_object.data_infos = data_infos  # replaces the loaded infos
+    # Single-value lines keep that value, others four; both get a 0. score.
+    track_bboxes = []
+    for result in results:
+        result = result.split(',')
+        if len(result) == 1:
+            track_bboxes.append(np.array([float(result[0]), 0.]))
+        else:
+            track_bboxes.append(
+                np.array([
+                    float(result[0]),
+                    float(result[1]),
+                    float(result[2]),
+                    float(result[3]), 0.
+                ]))
+
+    track_bboxes = dict(track_bboxes=track_bboxes)
+    eval_results = dataset_object.evaluate(
+        track_bboxes, interval=[1, 3], metric=['track'])
+    assert abs(eval_results['eao'] - 0.6661) < 0.0001
+    assert round(eval_results['accuracy'], 4) == 0.5826
+    assert round(eval_results['robustness'], 4) == 6.0
diff --git a/tests/test_data/test_datasets/test_sot_test_dataset.py b/tests/test_data/test_datasets/test_sot_test_dataset.py
deleted file mode 100644
index 72a302c3c..000000000
--- a/tests/test_data/test_datasets/test_sot_test_dataset.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import os.path as osp
-
-import mmcv
-import numpy as np
-import pytest
-
-from mmtrack.datasets import DATASETS as DATASETS
-
-PREFIX = osp.join(osp.dirname(__file__), '../../data')
-LASOT_ANN_PATH = f'{PREFIX}/demo_sot_data/lasot'
-
-
-@pytest.mark.parametrize('dataset',
-                         ['SOTTestDataset', 'LaSOTDataset', 'VOTDataset'])
-def test_parse_ann_info(dataset):
-    dataset_class = DATASETS.get(dataset)
-
-    ann_file = osp.join(LASOT_ANN_PATH, 'lasot_test_dummy.json')
-    dataset_object = dataset_class(ann_file=ann_file, pipeline=[])
-
-    if dataset == 'VOTDataset':
-        for _, img_ann in dataset_object.coco.anns.items():
-            x, y, w, h = img_ann['bbox']
-            img_ann['bbox'] = [x, y, x + w, y, x + w, y + h, x, y + h]
-
-    # image 5 has 1 objects
-    img_id = 5
-    img_info = dataset_object.coco.load_imgs([img_id])[0]
-    ann_ids = dataset_object.coco.get_ann_ids([img_id])
-    ann_info = dataset_object.coco.loadAnns(ann_ids)
-    ann = dataset_object._parse_ann_info(img_info, ann_info)
-    assert ann['bboxes'].shape == (
-        4, ) if dataset != 'VOTDataset' else ann['bboxes'].shape == (8, )
-    assert ann['labels'] == 0
-
-
-def test_sot_ope_evaluation():
-    dataset_class = DATASETS.get('SOTTestDataset')
-    dataset = dataset_class(
-        ann_file=osp.join(LASOT_ANN_PATH, 'lasot_test_dummy.json'),
-        pipeline=[])
-
-    results = []
-    for video_name in ['airplane-1', 'airplane-2']:
-        results.extend(
-            mmcv.list_from_file(
-                osp.join(LASOT_ANN_PATH, video_name, 'track_results.txt')))
-    track_bboxes = []
-    for result in results:
-        x1, y1, x2, y2 = result.split(',')
-        track_bboxes.append(
-            np.array([float(x1),
-                      float(y1),
-                      float(x2),
-                      float(y2), 0.]))
-
-    track_results = dict(track_bboxes=track_bboxes)
-    eval_results = dataset.evaluate(track_results, metric=['track'])
-    assert eval_results['success'] == 67.524
-    assert eval_results['norm_precision'] == 70.0
-    assert eval_results['precision'] == 50.0
-
-
-def test_sot_vot_evaluation():
-    dataset_class = DATASETS.get('VOTDataset')
-    dataset = dataset_class(
-        ann_file=osp.join(LASOT_ANN_PATH, 'lasot_test_dummy.json'),
-        pipeline=[])
-
-    for _, img_ann in dataset.coco.anns.items():
-        x, y, w, h = img_ann['bbox']
-        img_ann['bbox'] = [x, y, x + w, y, x + w, y + h, x, y + h]
-
-    results = []
-    for video_name in ['airplane-1', 'airplane-2']:
-        results.extend(
-            mmcv.list_from_file(
-                osp.join(LASOT_ANN_PATH, video_name, 'vot_track_results.txt')))
-    track_bboxes = []
-    for result in results:
-        result = result.split(',')
-        if len(result) == 1:
-            track_bboxes.append(np.array([float(result[0]), 0.]))
-        else:
-            track_bboxes.append(
-                np.array([
-                    float(result[0]),
-                    float(result[1]),
-                    float(result[2]),
-                    float(result[3]), 0.
-                ]))
-
-    track_bboxes = dict(track_bboxes=track_bboxes)
-    eval_results = dataset.evaluate(
-        track_bboxes, interval=[1, 3], metric=['track'])
-    assert abs(eval_results['eao'] - 0.6394) < 0.0001
-    assert round(eval_results['accuracy'], 4) == 0.5431
-    assert round(eval_results['robustness'], 4) == 6.0
diff --git a/tools/convert_datasets/otb100/otb100_infos.txt b/tools/convert_datasets/otb100/otb100_infos.txt
new file mode 100644
index 000000000..bfd81686e
--- /dev/null
+++ b/tools/convert_datasets/otb100/otb100_infos.txt
@@ -0,0 +1,101 @@
+The format of each line in this txt is (video_path,annotation_path,start_frame_id,end_frame_id)
+Basketball/img,Basketball/groundtruth_rect.txt,1,725
+Biker/img,Biker/groundtruth_rect.txt,1,142
+Bird1/img,Bird1/groundtruth_rect.txt,1,408
+Bird2/img,Bird2/groundtruth_rect.txt,1,99
+BlurBody/img,BlurBody/groundtruth_rect.txt,1,334
+BlurCar1/img,BlurCar1/groundtruth_rect.txt,247,988
+BlurCar2/img,BlurCar2/groundtruth_rect.txt,1,585
+BlurCar3/img,BlurCar3/groundtruth_rect.txt,3,359
+BlurCar4/img,BlurCar4/groundtruth_rect.txt,18,397
+BlurFace/img,BlurFace/groundtruth_rect.txt,1,493
+BlurOwl/img,BlurOwl/groundtruth_rect.txt,1,631
+Board/img,Board/groundtruth_rect.txt,1,698
+Bolt/img,Bolt/groundtruth_rect.txt,1,350
+Bolt2/img,Bolt2/groundtruth_rect.txt,1,293
+Box/img,Box/groundtruth_rect.txt,1,1161
+Boy/img,Boy/groundtruth_rect.txt,1,602
+Car1/img,Car1/groundtruth_rect.txt,1,1020
+Car2/img,Car2/groundtruth_rect.txt,1,913
+Car24/img,Car24/groundtruth_rect.txt,1,3059
+Car4/img,Car4/groundtruth_rect.txt,1,659
+CarDark/img,CarDark/groundtruth_rect.txt,1,393
+CarScale/img,CarScale/groundtruth_rect.txt,1,252
+ClifBar/img,ClifBar/groundtruth_rect.txt,1,472
+Coke/img,Coke/groundtruth_rect.txt,1,291
+Couple/img,Couple/groundtruth_rect.txt,1,140
+Coupon/img,Coupon/groundtruth_rect.txt,1,327
+Crossing/img,Crossing/groundtruth_rect.txt,1,120
+Crowds/img,Crowds/groundtruth_rect.txt,1,347
+Dancer/img,Dancer/groundtruth_rect.txt,1,225
+Dancer2/img,Dancer2/groundtruth_rect.txt,1,150
+David/img,David/groundtruth_rect.txt,300,770
+David2/img,David2/groundtruth_rect.txt,1,537
+David3/img,David3/groundtruth_rect.txt,1,252
+Deer/img,Deer/groundtruth_rect.txt,1,71
+Diving/img,Diving/groundtruth_rect.txt,1,215
+Dog/img,Dog/groundtruth_rect.txt,1,127
+Dog1/img,Dog1/groundtruth_rect.txt,1,1350
+Doll/img,Doll/groundtruth_rect.txt,1,3872
+DragonBaby/img,DragonBaby/groundtruth_rect.txt,1,113
+Dudek/img,Dudek/groundtruth_rect.txt,1,1145
+FaceOcc1/img,FaceOcc1/groundtruth_rect.txt,1,892
+FaceOcc2/img,FaceOcc2/groundtruth_rect.txt,1,812
+Fish/img,Fish/groundtruth_rect.txt,1,476
+FleetFace/img,FleetFace/groundtruth_rect.txt,1,707
+Football/img,Football/groundtruth_rect.txt,1,362
+Football1/img,Football1/groundtruth_rect.txt,1,74
+Freeman1/img,Freeman1/groundtruth_rect.txt,1,326
+Freeman3/img,Freeman3/groundtruth_rect.txt,1,460
+Freeman4/img,Freeman4/groundtruth_rect.txt,1,283
+Girl/img,Girl/groundtruth_rect.txt,1,500
+Girl2/img,Girl2/groundtruth_rect.txt,1,1500
+Gym/img,Gym/groundtruth_rect.txt,1,767
+Human2/img,Human2/groundtruth_rect.txt,1,1128
+Human3/img,Human3/groundtruth_rect.txt,1,1698
+Human4/img,Human4/groundtruth_rect.2.txt,1,667
+Human5/img,Human5/groundtruth_rect.txt,1,713
+Human6/img,Human6/groundtruth_rect.txt,1,792
+Human7/img,Human7/groundtruth_rect.txt,1,250
+Human8/img,Human8/groundtruth_rect.txt,1,128
+Human9/img,Human9/groundtruth_rect.txt,1,305
+Ironman/img,Ironman/groundtruth_rect.txt,1,166
+Jogging/img,Jogging/groundtruth_rect.1.txt,1,307
+Jogging/img,Jogging/groundtruth_rect.2.txt,1,307
+Jump/img,Jump/groundtruth_rect.txt,1,122
+Jumping/img,Jumping/groundtruth_rect.txt,1,313
+KiteSurf/img,KiteSurf/groundtruth_rect.txt,1,84
+Lemming/img,Lemming/groundtruth_rect.txt,1,1336
+Liquor/img,Liquor/groundtruth_rect.txt,1,1741
+Man/img,Man/groundtruth_rect.txt,1,134
+Matrix/img,Matrix/groundtruth_rect.txt,1,100
+Mhyang/img,Mhyang/groundtruth_rect.txt,1,1490
+MotorRolling/img,MotorRolling/groundtruth_rect.txt,1,164
+MountainBike/img,MountainBike/groundtruth_rect.txt,1,228
+Panda/img,Panda/groundtruth_rect.txt,1,1000
+RedTeam/img,RedTeam/groundtruth_rect.txt,1,1918
+Rubik/img,Rubik/groundtruth_rect.txt,1,1997
+Shaking/img,Shaking/groundtruth_rect.txt,1,365
+Singer1/img,Singer1/groundtruth_rect.txt,1,351
+Singer2/img,Singer2/groundtruth_rect.txt,1,366
+Skater/img,Skater/groundtruth_rect.txt,1,160
+Skater2/img,Skater2/groundtruth_rect.txt,1,435
+Skating1/img,Skating1/groundtruth_rect.txt,1,400
+Skating2/img,Skating2/groundtruth_rect.1.txt,1,473
+Skating2/img,Skating2/groundtruth_rect.2.txt,1,473
+Skiing/img,Skiing/groundtruth_rect.txt,1,81
+Soccer/img,Soccer/groundtruth_rect.txt,1,392
+Subway/img,Subway/groundtruth_rect.txt,1,175
+Surfer/img,Surfer/groundtruth_rect.txt,1,376
+Suv/img,Suv/groundtruth_rect.txt,1,945
+Sylvester/img,Sylvester/groundtruth_rect.txt,1,1345
+Tiger1/img,Tiger1/groundtruth_rect.txt,6,354
+Tiger2/img,Tiger2/groundtruth_rect.txt,1,365
+Toy/img,Toy/groundtruth_rect.txt,1,271
+Trans/img,Trans/groundtruth_rect.txt,1,124
+Trellis/img,Trellis/groundtruth_rect.txt,1,569
+Twinnings/img,Twinnings/groundtruth_rect.txt,1,472
+Vase/img,Vase/groundtruth_rect.txt,1,271
+Walking/img,Walking/groundtruth_rect.txt,1,412
+Walking2/img,Walking2/groundtruth_rect.txt,1,500
+Woman/img,Woman/groundtruth_rect.txt,1,597
diff --git a/tools/convert_datasets/uav123/uav123_info.txt b/tools/convert_datasets/uav123/uav123_info_deprecated.txt
similarity index 100%
rename from tools/convert_datasets/uav123/uav123_info.txt
rename to tools/convert_datasets/uav123/uav123_info_deprecated.txt
diff --git a/tools/convert_datasets/uav123/uav123_infos.txt b/tools/convert_datasets/uav123/uav123_infos.txt
new file mode 100644
index 000000000..af958d42c
--- /dev/null
+++ b/tools/convert_datasets/uav123/uav123_infos.txt
@@ -0,0 +1,124 @@
+The format of each line in this txt is (video_path,annotation_path,start_frame_id,end_frame_id)
+data_seq/UAV123/bike1,anno/UAV123/bike1.txt,1,3085
+data_seq/UAV123/bike2,anno/UAV123/bike2.txt,1,553
+data_seq/UAV123/bike3,anno/UAV123/bike3.txt,1,433
+data_seq/UAV123/bird1,anno/UAV123/bird1_1.txt,1,253
+data_seq/UAV123/bird1,anno/UAV123/bird1_2.txt,775,1477
+data_seq/UAV123/bird1,anno/UAV123/bird1_3.txt,1573,2437
+data_seq/UAV123/boat1,anno/UAV123/boat1.txt,1,901
+data_seq/UAV123/boat2,anno/UAV123/boat2.txt,1,799
+data_seq/UAV123/boat3,anno/UAV123/boat3.txt,1,901
+data_seq/UAV123/boat4,anno/UAV123/boat4.txt,1,553
+data_seq/UAV123/boat5,anno/UAV123/boat5.txt,1,505
+data_seq/UAV123/boat6,anno/UAV123/boat6.txt,1,805
+data_seq/UAV123/boat7,anno/UAV123/boat7.txt,1,535
+data_seq/UAV123/boat8,anno/UAV123/boat8.txt,1,685
+data_seq/UAV123/boat9,anno/UAV123/boat9.txt,1,1399
+data_seq/UAV123/building1,anno/UAV123/building1.txt,1,469
+data_seq/UAV123/building2,anno/UAV123/building2.txt,1,577
+data_seq/UAV123/building3,anno/UAV123/building3.txt,1,829
+data_seq/UAV123/building4,anno/UAV123/building4.txt,1,787
+data_seq/UAV123/building5,anno/UAV123/building5.txt,1,481
+data_seq/UAV123/car10,anno/UAV123/car10.txt,1,1405
+data_seq/UAV123/car11,anno/UAV123/car11.txt,1,337
+data_seq/UAV123/car12,anno/UAV123/car12.txt,1,499
+data_seq/UAV123/car13,anno/UAV123/car13.txt,1,415
+data_seq/UAV123/car14,anno/UAV123/car14.txt,1,1327
+data_seq/UAV123/car15,anno/UAV123/car15.txt,1,469
+data_seq/UAV123/car16,anno/UAV123/car16_1.txt,1,415
+data_seq/UAV123/car16,anno/UAV123/car16_2.txt,415,1993
+data_seq/UAV123/car17,anno/UAV123/car17.txt,1,1057
+data_seq/UAV123/car18,anno/UAV123/car18.txt,1,1207
+data_seq/UAV123/car1,anno/UAV123/car1_1.txt,1,751
+data_seq/UAV123/car1,anno/UAV123/car1_2.txt,751,1627
+data_seq/UAV123/car1,anno/UAV123/car1_3.txt,1627,2629
+data_seq/UAV123/car1_s,anno/UAV123/car1_s.txt,1,1475
+data_seq/UAV123/car2,anno/UAV123/car2.txt,1,1321
+data_seq/UAV123/car2_s,anno/UAV123/car2_s.txt,1,320
+data_seq/UAV123/car3,anno/UAV123/car3.txt,1,1717
+data_seq/UAV123/car3_s,anno/UAV123/car3_s.txt,1,1300
+data_seq/UAV123/car4,anno/UAV123/car4.txt,1,1345
+data_seq/UAV123/car4_s,anno/UAV123/car4_s.txt,1,830
+data_seq/UAV123/car5,anno/UAV123/car5.txt,1,745
+data_seq/UAV123/car6,anno/UAV123/car6_1.txt,1,487
+data_seq/UAV123/car6,anno/UAV123/car6_2.txt,487,1807
+data_seq/UAV123/car6,anno/UAV123/car6_3.txt,1807,2953
+data_seq/UAV123/car6,anno/UAV123/car6_4.txt,2953,3925
+data_seq/UAV123/car6,anno/UAV123/car6_5.txt,3925,4861
+data_seq/UAV123/car7,anno/UAV123/car7.txt,1,1033
+data_seq/UAV123/car8,anno/UAV123/car8_1.txt,1,1357
+data_seq/UAV123/car8,anno/UAV123/car8_2.txt,1357,2575
+data_seq/UAV123/car9,anno/UAV123/car9.txt,1,1879
+data_seq/UAV123/group1,anno/UAV123/group1_1.txt,1,1333
+data_seq/UAV123/group1,anno/UAV123/group1_2.txt,1333,2515
+data_seq/UAV123/group1,anno/UAV123/group1_3.txt,2515,3925
+data_seq/UAV123/group1,anno/UAV123/group1_4.txt,3925,4873
+data_seq/UAV123/group2,anno/UAV123/group2_1.txt,1,907
+data_seq/UAV123/group2,anno/UAV123/group2_2.txt,907,1771
+data_seq/UAV123/group2,anno/UAV123/group2_3.txt,1771,2683
+data_seq/UAV123/group3,anno/UAV123/group3_1.txt,1,1567
+data_seq/UAV123/group3,anno/UAV123/group3_2.txt,1567,2827
+data_seq/UAV123/group3,anno/UAV123/group3_3.txt,2827,4369
+data_seq/UAV123/group3,anno/UAV123/group3_4.txt,4369,5527
+data_seq/UAV123/person1,anno/UAV123/person1.txt,1,799
+data_seq/UAV123/person10,anno/UAV123/person10.txt,1,1021
+data_seq/UAV123/person11,anno/UAV123/person11.txt,1,721
+data_seq/UAV123/person12,anno/UAV123/person12_1.txt,1,601
+data_seq/UAV123/person12,anno/UAV123/person12_2.txt,601,1621
+data_seq/UAV123/person13,anno/UAV123/person13.txt,1,883
+data_seq/UAV123/person14,anno/UAV123/person14_1.txt,1,847
+data_seq/UAV123/person14,anno/UAV123/person14_2.txt,847,1813
+data_seq/UAV123/person14,anno/UAV123/person14_3.txt,1813,2923
+data_seq/UAV123/person15,anno/UAV123/person15.txt,1,1339
+data_seq/UAV123/person16,anno/UAV123/person16.txt,1,1147
+data_seq/UAV123/person17,anno/UAV123/person17_1.txt,1,1501
+data_seq/UAV123/person17,anno/UAV123/person17_2.txt,1501,2347
+data_seq/UAV123/person18,anno/UAV123/person18.txt,1,1393
+data_seq/UAV123/person19,anno/UAV123/person19_1.txt,1,1243
+data_seq/UAV123/person19,anno/UAV123/person19_2.txt,1243,2791
+data_seq/UAV123/person19,anno/UAV123/person19_3.txt,2791,4357
+data_seq/UAV123/person1_s,anno/UAV123/person1_s.txt,1,1600
+data_seq/UAV123/person20,anno/UAV123/person20.txt,1,1783
+data_seq/UAV123/person21,anno/UAV123/person21.txt,1,487
+data_seq/UAV123/person22,anno/UAV123/person22.txt,1,199
+data_seq/UAV123/person23,anno/UAV123/person23.txt,1,397
+data_seq/UAV123/person2,anno/UAV123/person2_1.txt,1,1189
+data_seq/UAV123/person2,anno/UAV123/person2_2.txt,1189,2623
+data_seq/UAV123/person2_s,anno/UAV123/person2_s.txt,1,250
+data_seq/UAV123/person3,anno/UAV123/person3.txt,1,643
+data_seq/UAV123/person3_s,anno/UAV123/person3_s.txt,1,505
+data_seq/UAV123/person4,anno/UAV123/person4_1.txt,1,1501
+data_seq/UAV123/person4,anno/UAV123/person4_2.txt,1501,2743
+data_seq/UAV123/person5,anno/UAV123/person5_1.txt,1,877
+data_seq/UAV123/person5,anno/UAV123/person5_2.txt,877,2101
+data_seq/UAV123/person6,anno/UAV123/person6.txt,1,901
+data_seq/UAV123/person7,anno/UAV123/person7_1.txt,1,1249
+data_seq/UAV123/person7,anno/UAV123/person7_2.txt,1249,2065
+data_seq/UAV123/person8,anno/UAV123/person8_1.txt,1,1075
+data_seq/UAV123/person8,anno/UAV123/person8_2.txt,1075,1525
+data_seq/UAV123/person9,anno/UAV123/person9.txt,1,661
+data_seq/UAV123/truck1,anno/UAV123/truck1.txt,1,463
+data_seq/UAV123/truck2,anno/UAV123/truck2.txt,1,385
+data_seq/UAV123/truck3,anno/UAV123/truck3.txt,1,535
+data_seq/UAV123/truck4,anno/UAV123/truck4_1.txt,1,577
+data_seq/UAV123/truck4,anno/UAV123/truck4_2.txt,577,1261
+data_seq/UAV123/uav1,anno/UAV123/uav1_1.txt,1,1555
+data_seq/UAV123/uav1,anno/UAV123/uav1_2.txt,1555,2377
+data_seq/UAV123/uav1,anno/UAV123/uav1_3.txt,2473,3469
+data_seq/UAV123/uav2,anno/UAV123/uav2.txt,1,133
+data_seq/UAV123/uav3,anno/UAV123/uav3.txt,1,265
+data_seq/UAV123/uav4,anno/UAV123/uav4.txt,1,157
+data_seq/UAV123/uav5,anno/UAV123/uav5.txt,1,139
+data_seq/UAV123/uav6,anno/UAV123/uav6.txt,1,109
+data_seq/UAV123/uav7,anno/UAV123/uav7.txt,1,373
+data_seq/UAV123/uav8,anno/UAV123/uav8.txt,1,301
+data_seq/UAV123/wakeboard1,anno/UAV123/wakeboard1.txt,1,421
+data_seq/UAV123/wakeboard10,anno/UAV123/wakeboard10.txt,1,469
+data_seq/UAV123/wakeboard2,anno/UAV123/wakeboard2.txt,1,733
+data_seq/UAV123/wakeboard3,anno/UAV123/wakeboard3.txt,1,823
+data_seq/UAV123/wakeboard4,anno/UAV123/wakeboard4.txt,1,697
+data_seq/UAV123/wakeboard5,anno/UAV123/wakeboard5.txt,1,1675
+data_seq/UAV123/wakeboard6,anno/UAV123/wakeboard6.txt,1,1165
+data_seq/UAV123/wakeboard7,anno/UAV123/wakeboard7.txt,1,199
+data_seq/UAV123/wakeboard8,anno/UAV123/wakeboard8.txt,1,1543
+data_seq/UAV123/wakeboard9,anno/UAV123/wakeboard9.txt,1,355
diff --git a/tools/convert_datasets/uav123/uav2coco.py b/tools/convert_datasets/uav123/uav2coco.py
index 6e489a53b..286040e14 100644
--- a/tools/convert_datasets/uav123/uav2coco.py
+++ b/tools/convert_datasets/uav123/uav2coco.py
@@ -34,7 +34,8 @@ def convert_uav123(uav123, ann_dir, save_dir):
     """
     # The format of each line in "uav_info123.txt" is
     # "anno_name,anno_path,video_path,start_frame,end_frame"
-    info_path = osp.join(os.path.dirname(__file__), 'uav123_info.txt')
+    info_path = osp.join(
+        os.path.dirname(__file__), 'uav123_info_deprecated.txt')
     uav_info = mmcv.list_from_file(info_path)[1:]
 
     records = dict(vid_id=1, img_id=1, ann_id=1, global_instance_id=1)