From 8b69aafd25893e953ac9da478cc3b7acbf9f2f5d Mon Sep 17 00:00:00 2001 From: noahcao Date: Tue, 3 May 2022 11:13:39 -0400 Subject: [PATCH 1/9] fix format --- configs/_base_/datasets/dancetrack.py | 74 +++ ...track_faster-rcnn_r50_fpn_4e_dancetrack.py | 130 +++++ docs/en/dataset.md | 17 +- docs/zh_cn/dataset.md | 17 +- mmtrack/datasets/__init__.py | 3 +- mmtrack/datasets/dancetrack_dataset.py | 486 ++++++++++++++++++ .../dancetrack/dancetrack2coco.py | 181 +++++++ 7 files changed, 905 insertions(+), 3 deletions(-) create mode 100644 configs/_base_/datasets/dancetrack.py create mode 100644 configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py create mode 100644 mmtrack/datasets/dancetrack_dataset.py create mode 100644 tools/convert_datasets/dancetrack/dancetrack2coco.py diff --git a/configs/_base_/datasets/dancetrack.py b/configs/_base_/datasets/dancetrack.py new file mode 100644 index 000000000..4cc961ac4 --- /dev/null +++ b/configs/_base_/datasets/dancetrack.py @@ -0,0 +1,74 @@ +# dataset settings +dataset_type = 'DanceTrackDataset' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadMultiImagesFromFile', to_float32=True), + dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True), + dict( + type='SeqResize', + img_scale=(1088, 1088), + share_params=True, + ratio_range=(0.8, 1.2), + keep_ratio=True, + bbox_clip_border=False), + dict(type='SeqPhotoMetricDistortion', share_params=True), + dict( + type='SeqRandomCrop', + share_params=False, + crop_size=(1088, 1088), + bbox_clip_border=False), + dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5), + dict(type='SeqNormalize', **img_norm_cfg), + dict(type='SeqPad', size_divisor=32), + dict(type='MatchInstances', skip_nomatch=True), + dict( + type='VideoCollect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices', + 'gt_instance_ids' + ]), + dict(type='SeqDefaultFormatBundle', ref_prefix='ref') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1088, 1088), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='VideoCollect', keys=['img']) + ]) +] +data_root = 'data/dancetrack/' +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + visibility_thr=-1, + ann_file=data_root + 'annotations/train_cocoformat.json', + img_prefix=data_root + 'train', + ref_img_sampler=dict( + num_ref_imgs=1, + frame_range=10, + filter_key_img=True, + method='uniform'), + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/val_cocoformat.json', + img_prefix=data_root + 'val', + ref_img_sampler=None, + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/val_cocoformat.json', + img_prefix=data_root + 'val', + ref_img_sampler=None, + pipeline=test_pipeline)) diff --git a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py new file mode 100644 index 000000000..ee9c51f32 --- /dev/null +++ b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py @@ -0,0 +1,130 @@ +_base_ = [ + '../../_base_/models/faster_rcnn_r50_fpn.py', + '../../_base_/datasets/dancetrack.py', '../../_base_/default_runtime.py' +] +model = dict( + 
type='QDTrack', + detector=dict( + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type='Pretrained', checkpoint='torchvision://resnet50')), + rpn_head=dict(bbox_coder=dict(clip_border=False)), + roi_head=dict( + bbox_head=dict( + loss_bbox=dict(type='L1Loss', loss_weight=1.0), + bbox_coder=dict(clip_border=False), + num_classes=1)), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501 + )), + track_head=dict( + type='QuasiDenseTrackHead', + roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + embed_head=dict( + type='QuasiDenseEmbedHead', + num_convs=4, + num_fcs=1, + embed_channels=256, + norm_cfg=dict(type='GN', num_groups=32), + loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25), + loss_track_aux=dict( + type='L2Loss', + neg_pos_ub=3, + pos_margin=0, + neg_margin=0.1, + hard_mining=True, + loss_weight=1.0)), + loss_bbox=dict(type='L1Loss', loss_weight=1.0), + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='CombinedSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=3, + add_gt_as_proposals=True, + pos_sampler=dict(type='InstanceBalancedPosSampler'), + neg_sampler=dict(type='RandomSampler')))), + tracker=dict( + type='QuasiDenseEmbedTracker', + init_score_thr=0.9, + obj_score_thr=0.5, + match_score_thr=0.5, + memo_tracklet_frames=30, + memo_backdrop_frames=1, + memo_momentum=0.8, + nms_conf_thr=0.5, + nms_backdrop_iou_thr=0.3, + nms_class_iou_thr=0.7, + with_cats=True, + match_metric='bisoftmax')) +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadMultiImagesFromFile', to_float32=True), + dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True), + dict( + type='SeqResize', + img_scale=(1088, 1088), + share_params=True, + ratio_range=(0.8, 1.2), + keep_ratio=True, + bbox_clip_border=False), + dict(type='SeqPhotoMetricDistortion', share_params=True), + dict( + type='SeqRandomCrop', + share_params=False, + crop_size=(1088, 1088), + bbox_clip_border=False), + dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5), + dict(type='SeqNormalize', **img_norm_cfg), + dict(type='SeqPad', size_divisor=32), + dict(type='MatchInstances', skip_nomatch=True), + dict( + type='VideoCollect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices', + 'gt_instance_ids' + ]), + dict(type='SeqDefaultFormatBundle', ref_prefix='ref') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1088, 1088), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='VideoCollect', keys=['img']) + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer && learning policy +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict(policy='step', step=[3]) +# runtime settings 
+total_epochs = 4 +evaluation = dict(metric=['bbox', 'track'], interval=1) diff --git a/docs/en/dataset.md b/docs/en/dataset.md index ad520e25f..bfe45a296 100644 --- a/docs/en/dataset.md +++ b/docs/en/dataset.md @@ -9,6 +9,7 @@ This page provides the instructions for dataset preparation on existing benchmar - [CrowdHuman](https://www.crowdhuman.org/) - [LVIS](https://www.lvisdataset.org/) - [TAO](https://taodataset.org/) + - [DanceTrack](https://dancetrack.github.io) - Single Object Tracking - [LaSOT](http://vision.cs.stonybrook.edu/~lasot/) - [UAV123](https://cemse.kaust.edu.sa/ivul/uav123/) @@ -31,7 +32,7 @@ Please download the datasets from the official websites. It is recommended to sy #### 1.2 Multiple Object Tracking -- For the training and testing of multi object tracking task, one of the MOT Challenge datasets (e.g. MOT17) and TAO are needed, CrowdHuman and LVIS can be served as comlementary dataset. +- For the training and testing of multi object tracking task, one of the MOT Challenge datasets (e.g. MOT17, TAO and DanceTrack) are needed, CrowdHuman and LVIS can be served as comlementary dataset. - The `annotations` under `tao` contains the official annotations from [here](https://github.com/TAO-Dataset/annotations). @@ -98,6 +99,11 @@ mmtracking | | ├── train | | ├── test │ │ +| ├── DanceTrack +| | ├── train +| | ├── val +| | ├── test +| | │ ├── crowdhuman │ │ ├── annotation_train.odgt │ │ ├── annotation_val.odgt @@ -230,6 +236,9 @@ python ./tools/convert_datasets/ilsvrc/imagenet2coco_vid.py -i ./data/ILSVRC -o python ./tools/convert_datasets/mot/mot2coco.py -i ./data/MOT17/ -o ./data/MOT17/annotations --split-train --convert-det python ./tools/convert_datasets/mot/mot2reid.py -i ./data/MOT17/ -o ./data/MOT17/reid --val-split 0.2 --vis-threshold 0.3 +# DanceTrack +python ./tools/convert_datasets/dancetrack/dancetrack2coco.py -i ./data/DanceTrack ./data/DanceTrack/annotations + # CrowdHuman python ./tools/convert_datasets/mot/crowdhuman2coco.py -i ./data/crowdhuman -o ./data/crowdhuman/annotations @@ -320,6 +329,12 @@ mmtracking │ │ │ ├── imgs │ │ │ ├── meta │ │ +│ ├── DanceTrack +│ │ ├── train +│ │ ├── val +│ │ ├── test +│ │ ├── annotations +│ │ │ ├── crowdhuman │ │ ├── annotation_train.odgt │ │ ├── annotation_val.odgt diff --git a/docs/zh_cn/dataset.md b/docs/zh_cn/dataset.md index be10ea859..7c24aa2c3 100644 --- a/docs/zh_cn/dataset.md +++ b/docs/zh_cn/dataset.md @@ -9,6 +9,7 @@ - [CrowdHuman](https://www.crowdhuman.org/) - [LVIS](https://www.lvisdataset.org/) - [TAO](https://taodataset.org/) + - [DanceTrack](https://dancetrack.github.io) - 单目标跟踪 - [LaSOT](http://vision.cs.stonybrook.edu/~lasot/) - [UAV123](https://cemse.kaust.edu.sa/ivul/uav123/) @@ -31,7 +32,7 @@ #### 1.2 多目标跟踪 -- 对于多目标跟踪任务的训练和测试,需要 MOT Challenge 中的任意一个数据集(比如 MOT17)和 TAO , CrowdHuman 和 LVIS 可以作为补充数据。 +- 对于多目标跟踪任务的训练和测试,需要 MOT Challenge 中的任意一个数据集(比如 MOT17, TAO和DanceTrack), CrowdHuman 和 LVIS 可以作为补充数据。 - `tao` 文件夹下包含官方标注的 `annotations` 可以从[这里](https://github.com/TAO-Dataset/annotations)获取。 @@ -98,6 +99,11 @@ mmtracking | | ├── train | | ├── test │ │ +| ├── DanceTrack +| | ├── train +| | ├── val +| | ├── test +| | │ ├── crowdhuman │ │ ├── annotation_train.odgt │ │ ├── annotation_val.odgt @@ -231,6 +237,9 @@ python ./tools/convert_datasets/ilsvrc/imagenet2coco_vid.py -i ./data/ILSVRC -o python ./tools/convert_datasets/mot/mot2coco.py -i ./data/MOT17/ -o ./data/MOT17/annotations --split-train --convert-det python ./tools/convert_datasets/mot/mot2reid.py -i ./data/MOT17/ -o ./data/MOT17/reid --val-split 0.2 
--vis-threshold 0.3 +# DanceTrack +python ./tools/convert_datasets/dancetrack/dancetrack2coco.py -i ./data/DanceTrack ./data/DanceTrack/annotations + # CrowdHuman python ./tools/convert_datasets/mot/crowdhuman2coco.py -i ./data/crowdhuman -o ./data/crowdhuman/annotations @@ -321,6 +330,12 @@ mmtracking │ │ │ ├── imgs │ │ │ ├── meta │ │ +│ ├── DanceTrack +│ │ ├── train +│ │ ├── val +│ │ ├── test +│ │ ├── annotations +│ │ │ ├── crowdhuman │ │ ├── annotation_train.odgt │ │ ├── annotation_val.odgt diff --git a/mmtrack/datasets/__init__.py b/mmtrack/datasets/__init__.py index 394c52150..caa53634a 100644 --- a/mmtrack/datasets/__init__.py +++ b/mmtrack/datasets/__init__.py @@ -4,6 +4,7 @@ from .base_sot_dataset import BaseSOTDataset from .builder import build_dataloader from .coco_video_dataset import CocoVideoDataset +from .dancetrack_dataset import DanceTrackDataset from .dataset_wrappers import RandomSampleConcatDataset from .got10k_dataset import GOT10kDataset from .imagenet_vid_dataset import ImagenetVIDDataset @@ -30,5 +31,5 @@ 'UAV123Dataset', 'TrackingNetDataset', 'OTB100Dataset', 'YouTubeVISDataset', 'GOT10kDataset', 'VOTDataset', 'BaseSOTDataset', 'SOTCocoDataset', 'SOTImageNetVIDDataset', 'RandomSampleConcatDataset', - 'TaoDataset' + 'TaoDataset', 'DanceTrackDataset' ] diff --git a/mmtrack/datasets/dancetrack_dataset.py b/mmtrack/datasets/dancetrack_dataset.py new file mode 100644 index 000000000..cf88993db --- /dev/null +++ b/mmtrack/datasets/dancetrack_dataset.py @@ -0,0 +1,486 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import tempfile + +import mmcv +import motmetrics as mm +import numpy as np +from mmcv.utils import print_log +from mmdet.core import eval_map +from mmdet.datasets import DATASETS + +from mmtrack.core import interpolate_tracks, results2outs +from .coco_video_dataset import CocoVideoDataset + +try: + import trackeval +except ImportError: + trackeval = None + + +@DATASETS.register_module() +class DanceTrackDataset(CocoVideoDataset): + """Dataset for DanceTrack: https://github.com/DanceTrack/DanceTrack. + + Args: + visibility_thr (float, optional): The minimum visibility + for the objects during training. Default to -1. + interpolate_tracks_cfg (dict, optional): If not None, Interpolate + tracks linearly to make tracks more complete. Defaults to None. + - min_num_frames (int, optional): The minimum length of a track + that will be interpolated. Defaults to 5. + - max_num_frames (int, optional): The maximum disconnected length + in a track. Defaults to 20. + detection_file (str, optional): The path of the public + detection file. Default to None. + """ + + CLASSES = ('pedestrian', ) + + def __init__(self, + visibility_thr=-1, + interpolate_tracks_cfg=None, + detection_file=None, + *args, + **kwargs): + super().__init__(*args, **kwargs) + self.visibility_thr = visibility_thr + self.interpolate_tracks_cfg = interpolate_tracks_cfg + self.detections = self.load_detections(detection_file) + + def load_detections(self, detection_file=None): + """Load public detections.""" + # support detections in three formats + # 1. MMDet: [img_1, img_2, ...] + # 2. MMTrack: dict(det_bboxes=[img_1, img_2, ...]) + # 3. Public: + # 1) dict(img1_name: [], img2_name: [], ...) 
+ # 2) dict(det_bboxes=dict(img1_name: [], img2_name: [], ...)) + # return as a dict or a list + if detection_file is not None: + detections = mmcv.load(detection_file) + if isinstance(detections, dict): + # results from mmtrack + if 'det_bboxes' in detections: + detections = detections['det_bboxes'] + else: + # results from mmdet + if not isinstance(detections, list): + raise TypeError('detections must be a dict or a list.') + return detections + else: + return None + + def prepare_results(self, img_info): + """Prepare results for image (e.g. the annotation information, ...).""" + results = super().prepare_results(img_info) + if self.detections is not None: + if isinstance(self.detections, dict): + indice = img_info['file_name'] + elif isinstance(self.detections, list): + indice = self.img_ids.index(img_info['id']) + results['detections'] = self.detections[indice] + return results + + def _parse_ann_info(self, img_info, ann_info): + """Parse bbox and mask annotation. + + Args: + ann_info (list[dict]): Annotation info of an image. + with_mask (bool): Whether to parse mask annotations. + + Returns: + dict: A dict containing the following keys: bboxes, bboxes_ignore, + labels, masks, seg_map. "masks" are raw annotations and not + decoded into binary masks. + """ + gt_bboxes = [] + gt_labels = [] + gt_bboxes_ignore = [] + gt_instance_ids = [] + + for i, ann in enumerate(ann_info): + if (not self.test_mode) and (ann['visibility'] < + self.visibility_thr): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + if ann['category_id'] not in self.cat_ids: + continue + bbox = [x1, y1, x1 + w, y1 + h] + if ann.get('ignore', False) or ann.get('iscrowd', False): + # note: normally no `iscrowd` for MOT17Dataset + gt_bboxes_ignore.append(bbox) + else: + gt_bboxes.append(bbox) + gt_labels.append(self.cat2label[ann['category_id']]) + gt_instance_ids.append(ann['instance_id']) + + if gt_bboxes: + gt_bboxes = np.array(gt_bboxes, dtype=np.float32) + gt_labels = np.array(gt_labels, dtype=np.int64) + gt_instance_ids = np.array(gt_instance_ids, dtype=np.int64) + else: + gt_bboxes = np.zeros((0, 4), dtype=np.float32) + gt_labels = np.array([], dtype=np.int64) + gt_instance_ids = np.array([], dtype=np.int64) + + if gt_bboxes_ignore: + gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) + else: + gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) + + ann = dict( + bboxes=gt_bboxes, + labels=gt_labels, + bboxes_ignore=gt_bboxes_ignore, + instance_ids=gt_instance_ids) + + return ann + + def format_results(self, results, resfile_path=None, metrics=['track']): + """Format the results to txts (standard format for MOT Challenge, which + is followed by DanceTrack dataset). + + Args: + results (dict(list[ndarray])): Testing results of the dataset. + resfile_path (str, optional): Path to save the formatted results. + Defaults to None. + metrics (list[str], optional): The results of the specific metrics + will be formatted.. Defaults to ['track']. + + Returns: + tuple: (resfile_path, resfiles, names, tmp_dir), resfile_path is + the path to save the formatted results, resfiles is a dict + containing the filepaths, names is a list containing the name of + the videos, tmp_dir is the temporal directory created for saving + files. + """ + assert isinstance(results, dict), 'results must be a dict.' 
+        if resfile_path is None:
+            tmp_dir = tempfile.TemporaryDirectory()
+            resfile_path = tmp_dir.name
+        else:
+            tmp_dir = None
+            if osp.exists(resfile_path):
+                print_log('remove previous results.', self.logger)
+                import shutil
+                shutil.rmtree(resfile_path)
+
+        resfiles = dict()
+        for metric in metrics:
+            resfiles[metric] = osp.join(resfile_path, metric)
+            os.makedirs(resfiles[metric], exist_ok=True)
+
+        inds = [i for i, _ in enumerate(self.data_infos) if _['frame_id'] == 0]
+        num_vids = len(inds)
+        assert num_vids == len(self.vid_ids)
+        inds.append(len(self.data_infos))
+        vid_infos = self.coco.load_vids(self.vid_ids)
+        names = [_['name'] for _ in vid_infos]
+
+        for i in range(num_vids):
+            for metric in metrics:
+                formatter = getattr(self, f'format_{metric}_results')
+                formatter(results[f'{metric}_bboxes'][inds[i]:inds[i + 1]],
+                          self.data_infos[inds[i]:inds[i + 1]],
+                          f'{resfiles[metric]}/{names[i]}.txt')
+
+        return resfile_path, resfiles, names, tmp_dir
+
+    def format_track_results(self, results, infos, resfile):
+        """Format tracking results."""
+
+        results_per_video = []
+        for frame_id, result in enumerate(results):
+            outs_track = results2outs(bbox_results=result)
+            track_ids, bboxes = outs_track['ids'], outs_track['bboxes']
+            frame_ids = np.full_like(track_ids, frame_id)
+            results_per_frame = np.concatenate(
+                (frame_ids[:, None], track_ids[:, None], bboxes), axis=1)
+            results_per_video.append(results_per_frame)
+        # `results_per_video` is an ndarray with shape (N, 7). Each row denotes
+        # (frame_id, track_id, x1, y1, x2, y2, score)
+        results_per_video = np.concatenate(results_per_video)
+
+        if self.interpolate_tracks_cfg is not None:
+            results_per_video = interpolate_tracks(
+                results_per_video, **self.interpolate_tracks_cfg)
+
+        with open(resfile, 'wt') as f:
+            for frame_id, info in enumerate(infos):
+                # `mot_frame_id` is the actual frame id used for evaluation.
+                # It may not start from 0.
+                if 'mot_frame_id' in info:
+                    mot_frame_id = info['mot_frame_id']
+                else:
+                    mot_frame_id = info['frame_id'] + 1
+
+                results_per_frame = \
+                    results_per_video[results_per_video[:, 0] == frame_id]
+                for i in range(len(results_per_frame)):
+                    _, track_id, x1, y1, x2, y2, conf = results_per_frame[i]
+                    f.writelines(
+                        f'{mot_frame_id},{track_id},{x1:.3f},{y1:.3f},' +
+                        f'{(x2-x1):.3f},{(y2-y1):.3f},{conf:.3f},-1,-1,-1\n')
+
+    def format_bbox_results(self, results, infos, resfile):
+        """Format detection results."""
+        with open(resfile, 'wt') as f:
+            for res, info in zip(results, infos):
+                if 'mot_frame_id' in info:
+                    frame = info['mot_frame_id']
+                else:
+                    frame = info['frame_id'] + 1
+
+                outs_det = results2outs(bbox_results=res)
+                for bbox, label in zip(outs_det['bboxes'], outs_det['labels']):
+                    x1, y1, x2, y2, conf = bbox
+                    f.writelines(
+                        f'{frame},-1,{x1:.3f},{y1:.3f},{(x2-x1):.3f},' +
+                        f'{(y2-y1):.3f},{conf:.3f}\n')
+        f.close()
+
+    def get_benchmark(self):
+        """Get benchmark from upper/lower-case image prefix."""
+        BENCHMARKS = ['MOT15', 'MOT16', 'MOT17', 'MOT20']
+        for benchmark in BENCHMARKS:
+            if benchmark in self.img_prefix.upper():
+                return benchmark
+
+    def get_dataset_cfg_for_hota(self, gt_folder, tracker_folder, seqmap):
+        """Get default configs for trackeval.datasets.MotChallenge2DBox.
+
+        Args:
+            gt_folder (str): the name of the GT folder
+            tracker_folder (str): the name of the tracker folder
+            seqmap (str): the file that contains the sequence of video names
+
+        Returns:
+            Dataset Configs for MotChallenge2DBox.
+ """ + dataset_config = dict( + # Location of GT data + GT_FOLDER=gt_folder, + # Trackers location + TRACKERS_FOLDER=tracker_folder, + # Where to save eval results + # (if None, same as TRACKERS_FOLDER) + OUTPUT_FOLDER=None, + # Use 'track' as the default tracker + TRACKERS_TO_EVAL=['track'], + # Option values: ['pedestrian'] + CLASSES_TO_EVAL=list(self.CLASSES), + # TrackEval does not support Dancetrack as an option, + # we use the wrapper for MOT17 dataset + BENCHMARK='DanceTrack', + # Option Values: 'train', 'val', 'test' + SPLIT_TO_EVAL='val', + # Whether tracker input files are zipped + INPUT_AS_ZIP=False, + # Whether to print current config + PRINT_CONFIG=True, + # Whether to perform preprocessing + # (never done for MOT15) + DO_PREPROC=False if 'MOT15' in self.img_prefix else True, + # Tracker files are in + # TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER + TRACKER_SUB_FOLDER='', + # Output files are saved in + # OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER + OUTPUT_SUB_FOLDER='', + # Names of trackers to display + # (if None: TRACKERS_TO_EVAL) + TRACKER_DISPLAY_NAMES=None, + # Where seqmaps are found + # (if None: GT_FOLDER/seqmaps) + SEQMAP_FOLDER=None, + # Directly specify seqmap file + # (if none use seqmap_folder/benchmark-split_to_eval) + SEQMAP_FILE=seqmap, + # If not None, specify sequences to eval + # and their number of timesteps + SEQ_INFO=None, + # '{gt_folder}/{seq}/gt/gt.txt' + GT_LOC_FORMAT='{gt_folder}/{seq}/gt/gt.txt', + # If False, data is in GT_FOLDER/BENCHMARK-SPLIT_TO_EVAL/ and in + # TRACKERS_FOLDER/BENCHMARK-SPLIT_TO_EVAL/tracker/ + # If True, the middle 'benchmark-split' folder is skipped for both. + SKIP_SPLIT_FOL=True, + ) + + return dataset_config + + def evaluate(self, + results, + metric='track', + logger=None, + resfile_path=None, + bbox_iou_thr=0.5, + track_iou_thr=0.5): + """Evaluation in MOT Challenge. + + Args: + results (list[list | tuple]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. Options are + 'bbox', 'track'. Defaults to 'track'. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + resfile_path (str, optional): Path to save the formatted results. + Defaults to None. + bbox_iou_thr (float, optional): IoU threshold for detection + evaluation. Defaults to 0.5. + track_iou_thr (float, optional): IoU threshold for tracking + evaluation.. Defaults to 0.5. + + Returns: + dict[str, float]: MOTChallenge style evaluation metric. 
+ """ + eval_results = dict() + if isinstance(metric, list): + metrics = metric + elif isinstance(metric, str): + metrics = [metric] + else: + raise TypeError('metric must be a list or a str.') + allowed_metrics = ['bbox', 'track'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported.') + + if 'track' in metrics: + resfile_path, resfiles, names, tmp_dir = self.format_results( + results, resfile_path, metrics) + print_log('Evaluate CLEAR MOT results.', logger=logger) + distth = 1 - track_iou_thr + accs = [] + # support loading data from ceph + local_dir = tempfile.TemporaryDirectory() + + for name in names: + gt_file = osp.join(self.img_prefix, f'{name}/gt/gt.txt') + res_file = osp.join(resfiles['track'], f'{name}.txt') + # copy gt file from ceph to local temporary directory + gt_dir_path = osp.join(local_dir.name, name, 'gt') + os.makedirs(gt_dir_path) + copied_gt_file = osp.join( + local_dir.name, + gt_file.replace(gt_file.split(name)[0], '')) + + f = open(copied_gt_file, 'wb') + gt_content = self.file_client.get(gt_file) + if hasattr(gt_content, 'tobytes'): + gt_content = gt_content.tobytes() + f.write(gt_content) + f.close() + # copy sequence file from ceph to local temporary directory + copied_seqinfo_path = osp.join(local_dir.name, name, + 'seqinfo.ini') + f = open(copied_seqinfo_path, 'wb') + seq_content = self.file_client.get( + osp.join(self.img_prefix, name, 'seqinfo.ini')) + if hasattr(seq_content, 'tobytes'): + seq_content = seq_content.tobytes() + f.write(seq_content) + f.close() + + gt = mm.io.loadtxt(copied_gt_file) + res = mm.io.loadtxt(res_file) + if osp.exists(copied_seqinfo_path): + acc, ana = mm.utils.CLEAR_MOT_M( + gt, res, copied_seqinfo_path, distth=distth) + else: + acc = mm.utils.compare_to_groundtruth( + gt, res, distth=distth) + accs.append(acc) + + mh = mm.metrics.create() + summary = mh.compute_many( + accs, + names=names, + metrics=mm.metrics.motchallenge_metrics, + generate_overall=True) + + if trackeval is None: + raise ImportError( + 'Please run' + 'pip install git+https://github.com/JonathonLuiten/TrackEval.git' # noqa + 'to manually install trackeval') + + seqmap = osp.join(resfile_path, 'videoseq.txt') + with open(seqmap, 'w') as f: + f.write('name\n') + for name in names: + f.write(name + '\n') + f.close() + + eval_config = trackeval.Evaluator.get_default_eval_config() + + # tracker's name is set to 'track', + # so this word needs to be splited out + output_folder = resfiles['track'].rsplit(os.sep, 1)[0] + dataset_config = self.get_dataset_cfg_for_hota( + local_dir.name, output_folder, seqmap) + + evaluator = trackeval.Evaluator(eval_config) + dataset = [trackeval.datasets.MotChallenge2DBox(dataset_config)] + hota_metrics = [ + trackeval.metrics.HOTA(dict(METRICS=['HOTA'], THRESHOLD=0.5)) + ] + output_res, _ = evaluator.evaluate(dataset, hota_metrics) + + # modify HOTA results sequence according to summary list, + # indexes of summary are sequence names and 'OVERALL' + # while for hota they are sequence names and 'COMBINED_SEQ' + seq_list = list(summary.index) + seq_list.append('COMBINED_SEQ') + + hota = [ + np.average(output_res['MotChallenge2DBox']['track'][seq] + ['pedestrian']['HOTA']['HOTA']) for seq in seq_list + if 'OVERALL' not in seq + ] + + eval_results.update({ + mm.io.motchallenge_metric_names[k]: v['OVERALL'] + for k, v in summary.to_dict().items() + }) + eval_results['HOTA'] = hota[-1] + + summary['HOTA'] = hota + str_summary = mm.io.render_summary( + summary, + 
formatters=mh.formatters,
+                namemap=mm.io.motchallenge_metric_names)
+            print(str_summary)
+            local_dir.cleanup()
+            if tmp_dir is not None:
+                tmp_dir.cleanup()
+
+        if 'bbox' in metrics:
+            if isinstance(results, dict):
+                bbox_results = results['det_bboxes']
+            elif isinstance(results, list):
+                bbox_results = results
+            else:
+                raise TypeError('results must be a dict or a list.')
+            annotations = [self.get_ann_info(info) for info in self.data_infos]
+            mean_ap, _ = eval_map(
+                bbox_results,
+                annotations,
+                iou_thr=bbox_iou_thr,
+                dataset=self.CLASSES,
+                logger=logger)
+            eval_results['mAP'] = mean_ap
+
+        for k, v in eval_results.items():
+            if isinstance(v, float):
+                eval_results[k] = float(f'{(v):.3f}')
+
+        return eval_results
diff --git a/tools/convert_datasets/dancetrack/dancetrack2coco.py b/tools/convert_datasets/dancetrack/dancetrack2coco.py
new file mode 100644
index 000000000..3bf666019
--- /dev/null
+++ b/tools/convert_datasets/dancetrack/dancetrack2coco.py
@@ -0,0 +1,181 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# This script converts DanceTrack labels into COCO style.
+# Official repo of the DanceTrack dataset:
+# https://github.com/DanceTrack/DanceTrack
+#
+# Label format of the DanceTrack dataset:
+#   GTs:
+#       <frame_id>  # starts from 1 but COCO style starts from 0,
+#       <instance_id>, <x1>, <y1>, <w>, <h>,
+#       <conf>  # conf is annotated as 0 if the object is ignored,
+#       <class_id>, <visibility>
+#
+#   DETs and Results:
+#       <frame_id>, <instance_id>, <x1>, <y1>, <w>, <h>, <conf>,
+#       <x>, <y>, <z>  # for 3D objects
+#
+# Classes in DanceTrack:
+#   1: 'pedestrian'
+#
+# USELESS classes are not included in the json file.
+# IGNORES classes are skipped as well (DanceTrack only annotates class 1).
+#
+# This file is adapted from the label conversion script for MOT.
+# Because DanceTrack does not provide public detections and comes with an
+# official train/val/test split, the necessary adaptations are made here.
+
+import argparse
+import os
+import os.path as osp
+from collections import defaultdict
+
+import mmcv
+from tqdm import tqdm
+
+USELESS = [3, 4, 5, 6, 9, 10, 11]
+IGNORES = [2, 7, 8, 12, 13]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Convert DanceTrack labels and detections to \
+        COCO-VID format.')
+    parser.add_argument('-i', '--input', help='path of DanceTrack data')
+    parser.add_argument(
+        '-o', '--output', help='path to save coco formatted label file')
+    return parser.parse_args()
+
+
+def parse_gts(gts, is_mot15):
+    outputs = defaultdict(list)
+    for gt in gts:
+        gt = gt.strip().split(',')
+        frame_id, ins_id = map(int, gt[:2])
+        bbox = list(map(float, gt[2:6]))
+        if is_mot15:
+            conf = 1.
+            class_id = 1
+            visibility = 1.
+ else: + conf = float(gt[6]) + class_id = int(gt[7]) + visibility = float(gt[8]) + if class_id in USELESS: + continue + elif class_id in IGNORES: + continue + anns = dict( + category_id=1, + bbox=bbox, + area=bbox[2] * bbox[3], + iscrowd=False, + visibility=visibility, + mot_instance_id=ins_id, + mot_conf=conf, + mot_class_id=class_id) + outputs[frame_id].append(anns) + return outputs + + +def parse_dets(dets): + outputs = defaultdict(list) + for det in dets: + det = det.strip().split(',') + frame_id, ins_id = map(int, det[:2]) + assert ins_id == -1 + bbox = list(map(float, det[2:7])) + # [x1, y1, x2, y2] to be consistent with mmdet + bbox = [ + bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3], bbox[4] + ] + outputs[frame_id].append(bbox) + + return outputs + + +def main(): + args = parse_args() + if not osp.isdir(args.output): + os.makedirs(args.output) + + sets = ['train', 'val', 'test'] + vid_id, img_id, ann_id = 1, 1, 1 + + for subset in sets: + ins_id = 0 + print(f'Converting {subset} set to COCO format') + in_folder = osp.join(args.input, subset) + out_file = osp.join(args.output, f'{subset}_cocoformat.json') + outputs = defaultdict(list) + outputs['categories'] = [dict(id=1, name='pedestrian')] + + video_names = os.listdir(in_folder) + video_names = [d for d in video_names if d != '.DS_Store'] + for video_name in tqdm(video_names): + # basic params + parse_gt = 'test' not in subset + ins_maps = dict() + # load video infos + video_folder = osp.join(in_folder, video_name) + infos = mmcv.list_from_file(f'{video_folder}/seqinfo.ini') + # video-level infos + assert video_name == infos[1].strip().split('=')[1] + img_folder = infos[2].strip().split('=')[1] + img_names = os.listdir(f'{video_folder}/{img_folder}') + img_names = [d for d in img_names if d != '.DS_Store'] + img_names = sorted(img_names) + fps = int(infos[3].strip().split('=')[1]) + num_imgs = int(infos[4].strip().split('=')[1]) + + assert num_imgs == len(img_names) + width = int(infos[5].strip().split('=')[1]) + height = int(infos[6].strip().split('=')[1]) + video = dict( + id=vid_id, + name=video_name, + fps=fps, + width=width, + height=height) + # parse annotations + if parse_gt: + gts = mmcv.list_from_file(f'{video_folder}/gt/gt.txt') + img2gts = parse_gts(gts, False) + + # image and box level infos + for frame_id, name in enumerate(img_names): + img_name = osp.join(video_name, img_folder, name) + mot_frame_id = int(name.split('.')[0]) + image = dict( + id=img_id, + video_id=vid_id, + file_name=img_name, + height=height, + width=width, + frame_id=frame_id, + mot_frame_id=mot_frame_id) + if parse_gt: + gts = img2gts[mot_frame_id] + for gt in gts: + gt.update(id=ann_id, image_id=img_id) + mot_ins_id = gt['mot_instance_id'] + if mot_ins_id in ins_maps: + gt['instance_id'] = ins_maps[mot_ins_id] + else: + gt['instance_id'] = ins_id + ins_maps[mot_ins_id] = ins_id + ins_id += 1 + outputs['annotations'].append(gt) + ann_id += 1 + + outputs['images'].append(image) + img_id += 1 + outputs['videos'].append(video) + vid_id += 1 + outputs['num_instances'] = ins_id + print(f'{subset} has {ins_id} instances.') + mmcv.dump(outputs, out_file) + print(f'Done! 
Saved as {out_file}')
+
+
+if __name__ == '__main__':
+    main()
Saved as {out_file}') + + +if __name__ == '__main__': + main() From 5058e88fb6ed288fa1595d73dce653b4327c2c0a Mon Sep 17 00:00:00 2001 From: noahcao Date: Wed, 4 May 2022 09:00:11 -0400 Subject: [PATCH 3/9] delete unused function; inherit dancetrack dataset class from MOTChallenge --- mmtrack/datasets/dancetrack_dataset.py | 4 ++-- .../dancetrack/dancetrack2coco.py | 16 ---------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/mmtrack/datasets/dancetrack_dataset.py b/mmtrack/datasets/dancetrack_dataset.py index cf88993db..7f7930d7e 100644 --- a/mmtrack/datasets/dancetrack_dataset.py +++ b/mmtrack/datasets/dancetrack_dataset.py @@ -11,7 +11,7 @@ from mmdet.datasets import DATASETS from mmtrack.core import interpolate_tracks, results2outs -from .coco_video_dataset import CocoVideoDataset +from .mot_challenge_dataset import MOTChallengeDataset try: import trackeval @@ -20,7 +20,7 @@ @DATASETS.register_module() -class DanceTrackDataset(CocoVideoDataset): +class DanceTrackDataset(MOTChallengeDataset): """Dataset for DanceTrack: https://github.com/DanceTrack/DanceTrack. Args: diff --git a/tools/convert_datasets/dancetrack/dancetrack2coco.py b/tools/convert_datasets/dancetrack/dancetrack2coco.py index 3bf666019..3b37dd5a1 100644 --- a/tools/convert_datasets/dancetrack/dancetrack2coco.py +++ b/tools/convert_datasets/dancetrack/dancetrack2coco.py @@ -77,22 +77,6 @@ def parse_gts(gts, is_mot15): return outputs -def parse_dets(dets): - outputs = defaultdict(list) - for det in dets: - det = det.strip().split(',') - frame_id, ins_id = map(int, det[:2]) - assert ins_id == -1 - bbox = list(map(float, det[2:7])) - # [x1, y1, x2, y2] to be consistent with mmdet - bbox = [ - bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3], bbox[4] - ] - outputs[frame_id].append(bbox) - - return outputs - - def main(): args = parse_args() if not osp.isdir(args.output): From a29e15b2391f8661801baf8902b41fdbd68c4403 Mon Sep 17 00:00:00 2001 From: noahcao Date: Thu, 5 May 2022 02:28:57 -0400 Subject: [PATCH 4/9] remove duplicated content in DancetrackDataset --- mmtrack/datasets/dancetrack_dataset.py | 232 +------------------------ 1 file changed, 1 insertion(+), 231 deletions(-) diff --git a/mmtrack/datasets/dancetrack_dataset.py b/mmtrack/datasets/dancetrack_dataset.py index 7f7930d7e..3e4145f9d 100644 --- a/mmtrack/datasets/dancetrack_dataset.py +++ b/mmtrack/datasets/dancetrack_dataset.py @@ -3,14 +3,12 @@ import os.path as osp import tempfile -import mmcv import motmetrics as mm import numpy as np from mmcv.utils import print_log from mmdet.core import eval_map from mmdet.datasets import DATASETS -from mmtrack.core import interpolate_tracks, results2outs from .mot_challenge_dataset import MOTChallengeDataset try: @@ -23,237 +21,9 @@ class DanceTrackDataset(MOTChallengeDataset): """Dataset for DanceTrack: https://github.com/DanceTrack/DanceTrack. - Args: - visibility_thr (float, optional): The minimum visibility - for the objects during training. Default to -1. - interpolate_tracks_cfg (dict, optional): If not None, Interpolate - tracks linearly to make tracks more complete. Defaults to None. - - min_num_frames (int, optional): The minimum length of a track - that will be interpolated. Defaults to 5. - - max_num_frames (int, optional): The maximum disconnected length - in a track. Defaults to 20. - detection_file (str, optional): The path of the public - detection file. Default to None. + Most content is inherited from MOTChallengeDataset. 
""" - CLASSES = ('pedestrian', ) - - def __init__(self, - visibility_thr=-1, - interpolate_tracks_cfg=None, - detection_file=None, - *args, - **kwargs): - super().__init__(*args, **kwargs) - self.visibility_thr = visibility_thr - self.interpolate_tracks_cfg = interpolate_tracks_cfg - self.detections = self.load_detections(detection_file) - - def load_detections(self, detection_file=None): - """Load public detections.""" - # support detections in three formats - # 1. MMDet: [img_1, img_2, ...] - # 2. MMTrack: dict(det_bboxes=[img_1, img_2, ...]) - # 3. Public: - # 1) dict(img1_name: [], img2_name: [], ...) - # 2) dict(det_bboxes=dict(img1_name: [], img2_name: [], ...)) - # return as a dict or a list - if detection_file is not None: - detections = mmcv.load(detection_file) - if isinstance(detections, dict): - # results from mmtrack - if 'det_bboxes' in detections: - detections = detections['det_bboxes'] - else: - # results from mmdet - if not isinstance(detections, list): - raise TypeError('detections must be a dict or a list.') - return detections - else: - return None - - def prepare_results(self, img_info): - """Prepare results for image (e.g. the annotation information, ...).""" - results = super().prepare_results(img_info) - if self.detections is not None: - if isinstance(self.detections, dict): - indice = img_info['file_name'] - elif isinstance(self.detections, list): - indice = self.img_ids.index(img_info['id']) - results['detections'] = self.detections[indice] - return results - - def _parse_ann_info(self, img_info, ann_info): - """Parse bbox and mask annotation. - - Args: - ann_info (list[dict]): Annotation info of an image. - with_mask (bool): Whether to parse mask annotations. - - Returns: - dict: A dict containing the following keys: bboxes, bboxes_ignore, - labels, masks, seg_map. "masks" are raw annotations and not - decoded into binary masks. - """ - gt_bboxes = [] - gt_labels = [] - gt_bboxes_ignore = [] - gt_instance_ids = [] - - for i, ann in enumerate(ann_info): - if (not self.test_mode) and (ann['visibility'] < - self.visibility_thr): - continue - x1, y1, w, h = ann['bbox'] - inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) - inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) - if inter_w * inter_h == 0: - continue - if ann['area'] <= 0 or w < 1 or h < 1: - continue - if ann['category_id'] not in self.cat_ids: - continue - bbox = [x1, y1, x1 + w, y1 + h] - if ann.get('ignore', False) or ann.get('iscrowd', False): - # note: normally no `iscrowd` for MOT17Dataset - gt_bboxes_ignore.append(bbox) - else: - gt_bboxes.append(bbox) - gt_labels.append(self.cat2label[ann['category_id']]) - gt_instance_ids.append(ann['instance_id']) - - if gt_bboxes: - gt_bboxes = np.array(gt_bboxes, dtype=np.float32) - gt_labels = np.array(gt_labels, dtype=np.int64) - gt_instance_ids = np.array(gt_instance_ids, dtype=np.int64) - else: - gt_bboxes = np.zeros((0, 4), dtype=np.float32) - gt_labels = np.array([], dtype=np.int64) - gt_instance_ids = np.array([], dtype=np.int64) - - if gt_bboxes_ignore: - gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) - else: - gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) - - ann = dict( - bboxes=gt_bboxes, - labels=gt_labels, - bboxes_ignore=gt_bboxes_ignore, - instance_ids=gt_instance_ids) - - return ann - - def format_results(self, results, resfile_path=None, metrics=['track']): - """Format the results to txts (standard format for MOT Challenge, which - is followed by DanceTrack dataset). 
- - Args: - results (dict(list[ndarray])): Testing results of the dataset. - resfile_path (str, optional): Path to save the formatted results. - Defaults to None. - metrics (list[str], optional): The results of the specific metrics - will be formatted.. Defaults to ['track']. - - Returns: - tuple: (resfile_path, resfiles, names, tmp_dir), resfile_path is - the path to save the formatted results, resfiles is a dict - containing the filepaths, names is a list containing the name of - the videos, tmp_dir is the temporal directory created for saving - files. - """ - assert isinstance(results, dict), 'results must be a dict.' - if resfile_path is None: - tmp_dir = tempfile.TemporaryDirectory() - resfile_path = tmp_dir.name - else: - tmp_dir = None - if osp.exists(resfile_path): - print_log('remove previous results.', self.logger) - import shutil - shutil.rmtree(resfile_path) - - resfiles = dict() - for metric in metrics: - resfiles[metric] = osp.join(resfile_path, metric) - os.makedirs(resfiles[metric], exist_ok=True) - - inds = [i for i, _ in enumerate(self.data_infos) if _['frame_id'] == 0] - num_vids = len(inds) - assert num_vids == len(self.vid_ids) - inds.append(len(self.data_infos)) - vid_infos = self.coco.load_vids(self.vid_ids) - names = [_['name'] for _ in vid_infos] - - for i in range(num_vids): - for metric in metrics: - formatter = getattr(self, f'format_{metric}_results') - formatter(results[f'{metric}_bboxes'][inds[i]:inds[i + 1]], - self.data_infos[inds[i]:inds[i + 1]], - f'{resfiles[metric]}/{names[i]}.txt') - - return resfile_path, resfiles, names, tmp_dir - - def format_track_results(self, results, infos, resfile): - """Format tracking results.""" - - results_per_video = [] - for frame_id, result in enumerate(results): - outs_track = results2outs(bbox_results=result) - track_ids, bboxes = outs_track['ids'], outs_track['bboxes'] - frame_ids = np.full_like(track_ids, frame_id) - results_per_frame = np.concatenate( - (frame_ids[:, None], track_ids[:, None], bboxes), axis=1) - results_per_video.append(results_per_frame) - # `results_per_video` is a ndarray with shape (N, 7). Each row denotes - # (frame_id, track_id, x1, y1, x2, y2, score) - results_per_video = np.concatenate(results_per_video) - - if self.interpolate_tracks_cfg is not None: - results_per_video = interpolate_tracks( - results_per_video, **self.interpolate_tracks_cfg) - - with open(resfile, 'wt') as f: - for frame_id, info in enumerate(infos): - # `mot_frame_id` is the actually frame id used for evaluation. - # It may not start from 0. 
- if 'mot_frame_id' in info: - mot_frame_id = info['mot_frame_id'] - else: - mot_frame_id = info['frame_id'] + 1 - - results_per_frame = \ - results_per_video[results_per_video[:, 0] == frame_id] - for i in range(len(results_per_frame)): - _, track_id, x1, y1, x2, y2, conf = results_per_frame[i] - f.writelines( - f'{mot_frame_id},{track_id},{x1:.3f},{y1:.3f},' + - f'{(x2-x1):.3f},{(y2-y1):.3f},{conf:.3f},-1,-1,-1\n') - - def format_bbox_results(self, results, infos, resfile): - """Format detection results.""" - with open(resfile, 'wt') as f: - for res, info in zip(results, infos): - if 'mot_frame_id' in info: - frame = info['mot_frame_id'] - else: - frame = info['frame_id'] + 1 - - outs_det = results2outs(bbox_results=res) - for bbox, label in zip(outs_det['bboxes'], outs_det['labels']): - x1, y1, x2, y2, conf = bbox - f.writelines( - f'{frame},-1,{x1:.3f},{y1:.3f},{(x2-x1):.3f},' + - f'{(y2-y1):.3f},{conf:.3f}\n') - f.close() - - def get_benchmark(self): - """Get benchmark from upeper/lower-case image prefix.""" - BENCHMARKS = ['MOT15', 'MOT16', 'MOT17', 'MOT20'] - for benchamrk in BENCHMARKS: - if benchamrk in self.img_prefix.upper(): - return benchamrk - def get_dataset_cfg_for_hota(self, gt_folder, tracker_folder, seqmap): """Get default configs for trackeval.datasets.MotChallenge2DBox. From 5d21063d75f2a1a758b3d09d5f3cc7b3a67cdc47 Mon Sep 17 00:00:00 2001 From: noahcao Date: Sat, 7 May 2022 12:39:35 -0400 Subject: [PATCH 5/9] remove legacy code --- .../qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py | 5 +---- mmtrack/datasets/dancetrack_dataset.py | 12 +++++++++--- .../dancetrack/dancetrack2coco.py | 15 +++++---------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py index ee9c51f32..3dbb36c42 100644 --- a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py +++ b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py @@ -1,7 +1,4 @@ -_base_ = [ - '../../_base_/models/faster_rcnn_r50_fpn.py', - '../../_base_/datasets/dancetrack.py', '../../_base_/default_runtime.py' -] +_base_ = ['./qdtrack_faster-rcnn_r50_fpn_4e_crowdhuman_mot17-private-half.py'] model = dict( type='QDTrack', detector=dict( diff --git a/mmtrack/datasets/dancetrack_dataset.py b/mmtrack/datasets/dancetrack_dataset.py index 3e4145f9d..d07a57e32 100644 --- a/mmtrack/datasets/dancetrack_dataset.py +++ b/mmtrack/datasets/dancetrack_dataset.py @@ -24,6 +24,13 @@ class DanceTrackDataset(MOTChallengeDataset): Most content is inherited from MOTChallengeDataset. """ + def get_benchmark(self): + """Get benchmark from upeper/lower-case image prefix.""" + BENCHMARKS = ['DanceTrack'] + for benchamrk in BENCHMARKS: + if benchamrk in self.img_prefix.upper(): + return benchamrk + def get_dataset_cfg_for_hota(self, gt_folder, tracker_folder, seqmap): """Get default configs for trackeval.datasets.MotChallenge2DBox. 
@@ -49,7 +56,7 @@ def get_dataset_cfg_for_hota(self, gt_folder, tracker_folder, seqmap): CLASSES_TO_EVAL=list(self.CLASSES), # TrackEval does not support Dancetrack as an option, # we use the wrapper for MOT17 dataset - BENCHMARK='DanceTrack', + BENCHMARK=self.get_benchmark(), # Option Values: 'train', 'val', 'test' SPLIT_TO_EVAL='val', # Whether tracker input files are zipped @@ -57,8 +64,7 @@ def get_dataset_cfg_for_hota(self, gt_folder, tracker_folder, seqmap): # Whether to print current config PRINT_CONFIG=True, # Whether to perform preprocessing - # (never done for MOT15) - DO_PREPROC=False if 'MOT15' in self.img_prefix else True, + DO_PREPROC=False, # Tracker files are in # TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER TRACKER_SUB_FOLDER='', diff --git a/tools/convert_datasets/dancetrack/dancetrack2coco.py b/tools/convert_datasets/dancetrack/dancetrack2coco.py index 3b37dd5a1..bc6151c3d 100644 --- a/tools/convert_datasets/dancetrack/dancetrack2coco.py +++ b/tools/convert_datasets/dancetrack/dancetrack2coco.py @@ -46,20 +46,15 @@ def parse_args(): return parser.parse_args() -def parse_gts(gts, is_mot15): +def parse_gts(gts): outputs = defaultdict(list) for gt in gts: gt = gt.strip().split(',') frame_id, ins_id = map(int, gt[:2]) bbox = list(map(float, gt[2:6])) - if is_mot15: - conf = 1. - class_id = 1 - visibility = 1. - else: - conf = float(gt[6]) - class_id = int(gt[7]) - visibility = float(gt[8]) + conf = float(gt[6]) + class_id = int(gt[7]) + visibility = float(gt[8]) if class_id in USELESS: continue elif class_id in IGNORES: @@ -123,7 +118,7 @@ def main(): # parse annotations if parse_gt: gts = mmcv.list_from_file(f'{video_folder}/gt/gt.txt') - img2gts = parse_gts(gts, False) + img2gts = parse_gts(gts) # image and box level infos for frame_id, name in enumerate(img_names): From ae1c3c5e06dd24b49a783ed7bf9dc61c6cd69bfc Mon Sep 17 00:00:00 2001 From: Tao Gong Date: Tue, 10 May 2022 11:42:11 +0800 Subject: [PATCH 6/9] refactor DanceTrackDataset class --- mmtrack/datasets/dancetrack_dataset.py | 255 +--------------------- mmtrack/datasets/mot_challenge_dataset.py | 27 ++- 2 files changed, 31 insertions(+), 251 deletions(-) diff --git a/mmtrack/datasets/dancetrack_dataset.py b/mmtrack/datasets/dancetrack_dataset.py index d07a57e32..5699a283d 100644 --- a/mmtrack/datasets/dancetrack_dataset.py +++ b/mmtrack/datasets/dancetrack_dataset.py @@ -1,21 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. -import os -import os.path as osp -import tempfile - -import motmetrics as mm -import numpy as np -from mmcv.utils import print_log -from mmdet.core import eval_map from mmdet.datasets import DATASETS from .mot_challenge_dataset import MOTChallengeDataset -try: - import trackeval -except ImportError: - trackeval = None - @DATASETS.register_module() class DanceTrackDataset(MOTChallengeDataset): @@ -24,239 +11,19 @@ class DanceTrackDataset(MOTChallengeDataset): Most content is inherited from MOTChallengeDataset. """ - def get_benchmark(self): - """Get benchmark from upeper/lower-case image prefix.""" - BENCHMARKS = ['DanceTrack'] - for benchamrk in BENCHMARKS: - if benchamrk in self.img_prefix.upper(): - return benchamrk + def get_benchmark_and_eval_split(self): + """Get benchmark and dataset split to evaluate. - def get_dataset_cfg_for_hota(self, gt_folder, tracker_folder, seqmap): - """Get default configs for trackeval.datasets.MotChallenge2DBox. 
- - Args: - gt_folder (str): the name of the GT folder - tracker_folder (str): the name of the tracker folder - seqmap (str): the file that contains the sequence of video names + Get benchmark from upeper/lower-case image prefix and the dataset + split to evaluate. Returns: - Dataset Configs for MotChallenge2DBox. + tuple(string): The first string denotes the type of dataset. + The second string denots the split of the dataset to eval. """ - dataset_config = dict( - # Location of GT data - GT_FOLDER=gt_folder, - # Trackers location - TRACKERS_FOLDER=tracker_folder, - # Where to save eval results - # (if None, same as TRACKERS_FOLDER) - OUTPUT_FOLDER=None, - # Use 'track' as the default tracker - TRACKERS_TO_EVAL=['track'], - # Option values: ['pedestrian'] - CLASSES_TO_EVAL=list(self.CLASSES), - # TrackEval does not support Dancetrack as an option, - # we use the wrapper for MOT17 dataset - BENCHMARK=self.get_benchmark(), - # Option Values: 'train', 'val', 'test' - SPLIT_TO_EVAL='val', - # Whether tracker input files are zipped - INPUT_AS_ZIP=False, - # Whether to print current config - PRINT_CONFIG=True, - # Whether to perform preprocessing - DO_PREPROC=False, - # Tracker files are in - # TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER - TRACKER_SUB_FOLDER='', - # Output files are saved in - # OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER - OUTPUT_SUB_FOLDER='', - # Names of trackers to display - # (if None: TRACKERS_TO_EVAL) - TRACKER_DISPLAY_NAMES=None, - # Where seqmaps are found - # (if None: GT_FOLDER/seqmaps) - SEQMAP_FOLDER=None, - # Directly specify seqmap file - # (if none use seqmap_folder/benchmark-split_to_eval) - SEQMAP_FILE=seqmap, - # If not None, specify sequences to eval - # and their number of timesteps - SEQ_INFO=None, - # '{gt_folder}/{seq}/gt/gt.txt' - GT_LOC_FORMAT='{gt_folder}/{seq}/gt/gt.txt', - # If False, data is in GT_FOLDER/BENCHMARK-SPLIT_TO_EVAL/ and in - # TRACKERS_FOLDER/BENCHMARK-SPLIT_TO_EVAL/tracker/ - # If True, the middle 'benchmark-split' folder is skipped for both. - SKIP_SPLIT_FOL=True, - ) - - return dataset_config - - def evaluate(self, - results, - metric='track', - logger=None, - resfile_path=None, - bbox_iou_thr=0.5, - track_iou_thr=0.5): - """Evaluation in MOT Challenge. - - Args: - results (list[list | tuple]): Testing results of the dataset. - metric (str | list[str]): Metrics to be evaluated. Options are - 'bbox', 'track'. Defaults to 'track'. - logger (logging.Logger | str | None): Logger used for printing - related information during evaluation. Default: None. - resfile_path (str, optional): Path to save the formatted results. - Defaults to None. - bbox_iou_thr (float, optional): IoU threshold for detection - evaluation. Defaults to 0.5. - track_iou_thr (float, optional): IoU threshold for tracking - evaluation.. Defaults to 0.5. - - Returns: - dict[str, float]: MOTChallenge style evaluation metric. 
- """ - eval_results = dict() - if isinstance(metric, list): - metrics = metric - elif isinstance(metric, str): - metrics = [metric] - else: - raise TypeError('metric must be a list or a str.') - allowed_metrics = ['bbox', 'track'] - for metric in metrics: - if metric not in allowed_metrics: - raise KeyError(f'metric {metric} is not supported.') - - if 'track' in metrics: - resfile_path, resfiles, names, tmp_dir = self.format_results( - results, resfile_path, metrics) - print_log('Evaluate CLEAR MOT results.', logger=logger) - distth = 1 - track_iou_thr - accs = [] - # support loading data from ceph - local_dir = tempfile.TemporaryDirectory() - - for name in names: - gt_file = osp.join(self.img_prefix, f'{name}/gt/gt.txt') - res_file = osp.join(resfiles['track'], f'{name}.txt') - # copy gt file from ceph to local temporary directory - gt_dir_path = osp.join(local_dir.name, name, 'gt') - os.makedirs(gt_dir_path) - copied_gt_file = osp.join( - local_dir.name, - gt_file.replace(gt_file.split(name)[0], '')) - - f = open(copied_gt_file, 'wb') - gt_content = self.file_client.get(gt_file) - if hasattr(gt_content, 'tobytes'): - gt_content = gt_content.tobytes() - f.write(gt_content) - f.close() - # copy sequence file from ceph to local temporary directory - copied_seqinfo_path = osp.join(local_dir.name, name, - 'seqinfo.ini') - f = open(copied_seqinfo_path, 'wb') - seq_content = self.file_client.get( - osp.join(self.img_prefix, name, 'seqinfo.ini')) - if hasattr(seq_content, 'tobytes'): - seq_content = seq_content.tobytes() - f.write(seq_content) - f.close() - - gt = mm.io.loadtxt(copied_gt_file) - res = mm.io.loadtxt(res_file) - if osp.exists(copied_seqinfo_path): - acc, ana = mm.utils.CLEAR_MOT_M( - gt, res, copied_seqinfo_path, distth=distth) - else: - acc = mm.utils.compare_to_groundtruth( - gt, res, distth=distth) - accs.append(acc) - - mh = mm.metrics.create() - summary = mh.compute_many( - accs, - names=names, - metrics=mm.metrics.motchallenge_metrics, - generate_overall=True) - - if trackeval is None: - raise ImportError( - 'Please run' - 'pip install git+https://github.com/JonathonLuiten/TrackEval.git' # noqa - 'to manually install trackeval') - - seqmap = osp.join(resfile_path, 'videoseq.txt') - with open(seqmap, 'w') as f: - f.write('name\n') - for name in names: - f.write(name + '\n') - f.close() - - eval_config = trackeval.Evaluator.get_default_eval_config() - - # tracker's name is set to 'track', - # so this word needs to be splited out - output_folder = resfiles['track'].rsplit(os.sep, 1)[0] - dataset_config = self.get_dataset_cfg_for_hota( - local_dir.name, output_folder, seqmap) - - evaluator = trackeval.Evaluator(eval_config) - dataset = [trackeval.datasets.MotChallenge2DBox(dataset_config)] - hota_metrics = [ - trackeval.metrics.HOTA(dict(METRICS=['HOTA'], THRESHOLD=0.5)) - ] - output_res, _ = evaluator.evaluate(dataset, hota_metrics) - - # modify HOTA results sequence according to summary list, - # indexes of summary are sequence names and 'OVERALL' - # while for hota they are sequence names and 'COMBINED_SEQ' - seq_list = list(summary.index) - seq_list.append('COMBINED_SEQ') - - hota = [ - np.average(output_res['MotChallenge2DBox']['track'][seq] - ['pedestrian']['HOTA']['HOTA']) for seq in seq_list - if 'OVERALL' not in seq - ] - - eval_results.update({ - mm.io.motchallenge_metric_names[k]: v['OVERALL'] - for k, v in summary.to_dict().items() - }) - eval_results['HOTA'] = hota[-1] - - summary['HOTA'] = hota - str_summary = mm.io.render_summary( - summary, - 
formatters=mh.formatters, - namemap=mm.io.motchallenge_metric_names) - print(str_summary) - local_dir.cleanup() - if tmp_dir is not None: - tmp_dir.cleanup() - - if 'bbox' in metrics: - if isinstance(results, dict): - bbox_results = results['det_bboxes'] - elif isinstance(results, list): - bbox_results = results - else: - raise TypeError('results must be a dict or a list.') - annotations = [self.get_ann_info(info) for info in self.data_infos] - mean_ap, _ = eval_map( - bbox_results, - annotations, - iou_thr=bbox_iou_thr, - dataset=self.CLASSES, - logger=logger) - eval_results['mAP'] = mean_ap - - for k, v in eval_results.items(): - if isinstance(v, float): - eval_results[k] = float(f'{(v):.3f}') + SPLIT_TO_EVAL = ['train', 'val'] + for split_to_eval in SPLIT_TO_EVAL: + if split_to_eval in self.img_prefix.upper(): + break - return eval_results + return 'DanceTrack', split_to_eval diff --git a/mmtrack/datasets/mot_challenge_dataset.py b/mmtrack/datasets/mot_challenge_dataset.py index bd0c8906b..f446333e4 100644 --- a/mmtrack/datasets/mot_challenge_dataset.py +++ b/mmtrack/datasets/mot_challenge_dataset.py @@ -246,12 +246,23 @@ def format_bbox_results(self, results, infos, resfile): f'{(y2-y1):.3f},{conf:.3f}\n') f.close() - def get_benchmark(self): - """Get benchmark from upeper/lower-case image prefix.""" + def get_benchmark_and_eval_split(self): + """Get benchmark and dataset split to evaluate. + + Get benchmark from upeper/lower-case image prefix and the dataset + split to evaluate. + + Returns: + tuple(string): The first string denotes the type of dataset. + The second string denotes the split of the dataset to eval. + """ BENCHMARKS = ['MOT15', 'MOT16', 'MOT17', 'MOT20'] - for benchamrk in BENCHMARKS: - if benchamrk in self.img_prefix.upper(): - return benchamrk + for benchmark in BENCHMARKS: + if benchmark in self.img_prefix.upper(): + break + # We directly return 'train' for the dataset split to evaluate, since + # MOT challenge only provides annotations for train split. + return benchmark, 'train' def get_dataset_cfg_for_hota(self, gt_folder, tracker_folder, seqmap): """Get default configs for trackeval.datasets.MotChallenge2DBox. @@ -264,6 +275,8 @@ def get_dataset_cfg_for_hota(self, gt_folder, tracker_folder, seqmap): Returns: Dataset Configs for MotChallenge2DBox. 
""" + benchmark, split_to_eval = self.get_benchmark_and_eval_split() + dataset_config = dict( # Location of GT data GT_FOLDER=gt_folder, @@ -277,9 +290,9 @@ def get_dataset_cfg_for_hota(self, gt_folder, tracker_folder, seqmap): # Option values: ['pedestrian'] CLASSES_TO_EVAL=list(self.CLASSES), # Option Values: 'MOT17', 'MOT16', 'MOT20', 'MOT15' - BENCHMARK=self.get_benchmark(), + BENCHMARK=benchmark, # Option Values: 'train', 'test' - SPLIT_TO_EVAL='train', + SPLIT_TO_EVAL=split_to_eval, # Whether tracker input files are zipped INPUT_AS_ZIP=False, # Whether to print current config From f4caf15b23c3a26a35f887973409688c46538182 Mon Sep 17 00:00:00 2001 From: noahcao Date: Tue, 10 May 2022 18:04:40 -0400 Subject: [PATCH 7/9] fix error in dancetrack qdtrack config file --- .../qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py | 5 ++++- mmtrack/datasets/dancetrack_dataset.py | 10 ++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py index 3dbb36c42..39624a406 100644 --- a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py +++ b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py @@ -1,4 +1,7 @@ -_base_ = ['./qdtrack_faster-rcnn_r50_fpn_4e_crowdhuman_mot17-private-half.py'] +_base_ = [ + '../../_base_/datasets/dancetrack.py' + '../../_base_/datasets/dancetrack.py', '../../_base_/default_runtime.py' +] model = dict( type='QDTrack', detector=dict( diff --git a/mmtrack/datasets/dancetrack_dataset.py b/mmtrack/datasets/dancetrack_dataset.py index 5699a283d..92fddadff 100644 --- a/mmtrack/datasets/dancetrack_dataset.py +++ b/mmtrack/datasets/dancetrack_dataset.py @@ -20,10 +20,8 @@ def get_benchmark_and_eval_split(self): Returns: tuple(string): The first string denotes the type of dataset. The second string denots the split of the dataset to eval. - """ - SPLIT_TO_EVAL = ['train', 'val'] - for split_to_eval in SPLIT_TO_EVAL: - if split_to_eval in self.img_prefix.upper(): - break - return 'DanceTrack', split_to_eval + As DanceTrack only has train/val and use 'val' for evaluation as + default, we can directly output the desired split. + """ + return 'DanceTrack', 'val' From d17cc1768fe0bac52946344051770aec5421830f Mon Sep 17 00:00:00 2001 From: noahcao Date: Wed, 11 May 2022 01:40:27 -0400 Subject: [PATCH 8/9] refactor qdtrack config files. 
--- .../qdtrack_faster-rcnn_r50_fpn_4e_base.py | 81 +++++++++++++++++++ ...track_faster-rcnn_r50_fpn_4e_dancetrack.py | 81 +------------------ ...ster-rcnn_r50_fpn_4e_mot17-private-half.py | 81 +------------------ 3 files changed, 85 insertions(+), 158 deletions(-) create mode 100644 configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py diff --git a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py new file mode 100644 index 000000000..a28bfb605 --- /dev/null +++ b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py @@ -0,0 +1,81 @@ +_base_ = [ + '../../_base_/models/faster_rcnn_r50_fpn.py', + '../../_base_/default_runtime.py' +] +model = dict( + type='QDTrack', + detector=dict( + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type='Pretrained', checkpoint='torchvision://resnet50')), + rpn_head=dict(bbox_coder=dict(clip_border=False)), + roi_head=dict( + bbox_head=dict( + loss_bbox=dict(type='L1Loss', loss_weight=1.0), + bbox_coder=dict(clip_border=False), + num_classes=1)), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501 + )), + track_head=dict( + type='QuasiDenseTrackHead', + roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + embed_head=dict( + type='QuasiDenseEmbedHead', + num_convs=4, + num_fcs=1, + embed_channels=256, + norm_cfg=dict(type='GN', num_groups=32), + loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25), + loss_track_aux=dict( + type='L2Loss', + neg_pos_ub=3, + pos_margin=0, + neg_margin=0.1, + hard_mining=True, + loss_weight=1.0)), + loss_bbox=dict(type='L1Loss', loss_weight=1.0), + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='CombinedSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=3, + add_gt_as_proposals=True, + pos_sampler=dict(type='InstanceBalancedPosSampler'), + neg_sampler=dict(type='RandomSampler')))), + tracker=dict( + type='QuasiDenseEmbedTracker', + init_score_thr=0.9, + obj_score_thr=0.5, + match_score_thr=0.5, + memo_tracklet_frames=30, + memo_backdrop_frames=1, + memo_momentum=0.8, + nms_conf_thr=0.5, + nms_backdrop_iou_thr=0.3, + nms_class_iou_thr=0.7, + with_cats=True, + match_metric='bisoftmax')) +# optimizer && learning policy +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict(policy='step', step=[3]) +# runtime settings +total_epochs = 4 +evaluation = dict(metric=['bbox', 'track'], interval=1) diff --git a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py index 39624a406..2d6a21bb2 100644 --- a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py +++ b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py @@ -1,77 +1,7 @@ _base_ = [ - '../../_base_/datasets/dancetrack.py' - '../../_base_/datasets/dancetrack.py', '../../_base_/default_runtime.py' + './qdtrack_faster-rcnn_r50_fpn_4e_base.py', + '../../_base_/datasets/dancetrack.py', ] -model = dict( - type='QDTrack', - detector=dict( - backbone=dict( - 
norm_cfg=dict(requires_grad=False), - style='caffe', - init_cfg=dict( - type='Pretrained', checkpoint='torchvision://resnet50')), - rpn_head=dict(bbox_coder=dict(clip_border=False)), - roi_head=dict( - bbox_head=dict( - loss_bbox=dict(type='L1Loss', loss_weight=1.0), - bbox_coder=dict(clip_border=False), - num_classes=1)), - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501 - )), - track_head=dict( - type='QuasiDenseTrackHead', - roi_extractor=dict( - type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), - out_channels=256, - featmap_strides=[4, 8, 16, 32]), - embed_head=dict( - type='QuasiDenseEmbedHead', - num_convs=4, - num_fcs=1, - embed_channels=256, - norm_cfg=dict(type='GN', num_groups=32), - loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25), - loss_track_aux=dict( - type='L2Loss', - neg_pos_ub=3, - pos_margin=0, - neg_margin=0.1, - hard_mining=True, - loss_weight=1.0)), - loss_bbox=dict(type='L1Loss', loss_weight=1.0), - train_cfg=dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.7, - neg_iou_thr=0.5, - min_pos_iou=0.5, - match_low_quality=False, - ignore_iof_thr=-1), - sampler=dict( - type='CombinedSampler', - num=256, - pos_fraction=0.5, - neg_pos_ub=3, - add_gt_as_proposals=True, - pos_sampler=dict(type='InstanceBalancedPosSampler'), - neg_sampler=dict(type='RandomSampler')))), - tracker=dict( - type='QuasiDenseEmbedTracker', - init_score_thr=0.9, - obj_score_thr=0.5, - match_score_thr=0.5, - memo_tracklet_frames=30, - memo_backdrop_frames=1, - memo_momentum=0.8, - nms_conf_thr=0.5, - nms_backdrop_iou_thr=0.3, - nms_class_iou_thr=0.7, - with_cats=True, - match_metric='bisoftmax')) img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) train_pipeline = [ @@ -121,10 +51,3 @@ train=dict(pipeline=train_pipeline), val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline)) -# optimizer && learning policy -optimizer_config = dict( - _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) -lr_config = dict(policy='step', step=[3]) -# runtime settings -total_epochs = 4 -evaluation = dict(metric=['bbox', 'track'], interval=1) diff --git a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_mot17-private-half.py b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_mot17-private-half.py index cea2fbc9d..d38ab84c4 100644 --- a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_mot17-private-half.py +++ b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_mot17-private-half.py @@ -1,77 +1,7 @@ _base_ = [ - '../../_base_/models/faster_rcnn_r50_fpn.py', - '../../_base_/datasets/mot_challenge.py', '../../_base_/default_runtime.py' + './qdtrack_faster-rcnn_r50_fpn_4e_base.py', + '../../_base_/datasets/mot_challenge.py', ] -model = dict( - type='QDTrack', - detector=dict( - backbone=dict( - norm_cfg=dict(requires_grad=False), - style='caffe', - init_cfg=dict( - type='Pretrained', checkpoint='torchvision://resnet50')), - rpn_head=dict(bbox_coder=dict(clip_border=False)), - roi_head=dict( - bbox_head=dict( - loss_bbox=dict(type='L1Loss', loss_weight=1.0), - bbox_coder=dict(clip_border=False), - num_classes=1)), - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 
'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501 - )), - track_head=dict( - type='QuasiDenseTrackHead', - roi_extractor=dict( - type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), - out_channels=256, - featmap_strides=[4, 8, 16, 32]), - embed_head=dict( - type='QuasiDenseEmbedHead', - num_convs=4, - num_fcs=1, - embed_channels=256, - norm_cfg=dict(type='GN', num_groups=32), - loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25), - loss_track_aux=dict( - type='L2Loss', - neg_pos_ub=3, - pos_margin=0, - neg_margin=0.1, - hard_mining=True, - loss_weight=1.0)), - loss_bbox=dict(type='L1Loss', loss_weight=1.0), - train_cfg=dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.7, - neg_iou_thr=0.5, - min_pos_iou=0.5, - match_low_quality=False, - ignore_iof_thr=-1), - sampler=dict( - type='CombinedSampler', - num=256, - pos_fraction=0.5, - neg_pos_ub=3, - add_gt_as_proposals=True, - pos_sampler=dict(type='InstanceBalancedPosSampler'), - neg_sampler=dict(type='RandomSampler')))), - tracker=dict( - type='QuasiDenseEmbedTracker', - init_score_thr=0.9, - obj_score_thr=0.5, - match_score_thr=0.5, - memo_tracklet_frames=30, - memo_backdrop_frames=1, - memo_momentum=0.8, - nms_conf_thr=0.5, - nms_backdrop_iou_thr=0.3, - nms_class_iou_thr=0.7, - with_cats=True, - match_metric='bisoftmax')) img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) train_pipeline = [ @@ -121,10 +51,3 @@ train=dict(pipeline=train_pipeline), val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline)) -# optimizer && learning policy -optimizer_config = dict( - _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) -lr_config = dict(policy='step', step=[3]) -# runtime settings -total_epochs = 4 -evaluation = dict(metric=['bbox', 'track'], interval=1) From 04e9b38671d4eb276bfdaa832cef8dc9132e4f7d Mon Sep 17 00:00:00 2001 From: Tao Gong Date: Thu, 12 May 2022 11:22:10 +0800 Subject: [PATCH 9/9] Update dancetrack_dataset.py --- mmtrack/datasets/dancetrack_dataset.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mmtrack/datasets/dancetrack_dataset.py b/mmtrack/datasets/dancetrack_dataset.py index 92fddadff..f281fab4d 100644 --- a/mmtrack/datasets/dancetrack_dataset.py +++ b/mmtrack/datasets/dancetrack_dataset.py @@ -20,8 +20,7 @@ def get_benchmark_and_eval_split(self): Returns: tuple(string): The first string denotes the type of dataset. The second string denots the split of the dataset to eval. - - As DanceTrack only has train/val and use 'val' for evaluation as - default, we can directly output the desired split. """ + # As DanceTrack only has train/val and use 'val' for evaluation as + # default, we can directly output the desired split. return 'DanceTrack', 'val'
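
With the series applied, DanceTrack annotations are converted to the COCO-VID
layout expected by configs/_base_/datasets/dancetrack.py using the new
dancetrack2coco.py script. The snippet below is an illustrative sanity check
of the converted file, not part of the patches; the input/output paths and the
conversion invocation in the comments are assumptions that follow the script's
-i/-o arguments and the dataset config, and it only requires mmcv:

    # assumes the annotations were generated beforehand, e.g. with
    #   python tools/convert_datasets/dancetrack/dancetrack2coco.py
    #       -i data/dancetrack -o data/dancetrack/annotations
    import mmcv

    # inspect the COCO-VID style json written by dancetrack2coco.py
    anns = mmcv.load('data/dancetrack/annotations/val_cocoformat.json')
    print(len(anns['videos']), 'videos,', len(anns['images']), 'images,',
          len(anns['annotations']), 'boxes,', anns['num_instances'],
          'instances')
    # each image record carries a 0-based frame_id plus the original 1-based
    # mot_frame_id that is used when writing MOT-style result files
    first = anns['images'][0]
    print(first['frame_id'], first['mot_frame_id'], first['file_name'])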