diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..203db2f5
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,7 @@
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.JPEG filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/workflows/citest.yaml b/.github/workflows/citest.yaml
index 9e455e71..1af7684f 100644
--- a/.github/workflows/citest.yaml
+++ b/.github/workflows/citest.yaml
@@ -43,6 +43,10 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v2
+ with:
+ lfs: 'true'
+ - name: Checkout LFS objects
+ run: git lfs checkout
- name: Run unittest
shell: bash
run: |
diff --git a/assets/result.jpg b/assets/result.jpg
index d262a484..088bb660 100644
Binary files a/assets/result.jpg and b/assets/result.jpg differ
diff --git a/configs/face/face_96x96_wingloss.py b/configs/face/face_96x96_wingloss.py
new file mode 100644
index 00000000..f3725e9c
--- /dev/null
+++ b/configs/face/face_96x96_wingloss.py
@@ -0,0 +1,238 @@
+# model settings
+POINT_NUMBER = 106
+MEAN_FACE = [
+ 0.05486667535113006, 0.24441904048908245, 0.05469932714062696,
+ 0.30396829196709935, 0.05520653400164321, 0.3643191463607746,
+ 0.05865501342257397, 0.42453849020500306, 0.0661603899137523,
+ 0.48531377442945767, 0.07807677169271177, 0.5452126843738523,
+ 0.09333319368757653, 0.6047840615432064, 0.11331425394034209,
+ 0.6631144309665994, 0.13897813867699352, 0.7172296230155276,
+ 0.17125811033538194, 0.767968859462583, 0.20831698519371536,
+ 0.8146603379935117, 0.24944621000897876, 0.857321261721953,
+ 0.2932993820558674, 0.8973900596678597, 0.33843820185594653,
+ 0.9350576242126986, 0.38647802623495553, 0.966902971122812,
+ 0.4411974776504609, 0.9878629960611088, 0.5000390697219397,
+ 0.9934886214875595, 0.5588590024515473, 0.9878510782414189,
+ 0.6135829360035883, 0.9668655595323074, 0.6616294188166414,
+ 0.9350065330378543, 0.7067734980023662, 0.8973410411573094,
+ 0.7506167730772516, 0.8572957679511382, 0.7917579157122047,
+ 0.8146281598803492, 0.8288026446367324, 0.7679019642224981,
+ 0.8610918526053805, 0.7171624168757985, 0.8867491048162915,
+ 0.6630344261248556, 0.9067293813428708, 0.6047095492618413,
+ 0.9219649147678989, 0.5451295187190602, 0.9338619041815587,
+ 0.4852292097262674, 0.9413455695142587, 0.424454780475834,
+ 0.9447753107545577, 0.3642347111991026, 0.9452649776939869,
+ 0.30388458223793025, 0.9450854849661369, 0.24432737691068557,
+ 0.1594802473020129, 0.17495177946520288, 0.2082918411850002,
+ 0.12758378330875153, 0.27675902873293057, 0.11712230823088154,
+ 0.34660582049732336, 0.12782553369032904, 0.4137234315527489,
+ 0.14788458441422778, 0.4123890243720449, 0.18814226684806626,
+ 0.3498927810760776, 0.17640650480816664, 0.28590212091591866,
+ 0.16895271174960227, 0.22193967489846017, 0.16985862149585013,
+ 0.5861805004572298, 0.147863456192582, 0.6532904167464643,
+ 0.12780412047734288, 0.723142364263288, 0.11709102395419578,
+ 0.7916076475508984, 0.12753867695205595, 0.8404440227263494,
+ 0.17488715120168932, 0.7779848023963316, 0.1698261195288917,
+ 0.7140264757991571, 0.1689377237959271, 0.650024882334848,
+ 0.17640581823811927, 0.5875270068157493, 0.18815421057605972,
+ 0.4999687027691624, 0.2770570778583906, 0.49996466107378934,
+ 0.35408433007759227, 0.49996725190415664, 0.43227025345368053,
+ 0.49997367716346774, 0.5099309118810921, 0.443147025685285,
+ 0.2837021691260901, 0.4079306716593004, 0.4729519900478952,
+ 0.3786223176615041, 0.5388017782630576, 0.4166237366074797,
+ 0.5822229552544941, 0.4556754522760756, 0.5887956328134262,
+ 0.49998730493119997, 0.5951855531982454, 0.5443300921009105,
+ 0.5887796732983633, 0.5833722476054509, 0.582200985012979,
+ 0.6213509190608012, 0.5387760772258134, 0.5920137550293199,
+ 0.4729325070035326, 0.5567854054587345, 0.28368589871138317,
+ 0.23395988420439123, 0.275313734012504, 0.27156519109550253,
+ 0.2558735678926061, 0.31487949633428597, 0.2523033259214858,
+ 0.356919009399118, 0.2627342680634766, 0.3866625969903256,
+ 0.2913618036573405, 0.3482919069920915, 0.3009936818974329,
+ 0.3064437008415846, 0.3037349617842158, 0.26724000706363993,
+ 0.2961896087804692, 0.3135744691699477, 0.27611103614975246,
+ 0.6132904312551143, 0.29135144033587107, 0.6430396927648264,
+ 0.2627079452269443, 0.6850713556136455, 0.2522730391144915,
+ 0.728377707003201, 0.25583118190779625, 0.7660035591791254,
+ 0.27526375689471777, 0.7327054300488236, 0.2961495286346863,
+ 0.6935171517115648, 0.3036951925380769, 0.6516533228539426,
+ 0.3009921014909089, 0.6863983789278025, 0.2760904908649394,
+ 0.35811903020866753, 0.7233174007629063, 0.4051199834269763,
+ 0.6931800846807724, 0.4629631471997891, 0.6718031951363689,
+ 0.5000016063148277, 0.6799150331999366, 0.5370506360177653,
+ 0.6717809139952097, 0.5948714927411151, 0.6931581144392573,
+ 0.6418878095835022, 0.7232890570786875, 0.6088129582142587,
+ 0.7713407215524752, 0.5601450388292929, 0.8052499757498277,
+ 0.5000181358125715, 0.8160749831906926, 0.4398905591799545,
+ 0.8052697696938342, 0.39120318265892984, 0.771375905028864,
+ 0.36888771299734613, 0.7241751210643214, 0.4331097084010058,
+ 0.7194543690519717, 0.5000188612450743, 0.7216823277180712,
+ 0.566895861884284, 0.7194302225129479, 0.631122598507516,
+ 0.7241462073974219, 0.5678462302796355, 0.7386355816766528,
+ 0.5000082906571756, 0.7479600838019628, 0.43217532542902076,
+ 0.7386538729390463, 0.31371761254774383, 0.2753328284323114,
+ 0.6862487843823917, 0.2752940437017121
+]
+IMAGE_SIZE = 96
+
+loss_config = dict(
+ num_points=POINT_NUMBER,
+ left_eye_left_corner_index=66,
+ right_eye_right_corner_index=79,
+ points_weight=1.0,
+ contour_weight=1.5,
+ eyebrow_weight=1.5,
+ eye_weight=1.7,
+ nose_weight=1.3,
+ lip_weight=1.7,
+ omega=10,
+ epsilon=2)
+
+model = dict(
+ type='FaceKeypoint',
+ backbone=dict(
+ type='FaceKeypointBackbone',
+ in_channels=3,
+ out_channels=48,
+ residual_activation='relu',
+ inverted_activation='half_v2',
+ inverted_expand_ratio=2,
+ ),
+ keypoint_head=dict(
+ type='FaceKeypointHead',
+ in_channels=48,
+ out_channels=POINT_NUMBER * 2,
+ input_size=IMAGE_SIZE,
+ inverted_expand_ratio=2,
+ inverted_activation='half_v2',
+ mean_face=MEAN_FACE,
+ loss_keypoint=dict(type='WingLossWithPose', **loss_config),
+ ),
+ pose_head=dict(
+ type='FacePoseHead',
+ in_channels=48,
+ out_channels=3,
+ inverted_expand_ratio=2,
+ inverted_activation='half_v2',
+ loss_pose=dict(type='FacePoseLoss', pose_weight=0.01),
+ ),
+)
+
+train_pipeline = [
+ dict(type='FaceKeypointRandomAugmentation', input_size=IMAGE_SIZE),
+ dict(type='FaceKeypointNorm', input_size=IMAGE_SIZE),
+ dict(type='MMToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.4076, 0.458, 0.485],
+ std=[1.0, 1.0, 1.0]),
+ dict(
+ type='Collect',
+ keys=[
+ 'img', 'target_point', 'target_point_mask', 'target_pose',
+ 'target_pose_mask'
+ ])
+]
+
+val_pipeline = [
+ dict(type='FaceKeypointNorm', input_size=IMAGE_SIZE),
+ dict(type='MMToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.4076, 0.458, 0.485],
+ std=[1.0, 1.0, 1.0]),
+ dict(
+ type='Collect',
+ keys=[
+ 'img', 'target_point', 'target_point_mask', 'target_pose',
+ 'target_pose_mask'
+ ])
+]
+test_pipeline = val_pipeline
+
+data_root = 'path/to/face_landmark_data/'
+
+data_cfg = dict(
+ data_root=data_root,
+ input_size=IMAGE_SIZE,
+)
+
+data = dict(
+ imgs_per_gpu=512,
+ workers_per_gpu=2,
+ train=dict(
+ type='FaceKeypointDataset',
+ data_source=dict(
+ type='FaceKeypintSource',
+ train=True,
+            data_range=[0, 30000],  # use [0, 478857] for the full training split
+ data_cfg=data_cfg,
+ ),
+ pipeline=train_pipeline),
+ val=dict(
+ type='FaceKeypointDataset',
+ data_source=dict(
+ type='FaceKeypintSource',
+ train=False,
+ data_range=[478857, 488857],
+            # data_range=[478857, 478999],  # smaller subset for quicker validation
+ data_cfg=data_cfg,
+ ),
+ pipeline=val_pipeline),
+ test=dict(
+ type='FaceKeypointDataset',
+ data_source=dict(
+ type='FaceKeypintSource',
+ train=False,
+ data_range=[478857, 488857],
+            # data_range=[478857, 478999],  # smaller subset for quicker evaluation
+ data_cfg=data_cfg,
+ ),
+ pipeline=test_pipeline),
+)
+
+# runtime settings
+optimizer = dict(
+ type='Adam',
+ lr=0.005,
+)
+optimizer_config = dict(grad_clip=None)
+lr_config = dict(
+ policy='CosineAnnealing',
+ min_lr=0.00001,
+ warmup='linear',
+ warmup_iters=10,
+ warmup_ratio=0.001,
+ warmup_by_epoch=True,
+ by_epoch=True)
+
+total_epochs = 1000
+checkpoint_config = dict(interval=10)
+log_config = dict(
+ interval=5, hooks=[
+ dict(type='TextLoggerHook'),
+ ])
+
+predict = dict(type='FaceKeypointsPredictor')
+
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+
+# disable opencv multithreading to avoid overloading the system
+opencv_num_threads = 0
+# set the multi-process start method to `fork` to speed up training
+mp_start_method = 'fork'
+
+evaluation = dict(interval=1, metric=['NME'], save_best='NME')
+
+eval_config = dict(interval=1)
+evaluator_args = dict(metric_names='ave_nme')
+eval_pipelines = [
+ dict(
+ mode='test',
+ data=dict(**data['val'], imgs_per_gpu=1),
+ evaluators=[dict(type='FaceKeypointEvaluator', **evaluator_args)])
+]
diff --git a/configs/pose/hand/hrnet_w18_coco_wholebody_hand_256x256_dark.py b/configs/pose/hand/hrnet_w18_coco_wholebody_hand_256x256_dark.py
new file mode 100644
index 00000000..b2679fc7
--- /dev/null
+++ b/configs/pose/hand/hrnet_w18_coco_wholebody_hand_256x256_dark.py
@@ -0,0 +1,190 @@
+# oss_io_config = dict(
+# ak_id='your oss ak id',
+# ak_secret='your oss ak secret',
+# hosts='oss-cn-zhangjiakou.aliyuncs.com', # your oss hosts
+# buckets=['your_bucket']) # your oss buckets
+
+oss_sync_config = dict(other_file_list=['**/events.out.tfevents*', '**/*log*'])
+
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=10)
+
+optimizer = dict(type='Adam', lr=5e-4)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[170, 200])
+total_epochs = 210
+log_config = dict(
+ interval=50,
+ hooks=[dict(type='TextLoggerHook'),
+ dict(type='TensorboardLoggerHook')])
+channel_cfg = dict(
+ num_output_channels=21,
+ dataset_joints=21,
+ dataset_channel=[
+ [
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20
+ ],
+ ],
+ inference_channel=[
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20
+ ])
+
+# model settings
+model = dict(
+ type='TopDown',
+ pretrained=False,
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False))),
+ keypoint_head=dict(
+ type='TopdownHeatmapSimpleHead',
+ in_channels=[18, 36, 72, 144],
+ in_index=(0, 1, 2, 3),
+ input_transform='resize_concat',
+ out_channels=channel_cfg['num_output_channels'],
+ num_deconv_layers=0,
+ extra=dict(
+ final_conv_kernel=1, num_conv_layers=1, num_conv_kernels=(1, )),
+ loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
+ train_cfg=dict(),
+ test_cfg=dict(
+ flip_test=True,
+ post_process='unbiased',
+ shift_heatmap=True,
+ modulate_kernel=11))
+
+data_root = 'data/coco'
+
+data_cfg = dict(
+ image_size=[256, 256],
+ heatmap_size=[64, 64],
+ num_output_channels=channel_cfg['num_output_channels'],
+ num_joints=channel_cfg['dataset_joints'],
+ dataset_channel=channel_cfg['dataset_channel'],
+ inference_channel=channel_cfg['inference_channel'],
+)
+
+train_pipeline = [
+ # dict(type='TopDownGetBboxCenterScale', padding=1.25),
+ dict(type='TopDownRandomFlip', flip_prob=0.5),
+ dict(
+ type='TopDownGetRandomScaleRotation', rot_factor=30,
+ scale_factor=0.25),
+ dict(type='TopDownAffine'),
+ dict(type='MMToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ dict(type='TopDownGenerateTarget', sigma=3),
+ dict(
+ type='PoseCollect',
+ keys=['img', 'target', 'target_weight'],
+ meta_keys=[
+ 'image_file', 'image_id', 'joints_3d', 'joints_3d_visible',
+ 'center', 'scale', 'rotation', 'flip_pairs'
+ ])
+]
+
+val_pipeline = [
+ dict(type='TopDownAffine'),
+ dict(type='MMToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ dict(
+ type='PoseCollect',
+ keys=['img'],
+ meta_keys=[
+ 'image_file', 'image_id', 'center', 'scale', 'rotation',
+ 'flip_pairs'
+ ])
+]
+
+test_pipeline = val_pipeline
+data_source_cfg = dict(type='HandCocoPoseTopDownSource', data_cfg=data_cfg)
+
+data = dict(
+ imgs_per_gpu=32, # for train
+ workers_per_gpu=2, # for train
+ # imgs_per_gpu=1, # for test
+ # workers_per_gpu=1, # for test
+ val_dataloader=dict(samples_per_gpu=32),
+ test_dataloader=dict(samples_per_gpu=32),
+ train=dict(
+ type='HandCocoWholeBodyDataset',
+ data_source=dict(
+ ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json',
+ img_prefix=f'{data_root}/train2017/',
+ **data_source_cfg),
+ pipeline=train_pipeline),
+ val=dict(
+ type='HandCocoWholeBodyDataset',
+ data_source=dict(
+ ann_file=f'{data_root}/annotations/coco_wholebody_val_v1.0.json',
+ img_prefix=f'{data_root}/val2017/',
+ test_mode=True,
+ **data_source_cfg),
+ pipeline=val_pipeline),
+ test=dict(
+ type='HandCocoWholeBodyDataset',
+ data_source=dict(
+ ann_file=f'{data_root}/annotations/coco_wholebody_val_v1.0.json',
+ img_prefix=f'{data_root}/val2017/',
+ test_mode=True,
+ **data_source_cfg),
+ pipeline=val_pipeline),
+)
+
+eval_config = dict(interval=10, metric='PCK', save_best='PCK')
+evaluator_args = dict(
+ metric_names=['PCK', 'AUC', 'EPE', 'NME'], pck_thr=0.2, auc_nor=30)
+eval_pipelines = [
+ dict(
+ mode='test',
+ data=dict(**data['val'], imgs_per_gpu=1),
+ evaluators=[dict(type='KeyPointEvaluator', **evaluator_args)])
+]
+export = dict(use_jit=False)
+checkpoint_sync_export = True
+predict = dict(type='HandKeypointsPredictor')
diff --git a/configs/pose/hand/litehrnet_30_coco_wholebody_hand_256x256.py b/configs/pose/hand/litehrnet_30_coco_wholebody_hand_256x256.py
new file mode 100644
index 00000000..0c47db73
--- /dev/null
+++ b/configs/pose/hand/litehrnet_30_coco_wholebody_hand_256x256.py
@@ -0,0 +1,176 @@
+# oss_io_config = dict(
+# ak_id='your oss ak id',
+# ak_secret='your oss ak secret',
+# hosts='oss-cn-zhangjiakou.aliyuncs.com', # your oss hosts
+# buckets=['your_bucket']) # your oss buckets
+
+oss_sync_config = dict(other_file_list=['**/events.out.tfevents*', '**/*log*'])
+
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=10)
+
+optimizer = dict(type='Adam', lr=5e-4)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[170, 200])
+total_epochs = 210
+log_config = dict(
+ interval=50,
+ hooks=[dict(type='TextLoggerHook'),
+ dict(type='TensorboardLoggerHook')])
+channel_cfg = dict(
+ num_output_channels=21,
+ dataset_joints=21,
+ dataset_channel=[
+ [
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20
+ ],
+ ],
+ inference_channel=[
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20
+ ])
+
+# model settings
+model = dict(
+ type='TopDown',
+ pretrained=False,
+ backbone=dict(
+ type='LiteHRNet',
+ in_channels=3,
+ extra=dict(
+ stem=dict(stem_channels=32, out_channels=32, expand_ratio=1),
+ num_stages=3,
+ stages_spec=dict(
+ num_modules=(3, 8, 3),
+ num_branches=(2, 3, 4),
+ num_blocks=(2, 2, 2),
+ module_type=('LITE', 'LITE', 'LITE'),
+ with_fuse=(True, True, True),
+ reduce_ratios=(8, 8, 8),
+ num_channels=(
+ (40, 80),
+ (40, 80, 160),
+ (40, 80, 160, 320),
+ )),
+ with_head=True,
+ )),
+ keypoint_head=dict(
+ type='TopdownHeatmapSimpleHead',
+ in_channels=40,
+ out_channels=channel_cfg['num_output_channels'],
+ num_deconv_layers=0,
+ extra=dict(final_conv_kernel=1, ),
+ loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
+ train_cfg=dict(),
+ test_cfg=dict(
+ flip_test=True,
+ post_process='default',
+ shift_heatmap=True,
+ modulate_kernel=11))
+
+data_root = 'data/coco'
+
+data_cfg = dict(
+ image_size=[256, 256],
+ heatmap_size=[64, 64],
+ num_output_channels=channel_cfg['num_output_channels'],
+ num_joints=channel_cfg['dataset_joints'],
+ dataset_channel=channel_cfg['dataset_channel'],
+ inference_channel=channel_cfg['inference_channel'],
+)
+
+train_pipeline = [
+ # dict(type='TopDownGetBboxCenterScale', padding=1.25),
+ dict(type='TopDownRandomFlip', flip_prob=0.5),
+ dict(
+ type='TopDownGetRandomScaleRotation', rot_factor=30,
+ scale_factor=0.25),
+ dict(type='TopDownAffine'),
+ dict(type='MMToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ dict(type='TopDownGenerateTarget', sigma=3),
+ dict(
+ type='PoseCollect',
+ keys=['img', 'target', 'target_weight'],
+ meta_keys=[
+ 'image_file', 'image_id', 'joints_3d', 'joints_3d_visible',
+ 'center', 'scale', 'rotation', 'flip_pairs'
+ ])
+]
+
+val_pipeline = [
+ dict(type='TopDownAffine'),
+ dict(type='MMToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ dict(
+ type='PoseCollect',
+ keys=['img'],
+ meta_keys=[
+ 'image_file', 'image_id', 'center', 'scale', 'rotation',
+ 'flip_pairs'
+ ])
+]
+
+test_pipeline = val_pipeline
+data_source_cfg = dict(type='HandCocoPoseTopDownSource', data_cfg=data_cfg)
+
+data = dict(
+ imgs_per_gpu=32, # for train
+ workers_per_gpu=2, # for train
+ # imgs_per_gpu=1, # for test
+ # workers_per_gpu=1, # for test
+ val_dataloader=dict(samples_per_gpu=32),
+ test_dataloader=dict(samples_per_gpu=32),
+ train=dict(
+ type='HandCocoWholeBodyDataset',
+ data_source=dict(
+ ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json',
+ img_prefix=f'{data_root}/train2017/',
+ **data_source_cfg),
+ pipeline=train_pipeline),
+ val=dict(
+ type='HandCocoWholeBodyDataset',
+ data_source=dict(
+ ann_file=f'{data_root}/annotations/coco_wholebody_val_v1.0.json',
+ img_prefix=f'{data_root}/val2017/',
+ test_mode=True,
+ **data_source_cfg),
+ pipeline=val_pipeline),
+ test=dict(
+ type='HandCocoWholeBodyDataset',
+ data_source=dict(
+ ann_file=f'{data_root}/annotations/coco_wholebody_val_v1.0.json',
+ img_prefix=f'{data_root}/val2017/',
+ test_mode=True,
+ **data_source_cfg),
+ pipeline=val_pipeline),
+)
+
+eval_config = dict(interval=10, metric='PCK', save_best='PCK')
+evaluator_args = dict(
+ metric_names=['PCK', 'AUC', 'EPE', 'NME'], pck_thr=0.2, auc_nor=30)
+eval_pipelines = [
+ dict(
+ mode='test',
+ data=dict(**data['val'], imgs_per_gpu=1),
+ evaluators=[dict(type='KeyPointEvaluator', **evaluator_args)])
+]
+export = dict(use_jit=False)
+checkpoint_sync_export = True
diff --git a/configs/segmentation/segformer/segformer_b0_coco.py b/configs/segmentation/segformer/segformer_b0_coco.py
new file mode 100644
index 00000000..e6fc6d58
--- /dev/null
+++ b/configs/segmentation/segformer/segformer_b0_coco.py
@@ -0,0 +1,249 @@
+# SegFormer B0
+
+CLASSES = [
+ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+ 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+ 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+ 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
+ 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
+ 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+ 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
+ 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+ 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
+ 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+ 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
+ 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
+ 'hair drier', 'toothbrush', 'banner', 'blanket', 'branch', 'bridge',
+ 'building-other', 'bush', 'cabinet', 'cage', 'cardboard', 'carpet',
+ 'ceiling-other', 'ceiling-tile', 'cloth', 'clothes', 'clouds', 'counter',
+ 'cupboard', 'curtain', 'desk-stuff', 'dirt', 'door-stuff', 'fence',
+ 'floor-marble', 'floor-other', 'floor-stone', 'floor-tile', 'floor-wood',
+ 'flower', 'fog', 'food-other', 'fruit', 'furniture-other', 'grass',
+ 'gravel', 'ground-other', 'hill', 'house', 'leaves', 'light', 'mat',
+ 'metal', 'mirror-stuff', 'moss', 'mountain', 'mud', 'napkin', 'net',
+ 'paper', 'pavement', 'pillow', 'plant-other', 'plastic', 'platform',
+ 'playingfield', 'railing', 'railroad', 'river', 'road', 'rock', 'roof',
+ 'rug', 'salad', 'sand', 'sea', 'shelf', 'sky-other', 'skyscraper', 'snow',
+ 'solid-other', 'stairs', 'stone', 'straw', 'structural-other', 'table',
+ 'tent', 'textile-other', 'towel', 'tree', 'vegetable', 'wall-brick',
+ 'wall-concrete', 'wall-other', 'wall-panel', 'wall-stone', 'wall-tile',
+ 'wall-wood', 'water-other', 'waterdrops', 'window-blind', 'window-other',
+ 'wood'
+]
+PALETTE = [[0, 192, 64], [0, 192, 64], [0, 64, 96],
+ [128, 192, 192], [0, 64, 64], [0, 192, 224], [0, 192, 192],
+ [128, 192, 64], [0, 192, 96], [128, 192, 64], [128, 32, 192],
+ [0, 0, 224], [0, 0, 64], [0, 160, 192], [128, 0, 96], [128, 0, 192],
+ [0, 32, 192], [128, 128, 224], [0, 0, 192], [128, 160, 192],
+ [128, 128, 0], [128, 0, 32], [128, 32, 0], [128, 0, 128],
+ [64, 128, 32], [0, 160, 0], [0, 0, 0], [192, 128, 160], [0, 32, 0],
+ [0, 128, 128], [64, 128, 160], [128, 160, 0], [0, 128, 0],
+ [192, 128, 32], [128, 96, 128], [0, 0, 128], [64, 0, 32],
+ [0, 224, 128], [128, 0, 0], [192, 0, 160], [0, 96, 128],
+ [128, 128, 128], [64, 0, 160], [128, 224, 128], [128, 128,
+ 64], [192, 0, 32],
+ [128, 96, 0], [128, 0, 192], [0, 128, 32], [64, 224, 0], [0, 0, 64],
+ [128, 128, 160], [64, 96, 0], [0, 128, 192], [0, 128, 160],
+ [192, 224, 0], [0, 128, 64], [128, 128, 32], [192, 32, 128],
+ [0, 64, 192], [0, 0, 32], [64, 160, 128], [128, 64, 64],
+ [128, 0, 160], [64, 32, 128], [128, 192, 192], [0, 0, 160],
+ [192, 160, 128], [128, 192, 0], [128, 0, 96], [192, 32, 0],
+ [128, 64, 128], [64, 128, 96], [64, 160, 0], [0, 64, 0],
+ [192, 128, 224], [64, 32, 0], [0, 192, 128], [64, 128, 224],
+ [192, 160, 0], [0, 192, 0], [192, 128, 96], [192, 96, 128],
+ [0, 64, 128], [64, 0, 96], [64, 224, 128], [128, 64, 0],
+ [192, 0, 224], [64, 96, 128], [128, 192, 128], [64, 0, 224],
+ [192, 224, 128], [128, 192, 64], [192, 0, 96], [192, 96, 0],
+ [128, 64, 192], [0, 128, 96], [0, 224, 0], [64, 64, 64],
+ [128, 128, 224], [0, 96, 0], [64, 192, 192], [0, 128, 224],
+ [128, 224, 0], [64, 192, 64], [128, 128, 96], [128, 32, 128],
+ [64, 0, 192], [0, 64, 96], [0, 160, 128], [192, 0, 64],
+ [128, 64, 224], [0, 32, 128], [192, 128, 192], [0, 64, 224],
+ [128, 160, 128], [192, 128, 0], [128, 64, 32], [128, 32, 64],
+ [192, 0, 128], [64, 192, 32], [0, 160, 64], [64, 0, 0],
+ [192, 192, 160], [0, 32, 64], [64, 128, 128], [64, 192, 160],
+ [128, 160, 64], [64, 128, 0], [192, 192, 32], [128, 96, 192],
+ [64, 0, 128], [64, 64, 32], [0, 224, 192], [192, 0, 0],
+ [192, 64, 160], [0, 96, 192], [192, 128, 128], [64, 64, 160],
+ [128, 224, 192], [192, 128, 64], [192, 64, 32], [128, 96, 64],
+ [192, 0, 192], [0, 192, 32], [64, 224, 64], [64, 0, 64],
+ [128, 192, 160], [64, 96, 64], [64, 128, 192], [0, 192, 160],
+ [192, 224, 64], [64, 128, 64], [128, 192, 32], [192, 32, 192],
+ [64, 64, 192], [0, 64, 32], [64, 160, 192], [192, 64, 64],
+ [128, 64, 160], [64, 32, 192], [192, 192, 192], [0, 64, 160],
+ [192, 160, 192], [192, 192, 0], [128, 64, 96], [192, 32, 64],
+ [192, 64, 128], [64, 192, 96], [64, 160, 64], [64, 64, 0]]
+
+num_classes = 172
+
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained=
+ 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b0_20220624-7e0fe6dd.pth',
+ backbone=dict(
+ type='MixVisionTransformer',
+ in_channels=3,
+ embed_dims=32,
+ num_stages=4,
+ num_layers=[2, 2, 2, 2],
+ num_heads=[1, 2, 5, 8],
+ patch_sizes=[7, 3, 3, 3],
+ sr_ratios=[8, 4, 2, 1],
+ out_indices=(0, 1, 2, 3),
+ mlp_ratio=4,
+ qkv_bias=True,
+ drop_rate=0.0,
+ attn_drop_rate=0.0,
+ drop_path_rate=0.1),
+ decode_head=dict(
+ type='SegformerHead',
+ in_channels=[32, 64, 160, 256],
+ in_index=[0, 1, 2, 3],
+ channels=256,
+ dropout_ratio=0.1,
+ num_classes=num_classes,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
+
+# dataset settings
+dataset_type = 'SegDataset'
+data_root = './data/coco_stuff164k/'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+crop_size = (512, 512)
+train_pipeline = [
+ dict(type='MMResize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
+ dict(type='SegRandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='MMRandomFlip', flip_ratio=0.5),
+ dict(type='MMPhotoMetricDistortion'),
+ dict(type='MMNormalize', **img_norm_cfg),
+ dict(type='MMPad', size=crop_size, pad_val=dict(img=0, masks=0, seg=255)),
+ dict(type='DefaultFormatBundle'),
+ dict(
+ type='Collect',
+ keys=['img', 'gt_semantic_seg'],
+ meta_keys=('filename', 'ori_filename', 'ori_shape', 'img_shape',
+ 'pad_shape', 'scale_factor', 'flip', 'flip_direction',
+ 'img_norm_cfg')),
+]
+test_pipeline = [
+ dict(
+ type='MMMultiScaleFlipAug',
+ img_scale=(2048, 512),
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+ flip=False,
+ transforms=[
+ dict(type='MMResize', keep_ratio=True),
+ dict(type='MMRandomFlip'),
+ dict(type='MMNormalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(
+ type='Collect',
+ keys=['img'],
+ meta_keys=('filename', 'ori_filename', 'ori_shape',
+ 'img_shape', 'pad_shape', 'scale_factor', 'flip',
+ 'flip_direction', 'img_norm_cfg')),
+ ])
+]
+
+data = dict(
+ imgs_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ ignore_index=255,
+ data_source=dict(
+ type='SegSourceRaw',
+ img_suffix='.jpg',
+ label_suffix='_labelTrainIds.png',
+ img_root=data_root + 'train2017/',
+ label_root=data_root + 'annotations/train2017/',
+ split=data_root + 'train.txt',
+ classes=CLASSES,
+ ),
+ pipeline=train_pipeline),
+ val=dict(
+ imgs_per_gpu=1,
+ ignore_index=255,
+ type=dataset_type,
+ data_source=dict(
+ type='SegSourceRaw',
+ img_suffix='.jpg',
+ label_suffix='_labelTrainIds.png',
+ img_root=data_root + 'val2017/',
+            label_root=data_root + 'annotations/val2017/',
+ split=data_root + 'val.txt',
+ classes=CLASSES,
+ ),
+ pipeline=test_pipeline),
+ test=dict(
+ imgs_per_gpu=1,
+ type=dataset_type,
+ data_source=dict(
+ type='SegSourceRaw',
+ img_suffix='.jpg',
+ label_suffix='_labelTrainIds.png',
+ img_root=data_root + 'val2017/',
+            label_root=data_root + 'annotations/val2017/',
+ split=data_root + 'val.txt',
+ classes=CLASSES,
+ ),
+ pipeline=test_pipeline))
+optimizer = dict(
+ type='AdamW',
+ lr=6e-05,
+ betas=(0.9, 0.999),
+ weight_decay=0.01,
+ paramwise_options=dict(
+ custom_keys=dict(
+ pos_block=dict(decay_mult=0.0),
+ norm=dict(decay_mult=0.0),
+ head=dict(lr_mult=10.0))))
+optimizer_config = dict()
+lr_config = dict(
+ policy='poly',
+ warmup='linear',
+ warmup_iters=1500,
+ warmup_ratio=1e-06,
+ power=1.0,
+ min_lr=0.0,
+ by_epoch=False)
+
+# runtime settings
+total_epochs = 30
+checkpoint_config = dict(interval=1)
+eval_config = dict(interval=1, gpu_collect=False)
+eval_pipelines = [
+ dict(
+ mode='test',
+ evaluators=[
+ dict(
+ type='SegmentationEvaluator',
+ classes=CLASSES,
+ metric_names=['mIoU'])
+ ],
+ )
+]
+
+predict = dict(type='SegmentationPredictor')
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+
+dist_params = dict(backend='nccl')
+
+cudnn_benchmark = False
+log_level = 'INFO'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/configs/segmentation/segformer/segformer_b1_coco.py b/configs/segmentation/segformer/segformer_b1_coco.py
new file mode 100644
index 00000000..64f6e2c1
--- /dev/null
+++ b/configs/segmentation/segformer/segformer_b1_coco.py
@@ -0,0 +1,8 @@
+_base_ = './segformer_b0_coco.py'
+
+model = dict(
+ pretrained=
+ 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b1_20220624-02e5a6a1.pth',
+ backbone=dict(embed_dims=64, ),
+ decode_head=dict(in_channels=[64, 128, 320, 512], ),
+)
diff --git a/configs/segmentation/segformer/segformer_b2_coco.py b/configs/segmentation/segformer/segformer_b2_coco.py
new file mode 100644
index 00000000..16538eb3
--- /dev/null
+++ b/configs/segmentation/segformer/segformer_b2_coco.py
@@ -0,0 +1,14 @@
+_base_ = './segformer_b0_coco.py'
+
+model = dict(
+ pretrained=
+ 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b2_20220624-66e8bf70.pth',
+ backbone=dict(
+ embed_dims=64,
+ num_layers=[3, 4, 6, 3],
+ ),
+ decode_head=dict(
+ in_channels=[64, 128, 320, 512],
+ channels=768,
+ ),
+)
diff --git a/configs/segmentation/segformer/segformer_b3_coco.py b/configs/segmentation/segformer/segformer_b3_coco.py
new file mode 100644
index 00000000..8e59d9be
--- /dev/null
+++ b/configs/segmentation/segformer/segformer_b3_coco.py
@@ -0,0 +1,14 @@
+_base_ = './segformer_b0_coco.py'
+
+model = dict(
+ pretrained=
+ 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b3_20220624-13b1141c.pth',
+ backbone=dict(
+ embed_dims=64,
+ num_layers=[3, 4, 18, 3],
+ ),
+ decode_head=dict(
+ in_channels=[64, 128, 320, 512],
+ channels=768,
+ ),
+)
diff --git a/configs/segmentation/segformer/segformer_b4_coco.py b/configs/segmentation/segformer/segformer_b4_coco.py
new file mode 100644
index 00000000..d8de6ce2
--- /dev/null
+++ b/configs/segmentation/segformer/segformer_b4_coco.py
@@ -0,0 +1,14 @@
+_base_ = './segformer_b0_coco.py'
+
+model = dict(
+ pretrained=
+ 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b4_20220624-d588d980.pth',
+ backbone=dict(
+ embed_dims=64,
+ num_layers=[3, 8, 27, 3],
+ ),
+ decode_head=dict(
+ in_channels=[64, 128, 320, 512],
+ channels=768,
+ ),
+)
diff --git a/configs/segmentation/segformer/segformer_b5_coco.py b/configs/segmentation/segformer/segformer_b5_coco.py
new file mode 100644
index 00000000..83ccd9a3
--- /dev/null
+++ b/configs/segmentation/segformer/segformer_b5_coco.py
@@ -0,0 +1,52 @@
+_base_ = './segformer_b0_coco.py'
+
+model = dict(
+ pretrained=
+ 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b5_20220624-658746d9.pth',
+ backbone=dict(
+ embed_dims=64,
+ num_layers=[3, 6, 40, 3],
+ ),
+ decode_head=dict(
+ in_channels=[64, 128, 320, 512],
+ channels=768,
+ ),
+)
+
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+crop_size = (640, 640)
+train_pipeline = [
+ dict(type='MMResize', img_scale=(2048, 640), ratio_range=(0.5, 2.0)),
+ dict(type='SegRandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='MMRandomFlip', flip_ratio=0.5),
+ dict(type='MMPhotoMetricDistortion'),
+ dict(type='MMNormalize', **img_norm_cfg),
+ dict(type='MMPad', size=crop_size, pad_val=dict(img=0, masks=0, seg=255)),
+ dict(type='DefaultFormatBundle'),
+ dict(
+ type='Collect',
+ keys=['img', 'gt_semantic_seg'],
+ meta_keys=('filename', 'ori_filename', 'ori_shape', 'img_shape',
+ 'pad_shape', 'scale_factor', 'flip', 'flip_direction',
+ 'img_norm_cfg')),
+]
+test_pipeline = [
+ dict(
+ type='MMMultiScaleFlipAug',
+ img_scale=(2048, 640),
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+ flip=False,
+ transforms=[
+ dict(type='MMResize', keep_ratio=True),
+ dict(type='MMRandomFlip'),
+ dict(type='MMNormalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(
+ type='Collect',
+ keys=['img'],
+ meta_keys=('filename', 'ori_filename', 'ori_shape',
+ 'img_shape', 'pad_shape', 'scale_factor', 'flip',
+ 'flip_direction', 'img_norm_cfg')),
+ ])
+]
diff --git a/data/test/face_2d_keypoints/data/002253.png b/data/test/face_2d_keypoints/data/002253.png
new file mode 100644
index 00000000..00311c33
--- /dev/null
+++ b/data/test/face_2d_keypoints/data/002253.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:331ead75033fa2f01f6be72a2f8e34d581fcb593308067815d4bb136bb13b766
+size 54390
diff --git a/data/test/face_2d_keypoints/data/002258.png b/data/test/face_2d_keypoints/data/002258.png
new file mode 100644
index 00000000..3f887ef4
--- /dev/null
+++ b/data/test/face_2d_keypoints/data/002258.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6904c252a6ffa8702f4c255dafb0b7d03092c402e3c70598adab3f83c3858451
+size 36836
diff --git a/data/test/face_2d_keypoints/models/epoch_400.pth b/data/test/face_2d_keypoints/models/epoch_400.pth
new file mode 100644
index 00000000..e3e92113
--- /dev/null
+++ b/data/test/face_2d_keypoints/models/epoch_400.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8298b88539874b9914b90122575880a80ca0534499e9be9953e17fc177a1c2d2
+size 3421031
diff --git a/data/test/pose/hand/configs/hand_keypoints_predictor.py b/data/test/pose/hand/configs/hand_keypoints_predictor.py
new file mode 100644
index 00000000..799ef425
--- /dev/null
+++ b/data/test/pose/hand/configs/hand_keypoints_predictor.py
@@ -0,0 +1,86 @@
+model = dict(
+ type='SingleStageDetector',
+ backbone=dict(
+ type='MobileNetV2',
+ out_indices=(4, 7),
+ norm_cfg=dict(type='BN', eps=0.001, momentum=0.03),
+ init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)),
+ neck=dict(
+ type='SSDNeck',
+ in_channels=(96, 1280),
+ out_channels=(96, 1280, 512, 256, 256, 128),
+ level_strides=(2, 2, 2, 2),
+ level_paddings=(1, 1, 1, 1),
+ l2_norm_scale=None,
+ use_depthwise=True,
+ norm_cfg=dict(type='BN', eps=0.001, momentum=0.03),
+ act_cfg=dict(type='ReLU6'),
+ init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)),
+ bbox_head=dict(
+ type='SSDHead',
+ in_channels=(96, 1280, 512, 256, 256, 128),
+ num_classes=1,
+ use_depthwise=True,
+ norm_cfg=dict(type='BN', eps=0.001, momentum=0.03),
+ act_cfg=dict(type='ReLU6'),
+ init_cfg=dict(type='Normal', layer='Conv2d', std=0.001),
+
+ # set anchor size manually instead of using the predefined
+ # SSD300 setting.
+ anchor_generator=dict(
+ type='SSDAnchorGenerator',
+ scale_major=False,
+ strides=[16, 32, 64, 107, 160, 320],
+ ratios=[[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]],
+ min_sizes=[48, 100, 150, 202, 253, 304],
+ max_sizes=[100, 150, 202, 253, 304, 320]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[.0, .0, .0, .0],
+ target_stds=[0.1, 0.1, 0.2, 0.2])),
+ # model training and testing settings
+ train_cfg=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.,
+ ignore_iof_thr=-1,
+ gt_max_assign_all=False),
+ smoothl1_beta=1.,
+ allowed_border=-1,
+ pos_weight=-1,
+ neg_pos_ratio=3,
+ debug=False),
+ test_cfg=dict(
+ nms_pre=1000,
+ nms=dict(type='nms', iou_threshold=0.45),
+ min_bbox_size=0,
+ score_thr=0.02,
+ max_per_img=200))
+
+classes = ('hand', )
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+test_pipeline = [
+ dict(
+ type='MMMultiScaleFlipAug',
+ img_scale=(320, 320),
+ flip=False,
+ transforms=[
+ dict(type='MMResize', keep_ratio=False),
+ dict(type='MMNormalize', **img_norm_cfg),
+ dict(type='MMPad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+load_from = 'https://download.openmmlab.com/mmpose/mmdet_pretrained/' \
+ 'ssdlite_mobilenetv2_scratch_600e_onehand-4f9f8686_20220523.pth'
+mmlab_modules = [
+ dict(type='mmdet', name='SingleStageDetector', module='model'),
+ dict(type='mmdet', name='MobileNetV2', module='backbone'),
+ dict(type='mmdet', name='SSDNeck', module='neck'),
+ dict(type='mmdet', name='SSDHead', module='head'),
+]
+predictor = dict(type='DetectionPredictor', score_threshold=0.5)
diff --git a/data/test/pose/hand/data/hand.jpg b/data/test/pose/hand/data/hand.jpg
new file mode 100644
index 00000000..cb445c26
--- /dev/null
+++ b/data/test/pose/hand/data/hand.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c05d58edee7398de37b8e479410676d6b97cfde69cc003e8356a348067e71988
+size 7750
diff --git a/data/test/pose/hand/hrnet_w18_256x256.pth b/data/test/pose/hand/hrnet_w18_256x256.pth
new file mode 100644
index 00000000..3585315c
--- /dev/null
+++ b/data/test/pose/hand/hrnet_w18_256x256.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8570f45c7e642288b23a1c8722ba2b9b40939f1d55c962d13c789157b16edf01
+size 117072344
diff --git a/data/test/pose/hand/small_whole_body_hand_coco/annotations/small_whole_body_hand_coco.json b/data/test/pose/hand/small_whole_body_hand_coco/annotations/small_whole_body_hand_coco.json
new file mode 100644
index 00000000..8b1aa35d
--- /dev/null
+++ b/data/test/pose/hand/small_whole_body_hand_coco/annotations/small_whole_body_hand_coco.json
@@ -0,0 +1,2381 @@
+{
+ "info": {
+ "description": "COCO-WholeBody",
+ "url": "https://github.com/jin-s13/COCO-WholeBody",
+ "version": "1.0",
+ "year": "2020",
+ "date_created": "2020/07/01"
+ },
+ "licenses": [
+ {
+ "url": "http://creativecommons.org/licenses/by-nc-sa/2.0/",
+ "id": 1,
+ "name": "Attribution-NonCommercial-ShareAlike License"
+ },
+ {
+ "url": "http://creativecommons.org/licenses/by-nc/2.0/",
+ "id": 2,
+ "name": "Attribution-NonCommercial License"
+ },
+ {
+ "url": "http://creativecommons.org/licenses/by-nc-nd/2.0/",
+ "id": 3,
+ "name": "Attribution-NonCommercial-NoDerivs License"
+ },
+ {
+ "url": "http://creativecommons.org/licenses/by/2.0/",
+ "id": 4,
+ "name": "Attribution License"
+ },
+ {
+ "url": "http://creativecommons.org/licenses/by-sa/2.0/",
+ "id": 5,
+ "name": "Attribution-ShareAlike License"
+ },
+ {
+ "url": "http://creativecommons.org/licenses/by-nd/2.0/",
+ "id": 6,
+ "name": "Attribution-NoDerivs License"
+ },
+ {
+ "url": "http://flickr.com/commons/usage/",
+ "id": 7,
+ "name": "No known copyright restrictions"
+ },
+ {
+ "url": "http://www.usa.gov/copyright.shtml",
+ "id": 8,
+ "name": "United States Government Work"
+ }
+ ],
+ "images":[
+ {
+ "license": 1,
+ "file_name": "000000425226.jpg",
+ "coco_url": "http://images.cocodataset.org/val2017/000000425226.jpg",
+ "height": 640,
+ "width": 480,
+ "date_captured": "2013-11-14 21:48:51",
+ "flickr_url": "http://farm5.staticflickr.com/4055/4546463824_bc40e0752b_z.jpg",
+ "id": 425226
+ },
+ {
+ "license": 3,
+ "file_name": "000000292456.jpg",
+ "coco_url": "http://images.cocodataset.org/val2017/000000292456.jpg",
+ "height": 375,
+ "width": 500,
+ "date_captured": "2013-11-15 13:30:32",
+ "flickr_url": "http://farm2.staticflickr.com/1284/846533447_96678f6640_z.jpg",
+ "id": 292456
+ }
+ ],
+ "annotations":[
+ {
+ "segmentation": [
+ [
+ 125.12,
+ 539.69,
+ 140.94,
+ 522.43,
+ 100.67,
+ 496.54,
+ 84.85,
+ 469.21,
+ 73.35,
+ 450.52,
+ 104.99,
+ 342.65,
+ 168.27,
+ 290.88,
+ 179.78,
+ 288,
+ 189.84,
+ 286.56,
+ 191.28,
+ 260.67,
+ 202.79,
+ 240.54,
+ 221.48,
+ 237.66,
+ 248.81,
+ 243.42,
+ 257.44,
+ 256.36,
+ 253.12,
+ 262.11,
+ 253.12,
+ 275.06,
+ 299.15,
+ 233.35,
+ 329.35,
+ 207.46,
+ 355.24,
+ 206.02,
+ 363.87,
+ 206.02,
+ 365.3,
+ 210.34,
+ 373.93,
+ 221.84,
+ 363.87,
+ 226.16,
+ 363.87,
+ 237.66,
+ 350.92,
+ 237.66,
+ 332.22,
+ 234.79,
+ 314.97,
+ 249.17,
+ 271.82,
+ 313.89,
+ 253.12,
+ 326.83,
+ 227.24,
+ 352.72,
+ 214.29,
+ 357.03,
+ 212.85,
+ 372.85,
+ 208.54,
+ 395.87,
+ 228.67,
+ 414.56,
+ 245.93,
+ 421.75,
+ 266.07,
+ 424.63,
+ 276.13,
+ 437.57,
+ 266.07,
+ 450.52,
+ 284.76,
+ 464.9,
+ 286.2,
+ 479.28,
+ 291.96,
+ 489.35,
+ 310.65,
+ 512.36,
+ 284.76,
+ 549.75,
+ 244.49,
+ 522.43,
+ 215.73,
+ 546.88,
+ 199.91,
+ 558.38,
+ 204.22,
+ 565.57,
+ 189.84,
+ 568.45,
+ 184.09,
+ 575.64,
+ 172.58,
+ 578.52,
+ 145.26,
+ 567.01,
+ 117.93,
+ 551.19,
+ 133.75,
+ 532.49
+ ]
+ ],
+ "num_keypoints": 10,
+ "area": 47803.27955,
+ "iscrowd": 0,
+ "keypoints": [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 142,
+ 309,
+ 1,
+ 177,
+ 320,
+ 2,
+ 191,
+ 398,
+ 2,
+ 237,
+ 317,
+ 2,
+ 233,
+ 426,
+ 2,
+ 306,
+ 233,
+ 2,
+ 92,
+ 452,
+ 2,
+ 123,
+ 468,
+ 2,
+ 0,
+ 0,
+ 0,
+ 251,
+ 469,
+ 2,
+ 0,
+ 0,
+ 0,
+ 162,
+ 551,
+ 2
+ ],
+ "image_id": 425226,
+ "bbox": [
+ 73.35,
+ 206.02,
+ 300.58,
+ 372.5
+ ],
+ "category_id": 1,
+ "id": 183126,
+ "face_box": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "lefthand_box": [
+ 235.32,
+ 420.86,
+ 36.03000000000003,
+ 31.66999999999996
+ ],
+ "righthand_box": [
+ 304.2,
+ 204.65,
+ 68.03000000000003,
+ 42.579999999999984
+ ],
+ "lefthand_kpts": [
+ 237.0,
+ 426.0,
+ 0.10405432432889938,
+ 245.0,
+ 428.0,
+ 0.20745894312858582,
+ 253.0,
+ 430.0,
+ 0.20745894312858582,
+ 261.0,
+ 433.0,
+ 0.5343613624572754,
+ 269.0,
+ 438.0,
+ 0.2143213450908661,
+ 265.0,
+ 429.0,
+ 0.12357126176357269,
+ 270.0,
+ 433.0,
+ 0.26612094044685364,
+ 269.0,
+ 439.0,
+ 0.09291676431894302,
+ 265.0,
+ 445.0,
+ 0.1144329234957695,
+ 264.0,
+ 427.0,
+ 0.04132022708654404,
+ 269.0,
+ 438.0,
+ 0.14125224947929382,
+ 265.0,
+ 445.0,
+ 0.12546022236347198,
+ 260.0,
+ 446.0,
+ 0.1301765739917755,
+ 261.0,
+ 431.0,
+ 0.03911404311656952,
+ 269.0,
+ 438.0,
+ 0.05325932428240776,
+ 263.0,
+ 445.0,
+ 0.0727347582578659,
+ 260.0,
+ 446.0,
+ 0.09616264700889587,
+ 262.0,
+ 426.0,
+ 0.02422112599015236,
+ 262.0,
+ 446.0,
+ 0.05307695269584656,
+ 259.0,
+ 447.0,
+ 0.10021205991506577,
+ 256.0,
+ 449.0,
+ 0.16766609251499176
+ ],
+ "righthand_kpts": [
+ 308.0,
+ 239.0,
+ 0.1408521831035614,
+ 321.0,
+ 226.5,
+ 0.02196568436920643,
+ 334.0,
+ 214.0,
+ 0.02196568436920643,
+ 354.0,
+ 211.0,
+ 0.10082424432039261,
+ 365.0,
+ 206.0,
+ 0.7382210493087769,
+ 346.0,
+ 209.0,
+ 0.36809757351875305,
+ 364.0,
+ 214.0,
+ 0.18089689314365387,
+ 355.0,
+ 209.0,
+ 0.1661667376756668,
+ 370.0,
+ 226.0,
+ 0.0630379319190979,
+ 342.0,
+ 214.0,
+ 0.20839764177799225,
+ 364.0,
+ 216.0,
+ 0.3880220949649811,
+ 368.0,
+ 226.0,
+ 0.26579514145851135,
+ 374.0,
+ 236.0,
+ 0.2594422996044159,
+ 338.0,
+ 218.0,
+ 0.4262842833995819,
+ 356.0,
+ 220.0,
+ 0.6470030546188354,
+ 364.0,
+ 227.0,
+ 0.3695657551288605,
+ 372.0,
+ 237.0,
+ 0.14602071046829224,
+ 335.0,
+ 227.0,
+ 0.425717294216156,
+ 347.0,
+ 229.0,
+ 0.479814738035202,
+ 356.0,
+ 232.0,
+ 0.3711409270763397,
+ 364.0,
+ 234.0,
+ 0.39777901768684387
+ ],
+ "face_kpts": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "face_valid": false,
+ "lefthand_valid": true,
+ "righthand_valid": true,
+ "foot_valid": true,
+ "foot_kpts": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 203.91247,
+ 554.31637,
+ 2.0,
+ 186.27924,
+ 568.78466,
+ 2.0,
+ 134.28382,
+ 542.10875,
+ 2.0
+ ]
+ },
+ {
+ "segmentation": [
+ [
+ 237.64,
+ 275.62,
+ 235.11,
+ 282.36,
+ 240.17,
+ 291.63,
+ 248.6,
+ 300.9,
+ 251.12,
+ 284.04,
+ 251.12,
+ 275.62,
+ 254.49,
+ 258.76,
+ 267.98,
+ 246.12,
+ 267.98,
+ 239.38,
+ 268.82,
+ 230.96,
+ 262.08,
+ 224.21,
+ 259.55,
+ 211.57,
+ 260.39,
+ 199.78,
+ 260.39,
+ 188.82,
+ 260.39,
+ 104.55,
+ 312.64,
+ 94.44,
+ 294.1,
+ 84.33,
+ 290.73,
+ 80.96,
+ 295.79,
+ 68.31,
+ 294.94,
+ 53.99,
+ 291.57,
+ 48.09,
+ 280.62,
+ 45.56,
+ 268.82,
+ 48.09,
+ 258.71,
+ 51.46,
+ 259.55,
+ 62.42,
+ 256.18,
+ 70.84,
+ 256.18,
+ 80.11,
+ 252.81,
+ 89.38,
+ 265.45,
+ 95.28,
+ 251.97,
+ 104.55,
+ 240.17,
+ 125.62,
+ 234.27,
+ 136.57,
+ 224.16,
+ 142.47,
+ 214.04,
+ 146.69,
+ 199.72,
+ 150.9,
+ 192.98,
+ 153.43,
+ 187.08,
+ 153.43,
+ 184.55,
+ 159.33,
+ 191.29,
+ 170.28,
+ 198.88,
+ 171.97,
+ 210.67,
+ 170.28,
+ 224.16,
+ 164.38,
+ 232.58,
+ 159.33,
+ 239.33,
+ 156.8,
+ 247.75,
+ 148.37,
+ 250.28,
+ 143.31,
+ 255.34,
+ 156.8,
+ 255.34,
+ 163.54,
+ 255.34,
+ 172.81,
+ 256.18,
+ 180.39,
+ 256.18,
+ 186.29,
+ 257.87,
+ 196.4,
+ 254.49,
+ 204.83,
+ 250.28,
+ 211.57,
+ 248.6,
+ 217.47,
+ 248.6,
+ 224.21,
+ 247.75,
+ 236.01,
+ 247.75,
+ 246.97,
+ 246.07,
+ 269.72,
+ 244.38,
+ 282.36,
+ 238.48,
+ 275.62
+ ]
+ ],
+ "num_keypoints": 13,
+ "area": 5425.7433,
+ "iscrowd": 0,
+ "keypoints": [
+ 256,
+ 76,
+ 1,
+ 260,
+ 70,
+ 2,
+ 0,
+ 0,
+ 0,
+ 269,
+ 72,
+ 2,
+ 0,
+ 0,
+ 0,
+ 254,
+ 112,
+ 2,
+ 307,
+ 99,
+ 1,
+ 236,
+ 145,
+ 2,
+ 338,
+ 150,
+ 1,
+ 200,
+ 160,
+ 2,
+ 0,
+ 0,
+ 0,
+ 270,
+ 189,
+ 1,
+ 308,
+ 188,
+ 1,
+ 257,
+ 227,
+ 2,
+ 359,
+ 246,
+ 1,
+ 251,
+ 287,
+ 1,
+ 0,
+ 0,
+ 0
+ ],
+ "image_id": 292456,
+ "bbox": [
+ 184.55,
+ 45.56,
+ 128.09,
+ 255.34
+ ],
+ "category_id": 1,
+ "id": 184382,
+ "face_box": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "lefthand_box": [
+ 184.74,
+ 153.3,
+ 17.42999999999998,
+ 16.939999999999998
+ ],
+ "righthand_box": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "lefthand_kpts": [
+ 197.4,
+ 160.1,
+ 1,
+ 195.7,
+ 156.85,
+ 1,
+ 194.0,
+ 153.6,
+ 2,
+ 191.8,
+ 154.7,
+ 2,
+ 190.2,
+ 156.7,
+ 1,
+ 188.3,
+ 158.3,
+ 2,
+ 185.9,
+ 159.9,
+ 2,
+ 187.8,
+ 160.1,
+ 1,
+ 189.6,
+ 159.5,
+ 1,
+ 188.4,
+ 161.1,
+ 2,
+ 185.8,
+ 163.2,
+ 2,
+ 187.8,
+ 163.2,
+ 1,
+ 189.9,
+ 161.9,
+ 1,
+ 189.1,
+ 164.2,
+ 2,
+ 187.3,
+ 166.3,
+ 2,
+ 189.4,
+ 165.8,
+ 1,
+ 190.9,
+ 164.3,
+ 1,
+ 190.7,
+ 166.9,
+ 2,
+ 189.6,
+ 168.4,
+ 2,
+ 191.3,
+ 168.0,
+ 1,
+ 192.8,
+ 166.6,
+ 1
+ ],
+ "righthand_kpts": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "face_kpts": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "face_valid": false,
+ "lefthand_valid": true,
+ "righthand_valid": false,
+ "foot_valid": true,
+ "foot_kpts": [
+ 243.75893,
+ 280.33895,
+ 2.0,
+ 238.25574,
+ 283.57612,
+ 2.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ]
+ },
+ {
+ "segmentation": [
+ [
+ 335.68,
+ 70.36,
+ 323.93,
+ 70.36,
+ 323.93,
+ 70.36,
+ 323.93,
+ 77.07,
+ 323.93,
+ 83.78,
+ 323.09,
+ 87.97,
+ 322.26,
+ 95.53,
+ 323.93,
+ 103.91,
+ 325.61,
+ 108.11,
+ 326.45,
+ 115.66,
+ 327.29,
+ 125.73,
+ 328.13,
+ 132.44,
+ 329.81,
+ 137.47,
+ 329.81,
+ 150.06,
+ 334,
+ 155.93,
+ 335.68,
+ 159.28,
+ 338.19,
+ 160.96,
+ 345.74,
+ 157.61,
+ 348.26,
+ 153.41,
+ 351.62,
+ 144.18,
+ 351.62,
+ 129.92,
+ 349.94,
+ 124.89,
+ 345.74,
+ 106.43,
+ 344.91,
+ 103.08,
+ 345.74,
+ 101.4,
+ 354.13,
+ 82.94,
+ 352.46,
+ 75.39,
+ 347.42,
+ 71.2,
+ 347.42,
+ 71.2,
+ 335.68,
+ 68.68
+ ]
+ ],
+ "num_keypoints": 10,
+ "area": 2050.998,
+ "iscrowd": 0,
+ "keypoints": [
+ 346,
+ 92,
+ 2,
+ 347,
+ 87,
+ 2,
+ 340,
+ 86,
+ 2,
+ 0,
+ 0,
+ 0,
+ 327,
+ 88,
+ 2,
+ 316,
+ 109,
+ 1,
+ 337,
+ 109,
+ 2,
+ 0,
+ 0,
+ 0,
+ 358,
+ 151,
+ 1,
+ 0,
+ 0,
+ 0,
+ 380,
+ 179,
+ 1,
+ 309,
+ 177,
+ 1,
+ 331,
+ 179,
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0
+ ],
+ "image_id": 292456,
+ "bbox": [
+ 322.26,
+ 68.68,
+ 31.87,
+ 92.28
+ ],
+ "category_id": 1,
+ "id": 208754,
+ "face_box": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "lefthand_box": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "righthand_box": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "lefthand_kpts": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "righthand_kpts": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "face_kpts": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "face_valid": false,
+ "lefthand_valid": false,
+ "righthand_valid": false,
+ "foot_valid": false,
+ "foot_kpts": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ]
+ },
+ {
+ "segmentation": [
+ [
+ 145.65,
+ 41.37,
+ 160.08,
+ 33.24,
+ 172.68,
+ 31.93,
+ 176.35,
+ 38.23,
+ 175.57,
+ 46.62,
+ 173.73,
+ 50.82,
+ 174.78,
+ 55.02,
+ 172.68,
+ 59.74,
+ 170.32,
+ 66.57,
+ 167.17,
+ 70.76,
+ 165.07,
+ 73.91,
+ 166.91,
+ 77.59,
+ 165.07,
+ 83.36,
+ 167.43,
+ 90.97,
+ 167.96,
+ 103.83,
+ 167.96,
+ 115.63,
+ 167.43,
+ 122.46,
+ 186.06,
+ 138.2,
+ 188.69,
+ 147.12,
+ 190.26,
+ 152.63,
+ 184.22,
+ 157.36,
+ 177.93,
+ 156.31,
+ 167.69,
+ 145.02,
+ 169.01,
+ 169.69,
+ 159.03,
+ 173.95,
+ 152.74,
+ 176.84,
+ 150.64,
+ 197.57,
+ 152.47,
+ 212,
+ 161.66,
+ 216.2,
+ 163.23,
+ 224.34,
+ 162.45,
+ 228.01,
+ 162.97,
+ 250.31,
+ 165.33,
+ 258.71,
+ 172.68,
+ 267.11,
+ 170.58,
+ 270.26,
+ 159.03,
+ 270.26,
+ 149.59,
+ 267.63,
+ 144.86,
+ 265.8,
+ 144.08,
+ 256.61,
+ 138.04,
+ 248.21,
+ 134.63,
+ 247.43,
+ 132.27,
+ 242.44,
+ 135.42,
+ 233.52,
+ 140.14,
+ 220.4,
+ 139.09,
+ 205.18,
+ 135.94,
+ 173.95,
+ 129.91,
+ 167.04,
+ 131.48,
+ 150.25,
+ 133.06,
+ 132.41,
+ 134.37,
+ 124.27,
+ 132.79,
+ 115.87,
+ 132.79,
+ 88.52,
+ 137.52,
+ 81.96,
+ 143.29,
+ 74.61,
+ 146.44,
+ 68.84,
+ 149.59,
+ 61.23,
+ 148.54,
+ 55.98,
+ 146.44,
+ 50.73,
+ 143.03,
+ 45.75
+ ]
+ ],
+ "num_keypoints": 14,
+ "area": 6855.73765,
+ "iscrowd": 0,
+ "keypoints": [
+ 167,
+ 60,
+ 2,
+ 169,
+ 54,
+ 2,
+ 163,
+ 54,
+ 2,
+ 0,
+ 0,
+ 0,
+ 151,
+ 54,
+ 2,
+ 160,
+ 93,
+ 2,
+ 145,
+ 94,
+ 2,
+ 0,
+ 0,
+ 0,
+ 153,
+ 116,
+ 2,
+ 0,
+ 0,
+ 0,
+ 176,
+ 138,
+ 2,
+ 157,
+ 149,
+ 2,
+ 147,
+ 151,
+ 2,
+ 153,
+ 194,
+ 1,
+ 151,
+ 206,
+ 2,
+ 143,
+ 235,
+ 2,
+ 155,
+ 257,
+ 2
+ ],
+ "image_id": 292456,
+ "bbox": [
+ 129.91,
+ 31.93,
+ 60.35,
+ 238.33
+ ],
+ "category_id": 1,
+ "id": 216341,
+ "face_box": [
+ 155.3,
+ 51.49,
+ 17.569999999999993,
+ 20.440000000000005
+ ],
+ "lefthand_box": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "righthand_box": [
+ 176.35,
+ 138.34,
+ 12.870000000000005,
+ 19.210000000000008
+ ],
+ "lefthand_kpts": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0
+ ],
+ "righthand_kpts": [
+ 180.0,
+ 141.0,
+ 0.10326574742794037,
+ 183.5,
+ 144.0,
+ 0.07696976512670517,
+ 187.0,
+ 147.0,
+ 0.07696976512670517,
+ 188.0,
+ 149.0,
+ 0.04711573198437691,
+ 191.0,
+ 155.0,
+ 0.07077035307884216,
+ 186.0,
+ 149.0,
+ 0.29661688208580017,
+ 186.0,
+ 153.0,
+ 0.24504776298999786,
+ 187.0,
+ 154.0,
+ 0.27228468656539917,
+ 187.0,
+ 156.0,
+ 0.4332796335220337,
+ 183.0,
+ 149.0,
+ 0.3887251019477844,
+ 184.0,
+ 153.0,
+ 0.2849636673927307,
+ 184.0,
+ 155.0,
+ 0.25188443064689636,
+ 185.0,
+ 157.0,
+ 0.22013618052005768,
+ 181.0,
+ 149.0,
+ 0.26592785120010376,
+ 181.0,
+ 152.0,
+ 0.334407240152359,
+ 181.0,
+ 155.0,
+ 0.3411094844341278,
+ 182.0,
+ 156.0,
+ 0.21506451070308685,
+ 178.0,
+ 150.0,
+ 0.1895236074924469,
+ 179.0,
+ 152.0,
+ 0.22152823209762573,
+ 179.0,
+ 153.0,
+ 0.24057771265506744,
+ 179.0,
+ 155.0,
+ 0.3196240961551666
+ ],
+ "face_kpts": [
+ 153.685,
+ 53.5251,
+ 1.0,
+ 153.685,
+ 55.9995,
+ 1.0,
+ 153.804,
+ 58.4704,
+ 1.0,
+ 154.134,
+ 60.9212,
+ 1.0,
+ 154.957,
+ 63.2463,
+ 1.0,
+ 156.253,
+ 65.3459,
+ 1.0,
+ 157.915,
+ 67.1762,
+ 1.0,
+ 159.836,
+ 68.7344,
+ 1.0,
+ 161.981,
+ 69.9598,
+ 1.0,
+ 164.351,
+ 70.6189,
+ 1.0,
+ 166.61,
+ 69.8115,
+ 1.0,
+ 167.696,
+ 67.6045,
+ 1.0,
+ 168.698,
+ 65.3436,
+ 1.0,
+ 169.518,
+ 63.0107,
+ 1.0,
+ 170.223,
+ 60.6411,
+ 1.0,
+ 170.512,
+ 58.1862,
+ 1.0,
+ 170.86,
+ 55.7391,
+ 1.0,
+ 161.251,
+ 53.1078,
+ 1.0,
+ 162.566,
+ 52.687,
+ 1.0,
+ 163.888,
+ 52.5886,
+ 1.0,
+ 165.174,
+ 52.6945,
+ 1.0,
+ 166.412,
+ 52.9976,
+ 1.0,
+ 169.073,
+ 53.6941,
+ 1.0,
+ 169.62,
+ 53.6665,
+ 1.0,
+ 170.131,
+ 53.6955,
+ 1.0,
+ 170.627,
+ 53.7716,
+ 1.0,
+ 171.084,
+ 54.1277,
+ 1.0,
+ 167.945,
+ 55.3813,
+ 1.0,
+ 168.385,
+ 57.2486,
+ 1.0,
+ 168.792,
+ 59.0841,
+ 1.0,
+ 169.156,
+ 60.9188,
+ 1.0,
+ 165.611,
+ 61.6406,
+ 1.0,
+ 166.592,
+ 62.0987,
+ 1.0,
+ 167.667,
+ 62.4634,
+ 1.0,
+ 168.508,
+ 62.5478,
+ 1.0,
+ 169.288,
+ 62.1283,
+ 1.0,
+ 162.091,
+ 54.5984,
+ 1.0,
+ 163.22,
+ 53.9983,
+ 1.0,
+ 164.482,
+ 54.0386,
+ 1.0,
+ 165.15,
+ 55.0692,
+ 1.0,
+ 164.097,
+ 55.3072,
+ 1.0,
+ 163.04,
+ 55.1084,
+ 1.0,
+ 168.609,
+ 55.7261,
+ 1.0,
+ 169.207,
+ 55.083,
+ 1.0,
+ 170.057,
+ 55.1597,
+ 1.0,
+ 170.369,
+ 55.961,
+ 1.0,
+ 169.776,
+ 56.2287,
+ 1.0,
+ 169.141,
+ 56.1008,
+ 1.0,
+ 162.475,
+ 63.8369,
+ 1.0,
+ 164.48,
+ 63.848,
+ 1.0,
+ 166.47,
+ 64.0609,
+ 1.0,
+ 167.089,
+ 64.2488,
+ 1.0,
+ 167.527,
+ 64.4092,
+ 1.0,
+ 167.957,
+ 64.8118,
+ 1.0,
+ 168.079,
+ 65.2803,
+ 1.0,
+ 167.681,
+ 65.6566,
+ 1.0,
+ 167.16,
+ 65.9072,
+ 1.0,
+ 166.491,
+ 65.9737,
+ 1.0,
+ 165.017,
+ 65.529,
+ 1.0,
+ 163.655,
+ 64.8154,
+ 1.0,
+ 162.759,
+ 63.9383,
+ 1.0,
+ 164.797,
+ 64.4124,
+ 1.0,
+ 166.767,
+ 64.9156,
+ 1.0,
+ 167.323,
+ 65.1626,
+ 1.0,
+ 167.909,
+ 65.2694,
+ 1.0,
+ 167.288,
+ 65.157,
+ 1.0,
+ 166.65,
+ 64.9973,
+ 1.0,
+ 164.709,
+ 64.5693,
+ 1.0
+ ],
+ "face_valid": true,
+ "lefthand_valid": false,
+ "righthand_valid": true,
+ "foot_valid": true,
+ "foot_kpts": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 137.68845,
+ 238.83224,
+ 2.0,
+ 169.43431,
+ 265.41017,
+ 2.0,
+ 164.26638,
+ 267.99414,
+ 2.0,
+ 149.87,
+ 259.50397,
+ 2.0
+ ]
+ }
+ ],
+ "categories": [
+ {
+ "supercategory": "person",
+ "id": 1,
+ "name": "person",
+ "keypoints": [
+ "nose",
+ "left_eye",
+ "right_eye",
+ "left_ear",
+ "right_ear",
+ "left_shoulder",
+ "right_shoulder",
+ "left_elbow",
+ "right_elbow",
+ "left_wrist",
+ "right_wrist",
+ "left_hip",
+ "right_hip",
+ "left_knee",
+ "right_knee",
+ "left_ankle",
+ "right_ankle"
+ ],
+ "skeleton": [
+ [
+ 16,
+ 14
+ ],
+ [
+ 14,
+ 12
+ ],
+ [
+ 17,
+ 15
+ ],
+ [
+ 15,
+ 13
+ ],
+ [
+ 12,
+ 13
+ ],
+ [
+ 6,
+ 12
+ ],
+ [
+ 7,
+ 13
+ ],
+ [
+ 6,
+ 7
+ ],
+ [
+ 6,
+ 8
+ ],
+ [
+ 7,
+ 9
+ ],
+ [
+ 8,
+ 10
+ ],
+ [
+ 9,
+ 11
+ ],
+ [
+ 2,
+ 3
+ ],
+ [
+ 1,
+ 2
+ ],
+ [
+ 1,
+ 3
+ ],
+ [
+ 2,
+ 4
+ ],
+ [
+ 3,
+ 5
+ ],
+ [
+ 4,
+ 6
+ ],
+ [
+ 5,
+ 7
+ ]
+ ]
+ }
+ ]
+}
diff --git a/data/test/pose/hand/small_whole_body_hand_coco/train2017/000000292456.jpg b/data/test/pose/hand/small_whole_body_hand_coco/train2017/000000292456.jpg
new file mode 100644
index 00000000..3852de57
--- /dev/null
+++ b/data/test/pose/hand/small_whole_body_hand_coco/train2017/000000292456.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c8207a06044306b0d271488a22e1a174af5a22e951a710e25a556cf5d212d5c
+size 160632
diff --git a/data/test/pose/hand/small_whole_body_hand_coco/train2017/000000425226.jpg b/data/test/pose/hand/small_whole_body_hand_coco/train2017/000000425226.jpg
new file mode 100644
index 00000000..0b6a0537
--- /dev/null
+++ b/data/test/pose/hand/small_whole_body_hand_coco/train2017/000000425226.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:feadc69a8190787088fda0ac12971d91badc93dbe06057645050fdbec1ce6911
+size 204232
diff --git a/data/test/segmentation/coco_stuff_164k/val2017/000000289059.jpg b/data/test/segmentation/coco_stuff_164k/val2017/000000289059.jpg
new file mode 100644
index 00000000..a9c0875c
--- /dev/null
+++ b/data/test/segmentation/coco_stuff_164k/val2017/000000289059.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af6fa61274e497ecc170de5adc4b8e7ac89eba2bc22a6aa119b08ec7adbe9459
+size 146140
diff --git a/data/test/segmentation/data/000000309022.jpg b/data/test/segmentation/data/000000309022.jpg
new file mode 100755
index 00000000..7aa51424
--- /dev/null
+++ b/data/test/segmentation/data/000000309022.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:898b141c663f242f716bb26c4cf4962452927e6bef3f170e61fb364cd6359d00
+size 187956
diff --git a/data/test/segmentation/models/segformer_b0.pth b/data/test/segmentation/models/segformer_b0.pth
new file mode 100644
index 00000000..85e68166
--- /dev/null
+++ b/data/test/segmentation/models/segformer_b0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94d7df6a4ff3c605916378304b2a00404a23d4965d226a657417061647cb46a6
+size 45361179
diff --git a/docs/source/_static/dingding_qrcode.jpg b/docs/source/_static/dingding_qrcode.jpg
index c9dd79df..4334a2b7 100644
Binary files a/docs/source/_static/dingding_qrcode.jpg and b/docs/source/_static/dingding_qrcode.jpg differ
diff --git a/docs/source/_static/result.jpg b/docs/source/_static/result.jpg
index 9bdf6295..5bb73d81 100644
Binary files a/docs/source/_static/result.jpg and b/docs/source/_static/result.jpg differ
diff --git a/docs/source/data_hub.md b/docs/source/data_hub.md
index 4b682c60..633d6a14 100644
--- a/docs/source/data_hub.md
+++ b/docs/source/data_hub.md
@@ -68,7 +68,10 @@ Before using dataset, please read the [LICENSE](docs/source/LICENSE) file to lea
| **VOC2007**
[url](http://host.robots.ox.ac.uk/pascal/VOC/voc2007/index.html) | Common | PASCAL VOC 2007 is a dataset for image recognition consisting of 20 object categories. Each image in this dataset has pixel-level segmentation annotations, bounding box annotations, and object class annotations. | [VOCtrainval_06-Nov-2007.tar](http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar) (439MB) | |
| **VOC2012**
[url](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html) | Common | From 2009 to 2011, the amount of data is still growing on the basis of the previous year's dataset, and from 2011 to 2012, the amount of data used for classification, detection and person layout tasks does not change. Mainly for segmentation and action recognition, improve the corresponding data subsets and label information. | [Baidu Netdisk (提取码:ro9f)](https://pan.baidu.com/s/1B4tF8cEPIe0xGL1FG0qbkg)
[VOCtrainval_11-May-2012.tar](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar) (2G) | | [LICENSE](https://github.com/alibaba/EasyCV/blob/master/docs/source/LICENSE#L70) |
| **Pascal Context**
[url](http://host.robots.ox.ac.uk/pascal/VOC/voc2010/) | Common | This dataset is a set of additional annotations for PASCAL VOC 2010. It goes beyond the original PASCAL semantic segmentation task by providing annotations for the whole scene. The [statistics section](https://www.cs.stanford.edu/~roozbeh/pascal-context/#statistics) has a full list of 400+ labels. | [voc2010/VOCtrainval_03-May-2010.tar](http://host.robots.ox.ac.uk/pascal/VOC/voc2010/VOCtrainval_03-May-2010.tar) (1.3GB)
[VOC2010test.tar](http://host.robots.ox.ac.uk:8080/eval/downloads/VOC2010test.tar)
[trainval_merged.json](https://codalabuser.blob.core.windows.net/public/trainval_merged.json) (590MB) | |
+| **COCO-Stuff 10K**
[url](https://github.com/nightrome/cocostuff10k) | Common | COCO-Stuff augments the popular COCO dataset with pixel-level stuff annotations. These annotations can be used for scene understanding tasks like semantic segmentation, object detection and image captioning. | [cocostuff-10k-v1.1.zip](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/cocostuff-10k-v1.1.zip) (2.0 GB) | |
+| **COCO-Stuff 164K**
[url](https://github.com/nightrome/cocostuff) | Common | COCO-Stuff augments the popular COCO dataset with pixel-level stuff annotations. These annotations can be used for scene understanding tasks like semantic segmentation, object detection and image captioning. | [train2017.zip](http://images.cocodataset.org/zips/train2017.zip) (18.0 GB),
[val2017.zip](http://images.cocodataset.org/zips/val2017.zip) (1.0 GB),
[stuffthingmaps_trainval2017.zip](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) (659M)| |
| **COCO-Stuff 10K**
[url](https://github.com/nightrome/cocostuff10k) | Common | COCO-Stuff augments the popular COCO dataset with pixel-level stuff annotations. These annotations can be used for scene understanding tasks like semantic segmentation, object detection and image captioning. | [Baidu Netdisk (提取码:4r7o)](https://pan.baidu.com/s/1aWOjVnnOHFNISnGerGQcnw)
[cocostuff-10k-v1.1.zip](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/cocostuff-10k-v1.1.zip) (2.0 GB) | | [LICENSE](https://github.com/alibaba/EasyCV/blob/master/docs/source/LICENSE#L17) |
+| **COCO-Stuff 164K**
[url](https://github.com/nightrome/cocostuff) | Common | COCO-Stuff augments the popular COCO dataset with pixel-level stuff annotations. These annotations can be used for scene understanding tasks like semantic segmentation, object detection and image captioning. | [train2017.zip](http://images.cocodataset.org/zips/train2017.zip) (18.0 GB),
[val2017.zip](http://images.cocodataset.org/zips/val2017.zip) (1.0 GB),
[stuffthingmaps_trainval2017.zip](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) (659M)| |
| **Cityscapes**
[url](https://www.cityscapes-dataset.com/) | Street scenes | The Cityscapes contains a diverse set of stereo video sequences recorded in street scenes from 50 different cities, with high quality pixel-level annotations of 5 000 frames in addition to a larger set of 20 000 weakly annotated frames. The dataset is thus an order of magnitude larger than similar previous attempts. | [leftImg8bit_trainvaltest.zip](https://www.cityscapes-dataset.com/file-handling/?packageID=3) (11GB) | |
| **ADE20K**
[url](http://groups.csail.mit.edu/vision/datasets/ADE20K/) | Scene | The ADE20K dataset is released by MIT and can be used for scene perception, parsing, segmentation, multi-object recognition and semantic understanding.The annotated images cover the scene categories from the SUN and Places database.It contains 25.574 training set and 2000 validation set. | [Baidu Netdisk (提取码:dqim)](https://pan.baidu.com/s/1ZuAuZheHHSDNRRdaI4wQrQ)
[ADEChallengeData2016.zip](http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip) (923MB)
[release_test.zip](http://data.csail.mit.edu/places/ADEchallenge/release_test.zip) (202MB) | | [LICENSE](https://github.com/alibaba/EasyCV/blob/master/docs/source/LICENSE#L30) |
diff --git a/docs/source/develop.md b/docs/source/develop.md
index 705e23f8..7273c16c 100644
--- a/docs/source/develop.md
+++ b/docs/source/develop.md
@@ -39,12 +39,55 @@ pre-commit run --all-files
bash scripts/ci_test.sh
```
- ### 2.2 Test data
- if you add new data, please do the following to commit it to git-lfs before "git commit":
- ```bash
- python git-lfs/git_lfs.py add data/test/new_data
- python git-lfs/git_lfs.py push
- ```
+
+### 2.2 Test data storage
+
+Since we need a lot of data for testing, including images and models, we use
+git-lfs to store these large files.
+
+1. Install git-lfs (version >= 2.5.0).
+
+For macOS:
+
+```bash
+brew install git-lfs
+git lfs install
+```
+
+For CentOS, please download the rpm package from the git-lfs GitHub [releases page](https://github.com/git-lfs/git-lfs/releases/tag/v3.2.0):
+```bash
+wget http://101374-public.oss-cn-hangzhou-zmf.aliyuncs.com/git-lfs-3.2.0-1.el7.x86_64.rpm
+sudo rpm -ivh git-lfs-3.2.0-1.el7.x86_64.rpm
+git lfs install
+```
+
+For Ubuntu:
+```bash
+curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
+sudo apt-get install git-lfs
+git lfs install
+```
+
+2. Track your data types with git-lfs. For example, to track png files:
+```bash
+git lfs track "*.png"
+```
+
+3. Add your test files to the `data/test/` folder (create subdirectories if you need them), then stage them with `git add`:
+```bash
+git add data/test/test.png
+```
+
+4. Commit your test data; the LFS objects will be uploaded when you push the branch:
+```bash
+git commit -m "xxx"
+```
+
+To pull data from the remote repo, use the same commands as for regular git files:
+```bash
+git pull origin branch_name
+```
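+
+For reference, the complete workflow for adding a new test image might look
+like the following (the file name and branch name below are just examples):
+
+```bash
+git lfs track "*.png"              # make sure the file type is tracked by LFS
+git add .gitattributes             # `git lfs track` updates .gitattributes
+git add data/test/pose/my_case.png
+git commit -m "add test data for pose unittest"
+git push origin branch_name
+```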
+
## 3. Build pip package
```bash
diff --git a/docs/source/model_zoo_seg.md b/docs/source/model_zoo_seg.md
index 1f871621..97820feb 100644
--- a/docs/source/model_zoo_seg.md
+++ b/docs/source/model_zoo_seg.md
@@ -11,6 +11,7 @@ Pretrained on **Pascal VOC 2012 + Aug**.
## UperNet
Pretrained on **Pascal VOC 2012 + Aug**.
+
| Algorithm | Config | Params
(backbone/total) | inference time(V100)
(ms/img) | mIoU | Download |
| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| upernet_r50 | [upernet_r50_512x512_8xb4_60e_voc12aug](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/upernet/upernet_r50_512x512_8xb4_60e_voc12aug.py) | 23M/66M | 282.9ms | 76.59 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/upernet_r50/epoch_60.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/upernet_r50/20220706_114712.log.json) |
@@ -26,3 +27,17 @@ Pretrained on **Pascal VOC 2012 + Aug**.
| Algorithm | Config | PQ | box MAP | Mask mAP | Download |
| ---------- | ---------- | ------------------------------------------------------------ | ------------------------ |----------|---------------------------------------------------------------------------- |
| mask2former_r50 | [mask2former_r50_8xb2_e50_panopatic](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/mask2former/mask2former_r50_8xb2_e50_panopatic.py) | 51.64 | 44.81 | 41.88 |[model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_panoptic/epoch_50.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_panoptic/20220629_170721.log.json) |
+
+
+## SegFormer
+
+Semantic segmentation models trained on **COCO-Stuff 164k**.
+
+| Algorithm | Config | Params
(backbone/total) | inference time(V100)
(ms/img) |mIoU | Download |
+| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| SegFormer_B0 | [segformer_b0_coco.py](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/segformer/segformer_b0_coco.py) | 3.3M/3.8M | 47.2ms | 35.91 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b0/SegmentationEvaluator_mIoU_best.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b0/20220909_152337.log.json) |
+| SegFormer_B1 | [segformer_b1_coco.py](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/segformer/segformer_b1_coco.py) | 13.2M/13.7M | 46.8ms | 40.53 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b1/SegmentationEvaluator_mIoU_best.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b1/20220825_200708.log.json) |
+| SegFormer_B2 | [segformer_b2_coco.py](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/segformer/segformer_b2_coco.py) | 24.2M/27.5M | 49.1ms | 44.53 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b2/SegmentationEvaluator_mIoU_best.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b2/20220829_163757.log.json) |
+| SegFormer_B3 | [segformer_b3_coco.py](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/segformer/segformer_b3_coco.py) | 44.1M/47.4M | 52.3ms | 45.49 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b3/SegmentationEvaluator_mIoU_best.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b3/20220830_142021.log.json) |
+| SegFormer_B4 | [segformer_b4_coco.py](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/segformer/segformer_b4_coco.py) | 60.8M/64.1M | 58.5ms | 46.27 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b4/SegmentationEvaluator_mIoU_best.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b4/20220902_135723.log.json) |
+| SegFormer_B5 | [segformer_b5_coco.py](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/segformer/segformer_b5_coco.py) | 81.4M/85.7M | 99.2ms | 46.75 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b5/SegmentationEvaluator_mIoU_best.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b5/20220812_144336.log.json) |
diff --git a/docs/source/prepare_data.md b/docs/source/prepare_data.md
index baeee784..5ce28bbd 100644
--- a/docs/source/prepare_data.md
+++ b/docs/source/prepare_data.md
@@ -6,6 +6,8 @@ EasyCV provides various datasets for multi tasks. Please refer to the following
- [Object Detection](#Object-Detection)
- [Self-Supervised Learning](#Self-Supervised-Learning)
- [Pose (Keypoint)](#Pose)
+- [Image Segmentation](#Image-Segmentation)
+
## Image Classification
@@ -354,3 +356,30 @@ data/coco2017
├── 000000000285.jpg
├── ...
```
+
+## Image Segmentation
+
+- [COCO Stuff 164k](#COCO-Stuff-164k)
+### COCO Stuff 164k
+
+For the COCO Stuff 164k dataset, please run the following commands to download the data and convert the annotations.
+
+```shell
+# download
+mkdir coco_stuff164k && cd coco_stuff164k
+wget http://images.cocodataset.org/zips/train2017.zip
+wget http://images.cocodataset.org/zips/val2017.zip
+wget http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip
+
+# unzip
+unzip train2017.zip -d images/
+unzip val2017.zip -d images/
+unzip stuffthingmaps_trainval2017.zip -d annotations/
+
+# --nproc 8 runs the conversion with 8 processes; the option can be omitted.
+python tools/prepare_data/coco_stuff164k.py /path/to/coco_stuff164k --nproc 8
+```
+
+By convention, mask labels in `/path/to/coco_stuff164k/annotations/*2017/*_labelTrainIds.png` are used for COCO Stuff 164k training and testing.
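+
+After downloading and conversion, the directory layout is expected to look
+roughly like this (only the relevant folders are shown):
+
+```shell
+coco_stuff164k
+├── images
+│   ├── train2017
+│   └── val2017
+└── annotations
+    ├── train2017   # *.png and *_labelTrainIds.png
+    └── val2017     # *.png and *_labelTrainIds.png
+```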
+
+More details about this dataset can be found [here](https://github.com/nightrome/cocostuff#downloads).
diff --git a/easycv/core/evaluation/__init__.py b/easycv/core/evaluation/__init__.py
index 3c5f8c2d..2209e505 100644
--- a/easycv/core/evaluation/__init__.py
+++ b/easycv/core/evaluation/__init__.py
@@ -3,9 +3,12 @@
from .base_evaluator import Evaluator
from .classification_eval import ClsEvaluator
from .coco_evaluation import CocoDetectionEvaluator, CoCoPoseTopDownEvaluator
+from .face_eval import FaceKeypointEvaluator
from .faceid_pair_eval import FaceIDPairEvaluator
+from .keypoint_eval import KeyPointEvaluator
from .mse_eval import MSEEvaluator
from .retrival_topk_eval import RetrivalTopKEvaluator
from .segmentation_eval import SegmentationEvaluator
-from .top_down_eval import (keypoint_pck_accuracy, keypoints_from_heatmaps,
+from .top_down_eval import (keypoint_auc, keypoint_epe, keypoint_nme,
+ keypoint_pck_accuracy, keypoints_from_heatmaps,
pose_pck_accuracy)
diff --git a/easycv/core/evaluation/face_eval.py b/easycv/core/evaluation/face_eval.py
new file mode 100644
index 00000000..633a1392
--- /dev/null
+++ b/easycv/core/evaluation/face_eval.py
@@ -0,0 +1,59 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import torch
+
+from .base_evaluator import Evaluator
+from .builder import EVALUATORS
+from .metric_registry import METRICS
+
+
+@EVALUATORS.register_module
+class FaceKeypointEvaluator(Evaluator):
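+ """Evaluator for face keypoint models; reports average NME and pose accuracy.
+
+ A minimal sketch of how the evaluator is typically declared in an eval
+ config (field names follow the common EasyCV evaluator convention and are
+ illustrative here)::
+
+ evaluators = [dict(type='FaceKeypointEvaluator')]
+ """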
+
+ def __init__(self, dataset_name=None, metric_names=['ave_nme']):
+ super(FaceKeypointEvaluator, self).__init__(dataset_name, metric_names)
+ self.metric = metric_names
+ self.dataset_name = dataset_name
+
+ def _evaluate_impl(self, prediction_dict, groundtruth_dict, **kwargs):
+ """
+ Args:
+ prediction_dict: model forward output dict, ['point', 'pose']
+ groundtruth_dict: groundtruth dict, ['target_point', 'target_point_mask', 'target_pose', 'target_pose_mask'] used for compute accuracy
+ kwargs: other parameters
+ """
+
+ def evaluate(predicts, gts, **kwargs):
+ from easycv.models.utils.face_keypoint_utils import get_keypoint_accuracy, get_pose_accuracy
+ ave_pose_acc = 0
+ ave_nme = 0
+ idx = 0
+
+ for (predict_point, predict_pose,
+ gt) in zip(predicts['point'], predicts['pose'], gts):
+ target_point = gt['target_point']
+ target_point_mask = gt['target_point_mask']
+ target_pose = gt['target_pose']
+ target_pose_mask = gt['target_pose_mask']
+
+ target_point = target_point * target_point_mask
+ target_pose = target_pose * target_pose_mask
+
+ keypoint_accuracy = get_keypoint_accuracy(
+ predict_point, target_point)
+ pose_accuracy = get_pose_accuracy(predict_pose, target_pose)
+
+ ave_pose_acc += pose_accuracy['pose_acc']
+ ave_nme += keypoint_accuracy['nme']
+ idx += 1
+
+ eval_result = {}
+ idx += 0.000001  # avoid division by zero when no samples are evaluated
+ eval_result['ave_pose_acc'] = ave_pose_acc / idx
+ eval_result['ave_nme'] = ave_nme / idx
+
+ return eval_result
+
+ return evaluate(prediction_dict, groundtruth_dict)
+
+
+METRICS.register_default_best_metric(FaceKeypointEvaluator, 'ave_nme', 'min')
diff --git a/easycv/core/evaluation/keypoint_eval.py b/easycv/core/evaluation/keypoint_eval.py
new file mode 100644
index 00000000..0549a71f
--- /dev/null
+++ b/easycv/core/evaluation/keypoint_eval.py
@@ -0,0 +1,123 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Adapt from
+# https://github.com/open-mmlab/mmpose/blob/master/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py
+import numpy as np
+
+from .base_evaluator import Evaluator
+from .builder import EVALUATORS
+from .metric_registry import METRICS
+from .top_down_eval import (keypoint_auc, keypoint_epe, keypoint_nme,
+ keypoint_pck_accuracy)
+
+
+@EVALUATORS.register_module
+class KeyPointEvaluator(Evaluator):
+ """ KeyPoint evaluator.
+ """
+
+ def __init__(self,
+ dataset_name=None,
+ metric_names=['PCK', 'PCKh', 'AUC', 'EPE', 'NME'],
+ pck_thr=0.2,
+ pckh_thr=0.7,
+ auc_nor=30):
+ """
+
+ Args:
+ dataset_name: eval dataset name
+ metric_names: eval metrics name
+ pck_thr (float): PCK threshold, default as 0.2.
+ pckh_thr (float): PCKh threshold, default as 0.7.
+ auc_nor (float): AUC normalization factor, default as 30 pixel.
+ """
+ super(KeyPointEvaluator, self).__init__(dataset_name, metric_names)
+ self._pck_thr = pck_thr
+ self._pckh_thr = pckh_thr
+ self._auc_nor = auc_nor
+ self.dataset_name = dataset_name
+ allowed_metrics = ['PCK', 'PCKh', 'AUC', 'EPE', 'NME']
+ for metric in metric_names:
+ if metric not in allowed_metrics:
+ raise KeyError(f'metric {metric} is not supported')
+
+ def _evaluate_impl(self, preds, coco_db, **kwargs):
+ ''' keypoint evaluation code which will be run after
+ all test batched data are predicted
+
+ Args:
+ preds: dict with key ``keypoints`` whose shape is Nx3
+ coco_db: the db of wholebody coco datasource, sorted by 'bbox_id'
+
+ Return:
+ a dict, each key is metric_name, value is metric value
+ '''
+ assert len(preds) == len(coco_db)
+ eval_res = {}
+
+ outputs = []
+ gts = []
+ masks = []
+ box_sizes = []
+ threshold_bbox = []
+ threshold_head_box = []
+
+ for pred, item in zip(preds, coco_db):
+ outputs.append(np.array(pred['keypoints'])[:, :-1])
+ gts.append(np.array(item['joints_3d'])[:, :-1])
+ masks.append((np.array(item['joints_3d_visible'])[:, 0]) > 0)
+ if 'PCK' in self.metric_names:
+ bbox = np.array(item['bbox'])
+ bbox_thr = np.max(bbox[2:])
+ threshold_bbox.append(np.array([bbox_thr, bbox_thr]))
+ if 'PCKh' in self.metric_names:
+ head_box_thr = item['head_size']
+ threshold_head_box.append(
+ np.array([head_box_thr, head_box_thr]))
+ box_sizes.append(item.get('box_size', 1))
+
+ outputs = np.array(outputs)
+ gts = np.array(gts)
+ masks = np.array(masks)
+ threshold_bbox = np.array(threshold_bbox)
+ threshold_head_box = np.array(threshold_head_box)
+ box_sizes = np.array(box_sizes).reshape([-1, 1])
+
+ if 'PCK' in self.metric_names:
+ _, pck, _ = keypoint_pck_accuracy(outputs, gts, masks,
+ self._pck_thr, threshold_bbox)
+ eval_res['PCK'] = pck
+
+ if 'PCKh' in self.metric_names:
+ _, pckh, _ = keypoint_pck_accuracy(outputs, gts, masks,
+ self._pckh_thr,
+ threshold_head_box)
+ eval_res['PCKh'] = pckh
+
+ if 'AUC' in self.metric_names:
+ eval_res['AUC'] = keypoint_auc(outputs, gts, masks, self._auc_nor)
+
+ if 'EPE' in self.metric_names:
+ eval_res['EPE'] = keypoint_epe(outputs, gts, masks)
+
+ if 'NME' in self.metric_names:
+ normalize_factor = self._get_normalize_factor(
+ gts=gts, box_sizes=box_sizes)
+ eval_res['NME'] = keypoint_nme(outputs, gts, masks,
+ normalize_factor)
+ return eval_res
+
+ def _get_normalize_factor(self, gts, *args, **kwargs):
+ """Get the normalize factor. generally inter-ocular distance measured
+ as the Euclidean distance between the outer corners of the eyes is
+ used. This function should be overrode, to measure NME.
+
+ Args:
+ gts (np.ndarray[N, K, 2]): Groundtruth keypoint location.
+
+ Returns:
+ np.ndarray[N, 2]: normalized factor
+ """
+ return np.ones([gts.shape[0], 2], dtype=np.float32)
+
+
+METRICS.register_default_best_metric(KeyPointEvaluator, 'PCK', 'max')
diff --git a/easycv/core/evaluation/top_down_eval.py b/easycv/core/evaluation/top_down_eval.py
index 0659a0de..ebb505e8 100644
--- a/easycv/core/evaluation/top_down_eval.py
+++ b/easycv/core/evaluation/top_down_eval.py
@@ -178,6 +178,86 @@ def keypoint_pck_accuracy(pred, gt, mask, thr, normalize):
return acc, avg_acc, cnt
+def keypoint_auc(pred, gt, mask, normalize, num_step=20):
+ """Calculate the pose accuracy of PCK for each individual keypoint and the
+ averaged accuracy across all keypoints for coordinates.
+
+ Note:
+ - batch_size: N
+ - num_keypoints: K
+
+ Args:
+ pred (np.ndarray[N, K, 2]): Predicted keypoint location.
+ gt (np.ndarray[N, K, 2]): Groundtruth keypoint location.
+ mask (np.ndarray[N, K]): Visibility of the target. False for invisible
+ joints, and True for visible. Invisible joints will be ignored for
+ accuracy calculation.
+ normalize (float): Normalization factor.
+ num_step (int): Number of thresholds used to sample the PCK curve.
+ Default: 20.
+
+ Returns:
+ float: Area under curve.
+ """
+ nor = np.tile(np.array([[normalize, normalize]]), (pred.shape[0], 1))
+ x = [1.0 * i / num_step for i in range(num_step)]
+ y = []
+ for thr in x:
+ _, avg_acc, _ = keypoint_pck_accuracy(pred, gt, mask, thr, nor)
+ y.append(avg_acc)
+
+ auc = 0
+ for i in range(num_step):
+ auc += 1.0 / num_step * y[i]
+ return auc
+
+
+def keypoint_nme(pred, gt, mask, normalize_factor):
+ """Calculate the normalized mean error (NME).
+
+ Note:
+ - batch_size: N
+ - num_keypoints: K
+
+ Args:
+ pred (np.ndarray[N, K, 2]): Predicted keypoint location.
+ gt (np.ndarray[N, K, 2]): Groundtruth keypoint location.
+ mask (np.ndarray[N, K]): Visibility of the target. False for invisible
+ joints, and True for visible. Invisible joints will be ignored for
+ accuracy calculation.
+ normalize_factor (np.ndarray[N, 2]): Normalization factor.
+
+ Returns:
+ float: normalized mean error
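+
+ Example (toy values with a unit normalization factor)::
+
+ >>> import numpy as np
+ >>> pred = np.array([[[1., 1.], [2., 2.]]])
+ >>> gt = np.array([[[1., 1.], [2., 3.]]])
+ >>> mask = np.array([[True, True]])
+ >>> nme = keypoint_nme(pred, gt, mask, np.ones((1, 2))) # 0.5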
+ """
+ distances = _calc_distances(pred, gt, mask, normalize_factor)
+ distance_valid = distances[distances != -1]
+ return distance_valid.sum() / max(1, len(distance_valid))
+
+
+def keypoint_epe(pred, gt, mask):
+ """Calculate the end-point error.
+
+ Note:
+ - batch_size: N
+ - num_keypoints: K
+
+ Args:
+ pred (np.ndarray[N, K, 2]): Predicted keypoint location.
+ gt (np.ndarray[N, K, 2]): Groundtruth keypoint location.
+ mask (np.ndarray[N, K]): Visibility of the target. False for invisible
+ joints, and True for visible. Invisible joints will be ignored for
+ accuracy calculation.
+
+ Returns:
+ float: Average end-point error.
+ """
+
+ distances = _calc_distances(
+ pred, gt, mask,
+ np.ones((pred.shape[0], pred.shape[2]), dtype=np.float32))
+ distance_valid = distances[distances != -1]
+ return distance_valid.sum() / max(1, len(distance_valid))
+
+
def _taylor(heatmap, coord):
"""Distribution aware coordinate decoding method.
diff --git a/easycv/core/post_processing/pose_transforms.py b/easycv/core/post_processing/pose_transforms.py
index 312aa904..23bf3470 100644
--- a/easycv/core/post_processing/pose_transforms.py
+++ b/easycv/core/post_processing/pose_transforms.py
@@ -83,7 +83,7 @@ def fliplr_regression(regression,
allowed_center_mode = {'static', 'root'}
assert center_mode in allowed_center_mode, 'Get invalid center_mode ' \
- f'{center_mode}, allowed choices are {allowed_center_mode}'
+ f'{center_mode}, allowed choices are {allowed_center_mode}'
if center_mode == 'static':
x_c = center_x
diff --git a/easycv/datasets/__init__.py b/easycv/datasets/__init__.py
index 3f04bfd8..cb4abf82 100644
--- a/easycv/datasets/__init__.py
+++ b/easycv/datasets/__init__.py
@@ -1,5 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
-from . import classification, detection, pose, segmentation, selfsup, shared
+from . import (classification, detection, face, pose, segmentation, selfsup,
+ shared)
from .builder import build_dali_dataset, build_dataset
from .loader import DistributedGroupSampler, GroupSampler, build_dataloader
from .registry import DATASETS
diff --git a/easycv/datasets/face/__init__.py b/easycv/datasets/face/__init__.py
new file mode 100644
index 00000000..d045ff4e
--- /dev/null
+++ b/easycv/datasets/face/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .data_sources import *
+from .face_keypoint_dataset import FaceKeypointDataset
+from .pipelines import *
diff --git a/easycv/datasets/face/data_sources/__init__.py b/easycv/datasets/face/data_sources/__init__.py
new file mode 100644
index 00000000..b23f3231
--- /dev/null
+++ b/easycv/datasets/face/data_sources/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .face_keypoint_source import FaceKeypintSource
diff --git a/easycv/datasets/face/data_sources/face_keypoint_source.py b/easycv/datasets/face/data_sources/face_keypoint_source.py
new file mode 100644
index 00000000..031de97e
--- /dev/null
+++ b/easycv/datasets/face/data_sources/face_keypoint_source.py
@@ -0,0 +1,171 @@
+import copy
+import json
+import logging
+import os
+
+import cv2
+import numpy as np
+import torch
+
+from easycv.datasets.face.pipelines.face_keypoint_transform import (
+ FaceKeypointNorm, FaceKeypointRandomAugmentation, normal)
+from easycv.datasets.registry import DATASOURCES
+from easycv.datasets.shared.base import BaseDataset
+
+FACE_KEYPOINT_DATASET_INFO = dict(
+ real_list_file_dir='real_face_list.txt',
+ data_info_dir='infos/merge/',
+ data_image_dir='images/merge/',
+ data_overlay_dir='images/overlay/',
+)
+
+
+@DATASOURCES.register_module()
+class FaceKeypintSource():
+ """
+ Data source that loads face keypoint annotations and head pose (pitch/yaw/roll) labels.
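+
+ A minimal sketch of the corresponding data_source config (the path and the
+ data range are illustrative)::
+
+ data_source = dict(
+ type='FaceKeypintSource',
+ data_cfg=dict(data_root='path/to/face_keypoint_data/', input_size=96),
+ data_range=[0, 1000])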
+ """
+
+ def __init__(self,
+ data_cfg,
+ data_range,
+ real_list_path=None,
+ info_path=None,
+ image_path=None,
+ data_overlay_path=None,
+ dataset_info=None,
+ **kwargs):
+ super(FaceKeypintSource, self).__init__()
+ """
+ Args:
+ data_cfg: Data config dict
+ data_range: range of the dataset used for training or validation
+ real_list_file_path: path of the file that contains the image list
+ data_info_dir: annotation file path
+ data_img_dir: image file path
+ data_overlay_dir: overlay background image path
+
+ dataset_info: A dict containing all dataset info
+ """
+ if dataset_info is None:
+ logging.info(
+ 'dataset_info is missing, use default face keypoint dataset info'
+ )
+ dataset_info = FACE_KEYPOINT_DATASET_INFO
+
+ data_root = data_cfg['data_root']
+ real_list_file_path = os.path.join(data_root,
+ dataset_info['real_list_file_dir'])
+ data_info_dir = os.path.join(data_root, dataset_info['data_info_dir'])
+ data_img_dir = os.path.join(data_root, dataset_info['data_image_dir'])
+ data_overlay_dir = os.path.join(data_root,
+ dataset_info['data_overlay_dir'])
+ self.input_size = data_cfg['input_size']
+ data_range = data_range
+
+ if real_list_path is not None:
+ real_list_file_path = real_list_path
+ if info_path is not None:
+ data_info_dir = info_path
+ if image_path is not None:
+ data_img_dir = image_path
+ if data_overlay_path is not None:
+ data_overlay_dir = data_overlay_path
+
+ # overlay
+ self.overlay_image_path = []
+ for overlay_img_file in sorted(os.listdir(data_overlay_dir)):
+ overlay_img_filepath = os.path.join(data_overlay_dir,
+ overlay_img_file)
+ self.overlay_image_path.append(overlay_img_filepath)
+
+ self.points_and_pose_datas = []
+ with open(real_list_file_path, 'r') as real_list_file:
+ real_list_lines = real_list_file.readlines()
+ for index in range(data_range[0], data_range[1]):
+ idx = int(real_list_lines[index])
+ img_path = os.path.join(data_img_dir, '{:06d}.png'.format(idx))
+ if not os.path.exists(img_path):
+ logging.warning('image %s does not exist' % img_path)
+ continue
+ info_path = os.path.join(data_info_dir, '{:06d}.json'.format(idx))
+ if not os.path.exists(info_path):
+ logging.warning('annotation %s does not exist' % info_path)
+ continue
+ with open(info_path, 'r') as info_file:
+ info_json = json.load(info_file)
+ assert info_json['face_count'] == 1
+ base_info = info_json['face_infos'][0]['base_info']
+
+ # points
+ assert base_info['points_array'] is not None
+ points = np.asarray(base_info['points_array']).astype(
+ np.float32)
+ points_mask = np.abs(points - (-999)) > 0.0001
+
+ # pose
+ pose = {'pitch': -999, 'yaw': -999, 'roll': -999}
+ if base_info['pitch'] is not None and base_info[
+ 'yaw'] is not None and base_info['roll'] is not None:
+ pose['pitch'] = base_info['pitch']
+ pose['yaw'] = base_info['yaw']
+ # pose["roll"] = base_info["roll"]
+ # datasets have been preprocessed, roll=0
+ # add noise to pose
+ pose['roll'] = normal() * 10.0
+
+ pose_mask = np.asarray([
+ np.abs(pose['pitch'] - (-999)) > 0.0001,
+ np.abs(pose['roll'] - (-999)) > 0.0001,
+ np.abs(pose['yaw'] - (-999)) > 0.0001
+ ])
+
+ self.points_and_pose_datas.append(
+ (img_path, points, points_mask, pose, pose_mask))
+
+ self.db = []
+ for img_path, points, points_mask, pose, pose_mask in copy.deepcopy(
+ self.points_and_pose_datas):
+ image = cv2.imread(img_path)
+
+ points[:,
+ 0] = points[:, 0] / image.shape[1] * float(self.input_size)
+ points[:,
+ 1] = points[:, 1] / image.shape[0] * float(self.input_size)
+
+ target_point = np.reshape(points,
+ (points.shape[0] * points.shape[1]))
+ points_mask = points_mask.astype(np.float32)
+ points_mask = np.reshape(
+ points_mask, (points_mask.shape[0] * points_mask.shape[1]))
+ pose = np.asarray([pose['pitch'], pose['roll'], pose['yaw']])
+
+ self.db.append({
+ 'img_path':
+ img_path,
+ 'target_point':
+ torch.tensor(np.array(target_point, np.float32)),
+ 'target_point_mask':
+ torch.tensor(points_mask),
+ 'target_pose':
+ torch.tensor(np.array(pose, np.float32)),
+ 'target_pose_mask':
+ torch.tensor(pose_mask.astype(np.float32))
+ })
+
+ def __getitem__(self, index):
+ img_path, points, points_mask, pose, pose_mask = copy.deepcopy(
+ self.points_and_pose_datas[index])
+ image = cv2.imread(img_path)
+
+ return {
+ 'img': image,
+ 'target_point': points,
+ 'target_point_mask': points_mask,
+ 'target_pose': pose,
+ 'target_pose_mask': pose_mask,
+ 'overlay_image_path': self.overlay_image_path
+ }
+
+ def __len__(self):
+ return len(self.points_and_pose_datas)
diff --git a/easycv/datasets/face/face_keypoint_dataset.py b/easycv/datasets/face/face_keypoint_dataset.py
new file mode 100644
index 00000000..a2c5fe11
--- /dev/null
+++ b/easycv/datasets/face/face_keypoint_dataset.py
@@ -0,0 +1,45 @@
+import copy
+import json
+import logging
+import os
+
+import cv2
+import numpy as np
+import torch
+import torch.utils.data as data
+
+from easycv.datasets.face.pipelines.face_keypoint_transform import (
+ FaceKeypointNorm, FaceKeypointRandomAugmentation, normal)
+from easycv.datasets.registry import DATASETS
+from easycv.datasets.shared.base import BaseDataset
+
+
+@DATASETS.register_module()
+class FaceKeypointDataset(BaseDataset):
+ """
+ dataset for face key points
+ """
+
+ def __init__(self, data_source, pipeline, profiling=False):
+ super(FaceKeypointDataset, self).__init__(data_source, pipeline,
+ profiling)
+ """
+ Args:
+ data_source: Data_source config dict
+ pipeline: Pipeline config list
+ profiling: If set True, will print pipeline time
+ """
+
+ def evaluate(self, outputs, evaluators, **kwargs):
+ eval_result = {}
+ for evaluator in evaluators:
+ eval_result.update(
+ evaluator.evaluate(
+ prediction_dict=outputs,
+ groundtruth_dict=self.data_source.db))
+
+ return eval_result
+
+ def __getitem__(self, idx):
+ results = self.data_source[idx]
+ return self.pipeline(results)
diff --git a/easycv/datasets/face/pipelines/__init__.py b/easycv/datasets/face/pipelines/__init__.py
new file mode 100644
index 00000000..222448ab
--- /dev/null
+++ b/easycv/datasets/face/pipelines/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .face_keypoint_transform import (FaceKeypointNorm,
+ FaceKeypointRandomAugmentation)
+
+__all__ = ['FaceKeypointRandomAugmentation', 'FaceKeypointNorm']
diff --git a/easycv/datasets/face/pipelines/face_keypoint_transform.py b/easycv/datasets/face/pipelines/face_keypoint_transform.py
new file mode 100644
index 00000000..bda83859
--- /dev/null
+++ b/easycv/datasets/face/pipelines/face_keypoint_transform.py
@@ -0,0 +1,430 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import random
+
+import cv2
+import imgaug
+import imgaug.augmenters as iaa
+import numpy as np
+
+from easycv.datasets.registry import PIPELINES
+
+DEST_SIZE = 256
+BASE_LANDMARK_NUM = 106
+ENLARGE_RATIO = 1.1
+
+CONTOUR_PARTS = [[0, 32], [1, 31], [2, 30], [3, 29], [4, 28], [5, 27], [6, 26],
+ [7, 25], [8, 24], [9, 23], [10, 22], [11, 21], [12, 20],
+ [13, 19], [14, 18], [15, 17]]
+BROW_PARTS = [[33, 46], [34, 45], [35, 44], [36, 43], [37, 42], [38, 50],
+ [39, 49], [40, 48], [41, 47]]
+EYE_PARTS = [[66, 79], [67, 78], [68, 77], [69, 76], [70, 75], [71, 82],
+ [72, 81], [73, 80], [74, 83]]
+NOSE_PARTS = [[55, 65], [56, 64], [57, 63], [58, 62], [59, 61]]
+MOUSE_PARTS = [[84, 90], [85, 89], [86, 88], [96, 100], [97, 99], [103, 101],
+ [95, 91], [94, 92]]
+IRIS_PARTS = [[104, 105]]
+MATCHED_PARTS = CONTOUR_PARTS + BROW_PARTS + EYE_PARTS + NOSE_PARTS + MOUSE_PARTS + IRIS_PARTS
+
+
+def normal():
+ """
+ Sample from a standard normal distribution, rejecting values outside the
+ 3-sigma range, and rescale so the returned value lies in (-1, +1).
+ """
+ mu, sigma = 0, 1
+ while True:
+ s = np.random.normal(mu, sigma)
+ if s < mu - 3 * sigma or s > mu + 3 * sigma:
+ continue
+ return s / 3 * sigma
+
+
+def rotate(angle, center, landmark):
+ rad = angle * np.pi / 180.0
+ alpha = np.cos(rad)
+ beta = np.sin(rad)
+ M = np.zeros((2, 3), dtype=np.float32)
+ M[0, 0] = alpha
+ M[0, 1] = beta
+ M[0, 2] = (1 - alpha) * center[0] - beta * center[1]
+ M[1, 0] = -beta
+ M[1, 1] = alpha
+ M[1, 2] = beta * center[0] + (1 - alpha) * center[1]
+
+ landmark_ = np.asarray([(M[0, 0] * x + M[0, 1] * y + M[0, 2],
+ M[1, 0] * x + M[1, 1] * y + M[1, 2])
+ for (x, y) in landmark])
+ return M, landmark_
+
+
+class OverLayGenerator:
+
+ def __init__(self, shape):
+ # 4x4
+ h_seg_len = shape[0] // 4
+ w_seg_len = shape[1] // 4
+
+ self.overlay = []
+ # 2x2 overlay
+ for i in range(3):
+ for j in range(3):
+ if i == 1 and j == 1:
+ continue
+ self.overlay.append((i * w_seg_len, j * h_seg_len,
+ 2 * w_seg_len, 2 * h_seg_len))
+
+ # 2x3 overlay
+ for i in range(3):
+ for j in range(2):
+ if i == 1:
+ continue
+ self.overlay.append((i * w_seg_len, j * h_seg_len,
+ 2 * w_seg_len, 3 * h_seg_len))
+ for i in range(2):
+ for j in range(3):
+ if j == 1:
+ continue
+ self.overlay.append((i * w_seg_len, j * h_seg_len,
+ 3 * w_seg_len, 2 * h_seg_len))
+
+ # 2x4 overlay
+ for i in range(3):
+ for j in range(1):
+ if i == 1:
+ continue
+ self.overlay.append((i * w_seg_len, j * h_seg_len,
+ 2 * w_seg_len, 4 * h_seg_len))
+ for i in range(1):
+ for j in range(3):
+ if j == 1:
+ continue
+ self.overlay.append((i * w_seg_len, j * h_seg_len,
+ 4 * w_seg_len, 2 * h_seg_len))
+
+
+class FaceKeypointsDataAugumentation:
+
+ def __init__(self, input_size):
+ # option
+ self.enable_flip = True
+ self.enable_rotate = True
+ self.input_size = input_size
+
+ # mask generator
+ coarse_salt_and_pepper_iaa = iaa.CoarseSaltAndPepper(
+ (0.25, 0.35), size_percent=(0.03125, 0.015625))
+ self.mask_generator = coarse_salt_and_pepper_iaa.mask
+
+ # overlay generator
+ self.overlay_generator = OverLayGenerator(shape=(256, 256))
+
+ # flip
+ self.mirror_map = FaceKeypointsDataAugumentation.compute_mirror_map()
+
+ @staticmethod
+ def compute_mirror_map():
+
+ mirror_map = np.array(range(0, BASE_LANDMARK_NUM), np.int32)
+ for x, y in MATCHED_PARTS:
+ mirror_map[x] = y
+ mirror_map[y] = x
+
+ return mirror_map
+
+ def aug_flip(self, img, pts, visibility, pose):
+ # pts[:, 0] = self.input_size - pts[:, 0]
+ pts[:, 0] = img.shape[1] - pts[:, 0]
+ pts = pts[self.mirror_map]
+ if visibility is not None:
+ visibility = visibility[self.mirror_map]
+ img = cv2.flip(img, 1)
+ if pose is not None:
+ # fix roll&yaw in pose
+ pose['roll'] = -pose['roll']
+ pose['yaw'] = -pose['yaw']
+
+ return img, pts, visibility, pose
+
+ def aug_rotate(self, img, pts, pose, angle):
+ center = [DEST_SIZE // 2, DEST_SIZE // 2]
+ if pose is not None:
+ # fix roll in pose
+ pose['roll'] += angle
+
+ cx, cy = center
+ M, pts = rotate(angle, (cx, cy), pts)
+
+ imgT = cv2.warpAffine(img, M, (int(img.shape[1]), int(img.shape[0])))
+
+ x1 = np.min(pts[:, 0])
+ x2 = np.max(pts[:, 0])
+ y1 = np.min(pts[:, 1])
+ y2 = np.max(pts[:, 1])
+ w = x2 - x1 + 1
+ h = y2 - y1 + 1
+ x1 = int(x1 - (ENLARGE_RATIO - 1.0) / 2.0 * w)
+ y1 = int(y1 - (ENLARGE_RATIO - 1.0) * h)
+
+ new_w = int(ENLARGE_RATIO * (1 + normal() * 0.25) * w)
+ new_h = int(ENLARGE_RATIO * (1 + normal() * 0.25) * h)
+ new_x1 = x1 + int(normal() * DEST_SIZE * 0.15)
+ new_y1 = y1 + int(normal() * DEST_SIZE * 0.15)
+ new_x2 = new_x1 + new_w
+ new_y2 = new_y1 + new_h
+
+ new_xy = new_x1, new_y1
+ pts = pts - new_xy
+
+ height, width, _ = imgT.shape
+ dx = max(0, -new_x1)
+ dy = max(0, -new_y1)
+ new_x1 = max(0, new_x1)
+ new_y1 = max(0, new_y1)
+
+ edx = max(0, new_x2 - width)
+ edy = max(0, new_y2 - height)
+ new_x2 = min(width, new_x2)
+ new_y2 = min(height, new_y2)
+
+ imgT = imgT[new_y1:new_y2, new_x1:new_x2]
+ if dx > 0 or dy > 0 or edx > 0 or edy > 0:
+ imgT = cv2.copyMakeBorder(
+ imgT,
+ dy,
+ edy,
+ dx,
+ edx,
+ cv2.BORDER_CONSTANT,
+ value=(103.94, 116.78, 123.68))
+
+ return imgT, pts, pose
+
+ def random_mask(self, img):
+ mask = self.mask_generator.draw_samples(size=img.shape)
+ mask = np.expand_dims(np.sum(mask, axis=-1) > 0, axis=-1)
+ return mask
+
+ def random_overlay(self):
+ index = np.random.choice(len(self.overlay_generator.overlay))
+ overlay = self.overlay_generator.overlay[index]
+ return overlay
+
+ def augment_blur(self, img):
+ h, w = img.shape[:2]
+ assert h == w
+ ssize = int(random.uniform(0.01, 0.5) * h)
+ aug_seq = iaa.Sequential([
+ iaa.Sometimes(
+ 1.0,
+ iaa.OneOf([
+ iaa.GaussianBlur((3, 15)),
+ iaa.AverageBlur(k=(3, 15)),
+ iaa.MedianBlur(k=(3, 15)),
+ iaa.MotionBlur((5, 25))
+ ])),
+ iaa.Resize(ssize, interpolation=imgaug.ALL),
+ iaa.Sometimes(
+ 0.6,
+ iaa.OneOf([
+ iaa.AdditiveGaussianNoise(
+ loc=0, scale=(0.0, 0.1 * 255), per_channel=0.5),
+ iaa.AdditiveLaplaceNoise(
+ loc=0, scale=(0.0, 0.1 * 255), per_channel=0.5),
+ iaa.AdditivePoissonNoise(lam=(0, 30), per_channel=0.5)
+ ])),
+ iaa.Sometimes(0.8, iaa.JpegCompression(compression=(40, 90))),
+ iaa.Resize(h),
+ ])
+
+ aug_img = aug_seq.augment_image(img)
+ return aug_img
+
+ def augment_color_temperature(self, img):
+ aug = iaa.ChangeColorTemperature((1000, 40000))
+
+ aug_img = aug.augment_image(img)
+ return aug_img
+
+ def aug_clr_noise_blur(self, img):
+ # skin&light
+ if np.random.choice((True, False), p=[0.05, 0.95]):
+ img_ycrcb_raw = cv2.cvtColor(img, cv2.COLOR_BGR2YCR_CB)
+ skin_factor_list = [0.6, 0.8, 1.0, 1.2, 1.4]
+ skin_factor = np.random.choice(skin_factor_list)
+ img_ycrcb_raw[:, :, 0:1] = np.clip(
+ img_ycrcb_raw[:, :, 0:1].astype(np.float32) * skin_factor, 0,
+ 255).astype(np.uint8)
+ img = cv2.cvtColor(img_ycrcb_raw, cv2.COLOR_YCR_CB2BGR)
+
+ # gauss blur 5%
+ if np.random.choice((True, False), p=[0.05, 0.95]):
+ sigma = np.random.choice([0.25, 0.50, 0.75])
+ gauss_blur_iaa = iaa.GaussianBlur(sigma=sigma)
+ img = gauss_blur_iaa(image=img)
+
+ # gauss noise 5%
+ if np.random.choice((True, False), p=[0.05, 0.95]):
+ scale = np.random.choice([0.01, 0.03, 0.05])
+ gauss_noise_iaa = iaa.AdditiveGaussianNoise(scale=scale * 255)
+ img = gauss_noise_iaa(image=img)
+
+ # motion blur 5%
+ if np.random.choice((True, False), p=[0.05, 0.95]):
+ angle = np.random.choice([0, 45, 90, 135, 180, 225, 270, 315])
+ motion_blur_iaa = iaa.MotionBlur(k=5, angle=angle)
+ img = motion_blur_iaa(image=img)
+
+ # jpeg compress 5%
+ if np.random.choice((True, False), p=[0.05, 0.95]):
+ jpeg_compress_iaa = iaa.JpegCompression(compression=(10, 50))
+ img = jpeg_compress_iaa(image=img)
+
+ # gamma contrast 5%
+ if np.random.choice((True, False), p=[0.05, 0.95]):
+ gamma_contrast_iaa = iaa.GammaContrast((0.85, 1.15))
+ img = gamma_contrast_iaa(image=img)
+
+ # brightness 5%
+ if np.random.choice((True, False), p=[0.05, 0.95]):
+ brightness_iaa = iaa.MultiplyAndAddToBrightness(
+ mul=(0.85, 1.15), add=(-10, 10))
+ img = brightness_iaa(image=img)
+
+ return img
+
+ def augment_set(self, img):
+ noisy_image = img.copy().astype(np.uint8)
+ if np.random.choice((True, False), p=[0.6, 0.4]):
+ aug = iaa.ChangeColorTemperature((1000, 40000))
+ noisy_image = aug.augment_image(noisy_image)
+
+ if np.random.choice((True, False), p=[0.8, 0.2]):
+ aug_seq = iaa.Sequential([
+ iaa.Sometimes(0.5, iaa.JpegCompression(compression=(40, 90))),
+ iaa.Sometimes(0.5, iaa.MotionBlur((3, 7))),
+ iaa.Sometimes(
+ 0.5,
+ iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05 * 255))),
+ ],
+ random_order=True)
+ noisy_image = aug_seq.augment_image(noisy_image)
+
+ sometimes = lambda aug: iaa.Sometimes(0.25, aug)
+ seq = iaa.Sequential([
+ sometimes(iaa.AverageBlur(k=(2, 5))),
+ sometimes(iaa.GammaContrast((0.5, 2.0)))
+ ],
+ random_order=True)
+
+ noisy_image = seq(images=noisy_image)
+ return noisy_image
+
+
+@PIPELINES.register_module()
+class FaceKeypointNorm:
+ """Data augmentation with Norm.
+ """
+
+ def __init__(self, input_size=96):
+ self.input_size = input_size
+
+ def __call__(self, results):
+ """Perform data augmentation with random image flip."""
+
+ # for key in results.get('img', []):
+ if 'img' in results.keys():
+ image = results['img']
+ h, w, c = image.shape
+ image = cv2.resize(image, (self.input_size, self.input_size))
+ results['img'] = np.array(image)
+
+ # for key in results.get('target_point', []):
+ if 'target_point' in results.keys():
+ points = results['target_point']
+ points[:, 0] = points[:, 0] / w * float(self.input_size)
+ points[:, 1] = points[:, 1] / h * float(self.input_size)
+ target_point = np.reshape(points,
+ (points.shape[0] * points.shape[1]))
+ results['target_point'] = np.array(target_point, np.float32)
+ else:
+ results['target_point'] = np.array(np.zeros(212), np.float32)
+
+ # for key in results.get('target_point_mask', []):
+ if 'target_point_mask' in results.keys():
+ points_mask = results['target_point_mask']
+ points_mask = points_mask.astype(np.float32)
+ points_mask = np.reshape(
+ points_mask, (points_mask.shape[0] * points_mask.shape[1]))
+ results['target_point_mask'] = points_mask.astype(np.float32)
+ else:
+ results['target_point_mask'] = np.array(
+ np.zeros(212), np.float32)
+
+ # for key in results.get('target_pose', []):
+ if 'target_pose' in results.keys():
+ pose = results['target_pose']
+ pose = np.asarray([pose['pitch'], pose['roll'], pose['yaw']])
+ results['target_pose'] = pose.astype(np.float32)
+ else:
+ results['target_pose'] = np.array(np.zeros(3), np.float32)
+
+ if 'target_pose_mask' not in results.keys():
+ results['target_pose_mask'] = np.array(np.zeros(3), np.float32)
+
+ return results
+
+
+@PIPELINES.register_module()
+class FaceKeypointRandomAugmentation:
+ """Data augmentation with random flip.
+ """
+
+ def __init__(self, input_size=96):
+ self.input_size = input_size
+
+ # Data Augment
+ self.data_aug = FaceKeypointsDataAugumentation(self.input_size)
+
+ def __call__(self, results):
+ """Perform data augmentation with random image flip."""
+
+ image = results['img']
+ points = results['target_point']
+ points_mask = results['target_point_mask']
+ pose = results['target_pose']
+ pose_mask = results['target_pose_mask']
+ overlay_image_path = results['overlay_image_path']
+
+ if np.random.choice((True, False), p=[0.2, 0.8]):
+ # overlay
+ overlay_pos = self.data_aug.random_overlay()
+ overlay_img_index = np.random.choice(len(overlay_image_path))
+ overlay_img_filepath = overlay_image_path[overlay_img_index]
+ overlay_img = cv2.imread(overlay_img_filepath,
+ cv2.IMREAD_UNCHANGED)
+
+ (x, y, w, h) = overlay_pos
+ x1, y1, x2, y2 = x, y, x + w, y + h
+ overlay_img = cv2.resize(overlay_img, dsize=(w, h))
+ overlay_mask = overlay_img[:, :, 3:4] / 255.0
+ image[y1:y2, x1:x2, :] = image[y1:y2, x1:x2, :] * (
+ 1 - overlay_mask) + overlay_img[:, :, 0:3] * overlay_mask
+ image = image.astype(np.uint8)
+
+ angle = pose['roll']
+ image, points, pose = self.data_aug.aug_rotate(
+ image, points, pose, angle) # counterclockwise rotate angle
+ pose['roll'] = angle # reset roll=angle
+
+ if np.random.choice((True, False)):
+ image_transform, points, _, pose = self.data_aug.aug_flip(
+ image, points, None, pose)
+ else:
+ image_transform = image
+
+ image_transform = self.data_aug.aug_clr_noise_blur(image_transform)
+
+ results['img'] = image_transform
+ results['target_point'] = points
+ results['target_pose'] = pose
+ return results
diff --git a/easycv/datasets/pose/__init__.py b/easycv/datasets/pose/__init__.py
index c56c8f13..06118419 100644
--- a/easycv/datasets/pose/__init__.py
+++ b/easycv/datasets/pose/__init__.py
@@ -1,6 +1,7 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from . import data_sources # pylint: disable=unused-import
from . import pipelines # pylint: disable=unused-import
+from .hand_coco_wholebody_dataset import HandCocoWholeBodyDataset
from .top_down import PoseTopDownDataset
-__all__ = ['PoseTopDownDataset']
+__all__ = ['PoseTopDownDataset', 'HandCocoWholeBodyDataset']
diff --git a/easycv/datasets/pose/data_sources/__init__.py b/easycv/datasets/pose/data_sources/__init__.py
index 5d240fa2..b153c7a8 100644
--- a/easycv/datasets/pose/data_sources/__init__.py
+++ b/easycv/datasets/pose/data_sources/__init__.py
@@ -1,5 +1,8 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .coco import PoseTopDownSourceCoco
+from .hand import HandCocoPoseTopDownSource
from .top_down import PoseTopDownSource
-__all__ = ['PoseTopDownSourceCoco', 'PoseTopDownSource']
+__all__ = [
+ 'PoseTopDownSourceCoco', 'PoseTopDownSource', 'HandCocoPoseTopDownSource'
+]
diff --git a/easycv/datasets/pose/data_sources/hand/__init__.py b/easycv/datasets/pose/data_sources/hand/__init__.py
new file mode 100644
index 00000000..146a1276
--- /dev/null
+++ b/easycv/datasets/pose/data_sources/hand/__init__.py
@@ -0,0 +1,3 @@
+# !/usr/bin/env python
+# -*- encoding: utf-8 -*-
+from .coco_hand import HandCocoPoseTopDownSource
diff --git a/easycv/datasets/pose/data_sources/hand/coco_hand.py b/easycv/datasets/pose/data_sources/hand/coco_hand.py
new file mode 100644
index 00000000..4cff9cfd
--- /dev/null
+++ b/easycv/datasets/pose/data_sources/hand/coco_hand.py
@@ -0,0 +1,276 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Adapt from
+# https://github.com/open-mmlab/mmpose/blob/master/mmpose/datasets/datasets/hand/hand_coco_wholebody_dataset.py
+import logging
+import os.path as osp
+
+import numpy as np
+
+from easycv.datasets.registry import DATASOURCES
+from ..top_down import PoseTopDownSource
+
+COCO_WHOLEBODY_HAND_DATASET_INFO = dict(
+ dataset_name='coco_wholebody_hand',
+ paper_info=dict(
+ author='Jin, Sheng and Xu, Lumin and Xu, Jin and '
+ 'Wang, Can and Liu, Wentao and '
+ 'Qian, Chen and Ouyang, Wanli and Luo, Ping',
+ title='Whole-Body Human Pose Estimation in the Wild',
+ container='Proceedings of the European '
+ 'Conference on Computer Vision (ECCV)',
+ year='2020',
+ homepage='https://github.com/jin-s13/COCO-WholeBody/',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''),
+ 1:
+ dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''),
+ 2:
+ dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''),
+ 3:
+ dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''),
+ 4:
+ dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''),
+ 5:
+ dict(
+ name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''),
+ 6:
+ dict(
+ name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''),
+ 7:
+ dict(
+ name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''),
+ 8:
+ dict(
+ name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''),
+ 9:
+ dict(
+ name='middle_finger1',
+ id=9,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 10:
+ dict(
+ name='middle_finger2',
+ id=10,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 11:
+ dict(
+ name='middle_finger3',
+ id=11,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 12:
+ dict(
+ name='middle_finger4',
+ id=12,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 13:
+ dict(
+ name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''),
+ 14:
+ dict(
+ name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''),
+ 15:
+ dict(
+ name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''),
+ 16:
+ dict(
+ name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''),
+ 17:
+ dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''),
+ 18:
+ dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''),
+ 19:
+ dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''),
+ 20:
+ dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]),
+ 1:
+ dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]),
+ 2:
+ dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]),
+ 5:
+ dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]),
+ 6:
+ dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]),
+ 7:
+ dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]),
+ 8:
+ dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]),
+ 9:
+ dict(
+ link=('middle_finger1', 'middle_finger2'),
+ id=9,
+ color=[102, 178, 255]),
+ 10:
+ dict(
+ link=('middle_finger2', 'middle_finger3'),
+ id=10,
+ color=[102, 178, 255]),
+ 11:
+ dict(
+ link=('middle_finger3', 'middle_finger4'),
+ id=11,
+ color=[102, 178, 255]),
+ 12:
+ dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]),
+ 13:
+ dict(
+ link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]),
+ 14:
+ dict(
+ link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]),
+ 15:
+ dict(
+ link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]),
+ 16:
+ dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]),
+ 17:
+ dict(
+ link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]),
+ 18:
+ dict(
+ link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]),
+ 19:
+ dict(
+ link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0])
+ },
+ joint_weights=[1.] * 21,
+ sigmas=[
+ 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035, 0.018,
+ 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019, 0.022,
+ 0.031
+ ])
+
+
+@DATASOURCES.register_module()
+class HandCocoPoseTopDownSource(PoseTopDownSource):
+ """Coco Whole-Body-Hand Source for top-down hand pose estimation.
+
+ "Whole-Body Human Pose Estimation in the Wild", ECCV'2020.
+ More details can be found in the paper.
+
+ The dataset loads raw features and applies the specified transforms
+ to return a dict containing the image tensors and other information.
+
+ COCO-WholeBody Hand keypoint indexes::
+
+ 0: 'wrist',
+ 1: 'thumb1',
+ 2: 'thumb2',
+ 3: 'thumb3',
+ 4: 'thumb4',
+ 5: 'forefinger1',
+ 6: 'forefinger2',
+ 7: 'forefinger3',
+ 8: 'forefinger4',
+ 9: 'middle_finger1',
+ 10: 'middle_finger2',
+ 11: 'middle_finger3',
+ 12: 'middle_finger4',
+ 13: 'ring_finger1',
+ 14: 'ring_finger2',
+ 15: 'ring_finger3',
+ 16: 'ring_finger4',
+ 17: 'pinky_finger1',
+ 18: 'pinky_finger2',
+ 19: 'pinky_finger3',
+ 20: 'pinky_finger4'
+
+ Args:
+ ann_file (str): Path to the annotation file.
+ img_prefix (str): Path to a directory where images are held.
+ Default: None.
+ data_cfg (dict): Data config dict.
+ dataset_info (DatasetInfo): A class containing all dataset info.
+ test_mode (bool): Store True when building test or
+ validation dataset. Default: False.
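+
+ Example (config sketch; the ``ann_file``/``img_prefix`` paths and the
+ ``data_cfg`` values below are illustrative placeholders)::
+
+ data_source = dict(
+ type='HandCocoPoseTopDownSource',
+ ann_file='annotations/coco_wholebody_train.json',
+ img_prefix='train2017/',
+ data_cfg=dict(
+ image_size=[256, 256],
+ heatmap_size=[64, 64],
+ num_output_channels=21,
+ num_joints=21))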
+ """
+
+ def __init__(self,
+ ann_file,
+ img_prefix,
+ data_cfg,
+ dataset_info=None,
+ test_mode=False):
+
+ if dataset_info is None:
+ logging.info(
+ 'dataset_info is missing, using default COCO-WholeBody hand dataset info'
+ )
+ dataset_info = COCO_WHOLEBODY_HAND_DATASET_INFO
+
+ super().__init__(
+ ann_file,
+ img_prefix,
+ data_cfg,
+ dataset_info=dataset_info,
+ test_mode=test_mode)
+
+ self.ann_info['use_different_joint_weights'] = False
+ self.db = self._get_db()
+
+ print(f'=> num_images: {self.num_images}')
+ print(f'=> load {len(self.db)} samples')
+
+ def _get_db(self):
+ """Load dataset."""
+ gt_db = []
+ bbox_id = 0
+ num_joints = self.ann_info['num_joints']
+ for img_id in self.img_ids:
+
+ ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False)
+ objs = self.coco.loadAnns(ann_ids)
+
+ for obj in objs:
+ for type in ['left', 'right']:
+ if obj[f'{type}hand_valid'] and max(
+ obj[f'{type}hand_kpts']) > 0:
+ joints_3d = np.zeros((num_joints, 3), dtype=np.float32)
+ joints_3d_visible = np.zeros((num_joints, 3),
+ dtype=np.float32)
+
+ keypoints = np.array(obj[f'{type}hand_kpts']).reshape(
+ -1, 3)
+ joints_3d[:, :2] = keypoints[:, :2]
+ joints_3d_visible[:, :2] = np.minimum(
+ 1, keypoints[:, 2:3])
+
+ image_file = osp.join(self.img_prefix,
+ self.id2name[img_id])
+ center, scale = self._xywh2cs(
+ *obj[f'{type}hand_box'][:4])
+ gt_db.append({
+ 'image_file': image_file,
+ 'image_id': img_id,
+ 'rotation': 0,
+ 'center': center,
+ 'scale': scale,
+ 'joints_3d': joints_3d,
+ 'joints_3d_visible': joints_3d_visible,
+ 'dataset': self.dataset_name,
+ 'bbox': obj[f'{type}hand_box'],
+ 'bbox_score': 1,
+ 'bbox_id': bbox_id
+ })
+ bbox_id = bbox_id + 1
+ gt_db = sorted(gt_db, key=lambda x: x['bbox_id'])
+
+ return gt_db
diff --git a/easycv/datasets/pose/hand_coco_wholebody_dataset.py b/easycv/datasets/pose/hand_coco_wholebody_dataset.py
new file mode 100644
index 00000000..3084ba02
--- /dev/null
+++ b/easycv/datasets/pose/hand_coco_wholebody_dataset.py
@@ -0,0 +1,70 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Adapt from
+# https://github.com/open-mmlab/mmpose/blob/master/mmpose/datasets/datasets/hand/hand_coco_wholebody_dataset.py
+
+from easycv.core.evaluation.keypoint_eval import KeyPointEvaluator
+from easycv.datasets.pose.data_sources.coco import PoseTopDownSource
+from easycv.datasets.registry import DATASETS
+from easycv.datasets.shared.base import BaseDataset
+
+
+@DATASETS.register_module()
+class HandCocoWholeBodyDataset(BaseDataset):
+ """CocoWholeBodyDataset for top-down hand pose estimation.
+
+ Args:
+ data_source (dict): Data source config dict
+ pipeline (list): Pipeline config list
+ profiling (bool): If set True, print the time consumed by the pipeline
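+
+ Example (config sketch; ``data_source`` and ``train_pipeline`` are
+ placeholders assumed to be defined elsewhere in the config)::
+
+ dataset = dict(
+ type='HandCocoWholeBodyDataset',
+ data_source=data_source,
+ pipeline=train_pipeline)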
+ """
+
+ def __init__(self, data_source, pipeline, profiling=False):
+ super(HandCocoWholeBodyDataset, self).__init__(data_source, pipeline,
+ profiling)
+
+ if not isinstance(self.data_source, PoseTopDownSource):
+ raise ValueError('Only `PoseTopDownSource` is supported, but got %s' %
+ self.data_source)
+
+ def evaluate(self, outputs, evaluators, **kwargs):
+ if len(evaluators) > 1 or not isinstance(evaluators[0],
+ KeyPointEvaluator):
+ raise ValueError(
+ 'HandCocoWholeBodyDataset only supports one `KeyPointEvaluator` now, '
+ 'but got %s' % evaluators)
+ evaluator = evaluators[0]
+
+ image_ids = outputs['image_ids']
+ preds = outputs['preds']
+ boxes = outputs['boxes']
+ bbox_ids = outputs['bbox_ids']
+
+ kpts = []
+ for i, image_id in enumerate(image_ids):
+ kpts.append({
+ 'keypoints': preds[i],
+ 'center': boxes[i][0:2],
+ 'scale': boxes[i][2:4],
+ 'area': boxes[i][4],
+ 'score': boxes[i][5],
+ 'image_id': image_id,
+ 'bbox_id': bbox_ids[i]
+ })
+ kpts = self._sort_and_unique_bboxes(kpts)
+ eval_res = evaluator.evaluate(kpts, self.data_source.db)
+ return eval_res
+
+ def _sort_and_unique_bboxes(self, kpts, key='bbox_id'):
+ """sort kpts and remove the repeated ones."""
+ kpts = sorted(kpts, key=lambda x: x[key])
+ num = len(kpts)
+ for i in range(num - 1, 0, -1):
+ if kpts[i][key] == kpts[i - 1][key]:
+ del kpts[i]
+
+ return kpts
+
+ def __getitem__(self, idx):
+ """Get the sample given index."""
+ results = self.data_source[idx]
+ return self.pipeline(results)
diff --git a/easycv/datasets/pose/pipelines/transforms.py b/easycv/datasets/pose/pipelines/transforms.py
index 79f8d6c4..27c7c325 100644
--- a/easycv/datasets/pose/pipelines/transforms.py
+++ b/easycv/datasets/pose/pipelines/transforms.py
@@ -1,9 +1,9 @@
# Copyright (c) OpenMMLab. All rights reserved.
# Adapt from https://github.com/open-mmlab/mmpose/blob/master/mmpose/datasets/pipelines/top_down_transform.py
+
import cv2
import numpy as np
from mmcv.parallel import DataContainer as DC
-from torchvision.transforms import functional as F
from easycv.core.post_processing import (affine_transform, fliplr_joints,
get_affine_transform, get_warp_matrix,
diff --git a/easycv/datasets/shared/pipelines/__init__.py b/easycv/datasets/shared/pipelines/__init__.py
index 2d1f6332..7a285840 100644
--- a/easycv/datasets/shared/pipelines/__init__.py
+++ b/easycv/datasets/shared/pipelines/__init__.py
@@ -4,4 +4,4 @@
DaliImageDecoder, DaliRandomGrayscale,
DaliRandomResizedCrop, DaliResize)
from .format import Collect, DefaultFormatBundle, ImageToTensor
-from .transforms import Compose
+from .transforms import Compose, LoadImage
diff --git a/easycv/datasets/shared/pipelines/transforms.py b/easycv/datasets/shared/pipelines/transforms.py
index d10a1330..7de5c0a9 100644
--- a/easycv/datasets/shared/pipelines/transforms.py
+++ b/easycv/datasets/shared/pipelines/transforms.py
@@ -2,7 +2,10 @@
import time
from collections.abc import Sequence
+import numpy as np
+
from easycv.datasets.registry import PIPELINES
+from easycv.file.image import load_image
from easycv.utils.registry import build_from_cfg
@@ -48,3 +51,49 @@ def __repr__(self):
format_string += f'\n {t}'
format_string += '\n)'
return format_string
+
+
+@PIPELINES.register_module()
+class LoadImage:
+ """Load an image from file or numpy or PIL object.
+ Args:
+ to_float32 (bool): Whether to convert the loaded image to a float32
+ numpy array. If set to False, the loaded image is an uint8 array.
+ Defaults to False.
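+
+ Example (usage sketch; ``demo.jpg`` is a placeholder path)::
+
+ >>> load = LoadImage()
+ >>> results = load(dict(filename='demo.jpg'))
+ >>> # results['img'] is a numpy array; uint8 unless to_float32=True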
+ """
+
+ def __init__(self, to_float32=False, mode='bgr'):
+ self.to_float32 = to_float32
+ self.mode = mode
+
+ def __call__(self, results):
+ """Call functions to load image and get image meta information.
+ Returns:
+ dict: The dict contains loaded image and meta information.
+ """
+ filename = results.get('filename', None)
+ img = results.get('img', None)
+
+ if img is not None:
+ if not isinstance(img, np.ndarray):
+ img = np.asarray(img, dtype=np.uint8)
+ else:
+ assert filename is not None, 'Please provide "filename" or "img"!'
+ img = load_image(filename, mode=self.mode)
+
+ if self.to_float32:
+ img = img.astype(np.float32)
+
+ results['filename'] = filename
+ results['img'] = img
+ results['img_shape'] = img.shape
+ results['ori_shape'] = img.shape
+ results['img_fields'] = ['img']
+ return results
+
+ def __repr__(self):
+ repr_str = (f'{self.__class__.__name__}('
+ f'to_float32={self.to_float32}, '
+ f"mode='{self.mode}'")
+
+ return repr_str
diff --git a/easycv/models/__init__.py b/easycv/models/__init__.py
index 1890a46b..d125236d 100644
--- a/easycv/models/__init__.py
+++ b/easycv/models/__init__.py
@@ -3,6 +3,7 @@
from .builder import build_backbone, build_head, build_loss, build_model
from .classification import *
from .detection import *
+from .face import *
from .heads import *
from .loss import *
from .pose import TopDown
diff --git a/easycv/models/backbones/__init__.py b/easycv/models/backbones/__init__.py
index f0be50ae..c59a74fb 100644
--- a/easycv/models/backbones/__init__.py
+++ b/easycv/models/backbones/__init__.py
@@ -4,11 +4,13 @@
from .conv_mae_vit import FastConvMAEViT
from .conv_vitdet import ConvViTDet
from .efficientformer import EfficientFormer
+from .face_keypoint_backbone import FaceKeypointBackbone
from .genet import PlainNet
from .hrnet import HRNet
from .inceptionv3 import Inception3
from .lighthrnet import LiteHRNet
from .mae_vit_transformer import *
+from .mit import MixVisionTransformer
from .mnasnet import MNASNet
from .mobilenetv2 import MobileNetV2
from .pytorch_image_models_wrapper import *
diff --git a/easycv/models/backbones/face_keypoint_backbone.py b/easycv/models/backbones/face_keypoint_backbone.py
new file mode 100644
index 00000000..32ef1ac3
--- /dev/null
+++ b/easycv/models/backbones/face_keypoint_backbone.py
@@ -0,0 +1,90 @@
+import torch.nn as nn
+
+from easycv.models.registry import BACKBONES
+from easycv.models.utils.face_keypoint_utils import InvertedResidual, Residual
+
+
+@BACKBONES.register_module
+class FaceKeypointBackbone(nn.Module):
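+ """Lightweight convolutional backbone for face keypoint regression.
+
+ The network stacks plain ``Residual`` blocks (12 and 24 channels) followed
+ by ``InvertedResidual`` blocks (48 channels), with four stride-2
+ convolutions in total; for the 96x96 face crop used by the face keypoint
+ configs, the output is the 48x3x3 feature map consumed by
+ ``FaceKeypointHead``.
+ """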
+
+ def __init__(self,
+ in_channels=3,
+ out_channels=48,
+ residual_activation='relu',
+ inverted_activation='half_v2',
+ inverted_expand_ratio=2):
+ super(FaceKeypointBackbone, self).__init__()
+ self.conv1 = Residual(in_channels, 12, 3, 2, 0)
+ self.conv2 = Residual(12, 12, 3, 1, 0, activation=residual_activation)
+ self.conv3 = Residual(12, 12, 3, 1, 1, activation=residual_activation)
+ self.conv4 = Residual(12, 12, 3, 1, 0, activation=residual_activation)
+ self.conv5 = Residual(12, 24, 3, 2, 0, activation=residual_activation)
+ self.conv6 = Residual(24, 24, 3, 1, 0, activation=residual_activation)
+ self.conv7 = Residual(24, 24, 3, 1, 1, activation=residual_activation)
+ self.conv8 = Residual(24, 24, 3, 1, 1, activation=residual_activation)
+ self.conv9 = InvertedResidual(
+ 24,
+ 48,
+ 3,
+ 2,
+ 0,
+ expand_ratio=inverted_expand_ratio,
+ activation=inverted_activation)
+ self.conv10 = InvertedResidual(
+ 48,
+ 48,
+ 3,
+ 1,
+ 0,
+ expand_ratio=inverted_expand_ratio,
+ activation=inverted_activation)
+ self.conv11 = InvertedResidual(
+ 48,
+ 48,
+ 3,
+ 1,
+ 1,
+ expand_ratio=inverted_expand_ratio,
+ activation=inverted_activation)
+ self.conv12 = InvertedResidual(
+ 48,
+ 48,
+ 3,
+ 1,
+ 1,
+ expand_ratio=inverted_expand_ratio,
+ activation=inverted_activation)
+ self.conv13 = InvertedResidual(
+ 48,
+ 48,
+ 3,
+ 1,
+ 1,
+ expand_ratio=inverted_expand_ratio,
+ activation=inverted_activation)
+ self.conv14 = InvertedResidual(
+ 48,
+ out_channels,
+ 3,
+ 2,
+ 0,
+ expand_ratio=inverted_expand_ratio,
+ activation=inverted_activation)
+
+ def forward(self, x):
+ x1 = self.conv1(x)
+ x2 = self.conv2(x1)
+ x3 = self.conv3(x2)
+ x4 = self.conv4(x3)
+ x5 = self.conv5(x4)
+ x6 = self.conv6(x5)
+ x7 = self.conv7(x6)
+ x8 = self.conv8(x7)
+ x9 = self.conv9(x8)
+ x10 = self.conv10(x9)
+ x11 = self.conv11(x10)
+ x12 = self.conv12(x11)
+ x13 = self.conv13(x12)
+ x14 = self.conv14(x13)
+
+ return x14
diff --git a/easycv/models/backbones/mit.py b/easycv/models/backbones/mit.py
new file mode 100644
index 00000000..a9cf2c01
--- /dev/null
+++ b/easycv/models/backbones/mit.py
@@ -0,0 +1,443 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Adapt from: https://github.com/open-mmlab/mmsegmentation/blob/2d66179630035097dcae08ee958f60d4b5a7fcae/mmseg/models/backbones/mit.py
+import math
+import warnings
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import Conv2d, build_activation_layer, build_norm_layer
+from mmcv.cnn.bricks.drop import build_dropout
+from mmcv.cnn.bricks.transformer import MultiheadAttention
+from mmcv.cnn.utils.weight_init import (constant_init, normal_init,
+ trunc_normal_init)
+from mmcv.runner import BaseModule, ModuleList, Sequential
+
+from easycv.models.registry import BACKBONES
+from easycv.models.segmentation.utils import (PatchEmbed, nchw_to_nlc,
+ nlc_to_nchw)
+
+
+class MixFFN(BaseModule):
+ """An implementation of MixFFN of Segformer.
+
+ The differences between MixFFN & FFN:
+ 1. Use 1X1 Conv to replace Linear layer.
+ 2. Introduce 3X3 Conv to encode positional information.
+ Args:
+ embed_dims (int): The feature dimension. Same as
+ `MultiheadAttention`. Defaults: 256.
+ feedforward_channels (int): The hidden dimension of FFNs.
+ Defaults: 1024.
+ act_cfg (dict, optional): The activation config for FFNs.
+ Default: dict(type='ReLU')
+ ffn_drop (float, optional): Probability of an element to be
+ zeroed in FFN. Default 0.0.
+ dropout_layer (obj:`ConfigDict`): The dropout_layer used
+ when adding the shortcut.
+ init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+ Default: None.
+ """
+
+ def __init__(self,
+ embed_dims,
+ feedforward_channels,
+ act_cfg=dict(type='GELU'),
+ ffn_drop=0.,
+ dropout_layer=None,
+ init_cfg=None):
+ super(MixFFN, self).__init__(init_cfg)
+
+ self.embed_dims = embed_dims
+ self.feedforward_channels = feedforward_channels
+ self.act_cfg = act_cfg
+ self.activate = build_activation_layer(act_cfg)
+
+ in_channels = embed_dims
+ fc1 = Conv2d(
+ in_channels=in_channels,
+ out_channels=feedforward_channels,
+ kernel_size=1,
+ stride=1,
+ bias=True)
+ # 3x3 depth wise conv to provide positional encode information
+ pe_conv = Conv2d(
+ in_channels=feedforward_channels,
+ out_channels=feedforward_channels,
+ kernel_size=3,
+ stride=1,
+ padding=(3 - 1) // 2,
+ bias=True,
+ groups=feedforward_channels)
+ fc2 = Conv2d(
+ in_channels=feedforward_channels,
+ out_channels=in_channels,
+ kernel_size=1,
+ stride=1,
+ bias=True)
+ drop = nn.Dropout(ffn_drop)
+ layers = [fc1, pe_conv, self.activate, drop, fc2, drop]
+ self.layers = Sequential(*layers)
+ self.dropout_layer = build_dropout(
+ dropout_layer) if dropout_layer else torch.nn.Identity()
+
+ def forward(self, x, hw_shape, identity=None):
+ out = nlc_to_nchw(x, hw_shape)
+ out = self.layers(out)
+ out = nchw_to_nlc(out)
+ if identity is None:
+ identity = x
+ return identity + self.dropout_layer(out)
+
+
+class EfficientMultiheadAttention(MultiheadAttention):
+ """An implementation of Efficient Multi-head Attention of Segformer.
+
+ This module is modified from MultiheadAttention which is a module from
+ mmcv.cnn.bricks.transformer.
+ Args:
+ embed_dims (int): The embedding dimension.
+ num_heads (int): Parallel attention heads.
+ attn_drop (float): A Dropout layer on attn_output_weights.
+ Default: 0.0.
+ proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
+ Default: 0.0.
+ dropout_layer (obj:`ConfigDict`): The dropout_layer used
+ when adding the shortcut. Default: None.
+ init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+ Default: None.
+ batch_first (bool): Key, Query and Value are shape of
+ (batch, n, embed_dim)
+ or (n, batch, embed_dim). Default: True.
+ qkv_bias (bool): Enable bias for qkv if True. Default: False.
+ norm_cfg (dict): Config dict for normalization layer.
+ Default: dict(type='LN').
+ sr_ratio (int): The ratio of spatial reduction of Efficient Multi-head
+ Attention of Segformer. Default: 1.
+ """
+
+ def __init__(self,
+ embed_dims,
+ num_heads,
+ attn_drop=0.,
+ proj_drop=0.,
+ dropout_layer=None,
+ init_cfg=None,
+ batch_first=True,
+ qkv_bias=False,
+ norm_cfg=dict(type='LN'),
+ sr_ratio=1):
+ super().__init__(
+ embed_dims,
+ num_heads,
+ attn_drop,
+ proj_drop,
+ dropout_layer=dropout_layer,
+ init_cfg=init_cfg,
+ batch_first=batch_first,
+ bias=qkv_bias)
+
+ self.sr_ratio = sr_ratio
+ if sr_ratio > 1:
+ self.sr = Conv2d(
+ in_channels=embed_dims,
+ out_channels=embed_dims,
+ kernel_size=sr_ratio,
+ stride=sr_ratio)
+ # The ret[0] of build_norm_layer is norm name.
+ self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
+
+ def forward(self, x, hw_shape, identity=None):
+
+ x_q = x
+ if self.sr_ratio > 1:
+ x_kv = nlc_to_nchw(x, hw_shape)
+ x_kv = self.sr(x_kv)
+ x_kv = nchw_to_nlc(x_kv)
+ x_kv = self.norm(x_kv)
+ else:
+ x_kv = x
+
+ if identity is None:
+ identity = x_q
+
+ # Because the dataflow('key', 'query', 'value') of
+ # ``torch.nn.MultiheadAttention`` is (num_query, batch,
+ # embed_dims), We should adjust the shape of dataflow from
+ # batch_first (batch, num_query, embed_dims) to num_query_first
+ # (num_query ,batch, embed_dims), and recover ``attn_output``
+ # from num_query_first to batch_first.
+ if self.batch_first:
+ x_q = x_q.transpose(0, 1)
+ x_kv = x_kv.transpose(0, 1)
+
+ out = self.attn(query=x_q, key=x_kv, value=x_kv)[0]
+
+ if self.batch_first:
+ out = out.transpose(0, 1)
+
+ return identity + self.dropout_layer(self.proj_drop(out))
+
+ def legacy_forward(self, x, hw_shape, identity=None):
+ """multi head attention forward in mmcv version < 1.3.17."""
+
+ x_q = x
+ if self.sr_ratio > 1:
+ x_kv = nlc_to_nchw(x, hw_shape)
+ x_kv = self.sr(x_kv)
+ x_kv = nchw_to_nlc(x_kv)
+ x_kv = self.norm(x_kv)
+ else:
+ x_kv = x
+
+ if identity is None:
+ identity = x_q
+
+ # `need_weights=True` will let nn.MultiHeadAttention
+ # `return attn_output, attn_output_weights.sum(dim=1) / num_heads`
+ # The `attn_output_weights.sum(dim=1)` may cause cuda error. So, we set
+ # `need_weights=False` to ignore `attn_output_weights.sum(dim=1)`.
+ # This issue - `https://github.com/pytorch/pytorch/issues/37583` report
+ # the error that large scale tensor sum operation may cause cuda error.
+ out = self.attn(query=x_q, key=x_kv, value=x_kv, need_weights=False)[0]
+
+ return identity + self.dropout_layer(self.proj_drop(out))
+
+
+class TransformerEncoderLayer(BaseModule):
+ """Implements one encoder layer in Segformer.
+
+ Args:
+ embed_dims (int): The feature dimension.
+ num_heads (int): Parallel attention heads.
+ feedforward_channels (int): The hidden dimension for FFNs.
+ drop_rate (float): Probability of an element to be zeroed
+ after the feed forward layer. Default 0.0.
+ attn_drop_rate (float): The drop out rate for attention layer.
+ Default 0.0.
+ drop_path_rate (float): stochastic depth rate. Default 0.0.
+ qkv_bias (bool): enable bias for qkv if True.
+ Default: True.
+ act_cfg (dict): The activation config for FFNs.
+ Default: dict(type='GELU').
+ norm_cfg (dict): Config dict for normalization layer.
+ Default: dict(type='LN').
+ batch_first (bool): Key, Query and Value are shape of
+ (batch, n, embed_dim)
+ or (n, batch, embed_dim). Default: True.
+ init_cfg (dict, optional): Initialization config dict.
+ Default:None.
+ sr_ratio (int): The ratio of spatial reduction of Efficient Multi-head
+ Attention of Segformer. Default: 1.
+ with_cp (bool): Use checkpoint or not. Using checkpoint will save
+ some memory while slowing down the training speed. Default: False.
+ """
+
+ def __init__(self,
+ embed_dims,
+ num_heads,
+ feedforward_channels,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.,
+ qkv_bias=True,
+ act_cfg=dict(type='GELU'),
+ norm_cfg=dict(type='LN'),
+ batch_first=True,
+ sr_ratio=1,
+ with_cp=False):
+ super(TransformerEncoderLayer, self).__init__()
+
+ # The ret[0] of build_norm_layer is norm name.
+ self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1]
+
+ self.attn = EfficientMultiheadAttention(
+ embed_dims=embed_dims,
+ num_heads=num_heads,
+ attn_drop=attn_drop_rate,
+ proj_drop=drop_rate,
+ dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
+ batch_first=batch_first,
+ qkv_bias=qkv_bias,
+ norm_cfg=norm_cfg,
+ sr_ratio=sr_ratio)
+
+ # The ret[0] of build_norm_layer is norm name.
+ self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1]
+
+ self.ffn = MixFFN(
+ embed_dims=embed_dims,
+ feedforward_channels=feedforward_channels,
+ ffn_drop=drop_rate,
+ dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
+ act_cfg=act_cfg)
+
+ self.with_cp = with_cp
+
+ def forward(self, x, hw_shape):
+
+ def _inner_forward(x):
+ x = self.attn(self.norm1(x), hw_shape, identity=x)
+ x = self.ffn(self.norm2(x), hw_shape, identity=x)
+ return x
+
+ if self.with_cp and x.requires_grad:
+ x = cp.checkpoint(_inner_forward, x)
+ else:
+ x = _inner_forward(x)
+ return x
+
+
+@BACKBONES.register_module()
+class MixVisionTransformer(BaseModule):
+ """The backbone of Segformer.
+
+ This backbone is the implementation of "SegFormer: Simple and
+ Efficient Design for Semantic Segmentation with
+ Transformers".
+ Args:
+ in_channels (int): Number of input channels. Default: 3.
+ embed_dims (int): Embedding dimension. Default: 768.
+ num_stages (int): The number of stages. Default: 4.
+ num_layers (Sequence[int]): The layer number of each transformer encode
+ layer. Default: [3, 4, 6, 3].
+ num_heads (Sequence[int]): The attention heads of each transformer
+ encode layer. Default: [1, 2, 4, 8].
+ patch_sizes (Sequence[int]): The patch_size of each overlapped patch
+ embedding. Default: [7, 3, 3, 3].
+ strides (Sequence[int]): The stride of each overlapped patch embedding.
+ Default: [4, 2, 2, 2].
+ sr_ratios (Sequence[int]): The spatial reduction rate of each
+ transformer encode layer. Default: [8, 4, 2, 1].
+ out_indices (Sequence[int] | int): Output from which stages.
+ Default: (0, 1, 2, 3).
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim.
+ Default: 4.
+ qkv_bias (bool): Enable bias for qkv if True. Default: True.
+ drop_rate (float): Probability of an element to be zeroed.
+ Default 0.0
+ attn_drop_rate (float): The drop out rate for attention layer.
+ Default 0.0
+ drop_path_rate (float): stochastic depth rate. Default 0.0
+ norm_cfg (dict): Config dict for normalization layer.
+ Default: dict(type='LN')
+ act_cfg (dict): The activation config for FFNs.
+ Default: dict(type='GELU').
+ pretrained (str, optional): model pretrained path. Default: None.
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Default: None.
+ with_cp (bool): Use checkpoint or not. Using checkpoint will save
+ some memory while slowing down the training speed. Default: False.
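+
+ Example (shape sketch with the default arguments; the listed shapes are
+ what the forward pass below produces for a 224x224 input)::
+
+ >>> import torch
+ >>> self = MixVisionTransformer()
+ >>> outs = self(torch.randn(1, 3, 224, 224))
+ >>> [tuple(o.shape) for o in outs]
+ [(1, 64, 56, 56), (1, 128, 28, 28), (1, 256, 14, 14), (1, 512, 7, 7)]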
+ """
+
+ def __init__(self,
+ in_channels=3,
+ embed_dims=64,
+ num_stages=4,
+ num_layers=[3, 4, 6, 3],
+ num_heads=[1, 2, 4, 8],
+ patch_sizes=[7, 3, 3, 3],
+ strides=[4, 2, 2, 2],
+ sr_ratios=[8, 4, 2, 1],
+ out_indices=(0, 1, 2, 3),
+ mlp_ratio=4,
+ qkv_bias=True,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.,
+ act_cfg=dict(type='GELU'),
+ norm_cfg=dict(type='LN', eps=1e-6),
+ pretrained=None,
+ init_cfg=None,
+ with_cp=False):
+ super(MixVisionTransformer, self).__init__(init_cfg=init_cfg)
+
+ assert not (init_cfg and pretrained), \
+ 'init_cfg and pretrained cannot be set at the same time'
+ if isinstance(pretrained, str):
+ warnings.warn('DeprecationWarning: pretrained is deprecated, '
+ 'please use "init_cfg" instead')
+ self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+ elif pretrained is not None:
+ raise TypeError('pretrained must be a str or None')
+
+ self.embed_dims = embed_dims
+ self.num_stages = num_stages
+ self.num_layers = num_layers
+ self.num_heads = num_heads
+ self.patch_sizes = patch_sizes
+ self.strides = strides
+ self.sr_ratios = sr_ratios
+ self.with_cp = with_cp
+ assert num_stages == len(num_layers) == len(num_heads) \
+ == len(patch_sizes) == len(strides) == len(sr_ratios)
+
+ self.out_indices = out_indices
+ assert max(out_indices) < self.num_stages
+
+ # transformer encoder
+ dpr = [
+ x.item()
+ for x in torch.linspace(0, drop_path_rate, sum(num_layers))
+ ] # stochastic depth decay rule
+
+ cur = 0
+ self.layers = ModuleList()
+ for i, num_layer in enumerate(num_layers):
+ embed_dims_i = embed_dims * num_heads[i]
+ patch_embed = PatchEmbed(
+ in_channels=in_channels,
+ embed_dims=embed_dims_i,
+ kernel_size=patch_sizes[i],
+ stride=strides[i],
+ padding=patch_sizes[i] // 2,
+ norm_cfg=norm_cfg)
+ layer = ModuleList([
+ TransformerEncoderLayer(
+ embed_dims=embed_dims_i,
+ num_heads=num_heads[i],
+ feedforward_channels=mlp_ratio * embed_dims_i,
+ drop_rate=drop_rate,
+ attn_drop_rate=attn_drop_rate,
+ drop_path_rate=dpr[cur + idx],
+ qkv_bias=qkv_bias,
+ act_cfg=act_cfg,
+ norm_cfg=norm_cfg,
+ with_cp=with_cp,
+ sr_ratio=sr_ratios[i]) for idx in range(num_layer)
+ ])
+ in_channels = embed_dims_i
+ # The ret[0] of build_norm_layer is norm name.
+ norm = build_norm_layer(norm_cfg, embed_dims_i)[1]
+ self.layers.append(ModuleList([patch_embed, layer, norm]))
+ cur += num_layer
+
+ def init_weights(self):
+ if self.init_cfg is None:
+ for m in self.modules():
+ if isinstance(m, nn.Linear):
+ trunc_normal_init(m, std=.02, bias=0.)
+ elif isinstance(m, nn.LayerNorm):
+ constant_init(m, val=1.0, bias=0.)
+ elif isinstance(m, nn.Conv2d):
+ fan_out = m.kernel_size[0] * m.kernel_size[
+ 1] * m.out_channels
+ fan_out //= m.groups
+ normal_init(
+ m, mean=0, std=math.sqrt(2.0 / fan_out), bias=0)
+ else:
+ super(MixVisionTransformer, self).init_weights()
+
+ def forward(self, x):
+ outs = []
+
+ for i, layer in enumerate(self.layers):
+ x, hw_shape = layer[0](x)
+ for block in layer[1]:
+ x = block(x, hw_shape)
+ x = layer[2](x)
+ x = nlc_to_nchw(x, hw_shape)
+ if i in self.out_indices:
+ outs.append(x)
+
+ return outs
diff --git a/easycv/models/face/__init__.py b/easycv/models/face/__init__.py
new file mode 100644
index 00000000..d7782486
--- /dev/null
+++ b/easycv/models/face/__init__.py
@@ -0,0 +1,2 @@
+from .face_keypoint import FaceKeypoint
+from .head import *
diff --git a/easycv/models/face/face_keypoint.py b/easycv/models/face/face_keypoint.py
new file mode 100644
index 00000000..42268ba2
--- /dev/null
+++ b/easycv/models/face/face_keypoint.py
@@ -0,0 +1,103 @@
+import mmcv
+import numpy as np
+
+from easycv.models import builder
+from easycv.models.base import BaseModel
+from easycv.models.builder import MODELS
+from easycv.models.utils.face_keypoint_utils import (get_keypoint_accuracy,
+ get_pose_accuracy)
+
+
+@MODELS.register_module()
+class FaceKeypoint(BaseModel):
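+ """Face keypoint model with an optional head-pose branch.
+
+ The backbone feature is optionally passed through ``neck`` and then fed to
+ ``keypoint_head`` (2D landmark regression) and ``pose_head`` (head pose
+ angles, converted from radians to degrees before computing the loss).
+ """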
+
+ def __init__(self,
+ backbone,
+ neck=None,
+ keypoint_head=None,
+ pose_head=None,
+ pretrained=None,
+ loss_keypoint=None,
+ loss_pose=None):
+ super().__init__()
+ self.pretrained = pretrained
+
+ self.backbone = builder.build_backbone(backbone)
+
+ if neck is not None:
+ self.neck = builder.build_neck(neck)
+
+ if keypoint_head is not None:
+ if 'loss_keypoint' not in keypoint_head and loss_keypoint is not None:
+ keypoint_head['loss_keypoint'] = loss_keypoint
+ self.keypoint_head = builder.build_head(keypoint_head)
+
+ if pose_head is not None:
+ if 'loss_pose' not in pose_head and loss_pose is not None:
+ pose_head['loss_pose'] = loss_pose
+ self.pose_head = builder.build_head(pose_head)
+
+ @property
+ def with_neck(self):
+ """Check if has keypoint_head."""
+ return hasattr(self, 'neck')
+
+ @property
+ def with_keypoint(self):
+ """Check if has keypoint_head."""
+ return hasattr(self, 'keypoint_head')
+
+ @property
+ def with_pose(self):
+ """Check if has pose_head."""
+ return hasattr(self, 'pose_head')
+
+ def forward_train(self, img, target_point, target_point_mask, target_pose,
+ target_pose_mask, **kwargs):
+ """Defines the computation performed at every call when training."""
+ output = self.backbone(img)
+
+ if self.with_neck:
+ output = self.neck(output)
+ if self.with_keypoint:
+ output_points = self.keypoint_head(output)
+ if self.with_pose:
+ output_pose = self.pose_head(output)
+
+ target_point = target_point * target_point_mask
+ target_pose = target_pose * target_pose_mask
+
+ losses = dict()
+ if self.with_keypoint:
+ keypoint_losses = self.keypoint_head.get_loss(
+ output_points, target_point, target_point_mask, target_pose)
+ losses.update(keypoint_losses)
+ keypoint_accuracy = get_keypoint_accuracy(output_points,
+ target_point)
+ losses.update(keypoint_accuracy)
+
+ if self.with_pose:
+ output_pose = output_pose * 180.0 / np.pi
+ output_pose = output_pose * target_pose_mask
+
+ pose_losses = self.pose_head.get_loss(output_pose, target_pose)
+ losses.update(pose_losses)
+ pose_accuracy = get_pose_accuracy(output_pose, target_pose)
+ losses.update(pose_accuracy)
+ return losses
+
+ def forward_test(self, img, **kwargs):
+ """Defines the computation performed at every call when testing."""
+
+ output = self.backbone(img)
+ if self.with_neck:
+ output = self.neck(output)
+ if self.with_keypoint:
+ output_points = self.keypoint_head(output)
+ if self.with_pose:
+ output_pose = self.pose_head(output)
+
+ ret = {}
+ ret['point'] = output_points
+ ret['pose'] = output_pose
+ return ret
diff --git a/easycv/models/face/head/__init__.py b/easycv/models/face/head/__init__.py
new file mode 100644
index 00000000..504755cb
--- /dev/null
+++ b/easycv/models/face/head/__init__.py
@@ -0,0 +1,2 @@
+from .face_keypoint_head import FaceKeypointHead
+from .face_keypoint_pose_head import FacePoseHead
diff --git a/easycv/models/face/head/face_keypoint_head.py b/easycv/models/face/head/face_keypoint_head.py
new file mode 100644
index 00000000..a75cfa8c
--- /dev/null
+++ b/easycv/models/face/head/face_keypoint_head.py
@@ -0,0 +1,68 @@
+import copy
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from easycv.models.builder import HEADS, build_loss
+from easycv.models.utils.face_keypoint_utils import (InvertedResidual, View,
+ conv_bn, conv_no_relu,
+ get_keypoint_accuracy)
+
+
+@HEADS.register_module
+class FaceKeypointHead(nn.Module):
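+ """Regression head for 106-point face landmarks.
+
+ The branch predicts a normalized offset that is scaled by 0.5, added to
+ ``mean_face`` and multiplied by ``input_size``, so the output is absolute
+ pixel coordinates; ``out_channels=212`` corresponds to 106 (x, y) pairs.
+ """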
+
+ def __init__(
+ self,
+ mean_face,
+ loss_keypoint,
+ in_channels=48,
+ out_channels=212,
+ input_size=96,
+ inverted_expand_ratio=2,
+ inverted_activation='half_v2',
+ ):
+ super(FaceKeypointHead, self).__init__()
+ self.input_size = input_size
+ self.face_mean_shape = copy.deepcopy(np.asarray(mean_face))
+ self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ self.branches = []
+
+ self.loss = build_loss(loss_keypoint)
+
+ # points
+ self.branches.append(
+ nn.Sequential(
+ InvertedResidual(
+ in_channels,
+ 96,
+ 3,
+ 1,
+ 1,
+ expand_ratio=inverted_expand_ratio,
+ activation=inverted_activation),
+ View((-1, 96 * 3 * 3, 1, 1)), conv_bn(96 * 3 * 3, 128, 1, 1,
+ 0),
+ conv_bn(128, 128, 1, 1, 0),
+ conv_no_relu(128, out_channels, 1, 1, 0),
+ View((-1, out_channels))))
+ self.branches = nn.ModuleList(self.branches)
+
+ def get_loss(self, output, target_point, target_point_mask, target_pose):
+ losses = dict()
+ loss = self.loss(output * target_point_mask, target_point, target_pose)
+ losses['point_loss'] = loss
+
+ return losses
+
+ def get_accuracy(self, output, target_point):
+ return get_keypoint_accuracy(output, target_point)
+
+ def forward(self, x):
+ point = self.branches[0](x)
+ # keep the mean shape on the same device as the prediction so the
+ # head also works when the model is not on the default CUDA device
+ point = point * 0.5 + torch.from_numpy(self.face_mean_shape).to(
+ point.device)
+ point = point * self.input_size
+
+ return point
diff --git a/easycv/models/face/head/face_keypoint_pose_head.py b/easycv/models/face/head/face_keypoint_pose_head.py
new file mode 100644
index 00000000..4adde695
--- /dev/null
+++ b/easycv/models/face/head/face_keypoint_pose_head.py
@@ -0,0 +1,55 @@
+import numpy as np
+import torch
+import torch.nn as nn
+
+from easycv.models.builder import HEADS, build_loss
+from easycv.models.utils.face_keypoint_utils import (InvertedResidual, View,
+ conv_bn, conv_no_relu,
+ get_pose_accuracy)
+
+
+@HEADS.register_module
+class FacePoseHead(nn.Module):
+
+ def __init__(
+ self,
+ loss_pose,
+ in_channels=48,
+ out_channels=3,
+ inverted_expand_ratio=2,
+ inverted_activation='half_v2',
+ ):
+ super(FacePoseHead, self).__init__()
+ self.branches = []
+
+ self.loss = build_loss(loss_pose)
+
+ # pose
+ self.branches.append(
+ nn.Sequential(
+ InvertedResidual(
+ in_channels,
+ 48,
+ 3,
+ 1,
+ 1,
+ expand_ratio=inverted_expand_ratio,
+ activation=inverted_activation),
+ View((-1, 48 * 3 * 3, 1, 1)), conv_bn(48 * 3 * 3, 48, 1, 1, 0),
+ conv_bn(48, 48, 1, 1, 0),
+ conv_no_relu(48, out_channels, 1, 1, 0),
+ View((-1, out_channels))))
+ self.branches = nn.ModuleList(self.branches)
+
+ def get_loss(self, output, target_pose):
+ losses = dict()
+ loss = self.loss(output, target_pose)
+ losses['pose_loss'] = loss
+
+ return losses
+
+ def get_accuracy(self, output, target_pose):
+ return get_pose_accuracy(output, target_pose)
+
+ def forward(self, x):
+ return self.branches[0](x)
diff --git a/easycv/models/loss/__init__.py b/easycv/models/loss/__init__.py
index cc05963b..6e3a6980 100644
--- a/easycv/models/loss/__init__.py
+++ b/easycv/models/loss/__init__.py
@@ -1,5 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .cross_entropy_loss import CrossEntropyLoss
+from .face_keypoint_loss import FacePoseLoss, WingLossWithPose
from .focal_loss import FocalLoss, VarifocalLoss
from .iou_loss import GIoULoss, IoULoss, YOLOX_IOULoss
from .mse_loss import JointsMSELoss
diff --git a/easycv/models/loss/face_keypoint_loss.py b/easycv/models/loss/face_keypoint_loss.py
new file mode 100644
index 00000000..8d4a80c5
--- /dev/null
+++ b/easycv/models/loss/face_keypoint_loss.py
@@ -0,0 +1,91 @@
+import copy
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from easycv.models.builder import LOSSES
+
+CONSTANT_CONTOUR = 66
+CONSTANT_EYEBROW = 18
+CONSTANT_EYE = 18
+CONSTANT_NOSE = 30
+CONSTANT_LIPS = 40
+CONSTANT_EYE_CENTER = 4
+
+
+@LOSSES.register_module()
+class WingLossWithPose(nn.Module):
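+ """Wing loss for face landmarks, weighted by head pose and facial part.
+
+ Each residual ``d = |target - pred|`` is scaled by a per-sample pose
+ weight ``1 + 5 * mean(1 - cos(pose))`` and a per-part weight, then the
+ wing loss is applied::
+
+ loss(d) = omega * log(1 + d / epsilon) if d < omega
+ loss(d) = d - C otherwise
+
+ where ``C = omega - omega * log(1 + omega / epsilon)`` keeps the two
+ branches continuous at ``d = omega``.
+ """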
+
+ def __init__(self,
+ num_points=106,
+ left_eye_left_corner_index=66,
+ right_eye_right_corner_index=79,
+ points_weight=1.0,
+ contour_weight=1.5,
+ eyebrow_weight=1.5,
+ eye_weight=1.7,
+ nose_weight=1.3,
+ lip_weight=1.7,
+ omega=10,
+ epsilon=2):
+ super(WingLossWithPose, self).__init__()
+ self.omega = omega
+ self.epsilon = epsilon
+
+ self.num_points = num_points
+ self.left_eye_left_corner_index = left_eye_left_corner_index
+ self.right_eye_right_corner_index = right_eye_right_corner_index
+ self.points_weight = points_weight
+ contour_weight = np.full(CONSTANT_CONTOUR, contour_weight)
+ eyebrow_left_weight = np.full(CONSTANT_EYEBROW, eyebrow_weight)
+ eyebrow_right_weight = np.full(CONSTANT_EYEBROW, eyebrow_weight)
+ nose_weight = np.full(CONSTANT_NOSE, nose_weight)
+ eye_left_weight = np.full(CONSTANT_EYE, eye_weight)
+ eye_right_weight = np.full(CONSTANT_EYE, eye_weight)
+ lips_weight = np.full(CONSTANT_LIPS, lip_weight)
+ eye_center_weight = np.full(CONSTANT_EYE_CENTER, eye_weight)
+ part_weight = np.concatenate(
+ (contour_weight, eyebrow_left_weight, eyebrow_right_weight,
+ nose_weight, eye_left_weight, eye_right_weight, lips_weight,
+ eye_center_weight),
+ axis=0)
+
+ self.part_weight = None
+ if part_weight is not None:
+ # keep part_weight on CPU here and move it to the prediction's
+ # device in forward, so the loss also works without CUDA
+ self.part_weight = torch.from_numpy(part_weight)
+
+ def forward(self, pred, target, pose):
+ weight = 5.0 * (1.0 - torch.cos(pose * np.pi / 180.0)) + 1.0
+ weight = torch.sum(weight, dim=1) / 3.0
+ weight = weight.view((weight.shape[0], 1))
+
+ if self.part_weight is not None:
+ weight = weight * self.part_weight.to(pred.device)
+
+ y = target
+ y_hat = pred
+ delta_y = (y - y_hat).abs() * weight
+ delta_y1 = delta_y[delta_y < self.omega]
+ delta_y2 = delta_y[delta_y >= self.omega]
+ loss1 = self.omega * torch.log(1 + delta_y1 / self.epsilon)
+ C = self.omega - self.omega * math.log(1 + self.omega / self.epsilon)
+ loss2 = delta_y2 - C
+ result = self.points_weight * (loss1.sum() + loss2.sum()) / (
+ len(loss1) + len(loss2))
+
+ return result
+
+
+@LOSSES.register_module()
+class FacePoseLoss(nn.Module):
+
+ def __init__(self, pose_weight=1.0):
+ super(FacePoseLoss, self).__init__()
+ self.criterion = nn.MSELoss()
+ self.pose_weight = pose_weight
+
+ def forward(self, pred, target):
+ result = self.pose_weight * self.criterion(pred, target)
+ return result
diff --git a/easycv/models/segmentation/heads/__init__.py b/easycv/models/segmentation/heads/__init__.py
index 776a56cc..aa32775c 100644
--- a/easycv/models/segmentation/heads/__init__.py
+++ b/easycv/models/segmentation/heads/__init__.py
@@ -1,6 +1,7 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .fcn_head import FCNHead
from .mask2former_head import Mask2FormerHead
+from .segformer_head import SegformerHead
from .uper_head import UPerHead
-__all__ = ['FCNHead', 'UPerHead', 'Mask2FormerHead']
+__all__ = ['FCNHead', 'UPerHead', 'Mask2FormerHead', 'SegformerHead']
diff --git a/easycv/models/segmentation/heads/segformer_head.py b/easycv/models/segmentation/heads/segformer_head.py
new file mode 100644
index 00000000..d0fbbe1a
--- /dev/null
+++ b/easycv/models/segmentation/heads/segformer_head.py
@@ -0,0 +1,146 @@
+# Modified from
+# https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/decode_heads/segformer_head.py
+#
+# This work is licensed under the NVIDIA Source Code License.
+#
+# Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
+# NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator
+# Augmentation (ADA)
+#
+# 1. Definitions
+# "Licensor" means any person or entity that distributes its Work.
+# "Software" means the original work of authorship made available under
+# this License.
+# "Work" means the Software and any additions to or derivative works of
+# the Software that are made available under this License.
+# The terms "reproduce," "reproduction," "derivative works," and
+# "distribution" have the meaning as provided under U.S. copyright law;
+# provided, however, that for the purposes of this License, derivative
+# works shall not include works that remain separable from, or merely
+# link (or bind by name) to the interfaces of, the Work.
+# Works, including the Software, are "made available" under this License
+# by including in or with the Work either (a) a copyright notice
+# referencing the applicability of this License to the Work, or (b) a
+# copy of this License.
+# 2. License Grants
+# 2.1 Copyright Grant. Subject to the terms and conditions of this
+# License, each Licensor grants to you a perpetual, worldwide,
+# non-exclusive, royalty-free, copyright license to reproduce,
+# prepare derivative works of, publicly display, publicly perform,
+# sublicense and distribute its Work and any resulting derivative
+# works in any form.
+# 3. Limitations
+# 3.1 Redistribution. You may reproduce or distribute the Work only
+# if (a) you do so under this License, (b) you include a complete
+# copy of this License with your distribution, and (c) you retain
+# without modification any copyright, patent, trademark, or
+# attribution notices that are present in the Work.
+# 3.2 Derivative Works. You may specify that additional or different
+# terms apply to the use, reproduction, and distribution of your
+# derivative works of the Work ("Your Terms") only if (a) Your Terms
+# provide that the use limitation in Section 3.3 applies to your
+# derivative works, and (b) you identify the specific derivative
+# works that are subject to Your Terms. Notwithstanding Your Terms,
+# this License (including the redistribution requirements in Section
+# 3.1) will continue to apply to the Work itself.
+# 3.3 Use Limitation. The Work and any derivative works thereof only
+# may be used or intended for use non-commercially. Notwithstanding
+# the foregoing, NVIDIA and its affiliates may use the Work and any
+# derivative works commercially. As used herein, "non-commercially"
+# means for research or evaluation purposes only.
+# 3.4 Patent Claims. If you bring or threaten to bring a patent claim
+# against any Licensor (including any claim, cross-claim or
+# counterclaim in a lawsuit) to enforce any patents that you allege
+# are infringed by any Work, then your rights under this License from
+# such Licensor (including the grant in Section 2.1) will terminate
+# immediately.
+# 3.5 Trademarks. This License does not grant any rights to use any
+# Licensor’s or its affiliates’ names, logos, or trademarks, except
+# as necessary to reproduce the notices described in this License.
+# 3.6 Termination. If you violate any term of this License, then your
+# rights under this License (including the grant in Section 2.1) will
+# terminate immediately.
+# 4. Disclaimer of Warranty.
+# THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
+# NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
+# THIS LICENSE.
+# 5. Limitation of Liability.
+# EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
+# THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
+# SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
+# INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
+# OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
+# (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
+# LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
+# COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGES.
+# Copyright (c) OpenMMLab. All rights reserved.
+# Adapt from https://github.com/open-mmlab/mmsegmentation/blob/2d66179630035097dcae08ee958f60d4b5a7fcae/mmseg/models/decode_heads/segformer_head.py
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+
+from easycv.models.builder import HEADS
+from easycv.models.segmentation.heads.base import BaseDecodeHead
+from easycv.models.utils.ops import resize_tensor as resize
+
+
+@HEADS.register_module()
+class SegformerHead(BaseDecodeHead):
+ """The all mlp Head of segformer.
+
+ This head is the implementation of
+ `Segformer ` _.
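+
+ Each backbone feature is projected to ``channels`` dims by a 1x1
+ ConvModule, resized to the resolution of the first (1/4) feature, then
+ concatenated and fused; with a stride-4/8/16/32 backbone the returned
+ logits have shape ``(B, num_classes, H / 4, W / 4)``.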
+
+ Args:
+ interpolate_mode: The interpolate mode of MLP head upsample operation.
+ Default: 'bilinear'.
+ """
+
+ def __init__(self, interpolate_mode='bilinear', **kwargs):
+ super().__init__(input_transform='multiple_select', **kwargs)
+
+ self.interpolate_mode = interpolate_mode
+ num_inputs = len(self.in_channels)
+
+ assert num_inputs == len(self.in_index)
+
+ self.convs = nn.ModuleList()
+ for i in range(num_inputs):
+ self.convs.append(
+ ConvModule(
+ in_channels=self.in_channels[i],
+ out_channels=self.channels,
+ kernel_size=1,
+ stride=1,
+ norm_cfg=self.norm_cfg,
+ act_cfg=self.act_cfg))
+
+ self.fusion_conv = ConvModule(
+ in_channels=self.channels * num_inputs,
+ out_channels=self.channels,
+ kernel_size=1,
+ norm_cfg=self.norm_cfg)
+
+ def forward(self, inputs):
+ # Receive 4 stage backbone feature map: 1/4, 1/8, 1/16, 1/32
+ inputs = self._transform_inputs(inputs)
+ outs = []
+ for idx in range(len(inputs)):
+ x = inputs[idx]
+ conv = self.convs[idx]
+ outs.append(
+ resize(
+ input=conv(x),
+ size=inputs[0].shape[2:],
+ mode=self.interpolate_mode,
+ align_corners=self.align_corners))
+
+ out = self.fusion_conv(torch.cat(outs, dim=1))
+
+ out = self.cls_seg(out)
+
+ return out
diff --git a/easycv/models/segmentation/utils/__init__.py b/easycv/models/segmentation/utils/__init__.py
index 45cdd5c3..189097af 100644
--- a/easycv/models/segmentation/utils/__init__.py
+++ b/easycv/models/segmentation/utils/__init__.py
@@ -1,2 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
+from .embed import PatchEmbed
from .matcher import MaskHungarianMatcher
+from .shape_convert import (nchw2nlc2nchw, nchw_to_nlc, nlc2nchw2nlc,
+ nlc_to_nchw)
diff --git a/easycv/models/segmentation/utils/embed.py b/easycv/models/segmentation/utils/embed.py
new file mode 100644
index 00000000..cec54054
--- /dev/null
+++ b/easycv/models/segmentation/utils/embed.py
@@ -0,0 +1,332 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Adapt from: https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/utils/embed.py
+
+import math
+from typing import Sequence
+
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmcv.runner.base_module import BaseModule
+from mmcv.utils import to_2tuple
+
+
+class AdaptivePadding(nn.Module):
+ """Applies padding to input (if needed) so that input can get fully covered
+ by filter you specified. It support two modes "same" and "corner". The
+ "same" mode is same with "SAME" padding mode in TensorFlow, pad zero around
+ input. The "corner" mode would pad zero to bottom right.
+
+ Args:
+ kernel_size (int | tuple): Size of the kernel. Default: 1.
+ stride (int | tuple): Stride of the filter. Default: 1.
+ dilation (int | tuple): Spacing between kernel elements.
+ Default: 1.
+ padding (str): Support "same" and "corner", "corner" mode
+ would pad zero to bottom right, and "same" mode would
+ pad zero around input. Default: "corner".
+ Example:
+ >>> kernel_size = 16
+ >>> stride = 16
+ >>> dilation = 1
+ >>> input = torch.rand(1, 1, 15, 17)
+ >>> adap_pad = AdaptivePadding(
+ >>> kernel_size=kernel_size,
+ >>> stride=stride,
+ >>> dilation=dilation,
+ >>> padding="corner")
+ >>> out = adap_pad(input)
+ >>> assert (out.shape[2], out.shape[3]) == (16, 32)
+ >>> input = torch.rand(1, 1, 16, 17)
+ >>> out = adap_pad(input)
+ >>> assert (out.shape[2], out.shape[3]) == (16, 32)
+ """
+
+ def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
+
+ super(AdaptivePadding, self).__init__()
+
+ assert padding in ('same', 'corner')
+
+ kernel_size = to_2tuple(kernel_size)
+ stride = to_2tuple(stride)
+ dilation = to_2tuple(dilation)
+
+ self.padding = padding
+ self.kernel_size = kernel_size
+ self.stride = stride
+ self.dilation = dilation
+
+ def get_pad_shape(self, input_shape):
+ input_h, input_w = input_shape
+ kernel_h, kernel_w = self.kernel_size
+ stride_h, stride_w = self.stride
+ output_h = math.ceil(input_h / stride_h)
+ output_w = math.ceil(input_w / stride_w)
+ pad_h = max((output_h - 1) * stride_h +
+ (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0)
+ pad_w = max((output_w - 1) * stride_w +
+ (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0)
+ return pad_h, pad_w
+
+ def forward(self, x):
+ pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
+ if pad_h > 0 or pad_w > 0:
+ if self.padding == 'corner':
+ x = F.pad(x, [0, pad_w, 0, pad_h])
+ elif self.padding == 'same':
+ x = F.pad(x, [
+ pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
+ pad_h - pad_h // 2
+ ])
+ return x
+
+
+class PatchEmbed(BaseModule):
+ """Image to Patch Embedding.
+
+ We use a conv layer to implement PatchEmbed.
+
+ Args:
+ in_channels (int): The num of input channels. Default: 3
+ embed_dims (int): The dimensions of embedding. Default: 768
+ conv_type (str): The type of convolution used for the embedding
+ conv layer. Default: "Conv2d".
+ kernel_size (int): The kernel_size of embedding conv. Default: 16.
+ stride (int, optional): The slide stride of embedding conv.
+ Default: None (Would be set as `kernel_size`).
+ padding (int | tuple | string ): The padding length of
+ embedding conv. When it is a string, it means the mode
+ of adaptive padding, support "same" and "corner" now.
+ Default: "corner".
+ dilation (int): The dilation rate of embedding conv. Default: 1.
+ bias (bool): Bias of embed conv. Default: True.
+ norm_cfg (dict, optional): Config dict for normalization layer.
+ Default: None.
+ input_size (int | tuple | None): The size of input, which will be
+ used to calculate the out size. Only work when `dynamic_size`
+ is False. Default: None.
+ init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization.
+ Default: None.
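+
+ Example (shape sketch):
+ >>> import torch
+ >>> patch_embed = PatchEmbed(
+ >>> embed_dims=64, kernel_size=7, stride=4, padding='corner')
+ >>> x, out_size = patch_embed(torch.rand(1, 3, 224, 224))
+ >>> assert out_size == (56, 56)
+ >>> assert tuple(x.shape) == (1, 56 * 56, 64)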
+ """
+
+ def __init__(self,
+ in_channels=3,
+ embed_dims=768,
+ conv_type='Conv2d',
+ kernel_size=16,
+ stride=None,
+ padding='corner',
+ dilation=1,
+ bias=True,
+ norm_cfg=None,
+ input_size=None,
+ init_cfg=None):
+ super(PatchEmbed, self).__init__(init_cfg=init_cfg)
+
+ self.embed_dims = embed_dims
+ if stride is None:
+ stride = kernel_size
+
+ kernel_size = to_2tuple(kernel_size)
+ stride = to_2tuple(stride)
+ dilation = to_2tuple(dilation)
+
+ if isinstance(padding, str):
+ self.adap_padding = AdaptivePadding(
+ kernel_size=kernel_size,
+ stride=stride,
+ dilation=dilation,
+ padding=padding)
+ # disable the padding of conv
+ padding = 0
+ else:
+ self.adap_padding = None
+ padding = to_2tuple(padding)
+
+ self.projection = build_conv_layer(
+ dict(type=conv_type),
+ in_channels=in_channels,
+ out_channels=embed_dims,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ bias=bias)
+
+ if norm_cfg is not None:
+ self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
+ else:
+ self.norm = None
+
+ if input_size:
+ input_size = to_2tuple(input_size)
+ # `init_out_size` would be used outside to
+ # calculate the num_patches
+ # when `use_abs_pos_embed` outside
+ self.init_input_size = input_size
+ if self.adap_padding:
+ pad_h, pad_w = self.adap_padding.get_pad_shape(input_size)
+ input_h, input_w = input_size
+ input_h = input_h + pad_h
+ input_w = input_w + pad_w
+ input_size = (input_h, input_w)
+
+ # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
+ h_out = (input_size[0] + 2 * padding[0] - dilation[0] *
+ (kernel_size[0] - 1) - 1) // stride[0] + 1
+ w_out = (input_size[1] + 2 * padding[1] - dilation[1] *
+ (kernel_size[1] - 1) - 1) // stride[1] + 1
+ self.init_out_size = (h_out, w_out)
+ else:
+ self.init_input_size = None
+ self.init_out_size = None
+
+ def forward(self, x):
+ """
+ Args:
+ x (Tensor): Has shape (B, C, H, W). In most case, C is 3.
+
+ Returns:
+ tuple: Contains merged results and its spatial shape.
+
+ - x (Tensor): Has shape (B, out_h * out_w, embed_dims)
+ - out_size (tuple[int]): Spatial shape of x, arrange as
+ (out_h, out_w).
+ """
+
+ if self.adap_padding:
+ x = self.adap_padding(x)
+
+ x = self.projection(x)
+ out_size = (x.shape[2], x.shape[3])
+ x = x.flatten(2).transpose(1, 2)
+ if self.norm is not None:
+ x = self.norm(x)
+ return x, out_size
+
+
+class PatchMerging(BaseModule):
+ """Merge patch feature map.
+
+ This layer groups the feature map by kernel_size, and applies norm and
+ linear layers to the grouped feature map. Our implementation uses
+ `nn.Unfold` to merge patches, which is about 25% faster than the original
+ implementation. However, pretrained models need to be modified for
+ compatibility.
+
+ Args:
+ in_channels (int): The num of input channels.
+ out_channels (int): The num of output channels.
+ kernel_size (int | tuple, optional): the kernel size in the unfold
+ layer. Defaults to 2.
+ stride (int | tuple, optional): the stride of the sliding blocks in the
+ unfold layer. Default: None. (Would be set as `kernel_size`)
+ padding (int | tuple | string ): The padding length of
+ embedding conv. When it is a string, it means the mode
+ of adaptive padding, support "same" and "corner" now.
+ Default: "corner".
+ dilation (int | tuple, optional): dilation parameter in the unfold
+ layer. Default: 1.
+ bias (bool, optional): Whether to add bias in linear layer or not.
+ Defaults: False.
+ norm_cfg (dict, optional): Config dict for normalization layer.
+ Default: dict(type='LN').
+ init_cfg (dict, optional): The extra config for initialization.
+ Default: None.
+ """
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size=2,
+ stride=None,
+ padding='corner',
+ dilation=1,
+ bias=False,
+ norm_cfg=dict(type='LN'),
+ init_cfg=None):
+ super().__init__(init_cfg=init_cfg)
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ if stride:
+ stride = stride
+ else:
+ stride = kernel_size
+
+ kernel_size = to_2tuple(kernel_size)
+ stride = to_2tuple(stride)
+ dilation = to_2tuple(dilation)
+
+ if isinstance(padding, str):
+ self.adap_padding = AdaptivePadding(
+ kernel_size=kernel_size,
+ stride=stride,
+ dilation=dilation,
+ padding=padding)
+ # disable the padding of unfold
+ padding = 0
+ else:
+ self.adap_padding = None
+
+ padding = to_2tuple(padding)
+ self.sampler = nn.Unfold(
+ kernel_size=kernel_size,
+ dilation=dilation,
+ padding=padding,
+ stride=stride)
+
+ sample_dim = kernel_size[0] * kernel_size[1] * in_channels
+
+ if norm_cfg is not None:
+ self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
+ else:
+ self.norm = None
+
+ self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
+
+ def forward(self, x, input_size):
+ """
+ Args:
+ x (Tensor): Has shape (B, H*W, C_in).
+ input_size (tuple[int]): The spatial shape of x, arrange as (H, W).
+ Default: None.
+
+ Returns:
+ tuple: Contains merged results and its spatial shape.
+
+ - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
+ - out_size (tuple[int]): Spatial shape of x, arrange as
+ (Merged_H, Merged_W).
+ """
+ B, L, C = x.shape
+ assert isinstance(input_size, Sequence), f'Expect ' \
+ f'input_size is ' \
+ f'`Sequence` ' \
+ f'but get {input_size}'
+
+ H, W = input_size
+ assert L == H * W, 'input feature has wrong size'
+
+ x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W
+ # Use nn.Unfold to merge patch. About 25% faster than original method,
+ # but need to modify pretrained model for compatibility
+
+ if self.adap_padding:
+ x = self.adap_padding(x)
+ H, W = x.shape[-2:]
+
+ x = self.sampler(x)
+ # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2)
+
+ out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *
+ (self.sampler.kernel_size[0] - 1) -
+ 1) // self.sampler.stride[0] + 1
+ out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *
+ (self.sampler.kernel_size[1] - 1) -
+ 1) // self.sampler.stride[1] + 1
+
+ output_size = (out_h, out_w)
+ x = x.transpose(1, 2) # B, H/2*W/2, 4*C
+ x = self.norm(x) if self.norm else x
+ x = self.reduction(x)
+ return x, output_size
diff --git a/easycv/models/segmentation/utils/shape_convert.py b/easycv/models/segmentation/utils/shape_convert.py
new file mode 100644
index 00000000..b9fe2fb0
--- /dev/null
+++ b/easycv/models/segmentation/utils/shape_convert.py
@@ -0,0 +1,110 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Adapt from: https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/utils/shape_convert.py
+
+
+def nlc_to_nchw(x, hw_shape):
+ """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor.
+
+ Args:
+ x (Tensor): The input tensor of shape [N, L, C] before conversion.
+ hw_shape (Sequence[int]): The height and width of output feature map.
+
+ Returns:
+ Tensor: The output tensor of shape [N, C, H, W] after conversion.
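+
+ Example:
+ >>> import torch
+ >>> x = torch.rand(2, 20, 16)
+ >>> nlc_to_nchw(x, (4, 5)).shape
+ torch.Size([2, 16, 4, 5])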
+ """
+ H, W = hw_shape
+ assert len(x.shape) == 3
+ B, L, C = x.shape
+ assert L == H * W, 'The seq_len doesn\'t match H, W'
+ return x.transpose(1, 2).reshape(B, C, H, W)
+
+
+def nchw_to_nlc(x):
+ """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor.
+
+ Args:
+ x (Tensor): The input tensor of shape [N, C, H, W] before conversion.
+
+ Returns:
+ Tensor: The output tensor of shape [N, L, C] after conversion.
+ """
+ assert len(x.shape) == 4
+ return x.flatten(2).transpose(1, 2).contiguous()
+
+
+def nchw2nlc2nchw(module, x, contiguous=False, **kwargs):
+ """Flatten [N, C, H, W] shape tensor `x` to [N, L, C] shape tensor. Use the
+ reshaped tensor as the input of `module`, and the convert the output of
+ `module`, whose shape is.
+
+ [N, L, C], to [N, C, H, W].
+
+ Args:
+ module (Callable): A callable object the takes a tensor
+ with shape [N, L, C] as input.
+ x (Tensor): The input tensor of shape [N, C, H, W].
+ contiguous:
+ contiguous (Bool): Whether to make the tensor contiguous
+ after each shape transform.
+
+ Returns:
+ Tensor: The output tensor of shape [N, C, H, W].
+
+ Example:
+ >>> import torch
+ >>> import torch.nn as nn
+ >>> norm = nn.LayerNorm(4)
+ >>> feature_map = torch.rand(4, 4, 5, 5)
+ >>> output = nchw2nlc2nchw(norm, feature_map)
+ """
+ B, C, H, W = x.shape
+ if not contiguous:
+ x = x.flatten(2).transpose(1, 2)
+ x = module(x, **kwargs)
+ x = x.transpose(1, 2).reshape(B, C, H, W)
+ else:
+ x = x.flatten(2).transpose(1, 2).contiguous()
+ x = module(x, **kwargs)
+ x = x.transpose(1, 2).reshape(B, C, H, W).contiguous()
+ return x
+
+
+def nlc2nchw2nlc(module, x, hw_shape, contiguous=False, **kwargs):
+ """Convert [N, L, C] shape tensor `x` to [N, C, H, W] shape tensor. Use the
+ reshaped tensor as the input of `module`, and convert the output of
+ `module`, whose shape is.
+
+ [N, C, H, W], to [N, L, C].
+
+ Args:
+ module (Callable): A callable object the takes a tensor
+ with shape [N, C, H, W] as input.
+ x (Tensor): The input tensor of shape [N, L, C].
+ hw_shape: (Sequence[int]): The height and width of the
+ feature map with shape [N, C, H, W].
+ contiguous (Bool): Whether to make the tensor contiguous
+ after each shape transform.
+
+ Returns:
+ Tensor: The output tensor of shape [N, L, C].
+
+ Example:
+ >>> import torch
+ >>> import torch.nn as nn
+ >>> conv = nn.Conv2d(16, 16, 3, 1, 1)
+ >>> feature_map = torch.rand(4, 25, 16)
+ >>> output = nlc2nchw2nlc(conv, feature_map, (5, 5))
+ """
+ H, W = hw_shape
+ assert len(x.shape) == 3
+ B, L, C = x.shape
+ assert L == H * W, 'The seq_len doesn\'t match H, W'
+ if not contiguous:
+ x = x.transpose(1, 2).reshape(B, C, H, W)
+ x = module(x, **kwargs)
+ x = x.flatten(2).transpose(1, 2)
+ else:
+ x = x.transpose(1, 2).reshape(B, C, H, W).contiguous()
+ x = module(x, **kwargs)
+ x = x.flatten(2).transpose(1, 2).contiguous()
+ return x
diff --git a/easycv/models/utils/__init__.py b/easycv/models/utils/__init__.py
index d45dc0a4..6509b8e0 100644
--- a/easycv/models/utils/__init__.py
+++ b/easycv/models/utils/__init__.py
@@ -5,6 +5,10 @@
from .dist_utils import (DistributedLossWrapper, DistributedMinerWrapper,
get_world_size, is_dist_avail_and_initialized,
reduce_mean)
+from .face_keypoint_utils import (ION, InvertedResidual, Residual, Softmax,
+ View, conv_bn, conv_no_relu,
+ get_keypoint_accuracy, get_pose_accuracy,
+ pose_accuracy)
from .gather_layer import GatherLayer
from .init_weights import _init_weights, trunc_normal_
from .multi_pooling import GeMPooling, MultiAvgPooling, MultiPooling
diff --git a/easycv/models/utils/face_keypoint_utils.py b/easycv/models/utils/face_keypoint_utils.py
new file mode 100644
index 00000000..c094afbc
--- /dev/null
+++ b/easycv/models/utils/face_keypoint_utils.py
@@ -0,0 +1,240 @@
+import copy
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+
+def conv_bn(inp, oup, kernel, stride, padding=1):
+ return nn.Sequential(
+ nn.Conv2d(inp, oup, kernel, stride, padding, bias=False),
+ nn.BatchNorm2d(oup), nn.PReLU(oup))
+
+
+def conv_no_relu(inp, oup, kernel, stride, padding=1):
+ return nn.Sequential(
+ nn.Conv2d(inp, oup, kernel, stride, padding, bias=False),
+ nn.BatchNorm2d(oup))
+
+
+class View(nn.Module):
+
+ def __init__(self, shape):
+ super(View, self).__init__()
+ self.shape = shape
+
+ def forward(self, x):
+ return x.view(*self.shape)
+
+
+class Softmax(nn.Module):
+
+ def __init__(self, dim):
+ super(Softmax, self).__init__()
+ self.softmax = nn.Softmax(dim)
+
+ def forward(self, x):
+ return self.softmax(x)
+
+
+class InvertedResidual(nn.Module):
+
+ def __init__(self,
+ inp,
+ oup,
+ kernel_size,
+ stride,
+ padding,
+ expand_ratio=2,
+ use_connect=False,
+ activation='relu'):
+ super(InvertedResidual, self).__init__()
+
+ hid_channels = int(inp * expand_ratio)
+ if activation == 'relu':
+ self.conv = nn.Sequential(
+ nn.Conv2d(inp, hid_channels, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(hid_channels), nn.ReLU(inplace=True),
+ nn.Conv2d(
+ hid_channels,
+ hid_channels,
+ kernel_size,
+ stride,
+ padding,
+ groups=hid_channels,
+ bias=False), nn.BatchNorm2d(hid_channels),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(hid_channels, oup, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(oup))
+ elif activation == 'prelu':
+ self.conv = nn.Sequential(
+ nn.Conv2d(inp, hid_channels, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(hid_channels), nn.PReLU(hid_channels),
+ nn.Conv2d(
+ hid_channels,
+ hid_channels,
+ kernel_size,
+ stride,
+ padding,
+ groups=hid_channels,
+ bias=False), nn.BatchNorm2d(hid_channels),
+ nn.PReLU(hid_channels),
+ nn.Conv2d(hid_channels, oup, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(oup))
+ elif activation == 'half_v1':
+ self.conv = nn.Sequential(
+ nn.Conv2d(inp, hid_channels, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(hid_channels), nn.ReLU(inplace=True),
+ nn.Conv2d(
+ hid_channels,
+ hid_channels,
+ kernel_size,
+ stride,
+ padding,
+ groups=hid_channels,
+ bias=False), nn.BatchNorm2d(hid_channels),
+ nn.PReLU(hid_channels),
+ nn.Conv2d(hid_channels, oup, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(oup))
+ elif activation == 'half_v2':
+ self.conv = nn.Sequential(
+ nn.Conv2d(inp, hid_channels, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(hid_channels), nn.PReLU(hid_channels),
+ nn.Conv2d(
+ hid_channels,
+ hid_channels,
+ kernel_size,
+ stride,
+ padding,
+ groups=hid_channels,
+ bias=False), nn.BatchNorm2d(hid_channels),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(hid_channels, oup, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(oup))
+ self.use_connect = use_connect
+
+ def forward(self, x):
+ if self.use_connect:
+ return x + self.conv(x)
+ else:
+ return self.conv(x)
+
+
+class Residual(nn.Module):
+
+ def __init__(self,
+ inp,
+ oup,
+ kernel_size,
+ stride,
+ padding,
+ use_connect=False,
+ activation='relu'):
+ super(Residual, self).__init__()
+
+ self.use_connect = use_connect
+
+ if activation == 'relu':
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ inp,
+ inp,
+ kernel_size,
+ stride,
+ padding,
+ groups=inp,
+ bias=False), nn.BatchNorm2d(inp), nn.ReLU(inplace=True),
+ nn.Conv2d(inp, oup, 1, 1, 0, bias=False), nn.BatchNorm2d(oup),
+ nn.ReLU(inplace=True))
+ elif activation == 'prelu':
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ inp,
+ inp,
+ kernel_size,
+ stride,
+ padding,
+ groups=inp,
+ bias=False), nn.BatchNorm2d(inp), nn.PReLU(inp),
+ nn.Conv2d(inp, oup, 1, 1, 0, bias=False), nn.BatchNorm2d(oup),
+ nn.PReLU(oup))
+ elif activation == 'half_v1':
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ inp,
+ inp,
+ kernel_size,
+ stride,
+ padding,
+ groups=inp,
+ bias=False), nn.BatchNorm2d(inp), nn.ReLU(inplace=True),
+ nn.Conv2d(inp, oup, 1, 1, 0, bias=False), nn.BatchNorm2d(oup),
+ nn.PReLU(oup))
+ elif activation == 'half_v2':
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ inp,
+ inp,
+ kernel_size,
+ stride,
+ padding,
+ groups=inp,
+ bias=False), nn.BatchNorm2d(inp), nn.PReLU(inp),
+ nn.Conv2d(inp, oup, 1, 1, 0, bias=False), nn.BatchNorm2d(oup),
+ nn.ReLU(inplace=True))
+
+ def forward(self, x):
+ if self.use_connect:
+ return x + self.conv(x)
+ else:
+ return self.conv(x)
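+
+ # Hedged usage sketch (shapes chosen for illustration; not a fixed API):
+ # >>> import torch
+ # >>> block = InvertedResidual(16, 16, kernel_size=3, stride=1, padding=1,
+ # ...                          expand_ratio=2, use_connect=True)
+ # >>> tail = Residual(16, 32, kernel_size=3, stride=1, padding=1)
+ # >>> out = tail(block(torch.rand(1, 16, 24, 24)))  # -> [1, 32, 24, 24]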
+
+
+def pose_accuracy(output, target):
+ with torch.no_grad():
+ output = output.detach().cpu().numpy()
+ target = target.detach().cpu().numpy()
+
+ acc = np.mean(np.abs(output - target))
+ return acc
+
+
+def ION(output, target, left_eye_left_corner_idx, right_eye_right_corner_idx,
+ num_pts):
+ with torch.no_grad():
+ output = output.view(-1, num_pts, 2).cpu().numpy()
+ target = target.view(-1, num_pts, 2).cpu().numpy()
+
+ interocular = (target[:, left_eye_left_corner_idx] -
+ target[:, right_eye_right_corner_idx])
+ interocular = np.sqrt(
+ np.square(interocular[:, 0]) + np.square(interocular[:, 1])) + 1e-5
+ dist = target - output
+ dist = np.sqrt(np.square(dist[:, :, 0]) + np.square(dist[:, :, 1]))
+ dist = np.sum(dist, axis=1)
+ nme = dist / (interocular * num_pts)
+
+ return np.mean(nme)
+
+
+def get_keypoint_accuracy(output, target_point):
+ accuracy = dict()
+ num_points = 106
+ left_eye_left_corner_index = 66
+ right_eye_right_corner_index = 79
+
+ nme = ION(output, target_point, left_eye_left_corner_index,
+ right_eye_right_corner_index, num_points)
+
+ accuracy['nme'] = nme
+
+ return accuracy
+
+
+def get_pose_accuracy(output, target_pose):
+ accuracy = dict()
+ pose_acc = pose_accuracy(output, target_pose)
+ accuracy['pose_acc'] = float(pose_acc)
+ return accuracy
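+
+
+ # Hedged usage sketch (random tensors for illustration only):
+ # >>> import torch
+ # >>> pred = torch.rand(4, 106 * 2)   # predicted keypoints, flattened (x, y)
+ # >>> gt = torch.rand(4, 106 * 2)     # ground-truth keypoints
+ # >>> get_keypoint_accuracy(pred, gt)['nme']        # inter-ocular NME
+ # >>> get_pose_accuracy(torch.rand(4, 3), torch.rand(4, 3))['pose_acc']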
diff --git a/easycv/predictors/__init__.py b/easycv/predictors/__init__.py
index 740ad180..3fe86936 100644
--- a/easycv/predictors/__init__.py
+++ b/easycv/predictors/__init__.py
@@ -1,9 +1,13 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .classifier import TorchClassifier
-from .detector import (TorchFaceDetector, TorchYoloXClassifierPredictor,
- TorchYoloXPredictor)
+from .detector import (DetectionPredictor, TorchFaceDetector,
+ TorchYoloXClassifierPredictor, TorchYoloXPredictor)
+from .face_keypoints_predictor import FaceKeypointsPredictor
from .feature_extractor import (TorchFaceAttrExtractor,
TorchFaceFeatureExtractor,
TorchFeatureExtractor)
+from .hand_keypoints_predictor import HandKeypointsPredictor
from .pose_predictor import (TorchPoseTopDownPredictor,
TorchPoseTopDownPredictorWithDetector)
+from .segmentation import (Mask2formerPredictor, SegFormerPredictor,
+ SegmentationPredictor)
diff --git a/easycv/predictors/base.py b/easycv/predictors/base.py
index b7143c94..49f3a728 100644
--- a/easycv/predictors/base.py
+++ b/easycv/predictors/base.py
@@ -1,17 +1,21 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
+import pickle
import numpy as np
import torch
+from mmcv.parallel import collate, scatter_kwargs
from PIL import Image
from torchvision.transforms import Compose
from easycv.datasets.registry import PIPELINES
from easycv.file import io
-from easycv.models import build_model
+from easycv.models.builder import build_model
from easycv.utils.checkpoint import load_checkpoint
from easycv.utils.config_tools import mmcv_config_fromfile
from easycv.utils.constant import CACHE_DIR
+from easycv.utils.mmlab_utils import (dynamic_adapt_for_mmlab,
+ remove_adapt_for_mmlab)
from easycv.utils.registry import build_from_cfg
@@ -91,3 +95,180 @@ def predict_batch(self, image_batch, **forward_kwargs):
output = self.model.forward(
image_batch.to(self.device), **forward_kwargs)
return output
+
+
+class PredictorV2(object):
+ """Base predict pipeline.
+ Args:
+ model_path (str): Path of model file.
+ config_file (Optional[str]): Config file path for model and processor to init. Defaults to None.
+ batch_size (int): Batch size for forward.
+ device (str): Supports 'cuda' or 'cpu'; if None, the device is detected automatically.
+ save_results (bool): Whether to save predict results.
+ save_path (str): File path for saving results; only valid when `save_results` is True.
+ mode (str): Channel order of the loaded images, 'rgb' or 'bgr'. Defaults to 'rgb'.
+ """
+
+ def __init__(self,
+ model_path,
+ config_file=None,
+ batch_size=1,
+ device=None,
+ save_results=False,
+ save_path=None,
+ mode='rgb',
+ *args,
+ **kwargs):
+ self.model_path = model_path
+ self.batch_size = batch_size
+ self.save_results = save_results
+ self.save_path = save_path
+ if self.save_results:
+ assert self.save_path is not None
+ self.device = device
+ if self.device is None:
+ self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ self.cfg = None
+ if config_file is not None:
+ if isinstance(config_file, str):
+ self.cfg = mmcv_config_fromfile(config_file)
+ else:
+ self.cfg = config_file
+
+ self.model = self.prepare_model()
+ self.processor = self.build_processor()
+ self._load_op = None
+ self.mode = mode
+
+ def prepare_model(self):
+ """Build model from config file by default.
+ If the model is not built from a configuration file, e.g. a torch jit model, reimplement this method.
+ """
+ model = self._build_model()
+ model.to(self.device)
+ model.eval()
+ load_checkpoint(model, self.model_path, map_location='cpu')
+ return model
+
+ def _build_model(self):
+ if self.cfg is None:
+ raise ValueError('Please provide "config_file"!')
+ # Use mmdet model
+ dynamic_adapt_for_mmlab(self.cfg)
+ model = build_model(self.cfg.model)
+ # remove adapt for mmdet to avoid conflict using mmdet models
+ remove_adapt_for_mmlab(self.cfg)
+ return model
+
+ def build_processor(self):
+ """Build processor to process loaded input.
+ If you need custom preprocessing ops, you need to reimplement it.
+ """
+ if self.cfg is None:
+ pipeline = []
+ else:
+ pipeline = [
+ build_from_cfg(p, PIPELINES)
+ for p in self.cfg.get('test_pipeline', [])
+ ]
+
+ from easycv.datasets.shared.pipelines.transforms import Compose
+ processor = Compose(pipeline)
+ return processor
+
+ def _load_input(self, input):
+ """Load image from file or numpy or PIL object.
+ Args:
+ input: File path or numpy or PIL object.
+ Returns:
+ {
+ 'filename': filename,
+ 'img': img,
+ 'img_shape': img_shape,
+ 'img_fields': ['img']
+ }
+ """
+ if self._load_op is None:
+ load_cfg = dict(type='LoadImage', mode=self.mode)
+ self._load_op = build_from_cfg(load_cfg, PIPELINES)
+
+ if not isinstance(input, str):
+ sample = self._load_op({'img': input})
+ else:
+ sample = self._load_op({'filename': input})
+
+ return sample
+
+ def preprocess_single(self, input):
+ """Preprocess single input sample.
+ If you need custom ops to load or process a single input sample, you need to reimplement it.
+ """
+ input = self._load_input(input)
+ return self.processor(input)
+
+ def preprocess(self, inputs, *args, **kwargs):
+ """Process all inputs list. And collate to batch and put to target device.
+ If you need custom ops to load or process a batch samples, you need to reimplement it.
+ """
+ batch_outputs = []
+ for i in inputs:
+ batch_outputs.append(self.preprocess_single(i, *args, **kwargs))
+
+ batch_outputs = self._collate_fn(batch_outputs)
+ batch_outputs = self._to_device(batch_outputs)
+
+ return batch_outputs
+
+ def forward(self, inputs):
+ """Model forward.
+ If you need to refactor the model forward pass, reimplement this method.
+ """
+ with torch.no_grad():
+ outputs = self.model(**inputs, mode='test')
+ return outputs
+
+ def postprocess(self, inputs, *args, **kwargs):
+ """Process model outputs.
+ If you need to add processing ops for the model outputs, reimplement this method.
+ """
+ return inputs
+
+ def _collate_fn(self, inputs):
+ """Prepare the input just before the forward function.
+ Puts each data field into a tensor with outer dimension batch size.
+ """
+ return collate(inputs, samples_per_gpu=self.batch_size)
+
+ def _to_device(self, inputs):
+ target_gpus = [-1] if self.device == 'cpu' else [
+ torch.cuda.current_device()
+ ]
+ _, kwargs = scatter_kwargs(None, inputs, target_gpus=target_gpus)
+ return kwargs[0]
+
+ @staticmethod
+ def dump(obj, save_path, mode='wb'):
+ with open(save_path, mode) as f:
+ f.write(pickle.dumps(obj))
+
+ def __call__(self, inputs, keep_inputs=False):
+ # TODO: fault tolerance
+
+ if isinstance(inputs, str):
+ inputs = [inputs]
+
+ results_list = []
+ for i in range(0, len(inputs), self.batch_size):
+ batch = inputs[i:min(len(inputs), i + self.batch_size)]
+ batch_outputs = self.preprocess(batch)
+ batch_outputs = self.forward(batch_outputs)
+ results = self.postprocess(batch_outputs)
+ if keep_inputs:
+ results = {'inputs': batch, 'results': results}
+ # if results are dumped to file, they are not added to the return value to avoid taking up too much memory
+ if self.save_results:
+ self.dump([results], self.save_path, mode='ab+')
+ else:
+ results_list.append(results)
+
+ return results_list
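+
+
+ # Hedged sketch of extending PredictorV2 (the class name, config path and
+ # postprocess logic below are placeholders, not part of this repo):
+ #
+ # class MyPredictor(PredictorV2):
+ #
+ #     def postprocess(self, inputs, *args, **kwargs):
+ #         # keep only the fields the caller cares about
+ #         return {'preds': inputs}
+ #
+ # predictor = MyPredictor('my_model.pth', config_file='my_config.py',
+ #                         batch_size=2)
+ # results = predictor(['1.jpg', '2.jpg', '3.jpg'])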
diff --git a/easycv/predictors/builder.py b/easycv/predictors/builder.py
index f387471c..ac73bc52 100644
--- a/easycv/predictors/builder.py
+++ b/easycv/predictors/builder.py
@@ -4,5 +4,5 @@
PREDICTORS = Registry('predictor')
-def build_predictor(cfg):
- return build_from_cfg(cfg, PREDICTORS, default_args=None)
+def build_predictor(cfg, default_args=None):
+ return build_from_cfg(cfg, PREDICTORS, default_args=default_args)
diff --git a/easycv/predictors/detector.py b/easycv/predictors/detector.py
index 72796741..f9d05992 100644
--- a/easycv/predictors/detector.py
+++ b/easycv/predictors/detector.py
@@ -22,8 +22,10 @@
from easycv.utils.config_tools import mmcv_config_fromfile
from easycv.utils.constant import CACHE_DIR
from easycv.utils.logger import get_root_logger
-from easycv.utils.mmlab_utils import dynamic_adapt_for_mmlab
+from easycv.utils.mmlab_utils import (dynamic_adapt_for_mmlab,
+ remove_adapt_for_mmlab)
from easycv.utils.registry import build_from_cfg
+from .base import PredictorV2
from .builder import PREDICTORS
from .classifier import TorchClassifier
@@ -39,6 +41,44 @@
@PREDICTORS.register_module()
+class DetectionPredictor(PredictorV2):
+ """Generic Detection Predictor, it will filter bbox results by ``score_threshold`` .
+ """
+
+ def __init__(self,
+ model_path=None,
+ config_file=None,
+ batch_size=1,
+ device=None,
+ save_results=False,
+ save_path=None,
+ mode='rgb',
+ score_threshold=0.5):
+ super(DetectionPredictor, self).__init__(
+ model_path,
+ config_file=config_file,
+ batch_size=batch_size,
+ device=device,
+ save_results=save_results,
+ save_path=save_path,
+ mode=mode,
+ )
+ self.score_thresh = score_threshold
+
+ def postprocess(self, inputs, *args, **kwargs):
+ # iterate over the actual number of samples; the last batch may be
+ # smaller than `self.batch_size`
+ for batch_index in range(len(inputs['detection_scores'])):
+ this_detection_scores = inputs['detection_scores'][batch_index]
+ sel_ids = this_detection_scores > self.score_thresh
+ inputs['detection_scores'][batch_index] = inputs[
+ 'detection_scores'][batch_index][sel_ids]
+ inputs['detection_boxes'][batch_index] = inputs['detection_boxes'][
+ batch_index][sel_ids]
+ inputs['detection_classes'][batch_index] = inputs[
+ 'detection_classes'][batch_index][sel_ids]
+ # TODO class label remapping
+ return inputs
+
+
class DetrPredictor(PredictorInterface):
"""Inference image(s) with the detector.
Args:
diff --git a/easycv/predictors/face_keypoints_predictor.py b/easycv/predictors/face_keypoints_predictor.py
new file mode 100644
index 00000000..2c94f0a4
--- /dev/null
+++ b/easycv/predictors/face_keypoints_predictor.py
@@ -0,0 +1,121 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import cv2
+
+from easycv.predictors.builder import PREDICTORS
+from .base import PredictorV2
+
+face_contour_point_index = [
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+ 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+]
+left_eye_brow_point_index = [33, 34, 35, 36, 37, 38, 39, 40, 41, 33]
+right_eye_brow_point_index = [42, 43, 44, 45, 46, 47, 48, 49, 50, 42]
+left_eye_point_index = [66, 67, 68, 69, 70, 71, 72, 73, 66]
+right_eye_point_index = [75, 76, 77, 78, 79, 80, 81, 82, 75]
+nose_bridge_point_index = [51, 52, 53, 54]
+nose_contour_point_index = [55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65]
+mouth_outer_point_index = [84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 84]
+mouth_inter_point_index = [96, 97, 98, 99, 100, 101, 102, 103, 96]
+
+
+@PREDICTORS.register_module()
+class FaceKeypointsPredictor(PredictorV2):
+ """Predict pipeline for face keypoint
+ Args:
+ model_path (str): Path of model path
+ config_file (str): config file path for model and processor to init. Defaults to None.
+ """
+
+ def __init__(self,
+ model_path,
+ config_file,
+ batch_size=1,
+ device=None,
+ save_results=False,
+ save_path=None,
+ mode='bgr'):
+ super(FaceKeypointsPredictor, self).__init__(
+ model_path,
+ config_file,
+ batch_size=batch_size,
+ device=device,
+ save_results=save_results,
+ save_path=save_path,
+ mode=mode)
+
+ self.input_size = self.cfg.IMAGE_SIZE
+ self.point_number = self.cfg.POINT_NUMBER
+
+ def preprocess(self, inputs, *args, **kwargs):
+ batch_outputs = super().preprocess(inputs, *args, **kwargs)
+ self.img_metas = batch_outputs['img_metas']
+ return batch_outputs
+
+ def postprocess(self, inputs, *args, **kwargs):
+ results = []
+
+ points = inputs['point'].cpu().numpy()
+ poses = inputs['pose'].cpu().numpy()
+
+ for idx, point in enumerate(points):
+ h, w, c = self.img_metas[idx]['img_shape']
+ scale_h = h / self.input_size
+ scale_w = w / self.input_size
+
+ point = point.reshape((self.point_number, 2))
+ for index in range(len(point)):
+ point[index][0] *= scale_w
+ point[index][1] *= scale_h
+
+ results.append({'point': point, 'pose': poses[idx]})
+
+ return results
+
+ def show_result(self, img, points, scale=4.0, save_path=None):
+ """Draw `result` over `img`.
+
+ Args:
+ img ( ndarray ): The image to be displayed.
+ result (list): The face keypoints to draw over `img`.
+ scale: zoom in or out scale
+ save_path: path to save drawned 'img'
+ Returns:
+ img (ndarray): Only if not `show` or `out_file`
+ """
+
+ image = cv2.resize(img, dsize=None, fx=scale, fy=scale)
+
+ def draw_line(point_index, image, point):
+ for i in range(len(point_index) - 1):
+ cur_index = point_index[i]
+ next_index = point_index[i + 1]
+ cur_pt = (int(point[cur_index][0] * scale),
+ int(point[cur_index][1] * scale))
+ next_pt = (int(point[next_index][0] * scale),
+ int(point[next_index][1] * scale))
+ cv2.line(image, cur_pt, next_pt, (0, 0, 255), thickness=2)
+
+ draw_line(face_contour_point_index, image, points)
+ draw_line(left_eye_brow_point_index, image, points)
+ draw_line(right_eye_brow_point_index, image, points)
+ draw_line(left_eye_point_index, image, points)
+ draw_line(right_eye_point_index, image, points)
+ draw_line(nose_bridge_point_index, image, points)
+ draw_line(nose_contour_point_index, image, points)
+ draw_line(mouth_outer_point_index, image, points)
+ draw_line(mouth_inter_point_index, image, points)
+
+ size = len(points)
+ for i in range(size):
+ x = int(points[i][0])
+ y = int(points[i][1])
+ cv2.putText(image, str(i), (int(x * scale), int(y * scale)),
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
+ cv2.circle(image, (int(x * scale), int(y * scale)), 2, (0, 255, 0),
+ cv2.FILLED)
+
+ if save_path is not None:
+ cv2.imwrite(save_path, image)
+
+ return image
diff --git a/easycv/predictors/hand_keypoints_predictor.py b/easycv/predictors/hand_keypoints_predictor.py
new file mode 100644
index 00000000..01d0b0ce
--- /dev/null
+++ b/easycv/predictors/hand_keypoints_predictor.py
@@ -0,0 +1,221 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import cv2
+import mmcv
+import numpy as np
+
+from easycv.predictors.builder import PREDICTORS, build_predictor
+from ..datasets.pose.data_sources.hand.coco_hand import \
+ COCO_WHOLEBODY_HAND_DATASET_INFO
+from ..datasets.pose.data_sources.top_down import DatasetInfo
+from .base import PredictorV2
+from .pose_predictor import _box2cs
+
+HAND_SKELETON = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7],
+ [7, 8], [9, 10], [10, 11], [11, 12], [13, 14], [14, 15],
+ [15, 16], [0, 17], [17, 18], [18, 19], [19, 20], [5, 9],
+ [9, 13], [13, 17]]
+
+
+@PREDICTORS.register_module()
+class HandKeypointsPredictor(PredictorV2):
+ """HandKeypointsPredictor
+
+ Attributes:
+ model_path: path of keypoint model
+ config_file: path or ``Config`` of config file
+ detection_model_config: dict of hand detection model predictor config,
+ example like ``dict(type="", model_path="", config_file="", ......)``
+ batch_size: batch_size to infer
+ save_results: bool
+ save_path: path of result image
+ """
+
+ def __init__(self,
+ model_path,
+ config_file=None,
+ detection_predictor_config=None,
+ batch_size=1,
+ device=None,
+ save_results=False,
+ save_path=None,
+ mode='rgb',
+ *args,
+ **kwargs):
+ super(HandKeypointsPredictor, self).__init__(
+ model_path,
+ config_file=config_file,
+ batch_size=batch_size,
+ device=device,
+ save_results=save_results,
+ save_path=save_path,
+ mode=mode,
+ *args,
+ **kwargs)
+ self.dataset_info = DatasetInfo(COCO_WHOLEBODY_HAND_DATASET_INFO)
+ assert detection_predictor_config is not None, f"{self.__class__.__name__} need 'detection_predictor_config' " \
+ f'property to build hand detection model'
+ self.detection_predictor = build_predictor(detection_predictor_config)
+
+ def _load_input(self, input):
+ """ load img and convert detection result to topdown style
+
+ Args:
+ input (dict):
+ {
+ "inputs": image path,
+ "results": {
+ "detection_boxes": B*ndarray(N*4)
+ "detection_scores": B*ndarray(N,)
+ "detection_classes": B*ndarray(N,)
+ }
+ }
+ """
+ image_paths = input['inputs']
+ batch_data = []
+ box_id = 0
+ for batch_index, image_path in enumerate(image_paths):
+ det_bbox_result = input['results']['detection_boxes'][batch_index]
+ det_bbox_scores = input['results']['detection_scores'][batch_index]
+ img = mmcv.imread(image_path, 'color', self.mode)
+ for bbox, score in zip(det_bbox_result, det_bbox_scores):
+ center, scale = _box2cs(self.cfg.data_cfg['image_size'], bbox)
+ # prepare data
+ data = {
+ 'image_file':
+ image_path,
+ 'img':
+ img,
+ 'image_id':
+ batch_index,
+ 'center':
+ center,
+ 'scale':
+ scale,
+ 'bbox_score':
+ score,
+ 'bbox_id':
+ box_id, # need to be assigned if batch_size > 1
+ 'dataset':
+ 'coco_wholebody_hand',
+ 'joints_3d':
+ np.zeros((self.cfg.data_cfg.num_joints, 3),
+ dtype=np.float32),
+ 'joints_3d_visible':
+ np.zeros((self.cfg.data_cfg.num_joints, 3),
+ dtype=np.float32),
+ 'rotation':
+ 0,
+ 'flip_pairs':
+ self.dataset_info.flip_pairs,
+ 'ann_info': {
+ 'image_size':
+ np.array(self.cfg.data_cfg['image_size']),
+ 'num_joints': self.cfg.data_cfg['num_joints']
+ }
+ }
+ batch_data.append(data)
+ box_id += 1
+ return batch_data
+
+ def preprocess_single(self, input):
+ results = []
+ outputs = self._load_input(input)
+ for output in outputs:
+ results.append(self.processor(output))
+ return results
+
+ def preprocess(self, inputs, *args, **kwargs):
+ """Process all inputs list. And collate to batch and put to target device.
+ If you need custom ops to load or process a batch samples, you need to reimplement it.
+ """
+ batch_outputs = []
+ for i in inputs:
+ for res in self.preprocess_single(i, *args, **kwargs):
+ batch_outputs.append(res)
+ batch_outputs = self._collate_fn(batch_outputs)
+ batch_outputs = self._to_device(batch_outputs)
+ return batch_outputs
+
+ def postprocess(self, inputs, *args, **kwargs):
+ output = {}
+ output['keypoints'] = inputs['preds']
+ output['boxes'] = inputs['boxes']
+ for i, bbox in enumerate(output['boxes']):
+ center, scale = bbox[:2], bbox[2:4]
+ output['boxes'][i][:4] = bbox_cs2xyxy(center, scale)
+ output['boxes'] = output['boxes'][:, :4]
+ return output
+
+ def __call__(self, inputs, keep_inputs=False):
+ if isinstance(inputs, str):
+ inputs = [inputs]
+
+ results_list = []
+ for i in range(0, len(inputs), self.batch_size):
+ batch = inputs[i:min(len(inputs), i + self.batch_size)]
+ # hand det and return source image
+ det_results = self.detection_predictor(batch, keep_inputs=True)
+ # hand keypoints
+ batch_outputs = self.preprocess(det_results)
+ batch_outputs = self.forward(batch_outputs)
+ results = self.postprocess(batch_outputs)
+ if keep_inputs:
+ results = {'inputs': batch, 'results': results}
+ # if results are dumped to file, they are not added to the return value to avoid taking up too much memory
+ if self.save_results:
+ self.dump([results], self.save_path, mode='ab+')
+ else:
+ results_list.append(results)
+
+ return results_list
+
+ def show_result(self,
+ image_path,
+ keypoints,
+ boxes=None,
+ scale=4,
+ save_path=None):
+ """Draw `result` over `img`.
+
+ Args:
+ image_path (str): filepath of img
+ keypoints (ndarray): N*21*3
+ """
+ point_color = [120, 225, 240]
+ sk_color = [0, 255, 0]
+ img = mmcv.imread(image_path)
+ img = img.copy()
+ img_h, img_w = img.shape[:2]
+
+ for kpts in keypoints:
+ # point
+ for kid, (x, y, s) in enumerate(kpts):
+ cv2.circle(img, (int(x), int(y)), scale, point_color, -1)
+ # skeleton
+ for sk_id, sk in enumerate(HAND_SKELETON):
+ pos1 = (int(kpts[sk[0], 0]), int(kpts[sk[0], 1]))
+ pos2 = (int(kpts[sk[1], 0]), int(kpts[sk[1], 1]))
+
+ if (pos1[0] <= 0 or pos1[0] >= img_w or pos1[1] <= 0
+ or pos1[1] >= img_h or pos2[0] <= 0 or pos2[0] >= img_w
+ or pos2[1] <= 0 or pos2[1] >= img_h):
+ # skip the link that should not be drawn
+ continue
+ cv2.line(img, pos1, pos2, sk_color, thickness=1)
+
+ if boxes is not None:
+ bboxes = np.vstack(boxes)
+ mmcv.imshow_bboxes(
+ img, bboxes, colors='green', top_k=-1, thickness=2, show=False)
+
+ if save_path is not None:
+ mmcv.imwrite(img, save_path)
+ return img
+
+
+def bbox_cs2xyxy(center, scale, padding=1., pixel_std=200.):
+ wh = scale * 0.8 / padding * pixel_std
+ xy = center - 0.5 * wh
+ x1, y1 = xy
+ w, h = wh
+ return np.r_[x1, y1, x1 + w, y1 + h]
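+
+
+ # Worked example (illustrative): with center=(100, 100), scale=(1.0, 1.0) and
+ # the defaults padding=1.0, pixel_std=200, wh = 1.0 * 0.8 * 200 = 160, so the
+ # recovered box is [20, 20, 180, 180] in xyxy format.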
diff --git a/easycv/predictors/segmentation.py b/easycv/predictors/segmentation.py
index 08f7dc0b..6916817b 100644
--- a/easycv/predictors/segmentation.py
+++ b/easycv/predictors/segmentation.py
@@ -1,5 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import cv2
+import mmcv
import numpy as np
import torch
from matplotlib.collections import PatchCollection
@@ -13,7 +14,112 @@
from easycv.predictors.builder import PREDICTORS
from easycv.predictors.interface import PredictorInterface
from easycv.utils.checkpoint import load_checkpoint
+from easycv.utils.config_tools import mmcv_config_fromfile
from easycv.utils.registry import build_from_cfg
+from .base import PredictorV2
+
+
+@PREDICTORS.register_module()
+class SegmentationPredictor(PredictorV2):
+
+ def __init__(self,
+ model_path,
+ config_file,
+ batch_size=1,
+ device=None,
+ save_results=False,
+ save_path=None):
+ """Predict pipeline for Segmentation
+
+ Args:
+ model_path (str): Path of model path
+ config_file (str): config file path for model and processor to init. Defaults to None.
+ """
+ super(SegmentationPredictor, self).__init__(
+ model_path,
+ config_file,
+ batch_size=batch_size,
+ device=device,
+ save_results=save_results,
+ save_path=save_path)
+
+ self.CLASSES = self.cfg.CLASSES
+ self.PALETTE = self.cfg.PALETTE
+
+ def show_result(self,
+ img,
+ result,
+ palette=None,
+ win_name='',
+ show=False,
+ wait_time=0,
+ out_file=None,
+ opacity=0.5):
+ """Draw `result` over `img`.
+
+ Args:
+ img (str or Tensor): The image to be displayed.
+ result (Tensor): The semantic segmentation results to draw over
+ `img`.
+ palette (list[list[int]]] | np.ndarray | None): The palette of
+ segmentation map. If None is given, random palette will be
+ generated. Default: None
+ win_name (str): The window name.
+ wait_time (int): Value of waitKey param.
+ Default: 0.
+ show (bool): Whether to show the image.
+ Default: False.
+ out_file (str or None): The filename to write the image.
+ Default: None.
+ opacity(float): Opacity of painted segmentation map.
+ Default 0.5.
+ Must be in (0, 1] range.
+ Returns:
+ img (Tensor): Only if not `show` or `out_file`
+ """
+
+ img = mmcv.imread(img)
+ img = img.copy()
+ seg = result[0]
+ if palette is None:
+ if self.PALETTE is None:
+ # Get random state before set seed,
+ # and restore random state later.
+ # It will prevent loss of randomness, as the palette
+ # may be different in each iteration if not specified.
+ # See: https://github.com/open-mmlab/mmdetection/issues/5844
+ state = np.random.get_state()
+ np.random.seed(42)
+ # random palette
+ palette = np.random.randint(
+ 0, 255, size=(len(self.CLASSES), 3))
+ np.random.set_state(state)
+ else:
+ palette = self.PALETTE
+ palette = np.array(palette)
+ assert palette.shape[0] == len(self.CLASSES)
+ assert palette.shape[1] == 3
+ assert len(palette.shape) == 2
+ assert 0 < opacity <= 1.0
+ color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
+ for label, color in enumerate(palette):
+ color_seg[seg == label, :] = color
+ # convert to BGR
+ color_seg = color_seg[..., ::-1]
+
+ img = img * (1 - opacity) + color_seg * opacity
+ img = img.astype(np.uint8)
+ # if out_file specified, do not show image in window
+ if out_file is not None:
+ show = False
+
+ if show:
+ mmcv.imshow(img, win_name, wait_time)
+ if out_file is not None:
+ mmcv.imwrite(img, out_file)
+
+ if not (show or out_file):
+ return img
@PREDICTORS.register_module()
@@ -108,6 +214,147 @@ def show_instance(self, img, segms, bboxes, scores, labels, score_thr=0.5):
return instance_result
+@PREDICTORS.register_module()
+class SegFormerPredictor(PredictorInterface):
+
+ def __init__(self, model_path, model_config):
+ """init model
+
+ Args:
+ model_path (str): Path of model path
+ model_config (config): config string for model to init. Defaults to None.
+ """
+ self.model_path = model_path
+
+ self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ self.model = None
+ with io.open(self.model_path, 'rb') as infile:
+ checkpoint = torch.load(infile, map_location='cpu')
+
+ self.cfg = mmcv_config_fromfile(model_config)
+ self.CLASSES = self.cfg.CLASSES
+ self.PALETTE = self.cfg.PALETTE
+ # build model
+ self.model = build_model(self.cfg.model)
+
+ self.ckpt = load_checkpoint(
+ self.model, self.model_path, map_location=self.device)
+ self.model.to(self.device)
+ self.model.eval()
+
+ # build pipeline
+ test_pipeline = self.cfg.test_pipeline
+ pipeline = [build_from_cfg(p, PIPELINES) for p in test_pipeline]
+ self.pipeline = Compose(pipeline)
+
+ def predict(self, input_data_list):
+ """
+ using session run predict a number of samples using batch_size
+
+ Args:
+ input_data_list: a list of numpy array(in rgb order), each array is a sample
+ to be predicted
+ use a fixed number if you do not want to adjust batch_size in runtime
+ """
+ output_list = []
+ for idx, img in enumerate(input_data_list):
+ if type(img) is not np.ndarray:
+ img = np.asarray(img)
+
+ ori_img_shape = img.shape[:2]
+
+ data_dict = {'img': img}
+ data_dict['ori_shape'] = ori_img_shape
+ data_dict = self.pipeline(data_dict)
+ img = data_dict['img']
+ img = torch.unsqueeze(img[0], 0).to(self.device)
+ data_dict.pop('img')
+
+ with torch.no_grad():
+ out = self.model([img],
+ mode='test',
+ img_metas=[[data_dict['img_metas'][0]._data]])
+
+ output_list.append(out)
+
+ return output_list
+
+ def show_result(self,
+ img,
+ result,
+ palette=None,
+ win_name='',
+ show=False,
+ wait_time=0,
+ out_file=None,
+ opacity=0.5):
+ """Draw `result` over `img`.
+
+ Args:
+ img (str or Tensor): The image to be displayed.
+ result (Tensor): The semantic segmentation results to draw over
+ `img`.
+ palette (list[list[int]]] | np.ndarray | None): The palette of
+ segmentation map. If None is given, random palette will be
+ generated. Default: None
+ win_name (str): The window name.
+ wait_time (int): Value of waitKey param.
+ Default: 0.
+ show (bool): Whether to show the image.
+ Default: False.
+ out_file (str or None): The filename to write the image.
+ Default: None.
+ opacity(float): Opacity of painted segmentation map.
+ Default 0.5.
+ Must be in (0, 1] range.
+ Returns:
+ img (Tensor): Only if not `show` or `out_file`
+ """
+
+ img = mmcv.imread(img)
+ img = img.copy()
+ seg = result[0]
+ if palette is None:
+ if self.PALETTE is None:
+ # Get random state before set seed,
+ # and restore random state later.
+ # It will prevent loss of randomness, as the palette
+ # may be different in each iteration if not specified.
+ # See: https://github.com/open-mmlab/mmdetection/issues/5844
+ state = np.random.get_state()
+ np.random.seed(42)
+ # random palette
+ palette = np.random.randint(
+ 0, 255, size=(len(self.CLASSES), 3))
+ np.random.set_state(state)
+ else:
+ palette = self.PALETTE
+ palette = np.array(palette)
+ assert palette.shape[0] == len(self.CLASSES)
+ assert palette.shape[1] == 3
+ assert len(palette.shape) == 2
+ assert 0 < opacity <= 1.0
+ color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
+ for label, color in enumerate(palette):
+ color_seg[seg == label, :] = color
+ # convert to BGR
+ color_seg = color_seg[..., ::-1]
+
+ img = img * (1 - opacity) + color_seg * opacity
+ img = img.astype(np.uint8)
+ # if out_file specified, do not show image in window
+ if out_file is not None:
+ show = False
+
+ if show:
+ mmcv.imshow(img, win_name, wait_time)
+ if out_file is not None:
+ mmcv.imwrite(img, out_file)
+
+ if not (show or out_file):
+ return img
+
+
def _get_bias_color(base, max_dist=30):
"""Get different colors for each masks.
diff --git a/easycv/toolkit/quantize/quantize_utils.py b/easycv/toolkit/quantize/quantize_utils.py
index 8759c593..c7ef8aa8 100644
--- a/easycv/toolkit/quantize/quantize_utils.py
+++ b/easycv/toolkit/quantize/quantize_utils.py
@@ -7,8 +7,8 @@
import torch
from mmcv.parallel import scatter_kwargs
+from easycv.models.detection.detectors.yolox.yolo_head import YOLOXHead
from easycv.models.detection.utils import output_postprocess, postprocess
-from easycv.models.detection.yolox.yolo_head import YOLOXHead
def quantize_config_check(device, backend, model_type=''):
diff --git a/easycv/utils/mmlab_utils.py b/easycv/utils/mmlab_utils.py
index bbf8e036..17899d08 100644
--- a/easycv/utils/mmlab_utils.py
+++ b/easycv/utils/mmlab_utils.py
@@ -1,5 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# flake8: noqa
+import copy
import inspect
import logging
@@ -13,6 +14,8 @@
from easycv.models.registry import BACKBONES, HEADS, MODELS, NECKS
from .test_util import run_in_subprocess
+MMDET = 'mmdet'
+
try:
from mmcv.runner.hooks import HOOKS
import mmdet
@@ -23,6 +26,15 @@
from mmdet.models.builder import HEADS as MMHEADS
from mmdet.core import BitmapMasks, PolygonMasks, encode_mask_results
from mmdet.core.mask import mask2bbox
+ MM_REGISTRY = {
+ MMDET: {
+ 'model': MMMODELS,
+ 'backbone': MMBACKBONES,
+ 'neck': MMNECKS,
+ 'head': MMHEADS
+ }
+ }
+ MM_ORIGINAL_REGISTRY = copy.deepcopy(MM_REGISTRY)
except ImportError:
pass
@@ -32,7 +44,7 @@
'neck': NECKS,
'head': HEADS
}
-MMDET = 'mmdet'
+
SUPPORT_MMLAB_TYPES = [MMDET]
_MMLAB_COPIES = locals()
@@ -61,7 +73,7 @@ def __init__(self, modules_config):
# Remove the annotation in feature
# self.fix_conflicts()
- self.MMTYPE_REGISTRY_MAP = self._get_mmtype_registry_map()
+ self.MMTYPE_REGISTRY_MAP = MMAdapter.reset_mm_registry()
self.modules_config = modules_config
def check_env(self):
@@ -104,7 +116,8 @@ def adapt_mmlab_modules(self):
self._merge_all_easycv_modules_to_mmlab(mmtype)
def wrap_module(self, mmtype, module_type, module_name):
- module_obj = self._get_mm_module_obj(mmtype, module_type, module_name)
+ module_obj = self._get_mm_module_obj_in_easycv(mmtype, module_type,
+ module_name)
if mmtype == MMDET:
MMDetWrapper().wrap_module(module_obj, module_type)
@@ -126,9 +139,13 @@ def _merge_mmlab_module_to_easycv(self,
# Add mmlab module to my module registry.
easycv_registry_type = EASYCV_REGISTRY_MAP[module_type]
# Copy a duplicate to avoid directly modifying the properties of the original object
- _MMLAB_COPIES[module_name] = type(module_name, (model_obj, ), dict())
- easycv_registry_type.register_module(
- _MMLAB_COPIES[module_name], force=force)
+ key = '.'.join([mmtype, module_type, module_name])
+ _MMLAB_COPIES[key] = type(module_name, (model_obj, ), dict())
+ easycv_registry_type.register_module(_MMLAB_COPIES[key], force=force)
+
+ def _get_mm_module_obj_in_easycv(self, mmtype, module_type, module_name):
+ key = '.'.join([mmtype, module_type, module_name])
+ return _MMLAB_COPIES[key]
def _get_mm_module_obj(self, mmtype, module_type, module_name):
if isinstance(module_name, str):
@@ -147,22 +164,21 @@ def _get_mm_module_obj(self, mmtype, module_type, module_name):
format(type(module_name)))
return module_obj
- def _get_mmtype_registry_map(self):
- registry_map = {
- MMDET: {
- 'model': MMMODELS,
- 'backbone': MMBACKBONES,
- 'neck': MMNECKS,
- 'head': MMHEADS
- }
- }
- return registry_map
+ @staticmethod
+ def reset_mm_registry():
+ for mmtype, registries in MM_ORIGINAL_REGISTRY.items():
+ for k, ori_v in registries.items():
+ MM_REGISTRY[mmtype][k]._module_dict = copy.deepcopy(
+ ori_v._module_dict)
+
+ return MM_REGISTRY
class MMDetWrapper:
- def __init__(self):
- self.refactor_modules()
+ def __init__(self, refactor_modules=True):
+ if refactor_modules:
+ self.refactor_modules()
def wrap_module(self, cls, module_type):
if hasattr(cls, 'is_wrap') and cls.is_wrap:
@@ -342,3 +358,9 @@ def dynamic_adapt_for_mmlab(cfg):
if len(mmlab_modules_cfg) > 1:
adapter = MMAdapter(mmlab_modules_cfg)
adapter.adapt_mmlab_modules()
+
+
+def remove_adapt_for_mmlab(cfg):
+ mmlab_modules_cfg = cfg.get('mmlab_modules', [])
+ adapter = MMAdapter(mmlab_modules_cfg)
+ adapter.reset_mm_registry()
diff --git a/easycv/utils/ms_utils.py b/easycv/utils/ms_utils.py
new file mode 100644
index 00000000..eea8448c
--- /dev/null
+++ b/easycv/utils/ms_utils.py
@@ -0,0 +1,149 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+
+import jsonplus
+
+from easycv.file import io
+from easycv.utils.config_tools import Config
+
+MODELSCOPE_PREFIX = 'modelscope'
+
+
+class EasyCVMeta:
+ ARCH = '__easycv_arch__'
+
+ META = '__easycv_meta__'
+ RESERVED_KEYS = 'reserved_keys'
+
+
+def to_ms_config(cfg,
+ task,
+ ms_model_name,
+ pipeline_name,
+ save_path=None,
+ reserved_keys=[],
+ dump=True):
+ """Convert EasyCV config to ModelScope style.
+
+ Args:
+ cfg (str | Config): Easycv config file or Config object.
+ task (str): Task name in modelscope, refer to: modelscope.utils.constant.Tasks.
+ ms_model_name (str): Model name registered in modelscope, model type will be replaced with `ms_model_name`, used in modelscope.
+ pipeline_name (str): Predict pipeline name registered in modelscope, refer to: modelscope/pipelines/cv/easycv_pipelines.
+ save_path (str): Save path for saving the generated modelscope configuration file. Only valid when dump is True.
+ reserved_keys (list of str): The conversion may drop some of the original global keys; not all keys are retained.
+ If you need to keep certain keys, e.g. keep the `CLASSES` key of the config for inference, specify reserved_keys=['CLASSES'].
+ dump (bool): Whether dump the converted config to `save_path`.
+ """
+ # TODO: support multi eval_pipelines
+ # TODO: support for adding customized required keys to the configuration file
+
+ if isinstance(cfg, str):
+ easycv_cfg = Config.fromfile(cfg)
+ if dump and save_path is None:
+ save_dir = os.path.dirname(cfg)
+ save_name = MODELSCOPE_PREFIX + '_' + os.path.splitext(
+ os.path.basename(cfg))[0] + '.json'
+ save_path = os.path.join(save_dir, save_name)
+ else:
+ easycv_cfg = cfg
+ if dump and save_path is None:
+ raise ValueError('Please provide `save_path`!')
+
+ assert save_path is None or save_path.endswith('json'), 'Only support json file!'
+ optimizer_options = easycv_cfg.optimizer_config
+
+ val_dataset_cfg = easycv_cfg.data.val
+ val_imgs_per_gpu = val_dataset_cfg.pop('imgs_per_gpu',
+ easycv_cfg.data.imgs_per_gpu)
+ val_workers_per_gpu = val_dataset_cfg.pop('workers_per_gpu',
+ easycv_cfg.data.workers_per_gpu)
+
+ log_config = easycv_cfg.log_config
+ predict_config = easycv_cfg.get('predict', None)
+
+ hooks = [{
+ 'type': 'CheckpointHook',
+ 'interval': easycv_cfg.checkpoint_config.interval
+ }, {
+ 'type': 'EvaluationHook',
+ 'interval': easycv_cfg.eval_config.interval
+ }, {
+ 'type': 'AddLrLogHook'
+ }, {
+ 'type': 'IterTimerHook'
+ }]
+
+ custom_hooks = easycv_cfg.get('custom_hooks', [])
+ hooks.extend(custom_hooks)
+
+ for log_hook_i in log_config.hooks:
+ if log_hook_i['type'] == 'TensorboardLoggerHook':
+ # replace with modelscope api
+ hooks.append({
+ 'type': 'TensorboardHook',
+ 'interval': log_config.interval
+ })
+ elif log_hook_i['type'] == 'TextLoggerHook':
+ # use modelscope api
+ hooks.append({
+ 'type': 'TextLoggerHook',
+ 'interval': log_config.interval
+ })
+ else:
+ log_hook_i.update({'interval': log_config.interval})
+ hooks.append(log_hook_i)
+
+ ori_model_type = easycv_cfg.model.pop('type')
+
+ ms_cfg = Config(
+ dict(
+ task=task,
+ framework='pytorch',
+ preprocessor={}, # adapt to modelscope, do nothing
+ model={
+ 'type': ms_model_name,
+ **easycv_cfg.model, EasyCVMeta.ARCH: {
+ 'type': ori_model_type
+ }
+ },
+ dataset=dict(train=easycv_cfg.data.train, val=val_dataset_cfg),
+ train=dict(
+ work_dir=easycv_cfg.get('work_dir', None),
+ max_epochs=easycv_cfg.total_epochs,
+ dataloader=dict(
+ batch_size_per_gpu=easycv_cfg.data.imgs_per_gpu,
+ workers_per_gpu=easycv_cfg.data.workers_per_gpu,
+ ),
+ optimizer=dict(
+ **easycv_cfg.optimizer, options=optimizer_options),
+ lr_scheduler=easycv_cfg.lr_config,
+ hooks=hooks),
+ evaluation=dict(
+ dataloader=dict(
+ batch_size_per_gpu=val_imgs_per_gpu,
+ workers_per_gpu=val_workers_per_gpu,
+ ),
+ metrics={
+ 'type': 'EasyCVMetric',
+ 'evaluators': easycv_cfg.eval_pipelines[0].evaluators
+ }),
+ pipeline=dict(type=pipeline_name, predictor_config=predict_config),
+ ))
+
+ for key in reserved_keys:
+ ms_cfg.merge_from_dict({key: getattr(easycv_cfg, key)})
+
+ if len(reserved_keys) > 0:
+ ms_cfg.merge_from_dict(
+ {EasyCVMeta.META: {
+ EasyCVMeta.RESERVED_KEYS: reserved_keys
+ }})
+
+ if dump:
+ with io.open(save_path, 'w') as f:
+ res = jsonplus.dumps(
+ ms_cfg._cfg_dict.to_dict(), indent=4, sort_keys=False)
+ f.write(res)
+
+ return ms_cfg
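+
+
+ # Hedged usage sketch (the task, model and pipeline names below are
+ # placeholders; use the names actually registered in ModelScope):
+ # >>> ms_cfg = to_ms_config(
+ # ...     'configs/face/face_96x96_wingloss.py',
+ # ...     task='face-2d-keypoints',
+ # ...     ms_model_name='face-2d-keypoints-model',
+ # ...     pipeline_name='face-2d-keypoints-pipeline',
+ # ...     reserved_keys=['POINT_NUMBER'],
+ # ...     dump=False)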
diff --git a/easycv/version.py b/easycv/version.py
index 715bd6df..d1b30032 100644
--- a/easycv/version.py
+++ b/easycv/version.py
@@ -2,5 +2,5 @@
# GENERATED VERSION FILE
# TIME: Thu Nov 5 14:17:50 2020
-__version__ = '0.6.1'
-short_version = '0.6.1'
+__version__ = '0.6.3.1'
+short_version = '0.6.3.1'
diff --git a/requirements/optional.txt b/requirements/optional.txt
index 4b97d767..0fd17691 100644
--- a/requirements/optional.txt
+++ b/requirements/optional.txt
@@ -1,2 +1,4 @@
http://pai-nni.oss-cn-zhangjiakou.aliyuncs.com/release/2.6.1/pai_nni-2.6.1-py3-none-manylinux1_x86_64.whl
+http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/pkgs/whl/panopticapi/panopticapi-0.1-py3-none-any.whl
http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/third_party/blade_compression-0.0.2-py3-none-any.whl
+https://developer.download.nvidia.com/compute/redist/nvidia-dali-cuda100/nvidia_dali_cuda100-0.25.0-1535750-py3-none-manylinux2014_x86_64.whl
diff --git a/requirements/runtime.txt b/requirements/runtime.txt
index 9deeae54..60075ec5 100644
--- a/requirements/runtime.txt
+++ b/requirements/runtime.txt
@@ -3,21 +3,20 @@ dataclasses
einops
future
h5py
-http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/pkgs/whl/panopticapi/panopticapi-0.1-py3-none-any.whl
-https://developer.download.nvidia.com/compute/redist/nvidia-dali-cuda100/nvidia_dali_cuda100-0.25.0-1535750-py3-none-manylinux2014_x86_64.whl
+imgaug
json_tricks
numpy
-opencv-python-headless
+opencv-python
oss2
packaging
Pillow
prettytable
pycocotools
-pytorch_metric_learning==0.9.89
+pytorch_metric_learning>=0.9.89
scikit-image
sklearn
tensorboard
thop
-timm==0.4.9
+timm>=0.4.9
xtcocotools
yacs
diff --git a/tests/core/evaluation/test_keypoint_eval.py b/tests/core/evaluation/test_keypoint_eval.py
new file mode 100644
index 00000000..4a5d0216
--- /dev/null
+++ b/tests/core/evaluation/test_keypoint_eval.py
@@ -0,0 +1,51 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import numpy as np
+
+from easycv.core.evaluation import KeyPointEvaluator
+
+
+class KeyPointEvaluatorTest(unittest.TestCase):
+
+ def setUp(self):
+ print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+
+ def test_keypoint_evaluator_pck(self):
+ evaluator = KeyPointEvaluator(pck_thr=0.5, pckh_thr=0.5, auc_nor=30)
+ output = np.zeros((5, 3))
+ target = np.zeros((5, 3))
+ mask = np.zeros((5, 3))
+ mask[:, :2] = 1
+ # first channel
+ output[0] = [10, 0, 0]
+ target[0] = [10, 0, 0]
+ # second channel
+ output[1] = [20, 20, 0]
+ target[1] = [10, 10, 0]
+ # third channel
+ output[2] = [0, 0, 0]
+ target[2] = [-1, 0, 0]
+ # fourth channel
+ output[3] = [30, 30, 0]
+ target[3] = [30, 30, 0]
+ # fifth channel
+ output[4] = [0, 10, 0]
+ target[4] = [0, 10, 0]
+ preds = {'keypoints': output}
+ db = {
+ 'joints_3d': target,
+ 'joints_3d_visible': mask,
+ 'bbox': [10, 10, 10, 10],
+ 'head_size': 10
+ }
+ eval_res = evaluator.evaluate([preds, preds], [db, db])
+ self.assertAlmostEqual(eval_res['PCK'], 0.8)
+ self.assertAlmostEqual(eval_res['PCKh'], 0.8)
+ self.assertAlmostEqual(eval_res['EPE'], 3.0284271240234375)
+ self.assertAlmostEqual(eval_res['AUC'], 0.86)
+ self.assertAlmostEqual(eval_res['NME'], 3.0284271240234375)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/datasets/pose/data_sources/test_coco_hand.py b/tests/datasets/pose/data_sources/test_coco_hand.py
new file mode 100644
index 00000000..38b91d3c
--- /dev/null
+++ b/tests/datasets/pose/data_sources/test_coco_hand.py
@@ -0,0 +1,59 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import random
+import unittest
+
+import numpy as np
+from tests.ut_config import SMALL_COCO_WHOLE_BODY_HAND_ROOT
+
+from easycv.datasets.pose.data_sources import HandCocoPoseTopDownSource
+
+_DATA_CFG = dict(
+ image_size=[256, 256],
+ heatmap_size=[64, 64],
+ num_output_channels=21,
+ num_joints=21,
+ dataset_channel=[list(range(21))],
+ inference_channel=list(range(21)),
+)
+
+
+class HandCocoPoseSourceCocoTest(unittest.TestCase):
+
+ def setUp(self):
+ print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+
+ def test_top_down_source_coco(self):
+ data_source = HandCocoPoseTopDownSource(
+ data_cfg=_DATA_CFG,
+ ann_file=
+ f'{SMALL_COCO_WHOLE_BODY_HAND_ROOT}/annotations/small_whole_body_hand_coco.json',
+ img_prefix=f'{SMALL_COCO_WHOLE_BODY_HAND_ROOT}/train2017/')
+ index_list = random.choices(list(range(4)), k=3)
+ for idx in index_list:
+ data = data_source[idx]
+ self.assertIn('image_file', data)
+ self.assertIn('image_id', data)
+ self.assertIn('bbox_score', data)
+ self.assertIn('bbox_id', data)
+ self.assertIn('image_id', data)
+ self.assertEqual(data['center'].shape, (2, ))
+ self.assertEqual(data['scale'].shape, (2, ))
+ self.assertEqual(len(data['bbox']), 4)
+ self.assertEqual(data['joints_3d'].shape, (21, 3))
+ self.assertEqual(data['joints_3d_visible'].shape, (21, 3))
+ self.assertEqual(data['img'].shape[-1], 3)
+ ann_info = data['ann_info']
+ self.assertTrue(
+ (ann_info['image_size'] == np.array([256, 256])).all())
+ self.assertTrue(
+ (ann_info['heatmap_size'] == np.array([64, 64])).all())
+ self.assertEqual(ann_info['num_joints'], 21)
+ self.assertEqual(len(ann_info['inference_channel']), 21)
+ self.assertEqual(ann_info['num_output_channels'], 21)
+ break
+
+ self.assertEqual(len(data_source), 4)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/datasets/pose/test_coco_whole_body_hand_dataset.py b/tests/datasets/pose/test_coco_whole_body_hand_dataset.py
new file mode 100644
index 00000000..6f7e3636
--- /dev/null
+++ b/tests/datasets/pose/test_coco_whole_body_hand_dataset.py
@@ -0,0 +1,75 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import torch
+from tests.ut_config import SMALL_COCO_WHOLE_BODY_HAND_ROOT
+
+from easycv.datasets.pose import HandCocoWholeBodyDataset
+
+_DATA_CFG = dict(
+ image_size=[256, 256],
+ heatmap_size=[64, 64],
+ num_output_channels=21,
+ num_joints=21,
+ dataset_channel=[list(range(21))],
+ inference_channel=list(range(21)))
+
+_DATASET_ARGS = [{
+ 'data_source':
+ dict(
+ type='HandCocoPoseTopDownSource',
+ data_cfg=_DATA_CFG,
+ ann_file=
+ f'{SMALL_COCO_WHOLE_BODY_HAND_ROOT}/annotations/small_whole_body_hand_coco.json',
+ img_prefix=f'{SMALL_COCO_WHOLE_BODY_HAND_ROOT}/train2017/'),
+ 'pipeline': [
+ dict(type='TopDownRandomFlip', flip_prob=0.5),
+ dict(type='TopDownAffine'),
+ dict(type='MMToTensor'),
+ dict(type='TopDownGenerateTarget', sigma=3),
+ dict(
+ type='PoseCollect',
+ keys=['img', 'target', 'target_weight'],
+ meta_keys=[
+ 'image_file', 'joints_3d', 'flip_pairs', 'joints_3d_visible',
+ 'center', 'scale', 'rotation', 'bbox_score'
+ ])
+ ]
+}, {}]
+
+
+class PoseTopDownDatasetTest(unittest.TestCase):
+
+ def setUp(self):
+ print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+
+ @staticmethod
+ def build_dataset(index):
+ dataset = HandCocoWholeBodyDataset(
+ data_source=_DATASET_ARGS[index].get('data_source', None),
+ pipeline=_DATASET_ARGS[index].get('pipeline', None))
+
+ return dataset
+
+ def test_0(self, index=0):
+ dataset = self.build_dataset(index)
+ ann_info = dataset.data_source.ann_info
+
+ self.assertEqual(len(dataset), 4)
+ for i, batch in enumerate(dataset):
+ self.assertEqual(
+ batch['img'].shape,
+ torch.Size([3] + list(ann_info['image_size'][::-1])))
+ self.assertEqual(batch['target'].shape,
+ (ann_info['num_joints'], ) +
+ tuple(ann_info['heatmap_size'][::-1]))
+ self.assertEqual(batch['img_metas'].data['joints_3d'].shape,
+ (ann_info['num_joints'], 3))
+ self.assertIn('center', batch['img_metas'].data)
+ self.assertIn('scale', batch['img_metas'].data)
+
+ break
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/datasets/shared/pipelines/__init__.py b/tests/datasets/shared/pipelines/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/datasets/shared/pipelines/test_transforms.py b/tests/datasets/shared/pipelines/test_transforms.py
new file mode 100644
index 00000000..0e03b41a
--- /dev/null
+++ b/tests/datasets/shared/pipelines/test_transforms.py
@@ -0,0 +1,54 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import unittest
+
+import cv2
+import numpy as np
+from PIL import Image
+from tests.ut_config import TEST_IMAGES_DIR
+
+from easycv.datasets.shared.pipelines.transforms import LoadImage
+
+
+class LoadImageTest(unittest.TestCase):
+
+ def setUp(self):
+ print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+
+ def _check_results(self, results):
+ shape = (1350, 2408, 3)
+ self.assertEqual(results['img_shape'], shape)
+ self.assertEqual(results['ori_shape'], shape)
+ self.assertListEqual(results['img_fields'], ['img'])
+ self.assertEqual(results['img'].shape, shape)
+
+ def test_load_np(self):
+ load_op = LoadImage()
+ img_path = os.path.join(TEST_IMAGES_DIR, 'multi_face.jpg')
+ inputs = {'img': cv2.imread(img_path)}
+ results = load_op(inputs)
+ self._check_results(results)
+ self.assertEqual(results['filename'], None)
+ self.assertEqual(results['img'].dtype, np.uint8)
+
+ def test_load_pil(self):
+ load_op = LoadImage(to_float32=True)
+ img_path = os.path.join(TEST_IMAGES_DIR, 'multi_face.jpg')
+ inputs = {'img': Image.open(img_path)}
+ results = load_op(inputs)
+ self._check_results(results)
+ self.assertEqual(results['filename'], None)
+ self.assertEqual(results['img'].dtype, np.float32)
+
+ def test_load_path(self):
+ load_op = LoadImage(to_float32=True)
+ img_path = os.path.join(TEST_IMAGES_DIR, 'multi_face.jpg')
+ inputs = {'filename': img_path}
+ results = load_op(inputs)
+ self._check_results(results)
+ self.assertEqual(results['filename'], img_path)
+ self.assertEqual(results['img'].dtype, np.float32)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/predictors/test_face_keypoints_predictor.py b/tests/predictors/test_face_keypoints_predictor.py
new file mode 100644
index 00000000..67482e51
--- /dev/null
+++ b/tests/predictors/test_face_keypoints_predictor.py
@@ -0,0 +1,56 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+
+import cv2
+
+from easycv.predictors.face_keypoints_predictor import FaceKeypointsPredictor
+
+
+class FaceKeypointsPredictorWithoutDetectorTest(unittest.TestCase):
+
+ def setUp(self):
+ print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+ self.image_path = './data/test/face_2d_keypoints/data/002258.png'
+ self.save_image_path = './data/test/face_2d_keypoints/data/result_002258.png'
+ self.model_path = './data/test/face_2d_keypoints/models/epoch_400.pth'
+ self.model_config_path = './configs/face/face_96x96_wingloss.py'
+
+ def test_single(self):
+ predict_pipeline = FaceKeypointsPredictor(
+ model_path=self.model_path, config_file=self.model_config_path)
+ output = predict_pipeline(self.image_path)[0][0]
+ output_keypoints = output['point']
+ output_pose = output['pose']
+ img = cv2.imread(self.image_path)
+ image_show = predict_pipeline.show_result(
+ img, output_keypoints, scale=2, save_path=self.save_image_path)
+ self.assertEqual(output_keypoints.shape[0], 106)
+ self.assertEqual(output_keypoints.shape[1], 2)
+ self.assertEqual(output_pose.shape[0], 3)
+
+ def test_batch(self):
+ predict_pipeline = FaceKeypointsPredictor(
+ model_path=self.model_path,
+ config_file=self.model_config_path,
+ batch_size=2)
+
+ total_samples = 3
+ output = predict_pipeline([self.image_path] * total_samples)
+
+ self.assertEqual(len(output), 2)
+ self.assertEqual(len(output[0]), 2)
+ self.assertEqual(len(output[1]), 1)
+ self.assertEqual(output[0][0]['point'].shape[0], 106)
+ self.assertEqual(output[0][0]['point'].shape[1], 2)
+ self.assertEqual(output[0][0]['pose'].shape[0], 3)
+ self.assertEqual(output[0][1]['point'].shape[0], 106)
+ self.assertEqual(output[0][1]['point'].shape[1], 2)
+ self.assertEqual(output[0][1]['pose'].shape[0], 3)
+ self.assertEqual(output[1][0]['point'].shape[0], 106)
+ self.assertEqual(output[1][0]['point'].shape[1], 2)
+ self.assertEqual(output[1][0]['pose'].shape[0], 3)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/predictors/test_hand_keypoints_predictor.py b/tests/predictors/test_hand_keypoints_predictor.py
new file mode 100644
index 00000000..b2bca4cf
--- /dev/null
+++ b/tests/predictors/test_hand_keypoints_predictor.py
@@ -0,0 +1,44 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+
+from easycv.predictors.hand_keypoints_predictor import HandKeypointsPredictor
+from easycv.utils.config_tools import mmcv_config_fromfile
+
+MM_DEFAULT_HAND_DETECTION_SSDLITE_MODEL_PATH = 'https://download.openmmlab.com/mmpose/mmdet_pretrained/' \
+ 'ssdlite_mobilenetv2_scratch_600e_onehand-4f9f8686_20220523.pth'
+MM_DEFAULT_HAND_DETECTION_SSDLITE_CONFIG_FILE = 'data/test/pose/hand/configs/hand_keypoints_predictor.py'
+
+
+class HandKeypointsPredictorTest(unittest.TestCase):
+
+ def setUp(self):
+ print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+ self.image_path = 'data/test/pose/hand/data/hand.jpg'
+ self.save_image_path = 'data/test/pose/hand/data/hand_result.jpg'
+ self.model_path = 'data/test/pose/hand/hrnet_w18_256x256.pth'
+ self.model_config_path = 'configs/pose/hand/hrnet_w18_coco_wholebody_hand_256x256_dark.py'
+
+ def test_single(self):
+ config = mmcv_config_fromfile(self.model_config_path)
+ predict_pipeline = HandKeypointsPredictor(
+ model_path=self.model_path,
+ config_file=config,
+ detection_predictor_config=dict(
+ type='DetectionPredictor',
+ model_path=MM_DEFAULT_HAND_DETECTION_SSDLITE_MODEL_PATH,
+ config_file=MM_DEFAULT_HAND_DETECTION_SSDLITE_CONFIG_FILE,
+ score_threshold=0.5))
+
+ output = predict_pipeline(self.image_path)[0]
+ keypoints = output['keypoints']
+ boxes = output['boxes']
+ image_show = predict_pipeline.show_result(
+ self.image_path, keypoints, boxes, save_path=self.save_image_path)
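+ # expect a single detected hand with 21 keypoints, each given as (x, y, score)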
+ self.assertEqual(keypoints.shape[0], 1)
+ self.assertEqual(keypoints.shape[1], 21)
+ self.assertEqual(keypoints.shape[2], 3)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/predictors/test_segmentation.py b/tests/predictors/test_segmentation.py
new file mode 100644
index 00000000..e84a3e1a
--- /dev/null
+++ b/tests/predictors/test_segmentation.py
@@ -0,0 +1,107 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import pickle
+import shutil
+import tempfile
+import unittest
+
+import numpy as np
+from PIL import Image
+from tests.ut_config import (MODEL_CONFIG_SEGFORMER,
+ PRETRAINED_MODEL_SEGFORMER, TEST_IMAGES_DIR)
+
+from easycv.predictors.segmentation import SegmentationPredictor
+
+
+class SegmentationPredictorTest(unittest.TestCase):
+
+ def setUp(self):
+ print('Testing %s.%s' % (type(self).__name__, self._testMethodName))
+
+ def test_single(self):
+ segmentation_model_path = PRETRAINED_MODEL_SEGFORMER
+ segmentation_model_config = MODEL_CONFIG_SEGFORMER
+
+ img_path = os.path.join(TEST_IMAGES_DIR, '000000289059.jpg')
+ img = np.asarray(Image.open(img_path))
+
+ predict_pipeline = SegmentationPredictor(
+ model_path=segmentation_model_path,
+ config_file=segmentation_model_config)
+
+ outputs = predict_pipeline(img_path, keep_inputs=True)
+ self.assertEqual(len(outputs), 1)
+ self.assertEqual(outputs[0]['inputs'], [img_path])
+
+ results = outputs[0]['results']
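+ # the predicted mask should match the input resolution; spot-check the
+ # class ids of a few pixels in the first and last rows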
+ self.assertListEqual(
+ list(img.shape)[:2], list(results['seg_pred'][0].shape))
+ self.assertListEqual(results['seg_pred'][0][1, :10].tolist(),
+ [161 for i in range(10)])
+ self.assertListEqual(results['seg_pred'][0][-1, -10:].tolist(),
+ [133 for i in range(10)])
+
+ def test_batch(self):
+ segmentation_model_path = PRETRAINED_MODEL_SEGFORMER
+ segmentation_model_config = MODEL_CONFIG_SEGFORMER
+
+ img_path = os.path.join(TEST_IMAGES_DIR, '000000289059.jpg')
+ img = np.asarray(Image.open(img_path))
+
+ predict_pipeline = SegmentationPredictor(
+ model_path=segmentation_model_path,
+ config_file=segmentation_model_config,
+ batch_size=2)
+
+ total_samples = 3
+ outputs = predict_pipeline(
+ [img_path] * total_samples, keep_inputs=True)
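+ # with batch_size=2, the 3 inputs are grouped into batches of 2 and 1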
+ self.assertEqual(len(outputs), 2)
+
+ self.assertEqual(outputs[0]['inputs'], [img_path] * 2)
+ self.assertEqual(outputs[1]['inputs'], [img_path] * 1)
+ self.assertEqual(len(outputs[0]['results']['seg_pred']), 2)
+ self.assertEqual(len(outputs[1]['results']['seg_pred']), 1)
+
+ for result in [outputs[0]['results'], outputs[1]['results']]:
+ self.assertListEqual(
+ list(img.shape)[:2], list(result['seg_pred'][0].shape))
+ self.assertListEqual(result['seg_pred'][0][1, :10].tolist(),
+ [161 for i in range(10)])
+ self.assertListEqual(result['seg_pred'][0][-1, -10:].tolist(),
+ [133 for i in range(10)])
+
+ def test_dump(self):
+ segmentation_model_path = PRETRAINED_MODEL_SEGFORMER
+ segmentation_model_config = MODEL_CONFIG_SEGFORMER
+
+ img_path = os.path.join(TEST_IMAGES_DIR, '000000289059.jpg')
+
+ # mkdtemp creates a directory that persists until it is removed below;
+ # tempfile.TemporaryDirectory().name can disappear as soon as the object
+ # is garbage collected, which makes the test flaky
+ temp_dir = tempfile.mkdtemp()
+ tmp_path = os.path.join(temp_dir, 'results.pkl')
+
+ predict_pipeline = SegmentationPredictor(
+ model_path=segmentation_model_path,
+ config_file=segmentation_model_config,
+ batch_size=2,
+ save_results=True,
+ save_path=tmp_path)
+
+ total_samples = 3
+ outputs = predict_pipeline(
+ [img_path] * total_samples, keep_inputs=True)
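+ # when save_results=True the predictions are pickled to save_path and
+ # nothing is returned in memory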
+ self.assertEqual(outputs, [])
+
+ with open(tmp_path, 'rb') as f:
+ results = pickle.loads(f.read())
+
+ self.assertIn('inputs', results[0])
+ self.assertIn('results', results[0])
+
+ shutil.rmtree(temp_dir, ignore_errors=True)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/predictors/test_segmentor.py b/tests/predictors/test_segmentor.py
new file mode 100644
index 00000000..1ca3eece
--- /dev/null
+++ b/tests/predictors/test_segmentor.py
@@ -0,0 +1,48 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+"""
+isort:skip_file
+"""
+import os
+import unittest
+
+import numpy as np
+from PIL import Image
+
+from tests.ut_config import TEST_IMAGES_DIR
+from tests.ut_config import (PRETRAINED_MODEL_SEGFORMER,
+ MODEL_CONFIG_SEGFORMER)
+from easycv.predictors.segmentation import SegFormerPredictor
+
+
+class SegmentorTest(unittest.TestCase):
+
+ def setUp(self):
+ print('Testing %s.%s' % (type(self).__name__, self._testMethodName))
+
+ def test_segformer_detector(self):
+ segmentation_model_path = PRETRAINED_MODEL_SEGFORMER
+ segmentation_model_config = MODEL_CONFIG_SEGFORMER
+
+ img = os.path.join(TEST_IMAGES_DIR, '000000289059.jpg')
+ if not os.path.exists(img):
+ img = './data/test/segmentation/coco_stuff_164k/val2017/000000289059.jpg'
+
+ input_data_list = [np.asarray(Image.open(img))]
+ predictor = SegFormerPredictor(
+ model_path=segmentation_model_path,
+ model_config=segmentation_model_config)
+
+ output = predictor.predict(input_data_list)[0]
+ self.assertIn('seg_pred', output)
+
+ self.assertListEqual(
+ list(input_data_list[0].shape)[:2],
+ list(output['seg_pred'][0].shape))
+ self.assertListEqual(output['seg_pred'][0][1, :10].tolist(),
+ [161 for i in range(10)])
+ self.assertListEqual(output['seg_pred'][0][-1, -10:].tolist(),
+ [133 for i in range(10)])
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/ut_config.py b/tests/ut_config.py
index 64284b1d..7a324a69 100644
--- a/tests/ut_config.py
+++ b/tests/ut_config.py
@@ -122,3 +122,11 @@
PRETRAINED_MODEL_MASK2FORMER = os.path.join(
BASE_LOCAL_PATH,
'pretrained_models/segmentation/mask2former/mask2former_r50_instance.pth')
+
+PRETRAINED_MODEL_SEGFORMER = os.path.join(
+ BASE_LOCAL_PATH,
+ 'pretrained_models/segmentation/segformer/segformer_b0/SegmentationEvaluator_mIoU_best.pth'
+)
+MODEL_CONFIG_SEGFORMER = (
+ './configs/segmentation/segformer/segformer_b0_coco.py')
+SMALL_COCO_WHOLE_BODY_HAND_ROOT = 'data/test/pose/hand/small_whole_body_hand_coco'
diff --git a/tests/utils/test_mmlab_utils.py b/tests/utils/test_mmlab_utils.py
index 1468ac48..714a1ce1 100644
--- a/tests/utils/test_mmlab_utils.py
+++ b/tests/utils/test_mmlab_utils.py
@@ -1,6 +1,7 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import unittest
+from inspect import signature
import torch
from mmcv.parallel import scatter_kwargs
@@ -11,7 +12,9 @@
from easycv.datasets import build_dataloader, build_dataset
from easycv.models.builder import build_model
from easycv.utils.config_tools import mmcv_config_fromfile
-from easycv.utils.mmlab_utils import dynamic_adapt_for_mmlab
+from easycv.utils.mmlab_utils import (MM_REGISTRY, MMDET,
+ dynamic_adapt_for_mmlab,
+ remove_adapt_for_mmlab)
class MMLabUtilTest(unittest.TestCase):
@@ -24,6 +27,7 @@ def _get_model(self):
cfg = mmcv_config_fromfile(config_path)
dynamic_adapt_for_mmlab(cfg)
model = build_model(cfg.model)
+ self.cfg = cfg
return model
@@ -89,24 +93,17 @@ def _get_dataset(self, mode='train'):
pipeline=pipeline)
return build_dataset(dataset_cfg)
- def xxtest_model_train(self):
+ def test_model_train(self):
model = self._get_model()
model = model.cuda()
model.train()
dataset = self._get_dataset()
data_loader = build_dataloader(
- dataset, imgs_per_gpu=3, workers_per_gpu=1, num_gpus=1, dist=False)
+ dataset, imgs_per_gpu=1, workers_per_gpu=1, num_gpus=1, dist=False)
for i, data_batch in enumerate(data_loader):
- input_args, kwargs = scatter_kwargs(None, data_batch, [-1])
- for key in ['img', 'gt_bboxes', 'gt_labels']:
- if isinstance(kwargs[0][key], (list, tuple)):
- kwargs[0][key] = [
- kwargs[0][key][i].cuda()
- for i in range(len(kwargs[0][key]))
- ]
- else:
- kwargs[0][key] = kwargs[0][key].cuda()
+ input_args, kwargs = scatter_kwargs(None, data_batch,
+ [torch.cuda.current_device()])
output = model(**kwargs[0], mode='train')
self.assertEqual(len(output['loss_rpn_cls']), 5)
self.assertEqual(len(output['loss_rpn_bbox']), 5)
@@ -128,6 +125,19 @@ def test_model_test(self):
self.assertEqual(len(results['detection_masks']), 20)
self.assertEqual(len(results['img_metas']), 20)
+ def test_reset(self):
+ model = self._get_model()
+ remove_adapt_for_mmlab(self.cfg)
+ mmdet_registry = MM_REGISTRY[MMDET]
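+ # after removing the adaptation, every module registered in mmdet should
+ # be the original mmdet implementation rather than an easycv wrapper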
+ for module, registry in mmdet_registry.items():
+ for k, v in registry.module_dict.items():
+ self.assertTrue('easycv' not in str(v))
+
+ models = mmdet_registry['model']
+ mask_rcnn = models.get('MaskRCNN')
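+ # the restored MaskRCNN.forward should no longer expose the easycv-specific
+ # 'mode' argument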
+ sig_str = str(signature(mask_rcnn.forward))
+ self.assertTrue('mode' not in sig_str)
+
if __name__ == '__main__':
unittest.main()
diff --git a/tests/utils/test_ms_utils.py b/tests/utils/test_ms_utils.py
new file mode 100644
index 00000000..5f4baad7
--- /dev/null
+++ b/tests/utils/test_ms_utils.py
@@ -0,0 +1,52 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+
+import easycv
+from easycv.utils.config_tools import Config
+from easycv.utils.ms_utils import to_ms_config
+
+
+class MsConfigTest(unittest.TestCase):
+
+ def setUp(self):
+ print('Testing %s.%s' % (type(self).__name__, self._testMethodName))
+ # mkdtemp keeps the directory alive until tearDown removes it;
+ # TemporaryDirectory().name is deleted as soon as the object is
+ # garbage collected, which can make the test flaky
+ self.tmp_dir = tempfile.mkdtemp()
+
+ def tearDown(self):
+ super().tearDown()
+ shutil.rmtree(self.tmp_dir)
+
+ def test_to_ms_config(self):
+
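+ # convert an EasyCV config to an ms (ModelScope-style) json config and
+ # check that the expected top-level keys are present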
+ config_path = os.path.join(
+ os.path.dirname(os.path.dirname(easycv.__file__)),
+ 'configs/detection/yolox/yolox_s_8xb16_300e_coco.py')
+
+ ms_cfg_file = os.path.join(self.tmp_dir,
+ 'ms_yolox_s_8xb16_300e_coco.json')
+ to_ms_config(
+ config_path,
+ task='image-object-detection',
+ ms_model_name='yolox',
+ pipeline_name='easycv-detection',
+ reserved_keys=['CLASSES'],
+ save_path=ms_cfg_file)
+ cfg = Config.fromfile(ms_cfg_file)
+ self.assertIn('task', cfg)
+ self.assertIn('framework', cfg)
+ self.assertIn('CLASSES', cfg)
+ self.assertIn('preprocessor', cfg)
+ self.assertIn('pipeline', cfg)
+ self.assertEqual(cfg.model.type, 'yolox')
+ self.assertIn('dataset', cfg)
+ self.assertIn('batch_size_per_gpu', cfg.train.dataloader)
+ self.assertIn('batch_size_per_gpu', cfg.evaluation.dataloader)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tools/prepare_data/coco_stuff164k.py b/tools/prepare_data/coco_stuff164k.py
new file mode 100644
index 00000000..d2e7cc8a
--- /dev/null
+++ b/tools/prepare_data/coco_stuff164k.py
@@ -0,0 +1,266 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Adapt from: https://github.com/open-mmlab/mmsegmentation/blob/master/tools/convert_datasets/coco_stuff164k.py
+
+import argparse
+import os.path as osp
+import shutil
+from functools import partial
+from glob import glob
+
+import mmcv
+import numpy as np
+from PIL import Image
+
+COCO_LEN = 123287
+
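+# map the original COCO-Stuff label ids (which contain gaps for removed
+# categories) to contiguous train ids; 255 remains the ignore label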
+clsID_to_trID = {
+ 0: 0,
+ 1: 1,
+ 2: 2,
+ 3: 3,
+ 4: 4,
+ 5: 5,
+ 6: 6,
+ 7: 7,
+ 8: 8,
+ 9: 9,
+ 10: 10,
+ 12: 11,
+ 13: 12,
+ 14: 13,
+ 15: 14,
+ 16: 15,
+ 17: 16,
+ 18: 17,
+ 19: 18,
+ 20: 19,
+ 21: 20,
+ 22: 21,
+ 23: 22,
+ 24: 23,
+ 26: 24,
+ 27: 25,
+ 30: 26,
+ 31: 27,
+ 32: 28,
+ 33: 29,
+ 34: 30,
+ 35: 31,
+ 36: 32,
+ 37: 33,
+ 38: 34,
+ 39: 35,
+ 40: 36,
+ 41: 37,
+ 42: 38,
+ 43: 39,
+ 45: 40,
+ 46: 41,
+ 47: 42,
+ 48: 43,
+ 49: 44,
+ 50: 45,
+ 51: 46,
+ 52: 47,
+ 53: 48,
+ 54: 49,
+ 55: 50,
+ 56: 51,
+ 57: 52,
+ 58: 53,
+ 59: 54,
+ 60: 55,
+ 61: 56,
+ 62: 57,
+ 63: 58,
+ 64: 59,
+ 66: 60,
+ 69: 61,
+ 71: 62,
+ 72: 63,
+ 73: 64,
+ 74: 65,
+ 75: 66,
+ 76: 67,
+ 77: 68,
+ 78: 69,
+ 79: 70,
+ 80: 71,
+ 81: 72,
+ 83: 73,
+ 84: 74,
+ 85: 75,
+ 86: 76,
+ 87: 77,
+ 88: 78,
+ 89: 79,
+ 91: 80,
+ 92: 81,
+ 93: 82,
+ 94: 83,
+ 95: 84,
+ 96: 85,
+ 97: 86,
+ 98: 87,
+ 99: 88,
+ 100: 89,
+ 101: 90,
+ 102: 91,
+ 103: 92,
+ 104: 93,
+ 105: 94,
+ 106: 95,
+ 107: 96,
+ 108: 97,
+ 109: 98,
+ 110: 99,
+ 111: 100,
+ 112: 101,
+ 113: 102,
+ 114: 103,
+ 115: 104,
+ 116: 105,
+ 117: 106,
+ 118: 107,
+ 119: 108,
+ 120: 109,
+ 121: 110,
+ 122: 111,
+ 123: 112,
+ 124: 113,
+ 125: 114,
+ 126: 115,
+ 127: 116,
+ 128: 117,
+ 129: 118,
+ 130: 119,
+ 131: 120,
+ 132: 121,
+ 133: 122,
+ 134: 123,
+ 135: 124,
+ 136: 125,
+ 137: 126,
+ 138: 127,
+ 139: 128,
+ 140: 129,
+ 141: 130,
+ 142: 131,
+ 143: 132,
+ 144: 133,
+ 145: 134,
+ 146: 135,
+ 147: 136,
+ 148: 137,
+ 149: 138,
+ 150: 139,
+ 151: 140,
+ 152: 141,
+ 153: 142,
+ 154: 143,
+ 155: 144,
+ 156: 145,
+ 157: 146,
+ 158: 147,
+ 159: 148,
+ 160: 149,
+ 161: 150,
+ 162: 151,
+ 163: 152,
+ 164: 153,
+ 165: 154,
+ 166: 155,
+ 167: 156,
+ 168: 157,
+ 169: 158,
+ 170: 159,
+ 171: 160,
+ 172: 161,
+ 173: 162,
+ 174: 163,
+ 175: 164,
+ 176: 165,
+ 177: 166,
+ 178: 167,
+ 179: 168,
+ 180: 169,
+ 181: 170,
+ 255: 255
+}
+
+
+def convert_to_trainID(maskpath, out_mask_dir, is_train):
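+ # remap label ids according to clsID_to_trID and save the result as
+ # <name>_labelTrainIds.png under train2017/ or val2017/ in out_mask_dir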
+ mask = np.array(Image.open(maskpath))
+ mask_copy = mask.copy()
+ for clsID, trID in clsID_to_trID.items():
+ mask_copy[mask == clsID] = trID
+ sub_dir = 'train2017' if is_train else 'val2017'
+ seg_filename = osp.join(
+ out_mask_dir, sub_dir,
+ osp.basename(maskpath).split('.')[0] + '_labelTrainIds.png')
+ Image.fromarray(mask_copy).save(seg_filename, 'PNG')
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='Convert COCO Stuff 164k annotations to '
+ 'mmsegmentation format')
+ parser.add_argument('coco_path', help='coco stuff path')
+ parser.add_argument('-o', '--out_dir', help='output path')
+ parser.add_argument(
+ '--nproc', default=16, type=int, help='number of process')
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ args = parse_args()
+ coco_path = args.coco_path
+ nproc = args.nproc
+
+ out_dir = args.out_dir or coco_path
+ out_img_dir = osp.join(out_dir, 'images')
+ out_mask_dir = osp.join(out_dir, 'annotations')
+
+ mmcv.mkdir_or_exist(osp.join(out_mask_dir, 'train2017'))
+ mmcv.mkdir_or_exist(osp.join(out_mask_dir, 'val2017'))
+
+ if out_dir != coco_path:
+ shutil.copytree(osp.join(coco_path, 'images'), out_img_dir)
+
+ train_list = glob(osp.join(coco_path, 'annotations', 'train2017', '*.png'))
+ train_list = [file for file in train_list if '_labelTrainIds' not in file]
+ test_list = glob(osp.join(coco_path, 'annotations', 'val2017', '*.png'))
+ test_list = [file for file in test_list if '_labelTrainIds' not in file]
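+ # sanity check: train2017 and val2017 together should cover all 123287
+ # COCO images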
+ assert (len(train_list) +
+ len(test_list)) == COCO_LEN, 'Wrong length of list {} & {}'.format(
+ len(train_list), len(test_list))
+
+ if nproc > 1:
+ mmcv.track_parallel_progress(
+ partial(
+ convert_to_trainID, out_mask_dir=out_mask_dir, is_train=True),
+ train_list,
+ nproc=nproc)
+ mmcv.track_parallel_progress(
+ partial(
+ convert_to_trainID, out_mask_dir=out_mask_dir, is_train=False),
+ test_list,
+ nproc=nproc)
+ else:
+ mmcv.track_progress(
+ partial(
+ convert_to_trainID, out_mask_dir=out_mask_dir, is_train=True),
+ train_list)
+ mmcv.track_progress(
+ partial(
+ convert_to_trainID, out_mask_dir=out_mask_dir, is_train=False),
+ test_list)
+
+ print('Done!')
+
+
+if __name__ == '__main__':
+ main()