diff --git a/configs/body_3d_keypoint/pose_lift/README.md b/configs/body_3d_keypoint/pose_lift/README.md
index 7e5f9f7e2a..e3e6ff7176 100644
--- a/configs/body_3d_keypoint/pose_lift/README.md
+++ b/configs/body_3d_keypoint/pose_lift/README.md
@@ -16,23 +16,19 @@ For single-person 3D pose estimation from a monocular camera, existing works can
#### Human3.6m Dataset
-| Arch | Receptive Field | MPJPE | P-MPJPE | N-MPJPE | ckpt | log |
-
-| :------------------------------------------------------ | :-------------: | :---: | :-----: | :-----: | :------------------------------------------------------: | :-----------------------------------------------------: |
-
-| [VideoPose3D-supervised](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py) | 27 | 40.1 | 30.1 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) |
-
-| [VideoPose3D-supervised](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py) | 81 | 39.1 | 29.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) |
-
-| [VideoPose3D-supervised](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py) | 243 | | | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) |
-
-| [VideoPose3D-supervised-CPN](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py) | 1 | 53.0 | 41.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) |
-
-| [VideoPose3D-supervised-CPN](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 243 | | | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) |
-
-| [VideoPose3D-semi-supervised](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv_8xb64-200e_h36m.py) | 27 | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) |
-
-| [VideoPose3D-semi-supervised-CPN](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py) | 27 | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) |
+| Arch | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | Details and Download |
+| :-------------------------------------------- | :---: | :-----: | :-----: | :-------------------------------------------: | :------------------------------------------: | :---------------------------------------------: |
+| [VideoPose3D-supervised-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-160e_h36m.py) | 40.1 | 30.1 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) |
+| [VideoPose3D-supervised-81frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-160e_h36m.py) | 39.1 | 29.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) |
+| [VideoPose3D-supervised-243frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-160e_h36m.py) | 37.6 | 28.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) |
+| [VideoPose3D-supervised-CPN-1frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-160e_h36m.py) | 53.0 | 41.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) |
+| [VideoPose3D-supervised-CPN-243frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 47.9 | 38.0 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) |
+| [VideoPose3D-semi-supervised-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv_8xb64-200e_h36m.py) | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) |
+| [VideoPose3D-semi-supervised-CPN-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py) | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) |
+| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 35.3 | 27.7 | / | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) |
+| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 27.5 | 21.6 | / | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) |
+
+*Models with \* are converted from the official repo. The config files of these models are only for validation. We do not guarantee the training accuracy of these config files, and we welcome you to contribute your reproduction results.*
## Image-based Single-view 3D Human Body Pose Estimation
@@ -46,6 +42,6 @@ For single-person 3D pose estimation from a monocular camera, existing works can
#### Human3.6m Dataset
-| Arch | MPJPE | P-MPJPE | N-MPJPE | ckpt | log |
-| :------------------------------------------------------ | :-------------: | :---: | :-----: | :-----: | :------------------------------------------------------: | :-----------------------------------------------------: |
-| [SimpleBaseline3D-tcn](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_simplebaseline3d_8xb64-200e_h36m.py) | 43.4 | 34.3 | /|[ckpt](https://download.openmmlab.com/mmpose/body3d/simple_baseline/simple3Dbaseline_h36m-f0ad73a4_20210419.pth) | [log](https://download.openmmlab.com/mmpose/body3d/simple_baseline/20210415_065056.log.json) |
+| Arch | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | Details and Download |
+| :---------------------------------------- | :---: | :-----: | :-----: | :---------------------------------------: | :---------------------------------------: | :--------------------------------------------------------: |
+| [SimpleBaseline3D-tcn](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_simplebaseline3d_8xb64-200e_h36m.py) | 43.4 | 34.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/simple_baseline/simple3Dbaseline_h36m-f0ad73a4_20210419.pth) | [log](https://download.openmmlab.com/mmpose/body3d/simple_baseline/20210415_065056.log.json) | [simplebaseline3d_h36m.md](./h36m/simplebaseline3d_h36m.md) |
diff --git a/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md
new file mode 100644
index 0000000000..d830d65c18
--- /dev/null
+++ b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md
@@ -0,0 +1,53 @@
+
+
+
+MotionBERT (2022)
+
+```bibtex
+ @misc{Zhu_Ma_Liu_Liu_Wu_Wang_2022,
+ title={Learning Human Motion Representations: A Unified Perspective},
+ author={Zhu, Wentao and Ma, Xiaoxuan and Liu, Zhaoyang and Liu, Libin and Wu, Wayne and Wang, Yizhou},
+ year={2022},
+ month={Oct},
+ language={en-US}
+ }
+```
+
+
+
+
+
+
+Human3.6M (TPAMI'2014)
+
+```bibtex
+@article{h36m_pami,
+author = {Ionescu, Catalin and Papava, Dragos and Olaru, Vlad and Sminchisescu, Cristian},
+title = {Human3.6M: Large Scale Datasets and Predictive Methods for 3D Human Sensing in Natural Environments},
+journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
+publisher = {IEEE Computer Society},
+volume = {36},
+number = {7},
+pages = {1325-1339},
+month = {jul},
+year = {2014}
+}
+```
+
+
+
+Testing results on Human3.6M dataset with ground truth 2D detections
+
+| Arch | MPJPE | average MPJPE | P-MPJPE | ckpt |
+| :-------------------------------------------------------------------------------------- | :---: | :-----------: | :-----: | :--------------------------------------------------------------------------------------: |
+| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 35.3 | 35.3 | 27.7 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) |
+| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 27.5 | 27.4 | 21.6 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) |
+
+Testing results on Human3.6M dataset from the [official repo](https://github.com/Walter0807/MotionBERT) with ground truth 2D detections
+
+| Arch | MPJPE | average MPJPE | P-MPJPE | ckpt |
+| :-------------------------------------------------------------------------------------- | :---: | :-----------: | :-----: | :--------------------------------------------------------------------------------------: |
+| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 40.5 | 39.9 | 34.1 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) |
+| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 38.2 | 37.7 | 32.6 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) |
+
+*Models with \* are converted from the [official repo](https://github.com/Walter0807/MotionBERT). The config files of these models are only for validation. We do not guarantee the training accuracy of these config files, and we welcome you to contribute your reproduction results.*
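+
+A minimal sketch of how one of these checkpoints could be validated with MMEngine's `Runner` is given below. This is not an official command: it assumes the Human3.6M annotations are prepared under `data/h36m/` as required by the config, and the `work_dir` is an arbitrary output path.
+
+```python
+from mmengine.config import Config
+from mmengine.runner import Runner
+
+cfg = Config.fromfile(
+    'configs/body_3d_keypoint/pose_lift/h36m/'
+    'pose-lift_motionbert-243frm_8xb32-120e_h36m.py')
+# checkpoint converted from the official repo (see the table above)
+cfg.load_from = ('https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/'
+                 'pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth')
+cfg.work_dir = 'work_dirs/motionbert_h36m_eval'  # arbitrary output directory
+
+runner = Runner.from_cfg(cfg)
+runner.test()  # reports MPJPE and P-MPJPE from the configured evaluators
+```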
diff --git a/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml
new file mode 100644
index 0000000000..7257fea5a6
--- /dev/null
+++ b/configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml
@@ -0,0 +1,34 @@
+Collections:
+- Name: MotionBERT
+ Paper:
+ Title: "Learning Human Motion Representations: A Unified Perspective"
+ URL: https://arxiv.org/abs/2210.06551
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/en/papers/algorithms/motionbert.md
+Models:
+- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py
+ In Collection: MotionBERT
+ Metadata:
+ Architecture: &id001
+ - MotionBERT
+ Training Data: Human3.6M
+ Name: vid_pl_motionbert_8xb32-120e_h36m
+ Results:
+ - Dataset: Human3.6M
+ Metrics:
+ MPJPE: 35.3
+ P-MPJPE: 27.7
+ Task: Body 3D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth
+- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py
+ In Collection: MotionBERT
+ Metadata:
+ Architecture: *id001
+ Training Data: Human3.6M
+ Name: vid_pl_motionbert-finetuned_8xb32-120e_h36m
+ Results:
+ - Dataset: Human3.6M
+ Metrics:
+ MPJPE: 27.5
+ P-MPJPE: 21.6
+ Task: Body 3D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth
diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py
new file mode 100644
index 0000000000..88f6c3897d
--- /dev/null
+++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py
@@ -0,0 +1,140 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+vis_backends = [
+ dict(type='LocalVisBackend'),
+]
+visualizer = dict(
+ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# runtime
+train_cfg = dict(max_epochs=120, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(
+ optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01))
+
+# learning policy
+param_scheduler = [
+ dict(type='ExponentialLR', gamma=0.99, end=120, by_epoch=True)
+]
+
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ type='CheckpointHook',
+ save_best='MPJPE',
+ rule='less',
+ max_keep_ckpts=1),
+ logger=dict(type='LoggerHook', interval=20),
+)
+
+# codec settings
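+# Both MotionBERTLabel codecs concatenate keypoint visibility to the 2D
+# inputs (concat_vis=True) and treat the root keypoint as the coordinate
+# origin (rootrel=True); the training codec additionally skips scaling the
+# 3D labels by the projection factor (factor_label=False).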
+train_codec = dict(
+ type='MotionBERTLabel',
+ num_keypoints=17,
+ concat_vis=True,
+ rootrel=True,
+ factor_label=False)
+val_codec = dict(
+ type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True)
+
+# model settings
+model = dict(
+ type='PoseLifter',
+ backbone=dict(
+ type='DSTFormer',
+ in_channels=3,
+ feat_size=512,
+ depth=5,
+ num_heads=8,
+ mlp_ratio=2,
+ seq_len=243,
+ att_fuse=True,
+ ),
+ head=dict(
+ type='MotionRegressionHead',
+ in_channels=512,
+ out_channels=3,
+ embedding_size=512,
+ loss=dict(type='MPJPEVelocityJointLoss'),
+ decoder=val_codec,
+ ),
+)
+
+# base dataset settings
+dataset_type = 'Human36mDataset'
+data_root = 'data/h36m/'
+
+# pipelines
+train_pipeline = [
+ dict(
+ type='RandomFlipAroundRoot',
+ keypoints_flip_cfg={},
+ target_flip_cfg={},
+ flip_image=True),
+ dict(type='GenerateTarget', encoder=train_codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'factor', 'camera_param'))
+]
+val_pipeline = [
+ dict(type='GenerateTarget', encoder=val_codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'factor', 'camera_param'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ prefetch_factor=4,
+ pin_memory=True,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_train.npz',
+ seq_len=1,
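+        # every 243 consecutive frames form one training sample; a new
+        # sample window starts every 81 frames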
+ multiple_target=243,
+ multiple_target_step=81,
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+
+val_dataloader = dict(
+ batch_size=32,
+ prefetch_factor=4,
+ pin_memory=True,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_test.npz',
+ seq_len=1,
+ seq_step=1,
+ multiple_target=243,
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+skip_list = [
+ 'S9_Greet', 'S9_SittingDown', 'S9_Wait_1', 'S9_Greeting', 'S9_Waiting_1'
+]
+val_evaluator = [
+ dict(type='MPJPE', mode='mpjpe', skip_list=skip_list),
+ dict(type='MPJPE', mode='p-mpjpe', skip_list=skip_list)
+]
+test_evaluator = val_evaluator
diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-160e_h36m.py
similarity index 98%
rename from configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py
rename to configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-160e_h36m.py
index 0cbf89142d..c1190fe83e 100644
--- a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py
+++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-160e_h36m.py
@@ -7,7 +7,7 @@
type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
# runtime
-train_cfg = dict(max_epochs=80, val_interval=10)
+train_cfg = dict(max_epochs=160, val_interval=10)
# optimizer
optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-4))
diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-160e_h36m.py
similarity index 98%
rename from configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py
rename to configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-160e_h36m.py
index 0f311ac5cf..0d241c498f 100644
--- a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py
+++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-160e_h36m.py
@@ -7,7 +7,7 @@
type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
# runtime
-train_cfg = dict(max_epochs=80, val_interval=10)
+train_cfg = dict(max_epochs=160, val_interval=10)
# optimizer
optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3))
diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-160e_h36m.py
similarity index 98%
rename from configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py
rename to configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-160e_h36m.py
index 2589b493a6..803f907b7b 100644
--- a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py
+++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-160e_h36m.py
@@ -7,7 +7,7 @@
type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
# runtime
-train_cfg = dict(max_epochs=80, val_interval=10)
+train_cfg = dict(max_epochs=160, val_interval=10)
# optimizer
optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3))
diff --git a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-160e_h36m.py
similarity index 98%
rename from configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py
rename to configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-160e_h36m.py
index f2c27e423d..4b370fe76e 100644
--- a/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py
+++ b/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-160e_h36m.py
@@ -7,7 +7,7 @@
type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
# runtime
-train_cfg = dict(max_epochs=80, val_interval=10)
+train_cfg = dict(max_epochs=160, val_interval=10)
# optimizer
optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3))
diff --git a/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.md b/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.md
index f1c75d786a..48502c7b09 100644
--- a/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.md
+++ b/configs/body_3d_keypoint/pose_lift/h36m/videopose3d_h36m.md
@@ -41,27 +41,27 @@ Testing results on Human3.6M dataset with ground truth 2D detections, supervised
| Arch | Receptive Field | MPJPE | P-MPJPE | ckpt | log |
| :--------------------------------------------------------- | :-------------: | :---: | :-----: | :--------------------------------------------------------: | :-------------------------------------------------------: |
-| [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-80e_h36m.py) | 27 | 40.1 | 30.1 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) |
-| [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-80e_h36m.py) | 81 | 39.1 | 29.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) |
-| [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-80e_h36m.py) | 243 | | | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) |
+| [VideoPose3D-supervised-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-supv_8xb128-160e_h36m.py) | 27 | 40.1 | 30.1 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) |
+| [VideoPose3D-supervised-81frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-81frm-supv_8xb128-160e_h36m.py) | 81 | 39.1 | 29.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) |
+| [VideoPose3D-supervised-243frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv_8xb128-160e_h36m.py) | 243 | 37.6 | 28.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) |
Testing results on Human3.6M dataset with CPN 2D detections1, supervised training
| Arch | Receptive Field | MPJPE | P-MPJPE | ckpt | log |
| :--------------------------------------------------------- | :-------------: | :---: | :-----: | :--------------------------------------------------------: | :-------------------------------------------------------: |
-| [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-80e_h36m.py) | 1 | 53.0 | 41.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) |
-| [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 243 | | | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) |
+| [VideoPose3D-supervised-CPN-1frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-1frm-supv-cpn-ft_8xb128-160e_h36m.py) | 1 | 53.0 | 41.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) |
+| [VideoPose3D-supervised-CPN-243frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 243 | 47.9 | 38.0 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) |
Testing results on Human3.6M dataset with ground truth 2D detections, semi-supervised training
| Training Data | Arch | Receptive Field | MPJPE | P-MPJPE | N-MPJPE | ckpt | log |
| :------------ | :-------------------------------------------------: | :-------------: | :---: | :-----: | :-----: | :-------------------------------------------------: | :-------------------------------------------------: |
-| 10% S1 | [VideoPose3D](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv_8xb64-200e_h36m.py) | 27 | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) |
+| 10% S1 | [VideoPose3D-semi-supervised-27frm](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_videopose3d-27frm-semi-supv_8xb64-200e_h36m.py) | 27 | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) |
Testing results on Human3.6M dataset with CPN 2D detections1, semi-supervised training
-| Training Data | Arch | Receptive Field | MPJPE | P-MPJPE | N-MPJPE | ckpt | log |
-| :------------ | :----------------------------: | :-------------: | :---: | :-----: | :-----: | :------------------------------------------------------------: | :-----------------------------------------------------------: |
-| 10% S1 | [VideoPose3D](/configs/xxx.py) | 27 | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) |
+| Training Data | Arch | Receptive Field | MPJPE | P-MPJPE | N-MPJPE | ckpt | log |
+| :------------ | :-------------------------------------------------: | :-------------: | :---: | :-----: | :-----: | :-------------------------------------------------: | :-------------------------------------------------: |
+| 10% S1 | [VideoPose3D-semi-supervised-CPN-27frm](/configs/xxx.py) | 27 | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) |
1 CPN 2D detections are provided by [official repo](https://github.com/facebookresearch/VideoPose3D/blob/master/DATASETS.md). The reformatted version used in this repository can be downloaded from [train_detection](https://download.openmmlab.com/mmpose/body3d/videopose/cpn_ft_h36m_dbb_train.npy) and [test_detection](https://download.openmmlab.com/mmpose/body3d/videopose/cpn_ft_h36m_dbb_test.npy).
diff --git a/mmpose/apis/inference_3d.py b/mmpose/apis/inference_3d.py
index d5bb753945..d4b9623b86 100644
--- a/mmpose/apis/inference_3d.py
+++ b/mmpose/apis/inference_3d.py
@@ -316,8 +316,10 @@ def inference_pose_lifter_model(model,
T,
K,
), dtype=np.float32)
- data_info['lifting_target'] = np.zeros((K, 3), dtype=np.float32)
- data_info['lifting_target_visible'] = np.ones((K, 1), dtype=np.float32)
+ data_info['lifting_target'] = np.zeros((1, K, 3), dtype=np.float32)
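+    # placeholder per-frame projection factors, required by codecs such as
+    # ``MotionBERTLabel``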
+ data_info['factor'] = np.zeros((T, ), dtype=np.float32)
+ data_info['lifting_target_visible'] = np.ones((1, K, 1),
+ dtype=np.float32)
if image_size is not None:
assert len(image_size) == 2
diff --git a/mmpose/apis/inferencers/pose3d_inferencer.py b/mmpose/apis/inferencers/pose3d_inferencer.py
index 0fe66ac72b..819273af66 100644
--- a/mmpose/apis/inferencers/pose3d_inferencer.py
+++ b/mmpose/apis/inferencers/pose3d_inferencer.py
@@ -271,8 +271,8 @@ def preprocess_single(self,
K,
),
dtype=np.float32)
- data_info['lifting_target'] = np.zeros((K, 3), dtype=np.float32)
- data_info['lifting_target_visible'] = np.ones((K, 1),
+ data_info['lifting_target'] = np.zeros((1, K, 3), dtype=np.float32)
+ data_info['lifting_target_visible'] = np.ones((1, K, 1),
dtype=np.float32)
data_info['camera_param'] = dict(w=width, h=height)
diff --git a/mmpose/codecs/__init__.py b/mmpose/codecs/__init__.py
index cdbd8feb0c..1a48b7f851 100644
--- a/mmpose/codecs/__init__.py
+++ b/mmpose/codecs/__init__.py
@@ -4,6 +4,7 @@
from .image_pose_lifting import ImagePoseLifting
from .integral_regression_label import IntegralRegressionLabel
from .megvii_heatmap import MegviiHeatmap
+from .motionbert_label import MotionBERTLabel
from .msra_heatmap import MSRAHeatmap
from .regression_label import RegressionLabel
from .simcc_label import SimCCLabel
@@ -14,5 +15,6 @@
__all__ = [
'MSRAHeatmap', 'MegviiHeatmap', 'UDPHeatmap', 'RegressionLabel',
'SimCCLabel', 'IntegralRegressionLabel', 'AssociativeEmbedding', 'SPR',
- 'DecoupledHeatmap', 'VideoPoseLifting', 'ImagePoseLifting'
+ 'DecoupledHeatmap', 'VideoPoseLifting', 'ImagePoseLifting',
+ 'MotionBERTLabel'
]
diff --git a/mmpose/codecs/image_pose_lifting.py b/mmpose/codecs/image_pose_lifting.py
index 64bf925997..aae6c3b5be 100644
--- a/mmpose/codecs/image_pose_lifting.py
+++ b/mmpose/codecs/image_pose_lifting.py
@@ -25,6 +25,10 @@ class ImagePoseLifting(BaseKeypointCodec):
Default: ``False``.
save_index (bool): If true, store the root position separated from the
original pose. Default: ``False``.
+ reshape_keypoints (bool): If true, reshape the keypoints into shape
+ (-1, N). Default: ``True``.
+ concat_vis (bool): If true, concat the visibility item of keypoints.
+ Default: ``False``.
keypoints_mean (np.ndarray, optional): Mean values of keypoints
coordinates in shape (K, D).
keypoints_std (np.ndarray, optional): Std values of keypoints
@@ -42,6 +46,8 @@ def __init__(self,
root_index: int,
remove_root: bool = False,
save_index: bool = False,
+ reshape_keypoints: bool = True,
+ concat_vis: bool = False,
keypoints_mean: Optional[np.ndarray] = None,
keypoints_std: Optional[np.ndarray] = None,
target_mean: Optional[np.ndarray] = None,
@@ -52,9 +58,23 @@ def __init__(self,
self.root_index = root_index
self.remove_root = remove_root
self.save_index = save_index
- if keypoints_mean is not None and keypoints_std is not None:
+ self.reshape_keypoints = reshape_keypoints
+ self.concat_vis = concat_vis
+ if keypoints_mean is not None:
+ keypoints_mean = np.array(
+ keypoints_mean,
+ dtype=np.float32).reshape(1, num_keypoints, -1)
+ keypoints_std = np.array(
+ keypoints_std, dtype=np.float32).reshape(1, num_keypoints, -1)
+ assert keypoints_std is not None
assert keypoints_mean.shape == keypoints_std.shape
- if target_mean is not None and target_std is not None:
+ if target_mean is not None:
+ target_dim = num_keypoints - 1 if remove_root else num_keypoints
+ target_mean = np.array(
+ target_mean, dtype=np.float32).reshape(1, target_dim, -1)
+ target_std = np.array(
+ target_std, dtype=np.float32).reshape(1, target_dim, -1)
+ assert target_std is not None
assert target_mean.shape == target_std.shape
self.keypoints_mean = keypoints_mean
self.keypoints_std = keypoints_std
@@ -73,15 +93,17 @@ def encode(self,
keypoints_visible (np.ndarray, optional): Keypoint visibilities in
shape (N, K).
lifting_target (np.ndarray, optional): 3d target coordinate in
- shape (K, C).
+ shape (T, K, C).
lifting_target_visible (np.ndarray, optional): Target coordinate in
- shape (K, ).
+ shape (T, K, ).
Returns:
encoded (dict): Contains the following items:
- keypoint_labels (np.ndarray): The processed keypoints in
- shape (K * D, N) where D is 2 for 2d coordinates.
+ shape like (N, K, D) or (K * D, N).
+ - keypoint_labels_visible (np.ndarray): The processed
+ keypoints' weights in shape (N, K, ) or (N-1, K, ).
- lifting_target_label: The processed target coordinate in
shape (K, C) or (K-1, C).
- lifting_target_weights (np.ndarray): The target weights in
@@ -93,18 +115,20 @@ def encode(self,
In addition, there are some optional items it may contain:
+ - target_root (np.ndarray): The root coordinate of target in
+ shape (C, ). Exists if ``zero_center`` is ``True``.
- target_root_removed (bool): Indicate whether the root of
- pose lifting target is removed. Added if ``self.remove_root``
- is ``True``.
+              pose-lifting target is removed. Exists if
+ ``remove_root`` is ``True``.
- target_root_index (int): An integer indicating the index of
- root. Added if ``self.remove_root`` and ``self.save_index``
+ root. Exists if ``remove_root`` and ``save_index``
are ``True``.
"""
if keypoints_visible is None:
keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)
if lifting_target is None:
- lifting_target = keypoints[0]
+ lifting_target = [keypoints[0]]
# set initial value for `lifting_target_weights`
# and `trajectory_weights`
@@ -126,13 +150,16 @@ def encode(self,
f'Got invalid joint shape {lifting_target.shape}'
root = lifting_target[..., self.root_index, :]
- lifting_target_label = lifting_target - root
+ lifting_target_label = lifting_target - lifting_target[
+ ..., self.root_index:self.root_index + 1, :]
if self.remove_root:
lifting_target_label = np.delete(
lifting_target_label, self.root_index, axis=-2)
- assert lifting_target_weights.ndim in {1, 2}
- axis_to_remove = -2 if lifting_target_weights.ndim == 2 else -1
+ lifting_target_visible = np.delete(
+ lifting_target_visible, self.root_index, axis=-2)
+ assert lifting_target_weights.ndim in {2, 3}
+ axis_to_remove = -2 if lifting_target_weights.ndim == 3 else -1
lifting_target_weights = np.delete(
lifting_target_weights, self.root_index, axis=axis_to_remove)
# Add a flag to avoid latter transforms that rely on the root
@@ -145,15 +172,17 @@ def encode(self,
# Normalize the 2D keypoint coordinate with mean and std
keypoint_labels = keypoints.copy()
- if self.keypoints_mean is not None and self.keypoints_std is not None:
- keypoints_shape = keypoints.shape
- assert self.keypoints_mean.shape == keypoints_shape[1:]
+ if self.keypoints_mean is not None:
+ assert self.keypoints_mean.shape[1:] == keypoints.shape[1:]
+ encoded['keypoints_mean'] = self.keypoints_mean.copy()
+ encoded['keypoints_std'] = self.keypoints_std.copy()
keypoint_labels = (keypoint_labels -
self.keypoints_mean) / self.keypoints_std
- if self.target_mean is not None and self.target_std is not None:
- target_shape = lifting_target_label.shape
- assert self.target_mean.shape == target_shape
+ if self.target_mean is not None:
+ assert self.target_mean.shape == lifting_target_label.shape
+ encoded['target_mean'] = self.target_mean.copy()
+ encoded['target_std'] = self.target_std.copy()
lifting_target_label = (lifting_target_label -
self.target_mean) / self.target_std
@@ -163,7 +192,19 @@ def encode(self,
if keypoint_labels.ndim == 2:
keypoint_labels = keypoint_labels[None, ...]
+ if self.concat_vis:
+ keypoints_visible_ = keypoints_visible
+ if keypoints_visible.ndim == 2:
+ keypoints_visible_ = keypoints_visible[..., None]
+ keypoint_labels = np.concatenate(
+ (keypoint_labels, keypoints_visible_), axis=2)
+
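+        # optionally flatten the (N, K, D) labels into the (K * D, N) layout
+        # expected by lifters that take per-frame feature vectors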
+ if self.reshape_keypoints:
+ N = keypoint_labels.shape[0]
+ keypoint_labels = keypoint_labels.transpose(1, 2, 0).reshape(-1, N)
+
encoded['keypoint_labels'] = keypoint_labels
+ encoded['keypoint_labels_visible'] = keypoints_visible
encoded['lifting_target_label'] = lifting_target_label
encoded['lifting_target_weights'] = lifting_target_weights
encoded['trajectory_weights'] = trajectory_weights
@@ -190,11 +231,11 @@ def decode(self,
keypoints = encoded.copy()
if self.target_mean is not None and self.target_std is not None:
- assert self.target_mean.shape == keypoints.shape[1:]
+ assert self.target_mean.shape == keypoints.shape
keypoints = keypoints * self.target_std + self.target_mean
- if target_root.size > 0:
- keypoints = keypoints + np.expand_dims(target_root, axis=0)
+ if target_root is not None and target_root.size > 0:
+ keypoints = keypoints + target_root
if self.remove_root:
keypoints = np.insert(
keypoints, self.root_index, target_root, axis=1)
diff --git a/mmpose/codecs/motionbert_label.py b/mmpose/codecs/motionbert_label.py
new file mode 100644
index 0000000000..d0c8cd0d40
--- /dev/null
+++ b/mmpose/codecs/motionbert_label.py
@@ -0,0 +1,218 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from copy import deepcopy
+from typing import Optional, Tuple
+
+import numpy as np
+
+from mmpose.registry import KEYPOINT_CODECS
+from .base import BaseKeypointCodec
+from .utils import camera_to_image_coord
+
+
+@KEYPOINT_CODECS.register_module()
+class MotionBERTLabel(BaseKeypointCodec):
+ r"""Generate keypoint and label coordinates for `MotionBERT`_ by Zhu et al
+ (2022).
+
+ Note:
+
+ - instance number: N
+ - keypoint number: K
+ - keypoint dimension: D
+    - pose-lifting target dimension: C
+
+ Args:
+ num_keypoints (int): The number of keypoints in the dataset.
+ root_index (int): Root keypoint index in the pose. Default: 0.
+ remove_root (bool): If true, remove the root keypoint from the pose.
+ Default: ``False``.
+ save_index (bool): If true, store the root position separated from the
+ original pose, only takes effect if ``remove_root`` is ``True``.
+ Default: ``False``.
+ concat_vis (bool): If true, concat the visibility item of keypoints.
+ Default: ``False``.
+ rootrel (bool): If true, the root keypoint will be set to the
+ coordinate origin. Default: ``False``.
+ factor_label (bool): If true, the label will be multiplied by a factor.
+ Default: ``True``.
+ """
+
+ auxiliary_encode_keys = {
+ 'lifting_target', 'lifting_target_visible', 'camera_param', 'factor'
+ }
+
+ def __init__(self,
+ num_keypoints: int,
+ root_index: int = 0,
+ remove_root: bool = False,
+ save_index: bool = False,
+ concat_vis: bool = False,
+ rootrel: bool = False,
+ factor_label: bool = True):
+ super().__init__()
+
+ self.num_keypoints = num_keypoints
+ self.root_index = root_index
+ self.remove_root = remove_root
+ self.save_index = save_index
+ self.concat_vis = concat_vis
+ self.rootrel = rootrel
+ self.factor_label = factor_label
+
+ def encode(self,
+ keypoints: np.ndarray,
+ keypoints_visible: Optional[np.ndarray] = None,
+ lifting_target: Optional[np.ndarray] = None,
+ lifting_target_visible: Optional[np.ndarray] = None,
+ camera_param: Optional[dict] = None,
+ factor: Optional[np.ndarray] = None) -> dict:
+ """Encoding keypoints from input image space to normalized space.
+
+ Args:
+ keypoints (np.ndarray): Keypoint coordinates in shape (B, T, K, D).
+ keypoints_visible (np.ndarray, optional): Keypoint visibilities in
+ shape (B, T, K).
+ lifting_target (np.ndarray, optional): 3d target coordinate in
+ shape (T, K, C).
+ lifting_target_visible (np.ndarray, optional): Target coordinate in
+ shape (T, K, ).
+ camera_param (dict, optional): The camera parameter dictionary.
+ factor (np.ndarray, optional): The factor mapping camera and image
+ coordinate in shape (T, ).
+
+ Returns:
+ encoded (dict): Contains the following items:
+
+ - keypoint_labels (np.ndarray): The processed keypoints in
+ shape like (N, K, D).
+ - keypoint_labels_visible (np.ndarray): The processed
+ keypoints' weights in shape (N, K, ) or (N, K-1, ).
+ - lifting_target_label: The processed target coordinate in
+ shape (K, C) or (K-1, C).
+ - lifting_target_weights (np.ndarray): The target weights in
+ shape (K, ) or (K-1, ).
+ - trajectory_weights (np.ndarray): The trajectory weights in
+ shape (K, ).
+ - factor (np.ndarray): The factor mapping camera and image
+ coordinate in shape (T, 1).
+ """
+ if keypoints_visible is None:
+ keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)
+
+ if lifting_target is None:
+ lifting_target = [keypoints[..., 0, :, :]]
+
+ # set initial value for `lifting_target_weights`
+ # and `trajectory_weights`
+ if lifting_target_visible is None:
+ lifting_target_visible = np.ones(
+ lifting_target.shape[:-1], dtype=np.float32)
+ lifting_target_weights = lifting_target_visible
+ trajectory_weights = (1 / lifting_target[:, 2])
+ else:
+ valid = lifting_target_visible > 0.5
+ lifting_target_weights = np.where(valid, 1., 0.).astype(np.float32)
+ trajectory_weights = lifting_target_weights
+
+ if camera_param is None:
+ camera_param = dict()
+
+ encoded = dict()
+
+ lifting_target_label = lifting_target.copy()
+ keypoint_labels = keypoints.copy()
+
+ assert keypoint_labels.ndim in {2, 3}
+ if keypoint_labels.ndim == 2:
+ keypoint_labels = keypoint_labels[None, ...]
+
+ # Normalize the 2D keypoint coordinate with image width and height
+ _camera_param = deepcopy(camera_param)
+ assert 'w' in _camera_param and 'h' in _camera_param
+ w, h = _camera_param['w'], _camera_param['h']
+ keypoint_labels[
+ ..., :2] = keypoint_labels[..., :2] / w * 2 - [1, h / w]
+
+ # convert target to image coordinate
+ T = keypoint_labels.shape[0]
+ factor_ = np.array([4] * T, dtype=np.float32).reshape(T, )
+ if 'f' in _camera_param and 'c' in _camera_param:
+ lifting_target_label, factor_ = camera_to_image_coord(
+ self.root_index, lifting_target_label, _camera_param)
+ lifting_target_label[..., :, :] = lifting_target_label[
+ ..., :, :] - lifting_target_label[...,
+ self.root_index:self.root_index +
+ 1, :]
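+        # fall back to the factor computed above (from the camera intrinsics
+        # when available, otherwise the default value) if no precomputed
+        # factor is provided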
+ if factor is None or factor[0] == 0:
+ factor = factor_
+ if factor.ndim == 1:
+ factor = factor[:, None]
+ if self.factor_label:
+ lifting_target_label *= factor[..., None]
+
+ if self.concat_vis:
+ keypoints_visible_ = keypoints_visible
+ if keypoints_visible.ndim == 2:
+ keypoints_visible_ = keypoints_visible[..., None]
+ keypoint_labels = np.concatenate(
+ (keypoint_labels, keypoints_visible_), axis=2)
+
+ encoded['keypoint_labels'] = keypoint_labels
+ encoded['keypoint_labels_visible'] = keypoints_visible
+ encoded['lifting_target_label'] = lifting_target_label
+ encoded['lifting_target_weights'] = lifting_target_weights
+ encoded['lifting_target'] = lifting_target_label
+ encoded['lifting_target_visible'] = lifting_target_visible
+ encoded['trajectory_weights'] = trajectory_weights
+ encoded['factor'] = factor
+
+ return encoded
+
+ def decode(
+ self,
+ encoded: np.ndarray,
+ w: Optional[np.ndarray] = None,
+ h: Optional[np.ndarray] = None,
+ factor: Optional[np.ndarray] = None,
+ ) -> Tuple[np.ndarray, np.ndarray]:
+ """Decode keypoint coordinates from normalized space to input image
+ space.
+
+ Args:
+ encoded (np.ndarray): Coordinates in shape (N, K, C).
+ w (np.ndarray, optional): The image widths in shape (N, ).
+ Default: ``None``.
+ h (np.ndarray, optional): The image heights in shape (N, ).
+ Default: ``None``.
+ factor (np.ndarray, optional): The factor for projection in shape
+ (N, ). Default: ``None``.
+
+ Returns:
+ keypoints (np.ndarray): Decoded coordinates in shape (N, K, C).
+ scores (np.ndarray): The keypoint scores in shape (N, K).
+ """
+ keypoints = encoded.copy()
+ scores = np.ones(keypoints.shape[:-1], dtype=np.float32)
+
+ if self.rootrel:
+ keypoints[..., 0, :] = 0
+
+ if w is not None and w.size > 0:
+ assert w.shape == h.shape
+ assert w.shape[0] == keypoints.shape[0]
+ assert w.ndim in {1, 2}
+ if w.ndim == 1:
+ w = w[:, None]
+ h = h[:, None]
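+            # map normalized coordinates (x in [-1, 1], y in [-h/w, h/w])
+            # back to pixel space, inverting the encode-time normalization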
+ trans = np.append(
+ np.ones((w.shape[0], 1)), h / w, axis=1)[:, None, :]
+ keypoints[..., :2] = (keypoints[..., :2] + trans) * w[:, None] / 2
+ keypoints[..., 2:] = keypoints[..., 2:] * w[:, None] / 2
+ if factor is not None and factor.size > 0:
+ assert factor.shape[0] == keypoints.shape[0]
+ keypoints *= factor[..., None]
+ keypoints[..., :, :] = keypoints[..., :, :] - keypoints[
+ ..., self.root_index:self.root_index + 1, :]
+ keypoints /= 1000.
+ return keypoints, scores
diff --git a/mmpose/codecs/utils/__init__.py b/mmpose/codecs/utils/__init__.py
index eaa093f12b..38bbae5c39 100644
--- a/mmpose/codecs/utils/__init__.py
+++ b/mmpose/codecs/utils/__init__.py
@@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
+from .camera_image_projection import camera_to_image_coord, camera_to_pixel
from .gaussian_heatmap import (generate_gaussian_heatmaps,
generate_udp_gaussian_heatmaps,
generate_unbiased_gaussian_heatmaps)
@@ -19,5 +20,6 @@
'batch_heatmap_nms', 'refine_keypoints', 'refine_keypoints_dark',
'refine_keypoints_dark_udp', 'generate_displacement_heatmap',
'refine_simcc_dark', 'gaussian_blur1d', 'get_diagonal_lengths',
- 'get_instance_root', 'get_instance_bbox', 'get_simcc_normalized'
+ 'get_instance_root', 'get_instance_bbox', 'get_simcc_normalized',
+ 'camera_to_image_coord', 'camera_to_pixel'
]
diff --git a/mmpose/codecs/utils/camera_image_projection.py b/mmpose/codecs/utils/camera_image_projection.py
new file mode 100644
index 0000000000..5ed4d14109
--- /dev/null
+++ b/mmpose/codecs/utils/camera_image_projection.py
@@ -0,0 +1,69 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Tuple
+
+import numpy as np
+
+
+def camera_to_image_coord(root_index: int, kpts_3d_cam: np.ndarray,
+ camera_param: Dict) -> Tuple[np.ndarray, np.ndarray]:
+ """Project keypoints from camera space to image space and calculate factor.
+
+ Args:
+ root_index (int): Index for root keypoint.
+ kpts_3d_cam (np.ndarray): Keypoint coordinates in camera space in
+ shape (N, K, D).
+ camera_param (dict): Parameters for the camera.
+
+ Returns:
+ tuple:
+ - kpts_3d_image (np.ndarray): Keypoint coordinates in image space in
+ shape (N, K, D).
+ - factor (np.ndarray): The scaling factor that maps keypoints from
+ image space to camera space in shape (N, ).
+ """
+
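+    # Project a virtual square of side ``rectangle_3d_size`` centered at the
+    # root keypoint into the image; its projected width gives the
+    # pixels-per-unit ratio used to scale relative depths, while its inverse
+    # is returned as ``factor``.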
+ root = kpts_3d_cam[..., root_index, :]
+ tl_kpt = root.copy()
+ tl_kpt[..., :2] -= 1.0
+ br_kpt = root.copy()
+ br_kpt[..., :2] += 1.0
+ tl_kpt = np.reshape(tl_kpt, (-1, 3))
+ br_kpt = np.reshape(br_kpt, (-1, 3))
+ fx, fy = camera_param['f'] / 1000.
+ cx, cy = camera_param['c'] / 1000.
+
+ tl2d = camera_to_pixel(tl_kpt, fx, fy, cx, cy)
+ br2d = camera_to_pixel(br_kpt, fx, fy, cx, cy)
+
+ rectangle_3d_size = 2.0
+ kpts_3d_image = np.zeros_like(kpts_3d_cam)
+ kpts_3d_image[..., :2] = camera_to_pixel(kpts_3d_cam.copy(), fx, fy, cx,
+ cy)
+ ratio = (br2d[..., 0] - tl2d[..., 0] + 0.001) / rectangle_3d_size
+ factor = rectangle_3d_size / (br2d[..., 0] - tl2d[..., 0] + 0.001)
+ kpts_3d_depth = ratio[:, None] * (
+ kpts_3d_cam[..., 2] - kpts_3d_cam[..., root_index:root_index + 1, 2])
+ kpts_3d_image[..., 2] = kpts_3d_depth
+ return kpts_3d_image, factor
+
+
+def camera_to_pixel(kpts_3d: np.ndarray, fx: float, fy: float, cx: float,
+ cy: float) -> np.ndarray:
+ """Project keypoints from camera space to image space.
+
+ Args:
+ kpts_3d (np.ndarray): Keypoint coordinates in camera space.
+ fx (float): x-coordinate of camera's focal length.
+ fy (float): y-coordinate of camera's focal length.
+ cx (float): x-coordinate of image center.
+ cy (float): y-coordinate of image center.
+
+ Returns:
+ pose_2d (np.ndarray): Projected keypoint coordinates in image space.
+ """
+ pose_2d = kpts_3d[..., :2] / kpts_3d[..., 2:3]
+ pose_2d[..., 0] *= fx
+ pose_2d[..., 1] *= fy
+ pose_2d[..., 0] += cx
+ pose_2d[..., 1] += cy
+ return pose_2d
diff --git a/mmpose/codecs/video_pose_lifting.py b/mmpose/codecs/video_pose_lifting.py
index 56cf35fa2d..9e409a663c 100644
--- a/mmpose/codecs/video_pose_lifting.py
+++ b/mmpose/codecs/video_pose_lifting.py
@@ -30,6 +30,10 @@ class VideoPoseLifting(BaseKeypointCodec):
save_index (bool): If true, store the root position separated from the
original pose, only takes effect if ``remove_root`` is ``True``.
Default: ``False``.
+ reshape_keypoints (bool): If true, reshape the keypoints into shape
+ (-1, N). Default: ``True``.
+ concat_vis (bool): If true, concat the visibility item of keypoints.
+ Default: ``False``.
normalize_camera (bool): Whether to normalize camera intrinsics.
Default: ``False``.
"""
@@ -44,6 +48,8 @@ def __init__(self,
root_index: int = 0,
remove_root: bool = False,
save_index: bool = False,
+ reshape_keypoints: bool = True,
+ concat_vis: bool = False,
normalize_camera: bool = False):
super().__init__()
@@ -52,6 +58,8 @@ def __init__(self,
self.root_index = root_index
self.remove_root = remove_root
self.save_index = save_index
+ self.reshape_keypoints = reshape_keypoints
+ self.concat_vis = concat_vis
self.normalize_camera = normalize_camera
def encode(self,
@@ -67,16 +75,18 @@ def encode(self,
keypoints_visible (np.ndarray, optional): Keypoint visibilities in
shape (N, K).
lifting_target (np.ndarray, optional): 3d target coordinate in
- shape (K, C).
+ shape (T, K, C).
lifting_target_visible (np.ndarray, optional): Target coordinate in
- shape (K, ).
+ shape (T, K, ).
camera_param (dict, optional): The camera parameter dictionary.
Returns:
encoded (dict): Contains the following items:
- keypoint_labels (np.ndarray): The processed keypoints in
- shape (K * D, N) where D is 2 for 2d coordinates.
+ shape like (N, K, D) or (K * D, N).
+ - keypoint_labels_visible (np.ndarray): The processed
+ keypoints' weights in shape (N, K, ) or (N-1, K, ).
- lifting_target_label: The processed target coordinate in
shape (K, C) or (K-1, C).
- lifting_target_weights (np.ndarray): The target weights in
@@ -87,21 +97,21 @@ def encode(self,
In addition, there are some optional items it may contain:
- target_root (np.ndarray): The root coordinate of target in
- shape (C, ). Exists if ``self.zero_center`` is ``True``.
+ shape (C, ). Exists if ``zero_center`` is ``True``.
- target_root_removed (bool): Indicate whether the root of
pose-lifitng target is removed. Exists if
- ``self.remove_root`` is ``True``.
+ ``remove_root`` is ``True``.
- target_root_index (int): An integer indicating the index of
- root. Exists if ``self.remove_root`` and ``self.save_index``
+ root. Exists if ``remove_root`` and ``save_index``
are ``True``.
- camera_param (dict): The updated camera parameter dictionary.
- Exists if ``self.normalize_camera`` is ``True``.
+ Exists if ``normalize_camera`` is ``True``.
"""
if keypoints_visible is None:
keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)
if lifting_target is None:
- lifting_target = keypoints[0]
+ lifting_target = [keypoints[0]]
# set initial value for `lifting_target_weights`
# and `trajectory_weights`
@@ -128,14 +138,17 @@ def encode(self,
f'Got invalid joint shape {lifting_target.shape}'
root = lifting_target[..., self.root_index, :]
- lifting_target_label = lifting_target_label - root
+ lifting_target_label -= lifting_target_label[
+ ..., self.root_index:self.root_index + 1, :]
encoded['target_root'] = root
if self.remove_root:
lifting_target_label = np.delete(
lifting_target_label, self.root_index, axis=-2)
- assert lifting_target_weights.ndim in {1, 2}
- axis_to_remove = -2 if lifting_target_weights.ndim == 2 else -1
+ lifting_target_visible = np.delete(
+ lifting_target_visible, self.root_index, axis=-2)
+ assert lifting_target_weights.ndim in {2, 3}
+ axis_to_remove = -2 if lifting_target_weights.ndim == 3 else -1
lifting_target_weights = np.delete(
lifting_target_weights,
self.root_index,
@@ -167,7 +180,19 @@ def encode(self,
_camera_param['c'] = (_camera_param['c'] - center[:, None]) / scale
encoded['camera_param'] = _camera_param
+ if self.concat_vis:
+ keypoints_visible_ = keypoints_visible
+ if keypoints_visible.ndim == 2:
+ keypoints_visible_ = keypoints_visible[..., None]
+ keypoint_labels = np.concatenate(
+ (keypoint_labels, keypoints_visible_), axis=2)
+
+ if self.reshape_keypoints:
+ N = keypoint_labels.shape[0]
+ keypoint_labels = keypoint_labels.transpose(1, 2, 0).reshape(-1, N)
+
encoded['keypoint_labels'] = keypoint_labels
+ encoded['keypoints_visible'] = keypoints_visible
encoded['lifting_target_label'] = lifting_target_label
encoded['lifting_target_weights'] = lifting_target_weights
encoded['trajectory_weights'] = trajectory_weights
@@ -192,8 +217,8 @@ def decode(self,
"""
keypoints = encoded.copy()
- if target_root.size > 0:
- keypoints = keypoints + np.expand_dims(target_root, axis=0)
+ if target_root is not None and target_root.size > 0:
+ keypoints = keypoints + target_root
if self.remove_root:
keypoints = np.insert(
keypoints, self.root_index, target_root, axis=1)
diff --git a/mmpose/datasets/datasets/base/base_mocap_dataset.py b/mmpose/datasets/datasets/base/base_mocap_dataset.py
index d671a6ae94..e08ba6ea45 100644
--- a/mmpose/datasets/datasets/base/base_mocap_dataset.py
+++ b/mmpose/datasets/datasets/base/base_mocap_dataset.py
@@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
+import itertools
import os.path as osp
from copy import deepcopy
from itertools import filterfalse, groupby
@@ -21,6 +22,8 @@ class BaseMocapDataset(BaseDataset):
Args:
ann_file (str): Annotation file path. Default: ''.
seq_len (int): Number of frames in a sequence. Default: 1.
+ multiple_target (int): If larger than 0, merge every
+ ``multiple_target`` sequence together. Default: 0.
causal (bool): If set to ``True``, the rightmost input frame will be
the target frame. Otherwise, the middle input frame will be the
target frame. Default: ``True``.
@@ -63,6 +66,7 @@ class BaseMocapDataset(BaseDataset):
def __init__(self,
ann_file: str = '',
seq_len: int = 1,
+ multiple_target: int = 0,
causal: bool = True,
subset_frac: float = 1.0,
camera_param_file: Optional[str] = None,
@@ -102,6 +106,10 @@ def __init__(self,
self.seq_len = seq_len
self.causal = causal
+ self.multiple_target = multiple_target
+ if self.multiple_target:
+ assert (self.seq_len == 1)
+
assert 0 < subset_frac <= 1, (
f'Unsupported `subset_frac` {subset_frac}. Supported range '
'is (0, 1].')
@@ -241,6 +249,17 @@ def get_sequence_indices(self) -> List[List[int]]:
sequence_indices = [[idx] for idx in range(num_imgs)]
else:
raise NotImplementedError('Multi-frame data sample unsupported!')
+
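+        # merge every ``multiple_target`` consecutive single-frame samples
+        # into one sequence, dropping any incomplete tail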
+ if self.multiple_target > 0:
+ sequence_indices_merged = []
+ for i in range(0, len(sequence_indices), self.multiple_target):
+ if i + self.multiple_target > len(sequence_indices):
+ break
+ sequence_indices_merged.append(
+ list(
+ itertools.chain.from_iterable(
+ sequence_indices[i:i + self.multiple_target])))
+ sequence_indices = sequence_indices_merged
return sequence_indices
def _load_annotations(self) -> Tuple[List[dict], List[dict]]:
@@ -274,7 +293,9 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]:
image_list = []
for idx, frame_ids in enumerate(self.sequence_indices):
- assert len(frame_ids) == self.seq_len
+            expected_len = (self.multiple_target
+                            if self.multiple_target else self.seq_len)
+            assert len(frame_ids) == expected_len, (
+                f'Expected a sequence of length {expected_len}, '
+                f'but got {len(frame_ids)}')
_img_names = img_names[frame_ids]
@@ -286,7 +307,9 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]:
keypoints_3d = _keypoints_3d[..., :3]
keypoints_3d_visible = _keypoints_3d[..., 3]
- target_idx = -1 if self.causal else int(self.seq_len) // 2
+ target_idx = [-1] if self.causal else [int(self.seq_len) // 2]
+ if self.multiple_target:
+ target_idx = list(range(self.multiple_target))
instance_info = {
'num_keypoints': num_keypoints,
diff --git a/mmpose/datasets/datasets/body3d/h36m_dataset.py b/mmpose/datasets/datasets/body3d/h36m_dataset.py
index 60094aa254..b7a4f71d65 100644
--- a/mmpose/datasets/datasets/body3d/h36m_dataset.py
+++ b/mmpose/datasets/datasets/body3d/h36m_dataset.py
@@ -45,6 +45,10 @@ class Human36mDataset(BaseMocapDataset):
seq_len (int): Number of frames in a sequence. Default: 1.
seq_step (int): The interval for extracting frames from the video.
Default: 1.
+        multiple_target (int): If larger than 0, merge every
+            ``multiple_target`` consecutive sequences into one sample.
+            Default: 0.
+        multiple_target_step (int): The interval (in frames) between the
+            starts of two consecutive merged sequences. Only valid when
+            ``multiple_target`` is larger than 0. Default: 0.
pad_video_seq (bool): Whether to pad the video so that poses will be
predicted for every frame in the video. Default: ``False``.
causal (bool): If set to ``True``, the rightmost input frame will be
@@ -65,6 +69,9 @@ class Human36mDataset(BaseMocapDataset):
If set, 2d keypoint loaded from this file will be used instead of
ground-truth keypoints. This setting is only when
``keypoint_2d_src`` is ``'detection'``. Default: ``None``.
+        factor_file (str, optional): The projection factors' file. If set,
+            factors loaded from this file will be used instead of the
+            calculated factors. Default: ``None``.
camera_param_file (str): Cameras' parameters file. Default: ``None``.
data_mode (str): Specifies the mode of data samples: ``'topdown'`` or
``'bottomup'``. In ``'topdown'`` mode, each data sample contains
@@ -104,11 +111,14 @@ def __init__(self,
ann_file: str = '',
seq_len: int = 1,
seq_step: int = 1,
+ multiple_target: int = 0,
+ multiple_target_step: int = 0,
pad_video_seq: bool = False,
causal: bool = True,
subset_frac: float = 1.0,
keypoint_2d_src: str = 'gt',
keypoint_2d_det_file: Optional[str] = None,
+ factor_file: Optional[str] = None,
camera_param_file: Optional[str] = None,
data_mode: str = 'topdown',
metainfo: Optional[dict] = None,
@@ -138,9 +148,20 @@ def __init__(self,
self.seq_step = seq_step
self.pad_video_seq = pad_video_seq
+ if factor_file:
+ if not is_abs(factor_file):
+ factor_file = osp.join(data_root, factor_file)
+            assert exists(factor_file), 'Factor file does not exist.'
+ self.factor_file = factor_file
+
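+        # Default to non-overlapping windows: consecutive merged sequences
+        # start ``multiple_target`` frames apart unless a step is given.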
+ if multiple_target > 0 and multiple_target_step == 0:
+ multiple_target_step = multiple_target
+ self.multiple_target_step = multiple_target_step
+
super().__init__(
ann_file=ann_file,
seq_len=seq_len,
+ multiple_target=multiple_target,
causal=causal,
subset_frac=subset_frac,
camera_param_file=camera_param_file,
@@ -171,41 +192,55 @@ def get_sequence_indices(self) -> List[List[int]]:
sequence_indices = []
_len = (self.seq_len - 1) * self.seq_step + 1
_step = self.seq_step
- for _, _indices in sorted(video_frames.items()):
- n_frame = len(_indices)
-
- if self.pad_video_seq:
- # Pad the sequence so that every frame in the sequence will be
- # predicted.
- if self.causal:
- frames_left = self.seq_len - 1
- frames_right = 0
- else:
- frames_left = (self.seq_len - 1) // 2
- frames_right = frames_left
- for i in range(n_frame):
- pad_left = max(0, frames_left - i // _step)
- pad_right = max(0,
- frames_right - (n_frame - 1 - i) // _step)
- start = max(i % _step, i - frames_left * _step)
- end = min(n_frame - (n_frame - 1 - i) % _step,
- i + frames_right * _step + 1)
- sequence_indices.append([_indices[0]] * pad_left +
- _indices[start:end:_step] +
- [_indices[-1]] * pad_right)
- else:
+
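+        # In multiple-target mode, cut windows of ``multiple_target`` frames
+        # every ``multiple_target_step`` frames from each video, keeping
+        # only windows that fit entirely inside the video.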
+ if self.multiple_target:
+ for _, _indices in sorted(video_frames.items()):
+ n_frame = len(_indices)
seqs_from_video = [
- _indices[i:(i + _len):_step]
- for i in range(0, n_frame - _len + 1)
- ]
+ _indices[i:(i + self.multiple_target):_step]
+ for i in range(0, n_frame, self.multiple_target_step)
+ ][:(n_frame + self.multiple_target_step -
+ self.multiple_target) // self.multiple_target_step]
sequence_indices.extend(seqs_from_video)
+ else:
+ for _, _indices in sorted(video_frames.items()):
+ n_frame = len(_indices)
+
+ if self.pad_video_seq:
+ # Pad the sequence so that every frame in the sequence will
+ # be predicted.
+ if self.causal:
+ frames_left = self.seq_len - 1
+ frames_right = 0
+ else:
+ frames_left = (self.seq_len - 1) // 2
+ frames_right = frames_left
+ for i in range(n_frame):
+ pad_left = max(0, frames_left - i // _step)
+ pad_right = max(
+ 0, frames_right - (n_frame - 1 - i) // _step)
+ start = max(i % _step, i - frames_left * _step)
+ end = min(n_frame - (n_frame - 1 - i) % _step,
+ i + frames_right * _step + 1)
+ sequence_indices.append([_indices[0]] * pad_left +
+ _indices[start:end:_step] +
+ [_indices[-1]] * pad_right)
+ else:
+ seqs_from_video = [
+ _indices[i:(i + _len):_step]
+ for i in range(0, n_frame - _len + 1)
+ ]
+ sequence_indices.extend(seqs_from_video)
+
# reduce dataset size if needed
subset_size = int(len(sequence_indices) * self.subset_frac)
start = np.random.randint(0, len(sequence_indices) - subset_size + 1)
end = start + subset_size
- return sequence_indices[start:end]
+ sequence_indices = sequence_indices[start:end]
+
+ return sequence_indices
def _load_annotations(self) -> Tuple[List[dict], List[dict]]:
instance_list, image_list = super()._load_annotations()
@@ -230,6 +265,15 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]:
'keypoints_visible':
keypoints_visible
})
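+        # Attach a per-frame projection factor to every sequence; when no
+        # factor_file is given, zeros are stored as placeholders.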
+ if self.factor_file:
+ with get_local_path(self.factor_file) as local_path:
+ factors = np.load(local_path).astype(np.float32)
+ else:
+ factors = np.zeros((kpts_3d.shape[0], ), dtype=np.float32)
+ assert factors.shape[0] == kpts_3d.shape[0]
+ for idx, frame_ids in enumerate(self.sequence_indices):
+ factor = factors[frame_ids].astype(np.float32)
+ instance_list[idx].update({'factor': factor})
return instance_list, image_list
diff --git a/mmpose/datasets/transforms/formatting.py b/mmpose/datasets/transforms/formatting.py
index 05aeef179f..d047cff3c3 100644
--- a/mmpose/datasets/transforms/formatting.py
+++ b/mmpose/datasets/transforms/formatting.py
@@ -51,8 +51,6 @@ def keypoints_to_tensor(keypoints: Union[np.ndarray, Sequence[np.ndarray]]
"""
if isinstance(keypoints, np.ndarray):
keypoints = np.ascontiguousarray(keypoints)
- N = keypoints.shape[0]
- keypoints = keypoints.transpose(1, 2, 0).reshape(-1, N)
tensor = torch.from_numpy(keypoints).contiguous()
else:
assert is_seq_of(keypoints, np.ndarray)
@@ -209,9 +207,9 @@ def transform(self, results: dict) -> dict:
for key, packed_key in self.label_mapping_table.items():
if key in results:
# For pose-lifting, store only target-related fields
- if 'lifting_target_label' in results and key in {
+ if 'lifting_target' in results and packed_key in {
'keypoint_labels', 'keypoint_weights',
- 'transformed_keypoints_visible'
+ 'keypoints_visible'
}:
continue
if isinstance(results[key], list):
diff --git a/mmpose/datasets/transforms/pose3d_transforms.py b/mmpose/datasets/transforms/pose3d_transforms.py
index e6559fa398..2149d7cb30 100644
--- a/mmpose/datasets/transforms/pose3d_transforms.py
+++ b/mmpose/datasets/transforms/pose3d_transforms.py
@@ -25,6 +25,8 @@ class RandomFlipAroundRoot(BaseTransform):
flip_prob (float): Probability of flip. Default: 0.5.
flip_camera (bool): Whether to flip horizontal distortion coefficients.
Default: ``False``.
+ flip_image (bool): Whether to flip keypoints horizontally according
+ to image size. Default: ``False``.
Required keys:
keypoints
@@ -39,14 +41,16 @@ def __init__(self,
keypoints_flip_cfg,
target_flip_cfg,
flip_prob=0.5,
- flip_camera=False):
+ flip_camera=False,
+ flip_image=False):
self.keypoints_flip_cfg = keypoints_flip_cfg
self.target_flip_cfg = target_flip_cfg
self.flip_prob = flip_prob
self.flip_camera = flip_camera
+ self.flip_image = flip_image
def transform(self, results: Dict) -> dict:
- """The transform function of :class:`ZeroCenterPose`.
+ """The transform function of :class:`RandomFlipAroundRoot`.
See ``transform()`` method of :class:`BaseTransform` for details.
@@ -76,6 +80,15 @@ def transform(self, results: Dict) -> dict:
flip_indices = results['flip_indices']
# flip joint coordinates
+            _camera_param = deepcopy(results.get('camera_param', dict()))
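+            # When flipping w.r.t. the image, mirror the keypoints about the
+            # vertical line x = w / 2 instead of about the root joint.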
+ if self.flip_image:
+ assert 'camera_param' in results, \
+ 'Camera parameters are missing.'
+ assert 'w' in _camera_param
+ w = _camera_param['w'] / 2
+ self.keypoints_flip_cfg['center_x'] = w
+ self.target_flip_cfg['center_x'] = w
+
keypoints, keypoints_visible = flip_keypoints_custom_center(
keypoints, keypoints_visible, flip_indices,
**self.keypoints_flip_cfg)
@@ -92,7 +105,6 @@ def transform(self, results: Dict) -> dict:
if self.flip_camera:
assert 'camera_param' in results, \
'Camera parameters are missing.'
- _camera_param = deepcopy(results['camera_param'])
assert 'c' in _camera_param
_camera_param['c'][0] *= -1
diff --git a/mmpose/evaluation/metrics/keypoint_3d_metrics.py b/mmpose/evaluation/metrics/keypoint_3d_metrics.py
index e945650c30..fb3447bb3f 100644
--- a/mmpose/evaluation/metrics/keypoint_3d_metrics.py
+++ b/mmpose/evaluation/metrics/keypoint_3d_metrics.py
@@ -1,7 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
from collections import defaultdict
from os import path as osp
-from typing import Dict, Optional, Sequence
+from typing import Dict, List, Optional, Sequence
import numpy as np
from mmengine.evaluator import BaseMetric
@@ -38,6 +38,8 @@ class MPJPE(BaseMetric):
names to disambiguate homonymous metrics of different evaluators.
If prefix is not provided in the argument, ``self.default_prefix``
will be used instead. Default: ``None``.
+ skip_list (list, optional): The list of subject and action combinations
+ to be skipped. Default: [].
"""
ALIGNMENT = {'mpjpe': 'none', 'p-mpjpe': 'procrustes', 'n-mpjpe': 'scale'}
@@ -45,7 +47,8 @@ class MPJPE(BaseMetric):
def __init__(self,
mode: str = 'mpjpe',
collect_device: str = 'cpu',
- prefix: Optional[str] = None) -> None:
+ prefix: Optional[str] = None,
+ skip_list: List[str] = []) -> None:
super().__init__(collect_device=collect_device, prefix=prefix)
allowed_modes = self.ALIGNMENT.keys()
if mode not in allowed_modes:
@@ -53,6 +56,7 @@ def __init__(self,
f"'n-mpjpe', but got '{mode}'.")
self.mode = mode
+ self.skip_list = skip_list
def process(self, data_batch: Sequence[dict],
data_samples: Sequence[dict]) -> None:
@@ -67,24 +71,32 @@ def process(self, data_batch: Sequence[dict],
the model.
"""
for data_sample in data_samples:
- # predicted keypoints coordinates, [1, K, D]
+ # predicted keypoints coordinates, [T, K, D]
pred_coords = data_sample['pred_instances']['keypoints']
+ if pred_coords.ndim == 4:
+ pred_coords = np.squeeze(pred_coords, axis=0)
# ground truth data_info
gt = data_sample['gt_instances']
- # ground truth keypoints coordinates, [1, K, D]
+ # ground truth keypoints coordinates, [T, K, D]
gt_coords = gt['lifting_target']
- # ground truth keypoints_visible, [1, K, 1]
- mask = gt['lifting_target_visible'].astype(bool).reshape(1, -1)
+ # ground truth keypoints_visible, [T, K, 1]
+ mask = gt['lifting_target_visible'].astype(bool).reshape(
+ gt_coords.shape[0], -1)
# instance action
- img_path = data_sample['target_img_path']
+ img_path = data_sample['target_img_path'][0]
_, rest = osp.basename(img_path).split('_', 1)
action, _ = rest.split('.', 1)
+ actions = np.array([action] * gt_coords.shape[0])
+
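+            # Skip samples whose subject-action prefix (e.g. 'S7_Greeting')
+            # appears in ``skip_list``.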
+ subj_act = osp.basename(img_path).split('.')[0]
+ if subj_act in self.skip_list:
+ continue
result = {
'pred_coords': pred_coords,
'gt_coords': gt_coords,
'mask': mask,
- 'action': action
+ 'actions': actions
}
self.results.append(result)
@@ -104,16 +116,15 @@ def compute_metrics(self, results: list) -> Dict[str, float]:
# pred_coords: [N, K, D]
pred_coords = np.concatenate(
[result['pred_coords'] for result in results])
- if pred_coords.ndim == 4 and pred_coords.shape[1] == 1:
- pred_coords = np.squeeze(pred_coords, axis=1)
# gt_coords: [N, K, D]
- gt_coords = np.stack([result['gt_coords'] for result in results])
+ gt_coords = np.concatenate([result['gt_coords'] for result in results])
# mask: [N, K]
mask = np.concatenate([result['mask'] for result in results])
# action_category_indices: Dict[List[int]]
action_category_indices = defaultdict(list)
- for idx, result in enumerate(results):
- action_category = result['action'].split('_')[0]
+ actions = np.concatenate([result['actions'] for result in results])
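+        # With multi-frame targets, every frame carries its own action
+        # label, so the indices below address frames rather than samples.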
+ for idx, action in enumerate(actions):
+ action_category = action.split('_')[0]
action_category_indices[action_category].append(idx)
error_name = self.mode.upper()
@@ -126,6 +137,7 @@ def compute_metrics(self, results: list) -> Dict[str, float]:
for action_category, indices in action_category_indices.items():
metrics[f'{error_name}_{action_category}'] = keypoint_mpjpe(
- pred_coords[indices], gt_coords[indices], mask[indices])
+ pred_coords[indices], gt_coords[indices], mask[indices],
+ self.ALIGNMENT[self.mode])
return metrics
diff --git a/mmpose/models/backbones/__init__.py b/mmpose/models/backbones/__init__.py
index cb2498560a..563264eecf 100644
--- a/mmpose/models/backbones/__init__.py
+++ b/mmpose/models/backbones/__init__.py
@@ -1,6 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .alexnet import AlexNet
from .cpm import CPM
+from .dstformer import DSTFormer
from .hourglass import HourglassNet
from .hourglass_ae import HourglassAENet
from .hrformer import HRFormer
@@ -33,5 +34,5 @@
'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN',
'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3',
'LiteHRNet', 'V2VNet', 'HRFormer', 'PyramidVisionTransformer',
- 'PyramidVisionTransformerV2', 'SwinTransformer'
+ 'PyramidVisionTransformerV2', 'SwinTransformer', 'DSTFormer'
]
diff --git a/mmpose/models/backbones/dstformer.py b/mmpose/models/backbones/dstformer.py
new file mode 100644
index 0000000000..2ef13bdb02
--- /dev/null
+++ b/mmpose/models/backbones/dstformer.py
@@ -0,0 +1,304 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.cnn.bricks import DropPath
+from mmengine.model import BaseModule, constant_init
+from mmengine.model.weight_init import trunc_normal_
+
+from mmpose.registry import MODELS
+from .base_backbone import BaseBackbone
+
+
+class Attention(BaseModule):
+
+ def __init__(self,
+ dim,
+ num_heads=8,
+ qkv_bias=False,
+ qk_scale=None,
+ attn_drop=0.,
+ proj_drop=0.,
+ mode='spatial'):
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = qk_scale or head_dim**-0.5
+
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim)
+ self.mode = mode
+
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ self.attn_count_s = None
+ self.attn_count_t = None
+
+ def forward(self, x, seq_len=1):
+ B, N, C = x.shape
+
+ if self.mode == 'temporal':
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C //
+ self.num_heads).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[
+ 2] # make torchscript happy (cannot use tensor as tuple)
+ x = self.forward_temporal(q, k, v, seq_len=seq_len)
+ elif self.mode == 'spatial':
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C //
+ self.num_heads).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[
+ 2] # make torchscript happy (cannot use tensor as tuple)
+ x = self.forward_spatial(q, k, v)
+ else:
+ raise NotImplementedError(self.mode)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+ def forward_spatial(self, q, k, v):
+ B, _, N, C = q.shape
+ attn = (q @ k.transpose(-2, -1)) * self.scale
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = attn @ v
+ x = x.transpose(1, 2).reshape(B, N, C * self.num_heads)
+ return x
+
+ def forward_temporal(self, q, k, v, seq_len=8):
+ B, _, N, C = q.shape
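+        # Regroup the flat batch of frames so attention runs across the
+        # seq_len frames of each joint: (B*T, H, N, C) -> (B, H, N, T, C).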
+ qt = q.reshape(-1, seq_len, self.num_heads, N,
+ C).permute(0, 2, 3, 1, 4) # (B, H, N, T, C)
+ kt = k.reshape(-1, seq_len, self.num_heads, N,
+ C).permute(0, 2, 3, 1, 4) # (B, H, N, T, C)
+ vt = v.reshape(-1, seq_len, self.num_heads, N,
+ C).permute(0, 2, 3, 1, 4) # (B, H, N, T, C)
+
+ attn = (qt @ kt.transpose(-2, -1)) * self.scale
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = attn @ vt # (B, H, N, T, C)
+ x = x.permute(0, 3, 2, 1, 4).reshape(B, N, C * self.num_heads)
+ return x
+
+
+class AttentionBlock(BaseModule):
+
+ def __init__(self,
+ dim,
+ num_heads,
+ mlp_ratio=4.,
+ mlp_out_ratio=1.,
+ qkv_bias=True,
+ qk_scale=None,
+ drop=0.,
+ attn_drop=0.,
+ drop_path=0.,
+ st_mode='st'):
+ super().__init__()
+
+ self.st_mode = st_mode
+ self.norm1_s = nn.LayerNorm(dim, eps=1e-06)
+ self.norm1_t = nn.LayerNorm(dim, eps=1e-06)
+
+ self.attn_s = Attention(
+ dim,
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ attn_drop=attn_drop,
+ proj_drop=drop,
+ mode='spatial')
+ self.attn_t = Attention(
+ dim,
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ attn_drop=attn_drop,
+ proj_drop=drop,
+ mode='temporal')
+
+ self.drop_path = DropPath(
+ drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2_s = nn.LayerNorm(dim, eps=1e-06)
+ self.norm2_t = nn.LayerNorm(dim, eps=1e-06)
+
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ mlp_out_dim = int(dim * mlp_out_ratio)
+ self.mlp_s = nn.Sequential(
+ nn.Linear(dim, mlp_hidden_dim), nn.GELU(),
+ nn.Linear(mlp_hidden_dim, mlp_out_dim), nn.Dropout(drop))
+ self.mlp_t = nn.Sequential(
+ nn.Linear(dim, mlp_hidden_dim), nn.GELU(),
+ nn.Linear(mlp_hidden_dim, mlp_out_dim), nn.Dropout(drop))
+
+ def forward(self, x, seq_len=1):
+ if self.st_mode == 'st':
+ x = x + self.drop_path(self.attn_s(self.norm1_s(x), seq_len))
+ x = x + self.drop_path(self.mlp_s(self.norm2_s(x)))
+ x = x + self.drop_path(self.attn_t(self.norm1_t(x), seq_len))
+ x = x + self.drop_path(self.mlp_t(self.norm2_t(x)))
+ elif self.st_mode == 'ts':
+ x = x + self.drop_path(self.attn_t(self.norm1_t(x), seq_len))
+ x = x + self.drop_path(self.mlp_t(self.norm2_t(x)))
+ x = x + self.drop_path(self.attn_s(self.norm1_s(x), seq_len))
+ x = x + self.drop_path(self.mlp_s(self.norm2_s(x)))
+ else:
+ raise NotImplementedError(self.st_mode)
+ return x
+
+
+@MODELS.register_module()
+class DSTFormer(BaseBackbone):
+ """Dual-stream Spatio-temporal Transformer Module.
+
+ Args:
+ in_channels (int): Number of input channels.
+        feat_size (int): Number of feature channels. Default: 256.
+        depth (int): The network depth. Default: 5.
+        num_heads (int): Number of heads in multi-head self-attention blocks.
+            Default: 8.
+        mlp_ratio (int, optional): The expansion ratio of FFN. Default: 4.
+        num_keypoints (int): Number of keypoints. Default: 17.
+        seq_len (int): The sequence length. Default: 243.
+ qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
+ Default: True.
+ qk_scale (float | None, optional): Override default qk scale of
+ head_dim ** -0.5 if set. Default: None.
+ drop_rate (float, optional): Dropout ratio of input. Default: 0.
+ attn_drop_rate (float, optional): Dropout ratio of attention weight.
+ Default: 0.
+ drop_path_rate (float, optional): Stochastic depth rate. Default: 0.
+        att_fuse (bool): Whether to fuse the outputs of the spatial-first
+            and temporal-first attention streams with learned weights.
+            Default: True.
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Default: None
+
+ Example:
+ >>> from mmpose.models import DSTFormer
+ >>> import torch
+ >>> self = DSTFormer(in_channels=3)
+ >>> self.eval()
+ >>> inputs = torch.rand(1, 2, 17, 3)
+ >>> level_outputs = self.forward(inputs)
+ >>> print(tuple(level_outputs.shape))
+        (1, 2, 17, 256)
+ """
+
+ def __init__(self,
+ in_channels,
+ feat_size=256,
+ depth=5,
+ num_heads=8,
+ mlp_ratio=4,
+ num_keypoints=17,
+ seq_len=243,
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.,
+ att_fuse=True,
+ init_cfg=None):
+ super().__init__(init_cfg=init_cfg)
+
+ self.in_channels = in_channels
+ self.feat_size = feat_size
+
+ self.joints_embed = nn.Linear(in_channels, feat_size)
+ self.pos_drop = nn.Dropout(p=drop_rate)
+
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
+ ] # stochastic depth decay rule
+
+ self.blocks_st = nn.ModuleList([
+ AttentionBlock(
+ dim=feat_size,
+ num_heads=num_heads,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ drop=drop_rate,
+ attn_drop=attn_drop_rate,
+ drop_path=dpr[i],
+ st_mode='st') for i in range(depth)
+ ])
+ self.blocks_ts = nn.ModuleList([
+ AttentionBlock(
+ dim=feat_size,
+ num_heads=num_heads,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ drop=drop_rate,
+ attn_drop=attn_drop_rate,
+ drop_path=dpr[i],
+ st_mode='ts') for i in range(depth)
+ ])
+
+ self.norm = nn.LayerNorm(feat_size, eps=1e-06)
+
+ self.temp_embed = nn.Parameter(torch.zeros(1, seq_len, 1, feat_size))
+ self.spat_embed = nn.Parameter(
+ torch.zeros(1, num_keypoints, feat_size))
+
+ trunc_normal_(self.temp_embed, std=.02)
+ trunc_normal_(self.spat_embed, std=.02)
+
+ self.att_fuse = att_fuse
+ if self.att_fuse:
+ self.attn_regress = nn.ModuleList(
+ [nn.Linear(feat_size * 2, 2) for i in range(depth)])
+ for i in range(depth):
+ self.attn_regress[i].weight.data.fill_(0)
+ self.attn_regress[i].bias.data.fill_(0.5)
+
+ def forward(self, x):
+ if len(x.shape) == 3:
+ x = x[None, :]
+ assert len(x.shape) == 4
+
+ B, F, K, C = x.shape
+ x = x.reshape(-1, K, C)
+ BF = x.shape[0]
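+        # Tokens are kept as a flat batch of frames (BF = B * F, K joints
+        # per frame); spatial attention mixes joints within a frame while
+        # temporal attention mixes the same joint across the F frames.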
+ x = self.joints_embed(x) # (BF, K, feat_size)
+ x = x + self.spat_embed
+ _, K, C = x.shape
+ x = x.reshape(-1, F, K, C) + self.temp_embed[:, :F, :, :]
+ x = x.reshape(BF, K, C) # (BF, K, feat_size)
+ x = self.pos_drop(x)
+
+ for idx, (blk_st,
+ blk_ts) in enumerate(zip(self.blocks_st, self.blocks_ts)):
+ x_st = blk_st(x, F)
+ x_ts = blk_ts(x, F)
+ if self.att_fuse:
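+                # Predict per-token fusion weights from the concatenated
+                # stream outputs; softmax makes the two weights sum to 1.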
+ att = self.attn_regress[idx]
+ alpha = torch.cat([x_st, x_ts], dim=-1)
+ BF, K = alpha.shape[:2]
+ alpha = att(alpha)
+ alpha = alpha.softmax(dim=-1)
+ x = x_st * alpha[:, :, 0:1] + x_ts * alpha[:, :, 1:2]
+ else:
+ x = (x_st + x_ts) * 0.5
+ x = self.norm(x) # (BF, K, feat_size)
+ x = x.reshape(B, F, K, -1)
+ return x
+
+ def init_weights(self):
+ """Initialize the weights in backbone."""
+ super(DSTFormer, self).init_weights()
+
+ if (isinstance(self.init_cfg, dict)
+ and self.init_cfg['type'] == 'Pretrained'):
+ return
+
+ for m in self.modules():
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ constant_init(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ constant_init(m.bias, 0)
+ constant_init(m.weight, 1.0)
diff --git a/mmpose/models/heads/__init__.py b/mmpose/models/heads/__init__.py
index e01f2269e3..ef0e17d98e 100644
--- a/mmpose/models/heads/__init__.py
+++ b/mmpose/models/heads/__init__.py
@@ -5,7 +5,8 @@
HeatmapHead, MSPNHead, ViPNASHead)
from .hybrid_heads import DEKRHead, VisPredictHead
from .regression_heads import (DSNTHead, IntegralRegressionHead,
- RegressionHead, RLEHead, TemporalRegressionHead,
+ MotionRegressionHead, RegressionHead, RLEHead,
+ TemporalRegressionHead,
TrajectoryRegressionHead)
__all__ = [
@@ -13,5 +14,5 @@
'RegressionHead', 'IntegralRegressionHead', 'SimCCHead', 'RLEHead',
'DSNTHead', 'AssociativeEmbeddingHead', 'DEKRHead', 'VisPredictHead',
'CIDHead', 'RTMCCHead', 'TemporalRegressionHead',
- 'TrajectoryRegressionHead'
+ 'TrajectoryRegressionHead', 'MotionRegressionHead'
]
diff --git a/mmpose/models/heads/regression_heads/__init__.py b/mmpose/models/heads/regression_heads/__init__.py
index ce9cd5e1b0..729d193b51 100644
--- a/mmpose/models/heads/regression_heads/__init__.py
+++ b/mmpose/models/heads/regression_heads/__init__.py
@@ -1,16 +1,14 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .dsnt_head import DSNTHead
from .integral_regression_head import IntegralRegressionHead
+from .motion_regression_head import MotionRegressionHead
from .regression_head import RegressionHead
from .rle_head import RLEHead
from .temporal_regression_head import TemporalRegressionHead
from .trajectory_regression_head import TrajectoryRegressionHead
__all__ = [
- 'RegressionHead',
- 'IntegralRegressionHead',
- 'DSNTHead',
- 'RLEHead',
- 'TemporalRegressionHead',
- 'TrajectoryRegressionHead',
+ 'RegressionHead', 'IntegralRegressionHead', 'DSNTHead', 'RLEHead',
+ 'TemporalRegressionHead', 'TrajectoryRegressionHead',
+ 'MotionRegressionHead'
]
diff --git a/mmpose/models/heads/regression_heads/motion_regression_head.py b/mmpose/models/heads/regression_heads/motion_regression_head.py
new file mode 100644
index 0000000000..a0037180c7
--- /dev/null
+++ b/mmpose/models/heads/regression_heads/motion_regression_head.py
@@ -0,0 +1,176 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import OrderedDict
+from typing import Tuple
+
+import numpy as np
+import torch
+from torch import Tensor, nn
+
+from mmpose.evaluation.functional import keypoint_mpjpe
+from mmpose.registry import KEYPOINT_CODECS, MODELS
+from mmpose.utils.tensor_utils import to_numpy
+from mmpose.utils.typing import (ConfigType, OptConfigType, OptSampleList,
+ Predictions)
+from ..base_head import BaseHead
+
+
+@MODELS.register_module()
+class MotionRegressionHead(BaseHead):
+    """Regression head of `MotionBERT`_ by Zhu et al. (2022).
+
+ Args:
+ in_channels (int): Number of input channels. Default: 256.
+ out_channels (int): Number of output channels. Default: 3.
+ embedding_size (int): Number of embedding channels. Default: 512.
+ loss (Config): Config for keypoint loss. Defaults to use
+ :class:`MSELoss`
+ decoder (Config, optional): The decoder config that controls decoding
+ keypoint coordinates from the network output. Defaults to ``None``
+ init_cfg (Config, optional): Config to control the initialization. See
+ :attr:`default_init_cfg` for default settings
+
+ .. _`MotionBERT`: https://arxiv.org/abs/2210.06551
+ """
+
+ _version = 2
+
+ def __init__(self,
+ in_channels: int = 256,
+ out_channels: int = 3,
+ embedding_size: int = 512,
+ loss: ConfigType = dict(
+ type='MSELoss', use_target_weight=True),
+ decoder: OptConfigType = None,
+ init_cfg: OptConfigType = None):
+
+ if init_cfg is None:
+ init_cfg = self.default_init_cfg
+
+ super().__init__(init_cfg)
+
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.loss_module = MODELS.build(loss)
+ if decoder is not None:
+ self.decoder = KEYPOINT_CODECS.build(decoder)
+ else:
+ self.decoder = None
+
+ # Define fully-connected layers
+ self.pre_logits = nn.Sequential(
+ OrderedDict([('fc', nn.Linear(in_channels, embedding_size)),
+ ('act', nn.Tanh())]))
+ self.fc = nn.Linear(
+ embedding_size,
+ out_channels) if embedding_size > 0 else nn.Identity()
+
+ def forward(self, feats: Tuple[Tensor]) -> Tensor:
+ """Forward the network. The input is multi scale feature maps and the
+ output is the coordinates.
+
+ Args:
+ feats (Tuple[Tensor]): Multi scale feature maps.
+
+ Returns:
+ Tensor: Output coordinates (and sigmas[optional]).
+ """
+ x = feats # (B, F, K, in_channels)
+ x = self.pre_logits(x) # (B, F, K, embedding_size)
+ x = self.fc(x) # (B, F, K, out_channels)
+
+ return x
+
+ def predict(self,
+ feats: Tuple[Tensor],
+ batch_data_samples: OptSampleList,
+ test_cfg: ConfigType = {}) -> Predictions:
+ """Predict results from outputs.
+
+ Returns:
+ preds (sequence[InstanceData]): Prediction results.
+ Each contains the following fields:
+
+ - keypoints: Predicted keypoints of shape (B, N, K, D).
+ - keypoint_scores: Scores of predicted keypoints of shape
+ (B, N, K).
+ """
+
+        batch_coords = self.forward(feats)  # (B, F, K, D)
+
+ # Restore global position with camera_param and factor
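+        # Image size (w, h) and the per-clip factor are stacked per sample;
+        # empty tensors are passed when they are absent from the metainfo.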
+ camera_param = batch_data_samples[0].metainfo.get('camera_param', None)
+ if camera_param is not None:
+ w = torch.stack([
+ torch.from_numpy(np.array([b.metainfo['camera_param']['w']]))
+ for b in batch_data_samples
+ ])
+ h = torch.stack([
+ torch.from_numpy(np.array([b.metainfo['camera_param']['h']]))
+ for b in batch_data_samples
+ ])
+ else:
+ w = torch.stack([
+ torch.empty((0), dtype=torch.float32)
+ for _ in batch_data_samples
+ ])
+ h = torch.stack([
+ torch.empty((0), dtype=torch.float32)
+ for _ in batch_data_samples
+ ])
+
+ factor = batch_data_samples[0].metainfo.get('factor', None)
+ if factor is not None:
+ factor = torch.stack([
+ torch.from_numpy(b.metainfo['factor'])
+ for b in batch_data_samples
+ ])
+ else:
+ factor = torch.stack([
+ torch.empty((0), dtype=torch.float32)
+ for _ in batch_data_samples
+ ])
+
+ preds = self.decode((batch_coords, w, h, factor))
+
+ return preds
+
+ def loss(self,
+ inputs: Tuple[Tensor],
+ batch_data_samples: OptSampleList,
+ train_cfg: ConfigType = {}) -> dict:
+ """Calculate losses from a batch of inputs and data samples."""
+
+ pred_outputs = self.forward(inputs)
+
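+        # Stack the per-sample lifting targets and weights into batched
+        # tensors matching the (B, T, K, D) head output.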
+ lifting_target_label = torch.stack([
+ d.gt_instance_labels.lifting_target_label
+ for d in batch_data_samples
+ ])
+ lifting_target_weights = torch.stack([
+ d.gt_instance_labels.lifting_target_weights
+ for d in batch_data_samples
+ ])
+
+ # calculate losses
+ losses = dict()
+ loss = self.loss_module(pred_outputs, lifting_target_label,
+ lifting_target_weights.unsqueeze(-1))
+
+ losses.update(loss_pose3d=loss)
+
+ # calculate accuracy
+ mpjpe_err = keypoint_mpjpe(
+ pred=to_numpy(pred_outputs),
+ gt=to_numpy(lifting_target_label),
+ mask=to_numpy(lifting_target_weights) > 0)
+
+ mpjpe_pose = torch.tensor(
+ mpjpe_err, device=lifting_target_label.device)
+ losses.update(mpjpe=mpjpe_pose)
+
+ return losses
+
+ @property
+ def default_init_cfg(self):
+ init_cfg = [dict(type='TruncNormal', layer=['Linear'], std=0.02)]
+ return init_cfg
diff --git a/mmpose/models/heads/regression_heads/temporal_regression_head.py b/mmpose/models/heads/regression_heads/temporal_regression_head.py
index ac76316842..9ed2e9f4fa 100644
--- a/mmpose/models/heads/regression_heads/temporal_regression_head.py
+++ b/mmpose/models/heads/regression_heads/temporal_regression_head.py
@@ -101,7 +101,7 @@ def predict(self,
else:
target_root = torch.stack([
torch.empty((0), dtype=torch.float32)
- for _ in batch_data_samples[0].metainfo
+ for _ in batch_data_samples
])
preds = self.decode((batch_coords, target_root))
diff --git a/mmpose/models/heads/regression_heads/trajectory_regression_head.py b/mmpose/models/heads/regression_heads/trajectory_regression_head.py
index adfd7353d3..a1608aaae7 100644
--- a/mmpose/models/heads/regression_heads/trajectory_regression_head.py
+++ b/mmpose/models/heads/regression_heads/trajectory_regression_head.py
@@ -101,7 +101,7 @@ def predict(self,
else:
target_root = torch.stack([
torch.empty((0), dtype=torch.float32)
- for _ in batch_data_samples[0].metainfo
+ for _ in batch_data_samples
])
preds = self.decode((batch_coords, target_root))
diff --git a/mmpose/models/losses/regression_loss.py b/mmpose/models/losses/regression_loss.py
index 9a64a4adfe..b50ad99f04 100644
--- a/mmpose/models/losses/regression_loss.py
+++ b/mmpose/models/losses/regression_loss.py
@@ -365,6 +365,84 @@ def forward(self, output, target, target_weight=None):
return loss * self.loss_weight
+@MODELS.register_module()
+class MPJPEVelocityJointLoss(nn.Module):
+    """MPJPE (Mean Per Joint Position Error) loss combined with a
+    scale-aligned (N-MPJPE) term and a 3D joint velocity term.
+
+    Args:
+        use_target_weight (bool): Option to use weighted loss. Different
+            joint types may have different target weights. Default: False.
+        loss_weight (float): Weight of the loss. Default: 1.0.
+        lambda_scale (float): Factor of the N-MPJPE loss. Default: 0.5.
+        lambda_3d_velocity (float): Factor of the velocity loss. Default: 20.0.
+ """
+
+ def __init__(self,
+ use_target_weight=False,
+ loss_weight=1.,
+ lambda_scale=0.5,
+ lambda_3d_velocity=20.0):
+ super().__init__()
+ self.use_target_weight = use_target_weight
+ self.loss_weight = loss_weight
+ self.lambda_scale = lambda_scale
+ self.lambda_3d_velocity = lambda_3d_velocity
+
+ def forward(self, output, target, target_weight=None):
+ """Forward function.
+
+        Note:
+            - batch_size: N
+            - num_frames: T
+            - num_keypoints: K
+            - dimension of keypoints: D (D=2 or D=3)
+
+        Args:
+            output (torch.Tensor[N, T, K, D]): Output regression.
+            target (torch.Tensor[N, T, K, D]): Target regression.
+            target_weight (torch.Tensor[N, T, K, D]):
+                Weights across different joint types.
+ """
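+        # Least-squares scale that aligns the prediction to the target for
+        # the N-MPJPE term: s = <target, output> / ||output||^2, averaged
+        # over the keypoint dimension.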
+ norm_output = torch.mean(
+ torch.sum(torch.square(output), dim=-1, keepdim=True),
+ dim=-2,
+ keepdim=True)
+ norm_target = torch.mean(
+ torch.sum(target * output, dim=-1, keepdim=True),
+ dim=-2,
+ keepdim=True)
+
+ velocity_output = output[..., 1:, :, :] - output[..., :-1, :, :]
+ velocity_target = target[..., 1:, :, :] - target[..., :-1, :, :]
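+        # Frame-to-frame differences approximate the 3D joint velocities.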
+
+ if self.use_target_weight:
+ assert target_weight is not None
+ mpjpe = torch.mean(
+ torch.norm((output - target) * target_weight, dim=-1))
+
+ nmpjpe = torch.mean(
+ torch.norm(
+ (norm_target / norm_output * output - target) *
+ target_weight,
+ dim=-1))
+
+ loss_3d_velocity = torch.mean(
+ torch.norm(
+ (velocity_output - velocity_target) * target_weight,
+ dim=-1))
+ else:
+ mpjpe = torch.mean(torch.norm(output - target, dim=-1))
+
+ nmpjpe = torch.mean(
+ torch.norm(
+ norm_target / norm_output * output - target, dim=-1))
+
+ loss_3d_velocity = torch.mean(
+ torch.norm(velocity_output - velocity_target, dim=-1))
+
+ loss = mpjpe + nmpjpe * self.lambda_scale + \
+ loss_3d_velocity * self.lambda_3d_velocity
+
+ return loss * self.loss_weight
+
+
@MODELS.register_module()
class MPJPELoss(nn.Module):
"""MPJPE (Mean Per Joint Position Error) loss.
diff --git a/tests/test_codecs/test_image_pose_lifting.py b/tests/test_codecs/test_image_pose_lifting.py
index bb94786c32..78b19ec59b 100644
--- a/tests/test_codecs/test_image_pose_lifting.py
+++ b/tests/test_codecs/test_image_pose_lifting.py
@@ -13,14 +13,18 @@ def setUp(self) -> None:
keypoints = (0.1 + 0.8 * np.random.rand(1, 17, 2)) * [192, 256]
keypoints = np.round(keypoints).astype(np.float32)
keypoints_visible = np.random.randint(2, size=(1, 17))
- lifting_target = (0.1 + 0.8 * np.random.rand(17, 3))
- lifting_target_visible = np.random.randint(2, size=(17, ))
+ lifting_target = (0.1 + 0.8 * np.random.rand(1, 17, 3))
+ lifting_target_visible = np.random.randint(
+ 2, size=(
+ 1,
+ 17,
+ ))
encoded_wo_sigma = np.random.rand(1, 17, 3)
self.keypoints_mean = np.random.rand(17, 2).astype(np.float32)
self.keypoints_std = np.random.rand(17, 2).astype(np.float32) + 1e-6
- self.target_mean = np.random.rand(17, 3).astype(np.float32)
- self.target_std = np.random.rand(17, 3).astype(np.float32) + 1e-6
+ self.target_mean = np.random.rand(1, 17, 3).astype(np.float32)
+ self.target_std = np.random.rand(1, 17, 3).astype(np.float32) + 1e-6
self.data = dict(
keypoints=keypoints,
@@ -30,7 +34,11 @@ def setUp(self) -> None:
encoded_wo_sigma=encoded_wo_sigma)
def build_pose_lifting_label(self, **kwargs):
- cfg = dict(type='ImagePoseLifting', num_keypoints=17, root_index=0)
+ cfg = dict(
+ type='ImagePoseLifting',
+ num_keypoints=17,
+ root_index=0,
+ reshape_keypoints=False)
cfg.update(kwargs)
return KEYPOINT_CODECS.build(cfg)
@@ -50,10 +58,19 @@ def test_encode(self):
lifting_target_visible)
self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2))
- self.assertEqual(encoded['lifting_target_label'].shape, (17, 3))
- self.assertEqual(encoded['lifting_target_weights'].shape, (17, ))
- self.assertEqual(encoded['trajectory_weights'].shape, (17, ))
- self.assertEqual(encoded['target_root'].shape, (3, ))
+ self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3))
+ self.assertEqual(encoded['lifting_target_weights'].shape, (
+ 1,
+ 17,
+ ))
+ self.assertEqual(encoded['trajectory_weights'].shape, (
+ 1,
+ 17,
+ ))
+ self.assertEqual(encoded['target_root'].shape, (
+ 1,
+ 3,
+ ))
# test removing root
codec = self.build_pose_lifting_label(
@@ -63,10 +80,16 @@ def test_encode(self):
self.assertTrue('target_root_removed' in encoded
and 'target_root_index' in encoded)
- self.assertEqual(encoded['lifting_target_weights'].shape, (16, ))
+ self.assertEqual(encoded['lifting_target_weights'].shape, (
+ 1,
+ 16,
+ ))
self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2))
- self.assertEqual(encoded['lifting_target_label'].shape, (16, 3))
- self.assertEqual(encoded['target_root'].shape, (3, ))
+ self.assertEqual(encoded['lifting_target_label'].shape, (1, 16, 3))
+ self.assertEqual(encoded['target_root'].shape, (
+ 1,
+ 3,
+ ))
# test normalization
codec = self.build_pose_lifting_label(
@@ -78,7 +101,7 @@ def test_encode(self):
lifting_target_visible)
self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2))
- self.assertEqual(encoded['lifting_target_label'].shape, (17, 3))
+ self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3))
def test_decode(self):
lifting_target = self.data['lifting_target']
@@ -112,12 +135,10 @@ def test_cicular_verification(self):
lifting_target_visible)
_keypoints, _ = codec.decode(
- np.expand_dims(encoded['lifting_target_label'], axis=0),
+ encoded['lifting_target_label'],
target_root=lifting_target[..., 0, :])
- self.assertTrue(
- np.allclose(
- np.expand_dims(lifting_target, axis=0), _keypoints, atol=5.))
+ self.assertTrue(np.allclose(lifting_target, _keypoints, atol=5.))
# test removing root
codec = self.build_pose_lifting_label(remove_root=True)
@@ -125,12 +146,10 @@ def test_cicular_verification(self):
lifting_target_visible)
_keypoints, _ = codec.decode(
- np.expand_dims(encoded['lifting_target_label'], axis=0),
+ encoded['lifting_target_label'],
target_root=lifting_target[..., 0, :])
- self.assertTrue(
- np.allclose(
- np.expand_dims(lifting_target, axis=0), _keypoints, atol=5.))
+ self.assertTrue(np.allclose(lifting_target, _keypoints, atol=5.))
# test normalization
codec = self.build_pose_lifting_label(
@@ -142,9 +161,7 @@ def test_cicular_verification(self):
lifting_target_visible)
_keypoints, _ = codec.decode(
- np.expand_dims(encoded['lifting_target_label'], axis=0),
+ encoded['lifting_target_label'],
target_root=lifting_target[..., 0, :])
- self.assertTrue(
- np.allclose(
- np.expand_dims(lifting_target, axis=0), _keypoints, atol=5.))
+ self.assertTrue(np.allclose(lifting_target, _keypoints, atol=5.))
diff --git a/tests/test_codecs/test_motionbert_label.py b/tests/test_codecs/test_motionbert_label.py
new file mode 100644
index 0000000000..01c9c654a2
--- /dev/null
+++ b/tests/test_codecs/test_motionbert_label.py
@@ -0,0 +1,159 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from unittest import TestCase
+
+import numpy as np
+from mmengine.fileio import load
+
+from mmpose.codecs import MotionBERTLabel
+from mmpose.registry import KEYPOINT_CODECS
+
+
+class TestMotionBERTLabel(TestCase):
+
+ def get_camera_param(self, imgname, camera_param) -> dict:
+ """Get camera parameters of a frame by its image name."""
+ subj, rest = osp.basename(imgname).split('_', 1)
+ action, rest = rest.split('.', 1)
+ camera, rest = rest.split('_', 1)
+ return camera_param[(subj, camera)]
+
+ def build_pose_lifting_label(self, **kwargs):
+ cfg = dict(type='MotionBERTLabel', num_keypoints=17)
+ cfg.update(kwargs)
+ return KEYPOINT_CODECS.build(cfg)
+
+ def setUp(self) -> None:
+ keypoints = (0.1 + 0.8 * np.random.rand(1, 17, 2)) * [1000, 1002]
+ keypoints = np.round(keypoints).astype(np.float32)
+ keypoints_visible = np.random.randint(2, size=(1, 17))
+ lifting_target = (0.1 + 0.8 * np.random.rand(1, 17, 3))
+ lifting_target_visible = np.random.randint(
+ 2, size=(
+ 1,
+ 17,
+ ))
+ encoded_wo_sigma = np.random.rand(1, 17, 3)
+
+ camera_param = load('tests/data/h36m/cameras.pkl')
+ camera_param = self.get_camera_param(
+ 'S1/S1_Directions_1.54138969/S1_Directions_1.54138969_000001.jpg',
+ camera_param)
+ factor = 0.1 + 5 * np.random.rand(1, )
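+        # Random per-clip scale factor standing in for the precomputed
+        # projection factors that would be loaded from a factor file.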
+
+ self.data = dict(
+ keypoints=keypoints,
+ keypoints_visible=keypoints_visible,
+ lifting_target=lifting_target,
+ lifting_target_visible=lifting_target_visible,
+ camera_param=camera_param,
+ factor=factor,
+ encoded_wo_sigma=encoded_wo_sigma)
+
+ def test_build(self):
+ codec = self.build_pose_lifting_label()
+ self.assertIsInstance(codec, MotionBERTLabel)
+
+ def test_encode(self):
+ keypoints = self.data['keypoints']
+ keypoints_visible = self.data['keypoints_visible']
+ lifting_target = self.data['lifting_target']
+ lifting_target_visible = self.data['lifting_target_visible']
+ camera_param = self.data['camera_param']
+ factor = self.data['factor']
+
+ # test default settings
+ codec = self.build_pose_lifting_label()
+ encoded = codec.encode(keypoints, keypoints_visible, lifting_target,
+ lifting_target_visible, camera_param, factor)
+
+ self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2))
+ self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3))
+ self.assertEqual(encoded['lifting_target_weights'].shape, (
+ 1,
+ 17,
+ ))
+ self.assertEqual(encoded['trajectory_weights'].shape, (
+ 1,
+ 17,
+ ))
+
+ # test concatenating visibility
+ codec = self.build_pose_lifting_label(concat_vis=True)
+ encoded = codec.encode(keypoints, keypoints_visible, lifting_target,
+ lifting_target_visible, camera_param, factor)
+
+ self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 3))
+ self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3))
+
+ def test_decode(self):
+ encoded_wo_sigma = self.data['encoded_wo_sigma']
+ camera_param = self.data['camera_param']
+
+ # test default settings
+ codec = self.build_pose_lifting_label()
+
+ decoded, scores = codec.decode(encoded_wo_sigma)
+
+ self.assertEqual(decoded.shape, (1, 17, 3))
+ self.assertEqual(scores.shape, (1, 17))
+
+ # test denormalize according to image shape
+ codec = self.build_pose_lifting_label()
+
+ decoded, scores = codec.decode(
+ encoded_wo_sigma,
+ w=np.array([camera_param['w']]),
+ h=np.array([camera_param['h']]))
+
+ self.assertEqual(decoded.shape, (1, 17, 3))
+ self.assertEqual(scores.shape, (1, 17))
+
+ # test with factor
+ codec = self.build_pose_lifting_label()
+
+ decoded, scores = codec.decode(
+ encoded_wo_sigma, factor=np.array([0.23]))
+
+ self.assertEqual(decoded.shape, (1, 17, 3))
+ self.assertEqual(scores.shape, (1, 17))
+
+ def test_cicular_verification(self):
+ keypoints_visible = self.data['keypoints_visible']
+ lifting_target = self.data['lifting_target']
+ lifting_target_visible = self.data['lifting_target_visible']
+ camera_param = self.data['camera_param']
+
+ # test denormalize according to image shape
+ keypoints = (0.1 + 0.8 * np.random.rand(1, 17, 3))
+ codec = self.build_pose_lifting_label()
+ encoded = codec.encode(keypoints, keypoints_visible, lifting_target,
+ lifting_target_visible, camera_param)
+
+ _keypoints, _ = codec.decode(
+ encoded['keypoint_labels'],
+ w=np.array([camera_param['w']]),
+ h=np.array([camera_param['h']]))
+
+ keypoints[..., :, :] = keypoints[..., :, :] - keypoints[..., 0, :]
+
+ self.assertTrue(
+ np.allclose(keypoints[..., :2] / 1000, _keypoints[..., :2]))
+
+ # test with factor
+ keypoints = (0.1 + 0.8 * np.random.rand(1, 17, 3))
+ codec = self.build_pose_lifting_label()
+ encoded = codec.encode(keypoints, keypoints_visible, lifting_target,
+ lifting_target_visible, camera_param)
+
+ _keypoints, _ = codec.decode(
+ encoded['keypoint_labels'],
+ w=np.array([camera_param['w']]),
+ h=np.array([camera_param['h']]),
+ factor=encoded['factor'])
+
+ keypoints *= encoded['factor']
+ keypoints[..., :, :] = keypoints[..., :, :] - keypoints[..., 0, :]
+
+ self.assertTrue(
+ np.allclose(keypoints[..., :2] / 1000, _keypoints[..., :2]))
diff --git a/tests/test_codecs/test_video_pose_lifting.py b/tests/test_codecs/test_video_pose_lifting.py
index cc58292d0c..31a095e927 100644
--- a/tests/test_codecs/test_video_pose_lifting.py
+++ b/tests/test_codecs/test_video_pose_lifting.py
@@ -19,7 +19,8 @@ def get_camera_param(self, imgname, camera_param) -> dict:
return camera_param[(subj, camera)]
def build_pose_lifting_label(self, **kwargs):
- cfg = dict(type='VideoPoseLifting', num_keypoints=17)
+ cfg = dict(
+ type='VideoPoseLifting', num_keypoints=17, reshape_keypoints=False)
cfg.update(kwargs)
return KEYPOINT_CODECS.build(cfg)
@@ -27,8 +28,12 @@ def setUp(self) -> None:
keypoints = (0.1 + 0.8 * np.random.rand(1, 17, 2)) * [192, 256]
keypoints = np.round(keypoints).astype(np.float32)
keypoints_visible = np.random.randint(2, size=(1, 17))
- lifting_target = (0.1 + 0.8 * np.random.rand(17, 3))
- lifting_target_visible = np.random.randint(2, size=(17, ))
+ lifting_target = (0.1 + 0.8 * np.random.rand(1, 17, 3))
+ lifting_target_visible = np.random.randint(
+ 2, size=(
+ 1,
+ 17,
+ ))
encoded_wo_sigma = np.random.rand(1, 17, 3)
camera_param = load('tests/data/h36m/cameras.pkl')
@@ -61,10 +66,19 @@ def test_encode(self):
lifting_target_visible, camera_param)
self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2))
- self.assertEqual(encoded['lifting_target_label'].shape, (17, 3))
- self.assertEqual(encoded['lifting_target_weights'].shape, (17, ))
- self.assertEqual(encoded['trajectory_weights'].shape, (17, ))
- self.assertEqual(encoded['target_root'].shape, (3, ))
+ self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3))
+ self.assertEqual(encoded['lifting_target_weights'].shape, (
+ 1,
+ 17,
+ ))
+ self.assertEqual(encoded['trajectory_weights'].shape, (
+ 1,
+ 17,
+ ))
+ self.assertEqual(encoded['target_root'].shape, (
+ 1,
+ 3,
+ ))
# test not zero-centering
codec = self.build_pose_lifting_label(zero_center=False)
@@ -72,9 +86,31 @@ def test_encode(self):
lifting_target_visible, camera_param)
self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2))
- self.assertEqual(encoded['lifting_target_label'].shape, (17, 3))
- self.assertEqual(encoded['lifting_target_weights'].shape, (17, ))
- self.assertEqual(encoded['trajectory_weights'].shape, (17, ))
+ self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3))
+ self.assertEqual(encoded['lifting_target_weights'].shape, (
+ 1,
+ 17,
+ ))
+ self.assertEqual(encoded['trajectory_weights'].shape, (
+ 1,
+ 17,
+ ))
+
+ # test reshape_keypoints
+ codec = self.build_pose_lifting_label(reshape_keypoints=True)
+ encoded = codec.encode(keypoints, keypoints_visible, lifting_target,
+ lifting_target_visible, camera_param)
+
+ self.assertEqual(encoded['keypoint_labels'].shape, (34, 1))
+ self.assertEqual(encoded['lifting_target_label'].shape, (1, 17, 3))
+ self.assertEqual(encoded['lifting_target_weights'].shape, (
+ 1,
+ 17,
+ ))
+ self.assertEqual(encoded['trajectory_weights'].shape, (
+ 1,
+ 17,
+ ))
# test removing root
codec = self.build_pose_lifting_label(
@@ -84,10 +120,16 @@ def test_encode(self):
self.assertTrue('target_root_removed' in encoded
and 'target_root_index' in encoded)
- self.assertEqual(encoded['lifting_target_weights'].shape, (16, ))
+ self.assertEqual(encoded['lifting_target_weights'].shape, (
+ 1,
+ 16,
+ ))
self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2))
- self.assertEqual(encoded['lifting_target_label'].shape, (16, 3))
- self.assertEqual(encoded['target_root'].shape, (3, ))
+ self.assertEqual(encoded['lifting_target_label'].shape, (1, 16, 3))
+ self.assertEqual(encoded['target_root'].shape, (
+ 1,
+ 3,
+ ))
# test normalizing camera
codec = self.build_pose_lifting_label(normalize_camera=True)
@@ -102,6 +144,35 @@ def test_encode(self):
encoded['camera_param']['f'],
atol=4.))
+ # test with multiple targets
+ keypoints = (0.1 + 0.8 * np.random.rand(2, 17, 2)) * [192, 256]
+ keypoints = np.round(keypoints).astype(np.float32)
+ keypoints_visible = np.random.randint(2, size=(2, 17))
+ lifting_target = (0.1 + 0.8 * np.random.rand(2, 17, 3))
+ lifting_target_visible = np.random.randint(
+ 2, size=(
+ 2,
+ 17,
+ ))
+ codec = self.build_pose_lifting_label()
+ encoded = codec.encode(keypoints, keypoints_visible, lifting_target,
+ lifting_target_visible, camera_param)
+
+ self.assertEqual(encoded['keypoint_labels'].shape, (2, 17, 2))
+ self.assertEqual(encoded['lifting_target_label'].shape, (2, 17, 3))
+ self.assertEqual(encoded['lifting_target_weights'].shape, (
+ 2,
+ 17,
+ ))
+ self.assertEqual(encoded['trajectory_weights'].shape, (
+ 2,
+ 17,
+ ))
+ self.assertEqual(encoded['target_root'].shape, (
+ 2,
+ 3,
+ ))
+
def test_decode(self):
lifting_target = self.data['lifting_target']
encoded_wo_sigma = self.data['encoded_wo_sigma']
@@ -135,12 +206,10 @@ def test_cicular_verification(self):
lifting_target_visible, camera_param)
_keypoints, _ = codec.decode(
- np.expand_dims(encoded['lifting_target_label'], axis=0),
+ encoded['lifting_target_label'],
target_root=lifting_target[..., 0, :])
- self.assertTrue(
- np.allclose(
- np.expand_dims(lifting_target, axis=0), _keypoints, atol=5.))
+ self.assertTrue(np.allclose(lifting_target, _keypoints, atol=5.))
# test removing root
codec = self.build_pose_lifting_label(remove_root=True)
@@ -148,9 +217,7 @@ def test_cicular_verification(self):
lifting_target_visible, camera_param)
_keypoints, _ = codec.decode(
- np.expand_dims(encoded['lifting_target_label'], axis=0),
+ encoded['lifting_target_label'],
target_root=lifting_target[..., 0, :])
- self.assertTrue(
- np.allclose(
- np.expand_dims(lifting_target, axis=0), _keypoints, atol=5.))
+ self.assertTrue(np.allclose(lifting_target, _keypoints, atol=5.))
diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_h36m_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_h36m_dataset.py
index 88944dc11f..fd6cdf5f17 100644
--- a/tests/test_datasets/test_datasets/test_body_datasets/test_h36m_dataset.py
+++ b/tests/test_datasets/test_datasets/test_body_datasets/test_h36m_dataset.py
@@ -116,6 +116,17 @@ def test_topdown(self):
self.assertEqual(len(dataset), 4)
self.check_data_info_keys(dataset[0])
+ dataset = self.build_h36m_dataset(
+ data_mode='topdown',
+ seq_len=1,
+ seq_step=1,
+ multiple_target=1,
+ causal=False,
+ pad_video_seq=True,
+ camera_param_file='cameras.pkl')
+ self.assertEqual(len(dataset), 4)
+ self.check_data_info_keys(dataset[0])
+
# test topdown testing with 2d keypoint detection file and
# sequence config
dataset = self.build_h36m_dataset(
diff --git a/tests/test_datasets/test_transforms/test_pose3d_transforms.py b/tests/test_datasets/test_transforms/test_pose3d_transforms.py
index 5f5d5aa096..b87931bb74 100644
--- a/tests/test_datasets/test_transforms/test_pose3d_transforms.py
+++ b/tests/test_datasets/test_transforms/test_pose3d_transforms.py
@@ -35,7 +35,7 @@ def _parse_h36m_imgname(imgname):
scales = data['scale'].astype(np.float32)
idx = 0
- target_idx = 0
+ target_idx = [0]
data_info = {
'keypoints': keypoints[idx, :, :2].reshape(1, -1, 2),
@@ -52,7 +52,6 @@ def _parse_h36m_imgname(imgname):
'sample_idx': idx,
'lifting_target': keypoints_3d[target_idx, :, :3],
'lifting_target_visible': keypoints_3d[target_idx, :, 3],
- 'target_img_path': osp.join('tests/data/h36m', imgnames[target_idx]),
}
# add camera parameters
@@ -108,9 +107,12 @@ def test_transform(self):
tar_vis2 = results['lifting_target_visible']
self.assertEqual(kpts_vis2.shape, (1, 17))
- self.assertEqual(tar_vis2.shape, (17, ))
+ self.assertEqual(tar_vis2.shape, (
+ 1,
+ 17,
+ ))
self.assertEqual(kpts2.shape, (1, 17, 2))
- self.assertEqual(tar2.shape, (17, 3))
+ self.assertEqual(tar2.shape, (1, 17, 3))
flip_indices = [
0, 4, 5, 6, 1, 2, 3, 7, 8, 9, 10, 14, 15, 16, 11, 12, 13
@@ -121,12 +123,15 @@ def test_transform(self):
self.assertTrue(
np.allclose(kpts1[0][left][1:], kpts2[0][right][1:], atol=4.))
self.assertTrue(
- np.allclose(tar1[left][1:], tar2[right][1:], atol=4.))
+ np.allclose(
+ tar1[..., left, 1:], tar2[..., right, 1:], atol=4.))
self.assertTrue(
- np.allclose(kpts_vis1[0][left], kpts_vis2[0][right], atol=4.))
+ np.allclose(
+ kpts_vis1[..., left], kpts_vis2[..., right], atol=4.))
self.assertTrue(
- np.allclose(tar_vis1[left], tar_vis2[right], atol=4.))
+ np.allclose(
+ tar_vis1[..., left], tar_vis2[..., right], atol=4.))
# test camera flipping
transform = RandomFlipAroundRoot(
@@ -148,3 +153,23 @@ def test_transform(self):
-self.data_info['camera_param']['p'][0],
camera2['p'][0],
atol=4.))
+
+ # test flipping w.r.t. image
+ transform = RandomFlipAroundRoot({}, {}, flip_prob=1, flip_image=True)
+ results = deepcopy(self.data_info)
+ results = transform(results)
+ kpts2 = results['keypoints']
+ tar2 = results['lifting_target']
+
+ camera_param = results['camera_param']
+ for left, right in enumerate(flip_indices):
+ self.assertTrue(
+ np.allclose(
+ camera_param['w'] - kpts1[0][left][:1],
+ kpts2[0][right][:1],
+ atol=4.))
+ self.assertTrue(
+ np.allclose(kpts1[0][left][1:], kpts2[0][right][1:], atol=4.))
+ self.assertTrue(
+ np.allclose(
+ tar1[..., left, 1:], tar2[..., right, 1:], atol=4.))
diff --git a/tests/test_evaluation/test_metrics/test_keypoint_3d_metrics.py b/tests/test_evaluation/test_metrics/test_keypoint_3d_metrics.py
index 8289b09d0f..391b7b194a 100644
--- a/tests/test_evaluation/test_metrics/test_keypoint_3d_metrics.py
+++ b/tests/test_evaluation/test_metrics/test_keypoint_3d_metrics.py
@@ -20,9 +20,10 @@ def setUp(self):
for i in range(self.batch_size):
gt_instances = InstanceData()
keypoints = np.random.random((1, num_keypoints, 3))
- gt_instances.lifting_target = np.random.random((num_keypoints, 3))
+ gt_instances.lifting_target = np.random.random(
+ (1, num_keypoints, 3))
gt_instances.lifting_target_visible = np.ones(
- (num_keypoints, 1)).astype(bool)
+ (1, num_keypoints, 1)).astype(bool)
pred_instances = InstanceData()
pred_instances.keypoints = keypoints + np.random.normal(
@@ -32,8 +33,10 @@ def setUp(self):
data_sample = PoseDataSample(
gt_instances=gt_instances, pred_instances=pred_instances)
data_sample.set_metainfo(
- dict(target_img_path='tests/data/h36m/S7/'
- 'S7_Greeting.55011271/S7_Greeting.55011271_000396.jpg'))
+ dict(target_img_path=[
+ 'tests/data/h36m/S7/'
+ 'S7_Greeting.55011271/S7_Greeting.55011271_000396.jpg'
+ ]))
self.data_batch.append(data)
self.data_samples.append(data_sample.to_dict())
diff --git a/tests/test_models/test_backbones/test_dstformer.py b/tests/test_models/test_backbones/test_dstformer.py
new file mode 100644
index 0000000000..966ed6f49b
--- /dev/null
+++ b/tests/test_models/test_backbones/test_dstformer.py
@@ -0,0 +1,36 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import torch
+
+from mmpose.models.backbones import DSTFormer
+from mmpose.models.backbones.dstformer import AttentionBlock
+
+
+class TestDSTFormer(TestCase):
+
+ def test_attention_block(self):
+ # BasicTemporalBlock with causal == False
+ block = AttentionBlock(dim=256, num_heads=2)
+ x = torch.rand(2, 17, 256)
+ x_out = block(x)
+ self.assertEqual(x_out.shape, torch.Size([2, 17, 256]))
+
+ def test_DSTFormer(self):
+ # Test DSTFormer with depth=2
+ model = DSTFormer(in_channels=3, depth=2, seq_len=2)
+ pose3d = torch.rand((1, 2, 17, 3))
+ feat = model(pose3d)
+ self.assertEqual(feat[0].shape, (2, 17, 256))
+
+ # Test DSTFormer with depth=4 and qkv_bias=False
+ model = DSTFormer(in_channels=3, depth=4, seq_len=2, qkv_bias=False)
+ pose3d = torch.rand((1, 2, 17, 3))
+ feat = model(pose3d)
+ self.assertEqual(feat[0].shape, (2, 17, 256))
+
+ # Test DSTFormer with depth=4 and att_fuse=False
+ model = DSTFormer(in_channels=3, depth=4, seq_len=2, att_fuse=False)
+ pose3d = torch.rand((1, 2, 17, 3))
+ feat = model(pose3d)
+ self.assertEqual(feat[0].shape, (2, 17, 256))