From b2e9d9d4052a11f413c06f16669de5aeb9947e16 Mon Sep 17 00:00:00 2001
From: sunjiahao1999 <578431509@qq.com>
Date: Wed, 13 Dec 2023 20:36:53 +0800
Subject: [PATCH] resolve review & add ut & add doc

---
 docs/en/advanced_guides/datasets/waymo.md     |  70 +++++++++++----
 docs/zh_cn/advanced_guides/datasets/waymo.md  |  66 +++++++++++----
 .../waymo_utils/prediction_to_waymo.py        |  13 ++-
 mmdet3d/evaluation/metrics/waymo_metric.py    |  35 ++++----
 .../waymo/kitti_format/waymo_infos_train.pkl  | Bin 3445 -> 2020 bytes
 .../waymo/kitti_format/waymo_infos_val.pkl    | Bin 7000 -> 2020 bytes
 tests/test_datasets/test_waymo_dataset.py     |  80 ++++++++++++++++++
 tools/create_data.py                          |  10 +--
 tools/dataset_converters/waymo_converter.py   |  17 ++--
 9 files changed, 221 insertions(+), 70 deletions(-)
 create mode 100644 tests/test_datasets/test_waymo_dataset.py

diff --git a/docs/en/advanced_guides/datasets/waymo.md b/docs/en/advanced_guides/datasets/waymo.md
index 2e52b9dd10..5453f8912f 100644
--- a/docs/en/advanced_guides/datasets/waymo.md
+++ b/docs/en/advanced_guides/datasets/waymo.md
@@ -7,12 +7,7 @@ This page provides specific tutorials about the usage of MMDetection3D for Waymo
 Before preparing Waymo dataset, if you only installed requirements in `requirements/build.txt` and `requirements/runtime.txt` before, please install the official package for this dataset at first by running
 
 ```
-# tf 2.1.0.
-pip install waymo-open-dataset-tf-2-1-0==1.2.0
-# tf 2.0.0
-# pip install waymo-open-dataset-tf-2-0-0==1.2.0
-# tf 1.15.0
-# pip install waymo-open-dataset-tf-1-15-0==1.2.0
+pip install waymo-open-dataset-tf-2-6-0
 ```
 
 or
@@ -38,15 +33,19 @@ mmdetection3d
 │   │   │   ├── validation
 │   │   │   ├── testing
 │   │   │   ├── gt.bin
+│   │   │   ├── cam_gt.bin
+│   │   │   ├── fov_gt.bin
 │   │   ├── kitti_format
 │   │   │   ├── ImageSets
 
 ```
 
-You can download Waymo open dataset V1.2 [HERE](https://waymo.com/open/download/) and its data split [HERE](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing). Then put `tfrecord` files into corresponding folders in `data/waymo/waymo_format/` and put the data split txt files into `data/waymo/kitti_format/ImageSets`. Download ground truth bin files for validation set [HERE](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects) and put it into `data/waymo/waymo_format/`. A tip is that you can use `gsutil` to download the large-scale dataset with commands. You can take this [tool](https://github.com/RalphMao/Waymo-Dataset-Tool) as an example for more details. Subsequently, prepare Waymo data by running
+You can download Waymo open dataset V1.4 [HERE](https://waymo.com/open/download/) and its data split [HERE](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing). Then put `tfrecord` files into corresponding folders in `data/waymo/waymo_format/` and put the data split txt files into `data/waymo/kitti_format/ImageSets`. Download ground truth bin files for validation set [HERE](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects) and put it into `data/waymo/waymo_format/`. A tip is that you can use `gsutil` to download the large-scale dataset with commands. You can take this [tool](https://github.com/RalphMao/Waymo-Dataset-Tool) as an example for more details. Subsequently, prepare Waymo data by running
 
 ```bash
-python tools/create_data.py waymo --root-path ./data/waymo/ --out-dir ./data/waymo/ --workers 128 --extra-tag waymo
+# TF_CPP_MIN_LOG_LEVEL=3 will disable all logging output from TensorFlow.
+# The number of `--workers` depends on the maximum number of cores in your CPU.
+TF_CPP_MIN_LOG_LEVEL=3 python tools/create_data.py waymo --root-path ./data/waymo --out-dir ./data/waymo --workers 128 --extra-tag waymo --version v1.4
 ```
 
 Note that if your local disk does not have enough space for saving converted data, you can change the `--out-dir` to anywhere else. Just remember to create folders and prepare data there in advance and link them back to `data/waymo/kitti_format` after the data conversion.
@@ -65,22 +64,16 @@ mmdetection3d
 │   │   │   ├── validation
 │   │   │   ├── testing
 │   │   │   ├── gt.bin
+│   │   │   ├── cam_gt.bin
+│   │   │   ├── fov_gt.bin
 │   │   ├── kitti_format
 │   │   │   ├── ImageSets
 │   │   │   ├── training
-│   │   │   │   ├── calib
 │   │   │   │   ├── image_0
 │   │   │   │   ├── image_1
 │   │   │   │   ├── image_2
 │   │   │   │   ├── image_3
 │   │   │   │   ├── image_4
-│   │   │   │   ├── label_0
-│   │   │   │   ├── label_1
-│   │   │   │   ├── label_2
-│   │   │   │   ├── label_3
-│   │   │   │   ├── label_4
-│   │   │   │   ├── label_all
-│   │   │   │   ├── pose
 │   │   │   │   ├── velodyne
 │   │   │   ├── testing
 │   │   │   │   ├── (the same as training)
@@ -93,7 +86,48 @@ mmdetection3d
 
 ```
 
-Here because there are several cameras, we store the corresponding image and labels that can be projected to that camera respectively and save pose for further usage of consecutive frames point clouds. We use a coding way `{a}{bbb}{ccc}` to name the data for each frame, where `a` is the prefix for different split (`0` for training, `1` for validation and `2` for testing), `bbb` for segment index and `ccc` for frame index. You can easily locate the required frame according to this naming rule. We gather the data for training and validation together as KITTI and store the indices for different set in the `ImageSet` files.
+- `kitti_format/training/image_{0-4}/{a}{bbb}{ccc}.jpg`: Because there are several cameras, we store the images of each camera in a separate folder. We use the encoding `{a}{bbb}{ccc}` to name the data of each frame, where `a` is the prefix for the split (`0` for training, `1` for validation and `2` for testing), `bbb` is the segment index and `ccc` is the frame index. You can easily locate the required frame according to this naming rule. We gather the data for training and validation together as KITTI does and store the indices of the different sets in the `ImageSets` files.
+- `kitti_format/training/velodyne/{a}{bbb}{ccc}.bin` point cloud data for each frame.
+- `kitti_format/waymo_gt_database/xxx_{Car/Pedestrian/Cyclist}_x.bin`. point cloud data included in each 3D bounding box of the training dataset. These point clouds will be used in data augmentation e.g. `ObjectSample`. `xxx` is the index of training samples and `x` is the index of objects in this frame.
+- `kitti_format/waymo_infos_train.pkl`: training dataset information, a dict containing two keys: `metainfo` and `data_list`. `metainfo` contains the basic information of the dataset itself, such as `dataset`, `version` and `info_version`, while `data_list` is a list of dicts; each dict (hereinafter referred to as `info`) contains all the detailed information of a single sample as follows:
+  - info\['sample_idx'\]: The index of this sample in the whole dataset.
+  - info\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list).
+  - info\['timestamp'\]: Timestamp of the sample data.
+  - info\['context_name'\]: The context name of the sample, indicating which `*.tfrecord` segment it was extracted from.
+  - info\['lidar_points'\]: A dict containing all the information related to the lidar points.
+    - info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
+    - info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
+  - info\['lidar_sweeps'\]: A list contains sweeps information of lidar
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['lidar_path'\]: The lidar data path of i-th sweep.
+    - info\['lidar_sweeps'\]\[i\]\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
+    - info\['lidar_sweeps'\]\[i\]\['timestamp'\]: Timestamp of the sweep data.
+  - info\['images'\]: A dict containing five keys, one for each camera: `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_SIDE_LEFT'`, `'CAM_SIDE_RIGHT'`. Each value is a dict containing all the data information related to the corresponding camera.
+    - info\['images'\]\['CAM_XXX'\]\['img_path'\]: The filename of the image.
+    - info\['images'\]\['CAM_XXX'\]\['height'\]: The height of the image.
+    - info\['images'\]\['CAM_XXX'\]\['width'\]: The width of the image.
+    - info\['images'\]\['CAM_XXX'\]\['cam2img'\]: The transformation matrix recording the intrinsic parameters when projecting 3D points to each image plane. (4x4 list)
+    - info\['images'\]\['CAM_XXX'\]\['lidar2cam'\]: The transformation matrix from lidar sensor to this camera. (4x4 list)
+    - info\['images'\]\['CAM_XXX'\]\['lidar2img'\]: The transformation matrix from lidar sensor to each image plane. (4x4 list)
+  - info\['image_sweeps'\]: A list contains sweeps information of images.
+    - info\['image_sweeps'\]\[i\]\['images'\]\['CAM_XXX'\]\['img_path'\]: The image path of i-th sweep.
+    - info\['image_sweeps'\]\[i\]\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
+    - info\['image_sweeps'\]\[i\]\['timestamp'\]: Timestamp of the sweep data.
+  - info\['instances'\]: It is a list of dict. Each dict contains all annotation information of single instance. For the i-th instance:
+    - info\['instances'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box of the instance, in (x, y, z, l, w, h, yaw) order.
+    - info\['instances'\]\[i\]\['bbox'\]: List of 4 numbers representing the 2D bounding box of the instance, in (x1, y1, x2, y2) order. (some instances may not have a corresponding 2D bounding box)
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: An int indicating the label of the instance; -1 indicates that the instance is ignored.
+    - info\['instances'\]\[i\]\['bbox_label'\]: An int indicating the label of the instance; -1 indicates that the instance is ignored.
+    - info\['instances'\]\[i\]\['num_lidar_pts'\]: Number of lidar points included in each 3D bounding box.
+    - info\['instances'\]\[i\]\['camera_id'\]: The index of the most visible camera for this instance.
+    - info\['instances'\]\[i\]\['group_id'\]: The index of this instance in this sample.
+  - info\['cam_sync_instances'\]: A list of dicts. Each dict contains all the annotation information of a single instance. Its format is the same as that of \['instances'\]. However, \['cam_sync_instances'\] is only used for the multi-view camera-based 3D object detection task.
+  - info\['cam_instances'\]: It is a dict containing keys `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_SIDE_LEFT'`, `'CAM_SIDE_RIGHT'`. For monocular camera-based 3D Object Detection task, we split 3D annotations of the whole scenes according to the camera they belong to. For the i-th instance:
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box of the instance, in (x, y, z, l, h, w, yaw) order.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox'\]: 2D bounding box annotation (exterior rectangle of the projected 3D box), a list arrange as \[x1, y1, x2, y2\].
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label_3d'\]: Label of instance.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label'\]: Label of instance.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['center_2d'\]: Projected center location on the image, a list has shape (2,).
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['depth'\]: The depth of projected center.
 
 ## Training
 
@@ -101,7 +135,7 @@ Considering there are many similar frames in the original dataset, we can basica
 
 ## Evaluation
 
-For evaluation on Waymo, please follow the [instruction](https://github.com/waymo-research/waymo-open-dataset/blob/master/docs/quick_start.md) to build the binary file `compute_detection_metrics_main` for metrics computation and put it into `mmdet3d/core/evaluation/waymo_utils/`.  Basically, you can follow the commands below to install `bazel` and build the file.
+For evaluation on Waymo, please follow the [instruction](https://github.com/waymo-research/waymo-open-dataset/blob/r1.3/docs/quick_start.md) to build the binary file `compute_detection_metrics_main` for metrics computation and put it into `mmdet3d/core/evaluation/waymo_utils/`.  Basically, you can follow the commands below to install `bazel` and build the file.
 
 ```shell
 # download the code and enter the base directory
diff --git a/docs/zh_cn/advanced_guides/datasets/waymo.md b/docs/zh_cn/advanced_guides/datasets/waymo.md
index 577ec1513a..8c0f0dfc0f 100644
--- a/docs/zh_cn/advanced_guides/datasets/waymo.md
+++ b/docs/zh_cn/advanced_guides/datasets/waymo.md
@@ -7,12 +7,7 @@
 在准备 Waymo 数据集之前，如果您之前只安装了 `requirements/build.txt` 和 `requirements/runtime.txt` 中的依赖，请通过运行如下指令额外安装 Waymo 数据集所依赖的官方包：
 
 ```
-# tf 2.1.0.
-pip install waymo-open-dataset-tf-2-1-0==1.2.0
-# tf 2.0.0
-# pip install waymo-open-dataset-tf-2-0-0==1.2.0
-# tf 1.15.0
-# pip install waymo-open-dataset-tf-1-15-0==1.2.0
+pip install waymo-open-dataset-tf-2-6-0
 ```
 
 或者
@@ -38,6 +33,8 @@ mmdetection3d
 │   │   │   ├── validation
 │   │   │   ├── testing
 │   │   │   ├── gt.bin
+│   │   │   ├── cam_gt.bin
+│   │   │   ├── fov_gt.bin
 │   │   ├── kitti_format
 │   │   │   ├── ImageSets
 
@@ -46,7 +43,9 @@ mmdetection3d
 您可以在[这里](https://waymo.com/open/download/)下载 1.2 版本的 Waymo 公开数据集，并在[这里](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing)下载其训练/验证/测试集拆分文件。接下来，请将 `tfrecord` 文件放入 `data/waymo/waymo_format/` 下的对应文件夹，并将 txt 格式的数据集拆分文件放入 `data/waymo/kitti_format/ImageSets`。在[这里](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects)下载验证集使用的 bin 格式真实标注 (Ground Truth) 文件并放入 `data/waymo/waymo_format/`。小窍门：您可以使用 `gsutil` 来在命令行下载大规模数据集。您可以将该[工具](https://github.com/RalphMao/Waymo-Dataset-Tool) 作为一个例子来查看更多细节。之后，通过运行如下指令准备 Waymo 数据：
 
 ```bash
-python tools/create_data.py waymo --root-path ./data/waymo/ --out-dir ./data/waymo/ --workers 128 --extra-tag waymo
+# TF_CPP_MIN_LOG_LEVEL=3 will disable all logging output from TensorFlow.
+# The number of `--workers` depends on the maximum number of cores in your CPU.
+TF_CPP_MIN_LOG_LEVEL=3 python tools/create_data.py waymo --root-path ./data/waymo --out-dir ./data/waymo --workers 128 --extra-tag waymo --version v1.4
 ```
 
 请注意，如果您的本地磁盘没有足够空间保存转换后的数据，您可以将 `--out-dir` 改为其他目录；只要在创建文件夹、准备数据并转换格式后，将数据文件链接到 `data/waymo/kitti_format` 即可。
@@ -65,22 +64,16 @@ mmdetection3d
 │   │   │   ├── validation
 │   │   │   ├── testing
 │   │   │   ├── gt.bin
+│   │   │   ├── cam_gt.bin
+│   │   │   ├── fov_gt.bin
 │   │   ├── kitti_format
 │   │   │   ├── ImageSets
 │   │   │   ├── training
-│   │   │   │   ├── calib
 │   │   │   │   ├── image_0
 │   │   │   │   ├── image_1
 │   │   │   │   ├── image_2
 │   │   │   │   ├── image_3
 │   │   │   │   ├── image_4
-│   │   │   │   ├── label_0
-│   │   │   │   ├── label_1
-│   │   │   │   ├── label_2
-│   │   │   │   ├── label_3
-│   │   │   │   ├── label_4
-│   │   │   │   ├── label_all
-│   │   │   │   ├── pose
 │   │   │   │   ├── velodyne
 │   │   │   ├── testing
 │   │   │   │   ├── (the same as training)
@@ -93,7 +86,48 @@ mmdetection3d
 
 ```
 
-因为 Waymo 数据的来源包含数个相机，这里我们将每个相机对应的图像和标签文件分别存储，并将相机位姿 (pose) 文件存储下来以供后续处理连续多帧的点云。我们使用 `{a}{bbb}{ccc}` 的名称编码方式为每帧数据命名，其中 `a` 是不同数据拆分的前缀（`0` 指代训练集，`1` 指代验证集，`2` 指代测试集），`bbb` 是分割部分 (segment) 的索引，而 `ccc` 是帧索引。您可以轻而易举地按照如上命名规则定位到所需的帧。我们将训练和验证所需数据按 KITTI 的方式集合在一起，然后将训练集/验证集/测试集的索引存储在 `ImageSet` 下的文件中。
+- `kitti_format/training/image_{0-4}/{a}{bbb}{ccc}.jpg` 因为 Waymo 数据的来源包含数个相机，这里我们将每个相机对应的图像分别存储。我们使用 `{a}{bbb}{ccc}` 的名称编码方式为每帧数据命名，其中 `a` 是不同数据拆分的前缀（`0` 指代训练集，`1` 指代验证集，`2` 指代测试集），`bbb` 是分割部分 (segment) 的索引，而 `ccc` 是帧索引。您可以轻而易举地按照如上命名规则定位到所需的帧。我们将训练和验证所需数据按 KITTI 的方式集合在一起，然后将训练集/验证集/测试集的索引存储在 `ImageSets` 下的文件中。
+- `kitti_format/training/velodyne/{a}{bbb}{ccc}.bin` 当前样本的点云数据
+- `kitti_format/waymo_gt_database/xxx_{Car/Pedestrian/Cyclist}_x.bin`. 训练数据集的每个 3D 包围框中包含的点云数据。这些点云会在数据增强中被使用，例如. `ObjectSample`. `xxx` 表示训练样本的索引，`x` 表示实例在当前样本中的索引。
+- `kitti_format/waymo_infos_train.pkl`. 训练数据集，该字典包含了两个键值：`metainfo` 和 `data_list`。`metainfo` 包含数据集的基本信息，例如 `dataset`、`version` 和 `info_version`。`data_list` 是由字典组成的列表，每个字典（以下简称 `info`）包含了单个样本的所有详细信息。:
+  - info\['sample_idx'\]: 样本在整个数据集的索引。
+  - info\['ego2global'\]: 自车到全局坐标的变换矩阵。（4x4 列表）
+  - info\['timestamp'\]：样本数据时间戳。
+  - info\['context_name'\]: 语境名，表示样本从哪个 `*.tfrecord` 片段中提取的。
+  - info\['lidar_points'\]: 是一个字典，包含了所有与激光雷达点相关的信息。
+    - info\['lidar_points'\]\['lidar_path'\]: 激光雷达点云数据的文件名。
+    - info\['lidar_points'\]\['num_pts_feats'\]: 点的特征维度。
+  - info\['lidar_sweeps'\]: 是一个列表，包含了历史帧信息。
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['lidar_path'\]: 第 i 帧的激光雷达数据的文件路径。
+    - info\['lidar_sweeps'\]\[i\]\['ego2global'\]: 第 i 帧的自车到全局坐标的变换矩阵。（4x4 列表）
+    - info\['lidar_sweeps'\]\[i\]\['timestamp'\]: 第 i 帧的样本数据时间戳。
+  - info\['images'\]: 是一个字典，包含与每个相机对应的五个键值：`'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_SIDE_LEFT'`, `'CAM_SIDE_RIGHT'`。每个字典包含了对应相机的所有数据信息。
+    - info\['images'\]\['CAM_XXX'\]\['img_path'\]: 图像的文件名。
+    - info\['images'\]\['CAM_XXX'\]\['height'\]: 图像的高
+    - info\['images'\]\['CAM_XXX'\]\['width'\]: 图像的宽
+    - info\['images'\]\['CAM_XXX'\]\['cam2img'\]: 当 3D 点投影到图像平面时需要的内参信息相关的变换矩阵。（3x3 列表）
+    - info\['images'\]\['CAM_XXX'\]\['lidar2cam'\]: 激光雷达传感器到该相机的变换矩阵。（4x4 列表）
+    - info\['images'\]\['CAM_XXX'\]\['lidar2img'\]: 激光雷达传感器到图像平面的变换矩阵。（4x4 列表）
+  - info\['image_sweeps'\]: 是一个列表，包含了历史帧信息。
+    - info\['image_sweeps'\]\[i\]\['images'\]\['CAM_XXX'\]\['img_path'\]: 第i帧的图像的文件名.
+    - info\['image_sweeps'\]\[i\]\['ego2global'\]: 第 i 帧的自车到全局坐标的变换矩阵。（4x4 列表）
+    - info\['image_sweeps'\]\[i\]\['timestamp'\]: 第 i 帧的样本数据时间戳。
+  - info\['instances'\]: 是一个字典组成的列表。每个字典包含单个实例的所有标注信息。对于其中的第 i 个实例，我们有：
+    - info\['instances'\]\[i\]\['bbox_3d'\]: 长度为 7 的列表，以 (x, y, z, l, w, h, yaw) 的顺序表示实例的 3D 边界框。
+    - info\['instances'\]\[i\]\['bbox'\]: 2D 边界框标注，顺序为 \[x1, y1, x2, y2\] 的列表。有些实例可能没有对应的 2D 边界框标注。
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: 整数表示实例的标签，-1 代表忽略。
+    - info\['instances'\]\[i\]\['bbox_label'\]: 整数表示实例的标签，-1 代表忽略。
+    - info\['instances'\]\[i\]\['num_lidar_pts'\]: 每个 3D 边界框内包含的激光雷达点数。
+    - info\['instances'\]\[i\]\['camera_id'\]: 当前实例最可见相机的索引。
+    - info\['instances'\]\[i\]\['group_id'\]: 当前实例在当前样本中的索引。
+  - info\['cam_sync_instances'\]: 是一个字典组成的列表。每个字典包含单个实例的所有标注信息。它的形式与 \['instances'\]相同. 但是, \['cam_sync_instances'\] 专门用于基于多视角相机的三维目标检测任务。
+  - info\['cam_instances'\]: 是一个字典，包含以下键值： `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_SIDE_LEFT'`, `'CAM_SIDE_RIGHT'`. 对于基于视觉的 3D 目标检测任务，我们将整个场景的 3D 标注划分至它们所属于的相应相机中。对于其中的第 i 个实例，我们有：
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_3d'\]: 长度为 7 的列表，以 (x, y, z, l, h, w, yaw) 的顺序表示实例的 3D 边界框。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox'\]: 2D 边界框标注（3D 框投影的矩形框），顺序为 \[x1, y1, x2, y2\] 的列表。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label_3d'\]: 实例标签。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label'\]: 实例标签。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['center_2d'\]: 3D 框投影到图像上的中心点，大小为 (2, ) 的列表。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['depth'\]: 3D 框投影中心的深度。
 
 ## 训练
 
diff --git a/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py b/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py
index c1729e7b89..3c79d6f6cb 100644
--- a/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py
+++ b/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py
@@ -52,19 +52,18 @@ def __init__(self,
             'Cyclist': label_pb2.Label.TYPE_CYCLIST,
         }
 
-    def convert_one(self, res_index: int):
+    def convert_one(self, res_idx: int):
         """Convert action for single file. It read the metainfo from the
         preprocessed file offline and will be faster.
 
         Args:
-            res_index (int): The indices of the results.
+            res_idx (int): The indices of the results.
         """
-        sample_idx = self.results[res_index]['sample_idx']
-        if len(self.results[res_index]['labels_3d']) > 0:
+        sample_idx = self.results[res_idx]['sample_idx']
+        if len(self.results[res_idx]['labels_3d']) > 0:
             objects = self.parse_objects_from_origin(
-                self.results[res_index],
-                self.results[res_index]['context_name'],
-                self.results[res_index]['timestamp'])
+                self.results[res_idx], self.results[res_idx]['context_name'],
+                self.results[res_idx]['timestamp'])
         else:
             print(sample_idx, 'not found.')
             objects = metrics_pb2.Objects()
diff --git a/mmdet3d/evaluation/metrics/waymo_metric.py b/mmdet3d/evaluation/metrics/waymo_metric.py
index 41fe429ba8..cdbc4a58db 100644
--- a/mmdet3d/evaluation/metrics/waymo_metric.py
+++ b/mmdet3d/evaluation/metrics/waymo_metric.py
@@ -6,6 +6,7 @@
 import numpy as np
 import torch
 from mmengine import Config
+from mmengine.device import get_device
 from mmengine.evaluator import BaseMetric
 from mmengine.logging import MMLogger, print_log
 
@@ -39,6 +40,9 @@ class WaymoMetric(BaseMetric):
             evaluation. It is useful when you want to format the result to a
             specific format and submit it to the test server.
             Defaults to False.
+        nms_cfg (dict): The configuration of non-maximum suppression for
+            the mergence of multi-image predicted bboxes, only use when
+            load_type == 'mv_image_based'. Defaults to None.
     """
     num_cams = 5
     default_prefix = 'Waymo metric'
@@ -49,17 +53,22 @@ def __init__(self,
                  load_type: str = 'frame_based',
                  result_prefix: Optional[str] = None,
                  format_only: bool = False,
+                 nms_cfg=None,
                  **kwargs) -> None:
         super().__init__(**kwargs)
         self.waymo_bin_file = waymo_bin_file
         self.metrics = metric if isinstance(metric, list) else [metric]
         self.load_type = load_type
-        self.format_only = format_only
         self.result_prefix = result_prefix
+        self.format_only = format_only
         if self.format_only:
             assert result_prefix is not None, 'result_prefix must be not '
             'None when format_only is True, otherwise the result files will '
             'be saved to a temp directory which will be cleaned up at the end.'
+        if nms_cfg is not None:
+            assert load_type == 'mv_image_based', 'nms_cfg in WaymoMetric '
+            'only use when load_type == \'mv_image_based\'.'
+            self.nms_cfg = Config(nms_cfg)
 
     def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
         """Process one batch of data samples and predictions.
@@ -305,7 +314,7 @@ def format_results(
                                      self.classes)
         converter.convert()
 
-    def merge_multi_view_boxes(self, frame_results) -> dict:
+    def merge_multi_view_boxes(self, frame_results: List[dict]) -> dict:
         """Merge bounding boxes predicted from multi-view images.
 
         Args:
@@ -332,19 +341,12 @@ def merge_multi_view_boxes(self, frame_results) -> dict:
             bboxes_3d = np.concatenate(bboxes_3d)
             scores_3d = np.concatenate(scores_3d)
             labels_3d = np.concatenate(labels_3d)
-            nms_cfg = dict(
-                use_rotate_nms=True,
-                nms_across_levels=False,
-                nms_pre=500,
-                nms_thr=0.05,
-                score_thr=0.001,
-                min_bbox_size=0,
-                max_per_frame=100)
-            nms_cfg = Config(nms_cfg)
+
+            device = get_device()
             lidar_boxes3d = LiDARInstance3DBoxes(
-                torch.from_numpy(bboxes_3d).cuda())
-            scores = torch.from_numpy(scores_3d).cuda()
-            labels = torch.from_numpy(labels_3d).long().cuda()
+                torch.from_numpy(bboxes_3d).to(device))
+            scores = torch.from_numpy(scores_3d).to(device)
+            labels = torch.from_numpy(labels_3d).long().to(device)
             nms_scores = scores.new_zeros(scores.shape[0],
                                           len(self.classes) + 1)
             indices = labels.new_tensor(list(range(scores.shape[0])))
@@ -352,8 +354,9 @@ def merge_multi_view_boxes(self, frame_results) -> dict:
             lidar_boxes3d_for_nms = xywhr2xyxyr(lidar_boxes3d.bev)
             boxes3d = lidar_boxes3d.tensor
             bboxes_3d, scores_3d, labels_3d = box3d_multiclass_nms(
-                boxes3d, lidar_boxes3d_for_nms, nms_scores, nms_cfg.score_thr,
-                nms_cfg.max_per_frame, nms_cfg)
+                boxes3d, lidar_boxes3d_for_nms, nms_scores,
+                self.nms_cfg.score_thr, self.nms_cfg.max_per_frame,
+                self.nms_cfg)
 
             merged_result['bboxes_3d'] = bboxes_3d.cpu().numpy()
             merged_result['scores_3d'] = scores_3d.cpu().numpy()
diff --git a/tests/data/waymo/kitti_format/waymo_infos_train.pkl b/tests/data/waymo/kitti_format/waymo_infos_train.pkl
index e89255005950effc80d8d4bab100bfef12856f30..f2f587c6ad343c69b1276de411fe8673a11e0d1f 100644
GIT binary patch
literal 2020
zcma)7TWl0n7~U<E!g94J7X_)JIEt>@3rmsOoyui^(jp7hK^bRfcTaZ)c4qdUne8QH
z>jI$`g_0JGw!W;1Ml>-dl9(vSZgxpDpu`w8zL?ZVVqy%{2O=+0|8u4{jSoE!Gv9pY
zJOBA_=S;RN10GJC=ka*rilfL$&33>m0n)QL(TB9H=&2-;wRjiIZuT&xqv^zUkcaYD
zD~d<q=GCL+OiL1GO(JPVl_boOpn{WNE+>R;ic$^@3Hj@lHzE{-uI)X-K;i5=7jwuE
z3Fe{j>mO#1;Fk~@U3nPa2;M%ngQtMMGE^QxfI=T9<gcCmt%}m2flF(621XkXL@3~c
zmXSk%!ugzlfQ|nFp+DRyLc|4#Vy5AcE=M*LoxlQd9a7bSK=sDJra*PDK2TQ|tlx-3
z4%XHO<Uk-G2kY>G02YdKP^{@ng4nPq1B+RTwluZK+rxWW_rsD5+{KA=nI<8p6i0<h
zaXx<HVmkbXQVCefjHpCQs1DrS;wcx)QPHWzQGHKKwu0GAYH`J?i77fPlVCX}cLk<G
z`&^BnRLGCsdAC48XmDSz%X}EzlSKwou@Z%y1H&#o*m}_gqj&nNa9-d&f4`MOfWmvR
z3ZrA+j!-&(=-h@Z1D$ObT;{gPMUX+^eKSpN1?th|8ck2YDhXDj+xK$<pIg3(TAC^h
zf4a20YC40PteLjS<O=@ycOU2eBdnSx#3?p}i3(5dfu*K>?xvCY^MC{oqCacVpVMa!
z7I01|<W_wep@0vWo2wY}=9yo!2+*I0P<TB3MTF5?Qa=_L$gTd2G5iO^Z6nB_@Zp)I
zk)&<be9R*`Wom|F!@3MS;-0*5nsyBLua1do20SV*#J!BPE~gw@ZYK&ZeXMy-nuVwt
zcsz|5HspE<9?!rNZUZJPGo8Yd4E|;>(_vwVrBLQcJleC-U^HrWK|q3P47LCAm%p!B
z+92?qr*B-}xx^9p!JoQTj$P$33;M0CuOIn_uv1Uh$Vnwil2C(YXvG;-8;4p6>d;c?
z*3UU)g+kf)BO?^>-uKoQ7+CSzM_C5S#w>i})V!gg90mDnZO14eEYhh>WVE{h_nC-)
zJWUvXuw{cct%#Ml2qTm2ZX+g7rDCH5^%>a2sTjSA@v0c+Q!@)w#kf?AWiyTCubv2Q
zfU#V^%>eH`?-`*CKZ96Q^jsbHlLkQjoN}=t15c|<BxuY)h@<}TTntN`>h{M>i}>|)
z($N&lQo4P1Oi3!151QP0U)<?V5g$AwmZ8!1df0NZ?-ke@hHYH)EYXWx18i??ZQV77
zgFTu(4hYeh95dr2X8VXife&`DhV<8$hI>kVr4z+|9(J<3DXBP`Q4PC#p*ak@d!Z!^
zt!k6HO?6kSEq+oOF1=Aa51l;hQBn46820u;I1Ky7WsA^@(opea>HX5V#?q<cAP@Vg
zuq_PF^}_RE5U1I*il-9A%caZh#p662pt2Xj@M13<41+`_%JCAG!~2VsMZ-e=s|(BU
zja6vWdkPFpmSdu$_=z6N-@4L5HH4PKJqZNwe@m{3gDgP=?GA)0v&_oNe_o1EDZ7#5
zB!#C@ncPHNj8o&yE^Bs&g3SogK6c%Akfa&YoNkD*gr^l7dz1bLbmAzQ(QZOK1B7Ge
zh5mbO;y}B5?<?K92?;jSK`dJ{4N%!p>!dSsN438WG*&5UN1o6eVwA4OiBlVQRfp4Q
F|33rVS$zNi

literal 3445
zcmb`Kd3Y4%9mh8Zkc1$qV!?_>0Wnsa1cMN(kVPvhs{vk#3}{?tb|=}T+1dT>z6*&o
zZ6J3b!Guc=2@%jnZWT*G*!K7Kw5`4G`@Zdc{nftj%+3HM+K1<H_L-fTci#8=ozL(0
z&ilU8np+273vx=eLZ8paqcJnZ^;iPAz2j&--gs^O7;TKa5sm2$Ch}U4?^Lv}gVAsk
z#sqv?ky|5}(0K%7wW&%gsYk7-)llx#tthW>wz7oPfN|P*uT3hVCrlKCtaL2VTo#Ha
z&9Yd!kw^7pQg1E_Ce2XXO7Ubm#EVeq)GD&V&RrA*d>t4c3WnlgGn6VaEp<|aVl7AN
z8<?PtmB&V%N3B`7q7xGtO1kh4hAT@0If2{`T$L3J^X7zXH%ZG?&ovuY_jGsRoeYyp
z19`UA-I!8SQ?uwla<l&eK8|aQo=CABIG+p>wT|=o@UDooxt3un$8|>9#!SSc77vCR
z<LNN2Z@~?vTCp^dxWRf;=jo)G!i|Bk+5}m+#%zp-n=R7`<h!-&WSJL8_of#~BWht<
z+D=eNZ;Tq4-hvsWHo}q$+!QH{*x{KOk<2$Ulq%-@Kz=7~i8!G3Rd|mAy_MlM2^zT+
z@x3-;zkG}uqeQ!NGVSxrb9-B~j@rr}5^dDy&z5)xuP&Vs!(xQC(Tid$wOvxrYha&<
z<JA`zyxEnkh%y!Da)t_yS%!l=3%9$-l?r(_!yO*-oXBk|S%Xqj?4bRLdDJUT(~#fw
zpgp4fMmnBsqJFQ;y?1egIIYjWZydE>Qq8UI950JzuA-@8xRc|3S(@8%mrHZEqM64q
z-=nE^Xr7W*V#~uu$&~RIBu4a8w-^+A#Xj0FN^;IOphNSQKQ@a_zP*|8E0zTe_i)^s
zWvRsbU6%V4%R+|xJ(dR?mOjPO6_`ssGQJnk)QLR{^P1RbX}O?wnzl^Ie>6YtcY^<B
z|Cg1N2NlR7h5*MyS&-Ry*afLkAd4B6cpyt1kX~s{ho`O#-*sV9*5KL+tCo#IalYzA
z+EYo-(#xsX+eyI`#Ul*MIBE?|4pBZ<p$1isM_rWV3PodB;i0T_P};pXUQXb=MsIm?
zIf{5pK|Ib-$MJ-rT_J^+C4*`_4Dxs|q{k3+Sym|)ox$)}LJrGm*{GNfE}9mhvr)=S
z6W#Ka-#`b&IdP0KbcS|{0oqKRkI-PX*gs)Ka`7BGB5(B3v1N;C7Y(eH7hA<h?hHCp
zLxb|7n|6y+6RF=(D533if`(<;S)YG0wJsDR@+HiOW7RX~QZF68gSM`wOrChIO8UK2
zE7j^~q+vm6T%4lJTSm$!Lxr%4fXPtL(IDrQeAyxqPnn3g;86v>n&AT;c%uV;R94ki
ziXIxO)5OtgaaQy$UnLG)DTj$n?agA~iFHfG#_Cn&G(@{)7!J@D(OM;r6pEpIv0e04
zi%o_p&aJ*ilF)G(rM(LtkmJg5wb=fk*i}15mbClnAnh-hE?$srH&RAyE1Y!C4fHAv
zR2M{wm#mmCn~I^uGL$>4<Q)J1hm9#%iy_XDFzoqFx2$*yXhE{nX1Da1iImIE6?>Xt
zjbhK~z}oEmR$GZCHNQ18Jn7DFfnqra7I3VyXErqx-z#@r?n$}hgUY&v;X@oBHk?IK
zGHr!)Zidn7`mI-f8yMQMe(mSy(+=g=$<XEcUA$!Iwp|OPle?iuIrcJa<k)0r>Q+1y
zYD|ZvVV`T*uM9UcJe4&Z$QnkgFsKZ-Fl-&&aNDJZ+m+!ChNn4pW~1Afi0Ihmy6skO
zdl;U{y6ug0c?%u;v1f=@3@Md;48t7H8g@<$Bi@AluE+r;a**LUTSTooo_G3VrIZkT
zVw3Zub8<`+uarJco8{Zsx7I(e$ewP+Gc@RDbn1HQ5Qk!NhQddb;RwSE9ES{h%^8-<
zRVx*h%R?M?ZI3A1qYTF!+q_O3cWh@$Tje26Dz7Pi`AY9}e}&!?h?BW~oKQ|LGQ7m`
zvXha<xa<Ma2q#^iSCr338BTdVA9H-BE4;%?vrnGmzr`k#u40oMGmhRbhM$~d&G+N9
zGC9NWD#uv|CmH9qNw68`T$_xt`8dNTvZ){^6>0<&Z4Qb;q6~!6`y|7sI6m!UHEM;;
zCVa+~`mB=r9K&l_sn5H8Xg0o}`p_2{ULU;=eaY@aYRFL=zO0<S!thm&uQ{(sLo%LD
z$j^ZkzV5nxL%DsE;agd^Z%1BNeaI8}juQDU!}mD8?}*4@+)V0H1wU{_eyBu##PH*+
z$WI*li(a3fDxse-{G8(#ZnvnfkA~zYL^FQrD*Z|+{hHx7S*73p&*S_~sr;Vd4;*h8
zT8YF|>vMbMo(e{-U<1b=U9CSUtv@sTC9Cz<^IGNdvjcxqT7PHw2gg6<P_4c|Qfo~!
Nk;1>~@b9!<_Fq_JBP0L-

diff --git a/tests/data/waymo/kitti_format/waymo_infos_val.pkl b/tests/data/waymo/kitti_format/waymo_infos_val.pkl
index 0ce2230bb123b686dc3a4c337003fce992fbd8e4..82a6ed4704e8b4e8f07aa13994a6c92ca1b7eab8 100644
GIT binary patch
literal 2020
zcma)7Uu;uV81DvKM;Y>O!j!2M6t5tqYxjpK*fYksu(}}y<O*ExZriioJ9^uGy|;ss
zlr#Y<GmaP{NMfQ*#D9wkAtn+dvDelIUyN_U<MhE7LdF9q5Xk(_>DbWlz)9Nv^80@0
zJKw)^ny<})Ly+b<9FC}JsY+ZoEvS(J={bVrLfTY~RGcb$^f1irBbygGxY*JSYFcQ5
zy7!zxV`E}MtK)JSOZsB*WLS+uy$nu4h6X__*`6kJ<q%7gmp&fk;QU9qA_L^|x-D)5
zREBwi7?}KV`vjv)_L^@PEtahNe{o=+d;cgw28Av`%p+n;ES)bu>t-M@d8eny02w~|
zsf_@I^92zBUylHR8&wmfQX`^BGGWofmXc5n3XRfgq*}c`ufL<c!{6?2_qDZlcC~f3
zE1ey_cE#)UB530P7D)9d)(tgAO?V;)3weopy8D%_p&bLeVNnhi3sM8u#FUh3X|P0^
z&;6qxuh%o2iosIuM5B64v!JQpQ76@*;*cIi^^^U1C-)iGqiWh0Q4Lro!*VQcGZyzi
z-^DyivA9zU@E1#Q>Q2ecneppK1`%K>p2FhB(o3uKse3!CU~<aI>>_deaSeZ&@M+eC
zshfn+rTrgn<scCIYV9CrZvWcF7!<B}+~PV>k0JYXBL*!pJcD6BD~P1Htl5p0*~XBu
z4O7L389d~<hYp$9L5^KLnE#*9{g4n%e2EV_JaY!NnvHt3j4Yn#Wmt*vtipK4T8_EV
zCfdd3$^-+1T)Rd%b8hc08v(}i0t$~iN8OwzE$RdZ#l<%W17y5=oG~bT@$t$?GbSHC
z9+8qt>Iut))j4>ndikblHZj6~CMK*W;AM%mJCQMzlw~SIRK=}pg7q1mqMpF(nTX&(
zu9e}H9IUGjU@V=?r0_C>Cs@OE*ch5t8M8j<kmjPpa5#AwyfU;ts-kG;Yqp~N*0ViI
zTn*DW_|OgOIH&wk@XOF9h<0bog$ZOuyXI0YVW9NWkrjCcNK@l!&Jf4-Ce8%zeRc&c
zs2RWNv=LCA>Fgm2`fbN^9^^k=Cd|L1e|c?2m6qTp%uF$}iHI^=icT3e<e*E?Fneu1
zelX2fv7;aS=CxNf>=DU{^2wXSNBcx_wsF_pO^r1o8NKVz_U!(XYM7UXxx9w*j<McB
zJU}GguCo&i1U~=eLXk6zFW$;?=6<@z=HQ<zF2aD_@UxQvs$t|>NAPzHlf}=%oQ-f}
z4mN3vWOzLX0fB`_ga~#xmGwlDY3eaDaZ6XzX*KIIBWhetyP&&jcSWsiin?I4REtiB
zHbBql$#-E(2zrI!9H|Dmjj(lKV4!ab7yk-6ED*yHC6bKNi0PsU23)X>&tv6GrgFBm
za;AJaMqoRCn&PUZCt9Je0KpKvQGoss3~1e2uU6gBT>hbau{>J8vV5Gt4h>~*hG1s_
zLLt~SEo13czNq9Y-&Ve^tK>}ryP0lJ2;M5d-VjLBI`%X!Dvz~P#^O861p;p~)xHqy
zFTjBi$TPio2j|M$w%JCx7`XT4_k#?OZKn%v4wi23CJY2_-zw)h<M?ECku!gP7a|CF
zFRIso;^UuId{Izj7{nf@=IpbXl$==G%ccA|j?xrfMRn#7X(4K+CtcCa5fvv9!YBD#
zKSJZlh;C&e!pogeP3(9B>J4hCdSWOE(Hu~LUl#V?Yf=k_s*k>!HIfkHmJymZ^<)Ax
je$Ngui!##cX#<`2iTP0;Xofju7*T5ZqgB<xOvdvsqJugT

literal 7000
zcmb`M2Ut|sw#Ns=hS(Al%hhNUlq8B2QM^%Sf?Xowu}zGKGsEDFftgun4pJ0FfkA?Z
zAfRHytJr%$RMfd|t@qw7ulL@2@4ei&)}Hb-3Bi0{jvs5EbJjj<?f-A@*?XU3b!<ME
zsvsuWI?`k^8SWCN->{YVAlBh2E%8;PIJ~+urKGgjaNBg<RuS)a*otj>JPZl>;|q-n
zpEDlftg-yKI}3(39I1z48Vpa)8Dfp$drM$MZf@?nAMv3~j>&+Lc1MB3Tj+H7<DDKk
ziHCTVpX*zwd;3rmjIs{ni(;GM_M|~V07h#tCIDkK7-t<SnM!QCoiM%%UJEr`T^G0u
z%i;BPmh7=6YkZ_qfzM{RV1jij-!OEW+vE1^o*$~7IzKH{9VtG~Zg|5wD*BAyW%D`V
z%>ft`=uU^%<2UqDhY=4G!z4U~Awn=I$5aQCUGYl39j3Ur`QOrDYCTNT;H~7Gn4H);
znC=QATL3cz+1nb-WMr-{!kT4o;8@Z-CRUEs!%Zegl$c2x%r;<-y;L>a=XHCG0!Oj8
zv=EZ3;GJY^0zbi>SWx7&8Kt_@5AWs-vySGQd!5DJ!U~Twgyuc#G`<%_6LCh_*xeqO
zTdJsY-@ENFuL@F<Re$HD!hF|6mlB-nVu5KIq>I3~oVWmFge4Yb!2*$(slmc8m$*nt
zJkHJ7gHH`SF=-xlS@=pF+vwJO?4Lsqu*nj>6)j$<jvdRfv6!w-z}hcVi`G{sbkF?q
zje)MXB4i2a#TtCufcNbo(P^+GB6_KaUZ#OLD%#>&DBac0F|mh^?8WDFVc836FpxzZ
zR9Av$@tEcFiC=oc?WKEIx0cRh%@-=Z{NUFT^nnP<)?m2-E5d@(VP!<nDiO3=gEdh>
zYeRxsIVCoHu#39s_=*?xoR}216^~Ktm-Nny^|5Qpd^(p%-QR5YtP?dk8hmKL`mmY|
z$c?DkAZqe7*cer#h14{&8fvD#O?1h*-~}~H=TaMX(iQ)gbk2+Q%SR<9AD}12-(<&Z
z5-FQC*kZueu#^R`Eh5D#Qa;jPdsNEDAt^1=F?DovBR<-h`hpaCazK6^dD#4gM!r}b
z8>p4eQ`acxH!5a_h{@MrrvU|aD^FP_EmfvX19n9e*+h|D14mR*VMtLe*A5+DXJAL3
z`RelPL<FaZDAHiJ0WQ0B3`gFq7s&irV0a50wi0khwCoWrpJ-4V)lw4D(!&qoJu6*L
zz?Raz=2g^Z=O6TG3esH+4#9JHdGj=C!fJC8K3_?<@xm;;j=@}d^fBG18tTJZ^N#V@
zij7!@cd;6S>DW%E_k9=&@YZC!Nlkp-XtFHip1gurbn%$!c2dH|Bx>4gPD-XHo8F)<
z4wN@#&jf5upvM>-6cTtuyH^9B0l+Iv96#an`kkOhB>P3Op+RX>^4^f-2O_zV8x|;<
zkG0a6YgD^v>aceyV8+|jMYSv}fb~8zJ;lR(=Mv9_&Dd>mrTDi|Fo}J^kMlQD4<5n8
z{P<ovyOl04qchl!!K8|98z$ptc*}yl=Jn%>_8i3EhVe5d;dyGtHuHqZbR&5@J?F=;
z6HkZ!(c?_Z;DXCUaJdE*22|SBYG?C!ynfhM1^bhgevhrh2~`pO2Soot4XQ<dOdT8w
zuXbB9;IOQAH5$}LR=b=8UJ*waP^Z>8S(lgdpN;=?{)2#|uGip*0S)%hK1?t5IBbTq
z5I&9MYm|IPHE0UwI~Jlnf&)YjXqG_7HE1!Q)ovA+*WoBGE#x>SB5~RzPP+yj;W#HF
z8<8wHCG9?~L1(1hFYnsUsMf!f@T?>}r@?sxE`*^K`&>5Yie$Se*}kK}rEs>(p;oKi
zjB0iwRYSK#xuU^U1FqQ>YP;QA4%Z_=Zb*=u8r)JrWNUFd6lP9DIhRFpBw(Fo4PC69
zfOoKddE~ST0~WX=N$zTJ&w%@OwLL84&92Aq=A9foh@|>VQuS!i8%i}K01rc{W^f8v
z8c!$jl5f0a2R74T>P_B3eP+(JT2h)U&?h+_Y4F&9Cn5gDUhYOt0sWB-PbI@M4W36c
z1Vb67M8sgVg{ngrah41@Uf*be?@H_+(*O;iQ2TYSp<)+Qz>i0w|Aa*UNe#Xi=KcLp
zY#DN)wl9sxJ`BPSB-#%(_$dQ^I@ED)Poc9MekKy=XC=_jY4G#mK)>LcDf=3k2jqZX
zlmNe^!7m%|E1@yFTlbdwcnj`_UyTI$H3{<T8vI5$$Zviw$ZtuI-`3!F4EWtp5S~??
zx{ag2??r<Az6ALL4gN43<c~s<UK)pgEP?(+gFiLk&mx^$ROEK>drJlUc_h+bNTk2i
z;IG1w{`zyx`5TGyw;KGN0e^3|j$@g!eO4Q5e}UUm!0W(2L}LA;#QG-<{y7}$U;aPV
zSmE=p66@bI_;&;TgXd3qnemr7ojyPOXFmKFZ_xic+4U7W@2u4HMJclL{vTJYYlv%j
z<P9MDe)PW*|3{i`;&-fo9>Z4CW7)bQ6SO`=uBO)OamqK8tu$43=K3&kseNiMw~H?1
zl;NR#JS4MbAFn%g)Z*A9vvud@JOeKb+hg9ogsSuKHa77zO)VOA;HMNUl9@WeiXGU3
zj}FR2A6ShS%W|<+W@<d~Hub)*hvw=|4>nQ_UZ#ut*z=GMXW>mJuYPT2>f4C79JAGw
zovn74GpYX-=4*YpXh;a?Bb2<6Yz;l0t-Z8LAI0Thcozy@J%MX}IbnUYxH!QWE;W5D
z+kie!osSn!l=EwH#hhPPz6orjoZk?a;w+0I?~zZer53t-fLZapN0+F1JziwcqfL01
z2?X+Xneh<b+3Cc`yn=RH#=cD#@ILnN>c*(CGl4#xWU1n(r>CzjN^uM3g_-YA8y!`c
z=?Xn~4X@_BSB62lOkH#7J};~fneu**mY=2=M`kX1`YMp$6ama&qS7{rt)Wk5E0Cvf
zIT++j1^HWCa{$Ow#l;DxajEHVu?^_c)%gtZL?OQ|R}6Wk^37r!g`6lZ1^FIB=3P)9
zZx^s<zl`R)TbIqJ<GjNSm{WHr(h<hgo^Pg8)IXg!Vb$2ckbQK2+{C$DixKalYj`1-
z&R6oo^XLh7#U5P4<2$&NA3SY(hkN3d<weS#)pT?>-#vxx<^v<AQ~xLhId1EidH8^y
zy$a$aX+9H}t(47SYv{>r1@Svv4u*JvLi{e*902iq;^G8zxzzM|Yy*0VI-f6|DB@JP
zV#I06m(DhdI73_tvD(i*&cQ2GkBvfUDt3>JrQ0!7&vOShZrh3XHx-#zCE(MY{DZG&
zzWeAb_D?$~<D19u9XwizkML5tbk8w5OBV~>^iUY{zpL2B+rB^ubKk^KmD8$tzMvl7
z7@vt*|0Z^rljE}K9B(L}@#B6Mw$SbU6nquT3q%4_$W-bUvNiNYYz1=`mxE!>P?#5U
z%>gifTU?yreJ(Y93EP0aRGlvqPZYCRt{AgL`95G9#hfj!=twpt(M_Em3Mjfh93LgF
zNo2?iu?COM&nR)5i?+~-LORSZ20SMpnmCQll+EB?305+(lhjGi_X^@5-lMKeeCWfI
z7|f>Hj0}EDkYS89ONfUk9r9OUFq2xg6jMVb6Xp)MR=JCR4uTl3Q6Og#HpDEn@NerY
z4t}rFmrL`RzzU^oC0j#Z#a0ln=5jE^=?d{0t~mhWwc_Fg>$ueP9JT@dLv_AhJW<5C
za>a-@C|@4iDB_LcQi$)cnjLF+FrCUPW2F}ArMBVNw1yd*@Ko7oyfdP}oK3gNKf>qC
zyxYQ9AJK!I*s)S5)2V6vvbM5RAf2SEFqywk)e7TXqv%T2D(sb+q66#R$IEVfR*u2Z
zOQ-9Z{1fOfyQynW44nu84u4XKeHFl(G@kWuQo1&?HS{fP1@Klb2Lqg@0B_@(0|1sU
zmXhEjE;W5S+kpPDI^Q9lDByg#V!%6<uYhe7@Gfyh0aNEj?4_ECG8D6>Ca0SnjCTgV
z^BuyTiTEVbym2vhkK~Et5Cit`-8yXY+Jx_TYS=KHf_{FmET5h&v6cwj{0|q8#A^yJ
zzYNz*!XtFg(0PzwG)m|3+u|b`z<r6lJL%_NfQ(@N<}{3>w-0$0z&2?->$fXi4z`9~
z$W{P5xf~2|ssdcZH3tB^TU?yL#igdZ*#`7I>iiS&L;)Ae6$36&J`dX{V6V6o;Hx}>
zT|Dg0=?p=nUK%rR(vba3IF(u}u%GVjFq;|4<#hhSaPuG&c8nSQR$`^#o$bjRO*eFT
zs+mrwOwAfAcvH$p?2qRS!!64Wg95Y4=`^!-@ceyyjS1VB?U;mYo{evYhHelq$Tz*X
zS0U?@R<m$Wf^@cq?q@4x4K4>Eo3>byE#;a6kliaTPEf|BrkAq~=oRX`Qan+z`{atr
z?pMAlwo$SN#HGl7#$=Zi$6MD5RvuPYHU7GAiMJN@=0x9YdawdJc9}onueBFu;<+lB
ze~<E4jkDD0#emSo-VD68b*A8*LhabToF32NJwgy${U6Yc5=EF!4&`(HT63d<XOm~t
z@5-W{9eC1A{n+E4%)!<0m7=Yjt`B<^$_GUOL#S5b4zV@#!)%3e4VQya&Qg?Xx#j?r
z>%_$g0$gf(J==hOM4dN?CrbHKxnjzV%6F7)lyZ}}6y+yO8PDdzUOXm0bN<|O3_H^3
zD$l3&=GQ4OhTj=ls1wZm-q1*Oyf0|nC#;{*i6t@xV5?$Xk7sy|d%PVaMiXzV+eYB^
z1^nqJxCKuwp3A##8PGf-JYKgde;j{t3S!edylLY!pE2%PSeK36K1F+l`X2=9ANeY<
YkBJaw(X2EcXKUy!Z0)7aQd`RZ0ZByBZU6uP

diff --git a/tests/test_datasets/test_waymo_dataset.py b/tests/test_datasets/test_waymo_dataset.py
new file mode 100644
index 0000000000..20ec1fc173
--- /dev/null
+++ b/tests/test_datasets/test_waymo_dataset.py
@@ -0,0 +1,80 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import numpy as np
+import torch
+from mmcv.transforms.base import BaseTransform
+from mmengine.registry import TRANSFORMS
+from mmengine.structures import InstanceData
+
+from mmdet3d.datasets import WaymoDataset
+from mmdet3d.structures import Det3DDataSample, LiDARInstance3DBoxes
+
+
+def _generate_waymo_dataset_config():
+    """Build WaymoDataset test args, registering ``Identity`` if needed."""
+    data_root = 'tests/data/waymo/kitti_format'
+    ann_file = 'waymo_infos_train.pkl'
+    classes = ['Car', 'Pedestrian', 'Cyclist']
+    # wait for pipeline refactor
+
+    if 'Identity' not in TRANSFORMS:
+
+        @TRANSFORMS.register_module()
+        class Identity(BaseTransform):
+
+            def transform(self, info):
+                if 'ann_info' in info:
+                    info['gt_labels_3d'] = info['ann_info']['gt_labels_3d']
+                data_sample = Det3DDataSample()
+                gt_instances_3d = InstanceData()
+                gt_instances_3d.labels_3d = info['gt_labels_3d']
+                data_sample.gt_instances_3d = gt_instances_3d
+                info['data_samples'] = data_sample
+                return info
+
+    pipeline = [
+        dict(type='Identity'),
+    ]
+
+    modality = dict(use_lidar=True, use_camera=True)
+    data_prefix = dict(pts='training/velodyne', CAM_FRONT='training/image_0')
+    return data_root, ann_file, classes, data_prefix, pipeline, modality
+
+
+def test_getitem():
+    data_root, ann_file, classes, data_prefix, \
+        pipeline, modality, = _generate_waymo_dataset_config()
+
+    waymo_dataset = WaymoDataset(
+        data_root,
+        ann_file,
+        data_prefix=data_prefix,
+        pipeline=pipeline,
+        metainfo=dict(classes=classes),
+        modality=modality)
+
+    waymo_dataset.prepare_data(0)
+    input_dict = waymo_dataset.get_data_info(0)
+    waymo_dataset[0]
+    # the paths should contain both data_prefix and data_root
+    assert data_prefix['pts'] in input_dict['lidar_points']['lidar_path']
+    assert data_root in input_dict['lidar_points']['lidar_path']
+    for cam_id, img_info in input_dict['images'].items():
+        if 'img_path' in img_info:
+            assert data_prefix['CAM_FRONT'] in img_info['img_path']
+            assert data_root in img_info['img_path']
+
+    ann_info = waymo_dataset.parse_ann_info(input_dict)
+
+    # only one instance
+    assert 'gt_labels_3d' in ann_info
+    assert ann_info['gt_labels_3d'].dtype == np.int64
+
+    assert 'gt_bboxes_3d' in ann_info
+    assert isinstance(ann_info['gt_bboxes_3d'], LiDARInstance3DBoxes)
+    assert torch.allclose(ann_info['gt_bboxes_3d'].tensor.sum(),
+                          torch.tensor(43.3103))
+    assert 'centers_2d' in ann_info
+    assert ann_info['centers_2d'].dtype == np.float32
+    assert 'depths' in ann_info
+    assert ann_info['depths'].dtype == np.float32
diff --git a/tools/create_data.py b/tools/create_data.py
index 384fa87127..6727a31de6 100644
--- a/tools/create_data.py
+++ b/tools/create_data.py
@@ -175,7 +175,7 @@ def waymo_data_prep(root_path,
                     workers,
                     max_sweeps=10,
                     only_gt_database=False,
-                    skip_image_and_lidar=False,
+                    save_senor_data=False,
                     skip_cam_instances_infos=False):
     """Prepare waymo dataset. There are 3 steps as follows:
 
@@ -197,7 +197,7 @@ def waymo_data_prep(root_path,
             frames for later use.
         only_gt_database (bool, optional): Whether to only generate ground
             truth database. Default to False.
-        skip_image_and_lidar (bool, optional): Whether to skip saving
+        save_senor_data (bool, optional): Whether to save
             image and lidar. Default to False.
         skip_cam_instances_infos (bool, optional): Whether to skip
             gathering cam_instances infos in Step 2. Default to False.
@@ -232,7 +232,7 @@ def waymo_data_prep(root_path,
                 info_prefix=info_prefix,
                 max_sweeps=max_sweeps,
                 split=split,
-                save_image_and_lidar=not skip_image_and_lidar,
+                save_senor_data=save_senor_data,
                 save_cam_instances=not skip_cam_instances_infos)
             converter.convert()
             if split == 'validation':
@@ -308,7 +308,7 @@ def semantickitti_data_prep(info_prefix, out_dir):
     help='''Whether to skip gathering cam_instances infos.
         Only used when dataset is Waymo!''')
 parser.add_argument(
-    '--skip-image-and-lidar',
+    '--skip-saving-senor-data',
     action='store_true',
     help='''Whether to skip saving image and lidar.
         Only used when dataset is Waymo!''')
@@ -380,7 +380,7 @@ def semantickitti_data_prep(info_prefix, out_dir):
             workers=args.workers,
             max_sweeps=args.max_sweeps,
             only_gt_database=args.only_gt_database,
-            skip_image_and_lidar=args.skip_image_and_lidar,
+            save_senor_data=not args.skip_saving_senor_data,
             skip_cam_instances_infos=args.skip_cam_instances_infos)
     elif args.dataset == 'lyft':
         train_version = f'{args.version}-train'
diff --git a/tools/dataset_converters/waymo_converter.py b/tools/dataset_converters/waymo_converter.py
index 00eba35daa..c7704065d2 100644
--- a/tools/dataset_converters/waymo_converter.py
+++ b/tools/dataset_converters/waymo_converter.py
@@ -46,7 +46,7 @@ class Waymo2KITTI(object):
             Defaults to 64.
         test_mode (bool, optional): Whether in the test_mode.
             Defaults to False.
-        save_image_and_lidar (bool, optional): Whether to save image and lidar
+        save_senor_data (bool, optional): Whether to save image and lidar
             data. Defaults to True.
         save_cam_sync_instances (bool, optional): Whether to save cam sync
             instances. Defaults to True.
@@ -64,7 +64,7 @@ def __init__(self,
                  prefix,
                  workers=64,
                  test_mode=False,
-                 save_image_and_lidar=True,
+                 save_senor_data=True,
                  save_cam_sync_instances=True,
                  save_cam_instances=True,
                  info_prefix='waymo',
@@ -108,7 +108,7 @@ def __init__(self,
         self.prefix = prefix
         self.workers = int(workers)
         self.test_mode = test_mode
-        self.save_image_and_lidar = save_image_and_lidar
+        self.save_senor_data = save_senor_data
         self.save_cam_sync_instances = save_cam_sync_instances
         self.save_cam_instances = save_cam_instances
         self.info_prefix = info_prefix
@@ -147,8 +147,8 @@ def convert(self):
             data_list.extend(data_info)
         metainfo = dict()
         metainfo['dataset'] = 'waymo'
-        metainfo['version'] = '1.4'
-        metainfo['info_version'] = '1.1'
+        metainfo['version'] = 'waymo_v1.4'
+        metainfo['info_version'] = 'mmdet3d_v1.4'
         waymo_infos = dict(data_list=data_list, metainfo=metainfo)
         filenames = osp.join(
             osp.dirname(self.save_dir),
@@ -179,12 +179,13 @@ def convert_one(self, file_idx):
             frame = dataset_pb2.Frame()
             frame.ParseFromString(bytearray(data.numpy()))
 
-            # Step 1.
-            if self.save_image_and_lidar:
+            # Step 1. Extract camera images and lidar point clouds from waymo
+            # raw data in '*.tfrecord' and save as kitti format.
+            if self.save_senor_data:
                 self.save_image(frame, file_idx, frame_idx)
                 self.save_lidar(frame, file_idx, frame_idx)
 
-            # Step 2.
+            # Step 2. Generate waymo train/val/test infos and save as pkl file.
             # TODO save the depth image for waymo challenge solution.
             self.create_waymo_info_file(frame, file_idx, frame_idx, file_infos)
         return file_infos