preprocess_mvsec_dataset.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This script can be called to preprocess the MVSEC dataset, transforming the raw .hdf5 recordings
into PyTorch .pt files, with all the preprocessing steps (data formatting, normalization, LiDAR
projection, ...) already applied.
Preprocessing the dataset has its pros and cons:
- it reduces greatly the computational power necessary to load the dataset during training/
validation/testing;
- it greatly increases disk space usage, as the dataset is converted into multiple small sequences;
- if the preprocessing steps are modified, the whole dataset has to be preprocessed again.
"""
import argparse
from glob import glob
from os import mkdir, path
from zipfile import ZipFile
import h5py
import numpy as np
import pandas as pd
import torch
from tqdm.contrib.concurrent import process_map
import yaml
# We define args as a global variable, so that it can be seen by all parallel processes (this
# relies on the "fork" start method: with "spawn", args would have to be passed to the workers
# explicitly)
args = None
def parse_args():
"""Args parser"""
parser = argparse.ArgumentParser()
parser.add_argument("path_raw", help="Path to the folder containing the raw dataset")
parser.add_argument("path_processed", help="Path to the folder where the dataset will be stored "
"after preprocessing")
parser.add_argument("evts_bins", type=int, help="Number of bins B for the event volume "
"representation")
parser.add_argument("lidar_clouds_per_seq", type=int, help="Number of LiDAR clouds contained in "
"each sequence. By setting it to a value <= 0, the full sequence won't be split")
parser.add_argument("lidar_max_range", type=float, help="Maximum range (in meters) of the LiDAR "
"sensor")
parser.add_argument("-j", type=int, default=0, help="Number of parallel processes spawned to "
"preprocess the dataset. If not specified, `min(32, cpu_count() + 4)` processes are spawned")
return parser.parse_args()
def compute_event_volume(events, bins):
"""
From a numpy array of events, computes an event volume, as described in the "Learning to Detect
Objects with a 1 Megapixel Event Camera" article by Perot et al.
This implementation is optimized for fast computation (which is still a bit slow :c), thanks to
https://stackoverflow.com/a/55739936
"""
# We create an empty event volume
event_volume = np.zeros((2*bins, 260, 346), np.float32)
# We compute the t_star value for each event
t_star = (bins-1)*(events[:, 2]-events[0, 2])/(events[-1, 2]-events[0, 2])
# We create an index of unique (x, y, pol) events
evts_x_y_pol = [tuple(e.astype(int)) for e in events[:, [0, 1, 3]]]
idx, u_evts = pd.factorize(evts_x_y_pol)
u_evts = np.array([np.array(u_evt) for u_evt in u_evts])
# Then, for each bin...
for i in range(bins):
# We compute the sum of the max(0, 1-abs(bin-t_star)) for each pixel
sums = np.bincount(idx, np.fmax(0, 1-abs(i-t_star)))
# We set these values inside the event volume
event_volume[i+bins*(u_evts[:, 2]+1)//2, u_evts[:, 1], u_evts[:, 0]] = sums
# We finally return the event volume, in the PyTorch format
return torch.from_numpy(event_volume)
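# As a quick illustration of `compute_event_volume` (the events below are synthetic, for the
# sketch only):
#   events = np.array([[10., 20., 0.00, 1.],    # columns: x, y, timestamp, polarity
#                      [10., 20., 0.05, -1.],
#                      [30., 40., 0.10, 1.]])
#   compute_event_volume(events, bins=5).shape  # -> torch.Size([10, 260, 346])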
def compute_depth_image(depth_image_raw, lidar_max_range):
"""
From a raw depth image, computes a Tensor representation, which can be fed to the network
"""
    # We normalize the values based on the max range of the LiDAR
    # Note that the depth image can contain values greater than 1.0, which should probably be
    # filtered out during training
depth_image_raw /= lidar_max_range
# Finally, we transform the numpy matrix to a PyTorch Tensor
depth_image = torch.from_numpy(depth_image_raw)
# And we return it
return depth_image
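# As an example: with a LiDAR max range of 100m, a pixel with a raw depth of 10m ends up with a
# normalized value of 0.1 in the returned Tensor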
def compute_lidar_projection(lidar_cloud, lidar_max_range, davis_intrinsics, davis_dist,
trans_davis_lidar):
"""
Creates a projection of the point cloud in a 1-channel matrix.
Each value of the matrix corresponds to the depth values, normalized between 0 and 1.
"""
# We use the calibration parameters to initialize the camera matrix
K = np.array([[davis_intrinsics[0], 0, davis_intrinsics[2]],
[0, davis_intrinsics[1], davis_intrinsics[3]],
[0, 0, 1]])
# We then filter the point cloud, to only retain points in front of the LiDAR
lidar_cloud_filt = lidar_cloud[lidar_cloud[:, 0] > 0]
    # We transform the coordinates to the camera's frame
pcl_camera_frame = ((trans_davis_lidar @ lidar_cloud_filt.T).T)[:, :3]
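    # We only keep points at least 1m in front of the camera, then normalize the coordinates by
    # their depth (projective division), keeping a copy of the original depths for later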
pcl_camera_frame_filt = pcl_camera_frame[pcl_camera_frame[:, 2] >= 1.0]
depths = pcl_camera_frame_filt[:, 2].copy()
pcl_camera_frame_filt[:, 0] /= depths
pcl_camera_frame_filt[:, 1] /= depths
pcl_camera_frame_filt[:, 2] /= depths
# We apply the distortions
# See eq. (3) to (8) of the "AprilCal: Assisted and repeatable camera calibration" article from
# Richardson et al. for more details on the formulas
pcl_camera_frame_dist = np.ones_like(pcl_camera_frame_filt)
x = pcl_camera_frame_filt[:, 0]
y = pcl_camera_frame_filt[:, 1]
z = pcl_camera_frame_filt[:, 2]
r_2 = x*x + y*y
r = np.sqrt(r_2)
theta = np.arctan2(r, z)
theta2 = theta*theta
theta3 = theta2*theta
theta4 = theta2*theta2
theta5 = theta4*theta
theta6 = theta3*theta3
theta7 = theta6*theta
theta8 = theta4*theta4
theta9 = theta8*theta
    theta_d = (theta + davis_dist[0]*theta3 + davis_dist[1]*theta5
               + davis_dist[2]*theta7 + davis_dist[3]*theta9)
psi = np.arctan2(y, x)
pcl_camera_frame_dist[:, 0] = theta_d * np.cos(psi)
pcl_camera_frame_dist[:, 1] = theta_d * np.sin(psi)
# We project them in the image
pcl_camera = (K @ pcl_camera_frame_dist.T).T
# We create the projection, and add each projected LiDAR point to it
lidar_proj = torch.zeros(1, 260, 346)
for i, pt in enumerate(pcl_camera[:, :2]):
if pt[0] >= 0 and pt[0] < 346 and pt[1] >= 0 and pt[1] < 260:
lidar_proj[0, int(pt[1]), int(pt[0])] = min(depths[i]/lidar_max_range, 1.0)
# We return the projection
return lidar_proj
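# A minimal sketch of a call to `compute_lidar_projection` (all values below are illustrative
# placeholders; the real intrinsics, distortion coefficients and extrinsics come from the MVSEC
# calibration files):
#   cloud = np.array([[0.5, 0.1, 5.0, 1.0]])             # a single homogeneous LiDAR point
#   intrinsics = np.array([226.0, 226.0, 173.0, 130.0])  # fx, fy, cx, cy
#   dist = np.array([-0.05, 0.01, -0.05, 0.02])          # equidistant distortion coefficients
#   proj = compute_lidar_projection(cloud, 100.0, intrinsics, dist, np.eye(4))
#   # -> a (1, 260, 346) tensor with a single non-zero pixel of value 5.0/100.0 = 0.05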
def sorted_slice_mask(arr, l_val, r_val):
"""
Returns a mask of array arr (arr should be sorted!) such that values v of arr verifying l < v <= r
are set to True in the mask (and the others are set to False)
"""
    # searchsorted with side="right" gives the first index whose value is > l_val (resp. > r_val),
    # so that the [start:end] slice covers exactly the values verifying l_val < v <= r_val
    start = np.searchsorted(arr, l_val, "right")
    end = np.searchsorted(arr, r_val, "right")
    out_arr = np.full_like(arr, False, dtype=bool)
    out_arr[start:end] = True
return out_arr
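# For instance, as a quick sanity check:
#   sorted_slice_mask(np.array([0.0, 1.0, 2.0, 3.0]), 0.5, 2.0)
#   # -> array([False,  True,  True, False])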
def preprocess_recording(paths):
"""Recording preprocessing function, which can be called in parallel"""
# We start by getting the args
global args
# We get the prefix of the file (outdoor_day1, ...)
prefix = paths[0].split('/')[-1][:-10]
# We open and read data from the _data and _gt files
    data_recording = h5py.File(paths[0], "r")
    gt_recording = h5py.File(paths[1], "r")
events = data_recording["davis"]["left"]["events"][:]
events_ts = events[:, 2]
lidar_clouds = data_recording["velodyne"]["scans"][:]
lidar_clouds_ts = data_recording["velodyne"]["scans_ts"][:]
depths = gt_recording["davis"]["left"]["depth_image_raw"][:]
depths_ts = gt_recording["davis"]["left"]["depth_image_raw_ts"][:]
# We open and read data from the calibration file
calib_zip_file = ZipFile(paths[2])
for file in calib_zip_file.namelist():
if file.endswith(".yaml"):
calib_file_name = file
with calib_zip_file.open(calib_file_name) as stream:
yaml_data = yaml.safe_load(stream)
davis_intrinsics = np.array(yaml_data["cam0"]["intrinsics"])
davis_dist = np.array(yaml_data["cam0"]["distortion_coeffs"])
t_davis_lidar = np.array(yaml_data["T_cam0_lidar"])
    # We have to filter the MVSEC dataset before using it: LiDAR scans are always available, but
    # events and ground truth depth images are not necessarily available at the beginning or at
    # the end of each sequence. So, based on the timestamps of the LiDAR clouds, we only keep
    # those for which both D_bf and D_af depth images and an event volume can be associated
valid_data = []
for i in range(lidar_clouds.shape[0]-1):
        # We create the array that will contain the LiDAR cloud, the D_bf and D_af depth images,
        # and the events, in which we already place the LiDAR cloud
possible_valid_data = [lidar_clouds[i, :, :]]
# To know which events and depth images should be extracted, we set the start timestamp as the
# one of the current LiDAR cloud, and set the end timestamp as the timestamp of the next LiDAR
# cloud
start_ts = lidar_clouds_ts[i]
end_ts = lidar_clouds_ts[i+1]
# We check if we can get a bf_depth_image in this interval of time
bf_depth_image_ts_mask = np.bitwise_and(depths_ts >= (start_ts - 0.001),
depths_ts <= (start_ts + 0.001))
if np.sum(bf_depth_image_ts_mask) == 0:
continue
else:
possible_valid_data.append(depths[bf_depth_image_ts_mask])
# We do the same for the af_depth_image
af_depth_images_ts_mask = np.bitwise_and(depths_ts >= (end_ts - 0.001),
depths_ts <= (end_ts + 0.001))
if np.sum(af_depth_images_ts_mask) == 0:
continue
else:
possible_valid_data.append(depths[af_depth_images_ts_mask])
# And we finish with the events
events_ts_mask = sorted_slice_mask(events_ts, start_ts, end_ts)
if np.sum(events_ts_mask) == 0:
continue
else:
possible_valid_data.append(events[events_ts_mask])
# If we get here: congrats, we were able to get all the required data! So, we add it to the
# valid_data array
valid_data.append(possible_valid_data)
    # If args.lidar_clouds_per_seq is set to a value <= 0, we do not split the sequence. We use a
    # local copy here: mutating the global args would leak into the next recording handled by the
    # same worker process
    lidar_clouds_per_seq = args.lidar_clouds_per_seq
    if lidar_clouds_per_seq <= 0:
        lidar_clouds_per_seq = len(valid_data)
    # Each recording allows us to generate N//L sequences, where N is the total number of valid
    # samples that were previously filtered, and L is the number of successive point clouds in a
    # sequence
    nb_seq = len(valid_data) // lidar_clouds_per_seq
# We generate each of the nb_seq sequences
for i in range(nb_seq):
sequence = []
# Each sequence contains L successive LiDAR clouds from the recording
        for j in range(i*lidar_clouds_per_seq, (i+1)*lidar_clouds_per_seq):
# We get the LiDAR cloud and project it as an image
lidar_cloud = valid_data[j][0]
            lidar_proj = compute_lidar_projection(lidar_cloud, args.lidar_max_range,
                                                  davis_intrinsics, davis_dist, t_davis_lidar)
# We get the D_bf depth image
bf_depth_image_raw = valid_data[j][1]
bf_depth_image = [compute_depth_image(bf_depth_image_raw, args.lidar_max_range)]
# And the D_af depth image
af_depth_image_raw = valid_data[j][2]
af_depth_image = [compute_depth_image(af_depth_image_raw, args.lidar_max_range)]
# And we finish with the events, for which we compute the corresponding event volume
evts = valid_data[j][3]
event_volume = [compute_event_volume(evts, args.evts_bins)]
# Finally, we add the projected LiDAR cloud, the event volume, and the D_bf and D_af depth
# images to the sequence array
sequence.append([lidar_proj, event_volume, bf_depth_image, af_depth_image])
# Once all the LiDAR point clouds, events and depth images have been added to the sequence, we
# save it
torch.save(sequence, f"{args.path_processed}/{prefix}_seq{i:04}.pt")
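# Each saved .pt file therefore contains a list of lidar_clouds_per_seq entries, each of the form
# [lidar_proj, [event_volume], [bf_depth_image], [af_depth_image]], and can be reloaded with,
# e.g., torch.load("outdoor_day1_seq0000.pt")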
def main():
"""Main function"""
# We start by reading the args given by the user
global args
args = parse_args()
# We begin by verifying that the paths given by the user are valid
if not path.isdir(args.path_raw):
raise Exception("The path to the dataset should be a folder, containing .hdf5 recordings")
if not path.isdir(args.path_processed):
mkdir(args.path_processed)
# We list all the recordings in the folder
recordings_data_paths = sorted(glob(f"{args.path_raw}/*_data.hdf5"))
if not recordings_data_paths:
raise Exception("The provided folder does not contain any data file!")
recordings_gt_paths = sorted(glob(f"{args.path_raw}/*_gt.hdf5"))
if not recordings_gt_paths:
raise Exception("The provided folder does not contain any ground truth file!")
calib_paths = sorted(glob(f"{args.path_raw}/*_calib.zip"))
if not calib_paths:
raise Exception("The provided folder does not contain any calibration file!")
# We verify that we listed the same number of _data and _gt files
if len(recordings_data_paths) != len(recordings_gt_paths):
raise Exception("The provided folder does not contain the same number of data and ground truth "
"files!")
    # We associate the correct calibration file with each pair of _data and _gt files, by matching
    # filename prefixes (e.g., outdoor_day1 and outdoor_day2 recordings both use the outdoor_day
    # calibration)
recordings_calib_paths = []
for data_path in recordings_data_paths:
for calib_path in calib_paths:
prefix = '_'.join(calib_path.split('_')[:2])
if data_path.startswith(prefix):
recordings_calib_paths.append(calib_path)
break
# We verify that we were able to associate all _data files with a calibration file
if len(recordings_data_paths) != len(recordings_calib_paths):
raise Exception("Could not find all calibration files!")
# We fuse the data, ground truth, and calibration files in a single array (required for the
# `process_map` function)
recordings_paths = []
    for data_path, gt_path, calib_path in zip(recordings_data_paths, recordings_gt_paths,
                                              recordings_calib_paths):
recordings_paths.append([data_path, gt_path, calib_path])
# Then, we process all the recordings in parallel, using the `process_map` function from tqdm
if args.j > 0:
process_map(preprocess_recording, recordings_paths, max_workers=args.j)
else:
process_map(preprocess_recording, recordings_paths)
if __name__ == "__main__":
main()