-
Notifications
You must be signed in to change notification settings - Fork 27
/
generate_proposals.py
executable file
·152 lines (132 loc) · 6.34 KB
/
generate_proposals.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env python
"""
Generate action proposals for video
"""
import os
import warnings
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import h5py
import numpy as np
import pandas as pd
from daps import C3D, DAPs
from daps.utils.segment import non_maxima_supression
def input_parser():
    """Build the command-line parser for the proposal generation script.

    The destination names of the options match the keyword arguments of
    ``main`` so the parsed namespace can be forwarded with ``**vars(...)``.

    Returns
    -------
    argparse.ArgumentParser
        Parser that shows default values in its help output.
    """
    parser = ArgumentParser(
        description=('Compute action proposals from its C3D feature '
                     'representation.'),
        epilog=('Note: It assumes that C3D features were densely extracted '
                'for all the frames.'),
        formatter_class=ArgumentDefaultsHelpFormatter)

    # Each entry is (option flags, keyword arguments for add_argument).
    # Order matters: it is the order shown in the ``--help`` output.
    option_table = [
        # Video arguments
        (('-iv', '--video-name'),
         dict(required=True,
              help='Name of video-id in your HDF5-file with C3D features')),
        (('-ic3d', '--c3d-hdf5'),
         dict(required=True,
              help='HDF5 file with features for each video')),
        (('-imd', '--model-file'),
         dict(required=True,
              help='npz file with sequence encoder parameters')),
        (('-iaf', '--anchors-hdf5'),
         dict(default='non-existent.hdf5',
              help='HDF5 file with anchor segments')),
        # Output arguments
        (('-io', '--output-csv'),
         dict(default='',
              help=('Filename to save proposals of video as CSV-file. If '
                    'empty "", it uses the same video-name'))),
        (('-c', '--clobber'),
         dict(action='store_true', help='Overwrite outputs')),
        # DAPs arguments
        (('-ses', '--seq-encoder-stride'),
         dict(default=64, type=int,
              help='Sliding stride for sequence encoder along the video')),
        (('-sel', '--seq-encoder-length'),
         dict(default=32, type=int, help='Length of sequence encoder')),
        (('-sew', '--seq-encoder-width'),
         dict(default=256, type=int,
              help='Number of hidden units per layer')),
        (('-sed', '--seq-encoder-depth'),
         dict(default=1, type=int, help='Depth of sequence encoder')),
        # Extra arguments
        (('-vefr', '--c3d-f-res'),
         dict(default=16, type=int, help='temporal resolution of C3D')),
        (('-vefs', '--c3d-f-stride'),
         dict(default=8, type=int,
              help='temporal stride for C3D sampling')),
        (('-vept', '--c3d-pool-type'),
         dict(default='concat-32-mean',
              help='Pooling strategy for C3D features')),
        (('-vefd', '--c3d-feat-dim'),
         dict(default=500, type=int,
              help='Dimensionality of visual representation')),
        (('-vefi', '--c3d-feat-id'),
         dict(default='c3d_features',
              help=('id used for HDF5-dataset corresponding to C3D '
                    'features'))),
    ]
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    return parser
def main(video_name, c3d_hdf5, model_file, anchors_hdf5='non-existent',
         output_csv=None, clobber=False, seq_encoder_stride=64,
         num_proposals_per_seq_length=64, seq_encoder_length=32,
         seq_encoder_depth=1, seq_encoder_width=256, c3d_f_res=16,
         c3d_f_stride=8, c3d_pool_type='concat-32-mean', c3d_feat_dim=500,
         c3d_feat_id='c3d_features'):
    """Generate action proposals for one video from its C3D features.

    Parameters
    ----------
    video_name : str
        Name of the video-id (group) inside the HDF5 file of features.
    c3d_hdf5 : str
        HDF5 file with C3D features for each video.
    model_file : str
        npz file with the sequence encoder parameters.
    anchors_hdf5 : str
        HDF5 file with an ``anchors`` dataset. When the path does not exist
        the model is built with ``anchors=None``.
    output_csv : str or None
        ``None`` skips writing; an empty string writes to
        ``<video_name>.csv``; any other value is used as the filename.
    clobber : bool
        Allow overwriting an existing ``output_csv``.
    seq_encoder_stride : int
        Sliding stride of the sequence encoder along the video.
    num_proposals_per_seq_length : int
        Forwarded as the first argument of ``DAPs``.
    seq_encoder_length, seq_encoder_depth, seq_encoder_width : int
        Length, depth and hidden units per layer of the sequence encoder.
    c3d_f_res, c3d_f_stride : int
        Temporal resolution and sampling stride of C3D.
    c3d_pool_type : str
        Pooling strategy for C3D features.
    c3d_feat_dim : int
        Dimensionality of the visual representation.
    c3d_feat_id : str
        Name of the HDF5 dataset holding the C3D features.

    Returns
    -------
    pandas.DataFrame
        One row per proposal kept after non-maxima suppression, with the
        columns ``f-init``, ``f-end``, ``score`` and ``video-name``.

    Raises
    ------
    ValueError
        If the output file already exists and ``clobber`` is False, or if
        the inferred video length is smaller than ``seq_encoder_length``.
    """
    # Resolve and validate the output path up front so an existing file is
    # reported before any model is loaded or any feature is read.
    if output_csv is not None:
        if not output_csv:
            output_csv = video_name + '.csv'
        if not clobber and os.path.isfile(output_csv):
            raise ValueError('Existent output: {}'.format(output_csv))

    # Setup DAPs model
    # Infer receptive-field in terms of number of frames
    daps_receptive_field = seq_encoder_length * c3d_f_res

    # Visual Encoder
    print('Setup interface with visual encoder')
    visual_encoder = C3D(c3d_hdf5, c3d_f_res, c3d_f_stride, c3d_pool_type,
                         c3d_feat_id)
    visual_encoder.open_instance()
    # Everything that needs the encoder runs inside try/finally so the
    # interface is closed even when an error is raised half-way.
    try:
        # Infer video length (it assumes C3D were densely extracted at every
        # frame)
        num_c3d_features = (
            visual_encoder.fobj[video_name][c3d_feat_id].shape[0])
        video_length = num_c3d_features + c3d_f_res

        # If num-frames less than DAPs-res, change t_stride
        if video_length < seq_encoder_length:
            raise ValueError('video-length < seq-encoder-time-steps.\nWe never '
                             'consider to create proposals for short clips')
        elif video_length < daps_receptive_field:
            warnings.warn(('video-length < DAPs-temporal-span. Increasing '
                           'sampling of c3d to compensate this.'),
                          RuntimeWarning)
            # Floor division keeps the Python 2 integer semantics on Python 3.
            # NOTE(review): this is 0 when num_c3d_features is smaller than
            # seq_encoder_length -- confirm how C3D handles a zero t_stride.
            visual_encoder.t_stride = int(
                num_c3d_features // seq_encoder_length)
            daps_receptive_field = video_length
            f_init_arr = np.arange(0, 1)
        else:
            f_init_arr = np.arange(
                0, video_length - daps_receptive_field + 1,
                seq_encoder_stride)

        # Sequence Encoder
        # Load anchors file (optional)
        anchors = None
        if os.path.exists(anchors_hdf5):
            # Explicit read-only mode; ``[()]`` reads the whole dataset and
            # replaces the ``.value`` attribute removed in h5py 3.
            with h5py.File(anchors_hdf5, 'r') as f:
                anchors = f['anchors'][()]

        print('Setup sequence encoder')
        sequence_encoder = DAPs(
            num_proposals_per_seq_length, seq_encoder_length,
            seq_encoder_depth, seq_encoder_width, c3d_feat_dim,
            daps_receptive_field, anchors)
        print('Loading sequence encoder model')
        sequence_encoder.load_model(model_file)
        print('Compiling sequence encoder')
        sequence_encoder.compile()

        # Using DAPs
        print('Reading C3D features')
        ve_representation = visual_encoder.read_feat_batch_from_video(
            video_name, f_init_arr, duration=daps_receptive_field)

        # Generate proposals along the whole video
        print('Generating segments')
        proposals, score = sequence_encoder.retrieve_proposals(
            ve_representation, f_init_arr)

        # Post-processing
        print('Post-processing segments')
        pp_proposals = proposals.reshape((-1, 2))
        pp_score = score.reshape(-1)
        nms_proposals, nms_score = non_maxima_supression(pp_proposals,
                                                         pp_score)
    finally:
        # Close visual encoder interface
        visual_encoder.close_instance()

    num_proposals = nms_proposals.shape[0]
    df_out = pd.DataFrame({'f-init': nms_proposals[:, 0],
                           'f-end': nms_proposals[:, 1],
                           'score': nms_score,
                           'video-name': [video_name] * num_proposals})

    # Dumping output (the path was resolved and validated at the top)
    if output_csv is not None:
        print('Dumping results to disk')
        df_out.to_csv(output_csv, index=False, sep=' ')
    return df_out
if __name__ == '__main__':
    # Parse the command line and forward every option to main() as a
    # keyword argument (option destinations match main's parameter names).
    cli_args = input_parser().parse_args()
    main(**vars(cli_args))