model.py
import torch
import torch.nn as nn
import torchvision.models as models

import config


class LRCN(nn.Module):
    def __init__(self):
        super(LRCN, self).__init__()
        # CNN part: AlexNet pretrained on ImageNet, used as the frame-level feature extractor
        self.featureExtractor = models.alexnet(pretrained=True)
        # keep the classifier only up to fc6 (4096-d output); drop fc7 and the final classification layer
        self.featureExtractor.classifier = nn.Sequential(
            *list(self.featureExtractor.classifier.children())[:-5])
        # LSTM part: consumes the sequence of per-frame CNN features
        self.lstm = nn.LSTM(input_size=config.input_size,
                            hidden_size=config.hidden_size,
                            num_layers=config.num_of_layers,
                            dropout=0.9,
                            batch_first=True)
        # linear layer mapping LSTM hidden states to class scores
        self.linearLayer = nn.Linear(config.hidden_size, config.classNum)

    def forward(self, video_clip):
        # video_clip's dimension: [B, C, T, H, W]
        # frameFeatures' dimension: [B, T, CNN output dimension (4096)];
        # it stores the feature of every frame
        frameFeatures = torch.empty(
            size=(video_clip.size(0), video_clip.size(2), config.input_size),
            device=video_clip.device)
        for t in range(video_clip.size(2)):
            frame = video_clip[:, :, t, :, :]  # [B, C, H, W]
            frame_feature = self.featureExtractor(frame)
            frameFeatures[:, t, :] = frame_feature
        # x is the LSTM output: (batch, seq_len, hidden_size)
        x, _ = self.lstm(frameFeatures)
        # linear layer: (batch, seq_len, hidden_size) -> (batch, seq_len, classNum)
        x = self.linearLayer(x)
        # average over the time dimension -> (batch, classNum)
        x = torch.mean(x, dim=1)
        return x


if __name__ == '__main__':
    model = LRCN()
    # a random clip of shape [B, C, T, H, W]; AlexNet expects 227x227 inputs
    frames = torch.rand(config.BATCH_SIZE, 3, config.seq_length, 227, 227)
    output = model(frames)
    print(output.size())  # expected: [config.BATCH_SIZE, config.classNum]
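
For reference, model.py reads its hyperparameters from the config module. Below is a minimal config.py sketch: the attribute names are exactly the ones used above, but the values are illustrative assumptions, not the repository's actual settings.

# config.py (hypothetical sketch; values are placeholders, not the authors' settings)
input_size = 4096      # AlexNet fc6 feature dimension fed to the LSTM
hidden_size = 256      # LSTM hidden state size (assumed)
num_of_layers = 2      # stacked LSTM layers; dropout=0.9 only takes effect when > 1
classNum = 101         # number of action classes, e.g. UCF-101 (assumed)
BATCH_SIZE = 4         # clips per batch (assumed)
seq_length = 16        # frames sampled per clip (assumed)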