-
Notifications
You must be signed in to change notification settings - Fork 7
/
run_demo.py
101 lines (82 loc) · 3.53 KB
/
run_demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import torch
import numpy as np
from sync_batchnorm import convert_model
import models
import VGGSound
from VGGSound.test import args_dict
from VGGSound.model import AVENet
import torch.nn as nn
import librosa
from scipy import signal
from util import get_sr, get_audio_count, get_conf, get_visual_count, read_video
# Seed NumPy and PyTorch so repeated runs of the demo start from the same state.
np.random.seed(0)
torch.manual_seed(0)
# cuDNN is switched off for the whole run. The trailing number is the original
# author's note (presumably the score obtained with this setting -- TODO confirm).
torch.backends.cudnn.enabled = False # 0.811
# NOTE: `device` only decides whether the visual models are wrapped in
# DataParallel further down. Every model is still moved with an unconditional
# `.cuda()`, so this script needs a CUDA device to run end to end.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#-----------------------------------------------load models trained on Countix-AV-------------------------------------------
# Visual counting network: an r2plus1d_18 backbone whose head is replaced by
# two linear layers (`fc` and an extra `fc2`).
# The 34 in `34*41` presumably matches the 34 candidate values held in
# `tensor` below (arange(2, 36)) -- TODO confirm against util.get_visual_count.
model = models.video.r2plus1d_18(pretrained=True)
model.fc = torch.nn.Linear(512, 34*41)
model.fc2 = torch.nn.Linear(512, 41)
model = convert_model(model)

# Audio counting network, with its heads replaced the same way.
# The 18 in `18*43` presumably matches `audio_tensor` (arange(2, 20)) -- TODO confirm.
args = args_dict()
audio_model = AVENet(args)
audio_model.audnet.fc = torch.nn.Linear(512, 18*43)
audio_model.audnet.fc2 = torch.nn.Linear(512, 43)
# Checkpoints are deserialized onto the CPU first so loading does not depend on
# the GPU index they were saved from; load_state_dict copies the values onto
# whatever device the model's parameters live on.
checkpoint = torch.load("audio_checkpoint.pt", map_location="cpu")
audio_model.load_state_dict(checkpoint['state_dict'])
audio_model = audio_model.cuda()
audio_model.eval()

# Audio / visual sub-networks passed to util.get_sr (weights loaded below from
# sr_checkpoint.pt).
a_sr_model = VGGSound.models.resnet.SubNet()
a_sr_model = a_sr_model.cuda()
v_sr_model = models.video.resnet.SubNet()
v_sr_model.fc = nn.Sequential(torch.nn.Linear(1024, 41))
v_sr_model = convert_model(v_sr_model)

# Sub-network passed to util.get_conf; its head outputs two values.
conf_model = VGGSound.models.resnet.ConfSubNet(lambda1=14)
conf_model.fc = nn.Sequential(nn.Dropout(0.27), torch.nn.Linear(1024, 2),)
checkpoint = torch.load("conf_checkpoint.pt", map_location="cpu")
conf_model.load_state_dict(checkpoint['v_state_dict'])
conf_model = conf_model.cuda()
conf_model.eval()

# The visual models are wrapped in DataParallel *before* their weights are
# loaded, so the state-dict keys seen by load_state_dict are those of the
# wrapped module.
if device.type == "cuda":
    model = torch.nn.DataParallel(model)
    v_sr_model = torch.nn.DataParallel(v_sr_model)
model = model.cuda()
# Move v_sr_model explicitly as well, like every other model in this script,
# instead of relying on DataParallel to place the wrapped module.
v_sr_model = v_sr_model.cuda()
checkpoint = torch.load("visual_checkpoint.pt", map_location="cpu")
model.load_state_dict(checkpoint['state_dict'])
model.eval()
# One checkpoint file holds both sub-network state dicts.
checkpoint = torch.load("sr_checkpoint.pt", map_location="cpu")
v_sr_model.load_state_dict(checkpoint['sr_state_dict'])
v_sr_model.eval()
a_sr_model.load_state_dict(checkpoint['a_sr_state_dict'])
a_sr_model.eval()

# Value ranges handed to the counting helpers: 2..35 with a leading axis of
# size 1 for the visual stream, 2..19 for the audio stream.
tensor = torch.Tensor(np.arange(2,36)).type(torch.FloatTensor).cuda().unsqueeze(0)
audio_tensor = torch.Tensor(np.arange(2,20).astype(np.float32)).cuda()
# Load the demo clip and normalise its pixel values channel by channel.
video_path = "eEG7ZOQG6LM.00.mp4"
# Per-channel mean / std, shaped (1, 1, 1, 3) so they broadcast over the last
# axis of the clip (assumes read_video returns channels-last frames -- TODO confirm).
channel_mean = np.array([0.485, 0.456, 0.406]).reshape((1, 1, 1, 3))
channel_std = np.array([0.229, 0.224, 0.225]).reshape((1, 1, 1, 3))
video1, fps = read_video(video_path)
# Scale by 1/255 first, then standardise with the statistics above.
video1 = (video1 / 255.0 - channel_mean) / channel_std
# The clip was exported with some background context around the repetitive
# action; keep only indices 10..264 along the first axis so that context is
# dropped.
video1 = video1[10:265]
#--------------------------------------------------------------------------------------------
# Turn the clip's audio track into a standardised log-spectrogram.
filename = "eEG7ZOQG6LM.00.wav"
# No sampling rate is requested, so librosa applies its own default; the rate
# it reports back is forwarded to SciPy.
y, sr = librosa.load(filename)
# Only the spectrogram values are used; the frequency and time axes are ignored.
_freqs, _times, spectrogram = signal.spectrogram(y, fs=sr, nperseg=512, noverlap=353)
# The small offset keeps log() finite where the spectrogram is exactly zero.
log_spectrogram = np.log(spectrogram + 1e-7)
# Standardise with the global mean / std; the epsilon guards against a zero std.
spectrogram_ori = (log_spectrogram - np.mean(log_spectrogram)) / (np.std(log_spectrogram) + 1e-9)
#--------------------------------------------------------------------------------------------
# Run the pipeline: get the sample rate, get one count per modality, then
# blend the two counts with the weight returned by get_conf.
sample_rate = get_sr(spectrogram_ori, video1, model, audio_model, a_sr_model, v_sr_model)
audio_count = get_audio_count(spectrogram_ori, audio_model, audio_tensor)
visual_count = get_visual_count(video1, sample_rate, model, tensor)
conf = get_conf(spectrogram_ori, audio_count, sample_rate, video1, audio_model, model, conf_model)
# `conf` weights the visual count and (1 - conf) weights the audio count; the
# blend is rounded to the nearest integer.
outputs = np.round(conf * visual_count + (1 - conf) * audio_count)

# Compare against the hard-coded ground-truth count for this demo clip.
groundtruth = 8
abs_err = abs(outputs - groundtruth)
# Absolute error relative to the ground-truth count.
mae_err = abs_err / groundtruth
# OBO is 1 when the prediction is within one repetition of the ground truth.
OBO = 1 if abs_err <= 1 else 0
print(mae_err, OBO)