moma_sacts.py
from momaapi import MOMAAPI
import torch
from mmpt.models import MMPTModel
import skvideo.io
import numpy as np
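
# Zero-shot classification of MOMA sub-activities with VideoCLIP: each clip is
# scored against every candidate caption and the highest-scoring caption wins.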
moma = MOMAAPI('../../data/moma')
moma_sacts = [
    "the barber is applying hair products or shaving cream",
    "the adult is feeding the child",
    "the adult is holding the child on the bike",
    "no meaning to see if length effects anything",
    "a bunch of random filler words with no meaning to see if length effects anything",
    "a bunch of random filler words",
    # "the firefighters are extinguishing fire",
]
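
# The "filler words" entries above are deliberate distractors: they probe
# whether caption length alone shifts the similarity scores.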
model, tokenizer, aligner = MMPTModel.from_pretrained(
    "projects/retri/videoclip/how2.yaml")
model.eval().to('cuda')  # eval mode + GPU once, rather than once per category
print(moma_sacts)
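
# For each class: fetch up to num_examples of its clips, score each clip
# against every caption above, and count how often the true class wins.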
for correct_idx, activity in enumerate(moma_sacts):
    print("Category:", activity)
    sact_ids = moma.get_ids_sact(cnames_sact=[activity])
    print("Number in category:", len(sact_ids))
    num_examples = 10
    num_correct = 0
    paths = moma.get_paths(ids_sact=sact_ids[:num_examples])
    print("Number of paths:", len(paths))
    for path in paths:
        videodata = skvideo.io.vread(path)  # (L, H, W, C) uint8 frames
        L, H, W, C = videodata.shape
        if L % 30 != 0:
            # Trim leading frames so the length is a whole number of 30-frame chunks
            extra_frames = L % 30
            videodata = videodata[extra_frames:]
            L = len(videodata)
        if L > 240:
            # Keep the middle 240 frames (8 seconds at 30 fps)
            videodata = videodata[L//2 - 120:L//2 + 120]
        videodata = np.reshape(videodata, (1, -1, 30, H, W, C))
        # B, T, FPS, H, W, C (VideoCLIP's S3D backbone is trained on 30-fps video)
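        # Each 30-frame chunk becomes one 1-second segment for the S3D
        # backbone, so a clip contributes at most 240 / 30 = 8 segments.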
        video_frames = torch.from_numpy(videodata / 255.0).cuda().float()
        scores = []
        for text in moma_sacts:
            caps, cmasks = aligner._build_text_seq(
                tokenizer(text, add_special_tokens=False)["input_ids"]
            )
            caps, cmasks = caps[None, :].cuda(), cmasks[None, :].cuda()  # bsz=1
            with torch.no_grad():
                output = model(video_frames, caps, cmasks, return_score=True)
            scores.append(output["score"].item())  # dot-product similarity
        pred = np.argmax(scores)
        if pred == correct_idx:
            num_correct += 1
        print("Predicted class:", moma_sacts[pred])
    # Divide by the number of clips actually evaluated (a class may have
    # fewer than num_examples clips, or none at all for the filler captions).
    print("Accuracy for class", activity, num_correct / max(len(paths), 1))
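
# Assumes the script is run from fairseq's examples/MMPT directory (so the
# how2.yaml config path resolves) and that MOMA is downloaded at ../../data/moma.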