emotion_detection.py
# HOW TO RUN
# python3 emotion_detection.py -i video/novak_djokovic.mp4 --model output/model.pth --prototxt model/deploy.prototxt.txt --caffemodel model/res10_300x300_ssd_iter_140000_fp16.caffemodel
# import the necessary libraries
from torchvision.transforms import ToPILImage
from torchvision.transforms import Grayscale
from torchvision.transforms import ToTensor
from torchvision.transforms import Resize
from torchvision import transforms
from neuraspike import EmotionNet
import torch.nn.functional as nnf
from neuraspike import utils
import numpy as np
import argparse
import torch
import cv2
# initialize the argument parser and establish the arguments required
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--video", type=str, required=True,
help="path to the video file/ webcam")
parser.add_argument("-m", "--model", type=str, required=True,
help="path to the trained model")
parser.add_argument('-p', '--prototxt', type=str, required=True,
help='Path to deployed prototxt.txt model architecture file')
parser.add_argument('-c', '--caffemodel', type=str, required=True,
help='Path to Caffe model containing the weights')
parser.add_argument("-conf", "--confidence", type=int, default=0.5,
help="the minimum probability to filter out weak detection")
args = vars(parser.parse_args())
# load our serialized model from disk
print("[INFO] loading model...")
net = cv2.dnn.readNetFromCaffe(args['prototxt'], args['caffemodel'])
# check if gpu is available or not
device = "cuda" if torch.cuda.is_available() else "cpu"
# dictionary mapping for different outputs
emotion_dict = {0: "Angry", 1: "Fearful", 2: "Happy", 3: "Neutral",
                4: "Sad", 5: "Surprised"}
# load the emotionNet weights
model = EmotionNet(num_of_channels=1, num_of_classes=len(emotion_dict))
model_weights = torch.load(args["model"], map_location=device)
model.load_state_dict(model_weights)
model.to(device)
model.eval()
# initialize a list of preprocessing steps to apply on each image during runtime
data_transform = transforms.Compose([
    ToPILImage(),
    Grayscale(num_output_channels=1),
    Resize((48, 48)),
    ToTensor()
])
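# note: with these transforms, an (H, W, 3) RGB uint8 face crop becomes a
# (1, 48, 48) float tensor with values scaled to [0, 1], i.e. the single-channel
# 48x48 input that EmotionNet (num_of_channels=1) expects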
# initialize the video stream
vs = cv2.VideoCapture(args['video'])
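# a minimal sketch for webcam input (assumes the --video flag may hold a device
# index): cv2.VideoCapture also accepts an integer index, so something like
#   src = args['video']
#   vs = cv2.VideoCapture(int(src)) if src.isdigit() else cv2.VideoCapture(src)
# would cover both a file path and a webcam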
# iterate over frames from the video file stream
while True:
    # read the next frame from the input stream
    (grabbed, frame) = vs.read()

    # check if there is a frame to be grabbed from the stream
    if not grabbed:
        break

    # resize the frame, clone it for drawing, and convert it from BGR to RGB
    frame = utils.resize_image(frame, width=720, height=720)
    output = frame.copy()
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # initialize an empty canvas to output the probability distributions
    canvas = np.zeros((300, 300, 3), dtype="uint8")

    # get the frame dimensions, resize the frame and convert it to a blob
    (h, w) = frame.shape[:2]
    blob = cv2.dnn.blobFromImage(cv2.resize(frame, (300, 300)), 1.0, (300, 300))

    # pass the blob through the face detector to get the detections
    net.setInput(blob)
    detections = net.forward()

    # iterate over the detections
    for i in range(0, detections.shape[2]):

        # grab the confidence associated with the model's prediction
        confidence = detections[0, 0, i, 2]

        # eliminate weak detections, ensuring the confidence is greater
        # than the pre-defined minimum confidence
        if confidence > args['confidence']:

            # compute the (x, y) coordinates (int) of the bounding box for the face
            box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
            (start_x, start_y, end_x, end_y) = box.astype("int")

            # grab the region of interest within the image (the face),
            # apply the same transforms used during training,
            # add a batch dimension (C, H, W) => (N, C, H, W) and send it to the device
            face = frame[start_y:end_y, start_x:end_x]
            face = data_transform(face)
            face = face.unsqueeze(0)
            face = face.to(device)

            # pass the face ROI through the pretrained model, compute the
            # probability score and class for the face, and look up the
            # readable emotion label
            with torch.no_grad():
                predictions = model(face)
            prob = nnf.softmax(predictions, dim=1)
            top_p, top_class = prob.topk(1, dim=1)
            top_p, top_class = top_p.item(), top_class.item()

            # grab the list of predictions along with their associated labels
            emotion_prob = [p.item() for p in prob[0]]
            emotion_value = emotion_dict.values()

            # draw the probability distribution on the canvas initialized earlier
            for (j, (emotion, probability)) in enumerate(zip(emotion_value, emotion_prob)):
                prob_text = f"{emotion}: {probability * 100:.2f}%"
                width = int(probability * 300)
                cv2.rectangle(canvas, (5, (j * 50) + 5), (width, (j * 50) + 50),
                              (0, 0, 255), -1)
                cv2.putText(canvas, prob_text, (5, (j * 50) + 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

            # draw the bounding box of the face along with the associated emotion
            # and probability
            face_emotion = emotion_dict[top_class]
            face_text = f"{face_emotion}: {top_p * 100:.2f}%"
            cv2.rectangle(output, (start_x, start_y), (end_x, end_y), (0, 255, 0), 2)
            y = start_y - 10 if start_y - 10 > 10 else start_y + 10
            cv2.putText(output, face_text, (start_x, y), cv2.FONT_HERSHEY_SIMPLEX,
                        1.05, (0, 255, 0), 2)

    # display the output to our screen
    cv2.imshow("Face", output)
    cv2.imshow("Emotion probability distribution", canvas)

    # break the loop if the `q` key is pressed
    key = cv2.waitKey(1) & 0xFF
    if key == ord("q"):
        break
# destroy all open windows and release the video stream
cv2.destroyAllWindows()
vs.release()