From 7dd2235be2d033a8f3a343287e1529678b7c5eff Mon Sep 17 00:00:00 2001
From: Aditya Chakraborty
Date: Sun, 12 Jul 2020 13:20:22 -0400
Subject: [PATCH 1/6] Added evaluator for detector and classifier

---
 eval/evaluator.py | 373 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 373 insertions(+)
 create mode 100644 eval/evaluator.py

diff --git a/eval/evaluator.py b/eval/evaluator.py
new file mode 100644
index 00000000..94bf9a02
--- /dev/null
+++ b/eval/evaluator.py
@@ -0,0 +1,373 @@
+import os
+import cv2
+import argparse
+import torch
+import time
+import warnings
+import json
+import numpy as np
+
+from face_detector import FaceDetector, classify
+
+
+class Evaluator():
+    def __init__(self, cuda, detector, classifier, input_directory):
+        '''
+        This class evaluates face detection and goggle classification performance.
+        Goggle classification accuracy is given by the average class accuracy and by
+        individual video accuracy.
+        Face detection accuracy is given by precision and recall values.
+
+        Parameters:
+            cuda: A bool value that specifies whether cuda shall be used
+            detector: A string path to a .pth weights file for a face detection model
+            classifier: A string path to a .pth weights file for a goggle classification model
+            input_directory: Directory containing test videos to run the Evaluator on
+        '''
+        if cuda and torch.cuda.is_available():
+            torch.set_default_tensor_type('torch.cuda.FloatTensor')
+            self.device = torch.device('cuda:0')
+        else:
+            torch.set_default_tensor_type('torch.FloatTensor')
+            self.device = torch.device('cpu')
+
+        if os.path.exists("test_results/det_results_ideal.txt"):
+            os.remove("test_results/det_results_ideal.txt")
+
+        self.detector = FaceDetector(trained_model=detector, cuda=cuda and torch.cuda.is_available(),
+                                     set_default_dev=True)
+        self.classifier = torch.load(classifier, map_location=self.device)
+        self.classifier.eval()
+        self.video_filenames = self.get_video_files(input_directory)
+        self.results = {'Goggles': {'average_class_accuracy': 0.0,
+                                    'number_of_videos': 0,
+                                    'individual_video_results': {}},
+                        'Glasses': {'average_class_accuracy': 0.0,
+                                    'number_of_videos': 0,
+                                    'individual_video_results': {}},
+                        'Neither': {'average_class_accuracy': 0.0,
+                                    'number_of_videos': 0,
+                                    'individual_video_results': {}}}
+        self.class_label = ''
+        self.condition = ''
+        self.cap = ''
+        self.video = ''
+
+        self.evaluate()
+
+    def evaluate(self):
+        '''
+        This method evaluates every video file in the input directory containing test videos.
+        It stores all the results in a dict called self.results as it calls the record_results
+        method. To understand the format of the self.results dict, check the class constructor.
+        '''
+        total_videos_processed = 0
+        for video_file in self.video_filenames:
+            self.video = video_file
+            print(f"Processing {self.video} ...")
+
+            self.class_label = self.get_class_label()
+            self.condition = self.get_condition()
+            self.cap = cv2.VideoCapture(self.video)
+            if self.cap.isOpened():
+                classification_result = self.evaluate_classifications()  # (accuracy, average inference time)
+                self.record_results(classification_result)
+                total_videos_processed += 1
+                print(f"{self.video} : Done")
+            else:
+                print(f"Unable to open video {self.video}")
+                continue
+
+        self.calculate_average_class_accuracy()
+        detection_results = self.evaluate_detections('ground_truth_detections_ideal/',
+                                                     "test_results/det_results_ideal.txt")
+
+        print(f"\n{total_videos_processed} videos processed!")
+
+    def calculate_average_class_accuracy(self):
+        '''
+        This method calculates the average class accuracy for each class and stores it in
+        the self.results dict.
+        '''
+        for class_label in self.results:
+            if self.results[class_label]['number_of_videos'] > 0:
+                self.results[class_label]['average_class_accuracy'] /= self.results[class_label]['number_of_videos']
+
+    def record_results(self, result):
+        '''
+        This method records all the results in the self.results dict
+        '''
+        self.results[self.class_label]['number_of_videos'] += 1
+        self.results[self.class_label]['average_class_accuracy'] += result[0]
+        self.results[self.class_label]['individual_video_results'][self.video] = {}
+        self.results[self.class_label]['individual_video_results'][self.video]["accuracy"] = result[0]
+        self.results[self.class_label]['individual_video_results'][self.video]["inference_time"] = result[1]
+        self.results[self.class_label]['individual_video_results'][self.video]["condition"] = self.condition
+
+    def record_detections(self, file, detections):
+        f = open(file, "a+")
+        for detection in detections:
+            for element in detection:
+                f.write(str(element))
+                f.write("|")
+            f.write("\n")
+        f.close()
+
+    def infer(self):
+        '''
+        This method performs inference on a video (frame by frame) using the face detection
+        and goggle classification models.
+        It returns:
+        1) inference_dict, which contains the number of inferences for each class.
+        2) average_inference_time, a float containing the average inference time for the
+           whole video.
+        '''
+        bboxes = []
+        inference_dict = {"Goggles": 0, "Glasses": 0, "Neither": 0}
+        frame_counter = 0
+        start_time = time.time()
+
+        while True:
+            ret, img = self.cap.read()
+            if not ret:
+                break
+
+            frame_id = self.video.strip('.avi').strip('.mp4').strip('.MOV').strip('.mov').split('/')[-1] + "_" + str(frame_counter)
+            boxes = self.detector.detect(img)  # Each box also contains a confidence score
+            preds = []
+            for box in boxes:
+                x1, y1, x2, y2, conf = [b for b in box]
+                x1 = max(0, x1)
+                y1 = max(0, y1)
+                x2 = min(img.shape[1], x2)
+                y2 = min(img.shape[0], y2)
+
+                if isinstance(conf, torch.Tensor):  # This is true for ssd
+                    conf = conf.numpy()
+
+                face = img[int(y1):int(y2), int(x1):int(x2), :]
+                label, softlabels = classify(face, self.classifier, self.device)
+                preds.append(label.item())
+
+                bboxes.append([frame_id, x1, y1, x2, y2, conf])
+
+            inference_dict["Goggles"] += preds.count(1)
+            inference_dict["Glasses"] += preds.count(0)
+            inference_dict["Neither"] += preds.count(2)
+
+            frame_counter += 1
+
+        total_time = time.time() - start_time
+        if frame_counter > 0:
+            average_inference_time = total_time / frame_counter
+        else:
+            average_inference_time = -1  # Empty video file
+
+        self.record_detections("test_results/det_results_ideal.txt", bboxes)
+        return inference_dict, average_inference_time
+
+    def get_class_label(self):
+        '''
+        Get the class label [Goggles / Glasses / Neither] that the video belongs to
+        '''
+        if '/Goggles/' in self.video or '/goggles/' in self.video:
+            class_label = 'Goggles'
+        elif '/Glasses/' in self.video or '/glasses/' in self.video:
+            class_label = 'Glasses'
+        else:
+            class_label = 'Neither'
+
+        return class_label
+
+    def get_condition(self):
+        '''
+        Get the condition [Ideal, low_lighting etc.] that the video belongs to
+        '''
+        return self.video.split('/')[-2]
+
+    def evaluate_classifications(self):
+        '''
+        This method returns the accuracy (percentage of correct predictions) of the
+        predictions for a video
+        '''
+        inferences, inference_time = self.infer()
+        if sum(inferences.values()) == 0:
+            percentage_of_correct_predictions = 0
+        else:
+            percentage_of_correct_predictions = inferences[self.class_label] / sum(inferences.values())
+
+        return percentage_of_correct_predictions, inference_time
+
+    def get_ground_truth_detections(self, directory):
+        GT = {}
+        for file in os.listdir(directory):
+            f = open(directory + file, "r")
+            key = file.strip('.txt')
+            content = f.readlines()
+            f.close()
+
+            content = [list(map(float, x.strip(' \n').split(' '))) for x in content]
+            GT[key] = content
+
+        return GT
+
+    def evaluate_detections(self, annotations_location, detection_location, ovthresh=0.5):
+        '''
+        This method calculates the recall and precision of face detection for a video
+        '''
+        GT_detections = self.get_ground_truth_detections(annotations_location)
+        with open(detection_location, 'r') as f:
+            lines = f.readlines()
+
+        total_GT = 0
+        for frame_id in GT_detections:
+            total_GT += len(GT_detections[frame_id])
+
+        if any(lines):
+            splitlines = [x.strip().split('|') for x in lines]
+            image_ids = [x[0] for x in splitlines]
+            confidence = np.array([float(x[5]) for x in splitlines])
+            BB = np.array([[float(z) for z in x[1:5]] for x in splitlines])
+
+            # sort by confidence
+            sorted_ind = np.argsort(-confidence)
+            BB = BB[sorted_ind, :]
+            image_ids = [image_ids[x] for x in sorted_ind]
+
+            nd = len(image_ids)
+            tp = np.zeros(nd)
+            fp = np.zeros(nd)
+
+            for d in range(nd):
+                try:
+                    R = GT_detections[image_ids[d]]
+                    bb = BB[d, :].astype(float)
+                    ovmax = -np.inf
+                    BBGT = np.asarray(R, dtype=np.float32)
+                    if BBGT.size > 0:
+                        ixmin = np.maximum(BBGT[:, 0], bb[0])
+                        iymin = np.maximum(BBGT[:, 1], bb[1])
+                        ixmax = np.minimum(BBGT[:, 2], bb[2])
+                        iymax = np.minimum(BBGT[:, 3], bb[3])
+                        iw = np.maximum(ixmax - ixmin, 0.)
+                        ih = np.maximum(iymax - iymin, 0.)
+                        inters = iw * ih
+                        uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) +
+                               (BBGT[:, 2] - BBGT[:, 0]) *
+                               (BBGT[:, 3] - BBGT[:, 1]) - inters)
+                        overlaps = inters / uni
+                        ovmax = np.max(overlaps)
+
+                    if ovmax > ovthresh:
+                        tp[d] = 1.
+                    else:
+                        fp[d] = 1.
+                except KeyError:
+                    continue
+
+            print("total_GT: ", total_GT)
+            fp = np.cumsum(fp)
+            tp = np.cumsum(tp)
+            rec = tp / float(total_GT)
+            # avoid divide by zero in case the first detection matches a difficult
+            # ground truth
+            prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+        else:
+            rec = -1.
+            prec = -1.
+
+        print("precision: ", prec)
+        print("recall: ", rec)
+
+        return prec, rec
+
+    def get_video_files(self, input_directory):
+        '''
+        This method gets all the video files in the input directory
+        '''
+        filenames = []
+        for dirName, subdirList, fileList in os.walk(input_directory):
+            for filename in fileList:
+                ext = '.' + filename.split('.')[-1]
+                if ext in ['.mov', '.mp4', '.avi', '.MOV']:
+                    filenames.append(dirName + '/' + filename)
+
+        return filenames
+
+    def get_evaluator_results(self):
+        '''
+        This method returns the dict containing all the test results (self.results)
+        '''
+        return self.results
+
+
+def main():
+    if not args.input_directory:
+        raise Exception("Invalid input directory")
+    evaluator = Evaluator(args.cuda, args.detector, args.classifier, args.input_directory)
+    individual_video_results = evaluator.get_evaluator_results()
+
+    with open(args.output_file, 'w') as json_file:
+        json.dump(individual_video_results, json_file, indent=4)
+
+    print(f"\nOutput saved at {args.output_file}")
+
+
+if __name__ == "__main__":
+    warnings.filterwarnings("once")
+    parser = argparse.ArgumentParser(description="Face detection")
+    parser.add_argument('--detector', '-t', type=str, default='model_weights/blazeface.pth', help="Path to a trained face detector .pth file")
+    parser.add_argument('--cuda', '-c', default=False, action='store_true', help="Enable cuda")
+    parser.add_argument('--classifier', default='model_weights/ensemble_100epochs.pth', type=str, help="Path to a trained classifier .pth file")
+    parser.add_argument('--output_file', type=str, default='test_results/test1.json', help="Path to a file to store the evaluation log")
+    parser.add_argument('--input_directory', type=str, help="Path to a directory containing video files")
+    parser.add_argument('--annotation_path', type=str, help="Path to annotation files")
+
+    args = parser.parse_args()
+
+    main()
+
+    exit()

From fa6707be0f1240a394076c0b52b6360c12bc0502 Mon Sep 17 00:00:00 2001
From: Aditya Chakraborty
Date: Sun, 12 Jul 2020 14:56:10 -0400
Subject: [PATCH 2/6] Fixed bugs

---
 src/jetson/main.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/jetson/main.py b/src/jetson/main.py
index 54ceee25..178a55ae 100644
--- a/src/jetson/main.py
+++ b/src/jetson/main.py
@@ -132,6 +132,7 @@ def detect(self,
             xmin = detections[i, 1] * image.shape[1]
             ymax = detections[i, 2] * image.shape[0]
             xmax = detections[i, 3] * image.shape[1]
+            conf = detections[i, 16]
 
             img = img / 127.5 - 1.0
 
@@ -139,7 +140,7 @@ def detect(self,
                 kp_x = detections[i, 4 + k * 2] * img.shape[1]
                 kp_y = detections[i, 4 + k * 2 + 1] * img.shape[0]
 
-            bboxes.append((xmin, ymin, xmax, ymax))
+            bboxes.append((xmin, ymin, xmax, ymax, conf))
 
         return bboxes
 
@@ -154,9 +155,7 @@ def detect(self,
             boxes, scores = postprocess(boxes, conf, self.image_shape, self.detection_threshold, self.resize)
             dets = do_nms(boxes, scores, infer_params["nms_thresh"])
 
-            bboxes = []
-            for det in dets:
-                bboxes.append(tuple(dets[0][0:4]))
+            bboxes = [tuple(det[0:4]) for det in dets]
 
             return bboxes
 
@@ -197,14 +196,16 @@ def close(self):
         self.t1.join()
 
 class Classifier:
-    def __init__(self, classifier):
+    def __init__(self, classifier, cuda:bool):
         '''
        Performs classification of facial region into three classes - [Goggles, Glasses, Neither]
        Args:
            classifier - Trained classifier model (Currently, mobilenetv2)
+           cuda - True if an Nvidia GPU is used
        '''
         self.fps = 0
         self.classifier = classifier
+        self.device = cuda
 
     def classifyFace(self,
@@ -233,7 +234,7 @@ def classifyFace(self,
         ])
         transformed_face = transform(pil_face)
         face_batch = transformed_face.unsqueeze(0)
-        device = torch.device("cuda:0" if args.cuda and torch.cuda.is_available() else "cpu")
+        device = torch.device("cuda:0" if self.device and torch.cuda.is_available() else "cpu")
         with torch.no_grad():
             face_batch = face_batch.to(device)
             labels = classifier(face_batch)
 
@@ -405,7 +406,7 @@ def drawFrame(boxes, frame, fps):
 
     capturer = VideoCapturer()
     detector = FaceDetector(detector=args.detector, cuda=args.cuda and torch.cuda.is_available(), set_default_dev=True)
-    classifier = Classifier(g)
+    classifier = Classifier(g, args.cuda)
     encryptor = Encryptor()
 
     run_face_detection: bool = True

From 4b85b361661a9a08107e93d8a317f28d2c75f586 Mon Sep 17 00:00:00 2001
From: Aditya Chakraborty
Date: Sun, 12 Jul 2020 18:11:56 -0400
Subject: [PATCH 3/6] Made necessary changes to main to run evaluator

---
 eval/evaluator.py  | 373 ---------------------------------------------
 src/jetson/main.py |   9 +-
 2 files changed, 5 insertions(+), 377 deletions(-)
 delete mode 100644 eval/evaluator.py

diff --git a/eval/evaluator.py b/eval/evaluator.py
deleted file mode 100644
index 94bf9a02..00000000
--- a/eval/evaluator.py
+++ /dev/null
@@ -1,373 +0,0 @@
-import os
-import cv2
-import argparse
-import torch
-import time
-import warnings
-import json
-import numpy as np
-
-from face_detector import FaceDetector, classify
-
-
-class Evaluator():
-    def __init__(self, cuda, detector, classifier, input_directory):
-        '''
-        This class evaluates face detection and goggle classification performance.
-        Goggle classification accuracy is given by the average class accuracy and by
-        individual video accuracy.
-        Face detection accuracy is given by precision and recall values.
-
-        Parameters:
-            cuda: A bool value that specifies whether cuda shall be used
-            detector: A string path to a .pth weights file for a face detection model
-            classifier: A string path to a .pth weights file for a goggle classification model
-            input_directory: Directory containing test videos to run the Evaluator on
-        '''
-        if cuda and torch.cuda.is_available():
-            torch.set_default_tensor_type('torch.cuda.FloatTensor')
-            self.device = torch.device('cuda:0')
-        else:
-            torch.set_default_tensor_type('torch.FloatTensor')
-            self.device = torch.device('cpu')
-
-        if os.path.exists("test_results/det_results_ideal.txt"):
-            os.remove("test_results/det_results_ideal.txt")
-
-        self.detector = FaceDetector(trained_model=detector, cuda=cuda and torch.cuda.is_available(),
-                                     set_default_dev=True)
-        self.classifier = torch.load(classifier, map_location=self.device)
-        self.classifier.eval()
-        self.video_filenames = self.get_video_files(input_directory)
-        self.results = {'Goggles': {'average_class_accuracy': 0.0,
-                                    'number_of_videos': 0,
-                                    'individual_video_results': {}},
-                        'Glasses': {'average_class_accuracy': 0.0,
-                                    'number_of_videos': 0,
-                                    'individual_video_results': {}},
-                        'Neither': {'average_class_accuracy': 0.0,
-                                    'number_of_videos': 0,
-                                    'individual_video_results': {}}}
-        self.class_label = ''
-        self.condition = ''
-        self.cap = ''
-        self.video = ''
-
-        self.evaluate()
-
-    def evaluate(self):
-        '''
-        This method evaluates every video file in the input directory containing test videos.
-        It stores all the results in a dict called self.results as it calls the record_results
-        method. To understand the format of the self.results dict, check the class constructor.
-        '''
-        total_videos_processed = 0
-        for video_file in self.video_filenames:
-            self.video = video_file
-            print(f"Processing {self.video} ...")
-
-            self.class_label = self.get_class_label()
-            self.condition = self.get_condition()
-            self.cap = cv2.VideoCapture(self.video)
-            if self.cap.isOpened():
-                classification_result = self.evaluate_classifications()  # (accuracy, average inference time)
-                self.record_results(classification_result)
-                total_videos_processed += 1
-                print(f"{self.video} : Done")
-            else:
-                print(f"Unable to open video {self.video}")
-                continue
-
-        self.calculate_average_class_accuracy()
-        detection_results = self.evaluate_detections('ground_truth_detections_ideal/',
-                                                     "test_results/det_results_ideal.txt")
-
-        print(f"\n{total_videos_processed} videos processed!")
-
-    def calculate_average_class_accuracy(self):
-        '''
-        This method calculates the average class accuracy for each class and stores it in
-        the self.results dict.
-        '''
-        for class_label in self.results:
-            if self.results[class_label]['number_of_videos'] > 0:
-                self.results[class_label]['average_class_accuracy'] /= self.results[class_label]['number_of_videos']
-
-    def record_results(self, result):
-        '''
-        This method records all the results in the self.results dict
-        '''
-        self.results[self.class_label]['number_of_videos'] += 1
-        self.results[self.class_label]['average_class_accuracy'] += result[0]
-        self.results[self.class_label]['individual_video_results'][self.video] = {}
-        self.results[self.class_label]['individual_video_results'][self.video]["accuracy"] = result[0]
-        self.results[self.class_label]['individual_video_results'][self.video]["inference_time"] = result[1]
-        self.results[self.class_label]['individual_video_results'][self.video]["condition"] = self.condition
-
-    def record_detections(self, file, detections):
-        f = open(file, "a+")
-        for detection in detections:
-            for element in detection:
-                f.write(str(element))
-                f.write("|")
-            f.write("\n")
-        f.close()
-
-    def infer(self):
-        '''
-        This method performs inference on a video (frame by frame) using the face detection
-        and goggle classification models.
-        It returns:
-        1) inference_dict, which contains the number of inferences for each class.
-        2) average_inference_time, a float containing the average inference time for the
-           whole video.
-        '''
-        bboxes = []
-        inference_dict = {"Goggles": 0, "Glasses": 0, "Neither": 0}
-        frame_counter = 0
-        start_time = time.time()
-
-        while True:
-            ret, img = self.cap.read()
-            if not ret:
-                break
-
-            frame_id = self.video.strip('.avi').strip('.mp4').strip('.MOV').strip('.mov').split('/')[-1] + "_" + str(frame_counter)
-            boxes = self.detector.detect(img)  # Each box also contains a confidence score
-            preds = []
-            for box in boxes:
-                x1, y1, x2, y2, conf = [b for b in box]
-                x1 = max(0, x1)
-                y1 = max(0, y1)
-                x2 = min(img.shape[1], x2)
-                y2 = min(img.shape[0], y2)
-
-                if isinstance(conf, torch.Tensor):  # This is true for ssd
-                    conf = conf.numpy()
-
-                face = img[int(y1):int(y2), int(x1):int(x2), :]
-                label, softlabels = classify(face, self.classifier, self.device)
-                preds.append(label.item())
-
-                bboxes.append([frame_id, x1, y1, x2, y2, conf])
-
-            inference_dict["Goggles"] += preds.count(1)
-            inference_dict["Glasses"] += preds.count(0)
-            inference_dict["Neither"] += preds.count(2)
-
-            frame_counter += 1
-
-        total_time = time.time() - start_time
-        if frame_counter > 0:
-            average_inference_time = total_time / frame_counter
-        else:
-            average_inference_time = -1  # Empty video file
-
-        self.record_detections("test_results/det_results_ideal.txt", bboxes)
-        return inference_dict, average_inference_time
-
-    def get_class_label(self):
-        '''
-        Get the class label [Goggles / Glasses / Neither] that the video belongs to
-        '''
-        if '/Goggles/' in self.video or '/goggles/' in self.video:
-            class_label = 'Goggles'
-        elif '/Glasses/' in self.video or '/glasses/' in self.video:
-            class_label = 'Glasses'
-        else:
-            class_label = 'Neither'
-
-        return class_label
-
-    def get_condition(self):
-        '''
-        Get the condition [Ideal, low_lighting etc.] that the video belongs to
-        '''
-        return self.video.split('/')[-2]
-
-    def evaluate_classifications(self):
-        '''
-        This method returns the accuracy (percentage of correct predictions) of the
-        predictions for a video
-        '''
-        inferences, inference_time = self.infer()
-        if sum(inferences.values()) == 0:
-            percentage_of_correct_predictions = 0
-        else:
-            percentage_of_correct_predictions = inferences[self.class_label] / sum(inferences.values())
-
-        return percentage_of_correct_predictions, inference_time
-
-    def get_ground_truth_detections(self, directory):
-        GT = {}
-        for file in os.listdir(directory):
-            f = open(directory + file, "r")
-            key = file.strip('.txt')
-            content = f.readlines()
-            f.close()
-
-            content = [list(map(float, x.strip(' \n').split(' '))) for x in content]
-            GT[key] = content
-
-        return GT
-
-    def evaluate_detections(self, annotations_location, detection_location, ovthresh=0.5):
-        '''
-        This method calculates the recall and precision of face detection for a video
-        '''
-        GT_detections = self.get_ground_truth_detections(annotations_location)
-        with open(detection_location, 'r') as f:
-            lines = f.readlines()
-
-        total_GT = 0
-        for frame_id in GT_detections:
-            total_GT += len(GT_detections[frame_id])
-
-        if any(lines):
-            splitlines = [x.strip().split('|') for x in lines]
-            image_ids = [x[0] for x in splitlines]
-            confidence = np.array([float(x[5]) for x in splitlines])
-            BB = np.array([[float(z) for z in x[1:5]] for x in splitlines])
-
-            # sort by confidence
-            sorted_ind = np.argsort(-confidence)
-            BB = BB[sorted_ind, :]
-            image_ids = [image_ids[x] for x in sorted_ind]
-
-            nd = len(image_ids)
-            tp = np.zeros(nd)
-            fp = np.zeros(nd)
-
-            for d in range(nd):
-                try:
-                    R = GT_detections[image_ids[d]]
-                    bb = BB[d, :].astype(float)
-                    ovmax = -np.inf
-                    BBGT = np.asarray(R, dtype=np.float32)
-                    if BBGT.size > 0:
-                        ixmin = np.maximum(BBGT[:, 0], bb[0])
-                        iymin = np.maximum(BBGT[:, 1], bb[1])
-                        ixmax = np.minimum(BBGT[:, 2], bb[2])
-                        iymax = np.minimum(BBGT[:, 3], bb[3])
-                        iw = np.maximum(ixmax - ixmin, 0.)
-                        ih = np.maximum(iymax - iymin, 0.)
-                        inters = iw * ih
-                        uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) +
-                               (BBGT[:, 2] - BBGT[:, 0]) *
-                               (BBGT[:, 3] - BBGT[:, 1]) - inters)
-                        overlaps = inters / uni
-                        ovmax = np.max(overlaps)
-
-                    if ovmax > ovthresh:
-                        tp[d] = 1.
-                    else:
-                        fp[d] = 1.
-                except KeyError:
-                    continue
-
-            print("total_GT: ", total_GT)
-            fp = np.cumsum(fp)
-            tp = np.cumsum(tp)
-            rec = tp / float(total_GT)
-            # avoid divide by zero in case the first detection matches a difficult
-            # ground truth
-            prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
-        else:
-            rec = -1.
-            prec = -1.
-
-        print("precision: ", prec)
-        print("recall: ", rec)
-
-        return prec, rec
-
-    def get_video_files(self, input_directory):
-        '''
-        This method gets all the video files in the input directory
-        '''
-        filenames = []
-        for dirName, subdirList, fileList in os.walk(input_directory):
-            for filename in fileList:
-                ext = '.' + filename.split('.')[-1]
-                if ext in ['.mov', '.mp4', '.avi', '.MOV']:
-                    filenames.append(dirName + '/' + filename)
-
-        return filenames
-
-    def get_evaluator_results(self):
-        '''
-        This method returns the dict containing all the test results (self.results)
-        '''
-        return self.results
-
-
-def main():
-    if not args.input_directory:
-        raise Exception("Invalid input directory")
-    evaluator = Evaluator(args.cuda, args.detector, args.classifier, args.input_directory)
-    individual_video_results = evaluator.get_evaluator_results()
-
-    with open(args.output_file, 'w') as json_file:
-        json.dump(individual_video_results, json_file, indent=4)
-
-    print(f"\nOutput saved at {args.output_file}")
-
-
-if __name__ == "__main__":
-    warnings.filterwarnings("once")
-    parser = argparse.ArgumentParser(description="Face detection")
-    parser.add_argument('--detector', '-t', type=str, default='model_weights/blazeface.pth', help="Path to a trained face detector .pth file")
-    parser.add_argument('--cuda', '-c', default=False, action='store_true', help="Enable cuda")
-    parser.add_argument('--classifier', default='model_weights/ensemble_100epochs.pth', type=str, help="Path to a trained classifier .pth file")
-    parser.add_argument('--output_file', type=str, default='test_results/test1.json', help="Path to a file to store the evaluation log")
-    parser.add_argument('--input_directory', type=str, help="Path to a directory containing video files")
-    parser.add_argument('--annotation_path', type=str, help="Path to annotation files")
-
-    args = parser.parse_args()
-
-    main()
-
-    exit()
diff --git a/src/jetson/main.py b/src/jetson/main.py
index 178a55ae..be6d9a08 100644
--- a/src/jetson/main.py
+++ b/src/jetson/main.py
@@ -111,7 +111,8 @@ def detect(self,
             while j < detections.shape[2] and detections[0, 1, j, 0] > self.detection_threshold:
                 pt = (detections[0, 1, j, 1:] * scale).cpu().numpy()
                 x1, y1, x2, y2 = pt
-                bboxes.append((x1, y1, x2, y2))
+                conf = detections[0, 1, j, 0].item()
+                bboxes.append((x1, y1, x2, y2, conf))
                 j += 1
 
         return bboxes
@@ -155,7 +156,7 @@ def detect(self,
             boxes, scores = postprocess(boxes, conf, self.image_shape, self.detection_threshold, self.resize)
             dets = do_nms(boxes, scores, infer_params["nms_thresh"])
 
-            bboxes = [tuple(det[0:4]) for det in dets]
+            bboxes = [tuple(det[0:5]) for det in dets]
 
             return bboxes
@@ -259,7 +260,7 @@ def classifyFrame(self,
 
         label = []
         for box in boxes:
-            x1, y1, x2, y2 = [int(b) for b in box]
+            x1, y1, x2, y2 = [int(b) for b in box[0:4]]
             # draw boxes within the frame
             x1 = max(0, x1)
             y1 = max(0, y1)
@@ -308,7 +309,7 @@ def encryptFrame(self, img:np.ndarray,
             boxes: facial coordinates
         '''
         for box in boxes:
-            x1, y1, x2, y2 = [int(b) for b in box]
+            x1, y1, x2, y2 = [int(b) for b in box[0:4]]
             # draw boxes within the frame
             x1 = max(0, x1)
             y1 = max(0, y1)

From 6d0651616a84af5cd714cb179512f40eeae1c653 Mon Sep 17 00:00:00 2001
From: ZPBerg
Date: Sun, 12 Jul 2020 21:53:44 -0400
Subject: [PATCH 4/6] Create detector_type enum.
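
Select the face detector backend with an explicit DetectorType instead of
inferring it from the weights filename. As a minimal sketch of the intended
dispatch (the DetectorType names and string values come from this patch;
build_detector and its return values are illustrative only, and the sketch
normalizes the string-valued members):

    from enum import Enum

    class DetectorType(Enum):
        BLAZEFACE = 'blazeface'
        RETINAFACE = 'retinaface'
        SSD = 'ssd'

    def build_detector(detector_type: str) -> str:
        # DetectorType(...) raises ValueError for an unknown name, so a bad
        # --detector_type value fails fast instead of loading no model.
        dtype = DetectorType(detector_type)
        if dtype is DetectorType.SSD:
            return 'build SSD face detector'
        elif dtype is DetectorType.BLAZEFACE:
            return 'build BlazeFace face detector'
        return 'build RetinaFace face detector'

    print(build_detector('ssd'))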
---
 src/jetson/main.py | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/src/jetson/main.py b/src/jetson/main.py
index be6d9a08..c8641041 100644
--- a/src/jetson/main.py
+++ b/src/jetson/main.py
@@ -4,6 +4,7 @@
 from typing import List, Set, Dict, Tuple, Optional
 
 import cv2
+from enum import Enum
 from PIL import Image
 import numpy as np
 import torch
@@ -29,12 +30,20 @@
 fileCount = Value('i', 0)
 encryptRet = Queue() #Shared memory queue to allow child encryption process to return to parent
 
+
+class DetectorType(Enum):
+    BLAZEFACE = 'blazeface',
+    RETINAFACE = 'retinaface',
+    SSD = 'ssd'
+
+
 class FaceDetector:
-    def __init__(self, detector:str, detection_threshold=0.7, cuda=True, set_default_dev=False):
+    def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, cuda=True, set_default_dev=False):
         """
         Creates a FaceDetector object
         Args:
             detector: A string path to a trained pth file for a ssd model trained in face detection
+            detector_type: A DetectorType describing which face detector is being used
             detection_threshold: The minimum threshold for a detection to be considered valid
             cuda: Whether or not to enable CUDA
             set_default_dev: Whether or not to set the default device for PyTorch
@@ -42,18 +51,16 @@ def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, c
 
         self.device = torch.device("cpu")
 
-        if ('.pth' in detector and 'ssd' in detector):
-            from models.SSD.ssd import build_ssd
+        if detector_type == DetectorType.SSD:
+            from src.jetson.models.SSD.ssd import build_ssd
 
             self.net = build_ssd('test', 300, 2)
             self.model_name = 'ssd'
             self.net.load_state_dict(torch.load(detector, map_location=self.device))
             self.transformer = BaseTransform(self.net.size, (104, 117, 123))
 
-        elif ('.pth' in detector and 'blazeface' in detector):
-            from models.BlazeFace.blazeface import BlazeFace
-
+        elif detector_type == DetectorType.BLAZEFACE:
+            from src.jetson.models.BlazeFace.blazeface import BlazeFace
             self.net = BlazeFace(self.device)
             self.net.load_weights(detector)
@@ -63,8 +70,8 @@ def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, c
             self.net.min_suppression_threshold = 0.3
             self.transformer = BaseTransform(128, None)
 
-        elif ('.pth' in detector and 'mobile' in detector):
-            from models.Retinaface.retinaface import RetinaFace, load_model
+        elif detector_type == DetectorType.RETINAFACE:
+            from src.jetson.models.Retinaface.retinaface import RetinaFace, load_model
 
             self.net = RetinaFace(cfg=cfg, phase = 'test')
             self.net = load_model(self.net, detector, True)
@@ -76,6 +83,9 @@ def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, c
             priors = priorbox.forward()
             self.prior_data = priors.data
 
+        else:
+            print('Please include a valid detector type (\'blazeface\', \'ssd\', or \'retinaface\')')
+            exit(1)
 
         self.detection_threshold = detection_threshold
         if cuda and torch.cuda.is_available():
@@ -391,7 +401,9 @@ def drawFrame(boxes, frame, fps):
     warnings.filterwarnings("once")
     parser = argparse.ArgumentParser(description="Face detection")
-    parser.add_argument('--detector', '-t', type=str, required=True, help="Path to a trained ssd .pth file")
+    parser.add_argument('--detector', '-t', type=str, required=True, help="Path to a trained face detector .pth file")
+    parser.add_argument('--detector_type', '-d', type=str, required=True, help="Type of face detector. One of "
+                                                                               "blazeface, ssd, or retinaface.")
     parser.add_argument('--cuda', '-c', default=False, action='store_true', help="Enable cuda")
     parser.add_argument('--classifier', type=str, help="Path to a trained classifier .pth file")
     parser.add_argument('--write_imgs', default=False, help='Write images to output_dir')
@@ -406,7 +418,7 @@ def drawFrame(boxes, frame, fps):
     g.eval()
 
     capturer = VideoCapturer()
-    detector = FaceDetector(detector=args.detector, cuda=args.cuda and torch.cuda.is_available(), set_default_dev=True)
+    detector = FaceDetector(detector=args.detector, detector_type=args.detector_type, cuda=args.cuda and torch.cuda.is_available(), set_default_dev=True)
     classifier = Classifier(g, args.cuda)
     encryptor = Encryptor()
 
     run_face_detection: bool = True

From a5d5d0a8f7223ffbe1313727dfcf24e074f6d766 Mon Sep 17 00:00:00 2001
From: ZPBerg
Date: Mon, 13 Jul 2020 16:54:26 -0400
Subject: [PATCH 5/6] detector_type as an argument, compare to list of strings

---
 src/jetson/main.py | 90 +++++++++++++++++++++-------------------------
 1 file changed, 40 insertions(+), 50 deletions(-)

diff --git a/src/jetson/main.py b/src/jetson/main.py
index c8641041..232e75fc 100644
--- a/src/jetson/main.py
+++ b/src/jetson/main.py
@@ -28,13 +28,8 @@
 from models.Retinaface.data import cfg_inference as infer_params
 
 fileCount = Value('i', 0)
-encryptRet = Queue() #Shared memory queue to allow child encryption process to return to parent
-
-
-class DetectorType(Enum):
-    BLAZEFACE = 'blazeface',
-    RETINAFACE = 'retinaface',
-    SSD = 'ssd'
+encryptRet = Queue()  # Shared memory queue to allow child encryption process to return to parent
+DETECTOR_TYPES = ['blazeface', 'retinaface', 'ssd']
 
 
 class FaceDetector:
@@ -51,7 +46,7 @@ def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, c
 
         self.device = torch.device("cpu")
 
-        if detector_type == DetectorType.SSD:
+        if detector_type == 'ssd':
             from src.jetson.models.SSD.ssd import build_ssd
 
             self.net = build_ssd('test', 300, 2)
@@ -59,7 +54,7 @@ def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, c
             self.net.load_state_dict(torch.load(detector, map_location=self.device))
             self.transformer = BaseTransform(self.net.size, (104, 117, 123))
 
-        elif detector_type == DetectorType.BLAZEFACE:
+        elif detector_type == 'blazeface':
             from src.jetson.models.BlazeFace.blazeface import BlazeFace
 
             self.net = BlazeFace(self.device)
@@ -70,23 +65,19 @@ def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, c
             self.net.min_suppression_threshold = 0.3
             self.transformer = BaseTransform(128, None)
 
-        elif detector_type == DetectorType.RETINAFACE:
+        elif detector_type == 'retinaface':
             from src.jetson.models.Retinaface.retinaface import RetinaFace, load_model
 
-            self.net = RetinaFace(cfg=cfg, phase = 'test')
+            self.net = RetinaFace(cfg=cfg, phase='test')
             self.net = load_model(self.net, detector, True)
             self.model_name = 'retinaface'
-            self.image_shape = infer_params["image_shape"] #(H, W)
+            self.image_shape = infer_params["image_shape"]  # (H, W)
             self.resize = infer_params["resize"]
             self.transformer = BaseTransform((self.image_shape[1], self.image_shape[0]), (104, 117, 123))
             priorbox = PriorBox(cfg, image_size=self.image_shape)
             priors = priorbox.forward()
             self.prior_data = priors.data
 
-        else:
-            print('Please include a valid detector type (\'blazeface\', \'ssd\', or \'retinaface\')')
-            exit(1)
-
         self.detection_threshold = detection_threshold
         if cuda and torch.cuda.is_available():
@@ -98,7 +89,6 @@ def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, c
         self.net.to(self.device)
         self.net.eval()
 
-
     def detect(self,
                image: np.ndarray):
         """
@@ -110,7 +100,7 @@ def detect(self,
         The bounding boxes of the face(s) that were detected formatted (upper left corner(x, y) , lower right corner(x,y))
         """
-        if (self.model_name == 'ssd'):
+        if self.model_name == 'ssd':
             x = torch.from_numpy(self.transformer(image)[0]).permute(2, 0, 1)
             x = Variable(x.unsqueeze(0)).to(self.device)
             y = self.net(x)
@@ -127,7 +117,7 @@ def detect(self,
 
             return bboxes
 
-        elif (self.model_name == 'blazeface'):
+        elif self.model_name == 'blazeface':
             img = self.transformer(image)[0].astype(np.float32)
 
             detections = self.net.predict_on_image(img)
@@ -155,12 +145,11 @@ def detect(self,
 
             return bboxes
 
-
-        elif (self.model_name == 'retinaface'):
+        elif self.model_name == 'retinaface':
             img = (self.transformer(image)[0]).transpose(2, 0, 1)
             img = torch.from_numpy(img).unsqueeze(0)
-            loc, conf, _ = self.net(img) # forward pass: Returns bounding box location, confidence and facial landmark locations
-
+            loc, conf, _ = self.net(
+                img)  # forward pass: Returns bounding box location, confidence and facial landmark locations
             boxes = decode(loc.data.squeeze(0), self.prior_data, cfg['variance'])
 
             boxes, scores = postprocess(boxes, conf, self.image_shape, self.detection_threshold, self.resize)
@@ -171,9 +160,6 @@ def detect(self,
 
             return bboxes
 
-
-
-
 class VideoCapturer(object):
     def __init__(self, src=0):
         '''
@@ -189,7 +175,6 @@ def __init__(self, src=0):
         self.t1.daemon = True
         self.t1.start()
 
-
     def update(self):
         '''Get next frame in video stream'''
         while self.running.value:
@@ -206,8 +191,9 @@ def close(self):
         self.running.value = False
         self.t1.join()
 
+
 class Classifier:
-    def __init__(self, classifier, cuda:bool):
+    def __init__(self, classifier, cuda: bool):
         '''
         Performs classification of facial region into three classes - [Goggles, Glasses, Neither]
         Args:
@@ -219,7 +205,7 @@ def __init__(self, classifier, cuda:bool):
         self.classifier = classifier
         self.device = cuda
 
     def classifyFace(self,
-                    face: np.ndarray):
+                     face: np.ndarray):
         '''
         This method initializes the transforms and classifies the face region
         Args:
@@ -255,8 +241,8 @@ def classifyFace(self,
         return pred
 
     def classifyFrame(self,
-                    img: np.ndarray,
-                    boxes: List[Tuple[np.float64]]):
+                      img: np.ndarray,
+                      boxes: List[Tuple[np.float64]]):
         '''
         This method loops through all the bounding boxes in an image, calls the classifyFace
         method to classify each face region and finally draws a box around the face.
@@ -282,9 +268,9 @@ def classifyFrame,
 
             label.append(int(self.classifyFace(face).data))
 
-
         return label
 
+
 class Encryptor(object):
     def __init__(self):
         '''
@@ -293,7 +279,6 @@ def __init__(self):
         self.encryptor = AESEncryptor()
         self.key = self.encryptor.key
 
-
     def encryptFace(self, coordinates: List[Tuple[int]],
                     img: np.ndarray):
         '''
@@ -310,8 +295,8 @@ def encryptFace(self, coordinates: List[Tuple[int]],
 
         return encryptedImg
 
-    def encryptFrame(self, img:np.ndarray,
-                     boxes:List[Tuple[np.float64]]):
+    def encryptFrame(self, img: np.ndarray,
+                     boxes: List[Tuple[np.float64]]):
         '''
         This method takes the face coordinates, encrypts the facial region, writes encrypted image to file filesystem
         Args:
@@ -345,7 +330,7 @@ def writeImg(img, output_dir):
     global fileCount
     face_file_name = os.path.join(output_dir, f'{fileCount.value}.jpg')
 
-    #TODO: Remove this print statement after db integration
+    # TODO: Remove this print statement after db integration
     print("writing ", face_file_name)
     if args.write_imgs:
         cv2.imwrite(face_file_name, img)
@@ -382,16 +367,16 @@ def drawFrame(boxes, frame, fps):
     index = 0
     for box in boxes:
         frame = cv2.putText(frame,
-                           'label: %s' % class_names[label[index]],
-                           (int(box[0]), int(box[1]-40)),
-                           cv2.FONT_HERSHEY_SIMPLEX, 0.5,
-                           (0, 0, 255))
+                            'label: %s' % class_names[label[index]],
+                            (int(box[0]), int(box[1] - 40)),
+                            cv2.FONT_HERSHEY_SIMPLEX, 0.5,
+                            (0, 0, 255))
 
         frame = cv2.putText(frame,
-                           'fps: %.3f' % fps,
-                           (int(box[0]), int(box[1]-20)),
-                           cv2.FONT_HERSHEY_SIMPLEX,
-                           0.5, (0, 0, 255))
+                            'fps: %.3f' % fps,
+                            (int(box[0]), int(box[1] - 20)),
+                            cv2.FONT_HERSHEY_SIMPLEX,
+                            0.5, (0, 0, 255))
 
         index += 1
 
@@ -401,8 +386,8 @@ def drawFrame(boxes, frame, fps):
 if __name__ == "__main__":
     warnings.filterwarnings("once")
     parser = argparse.ArgumentParser(description="Face detection")
-    parser.add_argument('--detector', '-t', type=str, required=True, help="Path to a trained face detector .pth file")
-    parser.add_argument('--detector_type', '-d', type=str, required=True, help="Type of face detector. One of "
-                                                                               "blazeface, ssd, or retinaface.")
+    parser.add_argument('--detector', '-d', type=str, required=True, help="Path to a trained face detector .pth file")
+    parser.add_argument('--detector_type', '-t', type=str, required=True, help="Type of face detector. One of "
+                                                                               "blazeface, ssd, or retinaface.")
     parser.add_argument('--cuda', '-c', default=False, action='store_true', help="Enable cuda")
     parser.add_argument('--classifier', type=str, help="Path to a trained classifier .pth file")
     parser.add_argument('--write_imgs', default=False, help='Write images to output_dir')
@@ -410,6 +395,10 @@ def drawFrame(boxes, frame, fps):
     parser.add_argument('--output_dir', default='encrypted_imgs', type=str, help="Where to output encrypted images")
     args = parser.parse_args()
 
+    if args.detector_type not in DETECTOR_TYPES:
+        print('Please include a valid detector type (\'blazeface\', \'ssd\', or \'retinaface\')')
+        exit(1)
+
     device = torch.device('cpu')
     if args.cuda and torch.cuda.is_available():
         device = torch.device('cuda:0')
@@ -418,7 +407,8 @@ def drawFrame(boxes, frame, fps):
     g.eval()
 
     capturer = VideoCapturer()
-    detector = FaceDetector(detector=args.detector, detector_type=args.detector_type, cuda=args.cuda and torch.cuda.is_available(), set_default_dev=True)
+    detector = FaceDetector(detector=args.detector, detector_type=args.detector_type,
+                            cuda=args.cuda and torch.cuda.is_available(), set_default_dev=True)
     classifier = Classifier(g, args.cuda)
     encryptor = Encryptor()
 
     run_face_detection: bool = True
-    while run_face_detection: #main video detection loop that will iterate until ESC key is entered
+    while run_face_detection:  # main video detection loop that will iterate until ESC key is entered
         start_time = time.time()
         frame = capturer.get_frame()
 
         boxes = detector.detect(frame)
-        encryptedImg = frame.copy() #copy memory for encrypting image separate from unencrypted image
+        encryptedImg = frame.copy()  # copy memory for encrypting image separate from unencrypted image
 
         if len(boxes) != 0:
             p1 = Process(target=encryptWorker, args=(encryptor, encryptedImg, boxes, args.output_dir, args.write_imgs))
@@ -441,7 +431,7 @@ def drawFrame(boxes, frame, fps):
             fps = 1 / (time.time() - start_time)
             drawFrame(boxes, frame, fps)
 
-            #remove frame creation and drawing before deployment
+            # remove frame creation and drawing before deployment
             p1.join()
 
         if cv2.waitKey(1) == 27:

From fa4cfb55442a65266154f3857c5a8bf2826eca5a Mon Sep 17 00:00:00 2001
From: ZPBerg
Date: Mon, 13 Jul 2020 18:22:31 -0400
Subject: [PATCH 6/6] Retinaface works with GPU

---
 src/jetson/main.py                   | 3 ++-
 src/jetson/models/utils/box_utils.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/jetson/main.py b/src/jetson/main.py
index 232e75fc..4c6bf94a 100644
--- a/src/jetson/main.py
+++ b/src/jetson/main.py
@@ -76,7 +76,7 @@ def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, c
             self.transformer = BaseTransform((self.image_shape[1], self.image_shape[0]), (104, 117, 123))
             priorbox = PriorBox(cfg, image_size=self.image_shape)
             priors = priorbox.forward()
-            self.prior_data = priors.data
+            self.prior_data = priors.data.to(device)
 
         self.detection_threshold = detection_threshold
         if cuda and torch.cuda.is_available():
@@ -148,6 +148,7 @@ def detect(self,
         elif self.model_name == 'retinaface':
             img = (self.transformer(image)[0]).transpose(2, 0, 1)
             img = torch.from_numpy(img).unsqueeze(0)
+            img = img.to(device)
 
             loc, conf, _ = self.net(
                 img)  # forward pass: Returns bounding box location, confidence and facial landmark locations
diff --git a/src/jetson/models/utils/box_utils.py b/src/jetson/models/utils/box_utils.py
index bed236b3..03a5f513 100644
--- a/src/jetson/models/utils/box_utils.py
+++ b/src/jetson/models/utils/box_utils.py
@@ -376,7 +376,7 @@ def postprocess(boxes, conf, image_shape, detection_threshold, resize_factor):
     Returns boxes and confidence scores that are above confidence threshold
     """
     scale = torch.Tensor([image_shape[1], image_shape[0], image_shape[1], image_shape[0]])
-    boxes = (boxes * scale / resize_factor).numpy()
+    boxes = (boxes * scale / resize_factor).to('cpu').numpy()
     scores = conf.squeeze(0).data.cpu().numpy()[:, 1]
 
     # ignore low scores
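
For reference, a standalone sketch of the IoU test and the precision/recall
counting that eval/evaluator.py's evaluate_detections performs (VOC-style
matching at a fixed IoU threshold); the iou helper and the toy boxes below
are illustrative and not part of the patches:

    import numpy as np

    def iou(a, b):
        # Intersection over union of two [x1, y1, x2, y2] boxes.
        iw = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
        ih = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
        inter = iw * ih
        union = ((a[2] - a[0]) * (a[3] - a[1]) +
                 (b[2] - b[0]) * (b[3] - b[1]) - inter)
        return inter / union

    gt = [0.0, 0.0, 10.0, 10.0]   # one ground-truth face in a frame
    det = [1.0, 1.0, 10.0, 10.0]  # one detection for the same frame
    tp = 1.0 if iou(det, gt) > 0.5 else 0.0  # IoU = 81/100 = 0.81 -> tp
    fp = 1.0 - tp
    recall = tp / 1.0  # true positives / total ground truths
    precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
    print(precision, recall)  # 1.0 1.0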