From 7dd2235be2d033a8f3a343287e1529678b7c5eff Mon Sep 17 00:00:00 2001
From: Aditya Chakraborty
Date: Sun, 12 Jul 2020 13:20:22 -0400
Subject: [PATCH 1/6] Added evaluator for detector and classifier

---
 eval/evaluator.py | 373 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 373 insertions(+)
 create mode 100644 eval/evaluator.py

diff --git a/eval/evaluator.py b/eval/evaluator.py
new file mode 100644
index 00000000..94bf9a02
--- /dev/null
+++ b/eval/evaluator.py
@@ -0,0 +1,373 @@
+import os
+import cv2
+import argparse
+import torch
+import time
+import warnings
+import json
+import numpy as np
+
+from face_detector import FaceDetector, classify
+
+
+class Evaluator():
+    def __init__(self, cuda, detector, classifier, input_directory):
+        '''
+        This class evaluates face detection and goggle classification performance.
+        Goggle classification accuracy is given by the average class accuracy and by
+        individual video accuracy.
+        Face detection accuracy is given by precision and recall values.
+
+        Parameters:
+            cuda: A bool value that specifies whether cuda shall be used
+            detector: A string path to a .pth weights file for a face detection model
+            classifier: A string path to a .pth weights file for a goggle classification model
+            input_directory: Directory containing test videos to run the Evaluator on
+        '''
+        if cuda and torch.cuda.is_available():
+            torch.set_default_tensor_type('torch.cuda.FloatTensor')
+            self.device = torch.device('cuda:0')
+        else:
+            torch.set_default_tensor_type('torch.FloatTensor')
+            self.device = torch.device('cpu')
+
+        if os.path.exists("test_results/det_results_ideal.txt"):
+            os.remove("test_results/det_results_ideal.txt")
+
+        self.detector = FaceDetector(trained_model=detector, cuda=cuda and torch.cuda.is_available(),
+                                     set_default_dev=True)
+        self.classifier = torch.load(classifier, map_location=self.device)
+        self.classifier.eval()
+        self.video_filenames = self.get_video_files(input_directory)
+        self.results = {'Goggles': {'average_class_accuracy': 0.0,
+                                    'number_of_videos': 0,
+                                    'individual_video_results': {}},
+                        'Glasses': {'average_class_accuracy': 0.0,
+                                    'number_of_videos': 0,
+                                    'individual_video_results': {}},
+                        'Neither': {'average_class_accuracy': 0.0,
+                                    'number_of_videos': 0,
+                                    'individual_video_results': {}}}
+        self.class_label = ''
+        self.condition = ''
+        self.cap = ''
+        self.video = ''
+
+        self.evaluate()
+
+    def evaluate(self):
+        '''
+        This method evaluates every video file in the input directory containing test videos.
+        It stores all the results in a dict called self.results as it calls the record_results
+        method. To understand the format of the self.results dict, check the class constructor.
+        '''
+        total_videos_processed = 0
+        for video_file in self.video_filenames:
+            self.video = video_file
+            print(f"Processing {self.video} ...")
+
+            self.class_label = self.get_class_label()
+            self.condition = self.get_condition()
+            self.cap = cv2.VideoCapture(self.video)
+            if self.cap.isOpened():
+                classification_result = self.evaluate_classifications()  # (accuracy, average inference time)
+                self.record_results(classification_result)
+                total_videos_processed += 1
+                print(f"{self.video} : Done")
+            else:
+                print(f"Unable to open video {self.video}")
+                continue
+
+        self.calculate_average_class_accuracy()
+        detection_results = self.evaluate_detections('ground_truth_detections_ideal/',
+                                                     "test_results/det_results_ideal.txt")
+
+        print(f"\n{total_videos_processed} videos processed!")
+
+    def calculate_average_class_accuracy(self):
+        '''
+        This method calculates the average class accuracy for each class and stores it in
+        the self.results dict.
+        '''
+        for class_label in self.results:
+            if self.results[class_label]['number_of_videos'] > 0:
+                self.results[class_label]['average_class_accuracy'] /= self.results[class_label]['number_of_videos']
+
+    def record_results(self, result):
+        '''
+        This method records all the results in the self.results dict
+        '''
+        self.results[self.class_label]['number_of_videos'] += 1
+        self.results[self.class_label]['average_class_accuracy'] += result[0]
+        self.results[self.class_label]['individual_video_results'][self.video] = {}
+        self.results[self.class_label]['individual_video_results'][self.video]["accuracy"] = result[0]
+        self.results[self.class_label]['individual_video_results'][self.video]["inference_time"] = result[1]
+        self.results[self.class_label]['individual_video_results'][self.video]["condition"] = self.condition
+
+    def record_detections(self, file, detections):
+        f = open(file, "a+")
+        for detection in detections:
+            for element in detection:
+                f.write(str(element))
+                f.write("|")
+            f.write("\n")
+        f.close()
+
+    def infer(self):
+        '''
+        This method performs inference on a video (frame by frame) using the face detection
+        and goggle classification models.
+        It returns:
+        1) inference_dict, which contains the number of inferences for each class.
+        2) average_inference_time, a float containing the average inference time for the
+           whole video.
+        '''
+        bboxes = []
+        inference_dict = {"Goggles": 0, "Glasses": 0, "Neither": 0}
+        frame_counter = 0
+        start_time = time.time()
+
+        while True:
+            ret, img = self.cap.read()
+            if not ret:
+                break
+
+            frame_id = self.video.strip('.avi').strip('.mp4').strip('.MOV').strip('.mov').split('/')[-1] + "_" + str(frame_counter)
+            boxes = self.detector.detect(img)  # Each box also contains a confidence score
+            preds = []
+            for box in boxes:
+                x1, y1, x2, y2, conf = [b for b in box]
+                x1 = max(0, x1)
+                y1 = max(0, y1)
+                x2 = min(img.shape[1], x2)
+                y2 = min(img.shape[0], y2)
+
+                if isinstance(conf, torch.Tensor):  # This is true for ssd
+                    conf = conf.numpy()
+
+                face = img[int(y1):int(y2), int(x1):int(x2), :]
+                label, softlabels = classify(face, self.classifier, self.device)
+                preds.append(label.item())
+
+                bboxes.append([frame_id, x1, y1, x2, y2, conf])
+
+            inference_dict["Goggles"] += preds.count(1)
+            inference_dict["Glasses"] += preds.count(0)
+            inference_dict["Neither"] += preds.count(2)
+
+            frame_counter += 1
+
+        total_time = time.time() - start_time
+        if frame_counter > 0:
+            average_inference_time = total_time / frame_counter
+        else:
+            average_inference_time = -1  # Empty video file
+
+        self.record_detections("test_results/det_results_ideal.txt", bboxes)
+        return inference_dict, average_inference_time
+
+    def get_class_label(self):
+        '''
+        Get the class label [Goggles / Glasses / Neither] that the video belongs to
+        '''
+        if '/Goggles/' in self.video or '/goggles/' in self.video:
+            class_label = 'Goggles'
+        elif '/Glasses/' in self.video or '/glasses/' in self.video:
+            class_label = 'Glasses'
+        else:
+            class_label = 'Neither'
+
+        return class_label
+
+    def get_condition(self):
+        '''
+        Get the condition [Ideal, low_lighting etc.] that the video belongs to
+        '''
+        return self.video.split('/')[-2]
+
+    def evaluate_classifications(self):
+        '''
+        This method returns the accuracy (percentage of correct predictions) of the
+        predictions for a video
+        '''
+        inferences, inference_time = self.infer()
+        if sum(inferences.values()) == 0:
+            percentage_of_correct_predictions = 0
+        else:
+            percentage_of_correct_predictions = inferences[self.class_label] / sum(inferences.values())
+
+        return percentage_of_correct_predictions, inference_time
+
+    def get_ground_truth_detections(self, directory):
+        GT = {}
+        for file in os.listdir(directory):
+            f = open(directory + file, "r")
+            key = file.strip('.txt')
+            content = f.readlines()
+            f.close()
+
+            content = [list(map(float, x.strip(' \n').split(' '))) for x in content]
+            GT[key] = content
+
+        return GT
+
+    def evaluate_detections(self, annotations_location, detection_location, ovthresh=0.5):
+        '''
+        This method calculates the recall and precision of face detection for a video
+        '''
+        GT_detections = self.get_ground_truth_detections(annotations_location)
+        with open(detection_location, 'r') as f:
+            lines = f.readlines()
+
+        total_GT = 0
+        for frame_id in GT_detections:
+            total_GT += len(GT_detections[frame_id])
+
+        if any(lines):
+            splitlines = [x.strip().split('|') for x in lines]
+            image_ids = [x[0] for x in splitlines]
+            confidence = np.array([float(x[5]) for x in splitlines])
+            BB = np.array([[float(z) for z in x[1:5]] for x in splitlines])
+
+            # sort by confidence
+            sorted_ind = np.argsort(-confidence)
+            BB = BB[sorted_ind, :]
+            image_ids = [image_ids[x] for x in sorted_ind]
+
+            nd = len(image_ids)
+            tp = np.zeros(nd)
+            fp = np.zeros(nd)
+
+            for d in range(nd):
+                try:
+                    R = GT_detections[image_ids[d]]
+                    bb = BB[d, :].astype(float)
+                    ovmax = -np.inf
+                    BBGT = np.asarray(R, dtype=np.float32)
+                    if BBGT.size > 0:
+                        ixmin = np.maximum(BBGT[:, 0], bb[0])
+                        iymin = np.maximum(BBGT[:, 1], bb[1])
+                        ixmax = np.minimum(BBGT[:, 2], bb[2])
+                        iymax = np.minimum(BBGT[:, 3], bb[3])
+                        iw = np.maximum(ixmax - ixmin, 0.)
+                        ih = np.maximum(iymax - iymin, 0.)
+                        inters = iw * ih
+                        uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) +
+                               (BBGT[:, 2] - BBGT[:, 0]) *
+                               (BBGT[:, 3] - BBGT[:, 1]) - inters)
+                        overlaps = inters / uni
+                        ovmax = np.max(overlaps)
+
+                    if ovmax > ovthresh:
+                        tp[d] = 1.
+                    else:
+                        fp[d] = 1.
+                except KeyError:
+                    continue
+
+            print("total_GT: ", total_GT)
+            fp = np.cumsum(fp)
+            tp = np.cumsum(tp)
+            rec = tp / float(total_GT)
+            # avoid divide by zero in case the first detection matches a difficult
+            # ground truth
+            prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+        else:
+            rec = -1.
+            prec = -1.
+
+        print("precision: ", prec)
+        print("recall: ", rec)
+
+        return prec, rec
+
+    def get_video_files(self, input_directory):
+        '''
+        This method gets all the video files in the input directory
+        '''
+        filenames = []
+        for dirName, subdirList, fileList in os.walk(input_directory):
+            for filename in fileList:
+                ext = '.' + filename.split('.')[-1]
+                if ext in ['.mov', '.mp4', '.avi', '.MOV']:
+                    filenames.append(dirName + '/' + filename)
+
+        return filenames
+
+    def get_evaluator_results(self):
+        '''
+        This method returns the dict containing all the test results (self.results)
+        '''
+        return self.results
+
+
+def main():
+    if not args.input_directory:
+        raise Exception("Invalid input directory")
+    evaluator = Evaluator(args.cuda, args.detector, args.classifier, args.input_directory)
+    individual_video_results = evaluator.get_evaluator_results()
+
+    with open(args.output_file, 'w') as json_file:
+        json.dump(individual_video_results, json_file, indent=4)
+
+    print(f"\nOutput saved at {args.output_file}")
+
+
+if __name__ == "__main__":
+    warnings.filterwarnings("once")
+    parser = argparse.ArgumentParser(description="Face detection")
+    parser.add_argument('--detector', '-t', type=str, default='model_weights/blazeface.pth', help="Path to a trained face detector .pth file")
+    parser.add_argument('--cuda', '-c', default=False, action='store_true', help="Enable cuda")
+    parser.add_argument('--classifier', default='model_weights/ensemble_100epochs.pth', type=str, help="Path to a trained classifier .pth file")
+    parser.add_argument('--output_file', type=str, default='test_results/test1.json', help="Path to a file to store the evaluation log")
+    parser.add_argument('--input_directory', type=str, help="Path to a directory containing video files")
+    parser.add_argument('--annotation_path', type=str, help="Path to annotation files")
+
+    args = parser.parse_args()
+
+    main()
+
+    exit()

From fa6707be0f1240a394076c0b52b6360c12bc0502 Mon Sep 17 00:00:00 2001
From: Aditya Chakraborty
Date: Sun, 12 Jul 2020 14:56:10 -0400
Subject: [PATCH 2/6] Fixed bugs

---
 src/jetson/main.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/jetson/main.py b/src/jetson/main.py
index 54ceee25..178a55ae 100644
--- a/src/jetson/main.py
+++ b/src/jetson/main.py
@@ -132,6 +132,7 @@ def detect(self,
             xmin = detections[i, 1] * image.shape[1]
             ymax = detections[i, 2] * image.shape[0]
             xmax = detections[i, 3] * image.shape[1]
+            conf = detections[i, 16]
 
             img = img / 127.5 - 1.0
 
@@ -139,7 +140,7 @@ def detect(self,
                 kp_x = detections[i, 4 + k * 2] * img.shape[1]
                 kp_y = detections[i, 4 + k * 2 + 1] * img.shape[0]
 
-            bboxes.append((xmin, ymin, xmax, ymax))
+            bboxes.append((xmin, ymin, xmax, ymax, conf))
 
         return bboxes
 
@@ -154,9 +155,7 @@ def detect(self,
             boxes, scores = postprocess(boxes, conf, self.image_shape, self.detection_threshold, self.resize)
             dets = do_nms(boxes, scores, infer_params["nms_thresh"])
 
-            bboxes = []
-            for det in dets:
-                bboxes.append(tuple(dets[0][0:4]))
+            bboxes = [tuple(det[0:4]) for det in dets]
 
             return bboxes
 
@@ -197,14 +196,16 @@ def close(self):
         self.t1.join()
 
 class Classifier:
-    def __init__(self, classifier):
+    def __init__(self, classifier, cuda:bool):
         '''
        Performs classification of facial region into three classes - [Goggles, Glasses, Neither]
        Args:
            classifier - Trained classifier model (Currently, mobilenetv2)
+           cuda - True if an Nvidia GPU is used
        '''
         self.fps = 0
         self.classifier = classifier
+        self.device = cuda
 
     def classifyFace(self,
@@ -233,7 +234,7 @@ def classifyFace(self,
         ])
         transformed_face = transform(pil_face)
         face_batch = transformed_face.unsqueeze(0)
-        device = torch.device("cuda:0" if args.cuda and torch.cuda.is_available() else "cpu")
+        device = torch.device("cuda:0" if self.device and torch.cuda.is_available() else "cpu")
         with torch.no_grad():
             face_batch = face_batch.to(device)
             labels = classifier(face_batch)
 
@@ -405,7 +406,7 @@ def drawFrame(boxes, frame, fps):
 
     capturer = VideoCapturer()
     detector = FaceDetector(detector=args.detector, cuda=args.cuda and torch.cuda.is_available(), set_default_dev=True)
-    classifier = Classifier(g)
+    classifier = Classifier(g, args.cuda)
     encryptor = Encryptor()
 
     run_face_detection: bool = True

From 4b85b361661a9a08107e93d8a317f28d2c75f586 Mon Sep 17 00:00:00 2001
From: Aditya Chakraborty
Date: Sun, 12 Jul 2020 18:11:56 -0400
Subject: [PATCH 3/6] Made necessary changes to main to run evaluator

---
 eval/evaluator.py  | 373 ---------------------------------------------
 src/jetson/main.py |   9 +-
 2 files changed, 5 insertions(+), 377 deletions(-)
 delete mode 100644 eval/evaluator.py

diff --git a/eval/evaluator.py b/eval/evaluator.py
deleted file mode 100644
index 94bf9a02..00000000
--- a/eval/evaluator.py
+++ /dev/null
@@ -1,373 +0,0 @@
-import os
-import cv2
-import argparse
-import torch
-import time
-import warnings
-import json
-import numpy as np
-
-from face_detector import FaceDetector, classify
-
-
-class Evaluator():
-    def __init__(self, cuda, detector, classifier, input_directory):
-        '''
-        This class evaluates face detection and goggle classification performance.
-        Goggle classification accuracy is given by the average class accuracy and by
-        individual video accuracy.
-        Face detection accuracy is given by precision and recall values.
-
-        Parameters:
-            cuda: A bool value that specifies whether cuda shall be used
-            detector: A string path to a .pth weights file for a face detection model
-            classifier: A string path to a .pth weights file for a goggle classification model
-            input_directory: Directory containing test videos to run the Evaluator on
-        '''
-        if cuda and torch.cuda.is_available():
-            torch.set_default_tensor_type('torch.cuda.FloatTensor')
-            self.device = torch.device('cuda:0')
-        else:
-            torch.set_default_tensor_type('torch.FloatTensor')
-            self.device = torch.device('cpu')
-
-        if os.path.exists("test_results/det_results_ideal.txt"):
-            os.remove("test_results/det_results_ideal.txt")
-
-        self.detector = FaceDetector(trained_model=detector, cuda=cuda and torch.cuda.is_available(),
-                                     set_default_dev=True)
-        self.classifier = torch.load(classifier, map_location=self.device)
-        self.classifier.eval()
-        self.video_filenames = self.get_video_files(input_directory)
-        self.results = {'Goggles': {'average_class_accuracy': 0.0,
-                                    'number_of_videos': 0,
-                                    'individual_video_results': {}},
-                        'Glasses': {'average_class_accuracy': 0.0,
-                                    'number_of_videos': 0,
-                                    'individual_video_results': {}},
-                        'Neither': {'average_class_accuracy': 0.0,
-                                    'number_of_videos': 0,
-                                    'individual_video_results': {}}}
-        self.class_label = ''
-        self.condition = ''
-        self.cap = ''
-        self.video = ''
-
-        self.evaluate()
-
-    def evaluate(self):
-        '''
-        This method evaluates every video file in the input directory containing test videos.
-        It stores all the results in a dict called self.results as it calls the record_results
-        method. To understand the format of the self.results dict, check the class constructor.
-        '''
-        total_videos_processed = 0
-        for video_file in self.video_filenames:
-            self.video = video_file
-            print(f"Processing {self.video} ...")
-
-            self.class_label = self.get_class_label()
-            self.condition = self.get_condition()
-            self.cap = cv2.VideoCapture(self.video)
-            if self.cap.isOpened():
-                classification_result = self.evaluate_classifications()  # (accuracy, average inference time)
-                self.record_results(classification_result)
-                total_videos_processed += 1
-                print(f"{self.video} : Done")
-            else:
-                print(f"Unable to open video {self.video}")
-                continue
-
-        self.calculate_average_class_accuracy()
-        detection_results = self.evaluate_detections('ground_truth_detections_ideal/',
-                                                     "test_results/det_results_ideal.txt")
-
-        print(f"\n{total_videos_processed} videos processed!")
-
-    def calculate_average_class_accuracy(self):
-        '''
-        This method calculates the average class accuracy for each class and stores it in
-        the self.results dict.
-        '''
-        for class_label in self.results:
-            if self.results[class_label]['number_of_videos'] > 0:
-                self.results[class_label]['average_class_accuracy'] /= self.results[class_label]['number_of_videos']
-
-    def record_results(self, result):
-        '''
-        This method records all the results in the self.results dict
-        '''
-        self.results[self.class_label]['number_of_videos'] += 1
-        self.results[self.class_label]['average_class_accuracy'] += result[0]
-        self.results[self.class_label]['individual_video_results'][self.video] = {}
-        self.results[self.class_label]['individual_video_results'][self.video]["accuracy"] = result[0]
-        self.results[self.class_label]['individual_video_results'][self.video]["inference_time"] = result[1]
-        self.results[self.class_label]['individual_video_results'][self.video]["condition"] = self.condition
-
-    def record_detections(self, file, detections):
-        f = open(file, "a+")
-        for detection in detections:
-            for element in detection:
-                f.write(str(element))
-                f.write("|")
-            f.write("\n")
-        f.close()
-
-    def infer(self):
-        '''
-        This method performs inference on a video (frame by frame) using the face detection
-        and goggle classification models.
-        It returns:
-        1) inference_dict, which contains the number of inferences for each class.
-        2) average_inference_time, a float containing the average inference time for the
-           whole video.
-        '''
-        bboxes = []
-        inference_dict = {"Goggles": 0, "Glasses": 0, "Neither": 0}
-        frame_counter = 0
-        start_time = time.time()
-
-        while True:
-            ret, img = self.cap.read()
-            if not ret:
-                break
-
-            frame_id = self.video.strip('.avi').strip('.mp4').strip('.MOV').strip('.mov').split('/')[-1] + "_" + str(frame_counter)
-            boxes = self.detector.detect(img)  # Each box also contains a confidence score
-            preds = []
-            for box in boxes:
-                x1, y1, x2, y2, conf = [b for b in box]
-                x1 = max(0, x1)
-                y1 = max(0, y1)
-                x2 = min(img.shape[1], x2)
-                y2 = min(img.shape[0], y2)
-
-                if isinstance(conf, torch.Tensor):  # This is true for ssd
-                    conf = conf.numpy()
-
-                face = img[int(y1):int(y2), int(x1):int(x2), :]
-                label, softlabels = classify(face, self.classifier, self.device)
-                preds.append(label.item())
-
-                bboxes.append([frame_id, x1, y1, x2, y2, conf])
-
-            inference_dict["Goggles"] += preds.count(1)
-            inference_dict["Glasses"] += preds.count(0)
-            inference_dict["Neither"] += preds.count(2)
-
-            frame_counter += 1
-
-        total_time = time.time() - start_time
-        if frame_counter > 0:
-            average_inference_time = total_time / frame_counter
-        else:
-            average_inference_time = -1  # Empty video file
-
-        self.record_detections("test_results/det_results_ideal.txt", bboxes)
-        return inference_dict, average_inference_time
-
-    def get_class_label(self):
-        '''
-        Get the class label [Goggles / Glasses / Neither] that the video belongs to
-        '''
-        if '/Goggles/' in self.video or '/goggles/' in self.video:
-            class_label = 'Goggles'
-        elif '/Glasses/' in self.video or '/glasses/' in self.video:
-            class_label = 'Glasses'
-        else:
-            class_label = 'Neither'
-
-        return class_label
-
-    def get_condition(self):
-        '''
-        Get the condition [Ideal, low_lighting etc.] that the video belongs to
-        '''
-        return self.video.split('/')[-2]
-
-    def evaluate_classifications(self):
-        '''
-        This method returns the accuracy (percentage of correct predictions) of the
-        predictions for a video
-        '''
-        inferences, inference_time = self.infer()
-        if sum(inferences.values()) == 0:
-            percentage_of_correct_predictions = 0
-        else:
-            percentage_of_correct_predictions = inferences[self.class_label] / sum(inferences.values())
-
-        return percentage_of_correct_predictions, inference_time
-
-    def get_ground_truth_detections(self, directory):
-        GT = {}
-        for file in os.listdir(directory):
-            f = open(directory + file, "r")
-            key = file.strip('.txt')
-            content = f.readlines()
-            f.close()
-
-            content = [list(map(float, x.strip(' \n').split(' '))) for x in content]
-            GT[key] = content
-
-        return GT
-
-    def evaluate_detections(self, annotations_location, detection_location, ovthresh=0.5):
-        '''
-        This method calculates the recall and precision of face detection for a video
-        '''
-        GT_detections = self.get_ground_truth_detections(annotations_location)
-        with open(detection_location, 'r') as f:
-            lines = f.readlines()
-
-        total_GT = 0
-        for frame_id in GT_detections:
-            total_GT += len(GT_detections[frame_id])
-
-        if any(lines):
-            splitlines = [x.strip().split('|') for x in lines]
-            image_ids = [x[0] for x in splitlines]
-            confidence = np.array([float(x[5]) for x in splitlines])
-            BB = np.array([[float(z) for z in x[1:5]] for x in splitlines])
-
-            # sort by confidence
-            sorted_ind = np.argsort(-confidence)
-            BB = BB[sorted_ind, :]
-            image_ids = [image_ids[x] for x in sorted_ind]
-
-            nd = len(image_ids)
-            tp = np.zeros(nd)
-            fp = np.zeros(nd)
-
-            for d in range(nd):
-                try:
-                    R = GT_detections[image_ids[d]]
-                    bb = BB[d, :].astype(float)
-                    ovmax = -np.inf
-                    BBGT = np.asarray(R, dtype=np.float32)
-                    if BBGT.size > 0:
-                        ixmin = np.maximum(BBGT[:, 0], bb[0])
-                        iymin = np.maximum(BBGT[:, 1], bb[1])
-                        ixmax = np.minimum(BBGT[:, 2], bb[2])
-                        iymax = np.minimum(BBGT[:, 3], bb[3])
-                        iw = np.maximum(ixmax - ixmin, 0.)
-                        ih = np.maximum(iymax - iymin, 0.)
-                        inters = iw * ih
-                        uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) +
-                               (BBGT[:, 2] - BBGT[:, 0]) *
-                               (BBGT[:, 3] - BBGT[:, 1]) - inters)
-                        overlaps = inters / uni
-                        ovmax = np.max(overlaps)
-
-                    if ovmax > ovthresh:
-                        tp[d] = 1.
-                    else:
-                        fp[d] = 1.
-                except KeyError:
-                    continue
-
-            print("total_GT: ", total_GT)
-            fp = np.cumsum(fp)
-            tp = np.cumsum(tp)
-            rec = tp / float(total_GT)
-            # avoid divide by zero in case the first detection matches a difficult
-            # ground truth
-            prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
-        else:
-            rec = -1.
-            prec = -1.
-
-        print("precision: ", prec)
-        print("recall: ", rec)
-
-        return prec, rec
-
-    def get_video_files(self, input_directory):
-        '''
-        This method gets all the video files in the input directory
-        '''
-        filenames = []
-        for dirName, subdirList, fileList in os.walk(input_directory):
-            for filename in fileList:
-                ext = '.' + filename.split('.')[-1]
-                if ext in ['.mov', '.mp4', '.avi', '.MOV']:
-                    filenames.append(dirName + '/' + filename)
-
-        return filenames
-
-    def get_evaluator_results(self):
-        '''
-        This method returns the dict containing all the test results (self.results)
-        '''
-        return self.results
-
-
-def main():
-    if not args.input_directory:
-        raise Exception("Invalid input directory")
-    evaluator = Evaluator(args.cuda, args.detector, args.classifier, args.input_directory)
-    individual_video_results = evaluator.get_evaluator_results()
-
-    with open(args.output_file, 'w') as json_file:
-        json.dump(individual_video_results, json_file, indent=4)
-
-    print(f"\nOutput saved at {args.output_file}")
-
-
-if __name__ == "__main__":
-    warnings.filterwarnings("once")
-    parser = argparse.ArgumentParser(description="Face detection")
-    parser.add_argument('--detector', '-t', type=str, default='model_weights/blazeface.pth', help="Path to a trained face detector .pth file")
-    parser.add_argument('--cuda', '-c', default=False, action='store_true', help="Enable cuda")
-    parser.add_argument('--classifier', default='model_weights/ensemble_100epochs.pth', type=str, help="Path to a trained classifier .pth file")
-    parser.add_argument('--output_file', type=str, default='test_results/test1.json', help="Path to a file to store the evaluation log")
-    parser.add_argument('--input_directory', type=str, help="Path to a directory containing video files")
-    parser.add_argument('--annotation_path', type=str, help="Path to annotation files")
-
-    args = parser.parse_args()
-
-    main()
-
-    exit()
diff --git a/src/jetson/main.py b/src/jetson/main.py
index 178a55ae..be6d9a08 100644
--- a/src/jetson/main.py
+++ b/src/jetson/main.py
@@ -111,7 +111,8 @@ def detect(self,
             while j < detections.shape[2] and detections[0, 1, j, 0] > self.detection_threshold:
                 pt = (detections[0, 1, j, 1:] * scale).cpu().numpy()
                 x1, y1, x2, y2 = pt
-                bboxes.append((x1, y1, x2, y2))
+                conf = detections[0, 1, j, 0].item()
+                bboxes.append((x1, y1, x2, y2, conf))
                 j += 1
 
         return bboxes
@@ -155,7 +156,7 @@ def detect(self,
             boxes, scores = postprocess(boxes, conf, self.image_shape, self.detection_threshold, self.resize)
             dets = do_nms(boxes, scores, infer_params["nms_thresh"])
 
-            bboxes = [tuple(det[0:4]) for det in dets]
+            bboxes = [tuple(det[0:5]) for det in dets]
 
             return bboxes
@@ -259,7 +260,7 @@ def classifyFrame(self,
 
         label = []
         for box in boxes:
-            x1, y1, x2, y2 = [int(b) for b in box]
+            x1, y1, x2, y2 = [int(b) for b in box[0:4]]
             # draw boxes within the frame
             x1 = max(0, x1)
             y1 = max(0, y1)
@@ -308,7 +309,7 @@ def encryptFrame(self, img:np.ndarray,
             boxes: facial coordinates
         '''
         for box in boxes:
-            x1, y1, x2, y2 = [int(b) for b in box]
+            x1, y1, x2, y2 = [int(b) for b in box[0:4]]
             # draw boxes within the frame
             x1 = max(0, x1)
             y1 = max(0, y1)

From 6d0651616a84af5cd714cb179512f40eeae1c653 Mon Sep 17 00:00:00 2001
From: ZPBerg
Date: Sun, 12 Jul 2020 21:53:44 -0400
Subject: [PATCH 4/6] Create detector_type enum.
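
Select the face detector backend with an explicit DetectorType instead of
inferring it from the weights filename. As a minimal sketch of the intended
dispatch (the DetectorType names and string values come from this patch;
build_detector and its return values are illustrative only, and the sketch
normalizes the string-valued members):

    from enum import Enum

    class DetectorType(Enum):
        BLAZEFACE = 'blazeface'
        RETINAFACE = 'retinaface'
        SSD = 'ssd'

    def build_detector(detector_type: str) -> str:
        # DetectorType(...) raises ValueError for an unknown name, so a bad
        # --detector_type value fails fast instead of loading no model.
        dtype = DetectorType(detector_type)
        if dtype is DetectorType.SSD:
            return 'build SSD face detector'
        elif dtype is DetectorType.BLAZEFACE:
            return 'build BlazeFace face detector'
        return 'build RetinaFace face detector'

    print(build_detector('ssd'))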
---
 src/jetson/main.py | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/src/jetson/main.py b/src/jetson/main.py
index be6d9a08..c8641041 100644
--- a/src/jetson/main.py
+++ b/src/jetson/main.py
@@ -4,6 +4,7 @@
 from typing import List, Set, Dict, Tuple, Optional
 
 import cv2
+from enum import Enum
 from PIL import Image
 import numpy as np
 import torch
@@ -29,12 +30,20 @@
 fileCount = Value('i', 0)
 encryptRet = Queue() #Shared memory queue to allow child encryption process to return to parent
 
+
+class DetectorType(Enum):
+    BLAZEFACE = 'blazeface',
+    RETINAFACE = 'retinaface',
+    SSD = 'ssd'
+
+
 class FaceDetector:
-    def __init__(self, detector:str, detection_threshold=0.7, cuda=True, set_default_dev=False):
+    def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, cuda=True, set_default_dev=False):
         """
         Creates a FaceDetector object
         Args:
             detector: A string path to a trained pth file for a ssd model trained in face detection
+            detector_type: A DetectorType describing which face detector is being used
             detection_threshold: The minimum threshold for a detection to be considered valid
             cuda: Whether or not to enable CUDA
             set_default_dev: Whether or not to set the default device for PyTorch
@@ -42,18 +51,16 @@ def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, c
 
         self.device = torch.device("cpu")
 
-        if ('.pth' in detector and 'ssd' in detector):
-            from models.SSD.ssd import build_ssd
+        if detector_type == DetectorType.SSD:
+            from src.jetson.models.SSD.ssd import build_ssd
 
             self.net = build_ssd('test', 300, 2)
             self.model_name = 'ssd'
             self.net.load_state_dict(torch.load(detector, map_location=self.device))
             self.transformer = BaseTransform(self.net.size, (104, 117, 123))
 
-        elif ('.pth' in detector and 'blazeface' in detector):
-            from models.BlazeFace.blazeface import BlazeFace
-
+        elif detector_type == DetectorType.BLAZEFACE:
+            from src.jetson.models.BlazeFace.blazeface import BlazeFace
             self.net = BlazeFace(self.device)
             self.net.load_weights(detector)
@@ -63,8 +70,8 @@ def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, c
             self.net.min_suppression_threshold = 0.3
             self.transformer = BaseTransform(128, None)
 
-        elif ('.pth' in detector and 'mobile' in detector):
-            from models.Retinaface.retinaface import RetinaFace, load_model
+        elif detector_type == DetectorType.RETINAFACE:
+            from src.jetson.models.Retinaface.retinaface import RetinaFace, load_model
 
             self.net = RetinaFace(cfg=cfg, phase = 'test')
             self.net = load_model(self.net, detector, True)
@@ -76,6 +83,9 @@ def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, c
             priors = priorbox.forward()
             self.prior_data = priors.data
 
+        else:
+            print('Please include a valid detector type (\'blazeface\', \'ssd\', or \'retinaface\')')
+            exit(1)
 
         self.detection_threshold = detection_threshold
         if cuda and torch.cuda.is_available():
@@ -391,7 +401,9 @@ def drawFrame(boxes, frame, fps):
     warnings.filterwarnings("once")
     parser = argparse.ArgumentParser(description="Face detection")
-    parser.add_argument('--detector', '-t', type=str, required=True, help="Path to a trained ssd .pth file")
+    parser.add_argument('--detector', '-t', type=str, required=True, help="Path to a trained face detector .pth file")
+    parser.add_argument('--detector_type', '-d', type=str, required=True, help="Type of face detector. One of "
+                                                                               "blazeface, ssd, or retinaface.")
     parser.add_argument('--cuda', '-c', default=False, action='store_true', help="Enable cuda")
     parser.add_argument('--classifier', type=str, help="Path to a trained classifier .pth file")
     parser.add_argument('--write_imgs', default=False, help='Write images to output_dir')
@@ -406,7 +418,7 @@ def drawFrame(boxes, frame, fps):
     g.eval()
 
     capturer = VideoCapturer()
-    detector = FaceDetector(detector=args.detector, cuda=args.cuda and torch.cuda.is_available(), set_default_dev=True)
+    detector = FaceDetector(detector=args.detector, detector_type=args.detector_type, cuda=args.cuda and torch.cuda.is_available(), set_default_dev=True)
     classifier = Classifier(g, args.cuda)
     encryptor = Encryptor()
 
     run_face_detection: bool = True

From a5d5d0a8f7223ffbe1313727dfcf24e074f6d766 Mon Sep 17 00:00:00 2001
From: ZPBerg
Date: Mon, 13 Jul 2020 16:54:26 -0400
Subject: [PATCH 5/6] detector_type as an argument, compare to list of strings

---
 src/jetson/main.py | 90 +++++++++++++++++++++-------------------------
 1 file changed, 40 insertions(+), 50 deletions(-)

diff --git a/src/jetson/main.py b/src/jetson/main.py
index c8641041..232e75fc 100644
--- a/src/jetson/main.py
+++ b/src/jetson/main.py
@@ -28,13 +28,8 @@
 from models.Retinaface.data import cfg_inference as infer_params
 
 fileCount = Value('i', 0)
-encryptRet = Queue() #Shared memory queue to allow child encryption process to return to parent
-
-
-class DetectorType(Enum):
-    BLAZEFACE = 'blazeface',
-    RETINAFACE = 'retinaface',
-    SSD = 'ssd'
+encryptRet = Queue()  # Shared memory queue to allow child encryption process to return to parent
+DETECTOR_TYPES = ['blazeface', 'retinaface', 'ssd']
 
 
 class FaceDetector:
@@ -51,7 +46,7 @@ def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, c
 
         self.device = torch.device("cpu")
 
-        if detector_type == DetectorType.SSD:
+        if detector_type == 'ssd':
             from src.jetson.models.SSD.ssd import build_ssd
 
             self.net = build_ssd('test', 300, 2)
@@ -59,7 +54,7 @@ def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, c
             self.net.load_state_dict(torch.load(detector, map_location=self.device))
             self.transformer = BaseTransform(self.net.size, (104, 117, 123))
 
-        elif detector_type == DetectorType.BLAZEFACE:
+        elif detector_type == 'blazeface':
             from src.jetson.models.BlazeFace.blazeface import BlazeFace
 
             self.net = BlazeFace(self.device)
@@ -70,23 +65,19 @@ def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, c
             self.net.min_suppression_threshold = 0.3
             self.transformer = BaseTransform(128, None)
 
-        elif detector_type == DetectorType.RETINAFACE:
+        elif detector_type == 'retinaface':
             from src.jetson.models.Retinaface.retinaface import RetinaFace, load_model
 
-            self.net = RetinaFace(cfg=cfg, phase = 'test')
+            self.net = RetinaFace(cfg=cfg, phase='test')
             self.net = load_model(self.net, detector, True)
             self.model_name = 'retinaface'
-            self.image_shape = infer_params["image_shape"] #(H, W)
+            self.image_shape = infer_params["image_shape"]  # (H, W)
             self.resize = infer_params["resize"]
             self.transformer = BaseTransform((self.image_shape[1], self.image_shape[0]), (104, 117, 123))
             priorbox = PriorBox(cfg, image_size=self.image_shape)
             priors = priorbox.forward()
             self.prior_data = priors.data
 
-        else:
-            print('Please include a valid detector type (\'blazeface\', \'ssd\', or \'retinaface\')')
-            exit(1)
-
         self.detection_threshold = detection_threshold
         if cuda and torch.cuda.is_available():
@@ -98,7 +89,6 @@ def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, c
         self.net.to(self.device)
         self.net.eval()
 
-
     def detect(self,
                image: np.ndarray):
         """
@@ -110,7 +100,7 @@ def detect(self,
         The bounding boxes of the face(s) that were detected formatted (upper left corner(x, y) , lower right corner(x,y))
         """
-        if (self.model_name == 'ssd'):
+        if self.model_name == 'ssd':
             x = torch.from_numpy(self.transformer(image)[0]).permute(2, 0, 1)
             x = Variable(x.unsqueeze(0)).to(self.device)
             y = self.net(x)
@@ -127,7 +117,7 @@ def detect(self,
 
             return bboxes
 
-        elif (self.model_name == 'blazeface'):
+        elif self.model_name == 'blazeface':
             img = self.transformer(image)[0].astype(np.float32)
 
             detections = self.net.predict_on_image(img)
@@ -155,12 +145,11 @@ def detect(self,
 
             return bboxes
 
-
-        elif (self.model_name == 'retinaface'):
+        elif self.model_name == 'retinaface':
             img = (self.transformer(image)[0]).transpose(2, 0, 1)
             img = torch.from_numpy(img).unsqueeze(0)
-            loc, conf, _ = self.net(img) # forward pass: Returns bounding box location, confidence and facial landmark locations
-
+            loc, conf, _ = self.net(
+                img)  # forward pass: Returns bounding box location, confidence and facial landmark locations
             boxes = decode(loc.data.squeeze(0), self.prior_data, cfg['variance'])
 
             boxes, scores = postprocess(boxes, conf, self.image_shape, self.detection_threshold, self.resize)
@@ -171,9 +160,6 @@ def detect(self,
 
             return bboxes
 
-
-
-
 class VideoCapturer(object):
     def __init__(self, src=0):
         '''
@@ -189,7 +175,6 @@ def __init__(self, src=0):
         self.t1.daemon = True
         self.t1.start()
 
-
     def update(self):
         '''Get next frame in video stream'''
         while self.running.value:
@@ -206,8 +191,9 @@ def close(self):
         self.running.value = False
         self.t1.join()
 
+
 class Classifier:
-    def __init__(self, classifier, cuda:bool):
+    def __init__(self, classifier, cuda: bool):
         '''
         Performs classification of facial region into three classes - [Goggles, Glasses, Neither]
         Args:
@@ -219,7 +205,7 @@ def __init__(self, classifier, cuda:bool):
         self.classifier = classifier
         self.device = cuda
 
     def classifyFace(self,
-                    face: np.ndarray):
+                     face: np.ndarray):
         '''
         This method initializes the transforms and classifies the face region
         Args:
@@ -255,8 +241,8 @@ def classifyFace(self,
         return pred
 
     def classifyFrame(self,
-                    img: np.ndarray,
-                    boxes: List[Tuple[np.float64]]):
+                      img: np.ndarray,
+                      boxes: List[Tuple[np.float64]]):
         '''
         This method loops through all the bounding boxes in an image, calls the classifyFace
         method to classify each face region and finally draws a box around the face.
@@ -282,9 +268,9 @@ def classifyFrame,
 
             label.append(int(self.classifyFace(face).data))
 
-
         return label
 
+
 class Encryptor(object):
     def __init__(self):
         '''
@@ -293,7 +279,6 @@ def __init__(self):
         self.encryptor = AESEncryptor()
         self.key = self.encryptor.key
 
-
     def encryptFace(self, coordinates: List[Tuple[int]],
                     img: np.ndarray):
         '''
@@ -310,8 +295,8 @@ def encryptFace(self, coordinates: List[Tuple[int]],
 
         return encryptedImg
 
-    def encryptFrame(self, img:np.ndarray,
-                     boxes:List[Tuple[np.float64]]):
+    def encryptFrame(self, img: np.ndarray,
+                     boxes: List[Tuple[np.float64]]):
         '''
         This method takes the face coordinates, encrypts the facial region, writes encrypted image to file filesystem
         Args:
@@ -345,7 +330,7 @@ def writeImg(img, output_dir):
     global fileCount
     face_file_name = os.path.join(output_dir, f'{fileCount.value}.jpg')
 
-    #TODO: Remove this print statement after db integration
+    # TODO: Remove this print statement after db integration
     print("writing ", face_file_name)
     if args.write_imgs:
         cv2.imwrite(face_file_name, img)
@@ -382,16 +367,16 @@ def drawFrame(boxes, frame, fps):
     index = 0
     for box in boxes:
         frame = cv2.putText(frame,
-                           'label: %s' % class_names[label[index]],
-                           (int(box[0]), int(box[1]-40)),
-                           cv2.FONT_HERSHEY_SIMPLEX, 0.5,
-                           (0, 0, 255))
+                            'label: %s' % class_names[label[index]],
+                            (int(box[0]), int(box[1] - 40)),
+                            cv2.FONT_HERSHEY_SIMPLEX, 0.5,
+                            (0, 0, 255))
 
         frame = cv2.putText(frame,
-                           'fps: %.3f' % fps,
-                           (int(box[0]), int(box[1]-20)),
-                           cv2.FONT_HERSHEY_SIMPLEX,
-                           0.5, (0, 0, 255))
+                            'fps: %.3f' % fps,
+                            (int(box[0]), int(box[1] - 20)),
+                            cv2.FONT_HERSHEY_SIMPLEX,
+                            0.5, (0, 0, 255))
 
         index += 1
 
@@ -401,8 +386,8 @@ def drawFrame(boxes, frame, fps):
 if __name__ == "__main__":
     warnings.filterwarnings("once")
     parser = argparse.ArgumentParser(description="Face detection")
-    parser.add_argument('--detector', '-t', type=str, required=True, help="Path to a trained face detector .pth file")
-    parser.add_argument('--detector_type', '-d', type=str, required=True, help="Type of face detector. One of "
-                                                                               "blazeface, ssd, or retinaface.")
+    parser.add_argument('--detector', '-d', type=str, required=True, help="Path to a trained face detector .pth file")
+    parser.add_argument('--detector_type', '-t', type=str, required=True, help="Type of face detector. One of "
+                                                                               "blazeface, ssd, or retinaface.")
     parser.add_argument('--cuda', '-c', default=False, action='store_true', help="Enable cuda")
     parser.add_argument('--classifier', type=str, help="Path to a trained classifier .pth file")
     parser.add_argument('--write_imgs', default=False, help='Write images to output_dir')
@@ -410,6 +395,10 @@ def drawFrame(boxes, frame, fps):
     parser.add_argument('--output_dir', default='encrypted_imgs', type=str, help="Where to output encrypted images")
     args = parser.parse_args()
 
+    if args.detector_type not in DETECTOR_TYPES:
+        print('Please include a valid detector type (\'blazeface\', \'ssd\', or \'retinaface\')')
+        exit(1)
+
     device = torch.device('cpu')
     if args.cuda and torch.cuda.is_available():
         device = torch.device('cuda:0')
@@ -418,7 +407,8 @@ def drawFrame(boxes, frame, fps):
     g.eval()
 
     capturer = VideoCapturer()
-    detector = FaceDetector(detector=args.detector, detector_type=args.detector_type, cuda=args.cuda and torch.cuda.is_available(), set_default_dev=True)
+    detector = FaceDetector(detector=args.detector, detector_type=args.detector_type,
+                            cuda=args.cuda and torch.cuda.is_available(), set_default_dev=True)
     classifier = Classifier(g, args.cuda)
     encryptor = Encryptor()
 
     run_face_detection: bool = True
-    while run_face_detection: #main video detection loop that will iterate until ESC key is entered
+    while run_face_detection:  # main video detection loop that will iterate until ESC key is entered
         start_time = time.time()
         frame = capturer.get_frame()
 
         boxes = detector.detect(frame)
-        encryptedImg = frame.copy() #copy memory for encrypting image separate from unencrypted image
+        encryptedImg = frame.copy()  # copy memory for encrypting image separate from unencrypted image
 
         if len(boxes) != 0:
             p1 = Process(target=encryptWorker, args=(encryptor, encryptedImg, boxes, args.output_dir, args.write_imgs))
@@ -441,7 +431,7 @@ def drawFrame(boxes, frame, fps):
             fps = 1 / (time.time() - start_time)
             drawFrame(boxes, frame, fps)
 
-            #remove frame creation and drawing before deployment
+            # remove frame creation and drawing before deployment
             p1.join()
 
         if cv2.waitKey(1) == 27:

From fa4cfb55442a65266154f3857c5a8bf2826eca5a Mon Sep 17 00:00:00 2001
From: ZPBerg
Date: Mon, 13 Jul 2020 18:22:31 -0400
Subject: [PATCH 6/6] Retinaface works with GPU

---
 src/jetson/main.py                   | 3 ++-
 src/jetson/models/utils/box_utils.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/jetson/main.py b/src/jetson/main.py
index 232e75fc..4c6bf94a 100644
--- a/src/jetson/main.py
+++ b/src/jetson/main.py
@@ -76,7 +76,7 @@ def __init__(self, detector: str, detector_type: str, detection_threshold=0.7, c
             self.transformer = BaseTransform((self.image_shape[1], self.image_shape[0]), (104, 117, 123))
             priorbox = PriorBox(cfg, image_size=self.image_shape)
             priors = priorbox.forward()
-            self.prior_data = priors.data
+            self.prior_data = priors.data.to(device)
 
         self.detection_threshold = detection_threshold
         if cuda and torch.cuda.is_available():
@@ -148,6 +148,7 @@ def detect(self,
         elif self.model_name == 'retinaface':
             img = (self.transformer(image)[0]).transpose(2, 0, 1)
             img = torch.from_numpy(img).unsqueeze(0)
+            img = img.to(device)
 
             loc, conf, _ = self.net(
                 img)  # forward pass: Returns bounding box location, confidence and facial landmark locations
diff --git a/src/jetson/models/utils/box_utils.py b/src/jetson/models/utils/box_utils.py
index bed236b3..03a5f513 100644
--- a/src/jetson/models/utils/box_utils.py
+++ b/src/jetson/models/utils/box_utils.py
@@ -376,7 +376,7 @@ def postprocess(boxes, conf, image_shape, detection_threshold, resize_factor):
     Returns boxes and confidence scores that are above confidence threshold
     """
     scale = torch.Tensor([image_shape[1], image_shape[0], image_shape[1], image_shape[0]])
-    boxes = (boxes * scale / resize_factor).numpy()
+    boxes = (boxes * scale / resize_factor).to('cpu').numpy()
     scores = conf.squeeze(0).data.cpu().numpy()[:, 1]
 
     # ignore low scores
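
For reference, a standalone sketch of the IoU test and the precision/recall
counting that eval/evaluator.py's evaluate_detections performs (VOC-style
matching at a fixed IoU threshold); the iou helper and the toy boxes below
are illustrative and not part of the patches:

    import numpy as np

    def iou(a, b):
        # Intersection over union of two [x1, y1, x2, y2] boxes.
        iw = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
        ih = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
        inter = iw * ih
        union = ((a[2] - a[0]) * (a[3] - a[1]) +
                 (b[2] - b[0]) * (b[3] - b[1]) - inter)
        return inter / union

    gt = [0.0, 0.0, 10.0, 10.0]   # one ground-truth face in a frame
    det = [1.0, 1.0, 10.0, 10.0]  # one detection for the same frame
    tp = 1.0 if iou(det, gt) > 0.5 else 0.0  # IoU = 81/100 = 0.81 -> tp
    fp = 1.0 - tp
    recall = tp / 1.0  # true positives / total ground truths
    precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
    print(precision, recall)  # 1.0 1.0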