cvat-ai · benhoff · Oct 10, 2019 · benhoff · Oct 15, 2019
@@ -31,14 +31,19 @@ def __init__(self, model, weights):
 
         iter_inputs = iter(network.inputs)
         self._input_blob_name = next(iter_inputs)
+        self._input_info_name = ''
         self._output_blob_name = next(iter(network.outputs))
 
         self._require_image_info = False
 
+        info_names = ('image_info', 'im_info')
+
         # NOTE: handling for the inclusion of `image_info` in OpenVino2019
-        if 'image_info' in network.inputs:
+        if any(s in network.inputs for s in info_names):
             self._require_image_info = True
-        if self._input_blob_name == 'image_info':
+            self._input_info_name = set(network.inputs).intersection(info_names)
+            self._input_info_name = self._input_info_name.pop()
+        if self._input_blob_name in info_names:
             self._input_blob_name = next(iter_inputs)
 
         self._net = plugin.load(network=network, num_requests=2)
@@ -47,22 +52,31 @@ def __init__(self, model, weights):
 
     def infer(self, image):
+        # Run one inference request on `image` and return the network output.
+        # `image` is indexed as (height, width, channels) below — presumably
+        # an OpenCV-style HWC array; confirm against callers.
         _, _, h, w = self._input_layout
-        in_frame = image if image.shape[:-1] == (h, w) else cv2.resize(image, (w, h))
+        # Largest uniform factor that lets the image fit inside the (h, w)
+        # network input, so resizing keeps the aspect ratio.
+        scale = min(h / image.shape[0], w / image.shape[1])
+        in_frame = image if image.shape[:-1] == (h, w) else cv2.resize(image, None, fx=scale, fy=scale)
+
+        # Size after resizing but before padding; this is what is reported
+        # to the network through the image-info input further down.
+        in_frame_size = in_frame.shape[:2]
+        # Zero-pad on the bottom and right only, up to exactly (h, w).
+        in_frame = np.pad(in_frame, ((0, h - in_frame_size[0]),
+                                     (0, w - in_frame_size[1]),
+                                     (0, 0)),
+                          mode='constant', constant_values=0)
+
         in_frame = in_frame.transpose((2, 0, 1))  # Change data layout from HWC to CHW
         inputs = {self._input_blob_name: in_frame}
         if self._require_image_info:
-            info = np.zeros([1, 3])
-            info[0, 0] = h
-            info[0, 1] = w
-            # frame number
-            info[0, 2] = 1
-            inputs['image_info'] = info
+            # One row: [resized height, resized width, resize scale].
+            info = np.asarray([[in_frame_size[0],
+                                in_frame_size[1],
+                                scale]],
+                              dtype=np.float32)
+
+            inputs[self._input_info_name] = info
 
         results = self._net.infer(inputs)
+
+        # With exactly one output, return a copy of that blob alone;
+        # otherwise return the whole results mapping as produced (no copy).
         if len(results) == 1:
             return results[self._output_blob_name].copy()
         else:
-            return results.copy()
+            return results
 
 
 def load_labelmap(labels_path):

@@ -0,0 +1,77 @@
+import numpy as np
+import cv2
+
+
+# Score cut-off: only detections whose score is strictly greater than this
+# value are kept.
+THRESHOLD = 0.5
+
+# See: https://github.com/opencv/open_model_zoo/blob/master/demos/python_demos/instance_segmentation_demo/main.py
+
+def segm_postprocess(box, raw_cls_mask, im_h, im_w):
+    """Paste one detection's class mask into a full-frame binary mask.
+
+    Args:
+        box: Sequence of four values indexed as (x0, y0, x1, y1), in the
+            pixel coordinates of the output frame.
+        raw_cls_mask: 2-D array of mask scores for the detected class
+            (assumed square, since only ``shape[0]`` drives the expansion
+            ratio -- TODO confirm).
+        im_h: Height of the output mask, in pixels.
+        im_w: Width of the output mask, in pixels.
+
+    Returns:
+        An ``(im_h, im_w)`` ``np.uint8`` array holding 1 where the resized
+        mask score exceeds 0.5 inside the (clipped) box and 0 elsewhere.
+    """
+    # Add zero border to prevent upsampling artifacts on segment borders.
+    raw_cls_mask = np.pad(raw_cls_mask, ((1, 1), (1, 1)), 'constant', constant_values=0)
+
+    # Grow the box by the same ratio the padding grew the mask, so the
+    # un-padded mask content still lands on the original box. The ratio is
+    # fractional (e.g. 16 / 14); truncating it to an int collapses it to 1
+    # and silently disables the expansion, so it must stay a float.
+    scale = raw_cls_mask.shape[0] / (raw_cls_mask.shape[0] - 2.0)
+
+    w_half = (box[2] - box[0]) * .5 * scale
+    h_half = (box[3] - box[1]) * .5 * scale
+    x_c = (box[2] + box[0]) * .5
+    y_c = (box[3] + box[1]) * .5
+
+    extended_box = np.array([x_c - w_half,
+                             y_c - h_half,
+                             x_c + w_half,
+                             y_c + h_half]).astype(int)
+
+    # Size of the expanded box (at least 1x1) and its corners clipped to
+    # the frame; the far corner is exclusive, hence the + 1.
+    w, h = np.maximum(extended_box[2:] - extended_box[:2] + 1, 1)
+    x0, y0 = np.clip(extended_box[:2], a_min=0, a_max=[im_w, im_h])
+    x1, y1 = np.clip(extended_box[2:] + 1, a_min=0, a_max=[im_w, im_h])
+
+    raw_cls_mask = cv2.resize(raw_cls_mask, (w, h)) > 0.5
+    mask = raw_cls_mask.astype(np.uint8)
+    # Put an object mask in an image mask. The source slice is offset by the
+    # expanded box origin so only the part that lies inside the frame is copied.
+    im_mask = np.zeros((im_h, im_w), dtype=np.uint8)
+    im_mask[y0:y1, x0:x1] = mask[(y0 - extended_box[1]):(y1 - extended_box[1]),
+                            (x0 - extended_box[0]):(x1 - extended_box[0])]
+
+    return im_mask
+
+
+# Spatial size of the network input blob.
+# NOTE(review): hard-coded; presumably must match the loaded model's input
+# resolution -- confirm.
+blob_height = 480
+blob_width = 480
+
+for detection in detections:
+    frame_number = detection['frame_id']
+    height = detection['frame_height']
+    width = detection['frame_width']
+    frame_detections = detection['detections']
+
+    # Boxes are divided by this factor to bring them back to frame
+    # coordinates; presumably it is the aspect-preserving factor the frame
+    # was resized by before inference -- confirm against the inference code.
+    scale = min(blob_height / height, blob_width / width)
+
+    boxes = frame_detections['boxes'] / scale
+    scores = frame_detections['scores']
+    classes = frame_detections['classes'].astype(np.uint32)
+
+    # Filter out detections with low confidence *before* building their
+    # masks, so no full-frame mask is computed only to be thrown away.
+    detections_filter = scores > THRESHOLD
+
+    for box, label, raw_mask, is_valid in zip(boxes, classes,
+                                              frame_detections['raw_masks'],
+                                              detections_filter):
+        if not is_valid:
+            continue
+
+        # raw_mask holds one mask per class; pick the detected class.
+        mask = segm_postprocess(box, raw_mask[label, ...], height, width)
+
+        # findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
+        # (image, contours, hierarchy) in 3.x; the contour list is always
+        # the second-to-last item.
+        contours = cv2.findContours(mask,
+                                    cv2.RETR_EXTERNAL,
+                                    cv2.CHAIN_APPROX_TC89_KCOS)[-2]
+        if not contours:
+            # Empty mask: there is no outline to report for this detection.
+            continue
+
+        # Only the first external contour is reported. Each contour point
+        # is wrapped as [[x, y]], so unwrap it to [x, y].
+        contour = [point[0] for point in contours[0].tolist()]
+
+        results.add_polygon(contour, label, frame_number)
@@ -0,0 +1,84 @@
+{
+    "label_map": {
+        "1": "person",
+        "2": "bicycle",
+        "3": "car",
+        "4": "motorcycle",
+        "5": "airplane",
+        "6": "bus",
+        "7": "train",
+        "8": "truck",
+        "9": "boat",
+        "10": "traffic_light",
+        "11": "fire_hydrant",
+        "13": "stop_sign",
+        "14": "parking_meter",
+        "15": "bench",
+        "16": "bird",
+        "17": "cat",
+        "18": "dog",
+        "19": "horse",
+        "20": "sheep",
+        "21": "cow",
+        "22": "elephant",
+        "23": "bear",
+        "24": "zebra",
+        "25": "giraffe",
+        "27": "backpack",
+        "28": "umbrella",
+        "31": "handbag",
+        "32": "tie",
+        "33": "suitcase",
+        "34": "frisbee",
+        "35": "skis",
+        "36": "snowboard",
+        "37": "sports_ball",
+        "38": "kite",
+        "39": "baseball_bat",
+        "40": "baseball_glove",
+        "41": "skateboard",
+        "42": "surfboard",
+        "43": "tennis_racket",
+        "44": "bottle",
+        "46": "wine_glass",
+        "47": "cup",
+        "48": "fork",
+        "49": "knife",
+        "50": "spoon",
+        "51": "bowl",
+        "52": "banana",
+        "53": "apple",
+        "54": "sandwich",
+        "55": "orange",
+        "56": "broccoli",
+        "57": "carrot",
+        "58": "hot_dog",
+        "59": "pizza",
+        "60": "donut",
+        "61": "cake",
+        "62": "chair",
+        "63": "couch",
+        "64": "potted_plant",
+        "65": "bed",
+        "67": "dining_table",
+        "70": "toilet",
+        "72": "tv",
+        "73": "laptop",
+        "74": "mouse",
+        "75": "remote",
+        "76": "keyboard",
+        "77": "cell_phone",
+        "78": "microwave",
+        "79": "oven",
+        "80": "toaster",
+        "81": "sink",
+        "83": "refrigerator",
+        "84": "book",
+        "85": "clock",
+        "86": "vase",
+        "87": "scissors",
+        "88": "teddy_bear",
+        "89": "hair_drier",
+        "90": "toothbrush"
+    }
+}