Add yolov5-youtube example #1201


Merged · 13 commits · Jul 7, 2020 · Changes from 9 commits
57 changes: 57 additions & 0 deletions examples/onnx/yolov5-youtube/README.md
@@ -0,0 +1,57 @@
# YOLOv5 detection model

This example deploys a detection model trained with [ultralytics' yolov5 repo](https://github.com/ultralytics/yolov5), served via ONNX.
We'll use the `yolov5s` model as an example here.
The API runs inference on YouTube videos and returns the video annotated with bounding boxes.

The example can be run on both CPU and GPU hardware.

## Sample Prediction

Deploy the model by running:

```bash
cortex deploy
```

And wait for it to become live by tracking its status with `cortex get --watch`.

Once the API has been successfully deployed, export its endpoint for convenience (you can get the endpoint by running `cortex get yolov5-youtube`):

```bash
export ENDPOINT=your-api-endpoint
```

When making a prediction with [sample.json](sample.json), [this](https://www.youtube.com/watch?v=aUdKzb4LGJI) YouTube video will be used.

To make a request to the model:

```bash
curl "${ENDPOINT}" -X POST -H "Content-Type: application/json" -d @sample.json --output video.mp4
```

After a few seconds, `curl` will save the resulting video as `video.mp4` in the current working directory.
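
If you prefer Python to `curl`, here is a minimal sketch of the same request. The endpoint value is a placeholder, and the `requests` package is assumed to be installed:

```python
import requests

# placeholder; use the endpoint reported by `cortex get yolov5-youtube`
endpoint = "http://localhost:8888"

response = requests.post(
    endpoint,
    json={"url": "https://www.youtube.com/watch?v=aUdKzb4LGJI"},
)
response.raise_for_status()

# the API responds with the annotated video bytes
with open("video.mp4", "wb") as f:
    f.write(response.content)
```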


## Exporting ONNX

To export a custom model from the repo, use the [`models/export.py`](https://github.com/ultralytics/yolov5/blob/master/models/export.py) script.
The only change needed is to modify the line

```python
model.model[-1].export = True # set Detect() layer export=True
```

to

```python
model.model[-1].export = False
```

By default, the ultralytics repo does not export the postprocessing steps of the model, i.e. the conversion from the raw CNN outputs to bounding boxes; setting `export = False` keeps this decoding step in the exported graph.
With newer ONNX versions, these steps can be exported as part of the model, which makes deployment much easier.

With this modified script, the ONNX graph used for this example was exported by running:

```bash
python models/export.py --weights weights/yolov5s.pt --img 416 --batch 1
```
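
Before deploying, you can sanity-check the exported graph locally. A minimal sketch, assuming `onnxruntime` is installed and that the export command above wrote `weights/yolov5s.onnx` (the output path is an assumption):

```python
import numpy as np
import onnxruntime as ort

# the file name is an assumption based on the export command above
session = ort.InferenceSession("weights/yolov5s.onnx")
(inp,) = session.get_inputs()
print(inp.name, inp.shape)  # expect something like: images [1, 3, 416, 416]

# run the graph on a dummy frame; the first output tensor holds the detections
dummy = np.random.rand(1, 3, 416, 416).astype(np.float32)
outputs = session.run(None, {inp.name: dummy})
print([o.shape for o in outputs])
```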
3 changes: 3 additions & 0 deletions examples/onnx/yolov5-youtube/conda-packages.txt
@@ -0,0 +1,3 @@
conda-forge::ffmpeg=4.2.3
conda-forge::youtube-dl
conda-forge::matplotlib
14 changes: 14 additions & 0 deletions examples/onnx/yolov5-youtube/cortex.yaml
@@ -0,0 +1,14 @@
# WARNING: you are on the master branch, please refer to the examples on the branch that matches your `cortex version`

- name: yolov5-youtube
  kind: SyncAPI
  predictor:
    type: onnx
    path: predictor.py
    model_path: s3://cortex-examples/onnx/yolov5-youtube/yolov5s.onnx
    config:
      iou_threshold: 0.5
      confidence_threshold: 0.3
  compute:
    # GPU requirement is optional. Comment out the next line to run on CPU (albeit slower)
    gpu: 1
82 changes: 82 additions & 0 deletions examples/onnx/yolov5-youtube/labels.json
@@ -0,0 +1,82 @@
[
"person",
"bicycle",
"car",
"motorcycle",
"airplane",
"bus",
"train",
"truck",
"boat",
"traffic light",
"fire hydrant",
"stop sign",
"parking meter",
"bench",
"bird",
"cat",
"dog",
"horse",
"sheep",
"cow",
"elephant",
"bear",
"zebra",
"giraffe",
"backpack",
"umbrella",
"handbag",
"tie",
"suitcase",
"frisbee",
"skis",
"snowboard",
"sports ball",
"kite",
"baseball bat",
"baseball glove",
"skateboard",
"surfboard",
"tennis racket",
"bottle",
"wine glass",
"cup",
"fork",
"knife",
"spoon",
"bowl",
"banana",
"apple",
"sandwich",
"orange",
"broccoli",
"carrot",
"hot dog",
"pizza",
"donut",
"cake",
"chair",
"couch",
"potted plant",
"bed",
"dining table",
"toilet",
"tv",
"laptop",
"mouse",
"remote",
"keyboard",
"cell phone",
"microwave",
"oven",
"toaster",
"sink",
"refrigerator",
"book",
"clock",
"vase",
"scissors",
"teddy bear",
"hair drier",
"toothbrush"
]
186 changes: 186 additions & 0 deletions examples/onnx/yolov5-youtube/predictor.py
@@ -0,0 +1,186 @@
# WARNING: you are on the master branch, please refer to the examples on the branch that matches your `cortex version`

import json
import os
import uuid
from pathlib import Path
from typing import Iterable, Tuple

import cv2
import ffmpeg
import numpy as np
import youtube_dl
from matplotlib import pyplot as plt

from starlette.responses import FileResponse


def download_from_youtube(url: str, min_height: int) -> Path:
    target = f"{uuid.uuid1()}.mp4"
    ydl_opts = {
        "outtmpl": target,
        "format": f"worstvideo[vcodec=vp9][height>={min_height}]",
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    # we need to glob in case youtube-dl adds a suffix
    (path,) = Path().absolute().glob(f"{target}*")
    return path


def frame_reader(path: Path, size: Tuple[int, int]) -> Iterable[np.ndarray]:
    width, height = size
    # letterbox frames to fixed size
    process = (
        ffmpeg.input(path)
        .filter("scale", size=f"{width}:{height}", force_original_aspect_ratio="decrease")
        # Negative values for x and y center the padded video
        .filter("pad", height=height, width=width, x=-1, y=-1)
        .output("pipe:", format="rawvideo", pix_fmt="rgb24")
        .run_async(pipe_stdout=True)
    )

    while True:
        in_bytes = process.stdout.read(height * width * 3)
        if not in_bytes:
            process.wait()
            break
        frame = np.frombuffer(in_bytes, np.uint8).reshape([height, width, 3])
        yield frame


class FrameWriter:
    def __init__(self, path: Path, size: Tuple[int, int]):
        width, height = size
        self.process = (
            ffmpeg.input("pipe:", format="rawvideo", pix_fmt="rgb24", s=f"{width}x{height}")
            .output(path, pix_fmt="yuv420p")
            .overwrite_output()
            .run_async(pipe_stdin=True)
        )

    def write(self, frame: np.ndarray):
        self.process.stdin.write(frame.astype(np.uint8).tobytes())

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.__del__()

    def __del__(self):
        self.process.stdin.close()
        self.process.wait()


def nms(dets: np.ndarray, scores: np.ndarray, thresh: float) -> np.ndarray:
    x1 = dets[:, 0]
    y1 = dets[:, 1]
    x2 = dets[:, 2]
    y2 = dets[:, 3]

    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]  # process boxes in order of descending score

    keep = []
    while order.size > 0:
        i = order[0]  # pick the remaining box with the highest score
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1 + 1)  # intersection width
        h = np.maximum(0.0, yy2 - yy1 + 1)  # intersection height
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)

        # keep only boxes whose IoU with the picked box is below the threshold
        inds = np.where(ovr <= thresh)[0]
        order = order[inds + 1]

    return np.array(keep).astype(int)


def boxes_yolo_to_xyxy(boxes: np.ndarray) -> np.ndarray:
    # convert (center_x, center_y, width, height) to (x1, y1, x2, y2), in place
    boxes[:, 0] -= boxes[:, 2] / 2
    boxes[:, 1] -= boxes[:, 3] / 2
    boxes[:, 2] = boxes[:, 2] + boxes[:, 0]
    boxes[:, 3] = boxes[:, 3] + boxes[:, 1]
    return boxes


def overlay_boxes(frame, boxes, class_ids, label_map, color_map, line_thickness=None):
    tl = (
        line_thickness or round(0.002 * (frame.shape[0] + frame.shape[1]) / 2) + 1
    )  # line/font thickness

    for class_id, (x1, y1, x2, y2) in zip(class_ids, boxes.astype(int)):
        color = color_map[class_id]
        label = label_map[class_id]
        cv2.rectangle(frame, (x1, y1), (x2, y2), color, tl, cv2.LINE_AA)
        tf = max(tl - 1, 1)  # font thickness
        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
        x3, y3 = x1 + t_size[0], y1 - t_size[1] - 3
        cv2.rectangle(frame, (x1, y1), (x3, y3), color, -1, cv2.LINE_AA)  # filled label background
        cv2.putText(
            frame,
            label,
            (x1, y1 - 2),
            0,
            tl / 3,
            [225, 255, 255],
            thickness=tf,
            lineType=cv2.LINE_AA,
        )


class ONNXPredictor:
    def __init__(self, onnx_client, config):
        self.client = onnx_client
        # Get the input shape from the ONNX runtime
        (signature,) = onnx_client.input_signatures.values()
        _, _, height, width = signature["images"]["shape"]
        self.input_size = (width, height)
        self.config = config
        with open("labels.json") as buf:
            self.labels = json.load(buf)
        # sample one distinct color per label from the tab20 colormap
        color_map = plt.cm.tab20(np.linspace(0, 1, len(self.labels)))
        self.color_map = [tuple(map(int, colors)) for colors in 255 * color_map]

    def postprocess(self, output):
        # each row of the output: [center_x, center_y, width, height, objectness, per-class scores]
        boxes, obj_score, class_scores = np.split(output[0], [4, 5], axis=1)
        boxes = boxes_yolo_to_xyxy(boxes)

        # get the class prediction & class confidences
        class_id = class_scores.argmax(axis=1)
        cls_score = class_scores[np.arange(len(class_scores)), class_id]

        # filter by combined confidence, then run non-maximum suppression
        confidence = obj_score.squeeze(axis=1) * cls_score
        sel = confidence > self.config["confidence_threshold"]
        boxes, class_id, confidence = boxes[sel], class_id[sel], confidence[sel]
        sel = nms(boxes, confidence, self.config["iou_threshold"])
        boxes, class_id, confidence = boxes[sel], class_id[sel], confidence[sel]
        return boxes, class_id, confidence

    def predict(self, payload):
        in_path = download_from_youtube(payload["url"], self.input_size[1])
        out_path = f"{uuid.uuid1()}.mp4"

        with FrameWriter(out_path, size=self.input_size) as writer:
            for frame in frame_reader(in_path, size=self.input_size):
                # normalize to [0, 1] and convert HWC to CHW
                x = (frame.astype(np.float32) / 255).transpose(2, 0, 1)
                # 4 output tensors; the last three are intermediate values and
                # not necessary for detection
                output, *_ = self.client.predict(x[None])
                boxes, class_ids, confidence = self.postprocess(output)
                overlay_boxes(frame, boxes, class_ids, self.labels, self.color_map)
                writer.write(frame)

        with open(out_path, "rb") as f:
            output_bytes = f.read()

        os.remove(in_path)
        os.remove(out_path)

        return output_bytes
3 changes: 3 additions & 0 deletions examples/onnx/yolov5-youtube/requirements.txt
@@ -0,0 +1,3 @@
ffmpeg-python
aiofiles
opencv-python-headless
3 changes: 3 additions & 0 deletions examples/onnx/yolov5-youtube/sample.json
@@ -0,0 +1,3 @@
{
"url": "https://www.youtube.com/watch?v=aUdKzb4LGJI"
}