Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add openvino support #149

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ and evaluate the MOT challenge benchmark.

## Generating detections

### Procedure without Openvino support
Beside the main tracking application, this repository contains a script to
generate features for person re-identification, suitable to compare the visual
appearance of pedestrian bounding boxes using cosine similarity.
Expand Down Expand Up @@ -88,6 +89,49 @@ descriptor. The files generated by this command can be used as input for the
try passing an absolute path to the ``--model`` argument. This might help in
some cases.

### Notes on Openvino
This branch adds support for Openvino. Obviously this is more useful for online
feature extraction instead of generating detections into a text file for MOT16.

**Performance estimation:** during the [AI hackathon](http://www.ai-hackathon.com/)
we used this to generate embeddings from a video stream on a NCS2. It was able
to sustain around 5fps while tracking more than 15 targets. Note that no batch
processing is available on this device, so for each target inference must be
called sequentially.

On CPU it runs roughly at the same speed as vanilla tensorflow. (Intel) GPU
unfortunately doesn't work currently.

For the feature extraction to use Openvino, a few additional steps have to be
taken:

##### 1) Freeze model for Openvino
This is necessary as the default model includes elements which are incompatible
with Openvino:
```
python tools/freeze_model.py --no_preprocess
```

##### 2) Convert model with Model Optimizer
```
cd resources/networks
mo_tf.py --input_model mars-small128.pb -b 1 --data_type <data_type>
```
As data type you need to use a type which is supported for the device you want
to use. The Movidius NCS2 compute stick for instance needs "FP16", the CPU only
supports the default "FP32".

##### 3) Generate detections
To generate the MOT16 detections you additionally have to supply the OpenVINO
device (e.g. "CPU" or "MYRIAD" for the NCS2):
```
python tools/generate_detections.py \
--model=resources/networks/mars-small128.pb \
--mot_dir=./MOT16/train \
--output_dir=./resources/detections/MOT16_train \
--use_openvino=MYRIAD
```

## Training the model

To train the deep association metric model we used a novel [cosine metric learning](https://github.com/nwojke/cosine_metric_learning) approach which is provided as a separate repository.
Expand Down
15 changes: 14 additions & 1 deletion deep_sort_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import argparse
import os
import time

import cv2
import numpy as np
Expand Down Expand Up @@ -162,9 +163,16 @@ def run(sequence_dir, detection_file, output_file, min_confidence,
"cosine", max_cosine_distance, nn_budget)
tracker = Tracker(metric)
results = []
last_time = time.time()

def frame_callback(vis, frame_idx):
print("Processing frame %05d" % frame_idx)
curr_time = time.time()
if frame_callback.last_time is not None:
fps = 1 / (curr_time - frame_callback.last_time)
else:
fps = 0
frame_callback.last_time = curr_time
print("Processing frame %05d - %.1ffps" % (frame_idx, fps))

# Load image and generate detections.
detections = create_detections(
Expand Down Expand Up @@ -198,6 +206,9 @@ def frame_callback(vis, frame_idx):
results.append([
frame_idx, track.track_id, bbox[0], bbox[1], bbox[2], bbox[3]])

# Store FPS object
frame_callback.last_time = None

# Run tracker.
if display:
visualizer = visualization.Visualization(seq_info, update_ms=5)
Expand Down Expand Up @@ -257,7 +268,9 @@ def parse_args():

if __name__ == "__main__":
args = parse_args()
start_time = time.time()
run(
args.sequence_dir, args.detection_file, args.output_file,
args.min_confidence, args.nms_max_overlap, args.min_detection_height,
args.max_cosine_distance, args.nn_budget, args.display)
print("Processing time: %.2fs" % (time.time() - start_time))
94 changes: 94 additions & 0 deletions ov_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import timeit
import math

from openvino.inference_engine import IENetwork, IEPlugin
import numpy as np
import tensorflow as tf

all_batch_size = 1
np.random.seed(seed=69)

def _run_in_batches(f, data_dict, out, batch_size):
data_len = len(out)
num_batches = int(data_len / batch_size)

s, e = 0, 0
for i in range(num_batches):
s, e = i * batch_size, (i + 1) * batch_size
batch_data_dict = {k: v[s:e] for k, v in data_dict.items()}
out[s:e] = f(batch_data_dict)
if e < len(out):
batch_data_dict = {k: v[e:] for k, v in data_dict.items()}
out[e:] = f(batch_data_dict)

modelname = 'resources/networks/mars-small128'

# OV configuration
ov_net = IENetwork(model=modelname + '.xml', weights=modelname + '.bin')
ov_net.batch_size = all_batch_size
ov_plugin = IEPlugin(device='CPU')

# TF configuration
tf_session = tf.Session()
with tf.gfile.GFile(modelname + '.pb', 'rb') as gfile:
tf_graph = tf.GraphDef()
tf_graph.ParseFromString(gfile.read())
tf.import_graph_def(tf_graph, name='net')
tf_input_node = tf.get_default_graph().get_tensor_by_name('net/images:0')
tf_output_node = tf.get_default_graph().get_tensor_by_name('net/features:0')


# ?x128x64x3
testinput = np.random.random_sample((all_batch_size, 128, 64, 3))
testinput2 = testinput[:, :, :, ::-1]
print(testinput - testinput2)
# openvino expects colors major
ov_testinput = np.transpose(testinput, (0, 3, 1, 2))
ov_testinput2 = np.transpose(testinput2, (0, 3, 1, 2))

# run OV
ov_input_blob = next(iter(ov_net.inputs))
ov_out_blob = next(iter(ov_net.outputs))
ov_exec_net = ov_plugin.load(network=ov_net)

def run_ov(inp):
    """Run one OpenVINO inference pass on *inp*; returns the raw
    {output_blob_name: array} dict produced by infer()."""
    feed = {ov_input_blob: inp}
    return ov_exec_net.infer(inputs=feed)

ov_res = next(iter(run_ov(ov_testinput).values()))
ov_res2 = next(iter(run_ov(ov_testinput2).values()))

# run TF
def run_tf(inp):
    """Run the frozen TF graph on *inp* in batches; returns a
    (all_batch_size, 128) float32 feature matrix."""
    features = np.zeros((all_batch_size, 128), np.float32)

    def session_call(feed):
        return tf_session.run(tf_output_node, feed_dict=feed)

    _run_in_batches(session_call, {tf_input_node: inp},
                    features, all_batch_size)
    return features

tf_res = run_tf(testinput)
tf_res2 = run_tf(testinput2)

def compare(vec1, vec2):
    """Print element-wise absolute and relative differences of two feature
    vectors plus a PASSED/FAILED verdict based on np.allclose."""
    print('Diff abs (0.0 is exactly same):\n', vec1 - vec2)
    print('Diff rel (1.0 is exactly same):\n', vec1 / vec2)

    verdict = 'PASSED' if np.allclose(vec1, vec2) else 'FAILED'
    print('Comparison: {}'.format(verdict))

# compare different results
print('TF: RGB vs BGR')
compare(tf_res, tf_res2)
print('')

print('OV: RGB vs BGR')
compare(ov_res, ov_res2)
print('')

print('TF vs OV')
compare(tf_res, ov_res)
print('')

# timing
iterations = int(300 / all_batch_size)
print('Batch size {}, {} iterations:'.format(all_batch_size, iterations))
print(' OV: {:.5f}s'.format(timeit.timeit('run_ov(ov_testinput)', number=iterations, globals=globals())))
print(' TF: {:.5f}s'.format(timeit.timeit('run_tf(testinput)', number=iterations, globals=globals())))
15 changes: 12 additions & 3 deletions tools/freeze_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,12 @@ def parse_args():
parser.add_argument(
"--graphdef_out",
default="resources/networks/mars-small128.pb")
parser.add_argument(
"--no_preprocess",
default=False,
action='store_true',
help="Do not include preprocessing in model (to avoid compatibility "
"issues)")
return parser.parse_args()


Expand All @@ -197,9 +203,12 @@ def main():
with tf.Session(graph=tf.Graph()) as session:
input_var = tf.placeholder(
tf.uint8, (None, 128, 64, 3), name="images")
image_var = tf.map_fn(
lambda x: _preprocess(x), tf.cast(input_var, tf.float32),
back_prop=False)
if args.no_preprocess:
image_var = tf.cast(input_var, tf.float32)
else:
image_var = tf.map_fn(
lambda x: _preprocess(x), tf.cast(input_var, tf.float32),
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The _preprocess() function changes input order (BGR to RGB). Is this handled anywhere in the code?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No... but it seems to not matter for the computation.

I wrote a test script for that:
https://gist.github.com/r-or/e1b85c47e1906763b6e0e7a209812dda

TF: RGB vs BGR
Diff abs (0.0 is exactly same):
 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]]
Diff rel (1.0 is exactly same):
 [[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1.]]
Comparison: PASSED

OV: RGB vs BGR
Diff abs (0.0 is exactly same):
 [[-7.09891319e-05 -2.31117010e-05 -9.74312425e-05 -1.97231770e-04
   1.63659453e-04  1.75914727e-04 -2.75910832e-04 -1.31983310e-04
  -5.64876944e-04 -1.39862299e-04  3.16947699e-05  9.46968794e-06
  -1.95568660e-04 -2.75567174e-04  1.23247504e-04 -2.15724111e-04
   4.99635935e-05 -5.26588410e-05 -3.16016376e-04 -1.02594495e-05
  -2.16595829e-04 -6.73830509e-05  1.24670565e-04 -9.22754407e-05
  -4.97810543e-05 -2.27943063e-04 -1.59956515e-04 -1.92910433e-04
  -4.10407782e-04  2.03438103e-05  6.73681498e-05 -2.15888023e-04
   2.65173614e-04  2.46569514e-04 -1.86443329e-04  1.69944018e-04
  -2.33516097e-04  3.09579074e-04 -3.15740705e-04  6.92084432e-05
   1.41568482e-04 -2.09584832e-04 -7.94976950e-05 -2.32093036e-04
  -7.49334693e-04  1.75776891e-04 -2.28881836e-04  1.02892518e-05
   3.79905105e-05  1.49380416e-04 -1.06245279e-05  2.01821327e-04
  -5.26068732e-04  3.78191471e-04  4.58016992e-04 -3.39105725e-04
   2.95236707e-04  1.17262825e-05  9.52333212e-05 -1.28746033e-05
  -3.78444791e-04  2.62483954e-05  4.32081521e-04  5.81145287e-07
  -2.42659822e-04  4.62397933e-04  9.70438123e-05 -1.38171017e-05
   2.75548548e-04 -1.63108110e-04  1.20360419e-04  5.72502613e-05
   1.78739429e-04 -1.81391835e-04 -3.15383077e-04  7.68899918e-05
   2.03009695e-05  6.54584961e-04 -2.84455717e-04 -9.59448516e-05
  -6.94066286e-04 -5.18411398e-05 -5.67063689e-05  1.15118921e-04
  -1.28496438e-04  3.14921141e-04 -2.13776948e-04  1.39519572e-04
  -2.22593546e-04  9.27075744e-05 -5.28363511e-04  1.63061544e-04
   1.62258744e-04  3.24718654e-04  5.15423715e-04  2.54072249e-04
  -2.00938433e-04  9.00402665e-05 -1.41721219e-04  1.78741291e-04
   4.18379903e-04 -3.99368349e-04 -2.76640058e-05 -2.14301050e-04
   5.50225377e-06  4.53330576e-05  4.59015369e-04 -2.36004591e-04
  -3.93390656e-06  2.04637647e-04  1.68181956e-04  6.85453415e-07
  -7.73668289e-05 -8.13156366e-05  8.03060830e-05 -3.39727849e-04
   2.70333141e-04 -2.06910074e-04 -2.38791108e-05 -2.48849392e-06
  -1.09579414e-04  2.18413770e-05 -2.44807452e-04  2.52965838e-04
  -1.14329159e-05  2.36977823e-04  2.89959833e-04  1.11445785e-04]]
Diff rel (1.0 is exactly same):
 [[0.99967253 1.0002847  1.0013707  1.000923   1.0048432  1.0253918
  1.0197521  0.99684745 1.1119983  1.0031053  0.99962395 0.99975747
  1.0616565  1.010521   0.998444   1.0013002  1.0002184  0.9978832
  0.9966819  1.0001087  1.0198045  0.9997458  1.0013885  0.9991812
  0.99891114 1.0071045  1.0025008  0.99804175 0.99195236 0.9995756
  0.9990167  0.9979031  1.0026736  1.0044912  0.99593943 1.0042726
  0.997915   0.9958782  0.997491   0.99848914 1.0017109  1.0035508
  1.0006112  0.9962664  0.98856336 1.0163243  0.99638605 0.9998841
  1.0005648  1.0043436  1.0000782  0.9924133  1.0327789  1.0026767
  1.0035827  0.99799013 0.9976437  1.0007793  1.001019   1.0001298
  0.9959444  1.00028    1.0035039  0.9999948  0.9903539  1.0029724
  0.9961764  1.0004318  0.98782665 0.9987333  0.38110238 0.9992418
  0.9938017  1.0022285  0.9964751  1.0004631  0.9993321  0.7935085
  0.99424076 1.0016934  0.9955157  0.99962413 1.0005273  1.0375509
  1.0025675  0.9942383  1.0960878  1.0009598  0.9956771  0.99913466
  1.0227439  1.0079089  1.0007166  0.99399155 1.007452   1.0029502
  1.0042864  0.99888796 1.0130587  0.9905349  1.0048269  0.86005217
  1.0004     0.9964666  0.99983996 0.99923646 0.97229576 0.8474253
  1.0000467  1.0014483  0.9952575  0.9999925  0.99120194 0.99910206
  1.0022343  1.0710478  1.0043408  1.0027815  1.0002646  1.0000274
  1.0019089  0.9993802  0.9919843  0.99496603 0.99971646 0.98349696
  1.0108225  0.99822414]]
Comparison: FAILED

TF vs OV
Diff abs (0.0 is exactly same):
 [[-3.94286215e-03 -4.32543457e-04  5.99548221e-05 -5.05328178e-04
   9.27921385e-04 -4.12844960e-03  2.05636956e-03  7.24088401e-04
   1.61289563e-03  3.92809510e-03  1.66483223e-04 -6.79396465e-03
  -6.77901274e-03  1.13993883e-04  2.14674324e-03 -4.47931886e-03
  -3.22103500e-04 -2.19973177e-03 -3.49406153e-03 -4.19918448e-03
  -6.94096927e-03  4.93526459e-04  2.00904161e-03 -2.98864394e-03
   2.46691331e-03  1.02963299e-04  5.58376312e-04  7.87702948e-03
   1.99251249e-03  5.59294969e-03 -2.11878866e-03 -4.32344526e-03
   1.49095058e-03  6.78264722e-03 -6.30068034e-03 -6.66405261e-03
  -8.42780620e-03 -8.48181546e-04 -7.32316077e-03  5.94160333e-03
   1.05012730e-02 -6.90795481e-04  5.66825271e-04 -7.10094348e-03
  -2.20379978e-03  7.70511758e-03 -9.60171223e-03  6.82964921e-04
  -8.90789181e-03 -1.15070492e-03  1.48507953e-03 -2.18164176e-03
   1.88763440e-03  5.90819120e-03 -5.21732867e-03 -1.01997554e-02
   1.66078657e-03  2.63114460e-04 -4.62488085e-03  7.21380115e-04
   6.82404637e-03 -1.24016032e-02  4.28882986e-03  9.12263989e-04
  -5.39921224e-03  6.79710507e-03  2.50028446e-04  3.97107191e-03
   6.00080565e-03 -9.64917988e-03 -6.06814865e-09  3.70991230e-03
  -3.86371464e-03  6.10897690e-03 -1.71685219e-03  9.97103751e-03
  -1.18000209e-02 -7.65118748e-05 -4.82698902e-03 -1.86500326e-03
  -3.85785103e-03  2.82938778e-03  1.72593445e-03 -1.04905255e-02
  -4.40701842e-04  2.87934020e-03 -6.38315035e-03  5.70577383e-03
   5.37855923e-03 -1.30973756e-04  2.21801363e-03 -6.44079037e-03
   1.08246654e-02 -4.04008850e-03 -1.19326711e-02 -5.89519739e-04
  -4.79451939e-03 -2.68968195e-03  2.76577100e-03  2.17592716e-03
   1.15049183e-02  4.09299321e-03  1.38435513e-03  1.79736689e-02
   1.96448714e-03  2.19316036e-03 -3.39902006e-03  4.71524755e-03
  -9.59008932e-04 -2.22636759e-03  3.81218269e-03  1.75344944e-03
  -6.80692634e-03  2.39557773e-03  7.68467784e-04  4.61338321e-03
  -1.34492777e-02 -4.74420190e-03  2.16958672e-03 -2.16118991e-03
   5.91486692e-04  1.67673826e-03  1.77322514e-03  2.77697667e-03
   2.28755921e-03  1.66156795e-03 -1.35983313e-02  2.19796225e-03]]
Diff rel (1.0 is exactly same):
 [[ 0.9818045   1.0053259   0.9991577   1.0023628   1.0273277   0.41884965
   0.8556379   1.0173502   0.71241933  0.9130556   0.9980241   1.1740589
   3.0130827   0.9956931   0.9728552   1.0269623   0.9985928   0.91138774
   0.96319115  1.0444852   1.6223245   1.0018622   1.0223445   0.9734597
   1.0540177   0.9968135   0.9912921   1.0801181   1.0393881   0.8832715
   1.0309559   0.9579188   1.0149928   1.1229923   0.86221653  0.83317214
   0.92459404  1.0113395   0.9416606   0.87009555  1.1266977   1.011662
   0.99564457  0.88534284  0.96597564  1.704076    0.84784424  0.9923086
   0.8676459   0.9666845   0.98906654  1.0826377   0.8861161   1.0417054
   0.95933443  0.93942523  0.9867138   1.0174726   0.9505658   0.99272746
   1.0734274   0.86775905  1.0346589   0.99188757  0.7832818   1.043563
   0.99011093  0.87596244  0.7316255   0.924968    1.0000819   0.950832
   1.1348207   0.92511624  0.98074347  1.0600327   1.3884995   1.0304168
   0.9017044   1.0328609   0.9749625   1.0205228   0.9839614  -2.298085
   1.0087833   0.9470151   3.6175644   1.0392108   1.1049081   1.0012236
   0.9066464   0.69005466  1.047769    1.0752078   0.8287526   0.99317485
   1.1018409   1.0332566   0.748437    0.88367426  1.1320976   2.6676648
   0.9799915   1.2974048   0.94286215  0.9630336   1.2109962   4.5972047
   1.011381    0.98426676  0.89198875  0.98080915  0.21905166  1.0264783
   1.0213337   0.09919678  0.78497773  1.0636002   0.97595745  1.0237603
   0.989716    0.9523874   1.0585294   0.9444593   1.0567468   0.88234735
   0.49788728  0.9649135 ]]
Comparison: FAILED

Batch size 1, 300 iterations:
 OV: 1.66366s
 TF: 4.50380s

Apart from tensorflow and openvino not creating exactly the same results (which is to be expected) swapping BGR and RGB seems to not do anything.

Anyway, even though the results are slightly different, the tracker worked perfectly fine

Copy link
Owner

@nwojke nwojke May 7, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But it wouldn't hurt to convert from BGR to RGB before handing the image over from NumPy to TensorFlow, right?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, thanks for posting test results.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True, it wouldn't hurt either. I'll add it for consistency.

back_prop=False)

factory_fn = _network_factory()
features, _ = factory_fn(image_var, reuse=None)
Expand Down
89 changes: 78 additions & 11 deletions tools/generate_detections.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,17 @@
import os
import errno
import argparse
import time
import numpy as np
import cv2
import tensorflow as tf

try:
from openvino.inference_engine import IENetwork, IEPlugin
USE_DYN_BATCH = False
except ImportError:
pass


def _run_in_batches(f, data_dict, out, batch_size):
data_len = len(out)
Expand Down Expand Up @@ -71,7 +78,20 @@ def extract_image_patch(image, bbox, patch_shape):
class ImageEncoder(object):

def __init__(self, checkpoint_filename, input_name="images",
output_name="features"):
output_name="features", openvino_device=None):

self.openvino_device=openvino_device
if openvino_device is not None:
# setup device
model_base = os.path.splitext(checkpoint_filename)[0]
self._net = IENetwork(
model=model_base + '.xml',
weights=model_base + '.bin')
self._plugin = IEPlugin(device=openvino_device)
self._input_blob = next(iter(self._net.inputs))
self._out_blob = next(iter(self._net.outputs))
self._reload_openvino_net(1)

self.session = tf.Session()
with tf.gfile.GFile(checkpoint_filename, "rb") as file_handle:
graph_def = tf.GraphDef()
Expand All @@ -87,20 +107,52 @@ def __init__(self, checkpoint_filename, input_name="images",
self.feature_dim = self.output_var.get_shape().as_list()[-1]
self.image_shape = self.input_var.get_shape().as_list()[1:]

def _reload_openvino_net(self, batch_size):
    """(Re)load the executable network onto the OpenVINO device.

    MYRIAD devices (and builds without dynamic-batch support) cannot
    batch, so the network is pinned to batch size 1 there; otherwise the
    network is reloaded with the requested batch size and dynamic
    batching enabled.
    """
    dynamic_ok = USE_DYN_BATCH and self.openvino_device != "MYRIAD"
    if not dynamic_ok:
        self._net.batch_size = 1
        self._exec_net = self._plugin.load(network=self._net)
        return
    self._net.batch_size = batch_size
    self._exec_net = self._plugin.load(
        network=self._net, config={'DYN_BATCH_ENABLED': 'YES'})

def __call__(self, data_x, batch_size=32):
    """Compute appearance features for a batch of image patches.

    Parameters
    ----------
    data_x : ndarray
        Image patches in NHWC layout (num_patches, height, width, channels).
    batch_size : int
        Inference batch size for the batched code paths.

    Returns
    -------
    ndarray
        (num_patches, feature_dim) float32 feature matrix.
    """
    # Only consider reloading the OpenVINO network when OpenVINO is in use
    # and dynamic batching applies. The original guard also ran on the
    # plain-TensorFlow path (openvino_device is None) and crashed with an
    # AttributeError, because self._net only exists in OpenVINO mode.
    if (self.openvino_device is not None
            and self.openvino_device != "MYRIAD"
            and USE_DYN_BATCH
            and batch_size != self._net.batch_size):
        self._reload_openvino_net(batch_size)

    out = np.zeros((len(data_x), self.feature_dim), np.float32)
    if self.openvino_device:
        def reorder(tensor):
            # OpenVINO expects channels-first (NCHW) input.
            return np.transpose(tensor, (0, 3, 1, 2))

        if self.openvino_device == "MYRIAD" or not USE_DYN_BATCH:
            # doesn't support dynamic batch size: infer one patch at a time.
            for patch in range(len(data_x)):
                out[patch] = next(iter(self._exec_net.infer(
                    inputs={self._input_blob: reorder(
                        data_x[patch:patch + 1])}).values()))
        else:
            # infer() returns a {blob_name: array} dict; unwrap the single
            # output value so _run_in_batches can assign it into `out`
            # (the original passed the dict through and would fail on the
            # slice assignment).
            _run_in_batches(
                lambda x: next(iter(
                    self._exec_net.infer(inputs=x).values())),
                {self._input_blob: reorder(data_x)}, out, batch_size)
    else:
        # Plain TensorFlow path.
        _run_in_batches(
            lambda x: self.session.run(self.output_var, feed_dict=x),
            {self.input_var: data_x}, out, batch_size)
    return out


def create_box_encoder(model_filename, input_name="images",
output_name="features", batch_size=32):
image_encoder = ImageEncoder(model_filename, input_name, output_name)
output_name="features", batch_size=32,
openvino_device=None):
image_encoder = ImageEncoder(
model_filename, input_name, output_name,
openvino_device=openvino_device)
image_shape = image_encoder.image_shape

def encoder(image, boxes):
encoder.batch_size = batch_size
image_patches = []
for box in boxes:
patch = extract_image_patch(image, box, image_shape[:2])
Expand Down Expand Up @@ -132,7 +184,6 @@ def generate_detections(encoder, mot_dir, output_dir, detection_dir=None):
Path to custom detections. The directory structure should be the default
MOTChallenge structure: `[sequence]/det/det.txt`. If None, uses the
standard MOTChallenge detections.

"""
if detection_dir is None:
detection_dir = mot_dir
Expand Down Expand Up @@ -162,8 +213,12 @@ def generate_detections(encoder, mot_dir, output_dir, detection_dir=None):
frame_indices = detections_in[:, 0].astype(np.int)
min_frame_idx = frame_indices.astype(np.int).min()
max_frame_idx = frame_indices.astype(np.int).max()
last_frame_time = 0
for frame_idx in range(min_frame_idx, max_frame_idx + 1):
print("Frame %05d/%05d" % (frame_idx, max_frame_idx))
curr_frame_time = time.time()
print("Frame %05d/%05d - %.2ffps"
% (frame_idx, max_frame_idx, 1 / (curr_frame_time - last_frame_time)))
last_frame_time = curr_frame_time
mask = frame_indices == frame_idx
rows = detections_in[mask]

Expand Down Expand Up @@ -199,14 +254,26 @@ def parse_args():
parser.add_argument(
"--output_dir", help="Output directory. Will be created if it does not"
" exist.", default="detections")
parser.add_argument(
"--use_openvino", help="Use Openvino. Can be any available device as "
"long as it is compatible. Model & weights are expected to be inside "
"the folder specified with '-model' and end with '.xml' and '.bin' "
"respectively. Supply the device identifier (CPU, GPU, MYRIAD etc.)",
default="CPU")
return parser.parse_args()


def main():
    """Script entry point: build a feature encoder and generate detections."""
    args = parse_args()
    if args.use_openvino:
        # IENetwork is only bound if the top-of-file openvino import
        # succeeded; the original `assert IENetwork` raised a confusing
        # NameError (not AssertionError) when openvino is absent, and
        # asserts are stripped under `python -O` anyway.
        if "IENetwork" not in globals():
            raise ImportError(
                "Openvino could not be imported. "
                "Make sure it is installed correctly.")
        # Device identifiers (CPU, GPU, MYRIAD, ...) are upper-case.
        args.use_openvino = args.use_openvino.upper()

    encoder = create_box_encoder(
        args.model, batch_size=32, openvino_device=args.use_openvino)
    generate_detections(
        encoder, args.mot_dir, args.output_dir, args.detection_dir)


if __name__ == "__main__":
Expand Down