diff --git a/README.md b/README.md index 69ec911e..46f9c19b 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,7 @@ and evaluate the MOT challenge benchmark. ## Generating detections +### Procedure without Openvino support Beside the main tracking application, this repository contains a script to generate features for person re-identification, suitable to compare the visual appearance of pedestrian bounding boxes using cosine similarity. @@ -88,6 +89,49 @@ descriptor. The files generated by this command can be used as input for the try passing an absolute path to the ``--model`` argument. This might help in some cases. +### Notes on Openvino +This branch adds support for Openvino. This is mainly useful for online +feature extraction rather than for generating detections into a text file for MOT16. + +**Performance estimation:** during the [AI hackathon](http://www.ai-hackathon.com/) +we used this to generate embeddings from a video stream on an NCS2. It was able +to sustain around 5fps while tracking more than 15 targets. Note that no batch +processing is available on this device, so inference must be +called sequentially for each target. + +On CPU it runs roughly at the same speed as vanilla tensorflow. (Intel) GPU +unfortunately doesn't work currently. + +For the feature extraction to use Openvino, a few additional steps have to be +taken: + +##### 1) Freeze model for Openvino +This is necessary as the default model includes elements which are incompatible +with Openvino: +``` +python tools/freeze_model.py --no_preprocess +``` + +##### 2) Convert model with Model Optimizer +``` +cd resources/networks +mo_tf.py --input_model mars-small128.pb -b 1 --data_type <FP16|FP32> +``` +As data type you need to use a type which is supported by the device you want +to use. The Movidius NCS2 compute stick for instance needs "FP16", while the CPU only +supports the default "FP32". 
+ +##### 3) Generate detections +To generate the MOT16 detections in addition you have to supply the Openvino +device (e.g. "CPU" or "MYRIAD" for the NCS2): +``` +python tools/generate_detections.py \ + --model=resources/networks/mars-small128.pb \ + --mot_dir=./MOT16/train \ + --output_dir=./resources/detections/MOT16_train \ + --use_openvino=MYRIAD +``` + ## Training the model To train the deep association metric model we used a novel [cosine metric learning](https://github.com/nwojke/cosine_metric_learning) approach which is provided as a separate repository. diff --git a/deep_sort_app.py b/deep_sort_app.py index 563d2dd2..9481ee21 100644 --- a/deep_sort_app.py +++ b/deep_sort_app.py @@ -3,6 +3,7 @@ import argparse import os +import time import cv2 import numpy as np @@ -162,9 +163,16 @@ def run(sequence_dir, detection_file, output_file, min_confidence, "cosine", max_cosine_distance, nn_budget) tracker = Tracker(metric) results = [] + last_time = time.time() def frame_callback(vis, frame_idx): - print("Processing frame %05d" % frame_idx) + curr_time = time.time() + if frame_callback.last_time is not None: + fps = 1 / (curr_time - frame_callback.last_time) + else: + fps = 0 + frame_callback.last_time = curr_time + print("Processing frame %05d - %.1ffps" % (frame_idx, fps)) # Load image and generate detections. detections = create_detections( @@ -198,6 +206,9 @@ def frame_callback(vis, frame_idx): results.append([ frame_idx, track.track_id, bbox[0], bbox[1], bbox[2], bbox[3]]) + # Store FPS object + frame_callback.last_time = None + # Run tracker. 
if display: visualizer = visualization.Visualization(seq_info, update_ms=5) @@ -257,7 +268,9 @@ def parse_args(): if __name__ == "__main__": args = parse_args() + start_time = time.time() run( args.sequence_dir, args.detection_file, args.output_file, args.min_confidence, args.nms_max_overlap, args.min_detection_height, args.max_cosine_distance, args.nn_budget, args.display) + print("Processing time: %.2fs" % (time.time() - start_time)) diff --git a/ov_test.py b/ov_test.py new file mode 100644 index 00000000..a8def3ad --- /dev/null +++ b/ov_test.py @@ -0,0 +1,94 @@ +import timeit +import math + +from openvino.inference_engine import IENetwork, IEPlugin +import numpy as np +import tensorflow as tf + +all_batch_size = 1 +np.random.seed(seed=69) + +def _run_in_batches(f, data_dict, out, batch_size): + data_len = len(out) + num_batches = int(data_len / batch_size) + + s, e = 0, 0 + for i in range(num_batches): + s, e = i * batch_size, (i + 1) * batch_size + batch_data_dict = {k: v[s:e] for k, v in data_dict.items()} + out[s:e] = f(batch_data_dict) + if e < len(out): + batch_data_dict = {k: v[e:] for k, v in data_dict.items()} + out[e:] = f(batch_data_dict) + +modelname = 'resources/networks/mars-small128' + +# OV configuration +ov_net = IENetwork(model=modelname + '.xml', weights=modelname + '.bin') +ov_net.batch_size = all_batch_size +ov_plugin = IEPlugin(device='CPU') + +# TF configuration +tf_session = tf.Session() +with tf.gfile.GFile(modelname + '.pb', 'rb') as gfile: + tf_graph = tf.GraphDef() + tf_graph.ParseFromString(gfile.read()) +tf.import_graph_def(tf_graph, name='net') +tf_input_node = tf.get_default_graph().get_tensor_by_name('net/images:0') +tf_output_node = tf.get_default_graph().get_tensor_by_name('net/features:0') + + +# ?x128x64x3 +testinput = np.random.random_sample((all_batch_size, 128, 64, 3)) +testinput2 = testinput[:, :, :, ::-1] +print(testinput - testinput2) +# openvino expects colors major +ov_testinput = np.transpose(testinput, (0, 3, 1, 2)) 
+ov_testinput2 = np.transpose(testinput2, (0, 3, 1, 2)) + +# run OV +ov_input_blob = next(iter(ov_net.inputs)) +ov_out_blob = next(iter(ov_net.outputs)) +ov_exec_net = ov_plugin.load(network=ov_net) + +def run_ov(inp): + return ov_exec_net.infer(inputs={ov_input_blob: inp}) + +ov_res = next(iter(run_ov(ov_testinput).values())) +ov_res2 = next(iter(run_ov(ov_testinput2).values())) + +# run TF +def run_tf(inp): + tf_output = np.zeros((all_batch_size, 128), np.float32) + _run_in_batches(lambda x: tf_session.run(tf_output_node, feed_dict=x), + {tf_input_node: inp}, tf_output, all_batch_size) + return tf_output + +tf_res = run_tf(testinput) +tf_res2 = run_tf(testinput2) + +def compare(vec1, vec2): + print('Diff abs (0.0 is exactly same):\n', vec1 - vec2) + print('Diff rel (1.0 is exactly same):\n', vec1 / vec2) + + comp = 'PASSED' if np.allclose(vec1, vec2) else 'FAILED' + print('Comparison: {}'.format(comp)) + +# compare different results +print('TF: RGB vs BGR') +compare(tf_res, tf_res2) +print('') + +print('OV: RGB vs BGR') +compare(ov_res, ov_res2) +print('') + +print('TF vs OV') +compare(tf_res, ov_res) +print('') + +# timing +iterations = int(300 / all_batch_size) +print('Batch size {}, {} iterations:'.format(all_batch_size, iterations)) +print(' OV: {:.5f}s'.format(timeit.timeit('run_ov(ov_testinput)', number=iterations, globals=globals()))) +print(' TF: {:.5f}s'.format(timeit.timeit('run_tf(testinput)', number=iterations, globals=globals()))) \ No newline at end of file diff --git a/tools/freeze_model.py b/tools/freeze_model.py index e89ad290..faf9f887 100644 --- a/tools/freeze_model.py +++ b/tools/freeze_model.py @@ -188,6 +188,12 @@ def parse_args(): parser.add_argument( "--graphdef_out", default="resources/networks/mars-small128.pb") + parser.add_argument( + "--no_preprocess", + default=False, + action='store_true', + help="Do not include preprocessing in model (to avoid compatibility " + "issues)") return parser.parse_args() @@ -197,9 +203,12 @@ def main(): 
with tf.Session(graph=tf.Graph()) as session: input_var = tf.placeholder( tf.uint8, (None, 128, 64, 3), name="images") - image_var = tf.map_fn( - lambda x: _preprocess(x), tf.cast(input_var, tf.float32), - back_prop=False) + if args.no_preprocess: + image_var = tf.cast(input_var, tf.float32) + else: + image_var = tf.map_fn( + lambda x: _preprocess(x), tf.cast(input_var, tf.float32), + back_prop=False) factory_fn = _network_factory() features, _ = factory_fn(image_var, reuse=None) diff --git a/tools/generate_detections.py b/tools/generate_detections.py index c7192c26..6bb65973 100644 --- a/tools/generate_detections.py +++ b/tools/generate_detections.py @@ -2,10 +2,17 @@ import os import errno import argparse +import time import numpy as np import cv2 import tensorflow as tf +try: + from openvino.inference_engine import IENetwork, IEPlugin + USE_DYN_BATCH = False +except ImportError: + pass + def _run_in_batches(f, data_dict, out, batch_size): data_len = len(out) @@ -71,7 +78,20 @@ def extract_image_patch(image, bbox, patch_shape): class ImageEncoder(object): def __init__(self, checkpoint_filename, input_name="images", - output_name="features"): + output_name="features", openvino_device=None): + + self.openvino_device=openvino_device + if openvino_device is not None: + # setup device + model_base = os.path.splitext(checkpoint_filename)[0] + self._net = IENetwork( + model=model_base + '.xml', + weights=model_base + '.bin') + self._plugin = IEPlugin(device=openvino_device) + self._input_blob = next(iter(self._net.inputs)) + self._out_blob = next(iter(self._net.outputs)) + self._reload_openvino_net(1) + self.session = tf.Session() with tf.gfile.GFile(checkpoint_filename, "rb") as file_handle: graph_def = tf.GraphDef() @@ -87,20 +107,52 @@ def __init__(self, checkpoint_filename, input_name="images", self.feature_dim = self.output_var.get_shape().as_list()[-1] self.image_shape = self.input_var.get_shape().as_list()[1:] + def _reload_openvino_net(self, batch_size): + if 
self.openvino_device == "MYRIAD" or not USE_DYN_BATCH: + self._net.batch_size = 1 + self._exec_net = self._plugin.load(network=self._net) + else: + self._net.batch_size = batch_size + self._exec_net = self._plugin.load( + network=self._net, config={'DYN_BATCH_ENABLED': 'YES'}) + def __call__(self, data_x, batch_size=32): + if self.openvino_device != "MYRIAD" \ + and batch_size != self._net.batch_size: + self._reload_openvino_net(batch_size) + out = np.zeros((len(data_x), self.feature_dim), np.float32) - _run_in_batches( - lambda x: self.session.run(self.output_var, feed_dict=x), - {self.input_var: data_x}, out, batch_size) + if self.openvino_device: + def reorder(tensor): + return np.transpose(tensor, (0, 3, 1, 2)) + + if self.openvino_device == "MYRIAD" or not USE_DYN_BATCH: + # doesn't support dynamic batch size + for patch in range(len(data_x)): + out[patch] = next(iter(self._exec_net.infer( + inputs={self._input_blob: reorder( + data_x[patch:patch + 1])}).values())) + else: + _run_in_batches( + lambda x: self._exec_net.infer(inputs=x), + {self._input_blob: reorder(data_x)}, out, batch_size) + else: + _run_in_batches( + lambda x: self.session.run(self.output_var, feed_dict=x), + {self.input_var: data_x}, out, batch_size) return out def create_box_encoder(model_filename, input_name="images", - output_name="features", batch_size=32): - image_encoder = ImageEncoder(model_filename, input_name, output_name) + output_name="features", batch_size=32, + openvino_device=None): + image_encoder = ImageEncoder( + model_filename, input_name, output_name, + openvino_device=openvino_device) image_shape = image_encoder.image_shape def encoder(image, boxes): + encoder.batch_size = batch_size image_patches = [] for box in boxes: patch = extract_image_patch(image, box, image_shape[:2]) @@ -132,7 +184,6 @@ def generate_detections(encoder, mot_dir, output_dir, detection_dir=None): Path to custom detections. 
The directory structure should be the default MOTChallenge structure: `[sequence]/det/det.txt`. If None, uses the standard MOTChallenge detections. - """ if detection_dir is None: detection_dir = mot_dir @@ -162,8 +213,12 @@ def generate_detections(encoder, mot_dir, output_dir, detection_dir=None): frame_indices = detections_in[:, 0].astype(np.int) min_frame_idx = frame_indices.astype(np.int).min() max_frame_idx = frame_indices.astype(np.int).max() + last_frame_time = 0 for frame_idx in range(min_frame_idx, max_frame_idx + 1): - print("Frame %05d/%05d" % (frame_idx, max_frame_idx)) + curr_frame_time = time.time() + print("Frame %05d/%05d - %.2ffps" + % (frame_idx, max_frame_idx, 1 / (curr_frame_time - last_frame_time))) + last_frame_time = curr_frame_time mask = frame_indices == frame_idx rows = detections_in[mask] @@ -199,14 +254,26 @@ def parse_args(): parser.add_argument( "--output_dir", help="Output directory. Will be created if it does not" " exist.", default="detections") + parser.add_argument( + "--use_openvino", help="Use Openvino. Can be any available device as " + "long as it is compatible. Model & weights are expected to be inside " + "the folder specified with '-model' and end with '.xml' and '.bin' " + "respectively. Supply the device identifier (CPU, GPU, MYRIAD etc.)", + default="CPU") return parser.parse_args() def main(): args = parse_args() - encoder = create_box_encoder(args.model, batch_size=32) - generate_detections(encoder, args.mot_dir, args.output_dir, - args.detection_dir) + if args.use_openvino: + assert IENetwork, "Openvino could not be imported. " \ + "Make sure it is installed correctly." + args.use_openvino = args.use_openvino.upper() + + encoder = create_box_encoder( + args.model, batch_size=32, openvino_device=args.use_openvino) + generate_detections( + encoder, args.mot_dir, args.output_dir, args.detection_dir) if __name__ == "__main__":