This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[MXNET-703] TensorRT runtime integration #11325

Merged
merged 47 commits into from Aug 10, 2018
Changes from 7 commits

Commits
47 commits
268e90b
[MXNET-703] TensorRT runtime integration
Jun 18, 2018
4855d8a
correctly assign self._optimized_symbol in executor
Caenorst Jul 25, 2018
419b294
declare GetTrtCompatibleSubsets and ReplaceSubgraph only if MXNET_USE…
Caenorst Jul 25, 2018
8d723c9
add comments in ReplaceSubgraph
Caenorst Jul 25, 2018
ca94624
Addressing Haibin's code review points
Jul 26, 2018
75c8642
Check that shared_buffer is not empty when USE_TENSORRT is set
Jul 26, 2018
2a11466
Added check that TensorRT binding is for inference only
Jul 26, 2018
190c9bf
Removed redundant decl.
mkolod Aug 2, 2018
d88ad8b
WIP Refactored TRT integration and tests
KellenSunderland Aug 8, 2018
87ebdce
Add more build guards, remove unused code
KellenSunderland Aug 8, 2018
83fa475
Remove ccache report
KellenSunderland Aug 8, 2018
4a3772f
Remove redundant const in declaration
KellenSunderland Aug 8, 2018
f779537
Clean Cmake TRT files
KellenSunderland Aug 8, 2018
b0748ef
Remove TensorRT env var usage
KellenSunderland Aug 8, 2018
35e1367
Use contrib optimize_graph instead of bind
KellenSunderland Aug 8, 2018
6338e45
Clean up cycle detector
KellenSunderland Aug 8, 2018
21d0239
Convert lenet test to contrib optimize
KellenSunderland Aug 8, 2018
f30dbef
Protect interface with trt build flag
KellenSunderland Aug 8, 2018
eaba593
Fix whitespace issues
KellenSunderland Aug 8, 2018
f870a3f
Add another build guard to c_api
KellenSunderland Aug 8, 2018
e40d6b3
Move get_optimized_symbol to contrib area
KellenSunderland Aug 8, 2018
7fa6a4a
Ignore gz files in test folder
KellenSunderland Aug 9, 2018
e777ab5
Make trt optimization implicit
KellenSunderland Aug 9, 2018
3ea9b89
Remove unused declaration
KellenSunderland Aug 9, 2018
d6d2cac
Replace build guards with runtime errors
KellenSunderland Aug 9, 2018
2d04aee
Change default value of TensorRT to off
KellenSunderland Aug 9, 2018
449a195
Warn user when TRT not active at runtime
KellenSunderland Aug 9, 2018
ed36739
Move TensorRTBind declaration, add descriptive errors
KellenSunderland Aug 9, 2018
882d8e5
Test TensorRT graph execution, fix bugs
KellenSunderland Aug 9, 2018
95a7955
Fix lint and whitespace issues
KellenSunderland Aug 9, 2018
0307467
Fix typo
KellenSunderland Aug 9, 2018
8504319
Removed default value for set_use_tensorrt
KellenSunderland Aug 9, 2018
55dd422
Improved documentation and fixed spacing issues
KellenSunderland Aug 9, 2018
f7ff036
Merge pull request #8 from KellenSunderland/tensorrt_integration_wip
mkolod Aug 9, 2018
ec9d3ea
Move static exec funcs to util files
KellenSunderland Aug 9, 2018
4b63738
Update comments to match util style
KellenSunderland Aug 9, 2018
694cbfb
Apply const to loop element
KellenSunderland Aug 9, 2018
2be7d25
Fix a few namespace issues
KellenSunderland Aug 9, 2018
369a3f7
Make static funcs inline to avoid compiler warning
KellenSunderland Aug 9, 2018
64b7e95
Merge pull request #9 from KellenSunderland/tensorrt_integration_15
mkolod Aug 9, 2018
1c7698b
Remove unused inference code from lenet5_train
KellenSunderland Aug 10, 2018
74b6603
Add explicit trt contrib bind, update tests to use it
KellenSunderland Aug 10, 2018
7fff80c
Rename trt bind call
KellenSunderland Aug 10, 2018
a754aab
Remove documentation that is not needed for trt
KellenSunderland Aug 10, 2018
7ea6ef9
Merge pull request #10 from KellenSunderland/tensorrt_integration_16
mkolod Aug 10, 2018
22a3823
Reorder arguments, allow position calling
KellenSunderland Aug 10, 2018
c3ace78
Merge pull request #11 from KellenSunderland/tensorrt_integration_16
mkolod Aug 10, 2018
3 changes: 3 additions & 0 deletions .gitmodules
@@ -26,3 +26,6 @@
[submodule "3rdparty/tvm"]
path = 3rdparty/tvm
url = https://github.com/dmlc/tvm
[submodule "3rdparty/onnx-tensorrt"]
path = 3rdparty/onnx-tensorrt
url = https://github.com/onnx/onnx-tensorrt.git
1 change: 1 addition & 0 deletions 3rdparty/onnx-tensorrt
Submodule onnx-tensorrt added at e7be19
10 changes: 10 additions & 0 deletions CMakeLists.txt
@@ -37,6 +37,7 @@ mxnet_option(ENABLE_CUDA_RTC "Build with CUDA runtime compilation support"
mxnet_option(BUILD_CPP_EXAMPLES "Build cpp examples" ON)
mxnet_option(INSTALL_EXAMPLES "Install the example source files." OFF)
mxnet_option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." OFF)
mxnet_option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF)

message(STATUS "CMAKE_SYSTEM_NAME ${CMAKE_SYSTEM_NAME}")
if(USE_CUDA AND NOT USE_OLDCMAKECUDA)
@@ -185,6 +186,15 @@ if(USE_VTUNE)
list(APPEND mxnet_LINKER_LIBS dl)
endif()

if(USE_TENSORRT)
message(STATUS "Using TensorRT")
include_directories(3rdparty/onnx-tensorrt/third_party/onnx/build/)
include_directories(3rdparty/onnx-tensorrt/)
include_directories(3rdparty/)
add_definitions(-DMXNET_USE_TENSORRT=1)
add_definitions(-DONNX_NAMESPACE=onnx)
endif()

if(USE_MKLDNN)
include(cmake/MklDnn.cmake)
# CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
28 changes: 28 additions & 0 deletions Jenkinsfile
@@ -28,6 +28,7 @@ mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3r
mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so'
// timeout in minutes
max_time = 120
// assign any caught errors here
@@ -372,6 +373,17 @@ try {
}
}
},
'TensorRT': {
node('mxnetlinux-cpu') {
ws('workspace/build-tensorrt') {
timeout(time: max_time, unit: 'MINUTES') {
init_git()
docker_run('ubuntu_gpu_tensorrt', 'build_ubuntu_gpu_tensorrt', false)
pack_lib('tensorrt', mx_tensorrt_lib)
}
}
}
},
'Build CPU windows':{
node('mxnetwindows-cpu') {
timeout(time: max_time, unit: 'MINUTES') {
@@ -740,6 +752,22 @@
}
}
},
'Python3: TensorRT GPU': {
node('mxnetlinux-gpu-p3') {
ws('workspace/build-tensorrt') {
timeout(time: max_time, unit: 'MINUTES') {
try {
init_git()
unpack_lib('tensorrt', mx_tensorrt_lib)
docker_run('ubuntu_gpu_tensorrt', 'unittest_ubuntu_tensorrt_gpu', true)
publish_test_coverage()
} finally {
collect_test_results_unix('nosetests_tensorrt.xml', 'nosetests_python3_tensorrt_gpu.xml')
}
}
}
}
},
'Scala: CPU': {
node('mxnetlinux-cpu') {
ws('workspace/ut-scala-cpu') {
8 changes: 8 additions & 0 deletions Makefile
@@ -91,6 +91,14 @@ else
endif
CFLAGS += -I$(TPARTYDIR)/mshadow/ -I$(TPARTYDIR)/dmlc-core/include -fPIC -I$(NNVM_PATH)/include -I$(DLPACK_PATH)/include -I$(TPARTYDIR)/tvm/include -Iinclude $(MSHADOW_CFLAGS)
LDFLAGS = -pthread $(MSHADOW_LDFLAGS) $(DMLC_LDFLAGS)


ifeq ($(USE_TENSORRT), 1)
Contributor:
We spoke offline about this, but just a quick note that we should also add the ability to build MXNet-TensorRT integration to our cmake builds.

Contributor Author:
@KellenSunderland I agree. Should the CMake build be part of the initial PR or a subsequent one?

Contributor:
I think either way would work.

CFLAGS += -I$(ROOTDIR) -I$(TPARTYDIR) -DONNX_NAMESPACE=$(ONNX_NAMESPACE) -DMXNET_USE_TENSORRT=1
LDFLAGS += -lprotobuf -pthread -lonnx -lonnx_proto -lnvonnxparser -lnvonnxparser_runtime -lnvinfer -lnvinfer_plugin
endif
# -L/usr/local/lib

ifeq ($(DEBUG), 1)
NVCCFLAGS += -std=c++11 -Xcompiler -D_FORCE_INLINES -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
else
14 changes: 7 additions & 7 deletions amalgamation/amalgamation.py
@@ -23,13 +23,12 @@
import platform

blacklist = [
'Windows.h', 'cublas_v2.h', 'cuda/tensor_gpu-inl.cuh',
'cuda_runtime.h', 'cudnn.h', 'cudnn_lrn-inl.h', 'curand.h', 'curand_kernel.h',
'glog/logging.h', 'io/azure_filesys.h', 'io/hdfs_filesys.h', 'io/s3_filesys.h',
'kvstore_dist.h', 'mach/clock.h', 'mach/mach.h',
'malloc.h', 'mkl.h', 'mkl_cblas.h', 'mkl_vsl.h', 'mkl_vsl_functions.h',
'nvml.h', 'opencv2/opencv.hpp', 'sys/stat.h', 'sys/types.h', 'cuda.h', 'cuda_fp16.h',
'omp.h', 'execinfo.h', 'packet/sse-inl.h', 'emmintrin.h', 'thrust/device_vector.h',
'Windows.h', 'cublas_v2.h', 'cuda/tensor_gpu-inl.cuh', 'cuda_runtime.h', 'cudnn.h',
'cudnn_lrn-inl.h', 'curand.h', 'curand_kernel.h', 'glog/logging.h', 'io/azure_filesys.h',
'io/hdfs_filesys.h', 'io/s3_filesys.h', 'kvstore_dist.h', 'mach/clock.h', 'mach/mach.h',
'malloc.h', 'mkl.h', 'mkl_cblas.h', 'mkl_vsl.h', 'mkl_vsl_functions.h', 'NvInfer.h', 'nvml.h',
'opencv2/opencv.hpp', 'sys/stat.h', 'sys/types.h', 'cuda.h', 'cuda_fp16.h', 'omp.h',
'onnx/onnx.pb.h', 'execinfo.h', 'packet/sse-inl.h', 'emmintrin.h', 'thrust/device_vector.h',
'cusolverDn.h', 'internal/concurrentqueue_internal_debug.h', 'relacy/relacy_std.hpp',
'relacy_shims.h', 'ittnotify.h', 'shared_mutex'
]
@@ -150,6 +149,7 @@ def expand(x, pending, stage):
h not in sysheaders and
'mkl' not in h and
'nnpack' not in h and
'tensorrt' not in h and
not h.endswith('.cuh')): sysheaders.append(h)
else:
expand.treeDepth += 1
41 changes: 41 additions & 0 deletions ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt
@@ -0,0 +1,41 @@
# -*- mode: dockerfile -*-
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Dockerfile to build and run MXNet with TensorRT on Ubuntu 16.04 for GPU

FROM nvidia/cuda:9.0-cudnn7-devel

WORKDIR /work/deps

COPY install/ubuntu_core.sh /work/
RUN /work/ubuntu_core.sh
COPY install/deb_ubuntu_ccache.sh /work/
RUN /work/deb_ubuntu_ccache.sh
COPY install/ubuntu_python.sh /work/
RUN /work/ubuntu_python.sh
COPY install/tensorrt.sh /work
RUN /work/tensorrt.sh

ARG USER_ID=0
COPY install/ubuntu_adduser.sh /work/
RUN /work/ubuntu_adduser.sh

COPY runtime_functions.sh /work/

WORKDIR /work/mxnet
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
45 changes: 45 additions & 0 deletions ci/docker/install/tensorrt.sh
@@ -0,0 +1,45 @@
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Install gluoncv since we're testing Gluon models as well
pip2 install gluoncv==0.2.0
pip3 install gluoncv==0.2.0

# Install Protobuf
# Install protoc 3.5 and build protobuf here (for onnx and onnx-tensorrt)
pushd .
cd ..
apt-get update
apt-get install -y automake libtool
git clone --recursive -b 3.5.1.1 https://github.com/google/protobuf.git
cd protobuf
./autogen.sh
./configure
make -j$(nproc)
make install
ldconfig
popd

# Install TensorRT
echo "TensorRT build enabled. Installing TensorRT."
wget -qO tensorrt.deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64/nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0_1-1_amd64.deb
dpkg -i tensorrt.deb
apt-get update
apt-get install -y --allow-downgrades libnvinfer-dev
rm tensorrt.deb
67 changes: 67 additions & 0 deletions ci/docker/runtime_functions.sh
@@ -436,6 +436,62 @@ build_ubuntu_gpu() {
build_ubuntu_gpu_cuda91_cudnn7
}

build_ubuntu_gpu_tensorrt() {

set -ex

build_ccache_wrappers

# Build ONNX
pushd .
echo "Installing ONNX."
cd 3rdparty/onnx-tensorrt/third_party/onnx
rm -rf build
mkdir -p build
cd build
cmake \
    -DCMAKE_CXX_FLAGS=-I/usr/include/python${PYVER} \
    -DBUILD_SHARED_LIBS=ON \
    -G Ninja ..
ninja -v
export LIBRARY_PATH=`pwd`:`pwd`/onnx/:$LIBRARY_PATH
export CPLUS_INCLUDE_PATH=`pwd`:$CPLUS_INCLUDE_PATH
popd

# Build ONNX-TensorRT
pushd .
cd 3rdparty/onnx-tensorrt/
mkdir -p build
cd build
cmake ..
make -j$(nproc)
export LIBRARY_PATH=`pwd`:$LIBRARY_PATH
popd

mkdir -p /work/mxnet/lib/
cp 3rdparty/onnx-tensorrt/third_party/onnx/build/*.so /work/mxnet/lib/
cp -L 3rdparty/onnx-tensorrt/build/libnvonnxparser_runtime.so.0 /work/mxnet/lib/
cp -L 3rdparty/onnx-tensorrt/build/libnvonnxparser.so.0 /work/mxnet/lib/

rm -rf build
make \
DEV=1 \
USE_BLAS=openblas \
USE_CUDA=1 \
USE_CUDA_PATH=/usr/local/cuda \
USE_CUDNN=1 \
USE_OPENCV=0 \
USE_DIST_KVSTORE=0 \
USE_TENSORRT=1 \
USE_JEMALLOC=0 \
USE_GPERFTOOLS=0 \
ONNX_NAMESPACE=onnx \
CUDA_ARCH="-gencode arch=compute_70,code=compute_70" \
-j$(nproc)

report_ccache_usage
}

build_ubuntu_gpu_mkldnn() {
set -ex

@@ -638,6 +694,15 @@ unittest_ubuntu_python3_gpu_nocudnn() {
nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu
}

unittest_ubuntu_tensorrt_gpu() {
set -ex
export PYTHONPATH=./python/
export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH
python tests/python/tensorrt/lenet5_train.py
nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS --with-xunit --xunit-file nosetests_trt_gpu.xml --verbose tests/python/tensorrt/
}
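
The tests this runner drives use the explicit contrib bind that later commits in this PR introduce ("Add explicit trt contrib bind, update tests to use it"). A minimal Python sketch of that flow, assuming the mx.contrib.tensorrt.tensorrt_bind API from those commits and a LeNet-5 checkpoint written by lenet5_train.py; the checkpoint prefix, epoch, and shapes here are illustrative:

    import mxnet as mx

    batch_size = 128
    # Load the symbol and weights produced by lenet5_train.py (names assumed).
    sym, arg_params, aux_params = mx.model.load_checkpoint('lenet5', 10)
    all_params = {k: v.as_in_context(mx.gpu(0)) for k, v in arg_params.items()}
    all_params.update({k: v.as_in_context(mx.gpu(0)) for k, v in aux_params.items()})
    # Bind with the TensorRT graph pass applied; TensorRT is inference-only,
    # so gradients are disabled.
    executor = mx.contrib.tensorrt.tensorrt_bind(sym, ctx=mx.gpu(0),
                                                 all_params=all_params,
                                                 data=(batch_size, 1, 28, 28),
                                                 softmax_label=(batch_size,),
                                                 grad_req='null',
                                                 force_rebind=True)
    data = mx.nd.zeros((batch_size, 1, 28, 28), ctx=mx.gpu(0))
    out = executor.forward(is_train=False, data=data)[0]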

# quantization gpu currently only runs on P3 instances
# need to separate it from unittest_ubuntu_python2_gpu()
unittest_ubuntu_python2_quantization_gpu() {
@@ -970,3 +1035,5 @@
declare -F | cut -d' ' -f3
echo
fi


7 changes: 7 additions & 0 deletions include/mxnet/c_api.h
@@ -1714,6 +1714,13 @@ MXNET_DLL int MXExecutorReshape(int partial_shaping,
NDArrayHandle** aux_states,
ExecutorHandle shared_exec,
ExecutorHandle *out);

/*!
* \brief get optimized graph from graph executor
*/
MXNET_DLL int MXExecutorGetOptimizedSymbol(ExecutorHandle handle,
Contributor:
why do we need to expose this?

Contributor Author:
@piiswrong See my reply to @eric-haibin-lin here.

Contributor Author:
@piiswrong What is your take now that you have the context?

Contributor:
I think it's better to expose it as a private member of executor

SymbolHandle *out);

/*!
* \brief set a call back to notify the completion of operation
*/
16 changes: 8 additions & 8 deletions include/mxnet/executor.h
@@ -152,14 +152,14 @@ class Executor {
static Executor* SimpleBind(nnvm::Symbol symbol,
Contributor:
This breaks API backward compatibility. You can define another simple bind function such as "SimpleBindEx" to achieve your purpose.

Contributor:
@reminisce: I'm not sure that I'd consider this a breaking API change. My understanding from what was discussed on the dev list was that we would follow the hour-glass model for the c_api, and make the c_api the only native interface that we semantically version.

Contributor:
@KellenSunderland You've got a valid point. My worry was that since this function is declared in a header placed in the include directory, we have no idea whether users have used this function to build something of their own.

Contributor:
@reminisce Yes, I do see what you mean. It's certainly possible someone used the header. As long as it's not too much work, would you mind creating a second function, @mkolod?

Contributor:
I think we don't need to maintain compatibility on this. It's unlikely any user would depend on it.

Contributor Author:
Thanks @piiswrong!

Contributor:
@mkolod As we discussed offline about not breaking existing APIs, could you create a simple bind API for you to use only, rather than modifying this one?

Contributor:
why change reference to pointer?

Contributor Author (mkolod, Jul 25, 2018):
@piiswrong I think we already discussed this here. The TensorRT graph pass does a graph rewrite, and that requires mutable types. Going from const ref to ref causes the linter to fail. The linter suggestion is to switch from const refs to pointers if mutability is required. You mentioned here that this was acceptable from an API stability point of view, because no changes in the C API are necessary, only in the C++ one.

Contributor Author (mkolod, Jul 25, 2018):
@piiswrong The method that requires mutability is GraphExecutor::ReinitGraph, which can be found here.

The call path here is SimpleBind() -> Init() -> ReinitGraph() (see here).

This doesn't introduce any breaking changes to the existing code base, as long as the C API is used to handle frontend bindings.

Contributor Author (mkolod, Jul 25, 2018):
@piiswrong Also, I presume the lines in question are 155-162, not 152. Those are the ones changed by this commit; lines 152-154 and 163-168 are as authored on 2017-06-02 by @reminisce.

Contributor Author:
@piiswrong Thoughts about the above?

Contributor:
why not pass by value?

Contributor:
Also, I think it's better to name the function InitTensorRT rather than ReinitGraph.

const Context& default_ctx,
const std::map<std::string, Context>& group2ctx,
const std::vector<Context>& in_arg_ctxes,
const std::vector<Context>& arg_grad_ctxes,
const std::vector<Context>& aux_state_ctxes,
const std::unordered_map<std::string, TShape>& arg_shape_map,
const std::unordered_map<std::string, int>& arg_dtype_map,
const std::unordered_map<std::string, int>& arg_stype_map,
const std::vector<OpReqType>& grad_req_types,
const std::unordered_set<std::string>& param_names,
std::vector<Context>* in_arg_ctxes,
Contributor:
Question: what is the purpose of modifying these arguments by passing pointers?

Contributor Author:
@reminisce Because if things are to be mutated, they need to be pointers, not non-const references (per the linter rules). Given your earlier comments about SimpleBindEx rather than modifying SimpleBind, this will be addressed there rather than modifying it here.

std::vector<Context>* arg_grad_ctxes,
std::vector<Context>* aux_state_ctxes,
std::unordered_map<std::string, TShape>* arg_shape_map,
std::unordered_map<std::string, int>* arg_dtype_map,
std::unordered_map<std::string, int>* arg_stype_map,
std::vector<OpReqType>* grad_req_types,
std::unordered_set<std::string>* param_names,
std::vector<NDArray>* in_args,
std::vector<NDArray>* arg_grads,
std::vector<NDArray>* aux_states,
16 changes: 16 additions & 0 deletions python/mxnet/base.py
@@ -709,3 +709,19 @@ def write_all_str(module_file, module_all_list):
module_op_file.close()
write_all_str(module_internal_file, module_internal_all)
module_internal_file.close()

def cint(init_val=0):
"""create a C int with an optional initial value"""
return C.c_int(init_val)

def int_addr(x):
"""given a c_int, return it's address as an int ptr"""
x_addr = C.addressof(x)
int_p = C.POINTER(C.c_int)
x_int_addr = C.cast(x_addr, int_p)
return x_int_addr

def checked_call(f, *args):
"""call a cuda function and check for success"""
error_t = f(*args)
assert error_t == 0, "Failing cuda call %s returns %s." % (f.__name__, error_t)
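
The three helpers above are thin ctypes conveniences for calling C functions that write a result through an int pointer and signal failure with a nonzero return code. A minimal sketch of how they compose, using a CUDA runtime call purely for illustration (loading libcudart by name is an assumption, not part of this diff):

    import ctypes as C
    from mxnet.base import cint, int_addr, checked_call

    cudart = C.CDLL('libcudart.so')  # assumed library name, for illustration only
    count = cint()                   # fresh c_int, initialized to 0
    # cudaGetDeviceCount(int*) writes the device count and returns 0 on
    # success, which checked_call asserts.
    checked_call(cudart.cudaGetDeviceCount, int_addr(count))
    print('visible CUDA devices:', count.value)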
19 changes: 18 additions & 1 deletion python/mxnet/executor.py
@@ -24,8 +24,9 @@
import ctypes
import copy
import numpy as np
import mxnet as mx
from .base import _LIB
from .base import mx_uint, NDArrayHandle, ExecutorHandle, py_str
from .base import mx_uint, NDArrayHandle, ExecutorHandle, SymbolHandle, py_str
from .base import check_call, c_handle_array, c_array_buf, c_str_array
from .ndarray import NDArray
from .ndarray import _ndarray_cls
@@ -73,6 +74,7 @@ def __init__(self, handle, symbol, ctx, grad_req, group2ctx):
self.aux_arrays = []
self.outputs = self._get_outputs()
self._symbol = copy.deepcopy(symbol)
self._optimized_symbol = None
self._arg_dict = None
self._grad_dict = None
self._aux_dict = None
@@ -323,6 +325,21 @@ def output_dict(self):
self._symbol.list_outputs(), self.outputs)
return self._output_dict

@property
def optimized_symbol(self):
Member:
When would a user want to access optimized_symbol? Is this added only for testing purposes? Does it make sense to keep it private?

Contributor Author (mkolod, Jul 25, 2018):
@eric-haibin-lin For now this is added for testing only, but it's actually quite useful, and not just for the TensorRT integration, but potentially for other acceleration libraries in the future. Imagine wanting to visualize a model to figure out what the acceleration library did. Looking at the timeline may be helpful, but only if one knows which kernels belong to the accelerator, and which to MXNet or direct cuBLAS/cuDNN calls. Seeing how the graph got rewritten can help determine which layers could not be handled by the acceleration library, and therefore what asks to make of libraries going forward. For example, TensorRT doesn't handle non-maximum suppression (NMS) directly; it can do that via a plugin. This initial integration doesn't handle TensorRT plugins. Knowing what didn't get subsumed into the accelerator graph node and comparing against a profile can determine whether, for instance, layers not handled by the acceleration library subgraph are dominating the profile; a visual inspection makes this really quick. This could be useful for both the Graphviz integration, as well as potentially for TensorBoard. We tested this with Graphviz already, and it's working fine. Check out the attached visualization of LeNet-5. The graph on the left is the original MXNet one, and the one on the right is with all layers except for the input tensor having been replaced by a single TensorRT node. For a more complex network such as SSD, the NMS layer would be visible as a separate NNVM node, outside of the TensorRT subgraph.

Basically, once you get the optimized symbol, you can also call

mx.viz.plot_network(optimized_symbol)

as you would for the original symbol.

[Attached image: LeNet-5 symbol graph, original (left) and with the TensorRT subgraph node (right)]

"""Get optimized symbol.

Returns
-------
symbol : nnvm::Symbol
The optimized nnvm symbol.
"""
if self._optimized_symbol is None:
handle = SymbolHandle()
check_call(_LIB.MXExecutorGetOptimizedSymbol(self.handle, ctypes.byref(handle)))
Contributor:
what happens if _optimized_symbol exists?

Contributor:
That is a mistake indeed; _optimized_symbol is never modified, so it would never exist. Corrected.

self._optimized_symbol = mx.sym.Symbol(handle=handle)
return self._optimized_symbol
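
As the thread above notes, the main value of this property is making the rewritten graph inspectable. A minimal usage sketch, assuming a TensorRT-enabled build and an executor already bound with the TensorRT pass applied (the executor variable is a placeholder):

    import mxnet as mx

    trt_sym = executor.optimized_symbol
    # Layers fused by TensorRT show up as a single subgraph node.
    print(trt_sym.tojson())
    mx.viz.plot_network(trt_sym)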

def copy_params_from(self, arg_params, aux_params=None, allow_extra_params=False):
"""Copy parameters from arg_params, aux_params into executor's internal array.
