From a3154c24dea70da0de4b8d769f0515ecd6d410b8 Mon Sep 17 00:00:00 2001 From: James Gilles Date: Wed, 10 Apr 2019 17:28:25 -0400 Subject: [PATCH 001/106] [REFACTOR] Use more TypedPackedFuncs (#2981) * Add `set_body_simple` to Registry, refactor a lot of code to use it * Add more types to Relay PackedFuncs * Add Registry::set_body_method to easily make Node methods into PackedFuncs * Add set_body_method, set_body_node_method; start typing api_lang * Add some docs, remove unused script * Fix mysterious linter problem * Touch up api_ir.cc * Fix some issues with TOPI argument counts * Revert changes to topi.cc to avoid problems with optional arguments * A little more cleanup * Type more of the api _ functions * Whitespace * Finalize names and docs for new registry helpers * Update docs --- include/tvm/runtime/registry.h | 163 ++++++++ nnvm/src/compiler/compile_engine.cc | 4 +- nnvm/src/compiler/graph_hash.cc | 4 +- src/api/api_arith.cc | 60 +-- src/api/api_base.cc | 5 +- src/api/api_codegen.cc | 4 +- src/api/api_ir.cc | 218 +++++------ src/api/api_lang.cc | 374 +++++-------------- src/api/api_pass.cc | 99 ++--- src/api/api_schedule.cc | 32 +- src/codegen/codegen_opencl.cc | 4 +- src/codegen/codegen_opengl.cc | 4 +- src/codegen/codegen_vhls.cc | 4 +- src/codegen/llvm/codegen_amdgpu.cc | 4 +- src/codegen/llvm/codegen_nvptx.cc | 4 +- src/codegen/opt/build_cuda_on.cc | 4 +- src/codegen/source_module.cc | 4 +- src/codegen/spirv/build_vulkan.cc | 4 +- src/codegen/stackvm/codegen_stackvm.cc | 4 +- src/relay/backend/interpreter.cc | 25 +- src/relay/ir/adt.cc | 28 +- src/relay/ir/alpha_equal.cc | 12 +- src/relay/ir/base.cc | 12 +- src/relay/ir/expr.cc | 56 +-- src/relay/ir/expr_functor.cc | 5 +- src/relay/ir/hash.cc | 12 +- src/relay/ir/module.cc | 53 +-- src/relay/ir/type.cc | 43 +-- src/relay/op/debug.cc | 4 +- src/relay/op/image/resize.cc | 4 +- src/relay/op/nn/convolution.cc | 36 +- src/relay/op/nn/nn.cc | 71 ++-- src/relay/op/nn/pad.cc | 4 +- src/relay/op/nn/pooling.cc | 16 +- src/relay/op/nn/upsampling.cc | 4 +- src/relay/op/tensor/reduce.cc | 6 +- src/relay/op/tensor/transform.cc | 104 ++---- src/relay/op/vision/multibox_op.cc | 8 +- src/relay/op/vision/nms.cc | 8 +- src/relay/op/vision/rcnn_op.cc | 12 +- src/relay/op/vision/yolo.cc | 4 +- src/relay/pass/canonicalize_ops.cc | 4 +- src/relay/pass/combine_parallel_conv2d.cc | 4 +- src/relay/pass/dead_code.cc | 4 +- src/relay/pass/device_annotation.cc | 12 +- src/relay/pass/fold_constant.cc | 4 +- src/relay/pass/fuse_ops.cc | 4 +- src/relay/pass/gradient.cc | 12 +- src/relay/pass/mac_count.cc | 4 +- src/relay/pass/pass_manager.cc | 27 +- src/relay/pass/quantize.cc | 13 +- src/relay/pass/simplify_inference.cc | 4 +- src/relay/pass/to_a_normal_form.cc | 4 +- src/relay/pass/to_graph_normal_form.cc | 4 +- src/relay/pass/type_infer.cc | 4 +- src/relay/pass/util.cc | 8 +- src/relay/pass/well_formed.cc | 5 +- src/runtime/cuda/cuda_module.cc | 12 +- src/runtime/metal/metal_module.mm | 8 +- src/runtime/opencl/aocl/aocl_module.cc | 4 +- src/runtime/opencl/opencl_module.cc | 12 +- src/runtime/opencl/sdaccel/sdaccel_module.cc | 8 +- src/runtime/rocm/rocm_module.cc | 8 +- src/runtime/rpc/rpc_event_impl.cc | 4 +- src/runtime/rpc/rpc_socket_impl.cc | 4 +- src/runtime/stackvm/stackvm_module.cc | 4 +- src/runtime/vulkan/vulkan_module.cc | 8 +- web/web_runtime.cc | 10 +- 68 files changed, 635 insertions(+), 1090 deletions(-) diff --git a/include/tvm/runtime/registry.h b/include/tvm/runtime/registry.h index 50bb5c5b967d..40e1a520cb67 100644 --- 
a/include/tvm/runtime/registry.h +++ b/include/tvm/runtime/registry.h @@ -83,6 +83,169 @@ class Registry { Registry& set_body_typed(FLambda f) { return set_body(TypedPackedFunc<FType>(f).packed()); } + + /*! + * \brief set the body of the function to the given function pointer. + * Note that this doesn't work with lambdas; you need to + * explicitly give a type for those. + * Note that this will ignore default arg values and always require all arguments to be provided. + * + * \code + * + * int multiply(int x, int y) { + * return x * y; + * } + * + * TVM_REGISTER_API("multiply") + * .set_body_typed(multiply); // will have type int(int, int) + * + * \endcode + * + * \param f The function to forward to. + * \tparam R the return type of the function (inferred). + * \tparam Args the argument types of the function (inferred). + */ + template<typename R, typename ...Args> + Registry& set_body_typed(R (*f)(Args...)) { + return set_body(TypedPackedFunc<R(Args...)>(f)); + } + + /*! + * \brief set the body of the function to be the passed method pointer. + * Note that this will ignore default arg values and always require all arguments to be provided. + * + * \code + * + * // node subclass: + * struct Example { + * int doThing(int x); + * }; + * TVM_REGISTER_API("Example_doThing") + * .set_body_method(&Example::doThing); // will have type int(Example, int) + * + * \endcode + * + * \param f the method pointer to forward to. + * \tparam T the type containing the method (inferred). + * \tparam R the return type of the function (inferred). + * \tparam Args the argument types of the function (inferred). + */ + template<typename T, typename R, typename ...Args> + Registry& set_body_method(R (T::*f)(Args...)) { + return set_body_typed<R(T, Args...)>([f](T target, Args... params) -> R { + // call method pointer + return (target.*f)(params...); + }); + } + + /*! + * \brief set the body of the function to be the passed method pointer. + * Note that this will ignore default arg values and always require all arguments to be provided. + * + * \code + * + * // node subclass: + * struct Example { + * int doThing(int x); + * }; + * TVM_REGISTER_API("Example_doThing") + * .set_body_method(&Example::doThing); // will have type int(Example, int) + * + * \endcode + * + * \param f the method pointer to forward to. + * \tparam T the type containing the method (inferred). + * \tparam R the return type of the function (inferred). + * \tparam Args the argument types of the function (inferred). + */ + template<typename T, typename R, typename ...Args> + Registry& set_body_method(R (T::*f)(Args...) const) { + return set_body_typed<R(T, Args...)>([f](const T target, Args... params) -> R { + // call method pointer + return (target.*f)(params...); + }); + }
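// Editor's sketch (not part of the patch): the pointer overloads above infer
// the signature from the function or method type, but a lambda still has to go
// through the pre-existing set_body_typed<FType> overload with an explicit
// signature, e.g. (hypothetical registration name, for illustration only):
//
//   TVM_REGISTER_API("multiply_lambda")
//   .set_body_typed<int(int, int)>([](int x, int y) { return x * y; });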
+ + /*! + * \brief set the body of the function to be the passed method pointer. + * Used when calling a method on a Node subclass through a NodeRef subclass. + * Note that this will ignore default arg values and always require all arguments to be provided. + * + * \code + * + * // node subclass: + * struct ExampleNode: BaseNode { + * int doThing(int x); + * }; + * + * // noderef subclass + * struct Example; + * + * TVM_REGISTER_API("Example_doThing") + * .set_body_method<Example>(&ExampleNode::doThing); // will have type int(Example, int) + * + * // note that just doing: + * // .set_body_method(&ExampleNode::doThing); + * // wouldn't work, because ExampleNode can't be taken from a TVMArgValue. + * + * \endcode + * + * \param f the method pointer to forward to. + * \tparam TNodeRef the node reference type to call the method on + * \tparam TNode the node type containing the method (inferred). + * \tparam R the return type of the function (inferred). + * \tparam Args the argument types of the function (inferred). + */ + template<typename TNodeRef, typename TNode, typename R, typename ...Args, + typename = typename std::enable_if<std::is_base_of<NodeRef, TNodeRef>::value>::type> + Registry& set_body_method(R (TNode::*f)(Args...)) { + return set_body_typed<R(TNodeRef, Args...)>([f](TNodeRef ref, Args... params) { + TNode* target = ref.operator->(); + // call method pointer + return (target->*f)(params...); + }); + } + + /*! + * \brief set the body of the function to be the passed method pointer. + * Used when calling a method on a Node subclass through a NodeRef subclass. + * Note that this will ignore default arg values and always require all arguments to be provided. + * + * \code + * + * // node subclass: + * struct ExampleNode: BaseNode { + * int doThing(int x); + * }; + * + * // noderef subclass + * struct Example; + * + * TVM_REGISTER_API("Example_doThing") + * .set_body_method<Example>(&ExampleNode::doThing); // will have type int(Example, int) + * + * // note that just doing: + * // .set_body_method(&ExampleNode::doThing); + * // wouldn't work, because ExampleNode can't be taken from a TVMArgValue. + * + * \endcode + * + * \param f the method pointer to forward to. + * \tparam TNodeRef the node reference type to call the method on + * \tparam TNode the node type containing the method (inferred). + * \tparam R the return type of the function (inferred). + * \tparam Args the argument types of the function (inferred). + */ + template<typename TNodeRef, typename TNode, typename R, typename ...Args, + typename = typename std::enable_if<std::is_base_of<NodeRef, TNodeRef>::value>::type> + Registry& set_body_method(R (TNode::*f)(Args...) const) { + return set_body_typed<R(TNodeRef, Args...)>([f](TNodeRef ref, Args... params) { + const TNode* target = ref.operator->(); + // call method pointer + return (target->*f)(params...); + }); + } + /*! + * \brief Register a function with given name + * \param name The name of the function. diff --git a/nnvm/src/compiler/compile_engine.cc b/nnvm/src/compiler/compile_engine.cc index 95ed87be5b86..3da95e879fa7 100644 --- a/nnvm/src/compiler/compile_engine.cc +++ b/nnvm/src/compiler/compile_engine.cc @@ -364,9 +364,7 @@ TVM_REGISTER_GLOBAL("nnvm.compiler.GraphKeyGetGraph") }); TVM_REGISTER_GLOBAL("nnvm.compiler.MakeGraphKey") -.set_body([](tvm::runtime::TVMArgs args, tvm::runtime::TVMRetValue *rv) { - *rv = GraphKeyNode::make(args[0], args[1], args[2]); - }); +.set_body_typed(GraphKeyNode::make); // This can be used to extract workloads from nnvm compiler TVM_REGISTER_GLOBAL("nnvm.compiler.CacheItem2ScheduleArgs") diff --git a/nnvm/src/compiler/graph_hash.cc b/nnvm/src/compiler/graph_hash.cc index e825ef4efe57..b76f99fa58d3 100644 --- a/nnvm/src/compiler/graph_hash.cc +++ b/nnvm/src/compiler/graph_hash.cc @@ -235,8 +235,6 @@ std::string GraphDeepCompare(const Graph& a, } TVM_REGISTER_GLOBAL("nnvm.graph.DeepCompare") -.set_body([](tvm::runtime::TVMArgs args, tvm::runtime::TVMRetValue *rv) { - *rv = GraphDeepCompare(args[0], args[1], args[2]); - }); +.set_body_typed(GraphDeepCompare); } // namespace compiler } // namespace nnvm
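// Usage sketch (editorial, not part of the patch): a typed registration is
// still an ordinary PackedFunc at the call site, so existing callers need no
// changes. graph_a and graph_b are hypothetical Graph values, for illustration:
//
//   const tvm::runtime::PackedFunc* f =
//       tvm::runtime::Registry::Get("nnvm.graph.DeepCompare");
//   std::string report = (*f)(graph_a, graph_b, false);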
diff --git a/src/api/api_arith.cc b/src/api/api_arith.cc index ca0bed18f554..fce73aabf6a7 100644 --- a/src/api/api_arith.cc +++ b/src/api/api_arith.cc @@ -31,73 +31,51 @@ namespace tvm { namespace arith { TVM_REGISTER_API("arith.intset_single_point") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = IntSet::single_point(args[0]); - }); +.set_body_typed(IntSet::single_point); TVM_REGISTER_API("arith.intset_vector") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = IntSet::vector(args[0]); - }); +.set_body_typed(IntSet::vector); TVM_REGISTER_API("arith.intset_interval") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = IntSet::interval(args[0], args[1]); - }); +.set_body_typed(IntSet::interval); TVM_REGISTER_API("arith.DetectLinearEquation") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = DetectLinearEquation(args[0], args[1]); - }); +.set_body_typed(DetectLinearEquation); TVM_REGISTER_API("arith.DetectClipBound") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = DetectClipBound(args[0], args[1]); - }); +.set_body_typed(DetectClipBound); TVM_REGISTER_API("arith.DeduceBound") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = DeduceBound(args[0], args[1], - args[2].operator Map<Var, IntSet>(), - args[3].operator Map<Var, IntSet>()); - }); +.set_body_typed<IntSet(Expr, Expr, Map<Var, IntSet>, Map<Var, IntSet>)>([]( + Expr v, Expr cond, + const Map<Var, IntSet> hint_map, + const Map<Var, IntSet> relax_map +) { + return DeduceBound(v, cond, hint_map, relax_map); +});
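// Editor's note (not part of the patch): DeduceBound is overloaded, so a bare
// .set_body_typed(DeduceBound) would be ambiguous; the explicit signature plus
// a forwarding lambda pins down which overload is exported. An equivalent
// sketch using a function-pointer cast instead of a lambda (assuming this
// overload's exact parameter types) would be:
//
//   TVM_REGISTER_API("arith.DeduceBound")
//   .set_body_typed(static_cast<IntSet (*)(Expr, Expr, const Map<Var, IntSet>&,
//                                          const Map<Var, IntSet>&)>(DeduceBound));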
TVM_REGISTER_API("arith.DomainTouched") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = DomainTouched(args[0], args[1], args[2], args[3]); - }); +.set_body_typed(DomainTouched); TVM_REGISTER_API("_IntervalSetGetMin") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = args[0].operator IntSet().min(); - }); +.set_body_method(&IntSet::min); TVM_REGISTER_API("_IntervalSetGetMax") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = args[0].operator IntSet().max(); - }); +.set_body_method(&IntSet::max); TVM_REGISTER_API("_IntSetIsNothing") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = args[0].operator IntSet().is_nothing(); - }); +.set_body_method(&IntSet::is_nothing); TVM_REGISTER_API("_IntSetIsEverything") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = args[0].operator IntSet().is_everything(); - }); +.set_body_method(&IntSet::is_everything); TVM_REGISTER_API("arith._make_ConstIntBound") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = ConstIntBoundNode::make(args[0], args[1]); - }); +.set_body_typed(ConstIntBoundNode::make); TVM_REGISTER_API("arith._make_ModularSet") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = ModularSetNode::make(args[0], args[1]); - }); +.set_body_typed(ModularSetNode::make); TVM_REGISTER_API("arith._CreateAnalyzer") .set_body([](TVMArgs args, TVMRetValue* ret) { diff --git a/src/api/api_base.cc b/src/api/api_base.cc index 23d1f5c67f7c..28ebb4d65005 100644 --- a/src/api/api_base.cc +++ b/src/api/api_base.cc @@ -50,9 +50,8 @@ TVM_REGISTER_API("_load_json") .set_body_typed(LoadJSON); TVM_REGISTER_API("_TVMSetStream") -.set_body([](TVMArgs args, TVMRetValue *ret) { - TVMSetStream(args[0], args[1], args[2]); - }); +.set_body_typed(TVMSetStream); + TVM_REGISTER_API("_save_param_dict") .set_body([](TVMArgs args, TVMRetValue *rv) { CHECK_EQ(args.size() % 2, 0u); diff --git a/src/api/api_codegen.cc b/src/api/api_codegen.cc index e44ebbec7085..73e26719cf15 100644 --- a/src/api/api_codegen.cc +++ b/src/api/api_codegen.cc @@ -41,8 +41,6 @@ TVM_REGISTER_API("codegen._Build") }); TVM_REGISTER_API("module._PackImportsToC") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = PackImportsToC(args[0], args[1]); - }); +.set_body_typed(PackImportsToC); } // namespace codegen } // namespace tvm diff --git a/src/api/api_ir.cc b/src/api/api_ir.cc index c5680bb3df8d..2525059b47ba 100644 --- a/src/api/api_ir.cc +++ b/src/api/api_ir.cc @@ -31,54 +31,43 @@ namespace tvm { namespace ir { TVM_REGISTER_API("_Var") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = Variable::make(args[1], args[0]); +.set_body_typed<VarExpr(std::string, Type)>([](std::string s, Type t) { + return Variable::make(t, s); }); TVM_REGISTER_API("make.abs") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = tvm::abs(args[0]); - }); +.set_body_typed(tvm::abs); TVM_REGISTER_API("make.floor") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = tvm::floor(args[0]); - }); +.set_body_typed(tvm::floor); TVM_REGISTER_API("make.ceil") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = tvm::ceil(args[0]); - }); +.set_body_typed(tvm::ceil); TVM_REGISTER_API("make.round") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = tvm::round(args[0]); - }); +.set_body_typed(tvm::round); TVM_REGISTER_API("make.trunc") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = tvm::trunc(args[0]); - }); +.set_body_typed(tvm::trunc); TVM_REGISTER_API("make._cast") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = tvm::cast(args[0], args[1]); - }); +.set_body_typed(tvm::cast); TVM_REGISTER_API("make._range_by_min_extent") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = Range::make_by_min_extent(args[0], args[1]); - }); +.set_body_typed(Range::make_by_min_extent); TVM_REGISTER_API("make.For") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = For::make(args[0], - args[1], - args[2], - static_cast<ForType>(args[3].operator int()), - static_cast<DeviceAPI>(args[4].operator int()), - args[5]); - }); +.set_body_typed<Stmt(VarExpr, Expr, Expr, int, int, Stmt)>([]( + VarExpr loop_var, Expr min, Expr extent, + int for_type, int device_api, Stmt body +) { + return For::make(loop_var, + min, + extent, + static_cast<ForType>(for_type), + static_cast<DeviceAPI>(device_api), + body); +}); TVM_REGISTER_API("make.Load") .set_body([](TVMArgs args, TVMRetValue *ret) { @@ -101,114 +90,87 @@ TVM_REGISTER_API("make.Store") }); TVM_REGISTER_API("make.Realize") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = Realize::make(args[0], - args[1], - args[2], - args[3], - args[4], - args[5]); - }); - +.set_body_typed(Realize::make); TVM_REGISTER_API("make.Call") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = Call::make(args[0], - args[1], - args[2], - static_cast<Call::CallType>(args[3].operator int()), - args[4], - args[5]); - }); +.set_body_typed<Expr(Type, std::string, Array<Expr>, int, FunctionRef, int)>([]( + Type type, std::string name, + Array<Expr> args, int call_type, + FunctionRef func, int value_index +) { + return Call::make(type, + name, + args, + static_cast<Call::CallType>(call_type), + func, + value_index); +}); TVM_REGISTER_API("make.CommReducer") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = CommReducerNode::make(args[0], - args[1], - args[2], - args[3]); - }); +.set_body_typed(CommReducerNode::make); // make from two arguments -#define REGISTER_MAKE1(Node) \ - TVM_REGISTER_API("make."#Node) \ - .set_body([](TVMArgs args, TVMRetValue *ret) { \ - *ret = Node::make(args[0]); \ - }) \ - -#define REGISTER_MAKE2(Node) \ +#define REGISTER_MAKE(Node) \ TVM_REGISTER_API("make."#Node) \ - .set_body([](TVMArgs args, TVMRetValue *ret) { \ - *ret = Node::make(args[0], args[1]); \ - }) \ - -#define REGISTER_MAKE3(Node) \ - TVM_REGISTER_API("make."#Node) \ - .set_body([](TVMArgs args, TVMRetValue *ret) { \ - *ret = Node::make(args[0], args[1], args[2]); \ - }) \ - -#define REGISTER_MAKE4(Node) \ - TVM_REGISTER_API("make."#Node) \ - .set_body([](TVMArgs args, TVMRetValue *ret) { \ - *ret = Node::make(args[0], args[1], args[2], args[3]); \ - }) \ - -#define REGISTER_MAKE5(Node) \ - TVM_REGISTER_API("make."#Node) \ - .set_body([](TVMArgs args, TVMRetValue *ret) { \ - *ret = Node::make(args[0], args[1], args[2], args[3], args[4]); \ - }) \ - - -REGISTER_MAKE5(Reduce); -REGISTER_MAKE4(AttrStmt); - -REGISTER_MAKE2(IntImm); -REGISTER_MAKE2(UIntImm); -REGISTER_MAKE2(FloatImm); -REGISTER_MAKE1(StringImm); - -REGISTER_MAKE2(Add); -REGISTER_MAKE2(Sub); -REGISTER_MAKE2(Mul); -REGISTER_MAKE2(Div); -REGISTER_MAKE2(Mod); -REGISTER_MAKE2(Min); -REGISTER_MAKE2(Max); -REGISTER_MAKE2(EQ); -REGISTER_MAKE2(NE); -REGISTER_MAKE2(LT); -REGISTER_MAKE2(LE); -REGISTER_MAKE2(GT); -REGISTER_MAKE2(GE); -REGISTER_MAKE2(And); -REGISTER_MAKE2(Or); - -REGISTER_MAKE1(Not); -REGISTER_MAKE3(Select); -REGISTER_MAKE3(Ramp); -REGISTER_MAKE2(Cast); -REGISTER_MAKE2(Broadcast); -REGISTER_MAKE2(Shuffle); -REGISTER_MAKE3(Let); -REGISTER_MAKE3(LetStmt); -REGISTER_MAKE3(AssertStmt); -REGISTER_MAKE3(ProducerConsumer); -REGISTER_MAKE5(Allocate); -REGISTER_MAKE4(Provide); -REGISTER_MAKE4(Prefetch); -REGISTER_MAKE1(Free); -REGISTER_MAKE2(Block); -REGISTER_MAKE3(IfThenElse); -REGISTER_MAKE1(Evaluate); + .set_body_typed(Node::make); \ + +REGISTER_MAKE(Reduce); +REGISTER_MAKE(AttrStmt); + +REGISTER_MAKE(IntImm); +REGISTER_MAKE(UIntImm); +REGISTER_MAKE(FloatImm); +REGISTER_MAKE(StringImm); + +REGISTER_MAKE(Add); +REGISTER_MAKE(Sub); +REGISTER_MAKE(Mul); +REGISTER_MAKE(Div); +REGISTER_MAKE(Mod); +REGISTER_MAKE(Min); +REGISTER_MAKE(Max); +REGISTER_MAKE(EQ); +REGISTER_MAKE(NE); +REGISTER_MAKE(LT); +REGISTER_MAKE(LE); +REGISTER_MAKE(GT); +REGISTER_MAKE(GE); +REGISTER_MAKE(And); +REGISTER_MAKE(Or); + +REGISTER_MAKE(Not); +REGISTER_MAKE(Select); +REGISTER_MAKE(Ramp); +REGISTER_MAKE(Cast); +REGISTER_MAKE(Broadcast); +REGISTER_MAKE(Shuffle); +REGISTER_MAKE(Let); +REGISTER_MAKE(LetStmt); +REGISTER_MAKE(AssertStmt); +REGISTER_MAKE(ProducerConsumer); +REGISTER_MAKE(Provide); +REGISTER_MAKE(Prefetch); +REGISTER_MAKE(Free); +REGISTER_MAKE(IfThenElse); +REGISTER_MAKE(Evaluate);
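// Expansion sketch (editorial, not part of the patch): REGISTER_MAKE(Add) now
// expands to
//
//   TVM_REGISTER_API("make.Add")
//   .set_body_typed(Add::make);
//
// and the signature is inferred from Add::make itself, so a single macro
// covers node constructors of any arity; the numbered REGISTER_MAKE1..5
// variants become unnecessary.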
+ +// overloaded, needs special handling +TVM_REGISTER_API("make.Block") + .set_body_typed(static_cast<Stmt (*)(Stmt, Stmt)>(Block::make)); + +// has default args +TVM_REGISTER_API("make.Allocate") + .set_body_typed<Stmt(VarExpr, Type, Array<Expr>, Expr, Stmt)>([]( + VarExpr buffer_var, Type type, Array<Expr> extents, Expr condition, Stmt body + ){ + return Allocate::make(buffer_var, type, extents, condition, body); + }); // operator overloading, smarter than make #define REGISTER_MAKE_BINARY_OP(Node, Func) \ TVM_REGISTER_API("make."#Node) \ - .set_body([](TVMArgs args, TVMRetValue *ret) { \ - Expr a = args[0], b = args[1]; \ - *ret = (Func(a, b)); \ + .set_body_typed<Expr(Expr, Expr)>([](Expr a, Expr b) { \ + return (Func(a, b)); \ }) #define REGISTER_MAKE_BIT_OP(Node, Func) \ diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc index aac73f1878f8..42d60b85e375 100644 --- a/src/api/api_lang.cc +++ b/src/api/api_lang.cc @@ -32,19 +32,14 @@ #include #include + namespace tvm { TVM_REGISTER_API("_min_value") -.set_body([](TVMArgs args, TVMRetValue* ret) { - Type t = args[0].operator Type(); - *ret = t.min(); - }); +.set_body_method(&Type::min); TVM_REGISTER_API("_max_value") -.set_body([](TVMArgs args, TVMRetValue* ret) { - Type t = args[0].operator Type(); - *ret = t.max(); - }); +.set_body_method(&Type::max); TVM_REGISTER_API("_const") .set_body([](TVMArgs args, TVMRetValue* ret) { @@ -58,9 +53,7 @@ TVM_REGISTER_API("_const") }); TVM_REGISTER_API("_str") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = ir::StringImm::make(args[0]); -}); +.set_body_typed(ir::StringImm::make); TVM_REGISTER_API("_Array") @@ -214,373 +207,217 @@ TVM_REGISTER_API("Range") .set_body([](TVMArgs args, TVMRetValue* ret) { }); TVM_REGISTER_API("_Buffer") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = BufferNode::make(args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - args[6], - args[7], - args[8]); - }); +.set_body_typed(BufferNode::make); TVM_REGISTER_API("_BufferAccessPtr") -.set_body([](TVMArgs args,
TVMRetValue* ret) { - *ret = args[0].operator Buffer() - .access_ptr(args[1], args[2], args[3], args[4]); - }); +.set_body_method(&Buffer::access_ptr); TVM_REGISTER_API("_BufferVLoad") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = args[0].operator Buffer() - .vload(args[1], args[2]); - }); +.set_body_method(&Buffer::vload); TVM_REGISTER_API("_BufferVStore") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = args[0].operator Buffer() - .vstore(args[1], args[2]); - }); +.set_body_method(&Buffer::vstore); TVM_REGISTER_API("_Layout") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = LayoutNode::make(args[0]); - }); +.set_body_typed(LayoutNode::make); TVM_REGISTER_API("_LayoutIndexOf") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = args[0].operator Layout() - .IndexOf(LayoutAxis::make(args[1])); +.set_body_typed<int(Layout, std::string)>([](Layout layout, std::string axis) { + return layout.IndexOf(LayoutAxis::make(axis)); }); TVM_REGISTER_API("_LayoutFactorOf") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = args[0].operator Layout() - .FactorOf(LayoutAxis::make(args[1])); +.set_body_typed<int(Layout, std::string)>([](Layout layout, std::string axis) { + return layout.FactorOf(LayoutAxis::make(axis)); }); TVM_REGISTER_API("_LayoutNdim") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = static_cast<int64_t>(args[0].operator Layout().ndim()); +.set_body_typed<int64_t(Layout)>([](Layout layout) { + return layout.ndim(); }); TVM_REGISTER_API("_LayoutGetItem") -.set_body([](TVMArgs args, TVMRetValue* ret) { - const LayoutAxis& axis = args[0].operator Layout()[args[1]]; - *ret = axis.name(); +.set_body_typed<std::string(Layout, int)>([](Layout layout, int idx) { + const LayoutAxis& axis = layout[idx]; + return axis.name(); }); TVM_REGISTER_API("_BijectiveLayout") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = BijectiveLayoutNode::make(args[0], args[1]); - }); +.set_body_typed(BijectiveLayoutNode::make); TVM_REGISTER_API("_BijectiveLayoutForwardIndex") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = args[0].operator BijectiveLayout() - .ForwardIndex(args[1]); - }); +.set_body_method(&BijectiveLayout::ForwardIndex); TVM_REGISTER_API("_BijectiveLayoutBackwardIndex") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = args[0].operator BijectiveLayout() - .BackwardIndex(args[1]); - }); +.set_body_method(&BijectiveLayout::BackwardIndex); TVM_REGISTER_API("_BijectiveLayoutForwardShape") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = args[0].operator BijectiveLayout() - .ForwardShape(args[1]); - }); +.set_body_method(&BijectiveLayout::ForwardShape); TVM_REGISTER_API("_BijectiveLayoutBackwardShape") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = args[0].operator BijectiveLayout() - .BackwardShape(args[1]); - }); +.set_body_method(&BijectiveLayout::BackwardShape); TVM_REGISTER_API("_Tensor") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = TensorNode::make(args[0], - args[1], - args[2], - args[3]); - }); +.set_body_typed(TensorNode::make); TVM_REGISTER_API("_TensorIntrin") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = TensorIntrinNode::make(args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - args[6]); - }); +.set_body_typed(TensorIntrinNode::make); TVM_REGISTER_API("_TensorIntrinCall") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = TensorIntrinCallNode::make(args[0], - args[1], - args[2], - args[3]); - }); +.set_body_typed(TensorIntrinCallNode::make); TVM_REGISTER_API("_TensorEqual") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = args[0].operator Tensor() ==
args[1].operator Tensor(); - }); +.set_body_method(&Tensor::operator==); TVM_REGISTER_API("_TensorHash") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = static_cast<int64_t>( - std::hash<Tensor>()(args[0].operator Tensor())); +.set_body_typed<int64_t(Tensor)>([](Tensor tensor) { + return static_cast<int64_t>(std::hash<Tensor>()(tensor)); }); TVM_REGISTER_API("_Placeholder") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = placeholder(args[0], - args[1], - args[2]); - }); +.set_body_typed<Tensor(Array<Expr>, Type, std::string)>([]( + Array<Expr> shape, Type dtype, std::string name +) { + return placeholder(shape, dtype, name); +}); TVM_REGISTER_API("_ComputeOp") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = ComputeOpNode::make(args[0], - args[1], - args[2], - args[3], - args[4]); - }); +.set_body_typed(ComputeOpNode::make); TVM_REGISTER_API("_ScanOp") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = ScanOpNode::make(args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - args[6], - args[7]); - }); +.set_body_typed(ScanOpNode::make); TVM_REGISTER_API("_TensorComputeOp") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = TensorComputeOpNode::make(args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - args[6], - args[7]); - }); +.set_body_typed(TensorComputeOpNode::make); TVM_REGISTER_API("_ExternOp") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = ExternOpNode::make(args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - args[6]); - }); +.set_body_typed(ExternOpNode::make); TVM_REGISTER_API("_HybridOp") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = HybridOpNode::make(args[0], - args[1], - args[2], - args[3], - args[4], - args[5]); - }); +.set_body_typed(HybridOpNode::make); TVM_REGISTER_API("_OpGetOutput") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = args[0].operator Operation().output( - static_cast<size_t>(args[1].operator int64_t())); - }); +.set_body_typed<Tensor(Operation, int64_t)>([](Operation op, int64_t output) { + return op.output(static_cast<size_t>(output)); }); TVM_REGISTER_API("_OpNumOutputs") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = args[0].operator Operation()->num_outputs(); - }); +.set_body_method(&OperationNode::num_outputs); TVM_REGISTER_API("_OpInputTensors") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = args[0].operator Operation()->InputTensors(); - }); +.set_body_method(&OperationNode::InputTensors); TVM_REGISTER_API("_IterVar") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = IterVarNode::make( - args[0], args[1], - static_cast<IterVarType>(args[2].operator int()), - args[3]); - }); +.set_body_typed<IterVar(Range, Var, int, std::string)>([]( + Range dom, Var var, int iter_type, std::string thread_tag +) { + return IterVarNode::make( + dom, var, + static_cast<IterVarType>(iter_type), + thread_tag); +}); TVM_REGISTER_API("_CreateSchedule") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = create_schedule(args[0].operator Array<Operation>()); - }); +.set_body_typed(create_schedule); TVM_REGISTER_API("_StageSetScope") -.set_body([](TVMArgs args, TVMRetValue* ret) { - args[0].operator Stage() - .set_scope(args[1]); - }); +.set_body_method(&Stage::set_scope); TVM_REGISTER_API("_StageBind") -.set_body([](TVMArgs args, TVMRetValue* ret) { - args[0].operator Stage() - .bind(args[1], args[2]); - }); +.set_body_method(&Stage::bind); TVM_REGISTER_API("_StageSplitByFactor") -.set_body([](TVMArgs args, TVMRetValue* ret) { - IterVar outer, inner; - args[0].operator Stage() - .split(args[1], args[2], &outer, &inner); - *ret = Array<IterVar>({outer, inner}); - }); +.set_body_typed<Array<IterVar>(Stage, IterVar, Expr)>([]( + Stage stage, IterVar parent, Expr factor +) { + IterVar outer, inner; + stage.split(parent, factor, &outer, &inner); + return Array<IterVar>({outer, inner}); +});
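// Pattern note (editorial, not part of the patch): scheduling methods that
// return results through output parameters cannot be forwarded as bare method
// pointers, so each one gets a small typed lambda that repackages the outputs.
// A hypothetical C++ caller of the registration above, via the FFI (stage and
// axis are illustrative values; conversions assume tvm/packed_func_ext.h):
//
//   const tvm::runtime::PackedFunc* split =
//       tvm::runtime::Registry::Get("_StageSplitByFactor");
//   Array<IterVar> outs = (*split)(stage, axis, 32);  // {outer, inner}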
TVM_REGISTER_API("_StageSplitByNParts") -.set_body([](TVMArgs args, TVMRetValue* ret) { - IterVar outer, inner; - args[0].operator Stage() - .split_by_nparts(args[1], args[2], &outer, &inner); - *ret = Array<IterVar>({outer, inner}); - }); +.set_body_typed<Array<IterVar>(Stage, IterVar, Expr)>([]( + Stage stage, IterVar parent, Expr nparts +) { + IterVar outer, inner; + stage.split_by_nparts(parent, nparts, &outer, &inner); + return Array<IterVar>({outer, inner}); +}); TVM_REGISTER_API("_StageFuse") -.set_body([](TVMArgs args, TVMRetValue* ret) { +.set_body_typed<IterVar(Stage, Array<IterVar>)>([](Stage stage, Array<IterVar> axes) { IterVar fused; - args[0].operator Stage() - .fuse(args[1], &fused); - *ret = fused; + stage.fuse(axes, &fused); + return fused; }); TVM_REGISTER_API("_StageComputeAt") -.set_body([](TVMArgs args, TVMRetValue* ret) { - args[0].operator Stage() - .compute_at(args[1], args[2]); - }); +.set_body_method(&Stage::compute_at); TVM_REGISTER_API("_StageComputeInline") -.set_body([](TVMArgs args, TVMRetValue* ret) { - args[0].operator Stage() - .compute_inline(); - }); +.set_body_method(&Stage::compute_inline); TVM_REGISTER_API("_StageComputeRoot") -.set_body([](TVMArgs args, TVMRetValue* ret) { - args[0].operator Stage() - .compute_root(); - }); +.set_body_method(&Stage::compute_root); TVM_REGISTER_API("_StageReorder") -.set_body([](TVMArgs args, TVMRetValue* ret) { - args[0].operator Stage() - .reorder(args[1]); - }); +.set_body_method(&Stage::reorder); TVM_REGISTER_API("_StageTile") - .set_body([](TVMArgs args, TVMRetValue* ret) { +.set_body_typed<Array<IterVar>(Stage, IterVar, IterVar, Expr, Expr)>([]( + Stage stage, + IterVar x_parent, IterVar y_parent, + Expr x_factor, Expr y_factor +) { IterVar x_outer, y_outer, x_inner, y_inner; - args[0].operator Stage() - .tile(args[1], args[2], - args[3], args[4], - &x_outer, &y_outer, - &x_inner, &y_inner); - *ret = Array<IterVar>({x_outer, y_outer, x_inner, y_inner}); + stage.tile(x_parent, y_parent, + x_factor, y_factor, + &x_outer, &y_outer, + &x_inner, &y_inner); + return Array<IterVar>({x_outer, y_outer, x_inner, y_inner}); }); TVM_REGISTER_API("_StageEnvThreads") - .set_body([](TVMArgs args, TVMRetValue* ret) { - args[0].operator Stage() - .env_threads(args[1]); - }); +.set_body_method(&Stage::env_threads); TVM_REGISTER_API("_StageSetStorePredicate") - .set_body([](TVMArgs args, TVMRetValue* ret) { - args[0].operator Stage() - .set_store_predicate(args[1]); - }); +.set_body_method(&Stage::set_store_predicate); TVM_REGISTER_API("_StageUnroll") - .set_body([](TVMArgs args, TVMRetValue* ret) { - args[0].operator Stage() - .unroll(args[1]); - }); +.set_body_method(&Stage::unroll); TVM_REGISTER_API("_StageVectorize") - .set_body([](TVMArgs args, TVMRetValue* ret) { - args[0].operator Stage() - .vectorize(args[1]); - }); +.set_body_method(&Stage::vectorize); TVM_REGISTER_API("_StageTensorize") - .set_body([](TVMArgs args, TVMRetValue* ret) { - args[0].operator Stage() - .tensorize(args[1], args[2]); - }); +.set_body_method(&Stage::tensorize); TVM_REGISTER_API("_StageParallel") - .set_body([](TVMArgs args, TVMRetValue* ret) { - args[0].operator Stage() - .parallel(args[1]); - }); +.set_body_method(&Stage::parallel); TVM_REGISTER_API("_StagePragma") - .set_body([](TVMArgs args, TVMRetValue* ret) { - args[0].operator Stage() - .pragma(args[1], args[2], args[3]); - }); +.set_body_method(&Stage::pragma); TVM_REGISTER_API("_StagePrefetch") - .set_body([](TVMArgs args, TVMRetValue *ret) { - args[0].operator
Stage() - .prefetch(args[1], args[2], args[3]); - }); +.set_body_method(&Stage::prefetch); TVM_REGISTER_API("_StageStorageAlign") - .set_body([](TVMArgs args, TVMRetValue *ret) { - args[0].operator Stage() - .storage_align(args[1], args[2], args[3]); - }); +.set_body_method(&Stage::storage_align); TVM_REGISTER_API("_StageDoubleBuffer") - .set_body([](TVMArgs args, TVMRetValue *ret) { - args[0].operator Stage().double_buffer(); - }); +.set_body_method(&Stage::double_buffer); TVM_REGISTER_API("_StageOpenGL") - .set_body([](TVMArgs args, TVMRetValue *ret) { - args[0].operator Stage().opengl(); - }); +.set_body_method(&Stage::opengl); TVM_REGISTER_API("_ScheduleNormalize") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = args[0].operator Schedule() - .normalize(); - }); +.set_body_method(&Schedule::normalize); TVM_REGISTER_API("_ScheduleCreateGroup") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = args[0].operator Schedule() - .create_group(args[1], args[2], args[3]); - }); +.set_body_method(&Schedule::create_group); TVM_REGISTER_API("_ScheduleCacheRead") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = args[0].operator Schedule() - .cache_read(args[1], args[2], args[3]); - }); +.set_body_method(&Schedule::cache_read); TVM_REGISTER_API("_ScheduleCacheWrite") .set_body([](TVMArgs args, TVMRetValue* ret) { @@ -594,16 +431,9 @@ TVM_REGISTER_API("_ScheduleCacheWrite") }); TVM_REGISTER_API("_ScheduleRFactor") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = args[0].operator Schedule() - .rfactor(args[1], args[2], args[3]); - }); +.set_body_method(&Schedule::rfactor); TVM_REGISTER_API("_CommReducerCombine") -.set_body([](TVMArgs args, TVMRetValue* ret) { - const ir::CommReducerNode* combiner = - args[0].operator ir::CommReducer().as(); - *ret = (*combiner)(args[1], args[2]); - }); +.set_body_method(&ir::CommReducerNode::operator()); } // namespace tvm diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc index 2e1ab42e4cbe..6195aac1b93f 100644 --- a/src/api/api_pass.cc +++ b/src/api/api_pass.cc @@ -119,68 +119,43 @@ TVM_REGISTER_API("ir_pass.PostOrderVisit") }); // make from two arguments -#define REGISTER_PASS1(PassName) \ +#define REGISTER_PASS(PassName) \ TVM_REGISTER_API("ir_pass."#PassName) \ - .set_body([](TVMArgs args, TVMRetValue *ret) { \ - *ret = PassName(args[0]); \ - }) \ - -#define REGISTER_PASS2(PassName) \ - TVM_REGISTER_API("ir_pass."#PassName) \ - .set_body([](TVMArgs args, TVMRetValue *ret) { \ - *ret = PassName(args[0], args[1]); \ - }) \ - -#define REGISTER_PASS3(PassName) \ - TVM_REGISTER_API("ir_pass."#PassName) \ - .set_body([](TVMArgs args, TVMRetValue *ret) { \ - *ret = PassName(args[0], args[1], args[2]); \ - }) \ - -#define REGISTER_PASS4(PassName) \ - TVM_REGISTER_API("ir_pass."#PassName) \ - .set_body([](TVMArgs args, TVMRetValue *ret) { \ - *ret = PassName(args[0], args[1], args[2], args[3]); \ - }) \ - -#define REGISTER_PASS5(PassName) \ - TVM_REGISTER_API("ir_pass."#PassName) \ - .set_body([](TVMArgs args, TVMRetValue *ret) { \ - *ret = PassName(args[0], args[1], args[2], args[3], args[4]); \ - }) \ - -REGISTER_PASS1(ConvertSSA); -REGISTER_PASS1(VerifySSA); -REGISTER_PASS1(RewriteUnsafeSelect); -REGISTER_PASS4(Inline); -REGISTER_PASS4(IRTransform); -REGISTER_PASS1(VectorizeLoop); -REGISTER_PASS5(UnrollLoop); -REGISTER_PASS3(InjectCopyIntrin); -REGISTER_PASS2(ThreadSync); -REGISTER_PASS5(MakeAPI); -REGISTER_PASS2(BindDeviceType); -REGISTER_PASS1(SplitHostDevice); -REGISTER_PASS1(StorageRewrite); -REGISTER_PASS1(CoProcSync); 
-REGISTER_PASS1(LowerStorageAccessInfo); -REGISTER_PASS1(InjectVirtualThread); -REGISTER_PASS1(InjectPrefetch); -REGISTER_PASS2(InjectDoubleBuffer); -REGISTER_PASS2(LoopPartition); -REGISTER_PASS1(RemoveNoOp); -REGISTER_PASS2(SplitPipeline); -REGISTER_PASS2(LiftAttrScope); -REGISTER_PASS1(NarrowChannelAccess); -REGISTER_PASS2(LowerThreadAllreduce); -REGISTER_PASS2(LowerWarpMemory); -REGISTER_PASS2(RemapThreadAxis); -REGISTER_PASS2(LowerIntrin); -REGISTER_PASS1(LowerTVMBuiltin); -REGISTER_PASS1(CombineContextCall); -REGISTER_PASS2(VerifyMemory); -REGISTER_PASS2(VerifyGPUCode); -REGISTER_PASS1(DecorateDeviceScope); -REGISTER_PASS1(InstrumentBoundCheckers); + .set_body_typed(PassName); \ + + +REGISTER_PASS(ConvertSSA); +REGISTER_PASS(VerifySSA); +REGISTER_PASS(RewriteUnsafeSelect); +REGISTER_PASS(Inline); +REGISTER_PASS(IRTransform); +REGISTER_PASS(VectorizeLoop); +REGISTER_PASS(UnrollLoop); +REGISTER_PASS(InjectCopyIntrin); +REGISTER_PASS(ThreadSync); +REGISTER_PASS(MakeAPI); +REGISTER_PASS(BindDeviceType); +REGISTER_PASS(SplitHostDevice); +REGISTER_PASS(StorageRewrite); +REGISTER_PASS(CoProcSync); +REGISTER_PASS(LowerStorageAccessInfo); +REGISTER_PASS(InjectVirtualThread); +REGISTER_PASS(InjectPrefetch); +REGISTER_PASS(InjectDoubleBuffer); +REGISTER_PASS(LoopPartition); +REGISTER_PASS(RemoveNoOp); +REGISTER_PASS(SplitPipeline); +REGISTER_PASS(LiftAttrScope); +REGISTER_PASS(NarrowChannelAccess); +REGISTER_PASS(LowerThreadAllreduce); +REGISTER_PASS(LowerWarpMemory); +REGISTER_PASS(RemapThreadAxis); +REGISTER_PASS(LowerIntrin); +REGISTER_PASS(LowerTVMBuiltin); +REGISTER_PASS(CombineContextCall); +REGISTER_PASS(VerifyMemory); +REGISTER_PASS(VerifyGPUCode); +REGISTER_PASS(DecorateDeviceScope); +REGISTER_PASS(InstrumentBoundCheckers); } // namespace ir } // namespace tvm diff --git a/src/api/api_schedule.cc b/src/api/api_schedule.cc index 45e2eb4c9375..177360bf2ebb 100644 --- a/src/api/api_schedule.cc +++ b/src/api/api_schedule.cc @@ -33,15 +33,11 @@ namespace tvm { namespace schedule { TVM_REGISTER_API("schedule.AutoInlineElemWise") -.set_body([](TVMArgs args, TVMRetValue* ret) { - AutoInlineElemWise(args[0]); - }); +.set_body_typed(AutoInlineElemWise); TVM_REGISTER_API("schedule.AutoInlineInjective") -.set_body([](TVMArgs args, TVMRetValue* ret) { - AutoInlineInjective(args[0]); - }); +.set_body_typed(AutoInlineInjective); TVM_REGISTER_API("schedule.ScheduleOps") .set_body([](TVMArgs args, TVMRetValue* ret) { @@ -51,25 +47,17 @@ TVM_REGISTER_API("schedule.ScheduleOps") *ret = ScheduleOps(args[0], args[1], args[2]); }); -#define REGISTER_SCHEDULE_PASS1(PassName) \ - TVM_REGISTER_API("schedule."#PassName) \ - .set_body([](TVMArgs args, TVMRetValue *ret) { \ - *ret = PassName(args[0]); \ - }) \ - -#define REGISTER_SCHEDULE_PASS2(PassName) \ +#define REGISTER_SCHEDULE_PASS(PassName) \ TVM_REGISTER_API("schedule."#PassName) \ - .set_body([](TVMArgs args, TVMRetValue *ret) { \ - *ret = PassName(args[0], args[1]); \ - }) \ + .set_body_typed(PassName); \ -REGISTER_SCHEDULE_PASS1(InferBound); -REGISTER_SCHEDULE_PASS1(CreateReadGraph); -REGISTER_SCHEDULE_PASS2(PostDFSOrder); -REGISTER_SCHEDULE_PASS1(CreateAttachPath); -REGISTER_SCHEDULE_PASS1(ScanGetBody); -REGISTER_SCHEDULE_PASS1(ScanFixPointAnalysis); +REGISTER_SCHEDULE_PASS(InferBound); +REGISTER_SCHEDULE_PASS(CreateReadGraph); +REGISTER_SCHEDULE_PASS(PostDFSOrder); +REGISTER_SCHEDULE_PASS(CreateAttachPath); +REGISTER_SCHEDULE_PASS(ScanGetBody); +REGISTER_SCHEDULE_PASS(ScanFixPointAnalysis); } // namespace schedule } // namespace tvm diff --git 
a/src/codegen/codegen_opencl.cc b/src/codegen/codegen_opencl.cc index 96e1b9efe8dd..382124a7ed2d 100644 --- a/src/codegen/codegen_opencl.cc +++ b/src/codegen/codegen_opencl.cc @@ -263,8 +263,6 @@ runtime::Module BuildOpenCL(Array funcs) { } TVM_REGISTER_API("codegen.build_opencl") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = BuildOpenCL(args[0]); - }); +.set_body_typed(BuildOpenCL); } // namespace codegen } // namespace tvm diff --git a/src/codegen/codegen_opengl.cc b/src/codegen/codegen_opengl.cc index 27d910e7211b..797a7d1c406e 100644 --- a/src/codegen/codegen_opengl.cc +++ b/src/codegen/codegen_opengl.cc @@ -302,9 +302,7 @@ runtime::Module BuildOpenGL(Array funcs) { } TVM_REGISTER_API("codegen.build_opengl") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = BuildOpenGL(args[0]); -}); +.set_body_typed(BuildOpenGL); } // namespace codegen } // namespace tvm diff --git a/src/codegen/codegen_vhls.cc b/src/codegen/codegen_vhls.cc index 460647a6e180..a18312fe6af5 100644 --- a/src/codegen/codegen_vhls.cc +++ b/src/codegen/codegen_vhls.cc @@ -164,9 +164,7 @@ runtime::Module BuildSDAccel(Array funcs, std::string target_str) { } TVM_REGISTER_API("codegen.build_sdaccel") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = BuildSDAccel(args[0], args[1]); - }); +.set_body_typed(BuildSDAccel); } // namespace codegen } // namespace tvm diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index 22c432cc9e4b..396ae5956556 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -265,9 +265,7 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { } TVM_REGISTER_API("codegen.build_rocm") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = BuildAMDGPU(args[0], args[1]); - }); +.set_body_typed(BuildAMDGPU); } // namespace codegen } // namespace tvm diff --git a/src/codegen/llvm/codegen_nvptx.cc b/src/codegen/llvm/codegen_nvptx.cc index b1b541d4ab74..290727fd9152 100644 --- a/src/codegen/llvm/codegen_nvptx.cc +++ b/src/codegen/llvm/codegen_nvptx.cc @@ -243,9 +243,7 @@ runtime::Module BuildNVPTX(Array funcs, std::string target) { } TVM_REGISTER_API("codegen.build_nvptx") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = BuildNVPTX(args[0], args[1]); - }); +.set_body_typed(BuildNVPTX); } // namespace codegen } // namespace tvm diff --git a/src/codegen/opt/build_cuda_on.cc b/src/codegen/opt/build_cuda_on.cc index 581c33086bee..fda239f0766f 100644 --- a/src/codegen/opt/build_cuda_on.cc +++ b/src/codegen/opt/build_cuda_on.cc @@ -155,8 +155,6 @@ runtime::Module BuildCUDA(Array funcs) { } TVM_REGISTER_API("codegen.build_cuda") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = BuildCUDA(args[0]); - }); +.set_body_typed(BuildCUDA); } // namespace codegen } // namespace tvm diff --git a/src/codegen/source_module.cc b/src/codegen/source_module.cc index 70047a6050db..88be7fed448d 100644 --- a/src/codegen/source_module.cc +++ b/src/codegen/source_module.cc @@ -188,8 +188,6 @@ runtime::Module DeviceSourceModuleCreate( } TVM_REGISTER_GLOBAL("module.source_module_create") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = SourceModuleCreate(args[0], args[1]); - }); +.set_body_typed(SourceModuleCreate); } // namespace codegen } // namespace tvm diff --git a/src/codegen/spirv/build_vulkan.cc b/src/codegen/spirv/build_vulkan.cc index 2b1ef660fbdc..18ffad1a58bc 100644 --- a/src/codegen/spirv/build_vulkan.cc +++ b/src/codegen/spirv/build_vulkan.cc @@ -103,9 +103,7 @@ runtime::Module BuildSPIRV(Array funcs) { } 
TVM_REGISTER_API("codegen.build_vulkan") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = BuildSPIRV(args[0]); - }); +.set_body_typed(BuildSPIRV); } // namespace codegen } // namespace tvm diff --git a/src/codegen/stackvm/codegen_stackvm.cc b/src/codegen/stackvm/codegen_stackvm.cc index 8c4c258095d3..2d71a20a6232 100644 --- a/src/codegen/stackvm/codegen_stackvm.cc +++ b/src/codegen/stackvm/codegen_stackvm.cc @@ -522,8 +522,6 @@ runtime::Module BuildStackVM(const Array& funcs) { } TVM_REGISTER_API("codegen.build_stackvm") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = BuildStackVM(args[0]); - }); +.set_body_typed(BuildStackVM); } // namespace codegen } // namespace tvm diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc index 735f1830d049..9af3f822a07d 100644 --- a/src/relay/backend/interpreter.cc +++ b/src/relay/backend/interpreter.cc @@ -51,9 +51,7 @@ Closure ClosureNode::make(tvm::Map env, Function func) { } TVM_REGISTER_API("relay._make.Closure") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = ClosureNode::make(args[0], args[1]); - }); +.set_body_typed(ClosureNode::make); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const ClosureNode* node, tvm::IRPrinter* p) { @@ -67,9 +65,7 @@ TupleValue TupleValueNode::make(tvm::Array value) { } TVM_REGISTER_API("relay._make.TupleValue") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = TupleValueNode::make(args[0]); - }); +.set_body_typed(TupleValueNode::make); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const TupleValueNode* node, tvm::IRPrinter* p) { @@ -90,10 +86,7 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) }); TVM_REGISTER_API("relay._make.TensorValue") -.set_body([](TVMArgs args, TVMRetValue* ret) { - runtime::NDArray data = args[0]; - *ret = TensorValueNode::make(data); - }); +.set_body_typed(TensorValueNode::make); RefValue RefValueNode::make(Value value) { NodePtr n = make_node(); @@ -102,9 +95,7 @@ RefValue RefValueNode::make(Value value) { } TVM_REGISTER_API("relay._make.RefValue") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = RefValueNode::make(args[0]); - }); +.set_body_typed(RefValueNode::make); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const RefValueNode* node, @@ -121,9 +112,7 @@ ConstructorValue ConstructorValueNode::make(Constructor constructor, } TVM_REGISTER_API("relay._make.ConstructorValue") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = ConstructorValueNode::make(args[0], args[1]); - }); +.set_body_typed(ConstructorValueNode::make); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const ConstructorValueNode* node, @@ -614,9 +603,7 @@ CreateInterpreter( } TVM_REGISTER_API("relay.backend.CreateInterpreter") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = CreateInterpreter(args[0], args[1], args[2]); - }); +.set_body_typed(CreateInterpreter); TVM_REGISTER_NODE_TYPE(ClosureNode); TVM_REGISTER_NODE_TYPE(TupleValueNode); diff --git a/src/relay/ir/adt.cc b/src/relay/ir/adt.cc index 2e7d854fbd2a..b59281a4f1fd 100644 --- a/src/relay/ir/adt.cc +++ b/src/relay/ir/adt.cc @@ -36,9 +36,7 @@ PatternWildcard PatternWildcardNode::make() { TVM_REGISTER_NODE_TYPE(PatternWildcardNode); TVM_REGISTER_API("relay._make.PatternWildcard") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = PatternWildcardNode::make(); - }); +.set_body_typed(PatternWildcardNode::make); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const PatternWildcardNode* 
node, @@ -55,9 +53,7 @@ PatternVar PatternVarNode::make(tvm::relay::Var var) { TVM_REGISTER_NODE_TYPE(PatternVarNode); TVM_REGISTER_API("relay._make.PatternVar") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = PatternVarNode::make(args[0]); - }); +.set_body_typed(PatternVarNode::make); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const PatternVarNode* node, @@ -76,9 +72,7 @@ PatternConstructor PatternConstructorNode::make(Constructor constructor, TVM_REGISTER_NODE_TYPE(PatternConstructorNode); TVM_REGISTER_API("relay._make.PatternConstructor") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = PatternConstructorNode::make(args[0], args[1]); - }); +.set_body_typed(PatternConstructorNode::make); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const PatternConstructorNode* node, @@ -100,9 +94,7 @@ Constructor ConstructorNode::make(std::string name_hint, TVM_REGISTER_NODE_TYPE(ConstructorNode); TVM_REGISTER_API("relay._make.Constructor") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = ConstructorNode::make(args[0], args[1], args[2]); - }); +.set_body_typed(ConstructorNode::make); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const ConstructorNode* node, @@ -124,9 +116,7 @@ TypeData TypeDataNode::make(GlobalTypeVar header, TVM_REGISTER_NODE_TYPE(TypeDataNode); TVM_REGISTER_API("relay._make.TypeData") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = TypeDataNode::make(args[0], args[1], args[2]); - }); +.set_body_typed(TypeDataNode::make); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const TypeDataNode* node, @@ -145,9 +135,7 @@ Clause ClauseNode::make(Pattern lhs, Expr rhs) { TVM_REGISTER_NODE_TYPE(ClauseNode); TVM_REGISTER_API("relay._make.Clause") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = ClauseNode::make(args[0], args[1]); - }); +.set_body_typed(ClauseNode::make); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const ClauseNode* node, @@ -166,9 +154,7 @@ Match MatchNode::make(Expr data, tvm::Array clauses) { TVM_REGISTER_NODE_TYPE(MatchNode); TVM_REGISTER_API("relay._make.Match") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = MatchNode::make(args[0], args[1]); - }); +.set_body_typed(MatchNode::make); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const MatchNode* node, diff --git a/src/relay/ir/alpha_equal.cc b/src/relay/ir/alpha_equal.cc index 967034519979..81017d4fddfa 100644 --- a/src/relay/ir/alpha_equal.cc +++ b/src/relay/ir/alpha_equal.cc @@ -505,18 +505,18 @@ bool AlphaEqual(const Expr& lhs, const Expr& rhs) { // TODO(@jroesch): move to correct namespace? 
TVM_REGISTER_API("relay._make._alpha_equal") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = AlphaEqualHandler(false).Equal(args[0], args[1]); +.set_body_typed([](NodeRef a, NodeRef b) { + return AlphaEqualHandler(false).Equal(a, b); }); TVM_REGISTER_API("relay._make._type_alpha_equal") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = AlphaEqualHandler(false).TypeEqual(args[0], args[1]); +.set_body_typed([](Type a, Type b) { + return AlphaEqualHandler(false).TypeEqual(a, b); }); TVM_REGISTER_API("relay._make._graph_equal") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = AlphaEqualHandler(true).Equal(args[0], args[1]); +.set_body_typed([](NodeRef a, NodeRef b) { + return AlphaEqualHandler(true).Equal(a, b); }); } // namespace relay } // namespace tvm diff --git a/src/relay/ir/base.cc b/src/relay/ir/base.cc index 9c35173bb47a..f60f6594559c 100644 --- a/src/relay/ir/base.cc +++ b/src/relay/ir/base.cc @@ -52,9 +52,7 @@ SourceName SourceName::Get(const std::string& name) { } TVM_REGISTER_API("relay._make.SourceName") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = SourceName::Get(args[0]); - }); +.set_body_typed(SourceName::Get); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const SourceNameNode* node, tvm::IRPrinter* p) { @@ -78,9 +76,7 @@ Span SpanNode::make(SourceName source, int lineno, int col_offset) { TVM_REGISTER_NODE_TYPE(SpanNode); TVM_REGISTER_API("relay._make.Span") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = SpanNode::make(args[0], args[1], args[2]); - }); +.set_body_typed(SpanNode::make); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const SpanNode* node, tvm::IRPrinter* p) { @@ -91,11 +87,9 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) TVM_REGISTER_NODE_TYPE(IdNode); TVM_REGISTER_API("relay._base.set_span") -.set_body([](TVMArgs args, TVMRetValue* ret) { - NodeRef node_ref = args[0]; +.set_body_typed([](NodeRef node_ref, Span sp) { auto rn = node_ref.as_derived(); CHECK(rn); - Span sp = args[1]; rn->span = sp; }); diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc index 3108bc2501fe..63d41c405e33 100644 --- a/src/relay/ir/expr.cc +++ b/src/relay/ir/expr.cc @@ -39,9 +39,7 @@ Constant ConstantNode::make(runtime::NDArray data) { TVM_REGISTER_NODE_TYPE(ConstantNode); TVM_REGISTER_API("relay._make.Constant") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = ConstantNode::make(args[0]); - }); +.set_body_typed(ConstantNode::make); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const ConstantNode* node, tvm::IRPrinter* p) { @@ -73,9 +71,7 @@ Tuple TupleNode::make(tvm::Array fields) { TVM_REGISTER_NODE_TYPE(TupleNode); TVM_REGISTER_API("relay._make.Tuple") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = TupleNode::make(args[0]); - }); +.set_body_typed(TupleNode::make); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const TupleNode* node, tvm::IRPrinter* p) { @@ -99,9 +95,7 @@ Var VarNode::make(std::string name_hint, Type type_annotation) { TVM_REGISTER_NODE_TYPE(VarNode); TVM_REGISTER_API("relay._make.Var") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = VarNode::make(args[0].operator std::string(), args[1]); - }); +.set_body_typed(static_cast(VarNode::make)); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const VarNode* node, tvm::IRPrinter* p) { @@ -122,9 +116,7 @@ GlobalVar GlobalVarNode::make(std::string name_hint) { TVM_REGISTER_NODE_TYPE(GlobalVarNode); TVM_REGISTER_API("relay._make.GlobalVar") 
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-    *ret = GlobalVarNode::make(args[0]);
-  });
+.set_body_typed(GlobalVarNode::make);
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<GlobalVarNode>([](const GlobalVarNode* node, tvm::IRPrinter* p) {
@@ -201,9 +193,7 @@ Function FunctionSetAttr(const Function& func, const std::string& key, const Nod
 TVM_REGISTER_NODE_TYPE(FunctionNode);
 
 TVM_REGISTER_API("relay._make.Function")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = FunctionNode::make(args[0], args[1], args[2], args[3], args[4]);
-});
+.set_body_typed(FunctionNode::make);
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<FunctionNode>([](const FunctionNode* node,
@@ -226,9 +216,7 @@ Call CallNode::make(Expr op, Array<Expr> args, Attrs attrs,
 TVM_REGISTER_NODE_TYPE(CallNode);
 
 TVM_REGISTER_API("relay._make.Call")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = CallNode::make(args[0], args[1], args[2], args[3]);
-});
+.set_body_typed(CallNode::make);
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<CallNode>([](const CallNode* node, tvm::IRPrinter* p) {
@@ -247,9 +235,7 @@ Let LetNode::make(Var var, Expr value, Expr body) {
 TVM_REGISTER_NODE_TYPE(LetNode);
 
 TVM_REGISTER_API("relay._make.Let")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-    *ret = LetNode::make(args[0], args[1], args[2]);
-  });
+.set_body_typed(LetNode::make);
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<LetNode>([](const LetNode* node, tvm::IRPrinter* p) {
@@ -267,9 +253,8 @@ If IfNode::make(Expr cond, Expr true_branch, Expr false_branch) {
 TVM_REGISTER_NODE_TYPE(IfNode);
 
-TVM_REGISTER_API("relay._make.If").set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = IfNode::make(args[0], args[1], args[2]);
-});
+TVM_REGISTER_API("relay._make.If")
+.set_body_typed(IfNode::make);
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<IfNode>([](const IfNode* node, tvm::IRPrinter* p) {
@@ -286,9 +271,8 @@ TupleGetItem TupleGetItemNode::make(Expr tuple, int index) {
 TVM_REGISTER_NODE_TYPE(TupleGetItemNode);
 
-TVM_REGISTER_API("relay._make.TupleGetItem").set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = TupleGetItemNode::make(args[0], args[1]);
-});
+TVM_REGISTER_API("relay._make.TupleGetItem")
+.set_body_typed(TupleGetItemNode::make);
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<TupleGetItemNode>([](const TupleGetItemNode* node, tvm::IRPrinter* p) {
@@ -301,9 +285,8 @@ RefCreate RefCreateNode::make(Expr value) {
   return RefCreate(n);
 }
 
-TVM_REGISTER_API("relay._make.RefCreate").set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = RefCreateNode::make(args[0]);
-});
+TVM_REGISTER_API("relay._make.RefCreate")
+.set_body_typed(RefCreateNode::make);
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<RefCreateNode>([](const RefCreateNode* node, tvm::IRPrinter* p) {
@@ -317,9 +300,7 @@ RefRead RefReadNode::make(Expr ref) {
 }
 
 TVM_REGISTER_API("relay._make.RefRead")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = RefReadNode::make(args[0]);
-});
+.set_body_typed(RefReadNode::make);
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<RefReadNode>([](const RefReadNode* node, tvm::IRPrinter* p) {
@@ -334,9 +315,7 @@ RefWrite RefWriteNode::make(Expr ref, Expr value) {
 }
 
 TVM_REGISTER_API("relay._make.RefWrite")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = RefWriteNode::make(args[0], args[1]);
-});
+.set_body_typed(RefWriteNode::make);
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<RefWriteNode>([](const RefWriteNode* node, tvm::IRPrinter* p) {
@@ -344,9 +323,8 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 });
 
 TVM_REGISTER_API("relay._expr.TempExprRealize")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  TempExpr temp = args[0];
-  *ret = temp->Realize();
+.set_body_typed<Expr(TempExpr)>([](TempExpr temp) {
+  return temp->Realize();
 });
 
 }  // namespace relay
diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
index d0cd30adda29..7a6250cd6580 100644
--- a/src/relay/ir/expr_functor.cc
+++ b/src/relay/ir/expr_functor.cc
@@ -346,9 +346,8 @@ void PostOrderVisit(const Expr& e, std::function<void(const Expr&)> fvisit) {
 }
 
 TVM_REGISTER_API("relay._ir_pass.post_order_visit")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-  PackedFunc f = args[1];
-  PostOrderVisit(args[0], [f](const Expr& n) {
+.set_body_typed<void(Expr, PackedFunc)>([](Expr expr, PackedFunc f) {
+  PostOrderVisit(expr, [f](const Expr& n) {
     f(n);
   });
 });
diff --git a/src/relay/ir/hash.cc b/src/relay/ir/hash.cc
index cb2be8b2c184..89ad6083fb8e 100644
--- a/src/relay/ir/hash.cc
+++ b/src/relay/ir/hash.cc
@@ -410,14 +410,14 @@ size_t StructuralHash::operator()(const Expr& expr) const {
 }
 
 TVM_REGISTER_API("relay._ir_pass._expr_hash")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-    *ret = static_cast<int64_t>(RelayHashHandler().Hash(args[0]));
-  });
+.set_body_typed<int64_t(NodeRef)>([](NodeRef ref) {
+  return static_cast<int64_t>(RelayHashHandler().Hash(ref));
+});
 
 TVM_REGISTER_API("relay._ir_pass._type_hash")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-    *ret = static_cast<int64_t>(RelayHashHandler().TypeHash(args[0]));
-  });
+.set_body_typed<int64_t(Type)>([](Type type) {
+  return static_cast<int64_t>(RelayHashHandler().TypeHash(type));
+});
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/ir/module.cc b/src/relay/ir/module.cc
index 38c9756841fc..eabea2ecfeb0 100644
--- a/src/relay/ir/module.cc
+++ b/src/relay/ir/module.cc
@@ -181,66 +181,43 @@ Module ModuleNode::FromExpr(
 TVM_REGISTER_NODE_TYPE(ModuleNode);
 
 TVM_REGISTER_API("relay._make.Module")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    *ret = ModuleNode::make(args[0], args[1]);
-  });
+.set_body_typed(ModuleNode::make);
 
 TVM_REGISTER_API("relay._make.Module_Add")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    Module mod = args[0];
-    mod->Add(args[1], args[2], args[3]);
-  });
+.set_body_method<Module>(&ModuleNode::Add);
 
 TVM_REGISTER_API("relay._module.Module_AddDef")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    Module mod = args[0];
-    mod->AddDef(args[1], args[2]);
-  });
+.set_body_method<Module>(&ModuleNode::AddDef);
 
 TVM_REGISTER_API("relay._module.Module_GetGlobalVar")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    Module mod = args[0];
-    *ret = mod->GetGlobalVar(args[1]);
-  });
+.set_body_method<Module>(&ModuleNode::GetGlobalVar);
 
 TVM_REGISTER_API("relay._module.Module_GetGlobalTypeVar")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    Module mod = args[0];
-    *ret = mod->GetGlobalTypeVar(args[1]);
-  });
+.set_body_method<Module>(&ModuleNode::GetGlobalTypeVar);
 
 TVM_REGISTER_API("relay._module.Module_Lookup")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    Module mod = args[0];
-    GlobalVar var = args[1];
-    *ret = mod->Lookup(var);
+.set_body_typed<Function(Module, GlobalVar)>([](Module mod, GlobalVar var) {
+  return mod->Lookup(var);
 });
 
 TVM_REGISTER_API("relay._module.Module_Lookup_str")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    Module mod = args[0];
-    std::string var_name = args[1];
-    *ret = mod->Lookup(var_name);
+.set_body_typed<Function(Module, std::string)>([](Module mod, std::string var) {
+  return mod->Lookup(var);
 });
 
 TVM_REGISTER_API("relay._module.Module_LookupDef")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    Module mod = args[0];
-    GlobalTypeVar var = args[1];
-    *ret = mod->LookupDef(var);
+.set_body_typed<TypeData(Module, GlobalTypeVar)>([](Module mod, GlobalTypeVar var) {
+  return mod->LookupDef(var);
 });
 
 TVM_REGISTER_API("relay._module.Module_LookupDef_str")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    Module mod = args[0];
-    std::string var_name = args[1];
-    *ret = mod->LookupDef(var_name);
+.set_body_typed<TypeData(Module, std::string)>([](Module mod, std::string var) {
+  return mod->LookupDef(var);
 });
 
 TVM_REGISTER_API("relay._module.Module_Update")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    Module mod = args[0];
-    mod->Update(args[1]);
+.set_body_typed<void(Module, Module)>([](Module mod, Module from) {
+  mod->Update(from);
 });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
diff --git a/src/relay/ir/type.cc b/src/relay/ir/type.cc
index fb0d919b46c3..8f0bdcba2b1b 100644
--- a/src/relay/ir/type.cc
+++ b/src/relay/ir/type.cc
@@ -56,10 +56,7 @@ IndexExpr TensorTypeNode::Size() const {
 TVM_REGISTER_NODE_TYPE(TensorTypeNode);
 
 TVM_REGISTER_API("relay._make.TensorType")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  Array<IndexExpr> shape = args[0];
-  *ret = TensorTypeNode::make(shape, args[1]);
-});
+.set_body_typed(TensorTypeNode::make);
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<TensorTypeNode>([](const TensorTypeNode* node,
@@ -77,10 +74,8 @@ TypeVar TypeVarNode::make(std::string name, Kind kind) {
 TVM_REGISTER_NODE_TYPE(TypeVarNode);
 
 TVM_REGISTER_API("relay._make.TypeVar")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  int kind = args[1];
-  *ret =
-      TypeVarNode::make(args[0], static_cast<Kind>(kind));
+.set_body_typed<TypeVar(std::string, int)>([](std::string name, int kind) {
+  return TypeVarNode::make(name, static_cast<Kind>(kind));
 });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
@@ -100,10 +95,9 @@ GlobalTypeVar GlobalTypeVarNode::make(std::string name, Kind kind) {
 TVM_REGISTER_NODE_TYPE(GlobalTypeVarNode);
 
 TVM_REGISTER_API("relay._make.GlobalTypeVar")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  int kind = args[1];
-  *ret = GlobalTypeVarNode::make(args[0], static_cast<Kind>(kind));
-});
+.set_body_typed<GlobalTypeVar(std::string, int)>([](std::string name, int kind) {
+  return GlobalTypeVarNode::make(name, static_cast<Kind>(kind));
+  });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<GlobalTypeVarNode>([](const GlobalTypeVarNode *node,
@@ -122,9 +116,7 @@ TypeCall TypeCallNode::make(Type func, tvm::Array<Type> args) {
 TVM_REGISTER_NODE_TYPE(TypeCallNode);
 
 TVM_REGISTER_API("relay._make.TypeCall")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = TypeCallNode::make(args[0], args[1]);
-});
+.set_body_typed(TypeCallNode::make);
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<TypeCallNode>([](const TypeCallNode* node,
@@ -142,9 +134,8 @@ IncompleteType IncompleteTypeNode::make(Kind kind) {
 TVM_REGISTER_NODE_TYPE(IncompleteTypeNode);
 
 TVM_REGISTER_API("relay._make.IncompleteType")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  int kind = args[0];
-  *ret = IncompleteTypeNode::make(static_cast<Kind>(kind));
+.set_body_typed<IncompleteType(int)>([](int kind) {
+  return IncompleteTypeNode::make(static_cast<Kind>(kind));
 });
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
@@ -169,9 +160,7 @@ FuncType FuncTypeNode::make(tvm::Array<Type> arg_types,
 TVM_REGISTER_NODE_TYPE(FuncTypeNode);
 
 TVM_REGISTER_API("relay._make.FuncType")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = FuncTypeNode::make(args[0], args[1], args[2], args[3]);
-});
+.set_body_typed(FuncTypeNode::make);
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<FuncTypeNode>([](const FuncTypeNode* node,
@@ -196,9 +185,7 @@ TypeRelation TypeRelationNode::make(TypeRelationFn func,
 TVM_REGISTER_NODE_TYPE(TypeRelationNode);
TVM_REGISTER_API("relay._make.TypeRelation") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = TypeRelationNode::make(args[0], args[1], args[2], args[3]); -}); +.set_body_typed(TypeRelationNode::make); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const TypeRelationNode* node, tvm::IRPrinter* p) { @@ -216,9 +203,7 @@ TupleType TupleTypeNode::make(Array fields) { TVM_REGISTER_NODE_TYPE(TupleTypeNode); TVM_REGISTER_API("relay._make.TupleType") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = TupleTypeNode::make(args[0]); -}); +.set_body_typed(TupleTypeNode::make); TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const TupleTypeNode* node, @@ -233,9 +218,7 @@ RefType RefTypeNode::make(Type value) { } TVM_REGISTER_API("relay._make.RefType") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = RefTypeNode::make(args[0]); -}); +.set_body_typed(RefTypeNode::make); TVM_REGISTER_NODE_TYPE(RefTypeNode); diff --git a/src/relay/op/debug.cc b/src/relay/op/debug.cc index 3aea0c03f798..37fb090aa231 100644 --- a/src/relay/op/debug.cc +++ b/src/relay/op/debug.cc @@ -64,9 +64,7 @@ Expr MakeDebug(Expr expr, std::string name) { } TVM_REGISTER_API("relay.op._make.debug") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeDebug, args, rv); - }); +.set_body_typed(MakeDebug); } // namespace relay } // namespace tvm diff --git a/src/relay/op/image/resize.cc b/src/relay/op/image/resize.cc index 7ca762e7394a..ffa489edff76 100644 --- a/src/relay/op/image/resize.cc +++ b/src/relay/op/image/resize.cc @@ -105,9 +105,7 @@ Expr MakeResize(Expr data, TVM_REGISTER_API("relay.op.image._make.resize") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeResize, args, rv); - }); +.set_body_typed(MakeResize); RELAY_REGISTER_OP("image.resize") diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index f2c0a27600d9..97cba7964000 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -170,9 +170,7 @@ Expr MakeConv2D(Expr data, TVM_REGISTER_API("relay.op.nn._make.conv2d") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeConv2D, args, rv); - }); +.set_body_typed(MakeConv2D); RELAY_REGISTER_OP("nn.conv2d") @@ -324,9 +322,7 @@ Expr MakeConv2DTranspose(Expr data, TVM_REGISTER_API("relay.op.nn._make.conv2d_transpose") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeConv2DTranspose, args, rv); - }); +.set_body_typed(MakeConv2DTranspose); RELAY_REGISTER_OP("nn.conv2d_transpose") .describe(R"code(Transposed 2D convolution layer (sometimes called Deconvolution). 
@@ -465,9 +461,7 @@ Expr MakeConv2DWinograd(Expr data, TVM_REGISTER_API("relay.op.nn._make.contrib_conv2d_winograd_without_weight_transform") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeConv2DWinograd, args, rv); - }); +.set_body_typed(MakeConv2DWinograd); RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_without_weight_transform") @@ -530,9 +524,7 @@ Expr MakeConv2DWinogradWeightTransform(Expr weight, TVM_REGISTER_API("relay.op.nn._make.contrib_conv2d_winograd_weight_transform") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeConv2DWinogradWeightTransform, args, rv); -}); +.set_body_typed(MakeConv2DWinogradWeightTransform); RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_weight_transform") @@ -580,9 +572,7 @@ Expr MakeConv2DWinogradNNPACK(Expr data, } TVM_REGISTER_API("relay.op.nn._make.contrib_conv2d_winograd_nnpack_without_weight_transform") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeConv2DWinogradNNPACK, args, rv); -}); +.set_body_typed(MakeConv2DWinogradNNPACK); RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_nnpack_without_weight_transform") .describe(R"code(Compute conv2d with winograd nnpack. Only supports NCHW layout. @@ -649,9 +639,7 @@ Expr MakeConv2DWinogradNNPACKWeightTransform(Expr weight, } TVM_REGISTER_API("relay.op.nn._make.contrib_conv2d_winograd_nnpack_weight_transform") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeConv2DWinogradNNPACKWeightTransform, args, rv); -}); +.set_body_typed(MakeConv2DWinogradNNPACKWeightTransform); RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_nnpack_weight_transform") .describe(R"code(Weight transformation of winograd fast convolution algorithm with NNPACK. 
@@ -698,9 +686,7 @@ Expr MakeConv2DNCHWc(Expr data, } TVM_REGISTER_API("relay.op.nn._make.contrib_conv2d_NCHWc") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeConv2DNCHWc, args, rv); - }); +.set_body_typed(MakeConv2DNCHWc); RELAY_REGISTER_OP("nn.contrib_conv2d_NCHWc") @@ -750,9 +736,7 @@ Expr MakeDepthwiseConv2DNCHWc(Expr data, } TVM_REGISTER_API("relay.op.nn._make.contrib_depthwise_conv2d_NCHWc") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeDepthwiseConv2DNCHWc, args, rv); - }); +.set_body_typed(MakeDepthwiseConv2DNCHWc); RELAY_REGISTER_OP("nn.contrib_depthwise_conv2d_NCHWc") @@ -910,9 +894,7 @@ Expr MakeDeformableConv2D(Expr data, } TVM_REGISTER_API("relay.op.nn._make.deformable_conv2d") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeDeformableConv2D, args, rv); - }); +.set_body_typed(MakeDeformableConv2D); } // namespace relay diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index d24431347f80..2356634c4ed0 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -78,9 +78,7 @@ Expr MakeBiasAdd(Expr data, TVM_REGISTER_API("relay.op.nn._make.bias_add") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeBiasAdd, args, rv); - }); +.set_body_typed(MakeBiasAdd); RELAY_REGISTER_OP("nn.bias_add") @@ -145,9 +143,7 @@ Expr MakeDense(Expr data, TVM_REGISTER_API("relay.op.nn._make.dense") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeDense, args, rv); - }); +.set_body_typed(MakeDense); RELAY_REGISTER_OP("nn.dense") @@ -179,9 +175,7 @@ Expr MakeLeakyRelu(Expr data, TVM_REGISTER_API("relay.op.nn._make.leaky_relu") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeLeakyRelu, args, rv); - }); +.set_body_typed(MakeLeakyRelu); RELAY_REGISTER_OP("nn.leaky_relu") @@ -244,9 +238,7 @@ Expr MakePRelu(Expr data, TVM_REGISTER_API("relay.op.nn._make.prelu") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakePRelu, args, rv); - }); +.set_body_typed(MakePRelu); RELAY_REGISTER_OP("nn.prelu") @@ -276,17 +268,14 @@ where :math:`*` is an channelwise multiplication for each sample in the batch. TVM_REGISTER_NODE_TYPE(SoftmaxAttrs); TVM_REGISTER_API("relay.op.nn._make.softmax") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - auto make_func = [](Expr data, int axis) { - auto attrs = make_node(); - attrs->axis = axis; - static const Op& op = Op::Get("nn.softmax"); - return CallNode::make(op, {data}, Attrs(attrs), {}); - }; - - runtime::detail::unpack_call(make_func, args, rv); +.set_body_typed([](Expr data, int axis) { + auto attrs = make_node(); + attrs->axis = axis; + static const Op& op = Op::Get("nn.softmax"); + return CallNode::make(op, {data}, Attrs(attrs), {}); }); + RELAY_REGISTER_OP("nn.softmax") .describe(R"code(Softmax layer. 
@@ -314,15 +303,11 @@ RELAY_REGISTER_OP("nn.softmax") // relay.nn.log_softmax TVM_REGISTER_API("relay.op.nn._make.log_softmax") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - auto make_func = [](Expr data, int axis) { - auto attrs = make_node(); - attrs->axis = axis; - static const Op& op = Op::Get("nn.log_softmax"); - return CallNode::make(op, {data}, Attrs(attrs), {}); - }; - - runtime::detail::unpack_call(make_func, args, rv); +.set_body_typed([](Expr data, int axis) { + auto attrs = make_node(); + attrs->axis = axis; + static const Op& op = Op::Get("nn.log_softmax"); + return CallNode::make(op, {data}, Attrs(attrs), {}); }); RELAY_REGISTER_OP("nn.log_softmax") @@ -382,9 +367,7 @@ Expr MakeBatchFlatten(Expr data) { TVM_REGISTER_API("relay.op.nn._make.batch_flatten") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeBatchFlatten, args, rv); - }); +.set_body_typed(MakeBatchFlatten); RELAY_REGISTER_OP("nn.batch_flatten") @@ -424,7 +407,7 @@ Example:: // relu TVM_REGISTER_API("relay.op.nn._make.relu") -.set_body_typed([](Expr data) { +.set_body_typed([](Expr data) { static const Op& op = Op::Get("nn.relu"); return CallNode::make(op, {data}, Attrs(), {}); }); @@ -469,9 +452,7 @@ Expr MakeLRN(Expr data, } TVM_REGISTER_API("relay.op.nn._make.lrn") - .set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeLRN, args, rv); - }); +.set_body_typed(MakeLRN); RELAY_REGISTER_OP("nn.lrn") .describe(R"code(LRN layer. @@ -509,9 +490,7 @@ Expr MakeL2Normalize(Expr data, } TVM_REGISTER_API("relay.op.nn._make.l2_normalize") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeL2Normalize, args, rv); - }); +.set_body_typed(MakeL2Normalize); RELAY_REGISTER_OP("nn.l2_normalize") .describe(R"code(L2 Normalization layer. @@ -556,9 +535,7 @@ Expr MakeDropout(Expr data, double rate) { } TVM_REGISTER_API("relay.op.nn._make.dropout") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeDropout, args, rv); - }); +.set_body_typed(MakeDropout); RELAY_REGISTER_OP("nn.dropout") .describe(R"code(Applies the dropout operation to the input array. @@ -622,9 +599,7 @@ Expr MakeBatchNorm(Expr data, Expr gamma, Expr beta, Expr moving_mean, Expr movi } TVM_REGISTER_API("relay.op.nn._make.batch_norm") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeBatchNorm, args, rv); - }); +.set_body_typed(MakeBatchNorm); RELAY_REGISTER_OP("nn.batch_norm") .describe(R"code(Batch normalization layer (Ioffe and Szegedy, 2014). @@ -711,9 +686,7 @@ Expr MakeBatchMatmul(Expr x, TVM_REGISTER_API("relay.op.nn._make.batch_matmul") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeBatchMatmul, args, rv); - }); +.set_body_typed(MakeBatchMatmul); RELAY_REGISTER_OP("nn.batch_matmul") diff --git a/src/relay/op/nn/pad.cc b/src/relay/op/nn/pad.cc index c653e3b9f39d..98b9d671bff9 100644 --- a/src/relay/op/nn/pad.cc +++ b/src/relay/op/nn/pad.cc @@ -115,9 +115,7 @@ Expr MakePad(Expr data, Array > pad_width, double pad_value) { } TVM_REGISTER_API("relay.op.nn._make.pad") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakePad, args, rv); - }); +.set_body_typed(MakePad); RELAY_REGISTER_OP("nn.pad") .describe(R"code(Pad for n-D tensor. 
diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc
index 0717ee5c577f..df238b38c9cd 100644
--- a/src/relay/op/nn/pooling.cc
+++ b/src/relay/op/nn/pooling.cc
@@ -186,9 +186,7 @@ Array<Tensor> Pool2DCompute(const Attrs& attrs,
 }
 
 TVM_REGISTER_API("relay.op.nn._make.max_pool2d")
-.set_body([](const TVMArgs& args, TVMRetValue* rv) {
-    runtime::detail::unpack_call<Expr, 6>(MakeMaxPool2D, args, rv);
-  });
+.set_body_typed(MakeMaxPool2D);
 
 RELAY_REGISTER_OP("nn.max_pool2d")
@@ -242,9 +240,7 @@ Expr MakeAvgPool2D(Expr data,
 
 TVM_REGISTER_API("relay.op.nn._make.avg_pool2d")
-.set_body([](const TVMArgs& args, TVMRetValue* rv) {
-    runtime::detail::unpack_call<Expr, 7>(MakeAvgPool2D, args, rv);
-  });
+.set_body_typed(MakeAvgPool2D);
 
 RELAY_REGISTER_OP("nn.avg_pool2d")
@@ -345,9 +341,7 @@ Expr MakeGlobalAvgPool2D(Expr data,
 
 TVM_REGISTER_API("relay.op.nn._make.global_avg_pool2d")
-.set_body([](const TVMArgs& args, TVMRetValue* rv) {
-    runtime::detail::unpack_call<Expr, 2>(MakeGlobalAvgPool2D, args, rv);
-  });
+.set_body_typed(MakeGlobalAvgPool2D);
 
 // GlobalAvgPool
 RELAY_REGISTER_OP("nn.global_avg_pool2d")
@@ -378,9 +372,7 @@ Expr MakeGlobalMaxPool2D(Expr data,
 }
 
 TVM_REGISTER_API("relay.op.nn._make.global_max_pool2d")
-.set_body([](const TVMArgs& args, TVMRetValue* rv) {
-    runtime::detail::unpack_call<Expr, 2>(MakeGlobalMaxPool2D, args, rv);
-  });
+.set_body_typed(MakeGlobalMaxPool2D);
 
 RELAY_REGISTER_OP("nn.global_max_pool2d")
diff --git a/src/relay/op/nn/upsampling.cc b/src/relay/op/nn/upsampling.cc
index 98458b9dc258..acefaf3e7e5d 100644
--- a/src/relay/op/nn/upsampling.cc
+++ b/src/relay/op/nn/upsampling.cc
@@ -110,9 +110,7 @@ Expr MakeUpSampling(Expr data,
 
 TVM_REGISTER_API("relay.op.nn._make.upsampling")
-.set_body([](const TVMArgs& args, TVMRetValue* rv) {
-    runtime::detail::unpack_call<Expr, 4>(MakeUpSampling, args, rv);
-  });
+.set_body_typed(MakeUpSampling);
 
 RELAY_REGISTER_OP("nn.upsampling")
diff --git a/src/relay/op/tensor/reduce.cc b/src/relay/op/tensor/reduce.cc
index 7bade46b31d4..b889b6ce51cd 100644
--- a/src/relay/op/tensor/reduce.cc
+++ b/src/relay/op/tensor/reduce.cc
@@ -265,8 +265,8 @@ bool ReduceRel(const Array<Type>& types,
 #define RELAY_REGISTER_REDUCE_OP(OpName)                           \
   TVM_REGISTER_API("relay.op._make." OpName)                       \
-  .set_body([](const TVMArgs& args, TVMRetValue* rv) {             \
-      auto make_func = [](Expr data,                               \
+  .set_body_typed<Call(Expr, Array<Integer>, bool, bool)>([](      \
+                        Expr data,                                 \
                         Array<Integer> axis,                       \
                         bool keepdims,                             \
                         bool exclude) {                            \
@@ -276,8 +276,6 @@ bool ReduceRel(const Array<Type>& types,
       attrs->exclude = exclude;                                    \
       static const Op& op = Op::Get(OpName);                       \
       return CallNode::make(op, {data}, Attrs(attrs), {});         \
-    };                                                             \
-    runtime::detail::unpack_call<Expr, 4>(make_func, args, rv);    \
   });                                                              \
 RELAY_REGISTER_OP(OpName)                                          \
   .set_num_inputs(1)                                               \
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index f86156bdbddc..873e75d9660b 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -81,9 +81,7 @@ Expr MakeCast(Expr data,
 }
 
 TVM_REGISTER_API("relay._make.cast")
-.set_body([](const TVMArgs& args, TVMRetValue* rv) {
-    runtime::detail::unpack_call<Expr, 2>(MakeCast, args, rv);
-});
+.set_body_typed(MakeCast);
 
 RELAY_REGISTER_OP("cast")
.describe(R"code(Cast the data into a new data type.
@@ -161,9 +159,7 @@ Expr MakeExpandDims(Expr data, } TVM_REGISTER_API("relay.op._make.expand_dims") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeExpandDims, args, rv); -}); +.set_body_typed(MakeExpandDims); RELAY_REGISTER_OP("expand_dims") .describe(R"code(Insert `num_newaxis` axises at the position given by `axis` @@ -279,9 +275,7 @@ Expr MakeConcatenate(Expr data, } TVM_REGISTER_API("relay.op._make.concatenate") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeConcatenate, args, rv); -}); +.set_body_typed(MakeConcatenate); RELAY_REGISTER_OP("concatenate") .describe(R"code(Concatenate the input tensors along the given axis. @@ -367,9 +361,7 @@ Expr MakeStack(Expr data, } TVM_REGISTER_API("relay.op._make.stack") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeStack, args, rv); -}); +.set_body_typed(MakeStack); RELAY_REGISTER_OP("stack") .describe(R"code(Stack the input tensors along the given axis. @@ -461,9 +453,7 @@ Expr MakeTranspose(Expr data, } TVM_REGISTER_API("relay.op._make.transpose") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeTranspose, args, rv); -}); +.set_body_typed(MakeTranspose); RELAY_REGISTER_OP("transpose") .describe(R"code(Permutes the dimensions of an array. @@ -598,9 +588,7 @@ Expr MakeReshape(Expr data, } TVM_REGISTER_API("relay.op._make.reshape") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeReshape, args, rv); -}); +.set_body_typed(MakeReshape); RELAY_REGISTER_OP("reshape") .describe(R"code(Reshapes the input array. @@ -698,9 +686,7 @@ Expr MakeReshapeLike(Expr data, TVM_REGISTER_API("relay.op._make.reshape_like") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeReshapeLike, args, rv); -}); +.set_body_typed(MakeReshapeLike); RELAY_REGISTER_OP("reshape_like") @@ -790,9 +776,7 @@ Expr MakeTake(Expr data, } TVM_REGISTER_API("relay.op._make.take") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeTake, args, rv); -}); +.set_body_typed(MakeTake); RELAY_REGISTER_OP("take") .describe(R"code(Take elements from an array along an axis. @@ -873,9 +857,7 @@ Expr MakeFull(Expr fill_value, } TVM_REGISTER_API("relay.op._make.full") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeFull, args, rv); -}); +.set_body_typed(MakeFull); RELAY_REGISTER_OP("full") .describe(R"code(Fill array with scalar value. @@ -910,9 +892,7 @@ Expr MakeZeros(Array shape, } TVM_REGISTER_API("relay.op._make.zeros") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeZeros, args, rv); - }); +.set_body_typed(MakeZeros); RELAY_REGISTER_OP("zeros") .describe(R"code(Fill array with zeros. @@ -933,9 +913,7 @@ Expr MakeOnes(Array shape, } TVM_REGISTER_API("relay.op._make.ones") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeOnes, args, rv); - }); +.set_body_typed(MakeOnes); RELAY_REGISTER_OP("ones") .describe(R"code(Fill array with ones. 
@@ -982,9 +960,7 @@ Expr MakeFullLike(Expr data, } TVM_REGISTER_API("relay.op._make.full_like") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeFullLike, args, rv); - }); +.set_body_typed(MakeFullLike); RELAY_REGISTER_OP("full_like") .describe(R"code(Return an scalar value array with the same shape @@ -1041,9 +1017,7 @@ Expr MakeArange(tvm::Expr start, } TVM_REGISTER_API("relay.op._make.arange") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeArange, args, rv); -}); +.set_body_typed(MakeArange); RELAY_REGISTER_OP("arange") .describe(R"code(Returns evenly spaced values within a given interval. @@ -1117,9 +1091,7 @@ Expr MakeRepeat(Expr data, } TVM_REGISTER_API("relay.op._make.repeat") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeRepeat, args, rv); -}); +.set_body_typed(MakeRepeat); RELAY_REGISTER_OP("repeat") .describe(R"code(Repeat elements of an array `repeats` times along axis `axis` @@ -1217,9 +1189,7 @@ Expr MakeTile(Expr data, } TVM_REGISTER_API("relay.op._make.tile") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeTile, args, rv); -}); +.set_body_typed(MakeTile); RELAY_REGISTER_OP("tile") .describe(R"code(Repeat the whole array multiple times. @@ -1280,9 +1250,7 @@ Expr MakeReverse(Expr data, } TVM_REGISTER_API("relay.op._make.reverse") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeReverse, args, rv); -}); +.set_body_typed(MakeReverse); RELAY_REGISTER_OP("reverse") .describe(R"code(Reverses the order of elements along given `axis` while preserving array shape. @@ -1345,9 +1313,7 @@ Array WhereCompute(const Attrs& attrs, } TVM_REGISTER_API("relay.op._make.where") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeWhere, args, rv); -}); +.set_body_typed(MakeWhere); RELAY_REGISTER_OP("where") .describe(R"code( @@ -1400,9 +1366,7 @@ Expr MakeSqueeze(Expr data, } TVM_REGISTER_API("relay.op._make.squeeze") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeSqueeze, args, rv); - }); +.set_body_typed(MakeSqueeze); bool SqueezeRel(const Array& types, @@ -1507,9 +1471,7 @@ Array CollapseSumLikeCompute(const Attrs& attrs, } TVM_REGISTER_API("relay.op._make.collapse_sum_like") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeCollapseSumLike, args, rv); - }); +.set_body_typed(MakeCollapseSumLike); RELAY_REGISTER_OP("collapse_sum_like") .describe(R"code(Collapse the first input to match the shape of the second input. @@ -1554,9 +1516,7 @@ Array BroadCastToCompute(const Attrs& attrs, } TVM_REGISTER_API("relay.op._make.broadcast_to") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeBroadCastTo, args, rv); - }); +.set_body_typed(MakeBroadCastTo); RELAY_REGISTER_OP("broadcast_to") .describe(R"code(Broadcast the first input to match the shape argument. @@ -1594,9 +1554,7 @@ Array BroadCastToLikeCompute(const Attrs& attrs, } TVM_REGISTER_API("relay.op._make.broadcast_to_like") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeBroadCastToLike, args, rv); - }); +.set_body_typed(MakeBroadCastToLike); RELAY_REGISTER_OP("broadcast_to_like") .describe(R"code(Broadcast the first input to match the shape of the second input. 
@@ -1806,9 +1764,7 @@ Array StridedSliceCompute(const Attrs& attrs, TVM_REGISTER_API("relay.op._make.strided_slice") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeStridedSlice, args, rv); - }); +.set_body_typed(MakeStridedSlice); RELAY_REGISTER_OP("strided_slice") @@ -2081,9 +2037,7 @@ Array SliceLikeCompute(const Attrs& attrs, TVM_REGISTER_API("relay.op._make.slice_like") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeSliceLike, args, rv); -}); +.set_body_typed(MakeSliceLike); RELAY_REGISTER_OP("slice_like") @@ -2144,9 +2098,7 @@ Expr MakeLayoutTransform(Expr data, } TVM_REGISTER_API("relay.op._make.layout_transform") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeLayoutTransform, args, rv); -}); +.set_body_typed(MakeLayoutTransform); RELAY_REGISTER_OP("layout_transform") .describe(R"code(Transform the input data layout. @@ -2174,9 +2126,7 @@ Expr MakeReverseReshape(Expr data, } TVM_REGISTER_API("relay.op._make._contrib_reverse_reshape") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeReverseReshape, args, rv); -}); +.set_body_typed(MakeReverseReshape); RELAY_REGISTER_OP("_contrib_reverse_reshape") .describe(R"code(Reshapes the input array where the special values are inferred from @@ -2250,9 +2200,7 @@ Expr MakeGatherND(Expr data, } TVM_REGISTER_API("relay.op._make.gather_nd") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeGatherND, args, rv); -}); +.set_body_typed(MakeGatherND); RELAY_REGISTER_OP("gather_nd") .describe(R"code(Gather elements or slices from data and store to diff --git a/src/relay/op/vision/multibox_op.cc b/src/relay/op/vision/multibox_op.cc index 2c9f76ba2015..56a03ff80bc9 100644 --- a/src/relay/op/vision/multibox_op.cc +++ b/src/relay/op/vision/multibox_op.cc @@ -73,9 +73,7 @@ Expr MakeMultiBoxPrior(Expr data, TVM_REGISTER_API("relay.op.vision._make.multibox_prior") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeMultiBoxPrior, args, rv); -}); +.set_body_typed(MakeMultiBoxPrior); RELAY_REGISTER_OP("vision.multibox_prior") @@ -147,9 +145,7 @@ Expr MakeMultiBoxTransformLoc(Expr cls_prob, } TVM_REGISTER_API("relay.op.vision._make.multibox_transform_loc") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeMultiBoxTransformLoc, args, rv); -}); +.set_body_typed(MakeMultiBoxTransformLoc); RELAY_REGISTER_OP("vision.multibox_transform_loc") .describe(R"doc("Location transformation for multibox detection." 
diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc index 75161bfd1e92..5344bce3d641 100644 --- a/src/relay/op/vision/nms.cc +++ b/src/relay/op/vision/nms.cc @@ -59,9 +59,7 @@ Expr MakeGetValidCounts(Expr data, TVM_REGISTER_API("relay.op.vision._make.get_valid_counts") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeGetValidCounts, args, rv); -}); +.set_body_typed(MakeGetValidCounts); RELAY_REGISTER_OP("vision.get_valid_counts") @@ -125,9 +123,7 @@ Expr MakeNMS(Expr data, TVM_REGISTER_API("relay.op.vision._make.non_max_suppression") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeNMS, args, rv); -}); +.set_body_typed(MakeNMS); RELAY_REGISTER_OP("vision.non_max_suppression") diff --git a/src/relay/op/vision/rcnn_op.cc b/src/relay/op/vision/rcnn_op.cc index 70fe292ed9e5..0522ab845fad 100644 --- a/src/relay/op/vision/rcnn_op.cc +++ b/src/relay/op/vision/rcnn_op.cc @@ -62,9 +62,7 @@ Expr MakeROIAlign(Expr data, Expr rois, Array pooled_size, double spa } TVM_REGISTER_API("relay.op.vision._make.roi_align") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeROIAlign, args, rv); - }); +.set_body_typed(MakeROIAlign); RELAY_REGISTER_OP("vision.roi_align") .describe(R"doc(ROI Align operator. @@ -114,9 +112,7 @@ Expr MakeROIPool(Expr data, Expr rois, Array pooled_size, double spat } TVM_REGISTER_API("relay.op.vision._make.roi_pool") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeROIPool, args, rv); - }); +.set_body_typed(MakeROIPool); RELAY_REGISTER_OP("vision.roi_pool") .describe(R"doc(ROI Pool operator. @@ -182,9 +178,7 @@ Expr MakeProposal(Expr cls_prob, Expr bbox_pred, Expr im_info, Array } TVM_REGISTER_API("relay.op.vision._make.proposal") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeProposal, args, rv); - }); +.set_body_typed(MakeProposal); RELAY_REGISTER_OP("vision.proposal") .describe(R"code(Generate region proposals via RPN. 
diff --git a/src/relay/op/vision/yolo.cc b/src/relay/op/vision/yolo.cc index 310e30a51890..0a1d9614976e 100644 --- a/src/relay/op/vision/yolo.cc +++ b/src/relay/op/vision/yolo.cc @@ -71,9 +71,7 @@ Expr MakeYoloReorg(Expr data, TVM_REGISTER_API("relay.op.vision._make.yolo_reorg") -.set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeYoloReorg, args, rv); -}); +.set_body_typed(MakeYoloReorg); RELAY_REGISTER_OP("vision.yolo_reorg") diff --git a/src/relay/pass/canonicalize_ops.cc b/src/relay/pass/canonicalize_ops.cc index c4350cc0c9db..9a4602750195 100644 --- a/src/relay/pass/canonicalize_ops.cc +++ b/src/relay/pass/canonicalize_ops.cc @@ -61,9 +61,7 @@ Expr CanonicalizeOps(const Expr& e) { } TVM_REGISTER_API("relay._ir_pass.canonicalize_ops") -.set_body([](TVMArgs args, TVMRetValue* ret) { -*ret = CanonicalizeOps(args[0]); -}); +.set_body_typed(CanonicalizeOps); } // namespace relay } // namespace tvm diff --git a/src/relay/pass/combine_parallel_conv2d.cc b/src/relay/pass/combine_parallel_conv2d.cc index cd7a852bcad7..7e76322d5a2a 100644 --- a/src/relay/pass/combine_parallel_conv2d.cc +++ b/src/relay/pass/combine_parallel_conv2d.cc @@ -355,9 +355,7 @@ Expr CombineParallelConv2D(const Expr& expr, uint64_t min_num_branches) { } TVM_REGISTER_API("relay._ir_pass.CombineParallelConv2D") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = CombineParallelConv2D(args[0], args[1]); -}); +.set_body_typed(CombineParallelConv2D); } // namespace relay } // namespace tvm diff --git a/src/relay/pass/dead_code.cc b/src/relay/pass/dead_code.cc index 06cd9091749b..c5c4f333ecfe 100644 --- a/src/relay/pass/dead_code.cc +++ b/src/relay/pass/dead_code.cc @@ -148,9 +148,7 @@ Expr DeadCodeElimination(const Expr& e) { } TVM_REGISTER_API("relay._ir_pass.dead_code_elimination") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = DeadCodeElimination(args[0]); - }); +.set_body_typed(DeadCodeElimination); } // namespace relay } // namespace tvm diff --git a/src/relay/pass/device_annotation.cc b/src/relay/pass/device_annotation.cc index 6f063830cbe9..46f4268cc970 100644 --- a/src/relay/pass/device_annotation.cc +++ b/src/relay/pass/device_annotation.cc @@ -493,19 +493,13 @@ Map CollectDeviceAnnotationOps(const Expr& expr) { } TVM_REGISTER_API("relay._ir_pass.CollectDeviceInfo") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = CollectDeviceInfo(args[0]); -}); +.set_body_typed(CollectDeviceInfo); TVM_REGISTER_API("relay._ir_pass.RewriteDeviceAnnotation") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = RewriteAnnotatedOps(args[0], args[1]); -}); +.set_body_typed(RewriteAnnotatedOps); TVM_REGISTER_API("relay._ir_pass.CollectDeviceAnnotationOps") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = CollectDeviceAnnotationOps(args[0]); -}); +.set_body_typed(CollectDeviceAnnotationOps); } // namespace relay } // namespace tvm diff --git a/src/relay/pass/fold_constant.cc b/src/relay/pass/fold_constant.cc index 9d55a548be10..5bfee6cfe9f6 100644 --- a/src/relay/pass/fold_constant.cc +++ b/src/relay/pass/fold_constant.cc @@ -210,9 +210,7 @@ Expr FoldConstant(const Expr& expr) { } TVM_REGISTER_API("relay._ir_pass.FoldConstant") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = FoldConstant(args[0]); -}); +.set_body_typed(FoldConstant); } // namespace relay } // namespace tvm diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index 4b50c64459a0..6de9c2d65f90 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -912,8 +912,6 
@@ Expr FuseOps(const Expr& expr, int fuse_opt_level) {
 }
 
 TVM_REGISTER_API("relay._ir_pass.FuseOps")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-  *ret = FuseOps(args[0], args[1]);
-});
+.set_body_typed(FuseOps);
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/gradient.cc b/src/relay/pass/gradient.cc
index 8a5d1df53a26..5c5ea01ac2f3 100644
--- a/src/relay/pass/gradient.cc
+++ b/src/relay/pass/gradient.cc
@@ -247,10 +247,7 @@ Expr FirstOrderGradient(const Expr& re, const Module& mod) {
 }
 
 TVM_REGISTER_API("relay._ir_pass.first_order_gradient")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  CHECK_EQ(args.size(), 2);
-  *ret = FirstOrderGradient(args[0], args[1]);
-});
+.set_body_typed(FirstOrderGradient);
 
 struct ReverseADType : TypeMutator {
   Type VisitType_(const TensorTypeNode* ttn) final {
@@ -263,7 +260,7 @@ struct ReverseAD : ExprMutator {
   Var bp;
   const OpMap<FPrimalGradient> rev_map = Op::GetAttr<FPrimalGradient>("FPrimalGradient");
 
-  ReverseAD(const Var& bp) : bp(bp) { }
+  ReverseAD(const Var& bp) : bp(bp) { }  // NOLINT(*)
 
   Expr VisitExpr_(const OpNode* op) final {
     LOG(FATAL) << "op should only be inside call";
@@ -349,10 +346,7 @@ Expr Gradient(const Expr& re, const Module& mod) {
 }
 
 TVM_REGISTER_API("relay._ir_pass.gradient")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  CHECK_EQ(args.size(), 2);
-  *ret = Gradient(args[0], args[1]);
-});
+.set_body_typed(Gradient);
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/mac_count.cc b/src/relay/pass/mac_count.cc
index 702e703cd902..c9ee4eec0337 100644
--- a/src/relay/pass/mac_count.cc
+++ b/src/relay/pass/mac_count.cc
@@ -147,9 +147,7 @@ int64_t GetTotalMacNumber(const Expr& expr) {
 }
 
 TVM_REGISTER_API("relay._ir_pass.GetTotalMacNumber")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-  *ret = GetTotalMacNumber(args[0]);
-});
+.set_body_typed(GetTotalMacNumber);
 
 }  // namespace mac_count
 }  // namespace relay
diff --git a/src/relay/pass/pass_manager.cc b/src/relay/pass/pass_manager.cc
index fad3728d433e..d607247b3bc8 100644
--- a/src/relay/pass/pass_manager.cc
+++ b/src/relay/pass/pass_manager.cc
@@ -426,12 +426,7 @@ Pass CreateSequentialPass(const tvm::Array<Pass>& passes,
 TVM_REGISTER_NODE_TYPE(PassInfoNode);
 
 TVM_REGISTER_API("relay._ir_pass.PassInfo")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  int opt_level = args[0];
-  std::string name = args[1];
-  tvm::Array<tvm::Expr> required = args[2];
-  *ret = PassInfoNode::make(opt_level, name, required);
-});
+.set_body_typed(PassInfoNode::make);
 
 TVM_REGISTER_API("relay._ir_pass.Info")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
@@ -456,13 +451,7 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 TVM_REGISTER_NODE_TYPE(ModulePassNode);
 
 TVM_REGISTER_API("relay._ir_pass.CreateModulePass")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  PackedFunc pass_func = args[0];
-  int opt_level = args[1];
-  std::string name = args[2];
-  tvm::Array<tvm::Expr> required = args[3];
-  *ret = CreateModulePass(pass_func, opt_level, name, required);
-});
+.set_body_typed(CreateModulePass);
 
 TVM_REGISTER_API("relay._ir_pass.RunPass")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
@@ -487,13 +476,7 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 TVM_REGISTER_NODE_TYPE(FunctionPassNode);
 
 TVM_REGISTER_API("relay._ir_pass.CreateFunctionPass")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  PackedFunc pass_func = args[0];
-  int opt_level = args[1];
-  std::string name = args[2];
-  tvm::Array<tvm::Expr> required = args[3];
-  *ret = CreateFunctionPass(pass_func, opt_level, name, required);
-});
+.set_body_typed(CreateFunctionPass);
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<FunctionPassNode>([](const FunctionPassNode* node,
@@ -541,9 +524,7 @@ TVM_REGISTER_API("relay._ir_pass.SetContext")
 TVM_REGISTER_NODE_TYPE(PassContextNode);
 
 TVM_REGISTER_API("relay._ir_pass.PassContext")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = PassContextNode::make();
-});
+.set_body_typed(PassContextNode::make);
 
 TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
 .set_dispatch<PassContextNode>([](const PassContextNode* node,
diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc
index cb0f9d9c5acb..5fa30535b002 100644
--- a/src/relay/pass/quantize.cc
+++ b/src/relay/pass/quantize.cc
@@ -571,20 +571,13 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
 });
 
 TVM_REGISTER_API("relay._quantize._GetCurrentQConfig")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = QConfig::Current();
-  });
+.set_body_typed(QConfig::Current);
 
 TVM_REGISTER_API("relay._quantize._EnterQConfigScope")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  QConfig target = args[0];
-  QConfig::EnterQConfigScope(target);
-  });
+.set_body_typed(QConfig::EnterQConfigScope);
 
 TVM_REGISTER_API("relay._quantize._ExitQConfigScope")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  QConfig::ExitQConfigScope();
-  });
+.set_body_typed(QConfig::ExitQConfigScope);
 
 }  // namespace quantize
 }  // namespace relay
diff --git a/src/relay/pass/simplify_inference.cc b/src/relay/pass/simplify_inference.cc
index 28ebaaa75546..cecebc5c04ed 100644
--- a/src/relay/pass/simplify_inference.cc
+++ b/src/relay/pass/simplify_inference.cc
@@ -103,9 +103,7 @@ Expr SimplifyInference(const Expr& e) {
 }
 
 TVM_REGISTER_API("relay._ir_pass.simplify_inference")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = SimplifyInference(args[0]);
-  });
+.set_body_typed(SimplifyInference);
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/to_a_normal_form.cc b/src/relay/pass/to_a_normal_form.cc
index bac6fd28faf5..5507de471ae5 100644
--- a/src/relay/pass/to_a_normal_form.cc
+++ b/src/relay/pass/to_a_normal_form.cc
@@ -491,9 +491,7 @@ Expr ToANormalForm(const Expr& e, const Module& m) {
 }
 
 TVM_REGISTER_API("relay._ir_pass.to_a_normal_form")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = ToANormalForm(args[0], args[1]);
-  });
+.set_body_typed(static_cast<Expr (*)(const Expr&, const Module&)>(ToANormalForm));
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/to_graph_normal_form.cc b/src/relay/pass/to_graph_normal_form.cc
index cc7e1a43068e..490a80f308ce 100644
--- a/src/relay/pass/to_graph_normal_form.cc
+++ b/src/relay/pass/to_graph_normal_form.cc
@@ -77,9 +77,7 @@ Expr ToGraphNormalForm(const Expr& e) {
 }
 
 TVM_REGISTER_API("relay._ir_pass.to_graph_normal_form")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = ToGraphNormalForm(args[0]);
-});
+.set_body_typed(ToGraphNormalForm);
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
index 5abf0b74ab68..30d4d79f6c86 100644
--- a/src/relay/pass/type_infer.cc
+++ b/src/relay/pass/type_infer.cc
@@ -801,8 +801,8 @@ Function InferType(const Function& func,
 }
 
 TVM_REGISTER_API("relay._ir_pass.infer_type")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = InferType(args[0], args[1]);
+.set_body_typed<Expr(const Expr&, const Module&)>([](const Expr& expr, const Module& mod_ref) {
+  return InferType(expr, mod_ref);
 });
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/util.cc b/src/relay/pass/util.cc
index fa655a785338..8e02cf127bfd 100644
--- a/src/relay/pass/util.cc
+++ b/src/relay/pass/util.cc
@@ -275,9
+275,7 @@ tvm::Array AllVars(const Expr& expr) { } TVM_REGISTER_API("relay._ir_pass.free_vars") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = FreeVars(args[0]); - }); +.set_body_typed(FreeVars); TVM_REGISTER_API("relay._ir_pass.bound_vars") .set_body([](TVMArgs args, TVMRetValue* ret) { @@ -290,9 +288,7 @@ TVM_REGISTER_API("relay._ir_pass.bound_vars") }); TVM_REGISTER_API("relay._ir_pass.all_vars") - .set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = AllVars(args[0]); - }); +.set_body_typed(AllVars); TVM_REGISTER_API("relay._ir_pass.free_type_vars") .set_body([](TVMArgs args, TVMRetValue* ret) { diff --git a/src/relay/pass/well_formed.cc b/src/relay/pass/well_formed.cc index 86107d66e52f..4eaaa934e78b 100644 --- a/src/relay/pass/well_formed.cc +++ b/src/relay/pass/well_formed.cc @@ -79,10 +79,7 @@ bool WellFormed(const Expr& e) { } TVM_REGISTER_API("relay._ir_pass.well_formed") - .set_body([](TVMArgs args, TVMRetValue *ret) { - Expr e = args[0]; - *ret = WellFormed(e); - }); +.set_body_typed(WellFormed); } // namespace relay } // namespace tvm diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc index a46f0ebfdbdc..55d9e648e154 100644 --- a/src/runtime/cuda/cuda_module.cc +++ b/src/runtime/cuda/cuda_module.cc @@ -308,18 +308,12 @@ Module CUDAModuleLoadBinary(void* strm) { } TVM_REGISTER_GLOBAL("module.loadfile_cubin") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = CUDAModuleLoadFile(args[0], args[1]); - }); +.set_body_typed(CUDAModuleLoadFile); TVM_REGISTER_GLOBAL("module.loadfile_ptx") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = CUDAModuleLoadFile(args[0], args[1]); - }); +.set_body_typed(CUDAModuleLoadFile); TVM_REGISTER_GLOBAL("module.loadbinary_cuda") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = CUDAModuleLoadBinary(args[0]); - }); +.set_body_typed(CUDAModuleLoadBinary); } // namespace runtime } // namespace tvm diff --git a/src/runtime/metal/metal_module.mm b/src/runtime/metal/metal_module.mm index e1f0e3fd534b..af809d7619bd 100644 --- a/src/runtime/metal/metal_module.mm +++ b/src/runtime/metal/metal_module.mm @@ -310,13 +310,9 @@ Module MetalModuleLoadBinary(void* strm) { } TVM_REGISTER_GLOBAL("module.loadfile_metal") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = MetalModuleLoadFile(args[0], args[1]); - }); +.set_body_typed(MetalModuleLoadFile); TVM_REGISTER_GLOBAL("module.loadbinary_metal") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = MetalModuleLoadBinary(args[0]); - }); +.set_body_typed(MetalModuleLoadBinary); } // namespace runtime } // namespace tvm diff --git a/src/runtime/opencl/aocl/aocl_module.cc b/src/runtime/opencl/aocl/aocl_module.cc index 38e82edfe296..d9a3aa23c4c5 100644 --- a/src/runtime/opencl/aocl/aocl_module.cc +++ b/src/runtime/opencl/aocl/aocl_module.cc @@ -69,9 +69,7 @@ Module AOCLModuleLoadFile(const std::string& file_name, } TVM_REGISTER_GLOBAL("module.loadfile_aocx") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = AOCLModuleLoadFile(args[0], args[1]); - }); +.set_body_typed(AOCLModuleLoadFile); } // namespace runtime } // namespace tvm diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index 543ffb9825b1..971ae3482014 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -281,18 +281,12 @@ Module OpenCLModuleLoadBinary(void* strm) { } TVM_REGISTER_GLOBAL("module.loadfile_cl") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = OpenCLModuleLoadFile(args[0], args[1]); - }); 
+.set_body_typed(OpenCLModuleLoadFile); TVM_REGISTER_GLOBAL("module.loadfile_clbin") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = OpenCLModuleLoadFile(args[0], args[1]); - }); +.set_body_typed(OpenCLModuleLoadFile); TVM_REGISTER_GLOBAL("module.loadbinary_opencl") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = OpenCLModuleLoadBinary(args[0]); - }); +.set_body_typed(OpenCLModuleLoadBinary); } // namespace runtime } // namespace tvm diff --git a/src/runtime/opencl/sdaccel/sdaccel_module.cc b/src/runtime/opencl/sdaccel/sdaccel_module.cc index 9bfc9d2b2705..900d56433514 100644 --- a/src/runtime/opencl/sdaccel/sdaccel_module.cc +++ b/src/runtime/opencl/sdaccel/sdaccel_module.cc @@ -80,13 +80,9 @@ Module SDAccelModuleLoadBinary(void* strm) { } TVM_REGISTER_GLOBAL("module.loadfile_xclbin") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = SDAccelModuleLoadFile(args[0], args[1]); - }); +.set_body_typed(SDAccelModuleLoadFile); TVM_REGISTER_GLOBAL("module.loadfile_awsxclbin") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = SDAccelModuleLoadFile(args[0], args[1]); - }); +.set_body_typed(SDAccelModuleLoadFile); } // namespace runtime } // namespace tvm diff --git a/src/runtime/rocm/rocm_module.cc b/src/runtime/rocm/rocm_module.cc index b7b93c7c4dfd..6531f97d4b12 100644 --- a/src/runtime/rocm/rocm_module.cc +++ b/src/runtime/rocm/rocm_module.cc @@ -243,14 +243,10 @@ Module ROCMModuleLoadBinary(void* strm) { TVM_REGISTER_GLOBAL("module.loadbinary_hsaco") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = ROCMModuleLoadBinary(args[0]); - }); +.set_body_typed(ROCMModuleLoadBinary); TVM_REGISTER_GLOBAL("module.loadbinary_hip") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = ROCMModuleLoadBinary(args[0]); - }); +.set_body_typed(ROCMModuleLoadBinary); } // namespace runtime } // namespace tvm diff --git a/src/runtime/rpc/rpc_event_impl.cc b/src/runtime/rpc/rpc_event_impl.cc index dfbdb2699d39..7a142f3373db 100644 --- a/src/runtime/rpc/rpc_event_impl.cc +++ b/src/runtime/rpc/rpc_event_impl.cc @@ -64,8 +64,6 @@ PackedFunc CreateEventDrivenServer(PackedFunc fsend, } TVM_REGISTER_GLOBAL("rpc._CreateEventDrivenServer") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = CreateEventDrivenServer(args[0], args[1], args[2]); - }); +.set_body_typed(CreateEventDrivenServer); } // namespace runtime } // namespace tvm diff --git a/src/runtime/rpc/rpc_socket_impl.cc b/src/runtime/rpc/rpc_socket_impl.cc index 33d852f5a575..16528bcc68a1 100644 --- a/src/runtime/rpc/rpc_socket_impl.cc +++ b/src/runtime/rpc/rpc_socket_impl.cc @@ -110,9 +110,7 @@ void RPCServerLoop(int sockfd) { } TVM_REGISTER_GLOBAL("rpc._Connect") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = RPCClientConnect(args[0], args[1], args[2]); - }); +.set_body_typed(RPCClientConnect); TVM_REGISTER_GLOBAL("rpc._ServerLoop") .set_body([](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/stackvm/stackvm_module.cc b/src/runtime/stackvm/stackvm_module.cc index 5e6f96be50df..4e7d42279001 100644 --- a/src/runtime/stackvm/stackvm_module.cc +++ b/src/runtime/stackvm/stackvm_module.cc @@ -142,9 +142,7 @@ Module StackVMModuleCreate(std::unordered_map fmap, } TVM_REGISTER_GLOBAL("module.loadfile_stackvm") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = StackVMModuleNode::LoadFromFile(args[0], args[1]); - }); +.set_body_typed(StackVMModuleNode::LoadFromFile); } // namespace runtime } // namespace tvm diff --git a/src/runtime/vulkan/vulkan_module.cc b/src/runtime/vulkan/vulkan_module.cc index 
cfa80bef151f..c1db14d35674 100644 --- a/src/runtime/vulkan/vulkan_module.cc +++ b/src/runtime/vulkan/vulkan_module.cc @@ -427,13 +427,9 @@ Module VulkanModuleLoadBinary(void* strm) { } TVM_REGISTER_GLOBAL("module.loadfile_vulkan") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = VulkanModuleLoadFile(args[0], args[1]); - }); +.set_body_typed(VulkanModuleLoadFile); TVM_REGISTER_GLOBAL("module.loadbinary_vulkan") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = VulkanModuleLoadBinary(args[0]); - }); +.set_body_typed(VulkanModuleLoadBinary); } // namespace runtime } // namespace tvm diff --git a/web/web_runtime.cc b/web/web_runtime.cc index 273d43b38f22..12bc53cd3407 100644 --- a/web/web_runtime.cc +++ b/web/web_runtime.cc @@ -60,16 +60,16 @@ struct RPCEnv { }; TVM_REGISTER_GLOBAL("tvm.rpc.server.workpath") -.set_body([](TVMArgs args, TVMRetValue* rv) { +.set_body_typed([](std::string path) { static RPCEnv env; - *rv = env.GetPath(args[0]); + return env.GetPath(path); }); TVM_REGISTER_GLOBAL("tvm.rpc.server.load_module") -.set_body([](TVMArgs args, TVMRetValue *rv) { - std::string file_name = "/rpc/" + args[0].operator std::string(); - *rv = Module::LoadFromFile(file_name, ""); +.set_body_typed([](std::string path) { + std::string file_name = "/rpc/" + path; LOG(INFO) << "Load module from " << file_name << " ..."; + return Module::LoadFromFile(file_name, ""); }); } // namespace contrib } // namespace tvm From 10c5378e54f2f7fe3df558a3b35919b82569033a Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 11 Apr 2019 10:58:54 +0800 Subject: [PATCH 002/106] [AutoTVM] fix argument type for curve feature (#3004) --- src/autotvm/touch_extractor.cc | 4 ++-- tests/python/unittest/test_autotvm_feature.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/autotvm/touch_extractor.cc b/src/autotvm/touch_extractor.cc index e24e757427ba..002b970588ec 100644 --- a/src/autotvm/touch_extractor.cc +++ b/src/autotvm/touch_extractor.cc @@ -514,10 +514,10 @@ TVM_REGISTER_API("autotvm.feature.GetItervarFeatureFlatten") TVM_REGISTER_API("autotvm.feature.GetCurveSampleFeatureFlatten") .set_body([](TVMArgs args, TVMRetValue *ret) { Stmt stmt = args[0]; - bool take_log = args[1]; + int sample_n = args[1]; std::vector ret_feature; - GetCurveSampleFeatureFlatten(stmt, take_log, &ret_feature); + GetCurveSampleFeatureFlatten(stmt, sample_n, &ret_feature); TVMByteArray arr; arr.size = sizeof(float) * ret_feature.size(); diff --git a/tests/python/unittest/test_autotvm_feature.py b/tests/python/unittest/test_autotvm_feature.py index 401a8d3be407..e0736c280dc4 100644 --- a/tests/python/unittest/test_autotvm_feature.py +++ b/tests/python/unittest/test_autotvm_feature.py @@ -61,6 +61,23 @@ def test_iter_feature_gemm(): assert ans[pair[0]] == pair[1:], "%s: %s vs %s" % (pair[0], ans[pair[0]], pair[1:]) +def test_curve_feature_gemm(): + N = 128 + + k = tvm.reduce_axis((0, N), 'k') + A = tvm.placeholder((N, N), name='A') + B = tvm.placeholder((N, N), name='B') + C = tvm.compute( + A.shape, + lambda y, x: tvm.sum(A[y, k] * B[k, x], axis=k), + name='C') + + s = tvm.create_schedule(C.op) + + feas = feature.get_buffer_curve_sample_flatten(s, [A, B, C], sample_n=30) + # sample_n * #buffers * #curves * 2 numbers per curve + assert len(feas) == 30 * 3 * 4 * 2 + def test_feature_shape(): """test the dimensions of flatten feature are the same""" @@ -112,4 +129,6 @@ def get_gemm_feature(target): if __name__ == "__main__": test_iter_feature_gemm() + test_curve_feature_gemm() 
test_feature_shape() + From b1f5d90b1fd622ecb10785da6e47cc97ef39c669 Mon Sep 17 00:00:00 2001 From: Alexey Romanov Date: Fri, 12 Apr 2019 06:59:20 +0300 Subject: [PATCH 003/106] Support SpaceToBatchND/BatchToSpaceND in Tensorflow frontend (#2943) Thanks @alexeyr . This is now merged. --- python/tvm/relay/frontend/tensorflow.py | 87 +++++++++++ .../frontend/tensorflow/test_forward.py | 135 +++++++++++++++++- 2 files changed, 216 insertions(+), 6 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 1b5318a83412..b357a2fbff30 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -984,6 +984,91 @@ def _impl(inputs, attr, params): return AttrCvt(op_name=name)(inputs, attr) return _impl +def _space_to_batch_nd(): + def _impl(inputs, attr, params): + input_node = inputs[0] + input_shape = attr['_input_shapes'][input_node] + block_shape = params.pop(inputs[1].name_hint).asnumpy().tolist() + paddings = params.pop(inputs[2].name_hint).asnumpy().tolist() + N = len(input_shape) + M = len(block_shape) + batch = input_shape[0] + remaining_shape_length = N - M - 1 + paddings = [(0, 0)] + paddings + [(0, 0)] * remaining_shape_length + # From https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/space-to-batch-n-d: + # Zero-pad the start and end of dimensions [1, ..., M] of the input according to paddings + # to produce padded of shape padded_shape. + padded = tvm.relay.nn.pad(input_node, pad_width=paddings) + # Reshape padded to reshaped_padded of shape: + # [batch] + [padded_shape[1] / block_shape[0], block_shape[0], ..., + # padded_shape[M] / block_shape[M-1], block_shape[M-1]] + remaining_shape + shape1 = [batch] + [item for i in range(M) for item in [-4, -1, block_shape[i]]] + [-2] + reshaped_padded = tvm.relay.reshape(padded, newshape=shape1) + # Permute dimensions of reshaped_padded to produce permuted_reshaped_padded of shape: + # block_shape + [batch] + [padded_shape[1] / block_shape[0], ..., + # padded_shape[M] / block_shape[M-1]] + remaining_shape + axes = [2 * i + 2 for i in range(M)] + [0] + [2 * i + 1 for i in range(M)] + \ + list(range(1 + 2 * M, 1 + 2 * M + remaining_shape_length)) + permuted_reshaped_padded = tvm.relay.transpose(reshaped_padded, axes=axes) + permuted_reshaped_padded_shape = _infer_out_shapes(permuted_reshaped_padded, params)[0] + # Reshape permuted_reshaped_padded to flatten block_shape into the batch dimension, + # producing an output tensor of shape: + # [batch * prod(block_shape)] + [padded_shape[1] / block_shape[0], ..., + # padded_shape[M] / block_shape[M-1]] + remaining_shape + shape2 = [batch * np.prod(block_shape)] + list(permuted_reshaped_padded_shape)[M + 1:] + reshaped_permuted_reshaped_padded = tvm.relay.reshape(permuted_reshaped_padded, + newshape=shape2) + return reshaped_permuted_reshaped_padded + + return _impl + + +def _batch_to_space_nd(): + def _impl(inputs, attr, params): + input_node = inputs[0] + input_shape = attr['_input_shapes'][input_node] + block_shape = params.pop(inputs[1].name_hint).asnumpy().tolist() + crops = params.pop(inputs[2].name_hint).asnumpy().tolist() + M = len(block_shape) + batch = input_shape[0] + # From https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/batch-to-space-n-d: + # Reshape input to reshaped of shape: + # [block_shape[0], ..., block_shape[M-1], batch / prod(block_shape), + # input_shape[1], ..., input_shape[N-1]] + shape1 = block_shape + [batch // np.prod(block_shape)] + input_shape[1:] + reshaped 
= tvm.relay.reshape(input_node, newshape=shape1) + # Permute dimensions of reshaped to produce permuted of shape + # [batch / prod(block_shape), input_shape[1], block_shape[0], ..., + # input_shape[M], block_shape[M-1], input_shape[M+1], ..., input_shape[N-1]] + axes = [M] + [axis for i in range(M) for axis in [M + i + 1, i]] + \ + list(range(2 * M + 1, len(shape1))) + permuted = tvm.relay.transpose(reshaped, axes=axes) + # Reshape permuted to produce reshaped_permuted of shape + # [batch / prod(block_shape), input_shape[1] * block_shape[0], ..., + # input_shape[M] * block_shape[M-1], input_shape[M+1], ..., input_shape[N-1]] + shape2 = [0] + [-3] * M + [-2] + reshaped_permuted = tvm.relay.reshape(permuted, newshape=shape2) + # Crop the start and end of dimensions [1, ..., M] of reshaped_permuted according to crops + # to produce the output of shape: + # [batch / prod(block_shape), input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1], + # ..., input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1], + # input_shape[M+1], ..., input_shape[N-1]] + reshaped_permuted_shape = _infer_out_shapes(reshaped_permuted, params)[0] + cropped = reshaped_permuted + for axis in range(1, M+1): + crop = crops[axis - 1] + if crop != [0, 0]: + indices = tvm.relay.arange( + crop[0], + reshaped_permuted_shape[axis] - crop[1], + dtype='int32' + ) + cropped = tvm.relay.take(cropped, indices=indices, axis=axis) + + return cropped + + return _impl + # compatible operators that do NOT require any conversion. _identity_list = [] @@ -1060,6 +1145,8 @@ def _impl(inputs, attr, params): 'Split' : _split(False), 'SplitV' : _split(True), 'Unpack' : _unpack(), + 'SpaceToBatchND' : _space_to_batch_nd(), + 'BatchToSpaceND' : _batch_to_space_nd(), } def _LSTMBlockCell(): diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 35dca8008dfc..7e7c1510c60b 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -161,6 +161,7 @@ def is_gpu_available(): else: return False + ####################################################################### # Pooling # ------- @@ -221,6 +222,19 @@ def test_forward_pooling(): dilation_rate=[1, 1], strides=[2, 1]) + # Tests involving SpaceToBatchND + _test_pooling(input_shape=[1, 1, 2, 1], + window_shape=[1, 1], + padding='VALID', + pooling_type=pool_type, + dilation_rate=[1, 2]) + + _test_pooling(input_shape=[1, 2, 1], + window_shape=[1], + padding='VALID', + pooling_type=pool_type, + dilation_rate=[2]) + ####################################################################### # Convolution # ----------- @@ -229,12 +243,8 @@ def _test_convolution(tensor_in_sizes, filter_in_sizes, dilations, strides, padding, data_format): """ One iteration of convolution with given shapes and attributes """ - total_size_1 = 1 - total_size_2 = 1 - for s in tensor_in_sizes: - total_size_1 *= s - for s in filter_in_sizes: - total_size_2 *= s + total_size_1 = np.prod(tensor_in_sizes) + total_size_2 = np.prod(filter_in_sizes) # Initializes the input tensor with array containing incrementing # numbers from 1. 
data_array = [f * 1.0 for f in range(1, total_size_1 + 1)] @@ -253,6 +263,7 @@ def _test_convolution(tensor_in_sizes, filter_in_sizes, nn_ops.conv2d(in_data, in_filter, strides=strides, + dilations=dilations, padding=padding, data_format=data_format) @@ -271,6 +282,116 @@ def test_forward_convolution(): _test_convolution([4, 17, 17, 124], [1, 1, 124, 19], [1, 1], [1, 1], 'SAME', 'NHWC') _test_convolution([4, 17, 17, 12], [3, 3, 12, 32], [1, 1], [2, 2], 'VALID', 'NHWC') +####################################################################### +# SpaceToBatchND +# -------------- +def _test_space_to_batch_nd(input_shape, block_shape, paddings, dtype='int32'): + data = np.random.uniform(0, 5, size=input_shape).astype(dtype) + + with tf.Graph().as_default(): + in_data = tf.placeholder(shape=input_shape, dtype=dtype) + out = tf.space_to_batch_nd(in_data, block_shape, paddings) + + compare_tf_with_tvm(data, in_data.name, out.name) + +def test_forward_space_to_batch_nd(): + # test cases: https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/space-to-batch-n-d + _test_space_to_batch_nd( + input_shape=[1, 2, 2, 1], + block_shape=[2, 2], + paddings=[[0, 0], [0, 0]] + ) + + _test_space_to_batch_nd( + input_shape=[1, 2, 2, 3], + block_shape=[2, 2], + paddings=[[0, 0], [0, 0]] + ) + + _test_space_to_batch_nd( + input_shape=[1, 4, 4, 1], + block_shape=[2, 2], + paddings=[[0, 0], [0, 0]] + ) + + _test_space_to_batch_nd( + input_shape=[2, 2, 4, 1], + block_shape=[2, 2], + paddings=[[0, 0], [2, 0]], + dtype='int64' + ) + + # pylint: disable=line-too-long + # https://github.com/tensorflow/tensorflow/blob/24f578/tensorflow/python/kernel_tests/spacetobatch_op_test.py + _test_space_to_batch_nd( + input_shape=[2, 3], + block_shape=[2], + paddings=[[1, 0]], + dtype='float32' + ) + + _test_space_to_batch_nd( + input_shape=[2, 3, 2], + block_shape=[2], + paddings=[[1, 0]], + dtype='float64' + ) + +####################################################################### +# BatchToSpaceND +# -------------- +def _test_batch_to_space_nd(input_shape, block_shape, crops, dtype='int32'): + data = np.random.uniform(0, 5, size=input_shape).astype(dtype) + + with tf.Graph().as_default(): + in_data = tf.placeholder(shape=input_shape, dtype=dtype) + out = tf.batch_to_space_nd(in_data, block_shape, crops) + + compare_tf_with_tvm(data, in_data.name, out.name) + +def test_forward_batch_to_space_nd(): + # test cases: https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/batch-to-space-n-d + _test_batch_to_space_nd( + input_shape=[4, 1, 1, 1], + block_shape=[2, 2], + crops=[[0, 0], [0, 0]] + ) + + _test_batch_to_space_nd( + input_shape=[4, 1, 1, 3], + block_shape=[2, 2], + crops=[[0, 0], [0, 0]] + ) + + _test_batch_to_space_nd( + input_shape=[4, 2, 2, 1], + block_shape=[2, 2], + crops=[[0, 0], [0, 0]] + ) + + _test_batch_to_space_nd( + input_shape=[8, 1, 3, 1], + block_shape=[2, 2], + crops=[[0, 0], [2, 0]], + dtype='int64' + ) + + # pylint: disable=line-too-long + # https://github.com/tensorflow/tensorflow/blob/24f578/tensorflow/python/kernel_tests/batchtospace_op_test.py + _test_batch_to_space_nd( + input_shape=[18, 2, 1, 2], + block_shape=[2, 3], + crops=[[1, 1], [0, 0]], + dtype='float32' + ) + + _test_batch_to_space_nd( + input_shape=[20, 5, 8, 7], + block_shape=[2, 2], + crops=[[1, 1], [1, 1]], + dtype='float64' + ) + ####################################################################### # Reshape # ------- @@ -1312,6 +1433,8 @@ def test_forward_rel_ops(): _test_forward_concat_v2() test_forward_lrn() 
test_forward_l2_normalize() + test_forward_space_to_batch_nd() + test_forward_batch_to_space_nd() # End to End test_forward_inception_v3() From 918729d3c0ae6fc23c15b2fa87ef6c0f574330a1 Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Sat, 13 Apr 2019 03:37:08 +0800 Subject: [PATCH 004/106] [AutoTVM] Fix typos (#3014) Signed-off-by: Ce Gao --- tutorials/autotvm/tune_simple_template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py index dac4f157f44a..832e060312d4 100644 --- a/tutorials/autotvm/tune_simple_template.py +++ b/tutorials/autotvm/tune_simple_template.py @@ -63,7 +63,7 @@ # -------------------------------- # In this section, we will rewrite a deterministic tvm schedule code to a # tunable schedule template. You can regard the process of search space definition -# as the parametrization of our exiting schedule code. +# as the parametrization of our existing schedule code. # # To begin with, here is how we implement a blocked matrix multiplication in tvm. From 6a9c060e8834f591e7e6ca7af3e37aab4df0a215 Mon Sep 17 00:00:00 2001 From: eqy Date: Fri, 12 Apr 2019 14:17:38 -0700 Subject: [PATCH 005/106] [NIT] fix relay invariant error message (#3011) * [NIT] fix common error message Extremely minor issue, but this is one of the most common error messages people see... * Update type_solver.cc trigger CI --- src/relay/pass/type_solver.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/relay/pass/type_solver.cc b/src/relay/pass/type_solver.cc index 2ce441069aa9..84f72e0d5a00 100644 --- a/src/relay/pass/type_solver.cc +++ b/src/relay/pass/type_solver.cc @@ -460,7 +460,7 @@ Type TypeSolver::Resolve(const Type& type) { } bool TypeSolver::Solve() { - // update until queue is empty + // Update until queue is empty. while (!update_queue_.empty()) { RelationNode* rnode = update_queue_.front(); const auto& rel = rnode->rel; @@ -496,7 +496,7 @@ bool TypeSolver::Solve() { rnode->resolved = false; this->ReportError( RELAY_ERROR( - "an internal invariant was violdated while " \ + "an internal invariant was violated while " \ "typechecking your program " << err.what()), rnode->location); } From f7438f593e3a61cb98ee27edec56f37afcd4eb4b Mon Sep 17 00:00:00 2001 From: Logan Weber <36520469+weberlo@users.noreply.github.com> Date: Fri, 12 Apr 2019 15:43:37 -0700 Subject: [PATCH 006/106] [Relay] Add gradient operator tutorial docs (#2751) * Add gradient operator tutorial docs * Incorporate Steven's and Ziheng's feedback * Remove TODO about `collapse_sum_like` * Add more examples --- docs/dev/relay_add_op.rst | 104 ++++++++++++++++++++++++++++++++++ src/relay/pass/pattern_util.h | 5 ++ 2 files changed, 109 insertions(+) diff --git a/docs/dev/relay_add_op.rst b/docs/dev/relay_add_op.rst index c17e8318bc1f..466dca038185 100644 --- a/docs/dev/relay_add_op.rst +++ b/docs/dev/relay_add_op.rst @@ -156,6 +156,110 @@ before producing the call node: tup = Tuple(list(args)) return _make.concat(tup) +Gradient Operators +------------------ + +Gradient operators are important for writing differentiable programs in +Relay. While it is the case that Relay's autodiff algorithm can differentiate +first-class language constructs, operators are opaque. Because Relay can't +look into the implementation, an explicit differentiation rule must be +provided. + +Both Python and C++ can be used to write gradient operators, but we focus our +examples on Python, as it is more commonly used. 
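+
+Before diving into the examples below, it helps to keep the chain rule in
+mind: for an operator :math:`f` with inputs :math:`x_i`, autodiff needs
+:math:`\frac{\partial L}{\partial x_i} = \frac{\partial L}{\partial f} \cdot \frac{\partial f}{\partial x_i}`
+for each input. The incoming :math:`\frac{\partial L}{\partial f}` is the
+``grad`` argument in the interfaces below, and each registered rule is
+responsible for multiplying it into its own partial derivatives.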
+
+Adding a Gradient in Python
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A collection of Python gradient operators can be found in
+``python/tvm/relay/op/_tensor_grad.py``. We will walk through two
+representative examples: ``sigmoid`` and ``multiply``.
+
+.. code:: python
+
+    @register_gradient("sigmoid")
+    def sigmoid_grad(orig, grad):
+        """Returns [grad * sigmoid(x) * (1 - sigmoid(x))]."""
+        return [grad * orig * (ones_like(orig) - orig)]
+
+The inputs here are the original operator ``orig`` and a gradient ``grad`` to
+accumulate into. What we return is a list, where the element at the i'th
+index is the derivative of the operator with respect to the operator's i'th
+input. In general, the gradient will return a list with as many elements as
+there are inputs to the base operator.
+
+Before we analyze this definition further, we should recall the derivative of
+the sigmoid function: :math:`\frac{\partial \sigma}{\partial x} = \sigma(x)(1 - \sigma(x))`.
+The definition above looks similar to the mathematical definition, but there
+is one important addition, which we describe below.
+
+The term ``orig * (ones_like(orig) - orig)`` directly matches the derivative,
+because ``orig`` here is the sigmoid expression itself, but we're not just
+interested in how to compute the gradient of this function. We're interested
+in composing this gradient with other gradients, so we can accumulate the
+gradient across an entire program. This is where the ``grad`` term comes in.
+In the expression ``grad * orig * (ones_like(orig) - orig)``, multiplying by
+``grad`` specifies how to compose the derivative with the gradient thus far.
+
+Now, we consider ``multiply``, a slightly more interesting example:
+
+.. code:: python
+
+    @register_gradient("multiply")
+    def multiply_grad(orig, grad):
+        """Returns [grad * y, grad * x]."""
+        x, y = orig.args
+        return [collapse_sum_like(grad * y, x),
+                collapse_sum_like(grad * x, y)]
+
+In this example, there are two elements in the returned list, because
+``multiply`` is a binary operator. Recall that if :math:`f(x, y) = xy`, the
+partial derivatives are :math:`\frac{\partial f}{\partial x} = y` and
+:math:`\frac{\partial f}{\partial y} = x`.
+
+There is one required step for ``multiply`` that is not required for
+``sigmoid``, because ``multiply`` has broadcasting semantics. Since the shape
+of ``grad`` might not match the shape of the inputs, we use
+``collapse_sum_like`` to take the contents of the ``grad * <var>`` terms and
+make the shape match the shape of the input we're differentiating with
+respect to.
+
+Adding a Gradient in C++
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Adding a gradient in C++ is similar to adding one in Python, but the
+interface for registering is slightly different.
+
+First, make sure ``src/relay/pass/pattern_util.h`` is included. It provides
+helper functions for creating nodes in the Relay AST. Then, define the
+gradient in a similar fashion as in the Python example:
+
+.. code:: c
+
+    tvm::Array<Expr> MultiplyGrad(const Expr& orig_call, const Expr& output_grad) {
+        const Call& call = Downcast<Call>(orig_call);
+        return { CollapseSumLike(Multiply(output_grad, call->args[1]), call->args[0]),
+                 CollapseSumLike(Multiply(output_grad, call->args[0]), call->args[1]) };
+    }
+
+Notice that in C++ we can't use the same operator overloading that we have in
+Python, and we need to downcast, so the implementation is more verbose. Even
+so, we can easily verify that this definition mirrors the earlier example in
+Python.
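+
+Once the registration shown below is in place, a gradient can be checked end
+to end from Python. The following is a minimal sketch, not a normative
+recipe: it assumes the first-order ``gradient`` pass currently exposed as
+``tvm.relay.ir_pass.gradient``, and seeds the output gradient with ones, so
+for ``f(x, y) = x * y`` the input gradients should come back as ``y`` and
+``x``:
+
+.. code:: python
+
+    import numpy as np
+    import tvm
+    from tvm import relay
+    from tvm.relay.ir_pass import gradient, infer_type
+
+    x = relay.var("x", shape=(1, 4))
+    y = relay.var("y", shape=(1, 4))
+    func = infer_type(relay.Function([x, y], x * y))
+    # gradient() consults the registered "FPrimalGradient" attributes and
+    # builds a function returning (forward output, (d/dx, d/dy)).
+    back_func = infer_type(gradient(func))
+
+    ex = relay.create_executor()
+    a = tvm.nd.array(np.random.rand(1, 4).astype("float32"))
+    b = tvm.nd.array(np.random.rand(1, 4).astype("float32"))
+    forward, (dx, dy) = ex.evaluate(back_func)(a, b)
+    np.testing.assert_allclose(dx.asnumpy(), b.asnumpy())
+    np.testing.assert_allclose(dy.asnumpy(), a.asnumpy())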
+
+Now, instead of using a Python decorator, we need to tack a ``set_attr`` call
+for "FPrimalGradient" onto the end of the base operator's registration, in
+order to register the gradient.
+
+.. code:: c
+
+    RELAY_REGISTER_OP("multiply")
+        // ...
+        // Set other attributes
+        // ...
+        .set_attr<FPrimalGradient>("FPrimalGradient", MultiplyGrad);
+
 Summary
 -------
diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h
index 22307d12303e..1e4060fe6c75 100644
--- a/src/relay/pass/pattern_util.h
+++ b/src/relay/pass/pattern_util.h
@@ -328,6 +328,11 @@ inline Expr OnesLike(Expr e) {
   return CallNode::make(op, {e});
 }
 
+inline Expr CollapseSumLike(Expr data, Expr collapse_type) {
+  static const Op& op = Op::Get("collapse_sum_like");
+  return CallNode::make(op, {data, collapse_type});
+}
+
 inline Expr Power(Expr lhs, Expr rhs) {
   static const Op& op = Op::Get("power");
   return CallNode::make(op, {lhs, rhs}, Attrs(), {});

From 07114145546f1603d8fe4ff318248d20ab65b635 Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Fri, 12 Apr 2019 16:13:45 -0700
Subject: [PATCH 007/106] [Relay] C++ GraphRuntimeCodegen, Deprecate Python2
 (#2986)

* [Relay] C++ GraphRuntimeCodegen
* [Test] Deprecate Python2
* [Python3] Add Py2 check
* Update _pyversion.py
* [Python3] Update test
---
 CMakeLists.txt                               |   1 +
 Jenkinsfile                                  |   4 +-
 python/tvm/__init__.py                       |   2 +
 python/tvm/_pyversion.py                     |  25 +
 .../relay/backend/graph_runtime_codegen.py   | 445 +---------
 src/relay/backend/graph_runtime_codegen.cc   | 760 ++++++++++++++++++
 src/relay/backend/utils.h                    |  79 ++
 src/relay/pass/pattern_util.h                |   1 -
 tests/python/relay/test_pass_annotation.py   |   8 +-
 tests/python/unittest/test_module_load.py    |   2 +-
 tests/scripts/task_java_unittest.sh          |   8 +-
 tests/scripts/task_python_frontend.sh        |   2 -
 tests/scripts/task_python_integration.sh     |   7 +-
 tests/scripts/task_python_topi.sh            |   2 -
 tests/scripts/task_python_unittest.sh        |   3 -
 tests/scripts/task_python_vta.sh             |   3 -
 16 files changed, 924 insertions(+), 428 deletions(-)
 create mode 100644 python/tvm/_pyversion.py
 create mode 100644 src/relay/backend/graph_runtime_codegen.cc
 create mode 100644 src/relay/backend/utils.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5a6b85e43c0d..76da288eba9e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -85,6 +85,7 @@ else(MSVC)
   include(CheckCXXCompilerFlag)
   check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11)
   if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
+    message("Build in Debug mode")
     set(CMAKE_C_FLAGS "-O0 -g -Wall -fPIC ${CMAKE_C_FLAGS} -rdynamic")
     set(CMAKE_CXX_FLAGS "-O0 -g -Wall -fPIC -std=c++11 ${CMAKE_CXX_FLAGS} -rdynamic")
   else()
diff --git a/Jenkinsfile b/Jenkinsfile
index dc3a56234509..4765538a3806 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -214,7 +214,7 @@ stage('Build') {
 }
 
 stage('Unit Test') {
-  parallel 'python2/3: GPU': {
+  parallel 'python3: GPU': {
     node('GPU') {
       ws('workspace/tvm/ut-python-gpu') {
         init_git()
@@ -226,7 +226,7 @@ stage('Unit Test') {
       }
     }
   },
-  'python2/3: i386': {
+  'python3: i386': {
     node('CPU') {
       ws('workspace/tvm/ut-python-i386') {
         init_git()
diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index c0470ade60f9..ce6f0602a572 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -18,6 +18,8 @@
 """TVM: Low level DSL/IR stack for tensor computation."""
 from __future__ import absolute_import as _abs
 
+from . import _pyversion
+
 from . import tensor
 from . import arith
 from .
import expr diff --git a/python/tvm/_pyversion.py b/python/tvm/_pyversion.py new file mode 100644 index 000000000000..a46b22028387 --- /dev/null +++ b/python/tvm/_pyversion.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Python2 version check +""" +import sys + +if not (sys.version_info[0] >= 3 and sys.version_info[1] >= 5): + PY3STATEMENT = """TVM project proudly dropped support of Python2. + The minimal Python requirement is Python 3.5 + """ + raise Exception(PY3STATEMENT) diff --git a/python/tvm/relay/backend/graph_runtime_codegen.py b/python/tvm/relay/backend/graph_runtime_codegen.py index cc3f14ba3951..ea1846b93beb 100644 --- a/python/tvm/relay/backend/graph_runtime_codegen.py +++ b/python/tvm/relay/backend/graph_runtime_codegen.py @@ -21,7 +21,7 @@ First we define a compiler from a single Relay expression to the graph langauge. We require the expression to be a function. -The function's parameters correpond to the placeholder/inputs +The function's parameters correspond to the placeholder/inputs and model parameters found in the computation graph representation. The body of the function represents the computation graph. @@ -31,387 +31,44 @@ To connect to the graph runtime, we use a printer that converts our graph format into TVM's JSON format. The resulting string can be loaded by -contrib.graph_runtime or any other TVM runtime comptatible system. +contrib.graph_runtime or any other TVM runtime compatible systems. """ - from __future__ import absolute_import -import json -from collections import defaultdict, OrderedDict -import attr -from . import _backend -from . import compile_engine -from ..op import Op -from ..expr import Function, GlobalVar -from ..expr_functor import ExprFunctor -from ..ty import TupleType, TensorType -from ... 
import target as _target - - -@attr.s -class NodeRef(object): - """A reference to a node, used for constructing the graph.""" - ident = attr.ib() - index = attr.ib(default=0) - version = attr.ib(default=0) - - def to_json(self): - return [self.ident, self.index, self.version] - - -@attr.s -class Node(object): - """The base class for nodes in the TVM runtime system graph input.""" - name = attr.ib() - attrs = attr.ib() - - def to_json(self): - raise Exception("Abstract method, please implement me.") - - -@attr.s -class InputNode(Node): - """An input node in the TVM runtime system graph input.""" - name = attr.ib() - attrs = attr.ib() - - def to_json(self): - return { - "op": "null", - "name": self.name, - "inputs": [] - } +from tvm.ndarray import empty +from tvm._ffi.function import _init_api -@attr.s -class OpNode(Node): - """An operator node in the TVM runtime system"s graph input.""" - op_name = attr.ib() - inputs = attr.ib() - op_attrs = attr.ib() - num_outputs = attr.ib(default=1) +from tvm.relay import build_module +from tvm import target as _target - def to_json(self): - attrs = dict.copy(self.op_attrs) - # Extend ops with extra info. - attrs["func_name"] = self.op_name - attrs["flatten_data"] = "0" - attrs["num_inputs"] = str(len(self.inputs)) - attrs["num_outputs"] = str(self.num_outputs) +_init_api("tvm.relay.build_module") - return { - "op": "tvm_op", - "name": self.name, - "attrs": attrs, - "inputs": self.inputs - } - - -def shape_to_json(shape): - """Convert symbolic shape to json compatible forma.""" - return [sh.value for sh in shape] - - -class GraphRuntimeCodegen(ExprFunctor): +class GraphRuntimeCodegen(object): """The compiler from Relay to the TVM runtime system.""" - nodes = attr.ib() - var_map = attr.ib() def __init__(self, mod, target): - ExprFunctor.__init__(self) - self.mod = mod - self.target = target - self.nodes = [] - self.var_map = {} - self.params = {} - self.storage_device_map = None - self.compile_engine = compile_engine.get() - self.lowered_funcs = defaultdict(set) - self._name_map = {} - - def add_node(self, node, expr): - """ - Add a node to the graph. - - Parameters - ---------- - node: Node - The node to add to the graph. - - expr: tvm.relay.Expr - The corresponding expression. - - Returns - ------- - node_ref: Union[NodeRef, List[NodeRef]] - A reference to the node. - """ - checked_type = expr.checked_type - # setup storage ids - assert expr in self.storage_device_map - storage_device_info = self.storage_device_map[expr] - assert len(storage_device_info) == 2 - node.attrs["storage_id"] = [x.value for x in storage_device_info[0]] - device_types = [x.value for x in storage_device_info[1]] - num_unknown_devices = device_types.count(0) - if num_unknown_devices != 0 and num_unknown_devices != len(device_types): - raise RuntimeError("The graph contains not annotated nodes for " - "heterogeneous execution. All nodes must be " - "annotated.") - - # Add the `device_index` attribute when the graph is annotated. 
- if num_unknown_devices == 0: - node.attrs["device_index"] = device_types - - node_id = len(self.nodes) - self.nodes.append(node) - # Tuple return value, flatten as tuple - if isinstance(checked_type, TupleType): - ret = [] - shape = [] - dtype = [] - for i, typ in enumerate(checked_type.fields): - if not isinstance(typ, TensorType): - raise RuntimeError("type %s not supported" % typ) - ret.append(NodeRef(node_id, i)) - shape.append(shape_to_json(typ.shape)) - dtype.append(typ.dtype) - node.attrs["shape"] = shape - node.attrs["dtype"] = dtype - assert isinstance(node, OpNode) - node.num_outputs = len(checked_type.fields) - return tuple(ret) - # Normal tensor return type - if not isinstance(checked_type, TensorType): - raise RuntimeError("type %s not supported" % checked_type) - node.attrs["shape"] = [shape_to_json(checked_type.shape)] - node.attrs["dtype"] = [checked_type.dtype] - node.num_outputs = 1 - return NodeRef(node_id, 0) - - def visit_tuple(self, vtuple): - fields = [] - for field in vtuple.fields: - ref = self.visit(field) - assert isinstance(ref, NodeRef) - fields.append(ref) - return tuple(fields) - - def visit_tuple_getitem(self, op): - vtuple = self.visit(op.tuple_value) - assert isinstance(vtuple, tuple) - return vtuple[op.index] - - def visit_constant(self, op): - index = len(self.params) - name = "p%d" % index - self.params[name] = op.data - node = InputNode(name, {}) - return self.add_node(node, op) - - def visit_function(self, _): - raise RuntimeError("function not supported") - - def visit_if(self, _): - raise RuntimeError("if not supported") - - def visit_global_var(self, _): - raise RuntimeError() - - def visit_let(self, let): - """ - Visit the let binding, by first traversing its value, - then setting the metadata on the returned NodeRef. - - Finally visit the body, and return the NodeRef corresponding - to it. - - Parameters - ---------- - let: tvm.relay.Expr - The let binding to transform. - - Returns - ------- - ref: NodeRef - The node reference to the body. - """ - assert let.var not in self.var_map - self.var_map[let.var] = self.visit(let.value) - return self.visit(let.body) - - def visit_var(self, rvar): - return self.var_map[rvar] - - def visit_call(self, call): - """Transform a ::tvm.relay.Call into an operator in the TVM graph.""" - if isinstance(call.op, Op): - raise Exception( - "Operators should be transformed away; try applying" + - "the fuse_ops transformation to the expression.") - elif isinstance(call.op, GlobalVar): - func = self.mod[call.op] - elif isinstance(call.op, Function): - func = call.op - else: - raise Exception( - "TVM runtime does not support calls to {0}".format(type(call.op))) - if int(func.attrs.Primitive) != 1: - raise Exception( - "TVM only support calls to primitive functions " + - "(i.e functions composed of fusable operator invocations)") - - assert call in self.storage_device_map - device_types = self.storage_device_map[call][1] - call_dev_type = device_types[0].value - if isinstance(self.target, (str, _target.Target)): - # homogeneous execution. - cached_func = self.compile_engine.lower(func, self.target) - self.target = {0: str(self.target)} - elif isinstance(self.target, dict): - # heterogeneous execution. 
- if call_dev_type not in self.target: - raise Exception("No target is provided for device " + - "{0}".format(call_dev_type)) - cached_func = self.compile_engine.lower(func, - self.target[call_dev_type]) - else: - raise ValueError("self.target must be the type of str," + - "tvm.target.Target, or dict of int to str") - for loweredf in cached_func.funcs: - self.lowered_funcs[self.target[call_dev_type]].add(loweredf) - - inputs = [] - # flatten tuple in the call. - for arg in call.args: - res = self.visit(arg) - if isinstance(arg.checked_type, TupleType): - assert isinstance(res, tuple) - inputs += res - else: - inputs.append(res) - - inputs = [x.to_json() for x in inputs] - op_name = cached_func.func_name - op_node = OpNode(self._get_unique_name(op_name), {}, - op_name, inputs, {}) - return self.add_node(op_node, call) - - def visit_op(self, _): - raise Exception("can not compile op in non-eta expanded form") - - def visit_ref_create(self, _): - raise RuntimeError("reference not supported") - - def visit_ref_read(self, _): - raise RuntimeError("reference not supported") - - def visit_ref_write(self, _): - raise RuntimeError("reference not supported") - - def visit_constructor(self, _): - raise Exception("ADT constructor case not yet implemented") - - def visit_match(self, _): - raise Exception("match case not yet implemented") - - def _get_json(self): - """ - Convert the sequence of nodes stored by the compiler into the - TVM graph runtime format. - - Returns - ------- - graph_json : str - The generated JSON as a string. - """ - nodes = [] - # First we compute "nodes" field. - for node in self.nodes: - nodes.append(node.to_json()) - - arg_nodes = [] - # Compute "arg_nodes" and "heads" fields. - for i, node in enumerate(self.nodes): - if isinstance(node, InputNode): - arg_nodes.append(i) - - heads = self.heads - heads = heads if isinstance(heads, tuple) else [heads] - heads = [x.to_json() for x in heads] - - # Compute "node_row_ptr" and entry attributes. - num_entry = 0 - shapes = [] - storage_ids = [] - device_types = [] - dltypes = [] - node_row_ptr = [0] - for node in self.nodes: - assert node.num_outputs == len(node.attrs["shape"]) - shapes += node.attrs["shape"] - dltypes += node.attrs["dtype"] - storage_ids += node.attrs["storage_id"] - if "device_index" in node.attrs: - device_types += node.attrs["device_index"] - num_entry += node.num_outputs - node_row_ptr.append(num_entry) - - # Compute "attrs" field. 
- attrs = {} - attrs["shape"] = ["list_shape", shapes] - attrs["storage_id"] = ["list_int", storage_ids] - if device_types: - attrs["device_index"] = ["list_int", device_types] - attrs["dltype"] = ["list_str", dltypes] - - # Metadata definitions - def nested_defaultdict(): - return defaultdict(nested_defaultdict) - metadata = nested_defaultdict() - for node_id in arg_nodes: - node_name = nodes[node_id]['name'] - if node_name not in self.params: - metadata['signatures']['default']['inputs'][node_name]['id'] = node_id - metadata['signatures']['default']['inputs'][node_name]['dtype'] = dltypes[node_id] - metadata['signatures']['default']['inputs'][node_name]['shape'] = shapes[node_id] - for node_id in heads: - node_name = nodes[node_id[0]]['name'] - metadata['signatures']['default']['outputs'][node_name]['id'] = node_id[0] - metadata['signatures']['default']['outputs'][node_name]['dtype'] = dltypes[node_id[0]] - metadata['signatures']['default']['outputs'][node_name]['shape'] = shapes[node_id[0]] - - # Keep 'metadata' always at end - json_dict = OrderedDict([ - ("nodes", nodes), - ("arg_nodes", arg_nodes), - ("heads", heads), - ("attrs", attrs), - ("node_row_ptr", node_row_ptr), - ("metadata", metadata), - ]) - - return json.dumps(json_dict, indent=2) - - def debug_dump_memory_plan(self, func): - """Debug function to dump memory plan.""" - def _annotate(expr): - if expr in self.storage_device_map: - storage_device_info = self.storage_device_map[expr] - assert len(storage_device_info) == 2 - return str(storage_device_info[0]) - return "" - return func.astext(show_meta_data=False, annotate=_annotate) - - def debug_dump_device_annotation(self, func): - """Debug function to dump device annotation result.""" - def _annotate(expr): - if expr in self.storage_device_map: - storage_device_info = self.storage_device_map[expr] - assert len(storage_device_info) == 2 - return str(storage_device_info[1]) - return "" - return func.astext(show_meta_data=False, annotate=_annotate) - + self._mod = build_module._GraphRuntimeCodegen() + self._init = self._mod["init"] + self._codegen = self._mod["codegen"] + self._get_graph_json = self._mod["get_graph_json"] + self._list_params_name = self._mod["list_params_name"] + self._get_param_by_name = self._mod["get_param_by_name"] + self._get_lowered_funcs = self._mod["get_lowered_funcs"] + self._setup(mod, target) + + def _setup(self, mod, target): + tgts = [] + if isinstance(target, dict): + for kv in target.items(): + tgts.append(kv[0]) + if isinstance(kv[1], (str, _target.Target)): + tgts.append(str(kv[1])) + else: + raise Exception("Unknown target type") + elif isinstance(target, (str, _target.Target)): + tgts.append("0") + tgts.append(str(target)) + self._init(mod, tgts) def codegen(self, func): """Compile a single function into a graph. @@ -425,38 +82,20 @@ def codegen(self, func): ------- graph_json : str The graph json that can be consumed by runtime. - lowered_funcs : List[tvm.LoweredFunc] or Dict[str, List[tvm.LoweredFunc]] The lowered functions. - params : Dict[str, tvm.nd.NDArray] Additional constant parameters. """ - self.storage_device_map = _backend.GraphPlanMemory(func) - # First we convert all the parameters into input nodes. - for param in func.params: - node = InputNode(param.name_hint, {}) - self.var_map[param] = self.add_node(node, param) - - # Then we compile the body into a graph which can depend - # on input variables. 
- self.heads = self.visit(func.body) - graph_json = self._get_json() - - # Return the lowered functions as a list for homogeneous compilation. - # Otherwise, for heterogeneous compilation, a dictionary containing - # the device id to a list of lowered functions is returned. Both forms - # are acceptable to tvm.build. - if not isinstance(self.target, dict): - lowered_funcs = list(list(self.lowered_funcs.values())[0]) - else: - lowered_funcs = {k: list(v) for k, v in self.lowered_funcs.items()} - return graph_json, lowered_funcs, self.params - - def _get_unique_name(self, name): - if name not in self._name_map: - self._name_map[name] = 1 - return name - index = self._name_map[name] - self._name_map[name] += 1 - return self._get_unique_name(name + str(index)) + self._codegen(func) + graph_json = self._get_graph_json() + lowered_func = self._get_lowered_funcs() + param_names = self._list_params_name() + params = {} + for name in param_names: + key = name.value + arr = self._get_param_by_name(key) + param = empty(arr.shape, dtype=arr.dtype, ctx=arr.ctx) + arr.copyto(param) + params[key] = param + return graph_json, lowered_func, params diff --git a/src/relay/backend/graph_runtime_codegen.cc b/src/relay/backend/graph_runtime_codegen.cc new file mode 100644 index 000000000000..beb13032ce55 --- /dev/null +++ b/src/relay/backend/graph_runtime_codegen.cc @@ -0,0 +1,760 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file relay/backend/graph_codegen.cc + * \brief Graph runtime codegen + */ + +#include +#include +#include +#include +#include + + +#include +#include +#include + +#include "utils.h" +#include "compile_engine.h" + +namespace tvm { +namespace relay { +namespace backend { + +class GraphNode; +class GraphInputNode; +class GraphOpNode; + +using IntegerArray = Array; +using ShapeVector = std::vector >; +using GraphAttrs = std::unordered_map; +using GraphNodePtr = std::shared_ptr; +using GraphInputNodePtr = std::shared_ptr; +using GraphOpNodePtr = std::shared_ptr; +using TargetsMap = std::unordered_map; + +/*! \brief Lowered outputs */ +struct LoweredOutput { + std::string graph_json; + Map > lowered_funcs; + std::unordered_map params; +}; + +/*! 
\brief Node types */ +enum GraphNodeType { + kGraphNop, + kGraphInputNode, + kGraphOpNode, +}; + +class GraphNodeRef { + public: + GraphNodeRef() {} + GraphNodeRef(int ident, int index, int version = 0) + : ident_(ident), index_(index), version_(version) {} + + + inline void Save(dmlc::JSONWriter* writer) const { + writer->BeginArray(); + writer->WriteArrayItem(ident_); + writer->WriteArrayItem(index_); + writer->WriteArrayItem(version_); + writer->EndArray(); + } + + inline void Load(dmlc::JSONReader* reader) { + LOG(FATAL) << "Not implemented."; + } + + protected: + int ident_; + int index_{0}; + int version_{0}; +}; + +/*! \brief Base Node class */ +class GraphNode { + public: + GraphNode() {} + virtual void Save(dmlc::JSONWriter* writer) const {} + virtual void Load(dmlc::JSONReader* reader) {} + virtual GraphNodeType Type() const { return kGraphNop; } + virtual ~GraphNode() {} + + public: + int num_outputs_{1}; + std::string name_; + GraphAttrs attrs_; +}; + +/*! \brief Input Node */ +class GraphInputNode : public GraphNode { + public: + GraphInputNode() {} + GraphInputNode(const std::string& name, const GraphAttrs& attrs) { + name_ = name; + attrs_ = attrs; + } + + GraphNodeType Type() const override { return kGraphInputNode; } + + void Save(dmlc::JSONWriter* writer) const override { + const std::string op_name{"null"}; + writer->BeginObject(); + writer->WriteObjectKeyValue("op", op_name); + writer->WriteObjectKeyValue("name", this->name_); + writer->WriteObjectKeyValue("inputs", std::list()); + writer->EndObject(); + } + static std::shared_ptr make_node_ptr(const std::string& name, + const GraphAttrs& attrs) { + auto ptr = std::make_shared(name, attrs); + return std::dynamic_pointer_cast(ptr); + } +}; + +/*! \brief Op Node */ +class GraphOpNode : public GraphNode { + public: + GraphOpNode() {} + GraphOpNode(const std::string& name, + const GraphAttrs& nd_attrs, + const std::string& op_name, + const std::vector& inputs, + const GraphAttrs& attrs, + size_t num_outputs = 1) { + name_ = name; + attrs_ = nd_attrs; + op_name_ = op_name; + inputs_ = inputs; + op_attrs_ = attrs_; + num_outputs_ = num_outputs; + op_attrs_["func_name"] = op_name_; + op_attrs_["flatten_data"] = std::string("0"); + op_attrs_["num_inputs"] = std::to_string(inputs_.size()); + op_attrs_["num_outputs"] = std::to_string(num_outputs_); + } + + GraphNodeType Type() const override { return kGraphOpNode; } + + void Save(dmlc::JSONWriter* writer) const override { + GraphAttrs attrs = op_attrs_; + attrs["func_name"] = this->op_name_; + attrs["flatten_data"] = std::string("0"); + attrs["num_inputs"] = std::to_string(this->inputs_.size()); + attrs["num_outputs"] = std::to_string(this->num_outputs_); + writer->BeginObject(); + writer->WriteObjectKeyValue("op", op_type_name_); + writer->WriteObjectKeyValue("name", name_); + writer->WriteObjectKeyValue("attrs", attrs); + writer->WriteObjectKeyValue("inputs", this->inputs_); + writer->EndObject(); + } + static std::shared_ptr make_node_ptr(const std::string& name, + const GraphAttrs& nd_attrs, + const std::string& op_name, + const std::vector& inputs, + const GraphAttrs& attrs, + size_t num_outputs = 1) { + auto ptr = std::make_shared(name, nd_attrs, op_name, inputs, attrs, num_outputs); + return std::dynamic_pointer_cast(ptr); + } + + public: + std::string op_name_; + std::vector inputs_; + GraphAttrs op_attrs_; + + private: + const std::string op_type_name_{"tvm_op"}; +}; + +/*! 
\brief Code generator for graph runtime */ +class GraphRuntimeCodegen + : public ::tvm::relay::ExprFunctor(const Expr&)> { + public: + GraphRuntimeCodegen(runtime::Module* mod, + const std::unordered_map& targets) : mod_(mod) { + compile_engine_ = CompileEngine::Global(); + for (auto &kv : targets) { + targets_[kv.first] = Target::create(kv.second); + } + } + + LoweredOutput Codegen(relay::Function func) { + auto pf = GetPackedFunc("relay.backend.GraphPlanMemory"); + storage_device_map_ = (*pf)(func); + // First we convert all the parameters into input nodes. + for (auto param : func->params) { + auto node_ptr = GraphInputNode::make_node_ptr(param->name_hint(), GraphAttrs()); + var_map_[param.get()] = AddNode(node_ptr, param); + } + heads_ = VisitExpr(func->body); + std::ostringstream os; + dmlc::JSONWriter writer(&os); + GetJSON(&writer); + LoweredOutput ret; + ret.graph_json = os.str(); + ret.params = params_; + for (auto& kv : lowered_funcs_) { + if (ret.lowered_funcs.count(kv.first) == 0) { + ret.lowered_funcs.Set(kv.first, Array()); + } + auto& vec = ret.lowered_funcs[kv.first]; + Array tmp; + for (auto f : kv.second) { + tmp.push_back(f); + } + for (auto f : vec) { + tmp.push_back(f); + } + ret.lowered_funcs.Set(kv.first, tmp); + } + return ret; + } + + protected: + /*! + * \brief Extract shape from expr to vector + * + * \param shape + * \return std::vector + */ + std::vector _ShapeToJSON(tvm::Array shape) { + std::vector ret; + for (IndexExpr dim : shape) { + const int64_t* pval = as_const_int(dim); + ret.push_back(*pval); + } + return ret; + } + + /*! + * \brief Add node to graph + * + * \param node + * \param expr + * \return std::vector<_NodeRef> + */ + std::vector AddNode(GraphNodePtr node, Expr expr) { + auto checked_type = expr->checked_type(); + size_t count = storage_device_map_.count(expr); + CHECK_GT(count, 0) << "Expr is not existing in storage plan"; + auto storage_device_info = storage_device_map_[expr]; + CHECK_EQ(storage_device_info.size(), 2); + // storage + std::vector storage_info; + for (auto& v : storage_device_info[0]) { + storage_info.push_back(v->value); + } + node->attrs_["storage_id"] = std::move(storage_info); + // type + std::vector device_types; + for (auto& v : storage_device_info[1]) { + device_types.push_back(v->value); + } + size_t num_unknown_devices = std::count(device_types.begin(), device_types.end(), 0); + if (num_unknown_devices != 0 && num_unknown_devices != device_types.size()) { + LOG(FATAL) << "The graph contains not annotated nodes for " + << "heterogeneous execution. 
All nodes must be " + << "annotated."; + } + if (num_unknown_devices == 0) { + node->attrs_["device_index"] = device_types; + } + auto node_id = nodes_.size(); + nodes_.push_back(node); + // Tuple return value, flatten as tuple + if (const auto* tuple_type = checked_type.as()) { + std::vector ret; + ShapeVector shape; + std::vector dtype; + for (size_t i = 0; i < tuple_type->fields.size(); ++i) { + if (const auto* typ = tuple_type->fields[i].as()) { + ret.push_back(GraphNodeRef(node_id, i)); + shape.emplace_back(_ShapeToJSON(typ->shape)); + dtype.emplace_back(DType2String(typ->dtype)); + } else { + LOG(FATAL) << "type " << checked_type->type_key() << " not supported"; + } + } + CHECK_EQ(node->Type(), kGraphOpNode); + auto op_nd = std::dynamic_pointer_cast(node); + op_nd->attrs_["shape"] = shape; + op_nd->attrs_["dtype"] = dtype; + op_nd->num_outputs_ = tuple_type->fields.size(); + return ret; + } + // Normal tensor return type + if (const auto* tensor_type = checked_type.as()) { + ShapeVector shape; + std::vector dtype; + shape.emplace_back(_ShapeToJSON(tensor_type->shape)); + dtype.emplace_back(DType2String(tensor_type->dtype)); + node->attrs_["shape"] = shape; + node->attrs_["dtype"] = dtype; + } else { + LOG(FATAL) << "type " << checked_type->type_key() << " not supported"; + } + return {GraphNodeRef(node_id, 0)}; + } + + /*! \brief Visitors */ + std::unordered_map, NodeHash, NodeEqual> visitor_cache_; + + std::vector VisitExpr(const Expr& expr) override { + if (visitor_cache_.count(expr)) return visitor_cache_.at(expr); + std::vector res; + if (expr.as()) { + res = VisitExpr_(expr.as()); + } else if (expr.as()) { + res = VisitExpr_(expr.as()); + } else if (expr.as()) { + res = VisitExpr_(expr.as()); + } else if (expr.as()) { + res = VisitExpr_(expr.as()); + } else if (expr.as()) { + res = VisitExpr_(expr.as()); + } else if (expr.as()) { + res = VisitExpr_(expr.as()); + } else if (expr.as()) { + res = VisitExpr_(expr.as()); + } else if (expr.as()) { + res = VisitExpr_(expr.as()); + } else if (expr.as()) { + res = VisitExpr_(expr.as()); + } else if (expr.as()) { + res = VisitExpr_(expr.as()); + } else if (expr.as()) { + res = VisitExpr_(expr.as()); + } else if (expr.as()) { + res = VisitExpr_(expr.as()); + } else if (expr.as()) { + res = VisitExpr_(expr.as()); + } else if (expr.as()) { + res = VisitExpr_(expr.as()); + } else if (expr.as()) { + res = VisitExpr_(expr.as()); + } + visitor_cache_[expr] = res; + return res; + } + + std::vector VisitExpr_(const VarNode* op) override { + Expr expr = GetRef(op); + return var_map_[expr.get()]; + } + + std::vector VisitExpr_(const ConstantNode* op) override { + Expr expr = GetRef(op); + size_t index = params_.size(); + std::string name = "p" + std::to_string(index); + params_[name] = op->data; + auto node = GraphInputNode::make_node_ptr(name, GraphAttrs()); + return AddNode(node, expr); + } + + std::vector VisitExpr_(const TupleNode* op) override { + std::vector fields; + for (auto field : op->fields) { + auto ref_vec = VisitExpr(field); + for (auto ref : ref_vec) { + fields.push_back(ref); + } + } + return fields; + } + std::vector VisitExpr_(const CallNode* op) override { + Expr expr = GetRef(op); + Function func; + if (op->op.as()) { + LOG(FATAL) << "Operators should be transformed away; try applying" + << "the fuse_ops transformation to the expression."; + } else if (op->op.as()) { + LOG(FATAL) << "Not implemented"; + } else if (op->op.as()) { + func = GetRef(op->op.as()); + } else { + LOG(FATAL) << "TVM runtime does not support calls to " << 
op->op->type_key(); + } + if (!func->IsPrimitive()) { + LOG(FATAL) << "TVM only support calls to primitive functions " + << "(i.e functions composed of fusable operator invocations)"; + } + + CHECK_GE(storage_device_map_.count(expr), 0); + auto pf0 = GetPackedFunc("relay.backend._make_CCacheKey"); + auto pf1 = GetPackedFunc("relay.backend._CompileEngineLower"); + auto &device_type = storage_device_map_[expr][1]; + auto call_dev_type = device_type[0]->value; //-> int to string + Target target; + if (targets_.size() == 1) { + // homogeneous execution. + for (auto kv : targets_) { + target = kv.second; + } + } else { + // heterogeneous execution. + const auto call_dev_key = std::to_string(call_dev_type); + const auto call_dev_name = runtime::DeviceName(call_dev_type); + if (targets_.count(call_dev_name) == 0 && targets_.count(call_dev_key) == 0) { + LOG(FATAL) << "No target is provided for device " + << call_dev_name; + } + if (targets_.count(call_dev_key)) { + target = targets_[call_dev_key]; + } else { + target = targets_[call_dev_name]; + } + } + CCacheKey key = (*pf0)(func, target); + CachedFunc lowerd_func = (*pf1)(compile_engine_, key); + if (!lowered_funcs_.count(target->target_name)) { + lowered_funcs_[target->target_name] = {}; + } + for (auto f : lowerd_func->funcs) { + lowered_funcs_[target->target_name].insert(f); + } + + std::vector inputs; + for (auto arg : op->args) { + auto res = VisitExpr(arg); + for (auto nr : res) { + inputs.push_back(nr); + } + } + auto& op_name = lowerd_func->func_name; + auto node = GraphOpNode::make_node_ptr(_GetUniqueName(op_name), + GraphAttrs(), + op_name, + inputs, + GraphAttrs()); + return AddNode(node, expr); + } + + std::vector VisitExpr_(const LetNode* op) override { + CHECK_EQ(var_map_.count(op->var.get()), 0); + var_map_[op->var.get()] = VisitExpr(op->value); + return VisitExpr(op->body); + } + std::vector VisitExpr_(const TupleGetItemNode* op) override { + auto vtuple = VisitExpr(op->tuple); + return {vtuple[op->index]}; + } + std::vector VisitExpr_(const OpNode* op) override { + throw std::runtime_error("can not compile op in non-eta expanded form"); + return {}; + } + std::vector VisitExpr_(const GlobalVarNode* op) override { + throw std::runtime_error(""); + return {}; + } + std::vector VisitExpr_(const IfNode* op) override { + throw std::invalid_argument("if not supported"); + return {}; + } + std::vector VisitExpr_(const FunctionNode* op) override { + throw std::invalid_argument("function not supported"); + return {}; + } + std::vector VisitExpr_(const RefCreateNode* op) override { + throw std::invalid_argument("reference not supported"); + return {}; + } + std::vector VisitExpr_(const RefReadNode* op) override { + throw std::invalid_argument("reference not supported"); + return {}; + } + std::vector VisitExpr_(const RefWriteNode* op) override { + throw std::invalid_argument("reference not supported"); + return {}; + } + std::vector VisitExpr_(const ConstructorNode* op) override { + throw std::invalid_argument("ADT constructor case not yet implemented"); + return {}; + } + std::vector VisitExpr_(const MatchNode* op) override { + throw std::invalid_argument("match case not yet implemented"); + return {}; + } + /*! 
+ * \brief Generate Graph JSON + * + * \param writer json writer + */ + void GetJSON(dmlc::JSONWriter* writer) { + std::vector arg_nodes; + for (size_t i = 0; i < nodes_.size(); ++i) { + auto node = nodes_[i]; + if (node->Type() == kGraphInputNode) { + arg_nodes.push_back(i); + } + } + size_t num_entry = 0; + ShapeVector shapes; + std::vector storage_ids; + std::vector device_types; + std::vector dltypes; + std::vector node_row_ptr{0}; + for (auto node : nodes_) { + const auto& shape_vec = dmlc::get(node->attrs_["shape"]); + const auto& storage_id = dmlc::get>(node->attrs_["storage_id"]); + const auto& dtype_vec = dmlc::get>(node->attrs_["dtype"]); + + CHECK_EQ(node->num_outputs_, shape_vec.size()); + num_entry += node->num_outputs_; + + shapes.insert(shapes.end(), shape_vec.begin(), shape_vec.end()); + dltypes.insert(dltypes.end(), dtype_vec.begin(), dtype_vec.end()); + storage_ids.insert(storage_ids.end(), storage_id.begin(), storage_id.end()); + if (node->attrs_.count("device_index")) { + const auto& dev_types = dmlc::get>(node->attrs_["device_index"]); + device_types.insert(device_types.end(), dev_types.begin(), dev_types.end()); + } + node_row_ptr.push_back(num_entry); + } + writer->BeginObject(); + writer->WriteObjectKeyValue("nodes", nodes_); + writer->WriteObjectKeyValue("arg_nodes", arg_nodes); + writer->WriteObjectKeyValue("heads", heads_); + std::unordered_map> attrs; + attrs["shape"].emplace_back(std::string("list_shape")); + attrs["shape"].emplace_back(shapes); + attrs["storage_id"].emplace_back(std::string("list_int")); + attrs["storage_id"].emplace_back(storage_ids); + if (device_types.size()) { + attrs["device_index"].emplace_back(std::string("list_int")); + attrs["device_index"].emplace_back(device_types); + } + attrs["dltype"].emplace_back(std::string("list_str")); + attrs["dltype"].emplace_back(dltypes); + writer->WriteObjectKeyValue("attrs", attrs); + writer->WriteObjectKeyValue("node_row_ptr", node_row_ptr); + writer->EndObject(); + } + + /*! + * \brief Get unique name for func + * + * \param name + * \return std::string + */ + std::string _GetUniqueName(const std::string& name) { + if (!name_map_.count(name)) { + name_map_[name] = 1; + return name; + } + auto index = name_map_[name]; + name_map_[name] += 1; + return _GetUniqueName(name + std::to_string(index)); + } + + protected: + /*! \brief nodes */ + std::vector nodes_; + /*! \brief output of graph */ + std::vector heads_; + /*! \brief mod */ + runtime::Module* mod_; + /*! \brief variable map */ + std::unordered_map> var_map_; + /*! \brief target device */ + TargetsMap targets_; + /*! \brief params */ + std::unordered_map params_; + /*! \brief plan memory of device result */ + Map> storage_device_map_; + /*! \brief lowered funcs */ + std::unordered_map> + lowered_funcs_; + /*! \brief name map */ + std::unordered_map name_map_; + /*! 
\brief compile engine */ + CompileEngine compile_engine_; +}; + +class GraphRuntimeCodegenModule : public runtime::ModuleNode { + public: + GraphRuntimeCodegenModule() {} + virtual PackedFunc GetFunction(const std::string& name, + const std::shared_ptr& sptr_to_self) { + if (name == "init") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + CHECK_EQ(args.num_args, 2) << "The expected of arguments are: " + << "runtime::Module mod and Map targets"; + void* mod = args[0]; + auto& sptr = args[1].node_sptr(); + auto* node = static_cast(sptr.get()); + auto& tmp_targets = node->data; + std::unordered_map targets; + for (size_t i = 0; i < tmp_targets.size(); i += 2) { + std::string key; + auto sk = Expr(tmp_targets[i]).as(); + auto ik = Expr(tmp_targets[i]).as(); + if (sk) { + key = sk->value; + } + if (ik) { + key = std::to_string(ik->value); + } + auto v = Expr(tmp_targets[i + 1]).as(); + targets[key] = v->value; + } + codegen_ = std::make_shared( + reinterpret_cast(mod), targets); + }); + } else if (name == "codegen") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + Function func = args[0]; + this->output_ = this->codegen_->Codegen(func); + }); + } else if (name == "get_graph_json") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->output_.graph_json; + }); + } else if (name == "list_params_name") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + Array ret; + for (const auto &kv : this->output_.params) { + HalideIR::Expr name = ir::StringImm::make(kv.first); + ret.push_back(name); + } + *rv = ret; + }); + + } else if (name == "get_param_by_name") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + std::string key = args[0]; + CHECK_GT(this->output_.params.count(key), 0); + *rv = this->output_.params[key]; + }); + } else if (name == "get_lowered_funcs") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->output_.lowered_funcs; + }); + } else { + return PackedFunc([](TVMArgs args, TVMRetValue* rv) {}); + } + } + + const char* type_key() const final { + return "RelayGraphRuntimeCodegenModule"; + } + + private: + std::shared_ptr codegen_; + LoweredOutput output_; +}; + +runtime::Module CreateGraphCodegenMod() { + std::shared_ptr ptr = + std::make_shared(); + return runtime::Module(ptr); +} + +TVM_REGISTER_GLOBAL("relay.build_module._GraphRuntimeCodegen") +.set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = CreateGraphCodegenMod(); +}); + +} // namespace backend +} // namespace relay +} // namespace tvm + +namespace dmlc { +namespace json { +// JSON utils +template +inline bool SameType(const dmlc::any& data) { + return std::type_index(data.type()) == std::type_index(typeid(T)); +} + +template <> +struct Handler> { + inline static void Write(dmlc::JSONWriter* writer, + const std::shared_ptr& data) { + data->Save(writer); + } + inline static void Read(dmlc::JSONReader* reader, + std::shared_ptr* data) { + LOG(FATAL) << "Not implemented."; + } +}; + +template <> +struct Handler> { + inline static void Write(dmlc::JSONWriter* writer, + const std::unordered_map& data) { + writer->BeginObject(); + for (const auto& kv : data) { + auto k = kv.first; + const dmlc::any& v = kv.second; + if (SameType(v)) { + writer->WriteObjectKeyValue(k, dmlc::get(v)); + } else if (SameType(v)) { + writer->WriteObjectKeyValue(k, dmlc::get(v)); + } else if (SameType>(v)) { + writer->WriteObjectKeyValue(k, dmlc::get>(v)); + } else if 
(SameType>>(v)) { + writer->WriteObjectKeyValue(k, dmlc::get>>(v)); + } else if (SameType>(v)) { + writer->WriteObjectKeyValue(k, dmlc::get>(v)); + } else { + LOG(FATAL) << "Not supported"; + } + } + writer->EndObject(); + } + inline static void Read(dmlc::JSONReader* reader, + std::unordered_map* data) { + LOG(FATAL) << "Not implemented."; + } +}; + +template <> +struct Handler> { + inline static void Write(dmlc::JSONWriter* writer, const std::vector& data) { + writer->BeginArray(); + for (const auto& v : data) { + if (SameType(v)) { + writer->WriteArrayItem(dmlc::get(v)); + } else if (SameType(v)) { + writer->WriteArrayItem(dmlc::get(v)); + } else if (SameType>(v)) { + writer->WriteArrayItem(dmlc::get>(v)); + } else if (SameType>>(v)) { + writer->WriteArrayItem(dmlc::get>>(v)); + } else if (SameType>(v)) { + writer->WriteArrayItem(dmlc::get>(v)); + } else { + LOG(FATAL) << "Not supported"; + } + } + writer->EndArray(); + } + inline static void Read(dmlc::JSONReader* reader, std::vector* data) { + LOG(FATAL) << "Not implemented."; + } +}; +} // namespace json +} // namespace dmlc diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h new file mode 100644 index 000000000000..65a7efd4c205 --- /dev/null +++ b/src/relay/backend/utils.h @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file relay/backend/utils.h + * \brief Utils function for backend + */ +#ifndef TVM_RELAY_BACKEND_UTILS_H_ +#define TVM_RELAY_BACKEND_UTILS_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace tvm { +namespace relay { +namespace backend { +/*! + * \brief Get the Packed Func + * + * \param func_name + * \return const PackedFunc* + */ +inline const PackedFunc* GetPackedFunc(const std::string& func_name) { + return tvm::runtime::Registry::Get(func_name); +} +/*! 
+ * \brief Convert type to string + * + * \param typ + * \return std::string string format of type + */ +inline std::string DType2String(const tvm::Type typ) { + std::ostringstream os; + auto tvm_type = Type2TVMType(typ); + if (tvm_type.code == kDLFloat) { + os << "float"; + } else if (tvm_type.code == kDLInt) { + os << "int"; + } else if (tvm_type.code == kDLUInt) { + os << "uint"; + } else { + LOG(FATAL) << "Unknown type"; + } + os << typ.bits(); + return os.str(); +} + +} // namespace backend +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_BACKEND_UTILS_H_ diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index 1e4060fe6c75..87160b2cd130 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -32,7 +32,6 @@ #include #include #include -#include #include diff --git a/tests/python/relay/test_pass_annotation.py b/tests/python/relay/test_pass_annotation.py index ebf9ba913cac..c55a9fb2dd85 100644 --- a/tests/python/relay/test_pass_annotation.py +++ b/tests/python/relay/test_pass_annotation.py @@ -325,7 +325,9 @@ def expected(): annotated_func = annotated() expected_func = expected() - expected_index = [1, 1, 1, 2, 2, 1, 1, 2, 2] + ctx = tvm.context(device, 0) + dev_idx = ctx.device_type + expected_index = [1, 1, 1, dev_idx, dev_idx, 1, 1, dev_idx, dev_idx] check_annotated_graph(annotated_func, expected_func) test_runtime(target, device, annotated_func, fallback_device, expected_index) @@ -401,7 +403,9 @@ def expected(): annotated_func = annotated() expected_func = expected() - expected_index = [2, 2, 2, 1, 1] + ctx = tvm.context(device, 0) + dev_idx = ctx.device_type + expected_index = [dev_idx, dev_idx, dev_idx, 1, 1] check_annotated_graph(annotated_func, expected_func) test_runtime(target, device, annotated_func, fallback_device, expected_index) diff --git a/tests/python/unittest/test_module_load.py b/tests/python/unittest/test_module_load.py index ca508f88f1a7..ba5044825308 100644 --- a/tests/python/unittest/test_module_load.py +++ b/tests/python/unittest/test_module_load.py @@ -82,7 +82,7 @@ def save_object(names): fo.write(runtime_py) subprocess.check_call( - "python %s %s %s" % (path_runtime_py, path_dso, dtype), + "python3 %s %s %s" % (path_runtime_py, path_dso, dtype), shell=True) diff --git a/tests/scripts/task_java_unittest.sh b/tests/scripts/task_java_unittest.sh index 6dc25c825c04..63f16fd755f6 100755 --- a/tests/scripts/task_java_unittest.sh +++ b/tests/scripts/task_java_unittest.sh @@ -26,13 +26,13 @@ CURR_DIR=$(cd `dirname $0`; pwd) SCRIPT_DIR=$CURR_DIR/../../jvm/core/src/test/scripts TEMP_DIR=$(mktemp -d) -python $SCRIPT_DIR/test_add_cpu.py $TEMP_DIR -python $SCRIPT_DIR/test_add_gpu.py $TEMP_DIR -python $SCRIPT_DIR/test_graph_runtime.py $TEMP_DIR +python3 $SCRIPT_DIR/test_add_cpu.py $TEMP_DIR +python3 $SCRIPT_DIR/test_add_gpu.py $TEMP_DIR +python3 $SCRIPT_DIR/test_graph_runtime.py $TEMP_DIR # start rpc proxy server PORT=$(( ( RANDOM % 1000 ) + 9000 )) -python $SCRIPT_DIR/test_rpc_proxy_server.py $PORT 30 & +python3 $SCRIPT_DIR/test_rpc_proxy_server.py $PORT 30 & make jvmpkg make jvmpkg JVM_TEST_ARGS="-DskipTests=false \ diff --git a/tests/scripts/task_python_frontend.sh b/tests/scripts/task_python_frontend.sh index ffcd0a104914..1679595e712b 100755 --- a/tests/scripts/task_python_frontend.sh +++ b/tests/scripts/task_python_frontend.sh @@ -24,14 +24,12 @@ export PYTHONPATH=nnvm/python:python:topi/python export OMP_NUM_THREADS=1 # Rebuild cython -make cython make cython3 echo "Running relay TFLite frontend 
test..." python3 -m nose -v tests/python/frontend/tflite echo "Running nnvm unittest..." -python -m nose -v nnvm/tests/python/unittest python3 -m nose -v nnvm/tests/python/unittest echo "Running nnvm compiler test..." diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index 54e41d39e6e2..85dd6de64f6d 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -25,7 +25,6 @@ export LD_LIBRARY_PATH="build:${LD_LIBRARY_PATH:-}" rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc # Test TVM -make cython make cython3 # Test extern package @@ -33,14 +32,12 @@ cd apps/extension rm -rf lib make cd ../.. -python -m nose -v apps/extension/tests -TVM_FFI=cython python -m nose -v tests/python/integration +python3 -m nose -v apps/extension/tests + TVM_FFI=ctypes python3 -m nose -v tests/python/integration -TVM_FFI=cython python -m nose -v tests/python/contrib TVM_FFI=ctypes python3 -m nose -v tests/python/contrib -TVM_FFI=cython python -m nose -v tests/python/relay TVM_FFI=ctypes python3 -m nose -v tests/python/relay # Do not enable OpenGL diff --git a/tests/scripts/task_python_topi.sh b/tests/scripts/task_python_topi.sh index bd89604c52ef..a204f38c6cc6 100755 --- a/tests/scripts/task_python_topi.sh +++ b/tests/scripts/task_python_topi.sh @@ -22,11 +22,9 @@ set -u export PYTHONPATH=python:topi/python # Rebuild cython -make cython make cython3 rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc rm -rf topi/python/topi/*.pyc topi/python/topi/*/*.pyc topi/python/topi/*/*/*.pyc topi/python/topi/*/*/*/*.pyc -python -m nose -v topi/tests/python python3 -m nose -v topi/tests/python diff --git a/tests/scripts/task_python_unittest.sh b/tests/scripts/task_python_unittest.sh index 6f6b2777d1d0..7879c8d64e11 100755 --- a/tests/scripts/task_python_unittest.sh +++ b/tests/scripts/task_python_unittest.sh @@ -23,9 +23,6 @@ export PYTHONPATH=python:topi/python rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc -TVM_FFI=ctypes python -m nose -v tests/python/unittest TVM_FFI=ctypes python3 -m nose -v tests/python/unittest -make cython make cython3 -TVM_FFI=cython python -m nose -v tests/python/unittest TVM_FFI=cython python3 -m nose -v tests/python/unittest diff --git a/tests/scripts/task_python_vta.sh b/tests/scripts/task_python_vta.sh index be0f0c694aab..4345fc2ba39b 100755 --- a/tests/scripts/task_python_vta.sh +++ b/tests/scripts/task_python_vta.sh @@ -25,13 +25,10 @@ rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc python/tvm/*/*/* rm -rf ~/.tvm # Rebuild cython -make cython make cython3 echo "Running unittest..." -python -m nose -v vta/tests/python/unittest python3 -m nose -v vta/tests/python/unittest echo "Running integration test..." 
-python -m nose -v vta/tests/python/integration python3 -m nose -v vta/tests/python/integration From 5b5e0fb2cce7b2ef20da002d59ee7801e05a6024 Mon Sep 17 00:00:00 2001 From: hlu1 <14827759+hlu1@users.noreply.github.com> Date: Fri, 12 Apr 2019 18:33:45 -0700 Subject: [PATCH 008/106] [Bugfix] Fix caffe2 nnvm frontend (#2996) --- nnvm/python/nnvm/frontend/caffe2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nnvm/python/nnvm/frontend/caffe2.py b/nnvm/python/nnvm/frontend/caffe2.py index 367834956a10..2b3ff5a27e01 100644 --- a/nnvm/python/nnvm/frontend/caffe2.py +++ b/nnvm/python/nnvm/frontend/caffe2.py @@ -19,7 +19,7 @@ from __future__ import absolute_import as _abs import tvm from nnvm import symbol as _sym -from .common import get_nnvm_op +from .common import get_nnvm_op, Renamer, AttrConverter as AttrCvt from .onnx_caffe2_utils import dimension_picker, dimension_constraint, infer_channels, revert_caffe2_pad from . import onnx From 7561043576c99ed493d3c389ea2e9d94b1d922ae Mon Sep 17 00:00:00 2001 From: Josh Pollock Date: Fri, 12 Apr 2019 18:36:19 -0700 Subject: [PATCH 009/106] [Relay][Text Format] Pretty Printer Smart Inlining (#2881) --- python/tvm/relay/ir_pass.py | 33 ---- src/relay/ir/pretty_printer.cc | 112 ++++++------ src/relay/pass/dependency_graph.cc | 165 ++++++++++++++++++ src/relay/pass/dependency_graph.h | 57 +++++++ src/relay/pass/to_a_normal_form.cc | 188 +-------------------- tests/python/relay/test_ir_text_printer.py | 22 ++- tests/python/relay/test_op_level1.py | 4 +- tests/python/relay/test_op_level4.py | 2 +- tests/python/relay/test_type_infer.py | 31 ++-- 9 files changed, 324 insertions(+), 290 deletions(-) create mode 100644 src/relay/pass/dependency_graph.cc create mode 100644 src/relay/pass/dependency_graph.h diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py index d2000263479d..93ce2dc92fbd 100644 --- a/python/tvm/relay/ir_pass.py +++ b/python/tvm/relay/ir_pass.py @@ -925,39 +925,6 @@ def eliminate_common_subexpr(expr, fskip=None): """ return _ir_pass.eliminate_common_subexpr(expr, fskip) - -def pass_debug_print(ast, show_meta_data=True, annotate=None, gnf=True): - """ - THIS SHOULD BE USED ONLY FOR DEBUGGING, NOT AS AN INTERCHANGE FORMAT! - USE `.astext()` INSTEAD! - - A version of the pretty printer intended for debugging passes. Contains - advanced printing options. - - Parameters - ---------- - ast : Union[relay.Expr, relay.Module, relay.Type] - The relay fragment to be turned into text. - - show_meta_data : bool - Whether to include meta data section in the text - if there is meta data. - - annotate: Optional[relay.Expr->str] - Optional annotate function to provide additional - information in the comment block. - - gnf : bool - Whether to print in GNF. If it is disabled, pointers are left implicit. - - Returns - ------- - text : str - A text representation of `ast`. - """ - return _ir_pass.pass_debug_print(ast, show_meta_data, annotate, gnf) - - def partial_evaluate(expr): """ Evaluate the static fragment of the code. diff --git a/src/relay/ir/pretty_printer.cc b/src/relay/ir/pretty_printer.cc index 969f08b32e83..f4a830040f70 100644 --- a/src/relay/ir/pretty_printer.cc +++ b/src/relay/ir/pretty_printer.cc @@ -22,12 +22,22 @@ * \file pretty_printer.cc * \brief Pretty printer for Relay programs * Supports ANF, GNF, and metadata. + * + * Inlining heuristics: + * - Always inline: + * - GlobalVar + * - Constant + * - Op + * - Var + * - Otherwise, inline if the node is at the end of a scope and is used at most once. 
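+ *
+ * For example (mirroring the `test_let_inlining` cases added in this patch):
+ * in `relay.Let(x, tup, x)` the tuple is used exactly once, at the end of
+ * the let scope, so it prints inline as `let %x = (0, 0)`; in
+ * `relay.Let(x, tup, tup)` the same tuple node is used twice, so it is
+ * hoisted to a temporary `%0` and the let binds `%0` instead.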
*/ + #include #include #include #include "doc.h" #include "type_functor.h" +#include "../pass/dependency_graph.h" #include "../../lang/attr_functor.h" namespace tvm { @@ -135,10 +145,8 @@ class PrettyPrinter : public TypeFunctor, public AttrFunctor { public: - explicit PrettyPrinter(bool GNF, - bool show_meta_data, + explicit PrettyPrinter(bool show_meta_data, runtime::TypedPackedFunc annotate) : - GNF_(GNF), show_meta_data_(show_meta_data), annotate_(annotate) {} @@ -150,10 +158,9 @@ class PrettyPrinter : Doc doc; // additional information in comment. if (annotate_ != nullptr) { - return doc << " // " << annotate_(expr); + return doc << " /* " << annotate_(expr) << " */"; } else if (expr->checked_type_.defined()) { - doc << " // ty="; - return doc << Print(expr->checked_type()); + return doc << " /* ty=" << Print(expr->checked_type()) << " */"; } else { return doc; } @@ -176,13 +183,18 @@ class PrettyPrinter : // print in a new scope doc_stack_.push_back(Doc()); // must print first so doc_stack_.back() reference doesn't become stale - Doc doc = Print(node); + Doc doc = Print(node, false, true); doc = doc_stack_.back() << doc; doc_stack_.pop_back(); return doc; } Doc PrintFinal(const NodeRef& node) { + if (node.as_derived()) { + Expr expr = Downcast(node); + dg_ = DependencyGraph::Create(&arena_, expr); + } + Doc doc; doc << PrintScope(node); if (!meta_.empty()) { @@ -200,9 +212,9 @@ class PrettyPrinter : Doc PrintAttrs(const Attrs& attrs, const Expr& op); - Doc Print(const NodeRef& node, bool meta = false) { + Doc Print(const NodeRef& node, bool meta = false, bool try_inline = false) { if (node.as_derived()) { - return PrintExpr(Downcast(node), meta); + return PrintExpr(Downcast(node), meta, try_inline); } else if (node.as_derived()) { return PrintType(Downcast(node), meta); } else if (node.as_derived()) { @@ -308,7 +320,12 @@ class PrettyPrinter : return val; } - inline bool IsAtomicExpr(const Expr& expr) { + bool IsUnique(const Expr& expr) { + return !(dg_.expr_node.at(expr)->parents.head && + dg_.expr_node.at(expr)->parents.head->next); + } + + bool AlwaysInline(const Expr& expr) { return expr.as() || expr.as() || expr.as() || expr.as(); } @@ -316,17 +333,25 @@ class PrettyPrinter : //------------------------------------ // Overload of Expr printing functions //------------------------------------ - Doc PrintExpr(const Expr& expr, bool meta) { + Doc PrintExpr(const Expr& expr, bool meta, bool try_inline) { // Exploit memoization to print GNF. // The first time we visit an expression, we need to allocate a temp var // for it. Every subsequent time we can just use its assigned variable. // This works since hashing uses pointer equality. 
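+  // A node is inlined either because it is always atomic (a GlobalVar,
+  // Constant, Op, or Var, per AlwaysInline above) or because the caller
+  // requested inlining and the dependency graph shows at most one parent
+  // (per IsUnique), i.e. the value is consumed in at most one place.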
+ + // determine whether to inline + bool inline_expr = AlwaysInline(expr); + if (try_inline) { + inline_expr |= IsUnique(expr); + } + auto it = memo_.find(expr); if (it != memo_.end()) return it->second; + Doc printed_expr; if (meta) { printed_expr = meta_.GetMetaNode(GetRef(expr.get())); - } else if (GNF_ && expr.as()) { + } else if (!inline_expr && expr.as()) { // wrap GNFed let in brackets Doc body; printed_expr << "{"; @@ -335,28 +360,26 @@ class PrettyPrinter : } else { printed_expr = VisitExpr(expr); } - // we choose to inline atomic exprs - if (GNF_ && !IsAtomicExpr(expr)) { - Doc temp_var = AllocTemp(); - memo_[expr] = temp_var; - doc_stack_.back() << temp_var << " = " << printed_expr; - if (expr.as()) { - doc_stack_.back() << PrintOptionalInfo(expr); - } - doc_stack_.back() << "\n"; - return temp_var; - } else if (expr.as()) { + + if (expr.as()) { + printed_expr << PrintOptionalInfo(expr); + } + + // add expr to doc + if (expr.as()) { // This is our first time visiting the var and we hit the VarNode case // in the visitor. Thus the variable is free. doc_stack_.back() << "free_var " << printed_expr << "\n"; // Memoization is done in AllocVar. return memo_[expr]; - } else { + } else if (inline_expr) { memo_[expr] = printed_expr; - if (GNF_ && expr.as()) { - printed_expr << PrintOptionalInfo(expr); - } return printed_expr; + } else { + Doc temp_var = AllocTemp(); + memo_[expr] = temp_var; + doc_stack_.back() << temp_var << " = " << printed_expr << "\n"; + return temp_var; } } @@ -420,8 +443,9 @@ class PrettyPrinter : Doc VisitExpr_(const LetNode* op) final { Doc doc; - doc << "let " << AllocVar(op->var) << " = " << Print(op->value) << "\n"; + doc << "let " << AllocVar(op->var) << " = " << Print(op->value, false, true) << "\n"; // we use a scope here so GNF hoisting doesn't escape too far + // and nested, unique lets are not hoisted doc << PrintScope(op->body); return doc; } @@ -456,6 +480,8 @@ class PrettyPrinter : Doc doc; int counter = 0; for (const auto& kv : mod->functions) { + dg_ = DependencyGraph::Create(&arena_, kv.second); + std::ostringstream os; if (counter++ != 0) { doc << "\n"; @@ -664,8 +690,6 @@ class PrettyPrinter : } private: - /*! \brief Whether to use GNF. */ - bool GNF_; /*! \brief Whether to print meta data. */ bool show_meta_data_; /*! \brief additional comment function */ @@ -682,6 +706,10 @@ class PrettyPrinter : TextMetaDataContext meta_; /*! \brief counter of temporary variable */ size_t temp_var_counter_{0}; + /*! \brief arena for dependency graph */ + common::Arena arena_; + /*! 
\brief dependency graph of the expr */ + DependencyGraph dg_; class AttrPrinter; friend class AttrPrinter; }; @@ -751,25 +779,17 @@ Doc PrettyPrinter::PrintAttrs(const Attrs& attrs, const Expr& op) { std::string PrettyPrint_(const NodeRef& node, bool show_meta_data, - runtime::TypedPackedFunc annotate, - bool gnf) { + runtime::TypedPackedFunc annotate) { Doc doc; doc << "v0.0.1" << "\n" - << PrettyPrinter(gnf, show_meta_data, annotate).PrintFinal(node); + << PrettyPrinter(show_meta_data, annotate).PrintFinal(node); return doc.str(); } std::string AsText(const NodeRef& node, - bool show_meta_data, - runtime::TypedPackedFunc annotate) { - return PrettyPrint_(node, show_meta_data, annotate, true); -} - -std::string PassDebugPrint(const NodeRef& node, - bool show_meta_data, - runtime::TypedPackedFunc annotate, - bool gnf) { - return PrettyPrint_(node, show_meta_data, annotate, gnf); + bool show_meta_data, + runtime::TypedPackedFunc annotate) { + return PrettyPrint_(node, show_meta_data, annotate); } TVM_REGISTER_API("relay._expr.AsText") @@ -777,11 +797,5 @@ TVM_REGISTER_API("relay._expr.AsText") bool, runtime::TypedPackedFunc)>(AsText); -TVM_REGISTER_API("relay._ir_pass.pass_debug_print") -.set_body_typed, - bool)>(PassDebugPrint); - } // namespace relay } // namespace tvm diff --git a/src/relay/pass/dependency_graph.cc b/src/relay/pass/dependency_graph.cc new file mode 100644 index 000000000000..6e25086fe826 --- /dev/null +++ b/src/relay/pass/dependency_graph.cc @@ -0,0 +1,165 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file tvm/relay/pass/dependency_graph.cc + * \brief + */ +#include "dependency_graph.h" +#include +#include +#include + +namespace tvm { +namespace relay { + +// Creator of DependencyGraph +class DependencyGraph::Creator : private ExprFunctor { + public: + explicit Creator(common::Arena* arena) + : arena_(arena) {} + + DependencyGraph Create(const Expr& body) { + this->VisitExpr(body); + return std::move(graph_); + } + + private: + /*! \brief allocator of all the internal node object */ + common::Arena* arena_; + // The output. + DependencyGraph graph_; + // Update the message stored at the node. 
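+  // Depend(parent, child) below first visits the child expression so that
+  // its graph node exists, then links the two nodes in both directions;
+  // the edge links are arena-allocated, so the graph owns no memory itself.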
+ void Depend(DependencyGraph::Node* parent, const Expr& child) { + VisitExpr(child); + + CHECK_NE(graph_.expr_node.count(child), 0); + + Depend(parent, graph_.expr_node[child]); + } + + void Depend(DependencyGraph::Node* parent, DependencyGraph::Node* child) { + auto* parent_link = arena_->make >(); + parent_link->value = parent; + child->parents.Push(parent_link); + + auto* child_link = arena_->make >(); + child_link->value = child; + parent->children.Push(child_link); + } + + std::unordered_set visited_; + + DependencyGraph::Node* NewNode(bool new_scope) { + auto* ret = arena_->make(); + ret->new_scope = new_scope; + return ret; + } + + void VisitExpr(const Expr& e) final { + if (visited_.count(e) == 0) { + if (graph_.expr_node.count(e) == 0) { + graph_.expr_node[e] = NewNode(false); + } + visited_.insert(e); + ExprFunctor::VisitExpr(e); + graph_.post_dfs_order.push_back(graph_.expr_node[e]); + } + } + + void VisitExpr_(const CallNode* c) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(c)]; + Depend(n, c->op); + for (const auto& a : c->args) { + Depend(n, a); + } + } + + void VisitExpr_(const TupleNode* t) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(t)]; + for (const auto& a : t->fields) { + Depend(n, a); + } + } + + void VisitExpr_(const TupleGetItemNode* t) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(t)]; + Depend(n, t->tuple); + } + + void VisitExpr_(const RefCreateNode* r) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(r)]; + Depend(n, r->value); + } + + void VisitExpr_(const RefReadNode* r) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(r)]; + Depend(n, r->ref); + } + + void VisitExpr_(const RefWriteNode* r) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(r)]; + Depend(n, r->ref); + Depend(n, r->value); + } + + void VisitExpr_(const IfNode* i) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(i)]; + DependencyGraph::Node* t = NewNode(true); + DependencyGraph::Node* f = NewNode(true); + Depend(n, i->cond); + Depend(n, t); + Depend(n, f); + Depend(t, i->true_branch); + Depend(f, i->false_branch); + graph_.post_dfs_order.push_back(f); + graph_.post_dfs_order.push_back(t); + } + + void VisitExpr_(const FunctionNode* f) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(f)]; + DependencyGraph::Node* b = NewNode(true); + Depend(n, b); + Depend(b, f->body); + graph_.post_dfs_order.push_back(b); + } + + void VisitExpr_(const LetNode* l) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(l)]; + DependencyGraph::Node* b = NewNode(true); + Depend(n, b); + Depend(b, l->value); + Depend(b, l->body); + graph_.post_dfs_order.push_back(b); + } + + void VisitExpr_(const MatchNode* m) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(m)]; + Depend(n, m->data); + std::vector v; + for (const Clause& c : m->clauses) { + DependencyGraph::Node* b = NewNode(true); + Depend(n, b); + Depend(b, c->rhs); + v.push_back(b); + } + for (auto it = v.rbegin(); it != v.rend(); ++it) { + graph_.post_dfs_order.push_back(*it); + } + } + + void VisitExpr_(const VarNode* v) final { } + + void VisitExpr_(const GlobalVarNode* v) final { } + + void VisitExpr_(const ConstantNode* c) final { } + + void VisitExpr_(const OpNode* o) final { } + + void VisitExpr_(const ConstructorNode* c) final { } +}; + +DependencyGraph DependencyGraph::Create(common::Arena* arena, const Expr& body) { + return Creator(arena).Create(body); +} + +} // namespace relay +} // namespace tvm diff --git 
a/src/relay/pass/dependency_graph.h b/src/relay/pass/dependency_graph.h new file mode 100644 index 000000000000..91cef1ce7cde --- /dev/null +++ b/src/relay/pass/dependency_graph.h @@ -0,0 +1,57 @@ +/*! + * Copyright (c) 2019 by Contributors. + * \file tvm/relay/pass/dependency_graph.h + * \brief + */ +#ifndef TVM_RELAY_PASS_DEPENDENCY_GRAPH_H_ +#define TVM_RELAY_PASS_DEPENDENCY_GRAPH_H_ + +#include +#include +#include +#include "let_list.h" +#include "../../common/arena.h" + +namespace tvm { +namespace relay { + +using common::LinkNode; +using common::LinkedList; + +/* DependencyGraph track input and output of an Expr. + * Additionally, dummy scope is created to model scope. + * It allow us to traverse the graph in reverse order. + */ +class DependencyGraph { + public: + /*! \brief A node in the graph. */ + struct Node { + // Determine scope boundaries. Used for calculating scopes, not for + // constructing dependency graph. + bool new_scope = false; + // incoming edges + LinkedList children; + // outgoing edges + LinkedList parents; + }; + + /*! \brief Maps a Relay Expr to its node in the dependency graph. */ + std::unordered_map expr_node; + + /*! \brief The dependency graph in post DFS order. */ + std::vector post_dfs_order; + + /*! + * \brief Create a dependency graph. + * \param arena The arena used for data allocation. + * \param body The body of the expression to create a graph. + */ + static DependencyGraph Create(common::Arena* arena, const Expr& body); + + private: + class Creator; +}; + +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_PASS_DEPENDENCY_GRAPH_H_ diff --git a/src/relay/pass/to_a_normal_form.cc b/src/relay/pass/to_a_normal_form.cc index 5507de471ae5..1f0ed9eff28e 100644 --- a/src/relay/pass/to_a_normal_form.cc +++ b/src/relay/pass/to_a_normal_form.cc @@ -29,193 +29,11 @@ #include "let_list.h" #include "../../common/arena.h" #include "pass_util.h" +#include "dependency_graph.h" namespace tvm { namespace relay { -using common::LinkNode; -using common::LinkedList; - -/* DependencyGraph track input and output of an Expr. - * Additionally, dummy scope is created to model scope. - * It allow us to traverse the graph in reverse order. - */ -class DependencyGraph { - public: - /*! \brief A node in the graph. */ - struct Node { - bool new_scope = false; - LinkedList input; - LinkedList output; - }; - - /*! \brief The node map that maps node to graph */ - std::unordered_map expr_node; - - /*! \brief All the nodes in post DFS order */ - std::vector post_dfs_order; - - /*! - * \brief create a dependency graph. - * \param arena The arena used for data allocation. - * \param body The body of the expression to create a graph. - */ - static DependencyGraph Create(common::Arena* arena, const Expr& body); - - private: - class Creator; -}; - -// Creator of DependencyGraph -class DependencyGraph::Creator : private ExprFunctor { - public: - explicit Creator(common::Arena* arena) - : arena_(arena) {} - - DependencyGraph Create(const Expr& body) { - this->VisitExpr(body); - return std::move(graph_); - } - - private: - /*! \brief allocator of all the internal node object */ - common::Arena* arena_; - // The output. - DependencyGraph graph_; - // Update the message stored at the node. 
- void Depend(DependencyGraph::Node* parent, const Expr& child) { - VisitExpr(child); - - CHECK_NE(graph_.expr_node.count(child), 0); - - Depend(parent, graph_.expr_node[child]); - } - - void Depend(DependencyGraph::Node* parent, DependencyGraph::Node* child) { - auto* parent_link = arena_->make >(); - parent_link->value = parent; - child->output.Push(parent_link); - - auto* child_link = arena_->make >(); - child_link->value = child; - parent->input.Push(child_link); - } - - std::unordered_set visited_; - - DependencyGraph::Node* NewNode(bool new_scope) { - auto* ret = arena_->make(); - ret->new_scope = new_scope; - return ret; - } - - void VisitExpr(const Expr& e) final { - if (visited_.count(e) == 0) { - if (graph_.expr_node.count(e) == 0) { - graph_.expr_node[e] = NewNode(false); - } - visited_.insert(e); - ExprFunctor::VisitExpr(e); - graph_.post_dfs_order.push_back(graph_.expr_node[e]); - } - } - - void VisitExpr_(const CallNode* c) final { - DependencyGraph::Node* n = graph_.expr_node[GetRef(c)]; - Depend(n, c->op); - for (const auto& a : c->args) { - Depend(n, a); - } - } - - void VisitExpr_(const TupleNode* t) final { - DependencyGraph::Node* n = graph_.expr_node[GetRef(t)]; - for (const auto& a : t->fields) { - Depend(n, a); - } - } - - void VisitExpr_(const TupleGetItemNode* t) final { - DependencyGraph::Node* n = graph_.expr_node[GetRef(t)]; - Depend(n, t->tuple); - } - - void VisitExpr_(const RefCreateNode* r) final { - DependencyGraph::Node* n = graph_.expr_node[GetRef(r)]; - Depend(n, r->value); - } - - void VisitExpr_(const RefReadNode* r) final { - DependencyGraph::Node* n = graph_.expr_node[GetRef(r)]; - Depend(n, r->ref); - } - - void VisitExpr_(const RefWriteNode* r) final { - DependencyGraph::Node* n = graph_.expr_node[GetRef(r)]; - Depend(n, r->ref); - Depend(n, r->value); - } - - void VisitExpr_(const IfNode* i) final { - DependencyGraph::Node* n = graph_.expr_node[GetRef(i)]; - DependencyGraph::Node* t = NewNode(true); - DependencyGraph::Node* f = NewNode(true); - Depend(n, i->cond); - Depend(n, t); - Depend(n, f); - Depend(t, i->true_branch); - Depend(f, i->false_branch); - graph_.post_dfs_order.push_back(f); - graph_.post_dfs_order.push_back(t); - } - - void VisitExpr_(const FunctionNode* f) final { - DependencyGraph::Node* n = graph_.expr_node[GetRef(f)]; - DependencyGraph::Node* b = NewNode(true); - Depend(n, b); - Depend(b, f->body); - graph_.post_dfs_order.push_back(b); - } - - void VisitExpr_(const LetNode* l) final { - DependencyGraph::Node* n = graph_.expr_node[GetRef(l)]; - DependencyGraph::Node* b = NewNode(true); - Depend(n, b); - Depend(b, l->value); - Depend(b, l->body); - graph_.post_dfs_order.push_back(b); - } - - void VisitExpr_(const MatchNode* m) final { - DependencyGraph::Node* n = graph_.expr_node[GetRef(m)]; - Depend(n, m->data); - std::vector v; - for (const Clause& c : m->clauses) { - DependencyGraph::Node* b = NewNode(true); - Depend(n, b); - Depend(b, c->rhs); - v.push_back(b); - } - for (auto it = v.rbegin(); it != v.rend(); ++it) { - graph_.post_dfs_order.push_back(*it); - } - } - - void VisitExpr_(const VarNode* v) final { } - - void VisitExpr_(const GlobalVarNode* v) final { } - - void VisitExpr_(const ConstantNode* c) final { } - - void VisitExpr_(const OpNode* o) final { } - - void VisitExpr_(const ConstructorNode* c) final { } -}; - -DependencyGraph DependencyGraph::Create(common::Arena* arena, const Expr& body) { - return Creator(arena).Create(body); -} - Expr ToANormalForm(const Expr& e, const Module& m, std::set* gv); struct 
ScopeNode; @@ -256,7 +74,7 @@ std::unordered_map CalcScope(const DependencyGrap Scope global_scope = std::make_shared(); for (auto it = dg.post_dfs_order.rbegin(); it != dg.post_dfs_order.rend(); ++it) { DependencyGraph::Node* n = *it; - auto iit = n->output.head; + auto iit = n->parents.head; Scope s; if (iit == nullptr) { s = global_scope; @@ -313,7 +131,7 @@ class Fill : ExprFunctor { Scope GetSubScope(const Expr& e, size_t i) { DependencyGraph::Node* n = dg_.expr_node.at(e); - auto h = n->input.head; + auto h = n->children.head; while (i != 0) { CHECK(h); --i; diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py index 4206d68b83bf..f10b258ff3cf 100644 --- a/tests/python/relay/test_ir_text_printer.py +++ b/tests/python/relay/test_ir_text_printer.py @@ -50,8 +50,8 @@ def test_env(): text = env.astext() assert "def @myf" in text assert "def @myf" in str(env) - assert "%1 = add(%0, %0) // ty=float32" in text - assert "%1 = add(%0, %0) // ty=float32" in str(env) + assert "add(%0, %0) /* ty=float32 */" in text + assert "add(%0, %0) /* ty=float32 */" in str(env) show(env.astext(annotate=lambda x: str(x.checked_type.dtype))) show(text) @@ -112,7 +112,7 @@ def test_let_if_scope(): f = relay.Function([x, y, cond], result) text = f.astext() - assert text.count("{") == 6 + assert text.count("{") == 4 assert "%cond: bool" in text show(f.astext()) @@ -180,8 +180,19 @@ def test_call_node_order(): "%2 = fn (%x) {\n" " %x\n" "}\n" - "%3 = %2(%1)\n" - "%3") + "%2(%1)") + +def test_let_inlining(): + tup = relay.Tuple([relay.const(0), relay.const(0)]) + x = relay.var("x") + assert relay.Let(x, tup, tup).astext() == SEMVER + \ + ("%0 = (0, 0)\n" + "let %x = %0\n" + "%0") + + assert relay.Let(x, tup, x).astext() == SEMVER + \ + ("let %x = (0, 0)\n" + "%x") if __name__ == "__main__": do_print[0] = True @@ -201,3 +212,4 @@ def test_call_node_order(): test_let_if_scope() test_variable_name() test_call_node_order() + test_let_inlining() diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 94d3b157dd0d..d83f25db1b77 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -38,7 +38,7 @@ def check_single_op(opfunc, ref): x = relay.var("x", tp) y = opfunc(x) # test printer - assert ("%0 = {}(%x)".format(y.op.name)) in y.astext() + assert ("{}(%x)".format(y.op.name)) in y.astext() # test type inference assert relay.ir_pass.infer_type(y).checked_type == tp @@ -78,7 +78,7 @@ def check_binary_op(opfunc, ref): y = relay.var("y", t2) z = opfunc(x, y) # test printer - assert ("%0 = {}(%x, %y)".format(z.op.name)) in z.astext() + assert ("{}(%x, %y)".format(z.op.name)) in z.astext() assert relay.ir_pass.infer_type(z).checked_type == t1 if ref is not None: diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index 8db90fbf91f0..0e44bf851dc4 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -29,7 +29,7 @@ def check_binary_op(opfunc, ref): y = relay.var("y", t2) z = opfunc(x, y) # test printer - assert ("%0 = {}(%x, %y)".format(z.op.name)) in z.astext() + assert ("{}(%x, %y)".format(z.op.name)) in z.astext() assert relay.ir_pass.infer_type(z).checked_type == t1 if ref is not None: diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py index 4dfe59b8a6a3..8e047354fafd 100644 --- a/tests/python/relay/test_type_infer.py +++ b/tests/python/relay/test_type_infer.py @@ -44,7 +44,7 @@ def 
initialize_box_adt(mod): def test_monomorphic_let(): - "Program: let x = 1; x" + "Program: let %x = 1; %x" sb = relay.ScopeBuilder() x = sb.let('x', relay.const(1.0, "float64")) sb.ret(x) @@ -53,7 +53,7 @@ def test_monomorphic_let(): def test_single_op(): - "Program: fn (x : float32) { let t1 = f(x); t1 }" + "Program: fn (%x : float32) { let %t1 = f(%x); %t1 }" x = relay.var('x', shape=[]) func = relay.Function([x], op.log(x)) ttype = relay.TensorType([], dtype='float32') @@ -63,8 +63,9 @@ def test_single_op(): def test_add_broadcast_op(): """ Program: - fn (x: Tensor[(10, 4), f32], y: Tensor[(5, 10, 1), f32]) -> Tensor[(5, 10, 4), f32] { - x + y + fn (%x: Tensor[(10, 4), float32], %y: Tensor[(5, 10, 1), float32]) + -> Tensor[(5, 10, 4), float32] { + %x + %y } """ x = relay.var('x', shape=(10, 4)) @@ -80,10 +81,10 @@ def test_add_broadcast_op(): def test_dual_op(): """Program: - fn (x : Tensor[f32, (10, 10)]) { - let t1 = log(x); - let t2 = add(t1, x); - t1 + fn (%x : Tensor[(10, 10), float32]) { + let %t1 = log(x); + let %t2 = add(%t1, %x); + %t1 } """ tp = relay.TensorType((10, 10), "float32") @@ -99,8 +100,8 @@ def test_dual_op(): def test_decl(): """Program: - def f(x : Tensor[(10, 10), f32]) { - log(x) + def @f(%x : Tensor[(10, 10), float32]) { + log(%x) } """ tp = relay.TensorType((10, 10)) @@ -113,11 +114,11 @@ def f(x : Tensor[(10, 10), f32]) { def test_recursion(): """ Program: - def f(n: i32, data: f32) -> f32 { - if (n == 0) { - data + def @f(%n: int32, %data: float32) -> float32 { + if (%n == 0) { + %data } else { - f(n - 1, log(data)) + @f(%n - 1, log(%data)) } } """ @@ -134,7 +135,7 @@ def f(n: i32, data: f32) -> f32 { sb.ret(f(relay.subtract(n, relay.const(1, ti32)), relay.log(data))) mod = relay.Module() mod[f] = relay.Function([n, data], sb.get()) - assert "%3 = @f(%1, %2)" in mod.astext() + assert "@f(%1, %2) /* ty=float32 */" in mod.astext() assert mod[f].checked_type == relay.FuncType([ti32, tf32], tf32) From 55fa34a85cf0a5e866702f007e39c3b5db25a209 Mon Sep 17 00:00:00 2001 From: MORINAGA <34588258+imorinaga@users.noreply.github.com> Date: Sat, 13 Apr 2019 12:03:11 +0900 Subject: [PATCH 010/106] [Heterogeneous][Bugfix] Fix bug of wrongly generated device_map (#2990) * fix bug of device_index * cpplint * nose * Update test_pass_annotation.py * fix name of testcase * delete comment --- src/relay/pass/device_annotation.cc | 71 +++++++++-------- tests/python/relay/test_pass_annotation.py | 92 ++++++++++++++++++++-- 2 files changed, 122 insertions(+), 41 deletions(-) diff --git a/src/relay/pass/device_annotation.cc b/src/relay/pass/device_annotation.cc index 46f4268cc970..0139cc912849 100644 --- a/src/relay/pass/device_annotation.cc +++ b/src/relay/pass/device_annotation.cc @@ -334,9 +334,9 @@ class AnnotatationVisitor : private ExprVisitor { * -Pass 1: Propagating the source device type to ops in a bottom-up way to the * ancestors until encountering another copy op. For example, this way * provides add, x, and y device types from the copy operator, `copy1`. - * -Pass 2: Propagating the destination device type of "the last" copy op in a - * top-down manner to the nodes on the output paths. For instance, - * this offers `subtract` and `exp` the same device type as `copy3`. + * -Pass 2: Propagating the destination device type of "the last" copy op to the + * remain nodes. For instance, this offers `subtract` and `exp` the + * same device type as `copy3`. 
 */
 class DeviceInfo {
@@ -371,17 +371,22 @@ class DeviceInfo {
     }

     void VisitExpr_(const ConstantNode* cn) final {
-      post_dfs_order_.push_back(cn);
+      post_dfs_order_.push_back(std::make_pair(cn, has_copy_));
     }

     void VisitExpr_(const CallNode* call) final {
       // Skip annotation nodes.
       if (!IsOnDeviceNode(call)) {
-        ExprVisitor::VisitExpr_(call);
-        post_dfs_order_.push_back(call);
-
         if (GetDeviceCopyNode(call)) {
           num_device_copy_ops_++;
+          bool has_copy_prev = has_copy_;
+          has_copy_ = true;
+          ExprVisitor::VisitExpr_(call);
+          post_dfs_order_.push_back(std::make_pair(call, has_copy_));
+          has_copy_ = has_copy_prev;
+        } else {
+          ExprVisitor::VisitExpr_(call);
+          post_dfs_order_.push_back(std::make_pair(call, has_copy_));
         }
       }
     }
@@ -393,23 +398,27 @@ class DeviceInfo {

     void VisitExpr_(const TupleGetItemNode* op) final {
       ExprVisitor::VisitExpr_(op);
-      post_dfs_order_.push_back(op);
+      post_dfs_order_.push_back(std::make_pair(op, has_copy_));
     }

-    void VisitExpr_(const VarNode* vn) final { post_dfs_order_.push_back(vn); }
+    void VisitExpr_(const VarNode* vn) final {
+      post_dfs_order_.push_back(std::make_pair(vn, has_copy_));
+    }

     void VisitExpr_(const LetNode* ln) final {
       ExprVisitor::VisitExpr_(ln);
-      post_dfs_order_.push_back(ln);
+      post_dfs_order_.push_back(std::make_pair(ln, has_copy_));
     }

     void VisitExpr_(const IfNode* in) final {
       ExprVisitor::VisitExpr_(in);
-      post_dfs_order_.push_back(in);
+      post_dfs_order_.push_back(std::make_pair(in, has_copy_));
     }

+
     int num_device_copy_ops_{0};
-    std::vector<const ExprNode*> post_dfs_order_;
+    bool has_copy_ = false;
+    std::vector<std::pair<const ExprNode*, bool>> post_dfs_order_;
     friend DeviceInfo;
   };

@@ -435,46 +444,41 @@ class DeviceInfo {

   void PropagateDeviceId() {
     // Bottom-up propagation.
-    BottomUpPropagation();
-    // Top-down propagation.
-    TopDownPropagation();
+    int out_dev_type = BottomUpPropagation();
+    // Propagation for the remaining nodes.
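+    // Nodes that were never visited beneath a device_copy fall through to
+    // FillPropagation and inherit out_dev_type, the destination device of
+    // the last copy op (i.e. the device the graph output lives on).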
+ FillPropagation(out_dev_type); } - void BottomUpPropagation() { + int BottomUpPropagation() { const CallNode* last_copy_node = nullptr; int cur_dev_type = -1; + int out_dev_type = -1; for (auto it = post_visitor_.post_dfs_order_.crbegin(); it != post_visitor_.post_dfs_order_.crend(); ++it) { - if (const auto* node = GetDeviceCopyNode(*it)) { + if (const auto* node = GetDeviceCopyNode(it->first)) { last_copy_node = dynamic_cast(node); const auto* attrs = last_copy_node->attrs.as(); cur_dev_type = attrs->src_dev_type; - device_map_.Set(GetRef(*it), attrs->dst_dev_type); + if (out_dev_type == -1) out_dev_type = attrs->dst_dev_type; + if (it->second) device_map_.Set(GetRef(it->first), + attrs->dst_dev_type); } else if (last_copy_node) { - Expr expr = GetRef(*it); + Expr expr = GetRef(it->first); CHECK_EQ(device_map_.count(expr), 0U); - device_map_.Set(expr, cur_dev_type); + if (it->second) device_map_.Set(expr, cur_dev_type); } } + return out_dev_type; } - void TopDownPropagation() { - const CallNode* last_copy_node = nullptr; - int cur_dev_type = -1; + void FillPropagation(int out_dev_type) { for (const auto& it : post_visitor_.post_dfs_order_) { - if (const auto* node = GetDeviceCopyNode(it)) { - last_copy_node = dynamic_cast(node); - const auto* attrs = last_copy_node->attrs.as(); - cur_dev_type = attrs->dst_dev_type; - } else if (last_copy_node) { - Expr expr = GetRef(it); - if (device_map_.count(expr) == 0) { - device_map_.Set(expr, cur_dev_type); - } - } + Expr expr = GetRef(it.first); + if (!it.second) device_map_.Set(expr, out_dev_type); } } + PostDfsOrderVisitor post_visitor_; Map device_map_; }; @@ -503,3 +507,4 @@ TVM_REGISTER_API("relay._ir_pass.CollectDeviceAnnotationOps") } // namespace relay } // namespace tvm + diff --git a/tests/python/relay/test_pass_annotation.py b/tests/python/relay/test_pass_annotation.py index c55a9fb2dd85..04081e06735b 100644 --- a/tests/python/relay/test_pass_annotation.py +++ b/tests/python/relay/test_pass_annotation.py @@ -231,7 +231,7 @@ def check_storage_and_device_types(): check_storage_and_device_types() -def test_fusible_network(): +def run_fusible_network(dev, tgt): R""" The network is as following: x y \ / @@ -417,20 +417,96 @@ def test_fallback_all_operators(device, tgt): check_annotated_graph(annotated_func, expected_func) test_runtime(target, device, annotated_func) + + test_fuse_log_add(dev, tgt) + test_fuse_all(dev, tgt) + test_fallback_exp(dev, tgt) + test_fallback_all_operators(dev, tgt) + +def run_unpropagatable_graph(dev, tgt): + R""" The network is as following: + a b c d + \ / \ / + add mul + \ / + subtract + """ + + a = relay.var("a", shape=(10, 10)) + b = relay.var("b", shape=(10, 10)) + c = relay.var("c", shape=(10, 10)) + d = relay.var("d", shape=(10, 10)) + a_data = np.random.rand(10, 10).astype('float32') + b_data = np.random.rand(10, 10).astype('float32') + c_data = np.random.rand(10, 10).astype('float32') + d_data = np.random.rand(10, 10).astype('float32') + tmp_add = a_data + b_data + tmp_mul = np.multiply(c_data, d_data) + ref_res = np.subtract(tmp_add, tmp_mul) + + fallback_device = tvm.context("cpu") + target = {"cpu": "llvm", dev: tgt} + cpu_ctx = fallback_device + dev_ctx = tvm.context(dev) + + def annotated(): + add = relay.add(a, b) + _add = relay.annotation.on_device(add, dev_ctx) + mul = relay.multiply(c, d) + _mul = relay.annotation.on_device(mul, cpu_ctx) + sub = relay.subtract(add, mul) + _sub = relay.annotation.on_device(sub, dev_ctx) + func = relay.Function([a, b, c, d], + relay.Tuple(tvm.convert([_add, _mul, 
+ _sub, sub]))) + func = relay.ir_pass.infer_type(func) + func = relay.ir_pass.rewrite_annotated_ops(func, + dev_ctx.device_type) + func = relay.ir_pass.infer_type(func) + return relay.Function(relay.ir_pass.free_vars(func.body[3]), + func.body[3]) + + def expected(): + add = relay.add(a, b) + mul = relay.multiply(c, d) + copy_mul_sub = relay.device_copy(mul, cpu_ctx, dev_ctx) + sub = relay.subtract(add, copy_mul_sub) + func = relay.Function([a, b, c, d], sub) + return func + + annotated_func = annotated() + expected_func = expected() + expected_index = [2, 2, 2, 1, 1, 1, 2, 2] + check_annotated_graph(annotated_func, expected_func) + params = {"a": a_data, "b": b_data, "c": c_data, "d": d_data} + config = {"opt_level": 0} + config["fallback_device"] = fallback_device + with relay.build_config(**config): + graph, lib, params = relay.build(annotated_func, target, params=params) + contexts = [tvm.cpu(0), tvm.context(dev)] + graph_json = json.loads(graph) + if "device_index" in graph_json["attrs"]: + device_index = graph_json["attrs"]["device_index"][1] + assert device_index == expected_index + mod = graph_runtime.create(graph, lib, contexts) + mod.set_input(**params) + mod.run() + res = mod.get_output(0).asnumpy() + tvm.testing.assert_allclose(res, ref_res, rtol=1e-5, atol=1e-5) + +def test_check_run(): for dev, tgt in [("opencl", "opencl"), ("cuda", "cuda"), - ("opencl", str(tvm.target.intel_graphics()))]: + ("opencl", str(tvm.target.intel_graphics()))]: if not tvm.module.enabled(dev): print("Skip test because %s is not enabled." % dev) continue - test_fuse_log_add(dev, tgt) - test_fuse_all(dev, tgt) - test_fallback_exp(dev, tgt) - test_fallback_all_operators(dev, tgt) - + run_fusible_network(dev, tgt) + run_unpropagatable_graph(dev, tgt) + if __name__ == "__main__": test_redundant_annotation() test_annotate_all() test_annotate_none() test_conv_network() - test_fusible_network() + test_check_run() From e906b93698b3ce19ae630ad64521eef826274e21 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 13 Apr 2019 19:20:24 -0400 Subject: [PATCH 011/106] [COMMUNITY] @hlu1 -> Reviewer (#3021) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 0e518b14b132..bd9b9c1c3c55 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -65,6 +65,7 @@ We do encourage everyone to work anything they are interested in. - [Wei Chen](https://github.com/wweic): @wweic - [Zhi Chen](https://github.com/zhiics): @zhiics - [Sergei Grechanik](https://github.com/sgrechanik-h): @sgrechanik-h +- [Hao Lu](https://github.com/hlu1): @hlu1 - [Nick Hynes](https://github.com/nhynes): @nhynes - [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei - [Yizhi Liu](https://github.com/yzhliu) : @yzhliu From 625455077c11da3a3b7d61df581d51564d81ae4e Mon Sep 17 00:00:00 2001 From: "Ehsan M. Kermani" Date: Sun, 14 Apr 2019 19:11:18 -0700 Subject: [PATCH 012/106] [RUST][FRONTEND] Fix resnet example (#3000) Due to the previous changes the frontend resnet example failed to build. 
So this patch 1) fixes it 2) adds ~~a local `run_tests.sh` to remedy non-existence of MXNet CI (used in python build example)~~ the example build to CI with random weights and a flag for pretrained resnet weights Please review: @tqchen @nhynes @kazimuth --- rust/common/src/packed_func.rs | 14 ++- rust/common/src/value.rs | 52 +++++++++-- rust/frontend/examples/resnet/README.md | 17 +++- rust/frontend/examples/resnet/build.rs | 15 ++- .../examples/resnet/src/build_resnet.py | 62 ++++++++----- rust/frontend/examples/resnet/src/main.rs | 5 +- rust/frontend/src/bytearray.rs | 92 ------------------- rust/frontend/src/context.rs | 10 +- rust/frontend/src/lib.rs | 6 +- tests/scripts/task_rust.sh | 4 + 10 files changed, 138 insertions(+), 139 deletions(-) delete mode 100644 rust/frontend/src/bytearray.rs diff --git a/rust/common/src/packed_func.rs b/rust/common/src/packed_func.rs index 675b8ba5dc44..c75e9020cc93 100644 --- a/rust/common/src/packed_func.rs +++ b/rust/common/src/packed_func.rs @@ -155,7 +155,7 @@ TVMPODValue! { Bytes(val) => { (TVMValue { v_handle: val.clone() as *const _ as *mut c_void }, TVMTypeCode_kBytes) } - Str(val) => { (TVMValue { v_handle: val.as_ptr() as *mut c_void }, TVMTypeCode_kStr)} + Str(val) => { (TVMValue { v_handle: val.as_ptr() as *mut c_void }, TVMTypeCode_kStr) } } } @@ -260,12 +260,24 @@ impl<'a> From<&'a str> for TVMArgValue<'a> { } } +impl<'a> From for TVMArgValue<'a> { + fn from(s: String) -> Self { + Self::String(CString::new(s).unwrap()) + } +} + impl<'a> From<&'a CStr> for TVMArgValue<'a> { fn from(s: &'a CStr) -> Self { Self::Str(s) } } +impl<'a> From<&'a TVMByteArray> for TVMArgValue<'a> { + fn from(s: &'a TVMByteArray) -> Self { + Self::Bytes(s) + } +} + impl<'a> TryFrom> for &'a str { type Error = ValueDowncastError; fn try_from(val: TVMArgValue<'a>) -> Result { diff --git a/rust/common/src/value.rs b/rust/common/src/value.rs index 94af95c62841..6d17db207865 100644 --- a/rust/common/src/value.rs +++ b/rust/common/src/value.rs @@ -17,7 +17,7 @@ * under the License. */ -use std::str::FromStr; +use std::{os::raw::c_char, str::FromStr}; use failure::Error; @@ -157,17 +157,57 @@ impl_tvm_context!( DLDeviceType_kDLExtDev: [ext_dev] ); +/// A struct holding TVM byte-array. 
+/// +/// ## Example +/// +/// ``` +/// let v = b"hello"; +/// let barr = TVMByteArray::from(&v); +/// assert_eq!(barr.len(), v.len()); +/// assert_eq!(barr.data(), &[104u8, 101, 108, 108, 111]); +/// ``` impl TVMByteArray { + /// Gets the underlying byte-array pub fn data(&self) -> &'static [u8] { unsafe { std::slice::from_raw_parts(self.data as *const u8, self.size) } } + + /// Gets the length of the underlying byte-array + pub fn len(&self) -> usize { + self.size + } + + /// Converts the underlying byte-array to `Vec` + pub fn to_vec(&self) -> Vec { + self.data().to_vec() + } } -impl<'a> From<&'a [u8]> for TVMByteArray { - fn from(bytes: &[u8]) -> Self { - Self { - data: bytes.as_ptr() as *const i8, - size: bytes.len(), +// Needs AsRef for Vec +impl> From for TVMByteArray { + fn from(arg: T) -> Self { + let arg = arg.as_ref(); + TVMByteArray { + data: arg.as_ptr() as *const c_char, + size: arg.len(), } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn convert() { + let v = vec![1u8, 2, 3]; + let barr = TVMByteArray::from(&v); + assert_eq!(barr.len(), v.len()); + assert_eq!(barr.to_vec(), vec![1u8, 2, 3]); + let v = b"hello"; + let barr = TVMByteArray::from(&v); + assert_eq!(barr.len(), v.len()); + assert_eq!(barr.data(), &[104u8, 101, 108, 108, 111]); + } +} diff --git a/rust/frontend/examples/resnet/README.md b/rust/frontend/examples/resnet/README.md index e84c099de411..3ce4a778e4bd 100644 --- a/rust/frontend/examples/resnet/README.md +++ b/rust/frontend/examples/resnet/README.md @@ -21,12 +21,25 @@ This end-to-end example shows how to: * build `Resnet 18` with `tvm` and `nnvm` from Python * use the provided Rust frontend API to test for an input image -To run the example, first `tvm`, `nnvm` and `mxnet` must be installed for the python build. To install mxnet for cpu, run `pip install mxnet` +To run the example with pretrained resnet weights, first `tvm`, `nnvm` and `mxnet` must be installed for the python build. To install mxnet for cpu, run `pip install mxnet` and to install `tvm` and `nnvm` with `llvm` follow the [TVM installation guide](https://docs.tvm.ai/install/index.html). -* **Build the example**: `cargo build` +* **Build the example**: `cargo build To have a successful build, note that it is required to instruct Rust compiler to link to the compiled shared library, for example with `println!("cargo:rustc-link-search=native={}", build_path)`. See the `build.rs` for more details. * **Run the example**: `cargo run` + +Note: To use pretrained weights, one can enable `--pretrained` in `build.rs` with + +``` +let output = Command::new("python") + .arg(concat!(env!("CARGO_MANIFEST_DIR"), "/src/build_resnet.py")) + .arg(&format!("--build-dir={}", env!("CARGO_MANIFEST_DIR"))) + .arg(&format!("--pretrained")) + .output() + .expect("Failed to execute command"); +``` + +Otherwise, *random weights* are used, therefore, the prediction will be `limpkin, Aramus pictus`! diff --git a/rust/frontend/examples/resnet/build.rs b/rust/frontend/examples/resnet/build.rs index 037c3bbd97d2..b9a3c4ccdf12 100644 --- a/rust/frontend/examples/resnet/build.rs +++ b/rust/frontend/examples/resnet/build.rs @@ -17,16 +17,23 @@ * under the License. 
*/ -use std::process::Command; +use std::{path::Path, process::Command}; fn main() { - let output = Command::new(concat!(env!("CARGO_MANIFEST_DIR"), "/src/build_resnet.py")) + let output = Command::new("python3") + .arg(concat!(env!("CARGO_MANIFEST_DIR"), "/src/build_resnet.py")) + .arg(&format!("--build-dir={}", env!("CARGO_MANIFEST_DIR"))) .output() .expect("Failed to execute command"); assert!( - std::path::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/deploy_lib.o")).exists(), + Path::new(&format!("{}/deploy_lib.o", env!("CARGO_MANIFEST_DIR"))).exists(), "Could not prepare demo: {}", - String::from_utf8(output.stderr).unwrap().trim() + String::from_utf8(output.stderr) + .unwrap() + .trim() + .split("\n") + .last() + .unwrap_or("") ); println!( "cargo:rustc-link-search=native={}", diff --git a/rust/frontend/examples/resnet/src/build_resnet.py b/rust/frontend/examples/resnet/src/build_resnet.py index 5da1db63310e..2497a41c6ef7 100644 --- a/rust/frontend/examples/resnet/src/build_resnet.py +++ b/rust/frontend/examples/resnet/src/build_resnet.py @@ -24,19 +24,18 @@ import numpy as np -import mxnet as mx -from mxnet.gluon.model_zoo.vision import get_model -from mxnet.gluon.utils import download - import tvm +from tvm import relay +from tvm.relay import testing from tvm.contrib import graph_runtime, cc -import nnvm logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) parser = argparse.ArgumentParser(description='Resnet build example') aa = parser.add_argument +aa('--build-dir', type=str, required=True, help='directory to put the build artifacts') +aa('--pretrained', action='store_true', help='use a pretrained resnet') aa('--batch-size', type=int, default=1, help='input image batch size') aa('--opt-level', type=int, default=3, help='level of optimization. 
0 is unoptimized and 3 is the highest level') @@ -45,7 +44,7 @@ aa('--image-name', type=str, default='cat.png', help='name of input image to download') args = parser.parse_args() -target_dir = osp.dirname(osp.dirname(osp.realpath(__file__))) +build_dir = args.build_dir batch_size = args.batch_size opt_level = args.opt_level target = tvm.target.create(args.target) @@ -57,30 +56,42 @@ def build(target_dir): deploy_lib = osp.join(target_dir, 'deploy_lib.o') if osp.exists(deploy_lib): return - # download the pretrained resnet18 trained on imagenet1k dataset for - # image classification task - block = get_model('resnet18_v1', pretrained=True) - sym, params = nnvm.frontend.from_mxnet(block) - # add the softmax layer for prediction - net = nnvm.sym.softmax(sym) + if args.pretrained: + # needs mxnet installed + from mxnet.gluon.model_zoo.vision import get_model + + # if `--pretrained` is enabled, it downloads a pretrained + # resnet18 trained on imagenet1k dataset for image classification task + block = get_model('resnet18_v1', pretrained=True) + net, params = relay.frontend.from_mxnet(block, {"data": data_shape}) + # we want a probability so add a softmax operator + net = relay.Function(net.params, relay.nn.softmax(net.body), + None, net.type_params, net.attrs) + else: + # use random weights from relay.testing + net, params = relay.testing.resnet.get_workload( + num_layers=18, batch_size=batch_size, image_shape=image_shape) + # compile the model - with nnvm.compiler.build_config(opt_level=opt_level): - graph, lib, params = nnvm.compiler.build( - net, target, shape={"data": data_shape}, params=params) + with relay.build_config(opt_level=opt_level): + graph, lib, params = relay.build_module.build(net, target, params=params) + # save the model artifacts lib.save(deploy_lib) cc.create_shared(osp.join(target_dir, "deploy_lib.so"), [osp.join(target_dir, "deploy_lib.o")]) with open(osp.join(target_dir, "deploy_graph.json"), "w") as fo: - fo.write(graph.json()) + fo.write(graph) with open(osp.join(target_dir,"deploy_param.params"), "wb") as fo: - fo.write(nnvm.compiler.save_param_dict(params)) + fo.write(relay.save_param_dict(params)) def download_img_labels(): """ Download an image and imagenet1k class labels for test""" + from mxnet.gluon.utils import download + img_name = 'cat.png' synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/', '4d0b62f3d01426887599d4f7ede23ee5/raw/', @@ -97,11 +108,11 @@ def download_img_labels(): w = csv.writer(fout) w.writerows(synset.items()) -def test_build(target_dir): +def test_build(build_dir): """ Sanity check with random input""" - graph = open(osp.join(target_dir, "deploy_graph.json")).read() - lib = tvm.module.load(osp.join(target_dir, "deploy_lib.so")) - params = bytearray(open(osp.join(target_dir,"deploy_param.params"), "rb").read()) + graph = open(osp.join(build_dir, "deploy_graph.json")).read() + lib = tvm.module.load(osp.join(build_dir, "deploy_lib.so")) + params = bytearray(open(osp.join(build_dir,"deploy_param.params"), "rb").read()) input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32")) ctx = tvm.cpu() module = graph_runtime.create(graph, lib, ctx) @@ -112,10 +123,11 @@ def test_build(target_dir): if __name__ == '__main__': logger.info("building the model") - build(target_dir) + build(build_dir) logger.info("build was successful") logger.info("test the build artifacts") - test_build(target_dir) + test_build(build_dir) logger.info("test was successful") - download_img_labels() - logger.info("image and synset downloads 
are successful") + if args.pretrained: + download_img_labels() + logger.info("image and synset downloads are successful") diff --git a/rust/frontend/examples/resnet/src/main.rs b/rust/frontend/examples/resnet/src/main.rs index e50d92795883..cf24973ada5b 100644 --- a/rust/frontend/examples/resnet/src/main.rs +++ b/rust/frontend/examples/resnet/src/main.rs @@ -84,7 +84,7 @@ fn main() { let runtime_create_fn = Function::get("tvm.graph_runtime.create").unwrap(); let runtime_create_fn_ret = call_packed!( runtime_create_fn, - &graph, + graph, &lib, &ctx.device_type, &ctx.device_id @@ -107,8 +107,7 @@ fn main() { .get_function("set_input", false) .unwrap(); - let data_str = "data".to_string(); - call_packed!(set_input_fn, &data_str, &input).unwrap(); + call_packed!(set_input_fn, "data".to_string(), &input).unwrap(); // get `run` function from runtime module let ref run_fn = graph_runtime_module.get_function("run", false).unwrap(); // execute the run function. Note that it has no argument diff --git a/rust/frontend/src/bytearray.rs b/rust/frontend/src/bytearray.rs deleted file mode 100644 index a1d183d9f525..000000000000 --- a/rust/frontend/src/bytearray.rs +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -//! Provides [`TVMByteArray`] used for passing the model parameters -//! (stored as byte-array) to a runtime module. -//! -//! For more detail, please see the example `resnet` in `examples` repository. - -use std::os::raw::c_char; - -use tvm_common::ffi; - -/// A struct holding TVM byte-array. 
-/// -/// ## Example -/// -/// ``` -/// let v = b"hello".to_vec(); -/// let barr = TVMByteArray::from(&v); -/// assert_eq!(barr.len(), v.len()); -/// assert_eq!(barr.data(), vec![104i8, 101, 108, 108, 111]); -/// ``` -#[derive(Debug, Clone)] -pub struct TVMByteArray { - pub(crate) inner: ffi::TVMByteArray, -} - -impl TVMByteArray { - pub(crate) fn new(barr: ffi::TVMByteArray) -> TVMByteArray { - TVMByteArray { inner: barr } - } - - /// Gets the length of the underlying byte-array - pub fn len(&self) -> usize { - self.inner.size - } - - /// Gets the underlying byte-array as `Vec` - pub fn data(&self) -> Vec { - unsafe { - let sz = self.len(); - let mut ret_buf = Vec::with_capacity(sz); - ret_buf.set_len(sz); - self.inner.data.copy_to(ret_buf.as_mut_ptr(), sz); - ret_buf - } - } -} - -impl<'a, T: AsRef<[u8]>> From for TVMByteArray { - fn from(arg: T) -> Self { - let arg = arg.as_ref(); - let barr = ffi::TVMByteArray { - data: arg.as_ptr() as *const c_char, - size: arg.len(), - }; - TVMByteArray::new(barr) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn convert() { - let v = vec![1u8, 2, 3]; - let barr = TVMByteArray::from(&v); - assert_eq!(barr.len(), v.len()); - assert_eq!(barr.data(), vec![1i8, 2, 3]); - let v = b"hello".to_vec(); - let barr = TVMByteArray::from(&v); - assert_eq!(barr.len(), v.len()); - assert_eq!(barr.data(), vec![104i8, 101, 108, 108, 111]); - } -} diff --git a/rust/frontend/src/context.rs b/rust/frontend/src/context.rs index a5f0dd7b1019..d147871a3968 100644 --- a/rust/frontend/src/context.rs +++ b/rust/frontend/src/context.rs @@ -47,7 +47,7 @@ use failure::Error; use tvm_common::ffi; -use crate::function; +use crate::{function, TVMArgValue}; /// Device type can be from a supported device name. See the supported devices /// in [TVM](https://github.com/dmlc/tvm). @@ -60,7 +60,7 @@ use crate::function; ///``` #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct TVMDeviceType(pub usize); +pub struct TVMDeviceType(pub i64); impl Default for TVMDeviceType { /// default device is cpu. @@ -141,6 +141,12 @@ impl<'a> From<&'a str> for TVMDeviceType { } } +impl<'a> From<&TVMDeviceType> for TVMArgValue<'a> { + fn from(dev: &TVMDeviceType) -> Self { + Self::Int(dev.0) + } +} + /// Represents the underlying device context. Default is cpu. /// /// ## Examples diff --git a/rust/frontend/src/lib.rs b/rust/frontend/src/lib.rs index 6e4123cb6217..adb258dbd3d9 100644 --- a/rust/frontend/src/lib.rs +++ b/rust/frontend/src/lib.rs @@ -30,7 +30,7 @@ //! //! Checkout the `examples` repository for more details. 
-#![feature(box_syntax)] +#![feature(box_syntax, type_alias_enum_variants)] #[macro_use] extern crate failure; @@ -48,7 +48,6 @@ use std::{ use failure::Error; pub use crate::{ - bytearray::TVMByteArray, context::{TVMContext, TVMDeviceType}, errors::*, function::Function, @@ -56,7 +55,7 @@ pub use crate::{ ndarray::NDArray, tvm_common::{ errors as common_errors, - ffi::{self, TVMType}, + ffi::{self, TVMByteArray, TVMType}, packed_func::{TVMArgValue, TVMRetValue}, }, }; @@ -89,7 +88,6 @@ pub(crate) fn set_last_error(err: &Error) { #[macro_use] pub mod function; -pub mod bytearray; pub mod context; pub mod errors; pub mod module; diff --git a/tests/scripts/task_rust.sh b/tests/scripts/task_rust.sh index 46cafe3ccd79..1728fece5965 100755 --- a/tests/scripts/task_rust.sh +++ b/tests/scripts/task_rust.sh @@ -76,3 +76,7 @@ cargo run --bin float cargo run --bin array cargo run --bin string cd - + +cd examples/resnet +cargo build +cd - From a9762685db5a279bd0ea05dca4f07ce9df3cee1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?= Date: Mon, 15 Apr 2019 12:56:31 -0700 Subject: [PATCH 013/106] [Relay] use unordered_map instead of map in ANF (#3024) --- src/relay/pass/to_a_normal_form.cc | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/relay/pass/to_a_normal_form.cc b/src/relay/pass/to_a_normal_form.cc index 1f0ed9eff28e..5e4253de23e5 100644 --- a/src/relay/pass/to_a_normal_form.cc +++ b/src/relay/pass/to_a_normal_form.cc @@ -34,7 +34,9 @@ namespace tvm { namespace relay { -Expr ToANormalForm(const Expr& e, const Module& m, std::set* gv); +Expr ToANormalForm(const Expr& e, + const Module& m, + std::unordered_set* gv); struct ScopeNode; using Scope = std::shared_ptr; @@ -104,7 +106,7 @@ class Fill : ExprFunctor { const Module& m, const DependencyGraph& dg, std::unordered_map* node_scope, - std::set* gv) { + std::unordered_set* gv) { Fill fi(m, dg, node_scope, gv); return fi.GetScope(e)->ll->Get(fi.VisitExpr(e)); } @@ -113,13 +115,13 @@ class Fill : ExprFunctor { Module mod_; const DependencyGraph& dg_; std::unordered_map* node_scope_; - std::set* visited_; + std::unordered_set* visited_; std::unordered_map memo; Fill(Module mod, const DependencyGraph& dg, std::unordered_map* node_scope, - std::set* visited) : + std::unordered_set* visited) : mod_(mod), dg_(dg), node_scope_(node_scope), @@ -273,7 +275,9 @@ class Fill : ExprFunctor { } }; -Expr ToANormalFormAux(const Expr& e, const Module& m, std::set* gv) { +Expr ToANormalFormAux(const Expr& e, + const Module& m, + std::unordered_set* gv) { /* When you lift a lambda, what is inside is also being lift. * * So we must determine the scope of the lambda before determining the scope of it's body. 
@@ -299,12 +303,14 @@ Expr ToANormalFormAux(const Expr& e, const Module& m, std::set* gv) { return Fill::ToANormalForm(e, m, dg, &node_scope, gv); } -Expr ToANormalForm(const Expr& e, const Module& m, std::set* gv) { +Expr ToANormalForm(const Expr& e, + const Module& m, + std::unordered_set* gv) { return TransformF([&](const Expr& e) { return ToANormalFormAux(e, m, gv); }, e); } Expr ToANormalForm(const Expr& e, const Module& m) { - std::set gv; + std::unordered_set gv; return ToANormalForm(e, m, &gv); } From 3777f7d6614c4bcc50db598e10326bfc35a4949d Mon Sep 17 00:00:00 2001 From: Logan Weber <36520469+weberlo@users.noreply.github.com> Date: Mon, 15 Apr 2019 15:58:54 -0700 Subject: [PATCH 014/106] [Relay] Add compiler pass tutorial docs (#2746) * Add Relay compiler pass tutorial docs * Add Python API hook wrapping step * Incorporate feedback * More doc iteration * Mooooore iteration * Rewrite `runtime.md` in rst --- docs/dev/index.rst | 1 + docs/dev/relay_add_pass.rst | 341 +++++++++++++++++++++++++++ docs/dev/{runtime.md => runtime.rst} | 316 ++++++++++++++----------- src/relay/pass/fold_constant.cc | 7 +- 4 files changed, 524 insertions(+), 141 deletions(-) create mode 100644 docs/dev/relay_add_pass.rst rename docs/dev/{runtime.md => runtime.rst} (56%) diff --git a/docs/dev/index.rst b/docs/dev/index.rst index 0bf5cfb0de60..a76e8ec7e8cc 100644 --- a/docs/dev/index.rst +++ b/docs/dev/index.rst @@ -31,4 +31,5 @@ In this part of documentation, we share the rationale for the specific choices m hybrid_script relay_intro relay_add_op + relay_add_pass codebase_walkthrough diff --git a/docs/dev/relay_add_pass.rst b/docs/dev/relay_add_pass.rst new file mode 100644 index 000000000000..a394fe061697 --- /dev/null +++ b/docs/dev/relay_add_pass.rst @@ -0,0 +1,341 @@ +.. _relay-add-pass: + +Adding a Compiler Pass to Relay +=============================== + +Compiler passes are the primary interface for both extending Relay's feature +set and for performing optimizations on Relay programs. By writing a compiler +pass, you can then modify the AST and/or collect information about the AST, +depending on your goal. Indeed, some of Relay's most important "built-in" +features (e.g., autodiff and type inference) are nothing more than compiler +passes. + +At a high level, there are three key components to writing a pass: + +- Creating one or more C++ classes that traverse the program +- Registering an API endpoint (a TVM packed function) with the + ``TVM_REGISTER_API`` macro that performs the pass +- Wrapping the Python API hook in a neater interface + +To begin, we'll give an overview of the key mechanisms for writing a compiler +pass. Then, we'll walk through a concrete example of the constant-folding +pass in Relay. + +AST Traversers +-------------- + +The base class used to traverse Relay programs is ``ExprFunctor``. The public +interface it provides is a ``VisitExpr`` method that takes an expression and +zero or more arguments and returns an instance of some type. When you extend +this class, you define the AST traversal pattern by overriding +implementations of ``VisitExpr_`` for each type of expression. + +The relation between ``VisitExpr`` and ``VisitExpr_`` has to do with +dispatch. Each ``VisitExpr_`` definition targets a specific type of +expression, but you don't always know which node type you'll be visiting. +To remedy this, ``ExprFunctor`` provides a ``VisitExpr`` function which +routes from the given expression to the ``VisitExpr_`` case that handles it. 
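+To make the routing concrete, the following is a minimal sketch of what the
+dispatch amounts to. This is purely illustrative and assumes only a handful of
+node types; the real ``ExprFunctor`` covers every Relay expression and builds
+its dispatch table with a macro rather than a chain of tests:
+
+.. code:: c
+
+    // Illustrative only: route to the matching VisitExpr_ overload based on
+    // the node's concrete type. The actual implementation consults a
+    // per-functor vtable indexed by the node's type_index instead.
+    Expr VisitExpr(const Expr& expr) {
+      if (const auto* node = expr.as<CallNode>()) return VisitExpr_(node);
+      if (const auto* node = expr.as<IfNode>()) return VisitExpr_(node);
+      if (const auto* node = expr.as<TupleNode>()) return VisitExpr_(node);
+      // ... one case per expression type ...
+      LOG(FATAL) << "unhandled expression type";
+      return Expr();
+    }
+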
+Although C++ already provides dynamic dispatch, ``ExprFunctor`` defines its +own vtable, which ``VisitExpr`` uses. By defining our own vtable, we have +more control over dispatch. For example, if we wanted to define a +``PrintVisitor`` traverser that printed "Here" before every visit, we +could override ``VisitExpr``: + +.. code:: c + + void PrintVisitor::VisitExpr(const Expr& expr) { + std::cout << "Here" << std::endl; + ExprFunctor::VisitExpr(expr); + } + +``ExprFunctor`` itself is a very general class, which is why more often than +not, you will be extending ``ExprVisitor`` or ``ExprMutator``. These classes +extend ``ExprFunctor`` and provide default implementations of ``VisitExpr_`` +that capture common traversal patterns for each expression type. Having these +default implementations means we only need to provide overriding +implementations for the expression types where we want different behavior. We +describe each subclass on its own in the following sections. + +Expression Visitors +~~~~~~~~~~~~~~~~~~~ + +``ExprVisitor`` is for passes that don't modify the program and instead +perform program analyses and collect information. With this class, +``VisitExpr`` and the private counterparts return nothing. The ``VisitExpr_`` +implementations provided by this class simply visit all of the expression's +fields that are expressions. The default implementation for ``IfNode`` is +shown below. + +.. code:: c + + void ExprVisitor::VisitExpr_(const IfNode* op) { + this->VisitExpr(op->cond); + this->VisitExpr(op->true_branch); + this->VisitExpr(op->false_branch); + } + +Note that we're calling ``VisitExpr`` and not ``VisitExpr_`` here, so we can +use the vtable in ``ExprFunctor`` for routing. + +Now, if we wanted to write a class ``CallChecker`` that checks if any +function calls appear in the program, we would only need to extend +``ExprVisitor`` and define the following ``VisitExpr_`` method: + +.. code:: c + + void VisitExpr_(const CallNode* n) final { + result_ = true; + } + +where ``result_`` is a field. In this case, we don't need to further recurse +on the fields of the ``CallNode``, because ``result_`` is already true and we +now know the original expression contains a call. To make this visitor +usable, we would provide the following public method: + +.. code:: c + + bool Check(const Expr& expr) final { + result_ = false; + VisitExpr(expr); + return result_; + } + +And that's all we need. It is very common to define a public interface that +performs some bookkeeping before invoking the top-level recursion. We could +of course further wrap the API by making a standalone procedure that creates +a ``CallChecker`` instance and calls ``Check`` on it, but the takeaway is +that we've achieved our goal with very little effort. + +Expression Mutators +~~~~~~~~~~~~~~~~~~~ + +``ExprMutator`` is for passes that transform the program in some way. With +this class, ``VisitExpr`` and its private counterparts return ``Expr``. The +default ``VisitExpr_`` implementations provided by this class visit all of +the expression's fields that are expressions and set the fields to be the +result of visiting them. The default implementation for ``TupleGetItemNode`` +is shown below. + +.. code:: c + + Expr ExprMutator::VisitExpr_(const TupleGetItemNode* g) { + auto t = this->Mutate(g->tuple); + if (g->tuple == t) { + return GetRef(g); + } else { + return TupleGetItemNode::make(t, g->index); + } + } + +There are a few things to notice here. First, ``Mutate`` is an alias for +``VisitExpr`` in ``ExprMutator``. 
Second, we only return a new node if the +call to ``Mutate`` modified the ``tuple`` field. This method of update is +called a functional update and doing so avoids unnecessary allocations. + +One feature ``ExprMutator`` has that ``ExprVisitor`` doesn't is a built-in +``memo_`` field for caching results. It makes sense that ``ExprMutator`` has +a memoizer, because we know which types of results we're caching (i.e., +``Expr``), whereas the visit methods of ``ExprVisitor`` don't return +anything. Usually, when we want to cache results in a subclass of +``ExprVisitor``, we need to define the cache ourselves. + +Now, if we wanted to write a class ``IfCollapser`` that replaces every if +statement with its true branch, we would override ``VisitExpr_`` for +``IfNode``: + +.. code:: c + + Expr ExprMutator::VisitExpr_(const IfNode* op) { + return this->Mutate(op->true_branch); + } + +Note that the returned expression will not necessarily be an ``IfNode``, and +this is fine, because the return type is ``Expr``. Now, we create the public +interface: + +.. code:: c + + Expr CollapseIfs(const Expr& expr) final { + return this->Mutate(expr); + } + +With this mutator, we didn't need to do any bookkeeping, but we still want to +follow the convention of having a descriptive method as the interface. + +Example: Constant Folding +------------------------- + +In order to better understand the process of writing a pass, we will look at +the constant folding pass (found in ``src/relay/pass/fold_constant.cc`` and +in ``python/tvm/relay/ir_pass.py``) as a guide, because it is a relatively +simple pass that incorporates both types of traversals. + +Constant folding involves evaluating expressions in the program that only +involve constant values, then replacing those expressions with the result +of evaluating them. The goal of this pass is to frontload all of the +computations that we can. To achieve this, the constant folding pass makes +use of a visitor (``ConstantChecker``) and a mutator (``ConstantFolder``). + +The ``ConstantChecker`` Visitor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This visitor is used to check if an expression is constant. In Relay, we +define an expression to be constant if it is a ``ConstantNode`` or it is a +``TupleNode`` with only constant fields. + +We use a ``memo_`` field to map from nodes to whether they are constant and +to cache these results. Below are the ``VisitExpr_`` definitions in the +``ConstantChecker``. + +.. code:: c + + void VisitExpr_(const ConstantNode* n) final { + memo_[GetRef(n)] = true; + } + + void VisitExpr_(const TupleNode* n) final { + bool result = true; + for (const auto& field : n->fields) { + if (!Check(field)) { + result = false; + break; + } + } + memo_[GetRef(n)] = result; + } + +The bookkeeping used to coordinate these definitions is a ``Check`` method +that returns whether the given expression is considered constant. + +.. code:: c + + bool Check(const Expr& expr) { + const auto it = memo_.find(expr); + if (it != memo_.end()) + return it->second; + VisitExpr(expr); + return memo_[expr]; + } + +We don't modify ``memo_`` for every node we encounter; instead we only modify +``memo_`` when the encountered node could potentially be constant. Then we +rely on the default value being false when ``memo_`` doesn't contain +``expr``. + +The ``ConstantFolder`` Mutator +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This mutator performs the bulk of the constant folding pass and internally +uses ``ConstantChecker``. 
+In Relay, there are three node types that are involved in constant folding:
+``LetNode``, ``TupleGetItemNode``, and ``CallNode``. In the following
+paragraphs, we explain the roles of each in the pass.
+
+.. code:: c
+
+    Expr VisitExpr_(const LetNode* op) final {
+      Expr value = this->Mutate(op->value);
+      if (value.as<ConstantNode>()) {
+        memo_[op->var] = value;
+        return this->Mutate(op->body);
+      } else {
+        Var var = Downcast<Var>(this->Mutate(op->var));
+        Expr body = this->Mutate(op->body);
+        if (var.same_as(op->var) &&
+            value.same_as(op->value) &&
+            body.same_as(op->body)) {
+          return GetRef<Expr>(op);
+        } else {
+          return LetNode::make(var, value, body);
+        }
+      }
+    }
+
+In the ``LetNode`` case, we first attempt to const-fold the value being bound
+in the expression. If we can, then we populate ``memo_`` and return the
+result of visiting the body---essentially, propagating the bound value to its
+use sites in the body. If we can't const-fold the bound value, we mimic the
+default implementation.
+
+.. code:: c
+
+    Expr VisitExpr_(const TupleGetItemNode* op) final {
+      Expr res = ExprMutator::VisitExpr_(op);
+      op = res.as<TupleGetItemNode>();
+      if (const auto* tuple = op->tuple.as<TupleNode>()) {
+        return tuple->fields[op->index];
+      } else {
+        return res;
+      }
+    }
+
+In the ``TupleGetItemNode`` case, we check whether the ``op->tuple`` field is
+a ``TupleNode``. If so, we replace the tuple get with the field of the tuple
+pointed to by ``op->index``. The reason we need to check is that
+``op->tuple`` might evaluate to a tuple at runtime without itself being a
+``TupleNode``.
+
+.. code:: c
+
+    Expr VisitExpr_(const CallNode* call) final {
+      static auto op_stateful = Op::GetAttr<TOpIsStateful>("TOpIsStateful");
+      Expr res = ExprMutator::VisitExpr_(call);
+      call = res.as<CallNode>();
+      // We don't constant fold function with zero arguments.
+      // This is a heuristic that is useful.
+      // For example it is harmful to fold ones(shape=(4, 5)).
+      if (call->args.size() == 0) return res;
+      const OpNode* op = call->op.as<OpNode>();
+      if (op == nullptr) return res;
+      // skip stateful ops.
+      if (op_stateful.get(GetRef<Op>(op), false)) return res;
+      bool all_const_args = true;
+      for (Expr arg : call->args) {
+        if (!checker_.Check(arg)) {
+          all_const_args = false;
+        }
+      }
+      if (all_const_args) {
+        return ConstEvaluate(res);
+      } else {
+        return res;
+      }
+    }
+
+In the ``CallNode`` case, we first use the ``VisitExpr_`` of ``ExprMutator``
+to visit the call, which const-folds all of the fields of the call. We use
+``ExprMutator::VisitExpr_`` instead of ``VisitExpr``, because we want to
+bypass the vtable (to avoid an infinite loop) and use the default
+implementation provided by ``ExprMutator``. Then we evaluate the call only if
+all of the arguments are constant (using ``ConstantChecker``). Evaluating the
+call produces a **value**, so we use a helper method ``ValueToExpr`` to allow
+us to place the evaluated expression back into the AST.
+
+Now, we construct the public interface ``FoldConstant`` to our constant
+folder, which is a standalone function outside of the ``ConstantFolder``
+class. ``FoldConstant`` takes an expression and internally creates and uses a
+``ConstantFolder`` instance (the full definition can be found in
+``src/relay/pass/fold_constant.cc``).
+
+To allow other C++ modules to use our pass, we declare the public interface
+in ``include/tvm/relay/pass.h``:
+
+.. code:: c
+
+    TVM_DLL Expr FoldConstant(const Expr& expr);
+
+Registering an API Endpoint
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+With the AST traversers written, the pass can be registered to become a TVM
+API endpoint with the following code snippet:
+
+..
code:: c + + TVM_REGISTER_API("relay._ir_pass.FoldConstant") + .set_body([](TVMArgs args, TVMRetValue *ret) { + *ret = FoldConstant(args[0]); + }); + +And the pass can now be used in C++ and Python, though it's a good idea to +wrap the API in Python, as described in :ref:`relay-add-op`. More detail +about registration can be found in :ref:`tvm-runtime-system`. diff --git a/docs/dev/runtime.md b/docs/dev/runtime.rst similarity index 56% rename from docs/dev/runtime.md rename to docs/dev/runtime.rst index 317ba5d54e75..3efb71d6ae30 100644 --- a/docs/dev/runtime.md +++ b/docs/dev/runtime.rst @@ -1,28 +1,31 @@ - - - - - - - - - - - - - - - - - -# TVM Runtime System +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at +.. +.. http://www.apache.org/licenses/LICENSE-2.0 +.. +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _tvm-runtime-system: + +TVM Runtime System +================== TVM supports multiple programming languages for the compiler stack development and deployment. In this note, we explain the key elements of the TVM runtime. -![](http://www.tvm.ai/images/release/tvm_flexible.png) +.. image:: http://www.tvm.ai/images/release/tvm_flexible.png -We need to satisfy quite a few interesting requirements +We need to satisfy quite a few interesting requirements: - Deployment: invoke the compiled function from python/javascript/c++ language. - Debug: define a function in python and call that from a compiled function. @@ -34,30 +37,34 @@ We need to satisfy quite a few interesting requirements We want to be able to define a function from any language and call from another. We also want the runtime core to be minimal to deploy to embedded devices. -## PackedFunc +PackedFunc +---------- -[PackedFunc](https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/packed_func.h) is a simple but elegant solution +`PackedFunc`_ is a simple but elegant solution we find to solve the challenges listed. The following code block provides an example in C++ -```c++ -#include - -void MyAdd(TVMArgs args, TVMRetValue* rv) { - // automatically convert arguments to desired type. - int a = args[0]; - int b = args[1]; - // automatically assign value return to rv - *rv = a + b; -} - -void CallPacked() { - PackedFunc myadd = PackedFunc(MyAdd); - // get back 3 - int c = myadd(1, 2); -} -``` +.. _PackedFunc: https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/packed_func.h + +.. code:: c + + #include + + void MyAdd(TVMArgs args, TVMRetValue* rv) { + // automatically convert arguments to desired type. + int a = args[0]; + int b = args[1]; + // automatically assign value return to rv + *rv = a + b; + } + + void CallPacked() { + PackedFunc myadd = PackedFunc(MyAdd); + // get back 3 + int c = myadd(1, 2); + } + In the above codeblock, we defined a PackedFunc MyAdd. It takes two arguments -: ```args``` represents input arguments and ```rv``` represents return value. 
+: ``args`` represents input arguments and ``rv`` represents return value. The function is type-erased, which means that the function signature does not restrict which input type to pass in or type to return. Under the hood, when we call a PackedFunc, it packs the input arguments to TVMArgs on stack, and gets the result back via TVMRetValue. @@ -65,21 +72,23 @@ and gets the result back via TVMRetValue. Thanks to template tricks in C++, we can call a PackedFunc just like a normal function. Because of its type-erased nature, we can call a PackedFunc from dynamic languages like python, without additional glue code for each new type function created. The following example registers PackedFunc in C++ and calls from python. -```c++ -// register a global packed function in c++ -TVM_REGISTER_GLOBAL("myadd") -.set_body(MyAdd); -``` -```python -import tvm +.. code:: c -myadd = tvm.get_global_func("myadd") -# prints 3 -print(myadd(1, 2)) -``` + // register a global packed function in c++ + TVM_REGISTER_GLOBAL("myadd") + .set_body(MyAdd); -Most of the magic of PackedFunc lies in ```TVMArgs``` and ```TVMRetValue``` structure. -We restrict a list of possible types which can be passed, here are the common ones +.. code:: python + + import tvm + + myadd = tvm.get_global_func("myadd") + # prints 3 + print(myadd(1, 2)) + +Most of the magic of PackedFunc lies in ``TVMArgs`` and ``TVMRetValue`` structure. +We restrict a list of possible types which can be passed. +Here are the common ones: - int, float and string - PackedFunc itself @@ -92,43 +101,54 @@ Despite being minimum, the PackedFunc is sufficient for the use-case of deep lea most functions only take DLTensor or numbers. Since one PackedFunc can take another PackedFunc as an argument, -we can pass functions from python(as PackedFunc) to C++. -```c++ -TVM_REGISTER_GLOBAL("callhello") -.set_body([](TVMArgs args, TVMRetValue* rv) { - PackedFunc f = args[0]; - f("hello world"); -}); -``` -```python -import tvm - -def callback(msg): - print(msg) - -# convert to PackedFunc -f = tvm.convert(callback) -callhello = tvm.get_global_func("callhello") -# prints hello world -callhello(f) -``` - -TVM provides a [minimum C API](https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/c_runtime_api.h), +we can pass functions from python (as PackedFunc) to C++. + +.. code:: c + + TVM_REGISTER_GLOBAL("callhello") + .set_body([](TVMArgs args, TVMRetValue* rv) { + PackedFunc f = args[0]; + f("hello world"); + }); + +.. code:: python + + import tvm + + def callback(msg): + print(msg) + + # convert to PackedFunc + f = tvm.convert(callback) + callhello = tvm.get_global_func("callhello") + # prints hello world + callhello(f) + +TVM provides a `minimum C API`_, which allows us to embed the PackedFunc into any languages. Besides python, so far we supported -[java](https://github.com/dmlc/tvm/tree/master/jvm) and [javascript](https://github.com/dmlc/tvm/tree/master/web). +`java`_ and `javascript`_. This philosophy of embedded API is very like Lua, except that we don't have a new language but use C++. +.. _minimum C API: https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/c_runtime_api.h +.. _java: https://github.com/dmlc/tvm/tree/master/jvm +.. _javascript: https://github.com/dmlc/tvm/tree/master/web + + One fun fact about PackedFunc is that we use it for both compiler and deployment stack. 
-- All TVM's compiler pass functions are exposed to frontend as PackedFunc, see [here](https://github.com/dmlc/tvm/tree/master/src/api) + +- All TVM's compiler pass functions are exposed to frontend as PackedFunc, see `here`_ - The compiled module also returns the compiled function as PackedFunc +.. _here: https://github.com/dmlc/tvm/tree/master/src/api + To keep the runtime minimum, we isolated the IR Node support from the deployment runtime. The resulting runtime takes around 200K - 600K depending on how many runtime driver modules (e.g., CUDA) get included. The overhead of calling into PackedFunc vs. a normal function is small, as it is only saving a few values on the stack. So it is OK as long as we don't wrap small functions. In summary, the PackedFunc is the universal glue in TVM where we use it extensively to support our compiler and deployment. -## Module +Module +------ Since TVM supports multiple types of devices, we need to support different type of drivers. We have to use the driver API to load the kernel, set up the argument in packed format and perform kernel launch. @@ -136,28 +156,34 @@ We also need to patch up the driver API so that the exposed functions are thread So we often need to implement these driver glues in C++ and expose them to the user. We can certainly not do it for each type of functions, so again PackedFunc is our answer. -TVM defines the compiled object as [Module](https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/module.h). +TVM defines the compiled object as `Module`_. The user can get the compiled function from Module as PackedFunc. The generated compiled code can dynamically get function from Module in runtime. It caches the function handle in the first call and reuses in subsequent calls. We use this to link device code and callback into any PackedFunc(e.g., python) from generated code. +.. _Module: https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/module.h + The ModuleNode is an abstract class that can be implemented by each type of device. So far we support modules for CUDA, Metal, OpenCL and loading dynamic shared libraries. This abstraction makes introduction of new device easy, and we do not need to redo the host code generation for each type of device. -## Remote Deployment +Remote Deployment +----------------- The PackedFunc and Module system also makes it easy to ship the function into remote devices directly. Under the hood, we have an RPCModule that serializes the arguments to do the data movement and launches the computation on the remote. -![](http://www.tvm.ai/images/release/tvm_rpc.png) +.. image:: http://www.tvm.ai/images/release/tvm_rpc.png The RPC server itself is minimum and can be bundled into the runtime. We can start a minimum TVM RPC server on iPhone/android/raspberry pi or even the browser. The cross compilation on server and shipping of the module for testing can be done in the same script. Checkout -[Cross compilation and RPC tutorial](http://docs.tvm.ai/tutorials/deployment/cross_compilation_and_rpc.html#sphx-glr-tutorials-deployment-cross-compilation-and-rpc-py) for more details. +`Cross compilation and RPC tutorial`_ for more details. + +.. _Cross compilation and RPC tutorial: http://docs.tvm.ai/tutorials/deployment/cross_compilation_and_rpc.html#sphx-glr-tutorials-deployment-cross-compilation-and-rpc-py This instant feedback gives us a lot of advantages. 
For example, to test the correctness of generated code on iPhone, we no longer have to write test-cases in swift/objective-c from scratch -- We can use RPC to execute on iPhone, copy the result back and do verification on the host via numpy. We can also do the profiling using the same script. -## TVM Node and Compiler Stack +TVM Node and Compiler Stack +--------------------------- As we mentioned earlier, we build compiler stack API on top of the PackedFunc runtime system. We faced a constant changing of the compiler API for the need of research. We need a new language object or IR node whenever we want to test out new primitives. @@ -166,89 +192,101 @@ However, we don't want to change our API from time to time. Besides that, we als - be able to serialize any language object and IRs - be able to explore, print, and manipulate the IR objects in front-end language to do quick prototyping. -We introduced a base class, called [Node](https://github.com/dmlc/HalideIR/blob/master/src/tvm/node.h#L52) to solve this problem. +We introduced a base class, called `Node`_ to solve this problem. All the language object in the compiler stack is a subclass of Node. Each node contains a string type_key that uniquely identifies the type of object. We choose string instead of int as type key so new Node class can be added in the decentralized fashion without adding the code back to the central repo. To ease the speed of dispatching, we allocate an integer type_index at runtime for each type_key. +.. _Node: https://github.com/dmlc/HalideIR/blob/master/src/tvm/node.h#L52 + Since usually one Node object could be referenced in multiple places in the language, we use a shared_ptr to keep track of reference. We use NodeRef class to represent a reference to the Node. We can roughly view NodeRef class as shared_ptr to the Node container. We can also define subclass NodeRef to hold each subtypes of Node. Each Node class needs to define the VisitAttr function. -```c++ -class AttrVisitor { - public: - virtual void Visit(const char* key, double* value) = 0; - virtual void Visit(const char* key, int64_t* value) = 0; - virtual void Visit(const char* key, uint64_t* value) = 0; - virtual void Visit(const char* key, int* value) = 0; - virtual void Visit(const char* key, bool* value) = 0; - virtual void Visit(const char* key, std::string* value) = 0; - virtual void Visit(const char* key, void** value) = 0; - virtual void Visit(const char* key, Type* value) = 0; - virtual void Visit(const char* key, NodeRef* value) = 0; - // ... -}; - -class Node { - public: - virtual void VisitAttrs(AttrVisitor* visitor) {} - // ... -}; -``` +.. code:: c + + class AttrVisitor { + public: + virtual void Visit(const char* key, double* value) = 0; + virtual void Visit(const char* key, int64_t* value) = 0; + virtual void Visit(const char* key, uint64_t* value) = 0; + virtual void Visit(const char* key, int* value) = 0; + virtual void Visit(const char* key, bool* value) = 0; + virtual void Visit(const char* key, std::string* value) = 0; + virtual void Visit(const char* key, void** value) = 0; + virtual void Visit(const char* key, Type* value) = 0; + virtual void Visit(const char* key, NodeRef* value) = 0; + // ... + }; + + class Node { + public: + virtual void VisitAttrs(AttrVisitor* visitor) {} + // ... + }; Each Node subclass will override this to visit its members. Here is an example implementation of TensorNode. -```c++ -class TensorNode : public Node { - public: - /*! \brief The shape of the tensor */ - Array shape; - /*! 
\brief data type in the content of the tensor */ - Type dtype; - /*! \brief the source operation, can be None */ - Operation op; - /*! \brief the output index from source operation */ - int value_index{0}; - /*! \brief constructor */ - TensorNode() {} - - void VisitAttrs(AttrVisitor* v) final { - v->Visit("shape", &shape); - v->Visit("dtype", &dtype); - v->Visit("op", &op); - v->Visit("value_index", &value_index); - } -}; -``` -In the above examples, both ```Operation``` and ```Array``` are NodeRef. + +.. code:: c + + class TensorNode : public Node { + public: + /*! \brief The shape of the tensor */ + Array shape; + /*! \brief data type in the content of the tensor */ + Type dtype; + /*! \brief the source operation, can be None */ + Operation op; + /*! \brief the output index from source operation */ + int value_index{0}; + /*! \brief constructor */ + TensorNode() {} + + void VisitAttrs(AttrVisitor* v) final { + v->Visit("shape", &shape); + v->Visit("dtype", &dtype); + v->Visit("op", &op); + v->Visit("value_index", &value_index); + } + }; + +In the above examples, both ``Operation`` and ``Array`` are NodeRef. The VisitAttrs gives us a reflection API to visit each member of the object. We can use this function to visit the node and serialize any language object recursively. It also allows us to get members of an object easily in front-end language. For example, in the following code, we accessed the op field of the TensorNode. -```python -import tvm +.. code:: python + import tvm -x = tvm.placeholder((3,4), name="x") -# access the op field of TensorNode -print(x.op.name) -``` + x = tvm.placeholder((3,4), name="x") + # access the op field of TensorNode + print(x.op.name) New Node can be added to C++ without changing the front-end runtime, making it easy to make extensions to the compiler stack. Note that this is not the fastest way to expose members to front-end language, but might be one of the simplest approaches possible. We also find that it fits our purposes as we mainly use python for testing and prototyping and still use c++ to do the heavy lifting job. -## Implementation Details +Implementation Details +---------------------- -Each argument in PackedFunc contains a union value [TVMValue](https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/c_runtime_api.h#L122) +Each argument in PackedFunc contains a union value `TVMValue`_ and a type code. This design allows the dynamically typed language to convert to the corresponding type directly, and statically typed language to do runtime type checking during conversion. +.. _TVMValue: https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/c_runtime_api.h#L122 + The relevant files are -- [packed_func.h](https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/packed_func.h) for C++ API -- [c_runtime_api.cc](https://github.com/dmlc/tvm/blob/master/src/runtime/c_runtime_api.cc#L262) for C API and how to provide callback. + +- `packed_func.h`_ for C++ API +- `c_runtime_api.cc`_ for C API and how to provide callback. + +.. _packed_func.h: https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/packed_func.h +.. _c_runtime_api.cc: https://github.com/dmlc/tvm/blob/master/src/runtime/c_runtime_api.cc#L262 To support extension types, we used a registry system to register type related information, like support of any -in C++, see [Extension types](https://github.com/dmlc/tvm/tree/master/apps/extension) for more details. +in C++, see `Extension types`_ for more details. + +.. 
_Extension types: https://github.com/dmlc/tvm/tree/master/apps/extension diff --git a/src/relay/pass/fold_constant.cc b/src/relay/pass/fold_constant.cc index 5bfee6cfe9f6..9f0d60bf788f 100644 --- a/src/relay/pass/fold_constant.cc +++ b/src/relay/pass/fold_constant.cc @@ -35,8 +35,11 @@ using FInterpreter = runtime::TypedPackedFunc; class ConstantChecker : private ExprVisitor { public: - // Check whether an expression is constant. The results are memorized. + // Check whether an expression is constant. The results are memoized. bool Check(const Expr& expr) { + // The `ConstantNode` case is common enough that we check directly for the + // case here, to avoid the time overhead of dispatching through the vtable + // and the space overhead of memoizing always-true results. if (expr.as()) { return true; } @@ -44,7 +47,7 @@ class ConstantChecker : private ExprVisitor { if (it != memo_.end()) return it->second; VisitExpr(expr); - return memo_[expr]; // return memorized result or the default value false + return memo_[expr]; // return memoized result or the default value false } private: From 1db51e399799df6b1ead4c5c38aa8c277ab85a9b Mon Sep 17 00:00:00 2001 From: tkat0 Date: Tue, 16 Apr 2019 10:11:51 +0900 Subject: [PATCH 015/106] [DOC] Add Android Tutorial (#2977) * fix APP_STL for latest android ndk * add vulkan sdk for tutorial * add android tutorial * fix of invalid input layer name * update relay build opt_level 1 -> 3 --- .../app/src/main/jni/Application.mk | 2 +- docker/Dockerfile.demo_android | 23 +- tutorials/frontend/deploy_model_on_android.py | 342 ++++++++++++++++++ 3 files changed, 364 insertions(+), 3 deletions(-) create mode 100644 tutorials/frontend/deploy_model_on_android.py diff --git a/apps/android_rpc/app/src/main/jni/Application.mk b/apps/android_rpc/app/src/main/jni/Application.mk index f142e2995777..aef7629990c2 100644 --- a/apps/android_rpc/app/src/main/jni/Application.mk +++ b/apps/android_rpc/app/src/main/jni/Application.mk @@ -12,7 +12,7 @@ include $(config) # 1) armeabi is deprecated in NDK r16 and removed in r17 # 2) vulkan is not supported in armeabi APP_ABI ?= armeabi-v7a arm64-v8a x86 x86_64 mips -APP_STL := c++_static +APP_STL := c++_shared APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++11 -Oz -frtti ifeq ($(USE_OPENCL), 1) diff --git a/docker/Dockerfile.demo_android b/docker/Dockerfile.demo_android index 743f0a5fc938..e9c3e4f6ce8e 100644 --- a/docker/Dockerfile.demo_android +++ b/docker/Dockerfile.demo_android @@ -44,9 +44,28 @@ RUN bash /install/ubuntu_install_gradle.sh COPY install/ubuntu_install_androidsdk.sh /install/ubuntu_install_androidsdk.sh RUN bash /install/ubuntu_install_androidsdk.sh +COPY install/ubuntu_install_vulkan.sh /install/ubuntu_install_vulkan.sh +RUN bash /install/ubuntu_install_vulkan.sh + +ENV VULKAN_SDK /usr/local/VulkanSDK/1.0.65.0/x86_64 +ENV PATH ${PATH}:${VULKAN_SDK}/bin +ENV LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:${VULKAN_SDK}/lib +ENV VK_LAYER_PATH ${VULKAN_SDK}/etc/explicit_layer.d + # Build TVM -COPY install/install_tvm_cpu.sh /install/install_tvm_cpu.sh -RUN bash /install/install_tvm_cpu.sh +RUN cd /usr && \ + git clone --depth=1 https://github.com/dmlc/tvm --recursive && \ + cd /usr/tvm && \ + mkdir -p build && \ + cd build && \ + cmake \ + -DUSE_LLVM=llvm-config-6.0 \ + -DUSE_RPC=ON \ + -DUSE_SORT=ON \ + -DUSE_GRAPH_RUNTIME=ON \ + -DUSE_VULKAN=ON \ + .. 
&& \
+    make -j10

 # Environment variables
 ENV PYTHONPATH=/usr/tvm/python:/usr/tvm/topi/python:/usr/tvm/nnvm/python/:/usr/tvm/vta/python:${PYTHONPATH}
diff --git a/tutorials/frontend/deploy_model_on_android.py b/tutorials/frontend/deploy_model_on_android.py
new file mode 100644
index 000000000000..4ec72f6c4c5a
--- /dev/null
+++ b/tutorials/frontend/deploy_model_on_android.py
@@ -0,0 +1,342 @@
+"""
+.. _tutorial-deploy-model-on-android:
+
+Deploy the Pretrained Model on Android
+======================================
+**Author**: `Tomohiro Kato `_
+
+This is an example of using Relay to compile a keras model and deploy it on an Android device.
+"""
+
+import os
+import numpy as np
+from PIL import Image
+import keras
+from keras.applications.mobilenet_v2 import MobileNetV2
+import tvm
+import tvm.relay as relay
+from tvm import rpc
+from tvm.contrib import util, ndk, graph_runtime as runtime
+from tvm.contrib.download import download_testdata
+
+
+######################################################################
+# Setup Environment
+# --------------------
+# Since there are many required packages for Android, it is recommended to use the official Docker image.
+#
+# First, to build and run the Docker image, we can run the following commands.
+#
+# .. code-block:: bash
+#
+#   git clone --recursive https://github.com/dmlc/tvm
+#   cd tvm
+#   docker build -t tvm.demo_android -f docker/Dockerfile.demo_android ./docker
+#   docker run --pid=host -h tvm -v $PWD:/workspace \
+#          -w /workspace -p 9190:9190 --name tvm -it tvm.demo_android bash
+#
+# You are now inside the container. The cloned tvm directory is mounted on /workspace.
+# The command above also maps port 9190, which will be used by the RPC tracker described later.
+#
+# .. note::
+#
+#   Please execute the following steps in the container.
+#   We can execute :code:`docker exec -it tvm bash` to open a new terminal in the container.
+#
+# Next we build TVM.
+#
+# .. code-block:: bash
+#
+#   mkdir build
+#   cd build
+#   cmake -DUSE_LLVM=llvm-config-6.0 \
+#         -DUSE_RPC=ON \
+#         -DUSE_SORT=ON \
+#         -DUSE_VULKAN=ON \
+#         -DUSE_GRAPH_RUNTIME=ON \
+#         ..
+#   make -j10
+#
+# After building TVM successfully, please set PYTHONPATH.
+#
+# .. code-block:: bash
+#
+#   echo 'export PYTHONPATH=/workspace/python:/workspace/topi/python:/workspace/nnvm/python/:/workspace/vta/python:${PYTHONPATH}' >> ~/.bashrc
+#   source ~/.bashrc

+#################################################################
+# Start RPC Tracker
+# -----------------
+# TVM uses an RPC session to communicate with Android devices.
+#
+# To start an RPC tracker, run this command in the container. The tracker is
+# required during the whole tuning process, so we need to open a new terminal for
+# this command:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
+#
+# The expected output is
+#
+# .. code-block:: bash
+#
+#   INFO:RPCTracker:bind to 0.0.0.0:9190

+#################################################################
+# Register Android device to RPC Tracker
+# ---------------------------------------
+# Now we can register our Android device to the tracker.
+#
+# Follow this `readme page `_ to
+# install the tvm rpc apk on the android device.
+#
+# Here is an example of config.mk. I enabled OpenCL and Vulkan.
+#
+#
+# ..
code-block:: bash +# +# APP_ABI = arm64-v8a +# +# APP_PLATFORM = android-24 +# +# # whether enable OpenCL during compile +# USE_OPENCL = 1 +# +# # whether to enable Vulkan during compile +# USE_VULKAN = 1 +# +# ifeq ($(USE_VULKAN), 1) +# # Statically linking vulkan requires API Level 24 or higher +# APP_PLATFORM = android-24 +# endif +# +# # the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc +# ADD_C_INCLUDES += /work/adrenosdk-linux-5_0/Development/Inc +# # download from https://github.com/KhronosGroup/OpenCL-Headers +# ADD_C_INCLUDES += /workspace/3rdparty/OpenCL-Headers/ +# +# # the additional link libs you want to add, e.g., ANDROID_LIB_PATH/libOpenCL.so +# ADD_LDLIBS = /workspace/pull-from-android-device/libOpenCL.so +# +# .. note:: +# +# At this time, don't forget to `create a standalone toolchain `_ . +# +# for example +# +# .. code-block:: bash +# +# /opt/android-sdk-linux/ndk-bundle/build/tools/make-standalone-toolchain.sh \ +# --platform=android-24 --use-llvm --arch=arm64 --install-dir=/opt/android-toolchain-arm64 +# export TVM_NDK_CC=/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++ +# +# Next, start the Android application and enter the IP address and port of RPC Tracker. +# Then you have already registered your device. +# +# After registering devices, we can confirm it by querying rpc_tracker +# +# .. code-block:: bash +# +# python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190 +# +# For example, if we have 1 Android device. +# the output can be +# +# .. code-block:: bash +# +# Queue Status +# ---------------------------------- +# key total free pending +# ---------------------------------- +# android 1 1 0 +# ---------------------------------- +# +# To confirm that you can communicate with Android, we can run following test script. +# If you use OpenCL and Vulkan, please set :code:`test_opencl` and :code:`test_vulkan` in the script. +# +# .. code-block:: bash +# +# export TVM_TRACKER_HOST=0.0.0.0 +# export TVM_TRACKER_PORT=9190 +# +# .. code-block:: bash +# +# cd /workspace/apps/android_rpc +# python tests/android_rpc_test.py +# + +###################################################################### +# Load pretrained keras model +# ---------------------------- +# We load a pretrained MobileNetV2(alpha=0.5) classification model provided by keras. +keras.backend.clear_session() # Destroys the current TF graph and creates a new one. +weights_url = ''.join(['https://github.com/JonathanCMitchell/', + 'mobilenet_v2_keras/releases/download/v1.1/', + 'mobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.5_224.h5']) +weights_file = 'mobilenet_v2_weights.h5' +weights_path = download_testdata(weights_url, weights_file, module='keras') +keras_mobilenet_v2 = MobileNetV2(alpha=0.5, include_top=True, weights=None, + input_shape=(224, 224, 3), classes=1000) +keras_mobilenet_v2.load_weights(weights_path) + +###################################################################### +# In order to test our model, here we download an image of cat and +# transform its format. 
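+# (A note on the preprocessing below: the resize to 224x224 matches the input
+# resolution this MobileNetV2 build expects, and the mean/std numbers in
+# transform_image are the standard ImageNet normalization constants applied
+# at training time.)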
+img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true' +img_name = 'cat.png' +img_path = download_testdata(img_url, img_name, module='data') +image = Image.open(img_path).resize((224, 224)) +dtype = 'float32' + +def transform_image(image): + image = np.array(image) - np.array([123., 117., 104.]) + image /= np.array([58.395, 57.12, 57.375]) + image = image.transpose((2, 0, 1)) + image = image[np.newaxis, :] + return image + +x = transform_image(image) + +###################################################################### +# synset is used to transform the label from number of ImageNet class to +# the word human can understand. +synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/', + '4d0b62f3d01426887599d4f7ede23ee5/raw/', + '596b27d23537e5a1b5751d2b0481ef172f58b539/', + 'imagenet1000_clsid_to_human.txt']) +synset_name = 'imagenet1000_clsid_to_human.txt' +synset_path = download_testdata(synset_url, synset_name, module='data') +with open(synset_path) as f: + synset = eval(f.read()) + + +###################################################################### +# Compile the model with relay +# --------------------------------------------- +# If we run the example on our x86 server for demonstration, we can simply +# set it as :code:`llvm`. If running it on the Android device, we need to +# specify its instruction set. Set :code:`local_demo` to False if you want +# to run this tutorial with a real device. + +local_demo = True + +# by default on CPU target will execute. +# select 'cpu', 'opencl' and 'vulkan' +test_target = 'cpu' + +# Change target configuration. +# Run `adb shell cat /proc/cpuinfo` to find the arch. +arch = 'arm64' +target = 'llvm -target=%s-linux-android' % arch +target_host = None + +if local_demo: + target_host = None + target = 'llvm' +elif test_target == 'opencl': + target_host = target + target = 'opencl' +elif test_target == 'vulkan': + target_host = target + target = 'vulkan' + +input_name = 'input_1' +shape_dict = {input_name: x.shape} +func, params = relay.frontend.from_keras(keras_mobilenet_v2, shape_dict) + +with relay.build_config(opt_level=3): + graph, lib, params = relay.build(func, target=target, + target_host=target_host, params=params) + +# After `relay.build`, you will get three return values: graph, +# library and the new parameter, since we do some optimization that will +# change the parameters but keep the result of model as the same. + +# Save the library at local temporary directory. +tmp = util.tempdir() +lib_fname = tmp.relpath('net.so') +fcompile = ndk.create_shared if not local_demo else None +lib.export_library(lib_fname, fcompile) + +###################################################################### +# Deploy the Model Remotely by RPC +# --------------------------------------------- +# With RPC, you can deploy the model remotely from your host machine +# to the remote android device. 
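+#
+# The tracker address below is read from the environment, so the same script
+# works for the local demo and for a real device; set TVM_TRACKER_HOST and
+# TVM_TRACKER_PORT to point at the tracker started earlier.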
+ +tracker_host = os.environ.get('TVM_TRACKER_HOST', '0.0.0.0') +tracker_port = int(os.environ.get('TVM_TRACKER_PORT', 9190)) +key = 'android' + +if local_demo: + remote = rpc.LocalSession() +else: + tracker = rpc.connect_tracker(tracker_host, tracker_port) + # When running a heavy model, we should increase the `session_timeout` + remote = tracker.request(key, priority=0, + session_timeout=60) + +if local_demo: + ctx = remote.cpu(0) +elif test_target == 'opencl': + ctx = remote.cl(0) +elif test_target == 'vulkan': + ctx = remote.vulkan(0) +else: + ctx = remote.cpu(0) + +# upload the library to remote device and load it +remote.upload(lib_fname) +rlib = remote.load_module('net.so') + +# create the remote runtime module +module = runtime.create(graph, rlib, ctx) + +###################################################################### +# Execute on TVM +# --------------------------------------------- + +# set parameter (upload params to the remote device. This may take a while) +module.set_input(**params) +# set input data +module.set_input(input_name, tvm.nd.array(x.astype(dtype))) +# run +module.run() +# get output +out = module.get_output(0) + +# get top1 result +top1 = np.argmax(out.asnumpy()) +print('TVM prediction top-1: {}'.format(synset[top1])) + +print('Evaluate inference time cost...') +ftimer = module.module.time_evaluator('run', ctx, number=1, repeat=10) +prof_res = np.array(ftimer().results) * 1000 # convert to millisecond +print('Mean inference time (std dev): %.2f ms (%.2f ms)' % (np.mean(prof_res), + np.std(prof_res))) + +###################################################################### +# Sample Output +# --------------------------------------------- +# The following is the result of 'cpu', 'opencl' and 'vulkan' using Adreno 530 on Snapdragon 820 +# +# Although we can run on a GPU, it is slower than CPU. +# To speed up, we need to write and optimize the schedule according to the GPU architecture. +# +# .. code-block:: bash +# +# # cpu +# TVM prediction top-1: tiger cat +# Evaluate inference time cost... +# Mean inference time (std dev): 37.92 ms (19.67 ms) +# +# # opencl +# TVM prediction top-1: tiger cat +# Evaluate inference time cost... +# Mean inference time (std dev): 419.83 ms (7.49 ms) +# +# # vulkan +# TVM prediction top-1: tiger cat +# Evaluate inference time cost... 
+# Mean inference time (std dev): 465.80 ms (4.52 ms)

From 1d71097311af5be7368dbd043c44b06fc8d28ad3 Mon Sep 17 00:00:00 2001
From: Sergei Grechanik
Date: Tue, 16 Apr 2019 19:37:45 +0300
Subject: [PATCH 016/106] [ARITH] Fix x||!x for comparisons in rewrite
 simplifier (#3029)

---
 src/arithmetic/rewrite_simplify.cc                   | 2 +-
 tests/python/unittest/test_arith_rewrite_simplify.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/arithmetic/rewrite_simplify.cc b/src/arithmetic/rewrite_simplify.cc
index 1ebb328925d0..6098faa44846 100644
--- a/src/arithmetic/rewrite_simplify.cc
+++ b/src/arithmetic/rewrite_simplify.cc
@@ -1169,7 +1169,7 @@ Mutate_(const Or* op, const Expr& self) {
   TVM_TRY_REWRITE(x != y || x == y, ctrue);
   TVM_TRY_REWRITE(x || !x, ctrue);
   TVM_TRY_REWRITE(x <= y || y < x, ctrue);
-  TVM_TRY_REWRITE(y < x || y <= x, ctrue);
+  TVM_TRY_REWRITE(y < x || x <= y, ctrue);
 
   TVM_TRY_REWRITE_IF(x < c1 || c2 < x, ctrue,
                      c2.Eval()->value < c1.Eval()->value);
diff --git a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py
index e752d7c632ab..be961a5c6543 100644
--- a/tests/python/unittest/test_arith_rewrite_simplify.py
+++ b/tests/python/unittest/test_arith_rewrite_simplify.py
@@ -508,10 +508,10 @@ def test_logical_simplify():
               tvm.const(True, "bool"))
     ck.verify(tvm.expr.Or(tvm.expr.NE(x, y), tvm.expr.EQ(x, y)),
               tvm.const(True, "bool"))
-    ck.verify(tvm.expr.Or(x > y, tvm.expr.Not(x < y)), tvm.const(True, "bool"))
+    ck.verify(tvm.expr.Or(x > y, tvm.expr.Not(x > y)), tvm.const(True, "bool"))
 
     ck.verify(tvm.expr.Or(x <= y, y < x), tvm.const(True, "bool"))
-    ck.verify(tvm.expr.Or(y < x, y <= x), tvm.const(True, "bool"))
+    ck.verify(tvm.expr.Or(y < x, y >= x), tvm.const(True, "bool"))
 
     ck.verify(tvm.expr.Or(x < 1, 0 < x), tvm.const(True, "bool"))
     ck.verify(tvm.expr.Or(0 < x, x < 1), tvm.const(True, "bool"))

From 295b33dbba64276a5890155dbbbeec166809a6e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?=
Date: Tue, 16 Apr 2019 12:37:49 -0700
Subject: [PATCH 017/106] [Relay] Fix BatchMatMulRel typerelation (#3032)

Returning false means "retry in the future"; in the case of an actual error,
it should be reported ASAP, not retried.
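
A sketch of the convention assumed here (illustrative, not the exact source):
a type relation returns false when its inputs are not yet inferred so the
solver will revisit it later, while a CHECK fails immediately on a malformed
program:

    // not enough type information yet: ask the solver to retry later
    if (x == nullptr || y == nullptr) return false;
    // a real error in the program should fail immediately instead
    CHECK(x->shape.size() == 3 && y->shape.size() == 3);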
--- src/relay/op/nn/nn.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 2356634c4ed0..ae256629f3b1 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -658,7 +658,7 @@ bool BatchMatmulRel(const Array& types, const auto* x = types[0].as(); const auto* y = types[1].as(); if (x == nullptr || y == nullptr) return false; - if (x->shape.size() != 3 || y->shape.size() != 3) return false; + CHECK(x->shape.size() == 3 && y->shape.size() == 3); CHECK(reporter->AssertEQ(x->shape[0], y->shape[0])) << "BatchDot: batch dimension doesn't match, " << " x shape=" << x->shape From c68b39f6a7ec3c30ee5a3502b590ab8f791f1c0f Mon Sep 17 00:00:00 2001 From: Alexander Pivovarov Date: Tue, 16 Apr 2019 13:28:37 -0700 Subject: [PATCH 018/106] Simplify TF get_output_names (#3025) --- python/tvm/relay/frontend/tensorflow_parser.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow_parser.py b/python/tvm/relay/frontend/tensorflow_parser.py index a9be162f2074..9cb7eabf0ea5 100644 --- a/python/tvm/relay/frontend/tensorflow_parser.py +++ b/python/tvm/relay/frontend/tensorflow_parser.py @@ -85,11 +85,9 @@ def _get_output_names(self): tags, self._model_dir) output_names = set() - for k in meta_graph_def.signature_def.keys(): - outputs_tensor_info = meta_graph_def.signature_def[k].outputs - for output_tensor in outputs_tensor_info.values(): - output_names.add(output_tensor.name) - output_names = [i.replace(":0", "") for i in output_names] + for sig_def in meta_graph_def.signature_def.values(): + for output_tensor in sig_def.outputs.values(): + output_names.add(output_tensor.name.replace(":0", "")) return ",".join(output_names) def _load_saved_model(self): From e9466fbef6c6529132866cc5b4a230d6b65eeb7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?= Date: Tue, 16 Apr 2019 13:43:10 -0700 Subject: [PATCH 019/106] Update expr.h (#3031) --- include/tvm/relay/expr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h index 1d2fa5472993..cb4f4ddece99 100644 --- a/include/tvm/relay/expr.h +++ b/include/tvm/relay/expr.h @@ -521,7 +521,7 @@ RELAY_DEFINE_NODE_REF(RefWrite, RefWriteNode, Expr); * rewriting pass such as layout or type transformation. * * Subclass TempExprNode allows us to pattern match on - * specific kind TempExpr and use them for expression rewriting. + * specific kind of TempExpr and use them for expression rewriting. 
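+ * specific kind of TempExpr and use them for expression rewriting.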
* * TempExpr should only be used within a pass, */ From f141d34423a7f3eec7a4487a31400626e2ebf527 Mon Sep 17 00:00:00 2001 From: hlu1 <14827759+hlu1@users.noreply.github.com> Date: Tue, 16 Apr 2019 13:43:37 -0700 Subject: [PATCH 020/106] Add caffe2 nnvm frontend to CI (#3018) --- nnvm/tests/python/frontend/caffe2/test_forward.py | 13 ++++++------- tests/scripts/task_python_frontend.sh | 7 +++++-- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/nnvm/tests/python/frontend/caffe2/test_forward.py b/nnvm/tests/python/frontend/caffe2/test_forward.py index 9f3c5c9fa85a..2a216314ba1a 100644 --- a/nnvm/tests/python/frontend/caffe2/test_forward.py +++ b/nnvm/tests/python/frontend/caffe2/test_forward.py @@ -45,7 +45,6 @@ def get_tvm_output(model, graph, lib, params = nnvm.compiler.build( sym, target, shape=shape_dict, dtype=dtype_dict, params=params) - ctx = tvm.cpu(0) m = graph_runtime.create(graph, lib, ctx) # set inputs @@ -89,21 +88,21 @@ def verify_caffe2_forward_impl(model, data_shape, out_shape): tvm.testing.assert_allclose(c2_out, tvm_out, rtol=1e-5, atol=1e-5) -def verify_squeezenet1_1(): +def test_squeezenet1_1(): verify_caffe2_forward_impl(c2_squeezenet, (1, 3, 224, 224), (1, 1000, 1, 1)) -def verify_resnet50(): +def test_resnet50(): verify_caffe2_forward_impl(c2_resnet50, (1, 3, 224, 224), (1, 1000)) -def verify_vgg19(): +def test_vgg19(): verify_caffe2_forward_impl(c2_vgg19, (1, 3, 224, 224), (1, 1000)) if __name__ == '__main__': - verify_squeezenet1_1() - verify_resnet50() - verify_vgg19() + test_squeezenet1_1() + test_resnet50() + test_vgg19() diff --git a/tests/scripts/task_python_frontend.sh b/tests/scripts/task_python_frontend.sh index 1679595e712b..37159dbc9a58 100755 --- a/tests/scripts/task_python_frontend.sh +++ b/tests/scripts/task_python_frontend.sh @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -50,6 +50,9 @@ python3 -m nose -v nnvm/tests/python/frontend/tensorflow echo "Running nnvm CoreML frontend test..." python3 -m nose -v nnvm/tests/python/frontend/coreml +echo "Running nnvm Caffe2 frontend test..." +python3 -m nose -v nnvm/tests/python/frontend/caffe2 + echo "Running nnvm DarkNet frontend test..." python3 -m nose -v nnvm/tests/python/frontend/darknet || exit -1 From 4d64ff2c84f068b0e59e2d003389c7d56e531f34 Mon Sep 17 00:00:00 2001 From: "Steven S. Lyubomirsky" Date: Tue, 16 Apr 2019 13:44:30 -0700 Subject: [PATCH 021/106] Ensure interpreted functions can take values that are not TensorValues (#3015) --- python/tvm/relay/backend/interpreter.py | 8 +++- .../python/relay/test_backend_interpreter.py | 43 +++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/backend/interpreter.py b/python/tvm/relay/backend/interpreter.py index ddcbd79122e0..bb43b278639a 100644 --- a/python/tvm/relay/backend/interpreter.py +++ b/python/tvm/relay/backend/interpreter.py @@ -24,7 +24,7 @@ from .. import _make, ir_pass from ... 
import register_func, nd from ..base import NodeBase, register_relay_node -from ..expr import Call, Constant, GlobalVar, Function, const +from ..expr import Tuple, RefCreate, Call, Constant, GlobalVar, Function, const from ..scope_builder import ScopeBuilder class Value(NodeBase): @@ -112,6 +112,12 @@ def __init__(self, value): def _arg_to_ast(arg): if isinstance(arg, TensorValue): return Constant(arg.data.copyto(nd.cpu(0))) + elif isinstance(arg, TupleValue): + return Tuple([_arg_to_ast(field) for field in arg.fields]) + elif isinstance(arg, RefValue): + return RefCreate(_arg_to_ast(arg.value)) + elif isinstance(arg, ConstructorValue): + return Call(arg.constructor, [_arg_to_ast(field) for field in arg.fields]) elif isinstance(arg, np.ndarray): return Constant(nd.array(arg)) elif isinstance(arg, Constant): diff --git a/tests/python/relay/test_backend_interpreter.py b/tests/python/relay/test_backend_interpreter.py index da794e25ab56..5d8ceb4c7bdc 100644 --- a/tests/python/relay/test_backend_interpreter.py +++ b/tests/python/relay/test_backend_interpreter.py @@ -19,6 +19,7 @@ import tvm.testing from tvm import relay from tvm.relay.backend.interpreter import Value, TupleValue, TensorValue +from tvm.relay.backend.interpreter import RefValue, ConstructorValue from tvm.relay.scope_builder import ScopeBuilder from tvm.relay import testing, create_executor @@ -156,6 +157,7 @@ def test_tensor_value(): xx = np.ones((1, 10)).astype("float32") check_eval(relay.Function([x], x), [TensorValue(xx)], xx) + def test_kwargs_params(): x = relay.var("x", shape=(1, 10)) y = relay.var("y", shape=(1, 10)) @@ -170,6 +172,46 @@ def test_kwargs_params(): tvm.testing.assert_allclose(res.asnumpy(), x_data + y_data + z_data) +def test_function_taking_adt_ref_tuple(): + mod = relay.Module() + prelude = relay.prelude.Prelude(mod) + intrp = create_executor("debug", mod) + + nil_value = ConstructorValue(prelude.nil, [], []) + cons_value = ConstructorValue(prelude.cons, [ + TensorValue(np.random.rand(1, 10).astype('float32')), + nil_value + ], [relay.TensorType((1, 10), 'float32')]) + + ref_value = RefValue(TensorValue(np.random.rand(1, 10).astype('float32'))) + tuple_value = TupleValue(*[ + TensorValue(np.random.rand(1, 10).astype('float32')) for _ in range(10) + ]) + + id_func = intrp.evaluate(prelude.id) + + res_nil = id_func(nil_value) + assert res_nil.constructor == nil_value.constructor + assert len(res_nil.fields) == 0 + + res_cons = id_func(cons_value) + assert res_cons.constructor == cons_value.constructor + assert len(res_cons.fields) == len(cons_value.fields) + tvm.testing.assert_allclose(res_cons.fields[0].asnumpy(), + cons_value.fields[0].asnumpy()) + assert isinstance(res_cons.fields[1], ConstructorValue) + assert res_cons.fields[1].constructor == prelude.nil + assert len(res_cons.fields[1].fields) == 0 + + res_ref = id_func(ref_value) + tvm.testing.assert_allclose(res_ref.value.asnumpy(), ref_value.value.asnumpy()) + + res_tuple = id_func(tuple_value) + for i in range(10): + tvm.testing.assert_allclose(res_tuple.fields[i].asnumpy(), + tuple_value.fields[i].asnumpy()) + + if __name__ == "__main__": test_id() test_add_const() @@ -181,3 +223,4 @@ def test_kwargs_params(): test_kwargs_params() test_ref() test_tensor_value() + test_function_taking_adt_ref_tuple() From 8cc9fa26279f390fb957761d304eebc11dd876f5 Mon Sep 17 00:00:00 2001 From: Pedro Larroy Date: Tue, 16 Apr 2019 17:16:23 -0700 Subject: [PATCH 022/106] Update dmlc-core, fix default ctors of NodeEntry (#3017) --- 3rdparty/dmlc-core | 2 +- 
nnvm/include/nnvm/node.h | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 2b5b1ba9c110..3ffea8694adf 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 2b5b1ba9c1103f438d164aca32da7cffd8cd48e8 +Subproject commit 3ffea8694adf9c0363f9abbf162dc0e4a45b22c5 diff --git a/nnvm/include/nnvm/node.h b/nnvm/include/nnvm/node.h index 782afba0a5ab..103e2783714a 100644 --- a/nnvm/include/nnvm/node.h +++ b/nnvm/include/nnvm/node.h @@ -56,8 +56,18 @@ struct NodeEntry { version(version) {} + explicit NodeEntry(NodePtr node): + node(std::move(node)), + index(), + version() + {} + + /** + * MXNet assumes that a node with a null ptr doesn't have a gradient attached. Don't change this + * constructor. + */ NodeEntry(): - node(), + node(nullptr), index(), version() {} From a35ea0465a8c6ee39ffdd8c81a23ec00b9b80f42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?= Date: Tue, 16 Apr 2019 22:33:31 -0700 Subject: [PATCH 023/106] [Relay] Fix Fuse (#3035) * save * fix * Update fuse_ops.cc --- src/relay/pass/fuse_ops.cc | 10 +++++++++- tests/python/relay/test_backend_interpreter.py | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index 6de9c2d65f90..12e3174dcade 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -865,9 +865,17 @@ class FuseMutator : private ExprMutator { } Expr MakeNewFunction(GraphPartitioner::Group* group, Type ret_type, Expr body) { + // If the function has no call, it is not a primitive function. + struct HasCallVisitor : ExprVisitor { + bool has_call = false; + void VisitExpr_(const CallNode* op) final { + has_call = true; + } + } visitor; + visitor(body); const GroupInfo& ginfo = ginfo_[group]; auto func = FunctionNode::make(ginfo.params, body, ret_type, {}); - func = FunctionSetAttr(func, "Primitive", tvm::Integer(1)); + func = FunctionSetAttr(func, "Primitive", tvm::Integer(visitor.has_call)); return CallNode::make(func, ginfo.arguments, Attrs()); } diff --git a/tests/python/relay/test_backend_interpreter.py b/tests/python/relay/test_backend_interpreter.py index 5d8ceb4c7bdc..e8a99e14d741 100644 --- a/tests/python/relay/test_backend_interpreter.py +++ b/tests/python/relay/test_backend_interpreter.py @@ -51,6 +51,12 @@ def test_tuple_value(): np.testing.assert_allclose(tv[2].asnumpy(), 3) +def test_tuple_getitem(): + two = relay.add(relay.const(1), relay.const(1)) + func = relay.Function([], relay.TupleGetItem(relay.Tuple([relay.const(1), relay.const(2)]), 0)) + check_eval(func, [], 1) + + def test_id(): x = relay.var('x', 'float32') ident = relay.Function([x], x) @@ -223,4 +229,6 @@ def test_function_taking_adt_ref_tuple(): test_kwargs_params() test_ref() test_tensor_value() - test_function_taking_adt_ref_tuple() + test_tuple_value() + test_tuple_getitem() + test_function_taking_adt_ref_tuple() \ No newline at end of file From 0f66c3cb124d726875e3ed647d3cc5c1b75ff462 Mon Sep 17 00:00:00 2001 From: Balint Cristian Date: Wed, 17 Apr 2019 17:20:41 +0300 Subject: [PATCH 024/106] Support Deriving channels when it is not provided in AlterLayout. 
(#2972) --- .../nnvm_to_relay/test_alter_conv2d.py | 87 +++++++++++++++++++ topi/python/topi/arm_cpu/conv2d.py | 4 + topi/python/topi/cuda/conv2d_winograd.py | 4 + topi/python/topi/intel_graphics/conv2d.py | 6 +- topi/python/topi/x86/conv2d.py | 7 +- 5 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 tests/python/frontend/nnvm_to_relay/test_alter_conv2d.py diff --git a/tests/python/frontend/nnvm_to_relay/test_alter_conv2d.py b/tests/python/frontend/nnvm_to_relay/test_alter_conv2d.py new file mode 100644 index 000000000000..a03868550160 --- /dev/null +++ b/tests/python/frontend/nnvm_to_relay/test_alter_conv2d.py @@ -0,0 +1,87 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Test alter conv2d layout pass""" +import tvm +import nnvm + +from tvm import relay +from tvm import autotvm +from tvm.relay.ir_pass import infer_type, alpha_equal + + +def test_alter_layout_conv2d(): + """Additional layout transformations should occour on the graph. + """ + + def convnet(): + """Alternating layout of simple convnet (from image super-resolution). 
+ """ + bias1 = relay.var('bias1', shape=(64,)) + bias2 = relay.var('bias2', shape=(64,)) + bias3 = relay.var('bias3', shape=(64,)) + bias4 = relay.var('bias4', shape=(64,)) + weight1 = relay.var('weight1', shape=(64, 1, 5, 5)) + weight2 = relay.var('weight2', shape=(64, 64, 3, 3)) + weight3 = relay.var('weight3', shape=(64, 64, 3, 3)) + weight4 = relay.var('weight4', shape=(64, 64, 3, 3)) + data = relay.var("x", shape=(1, 1, 224, 224)) + n00 = relay.nn.conv2d(data, weight1, padding=[2, 2], kernel_size=[5, 5]) + n01 = relay.expand_dims(bias1, axis=1, num_newaxis=2) + n02 = relay.add(n00, n01) + n03 = relay.nn.relu(n02) + n04 = relay.nn.conv2d(n03, weight2, padding=[1, 1], kernel_size=[3, 3]) + n05 = relay.expand_dims(bias2, axis=1, num_newaxis=2) + n06 = relay.add(n04, n05) + n07 = relay.nn.relu(n06) + n08 = relay.nn.conv2d(n07, weight3, padding=[1, 1], kernel_size=[3, 3]) + n09 = relay.expand_dims(bias3, axis=1, num_newaxis=2) + n10 = relay.add(n08, n09) + n11 = relay.nn.relu(n10) + n12 = relay.nn.conv2d(n11, weight4, padding=[1, 1], kernel_size=[3, 3]) + n13 = relay.expand_dims(bias4, axis=1, num_newaxis=2) + n14 = relay.add(n12, n13) + n15 = relay.reshape(n14, newshape=[1, 1, 3, 3, 224, 224]) + n16 = relay.transpose(n15, axes=[0, 1, 4, 2, 5, 3]) + net = relay.reshape(n16, newshape=[1, 1, 672, 672]) + args = relay.ir_pass.free_vars(net) + return relay.Function(args, net) + + # orig net + N = convnet() + N = infer_type(N) + + # trigger a test + # for each known alter_conv2d + targets=['cuda', + 'opencl -device=mali', + 'opencl -device=intel_graphics', + 'llvm -device=arm_cpu', + 'llvm -device=core-avx-ii'] + + for tgt in targets: + with tvm.target.create(tgt) as target: + with relay.build_config(opt_level=-1, add_pass='AlterOpLayout'): + with autotvm.tophub.context(target): + O = relay.optimize(N, target, params=None) + O = relay.ir_pass.infer_type(O) + + # graph should differ + assert not relay.ir_pass.alpha_equal(N, O) + +if __name__ == "__main__": + np.random.seed(42) + test_alter_layout_conv2d() diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py index 3e1ebaa647e3..7bfa715fb182 100644 --- a/topi/python/topi/arm_cpu/conv2d.py +++ b/topi/python/topi/arm_cpu/conv2d.py @@ -700,6 +700,10 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F): new_attrs = {k: attrs[k] for k in attrs.keys()} + if F == tvm.relay.op: + # Derive channels for frontends (e.g ONNX) that miss "channel" field. + new_attrs["channels"] = inputs[1].checked_type.shape[attrs['kernel_layout'].index('O')] + dilation = attrs.get_int_tuple("dilation") assert attrs.get_int_tuple("dilation") == (1, 1), "Does not support dilation " \ "when alter_op_layout is enabled" diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index 7fe3a4d7e3d6..cd9a78574593 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ b/topi/python/topi/cuda/conv2d_winograd.py @@ -379,6 +379,10 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, F): if "target" in new_attrs: del new_attrs["target"] + if F == tvm.relay.op: + # Derive channels for frontends (e.g ONNX) that miss "channel" field. 
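To make the layout-indexing trick used in these hunks concrete: the position of 'O' in the kernel layout string picks out the output-channel dimension of the kernel tensor. A minimal stand-in with plain Python values (hypothetical shape; in the patch itself the shape entries are symbolic TVM expressions taken from `inputs[1].checked_type`):

    kernel_shape = (64, 32, 3, 3)                      # assumed OIHW kernel: 64 output channels
    kernel_layout = 'OIHW'
    channels = kernel_shape[kernel_layout.index('O')]  # 'O' sits at index 0
    assert channels == 64
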
+ new_attrs["channels"] = inputs[1].checked_type.shape[attrs['kernel_layout'].index('O')] + strides = attrs.get_int_tuple("strides") padding = attrs.get_int_tuple("padding") dilation = attrs.get_int_tuple("dilation") diff --git a/topi/python/topi/intel_graphics/conv2d.py b/topi/python/topi/intel_graphics/conv2d.py index 0cdbe365dda3..5de49a9b147c 100644 --- a/topi/python/topi/intel_graphics/conv2d.py +++ b/topi/python/topi/intel_graphics/conv2d.py @@ -73,7 +73,11 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, F): break new_attrs = {k: attrs[k] for k in attrs.keys()} - new_attrs['kernel_layout'] = 'OIHW%do' % (oc_bn) + new_attrs["kernel_layout"] = 'OIHW%do' % (oc_bn) + + if F == tvm.relay.op: + # Derive channels for frontends (e.g ONNX) that miss "channel" field. + new_attrs["channels"] = inputs[1].checked_type.shape[attrs['kernel_layout'].index('O')] # Remove attached compilation target because conv2d_NCHWc needs to create # a conv2d_nchwc op and target is not one of conv2d's parameters. diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 5806590d266a..adb45de0f33f 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -327,11 +327,16 @@ def _alter_conv2d_layout(attrs, inputs, tinfo, F): copy_inputs = [s for s in inputs] new_attrs = {k : attrs[k] for k in attrs.keys()} + + if F == tvm.relay.op: + # Derive channels for frontends (e.g ONNX) that miss "channel" field. + new_attrs["channels"] = inputs[1].checked_type.shape[attrs['kernel_layout'].index('O')] + data, kernel = tinfo[0], tinfo[1] batch_size, in_channel, height, width = get_const_tuple(data.shape) groups = attrs.get_int("groups") - out_channel = attrs.get_int("channels") if F == sym else attrs.get_int("channels").value + out_channel = attrs.get_int("channels") if F == sym else new_attrs["channels"] padding = attrs.get_int_tuple("padding") strides = attrs.get_int_tuple("strides") dilation = attrs.get_int_tuple("dilation") From 60b6d268d5316f76f903063768390cf99f218f5f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 17 Apr 2019 13:06:30 -0700 Subject: [PATCH 025/106] Implement relay nn.bias_add compute in C++ (#3027) * Implement nn.bias_add compute in C++ * Address comments * Remove unnecessary check --- python/tvm/relay/op/nn/_nn.py | 14 --------- src/relay/op/nn/nn.cc | 12 +++++-- topi/include/topi/nn/bias_add.h | 56 +++++++++++++++++++++++++++++++++ topi/src/topi.cc | 11 +++++-- 4 files changed, 74 insertions(+), 19 deletions(-) create mode 100644 topi/include/topi/nn/bias_add.h diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 5a47b1d42ed3..e60c01cfb3ff 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -182,20 +182,6 @@ def schedule_conv2d_transpose(attrs, outs, target): reg.register_pattern("nn.conv2d_transpose", OpPattern.OUT_ELEMWISE_FUSABLE) # bias_add -@reg.register_compute("nn.bias_add") -def compute_bias_add(attrs, inputs, out_dtype, target): - """Compute definition of conv2d_transpose""" - axis = attrs.axis - bias = inputs[1] - data_ndim = len(inputs[0].shape) - if axis < 0: - axis = axis + data_ndim - num_newaxis = data_ndim - axis - 1 - - if num_newaxis: - bias = topi.expand_dims(bias, axis=1, num_newaxis=num_newaxis) - return [topi.add(inputs[0], bias)] - reg.register_schedule("nn.bias_add", schedule_injective) reg.register_pattern("nn.bias_add", OpPattern.BROADCAST) diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index ae256629f3b1..4141e602d6bc 100644 --- 
a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -90,7 +91,12 @@ RELAY_REGISTER_OP("nn.bias_add") .add_argument("data", "nD Tensor", "Input data.") .add_argument("bias", "1D Tensor", "Bias.") .set_support_level(1) -.add_type_rel("BiasAdd", BiasAddRel); +.add_type_rel("BiasAdd", BiasAddRel) +.set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, + const Type& out_type, const Target& target) { + const auto* param = attrs.as(); + return tvm::Array{topi::nn::bias_add(inputs[0], inputs[1], param->axis)}; +}); // relay.nn.dense diff --git a/topi/include/topi/nn/bias_add.h b/topi/include/topi/nn/bias_add.h new file mode 100644 index 000000000000..fb4ae30ca404 --- /dev/null +++ b/topi/include/topi/nn/bias_add.h @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \brief bias_add op constructions + * \file nn/bias_add.h + */ +#ifndef TOPI_NN_BIAS_ADD_H_ +#define TOPI_NN_BIAS_ADD_H_ + +#include + +#include "topi/tags.h" +#include "topi/broadcast.h" +#include "topi/transform.h" +#include "tvm/tvm.h" + +namespace topi { +namespace nn { + +/*! +* \brief Creates an operation that calculates data + bias +* +* \param data Tensor with shape [batch, in_dim] +* \param bias Tensor with shape [batch]. +* +* \return Tensor with shape [batch, in_dim] +*/ +inline tvm::Tensor bias_add(const tvm::Tensor& data, const tvm::Tensor& bias, int axis) { + int data_ndim = data->shape.size(); + if (axis < 0) { + axis += data_ndim; + } + int num_newaxis = data_ndim - axis - 1; + return add(data, (num_newaxis ? expand_dims(bias, 1, num_newaxis) : bias)); +} +} // namespace nn +} // namespace topi +#endif // TOPI_NN_BIAS_ADD_H_ diff --git a/topi/src/topi.cc b/topi/src/topi.cc index 47e999c74587..c583f1c115c2 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -400,6 +401,12 @@ TVM_REGISTER_GLOBAL("topi.nn.dense") *rv = nn::dense(args[0], args[1], args[2]); }); +/* Ops from nn/bias_add.h */ +TVM_REGISTER_GLOBAL("topi.nn.bias_add") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = nn::bias_add(args[0], args[1], args[2]); + }); + /* Ops from nn/batch_matmul.h */ TVM_REGISTER_GLOBAL("topi.nn.batch_matmul") .set_body([](TVMArgs args, TVMRetValue *rv) { From c66a3ff7a03014df86b83cb071ded923d6d45493 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?= Date: Wed, 17 Apr 2019 13:16:37 -0700 Subject: [PATCH 026/106] [Relay] Add printing for ADT Type (#3030) * Update pretty_printer.cc * Update pretty_printer.cc --- src/relay/ir/pretty_printer.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/relay/ir/pretty_printer.cc b/src/relay/ir/pretty_printer.cc index f4a830040f70..71502614abd6 100644 --- a/src/relay/ir/pretty_printer.cc +++ b/src/relay/ir/pretty_printer.cc @@ -592,6 +592,22 @@ class PrettyPrinter : return AllocTypeVar(GetRef(node)); } + Doc VisitType_(const GlobalTypeVarNode* node) final { + return Doc(node->var->name_hint); + } + + Doc VisitType_(const TypeCallNode* node) final { + Doc doc = PrintType(node->func, false); + std::vector args; + for (const Type& t : node->args) { + args.push_back(PrintType(t, false)); + } + doc << "["; + doc << PrintVec(args); + doc << "]"; + return doc; + } + Doc VisitType_(const TensorTypeNode* node) final { // scalar type if (node->shape.size() == 0) { From 1d068a46e8f12df153e5f8a606d22dd77561dfbc Mon Sep 17 00:00:00 2001 From: Balint Cristian Date: Fri, 19 Apr 2019 01:14:00 +0300 Subject: [PATCH 027/106] Additional fix for PR#2972 (#3044) --- topi/python/topi/arm_cpu/conv2d.py | 2 +- topi/python/topi/cuda/conv2d_winograd.py | 2 +- topi/python/topi/intel_graphics/conv2d.py | 5 ++--- topi/python/topi/x86/conv2d.py | 12 ++++++------ 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py index 7bfa715fb182..463214fa5bfd 100644 --- a/topi/python/topi/arm_cpu/conv2d.py +++ b/topi/python/topi/arm_cpu/conv2d.py @@ -700,7 +700,7 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F): new_attrs = {k: attrs[k] for k in attrs.keys()} - if F == tvm.relay.op: + if F.__name__ == 'tvm.relay.op': # Derive channels for frontends (e.g ONNX) that miss "channel" field. new_attrs["channels"] = inputs[1].checked_type.shape[attrs['kernel_layout'].index('O')] diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index cd9a78574593..3a530a016e37 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ b/topi/python/topi/cuda/conv2d_winograd.py @@ -379,7 +379,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, F): if "target" in new_attrs: del new_attrs["target"] - if F == tvm.relay.op: + if F.__name__ == 'tvm.relay.op': # Derive channels for frontends (e.g ONNX) that miss "channel" field. 
new_attrs["channels"] = inputs[1].checked_type.shape[attrs['kernel_layout'].index('O')] diff --git a/topi/python/topi/intel_graphics/conv2d.py b/topi/python/topi/intel_graphics/conv2d.py index 5de49a9b147c..c3967ea8488b 100644 --- a/topi/python/topi/intel_graphics/conv2d.py +++ b/topi/python/topi/intel_graphics/conv2d.py @@ -54,7 +54,6 @@ def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None @conv2d_alter_layout.register(["intel_graphics"]) def _alter_conv2d_layout(attrs, inputs, tinfos, F): - import nnvm.symbol as sym copy_inputs = [s for s in inputs] @@ -75,7 +74,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, F): new_attrs = {k: attrs[k] for k in attrs.keys()} new_attrs["kernel_layout"] = 'OIHW%do' % (oc_bn) - if F == tvm.relay.op: + if F.__name__ == 'tvm.relay.op': # Derive channels for frontends (e.g ONNX) that miss "channel" field. new_attrs["channels"] = inputs[1].checked_type.shape[attrs['kernel_layout'].index('O')] @@ -84,7 +83,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, F): if "target" in new_attrs: del new_attrs["target"] - if F == sym: + if F.__name__ == 'nnvm.symbol': out = F.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) else: out = F.nn.contrib_conv2d_nchwc(*copy_inputs, **new_attrs) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index adb45de0f33f..4d4b3fef4826 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -323,12 +323,11 @@ def _topi_nn_conv2d_NCHWc(*args, **kwargs): @conv2d_alter_layout.register("cpu") def _alter_conv2d_layout(attrs, inputs, tinfo, F): - import nnvm.symbol as sym copy_inputs = [s for s in inputs] new_attrs = {k : attrs[k] for k in attrs.keys()} - if F == tvm.relay.op: + if F.__name__ == 'tvm.relay.op': # Derive channels for frontends (e.g ONNX) that miss "channel" field. 
new_attrs["channels"] = inputs[1].checked_type.shape[attrs['kernel_layout'].index('O')] @@ -336,13 +335,14 @@ def _alter_conv2d_layout(attrs, inputs, tinfo, F): batch_size, in_channel, height, width = get_const_tuple(data.shape) groups = attrs.get_int("groups") - out_channel = attrs.get_int("channels") if F == sym else new_attrs["channels"] + out_channel = attrs.get_int("channels") \ + if F.__name__ == 'nnvm.symbol' else new_attrs["channels"] padding = attrs.get_int_tuple("padding") strides = attrs.get_int_tuple("strides") dilation = attrs.get_int_tuple("dilation") out_dtype = attrs["out_dtype"] - layout_name = 'layout' if F == sym else 'data_layout' + layout_name = 'layout' if F.__name__ == 'nnvm.symbol' else 'data_layout' layout = attrs[layout_name] kh, kw = attrs.get_int_tuple("kernel_size") @@ -404,12 +404,12 @@ def _alter_conv2d_layout(attrs, inputs, tinfo, F): dispatch_ctx.update(target, new_workload, cfg) if is_depthwise: - if F == sym: + if F.__name__ == 'nnvm.symbol': logging.warning("Use native layout for depthwise convolution on NNVM.") return None return F.nn.contrib_depthwise_conv2d_nchwc(*copy_inputs, **new_attrs) else: - if F == sym: + if F.__name__ == 'nnvm.symbol': return F.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) return F.nn.contrib_conv2d_nchwc(*copy_inputs, **new_attrs) From 80b04c629f9b4f7df1d1d7642a9eff9aa284e189 Mon Sep 17 00:00:00 2001 From: Siju Date: Fri, 19 Apr 2019 03:50:11 +0530 Subject: [PATCH 028/106] Bugfix for path issues (#3038) --- nnvm/python/nnvm/testing/yolo_detection.py | 8 ++++---- nnvm/tutorials/from_darknet.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/nnvm/python/nnvm/testing/yolo_detection.py b/nnvm/python/nnvm/testing/yolo_detection.py index 9ecb49ae04f0..bdf9efe62de4 100644 --- a/nnvm/python/nnvm/testing/yolo_detection.py +++ b/nnvm/python/nnvm/testing/yolo_detection.py @@ -165,7 +165,7 @@ def do_nms_sort(dets, classes, thresh): if _box_iou(a, b) > thresh: dets[j]['prob'][k] = 0 -def draw_detections(im, dets, thresh, names, classes): +def draw_detections(font_path, im, dets, thresh, names, classes): "Draw the markings around the detected region" for det in dets: labelstr = [] @@ -198,7 +198,7 @@ def draw_detections(im, dets, thresh, names, classes): if bot > imh-1: bot = imh-1 _draw_box_width(im, left, top, right, bot, width, red, green, blue) - label = _get_label(''.join(labelstr), rgb) + label = _get_label(font_path, ''.join(labelstr), rgb) _draw_label(im, top + width, left, label, rgb) def _get_pixel(im, x, y, c): @@ -223,7 +223,7 @@ def _draw_label(im, r, c, label, rgb): val = _get_pixel(label, i, j, k) _set_pixel(im, i+c, j+r, k, val)#rgb[k] * val) -def _get_label(labelstr, rgb): +def _get_label(font_path, labelstr, rgb): from PIL import Image from PIL import ImageDraw from PIL import ImageFont @@ -231,7 +231,7 @@ def _get_label(labelstr, rgb): text = labelstr colorText = "black" testDraw = ImageDraw.Draw(Image.new('RGB', (1, 1))) - font = ImageFont.truetype("arial.ttf", 25) + font = ImageFont.truetype(font_path, 25) width, height = testDraw.textsize(labelstr, font=font) img = Image.new('RGB', (width, height), color=(int(rgb[0]*255), int(rgb[1]*255), int(rgb[2]*255))) diff --git a/nnvm/tutorials/from_darknet.py b/nnvm/tutorials/from_darknet.py index 607af1038628..857ef46015cd 100644 --- a/nnvm/tutorials/from_darknet.py +++ b/nnvm/tutorials/from_darknet.py @@ -153,7 +153,7 @@ # do the detection and bring up the bounding boxes thresh = 0.5 nms_thresh = 0.45 -img = 
nnvm.testing.darknet.load_image_color(test_image) +img = nnvm.testing.darknet.load_image_color(img_path) _, im_h, im_w = img.shape dets = nnvm.testing.yolo_detection.fill_network_boxes((netw, neth), (im_w, im_h), thresh, 1, tvm_out) @@ -172,6 +172,6 @@ names = [x.strip() for x in content] -nnvm.testing.yolo_detection.draw_detections(img, dets, thresh, names, last_layer.classes) +nnvm.testing.yolo_detection.draw_detections(font_path, img, dets, thresh, names, last_layer.classes) plt.imshow(img.transpose(1, 2, 0)) plt.show() From 6b2247bedc5a15afcfc99d5b69fab2f35ee04f24 Mon Sep 17 00:00:00 2001 From: Yong Wu <55wuyong@163.com> Date: Thu, 18 Apr 2019 18:07:25 -0700 Subject: [PATCH 029/106] [Relay][Frontend] TF Tile Round Sign Pow Exp Reverse (#2960) * [Relay][Frontend] TF Round Sign Pow Exp Reverse * fix ci * fix comments --- python/tvm/relay/frontend/tensorflow.py | 101 +++++++++++------- .../frontend/tensorflow/test_forward.py | 73 +++++++++++++ 2 files changed, 137 insertions(+), 37 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index b357a2fbff30..43e770c301d2 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -532,6 +532,18 @@ def _impl(inputs, attr, params): return _op.concatenate(inputs_reshaped, axis) return _impl +def _tile(): + def _impl(inputs, attr, params): + reps = params[inputs.pop().name_hint].asnumpy() + new_input = [] + new_input.append(inputs.pop(0)) + + return AttrCvt( + op_name='tile', + extras={'reps': tuple(reps)}, + ignores=['Tmultiples'])(new_input, attr) + return _impl + def _slice(): def _impl(inputs, attr, params): begin = params.pop(_get_name_hint(inputs[1])).asnumpy().tolist() @@ -851,6 +863,15 @@ def _impl(inputs, attr, params): return AttrCvt(op_name="where")(inputs, attr) return _impl +def _reverse_v2(): + def _impl(inputs, attr, params): + axis = params.pop(inputs[1].name_hint).asnumpy()[0] + return AttrCvt( + op_name="reverse", + ignores=['Tidx'], + extras={'axis': int(axis)})([inputs[0]], attr) + return _impl + def _rank(): def _impl(inputs, attr, params): input_shape = attr['_input_shapes'][inputs[0]] @@ -1078,6 +1099,7 @@ def _impl(inputs, attr, params): # for 1 to N mapping(composed), use custom callable functions # for N to 1 mapping, currently not supported(?) 
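A recurring move in the converters above (`_tile`, `_reverse_v2`): TensorFlow feeds values such as `multiples` or `axis` in as tensor operands, while the corresponding Relay ops want them as static attributes, so the converter pops the constant operand off `inputs` and reads its frozen value out of `params`. The same move in isolation, with hypothetical names and values (the real code keys `params` by the popped input's `name_hint`):

    import numpy as np

    inputs = ['data', 'multiples']            # stand-ins for the TF operand list
    params = {'multiples': np.array([2, 3])}  # frozen constant, keyed by tensor name
    reps = params[inputs.pop()]               # consume the constant operand
    assert inputs == ['data']                 # only the real data input remains
    assert tuple(reps) == (2, 3)              # its value becomes the 'reps' attribute
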
_convert_map = { + 'Add' : _elemwise('add'), 'ArgMax' : _argx(_op.argmax, 'argmax'), 'ArgMin' : _argx(_op.argmin, 'argmin'), 'AvgPool' : _pooling('avg_pool'), @@ -1090,60 +1112,65 @@ def _impl(inputs, attr, params): 'ConcatV2' : _concatV2(), 'Conv2D' : _conv('conv'), 'DecodeJpeg' : _decode_image(), + 'DepthwiseConv2dNative' : _conv('depthwise'), + 'Equal' : _broadcast('equal'), 'Elu' : _elu(), + 'Exp' : AttrCvt('exp'), 'ExpandDims' : _expand_dims(), + 'Fill' : _fill(), 'Floor' : AttrCvt('floor'), + 'FusedBatchNorm' : _fused_batch_norm(), + 'FusedBatchNormV2' : _fused_batch_norm(), + 'Gather' : _gather(), + 'GatherV2' : _gather(), + 'Greater' : _broadcast('greater'), + 'GreaterEqual' : _broadcast('greater_equal'), 'Identity' : _identity(), + 'LeakyRelu' : AttrCvt('leaky_relu'), + 'Less' : _broadcast('less'), + 'LessEqual' : _broadcast('less_equal'), + 'LogicalAnd' : _logical('logical_and'), + 'LogicalOr' : _logical('logical_or'), + 'LogicalNot' : _logical('logical_not'), + 'LRN' : _lrn(), 'MatMul' : _matmul(), 'MaxPool' : _pooling('max_pool'), - 'Add' : _elemwise('add'), - 'Sub' : _elemwise('subtract'), - 'Mul' : _elemwise('multiply'), - 'RealDiv' : _elemwise('div'), 'Maximum' : _elemwise('maximum'), + 'Mean' : _mean(), 'Minimum' : _elemwise('minimum'), - 'Sum' : _sum(), - 'Square' : _square(), + 'Mul' : _elemwise('multiply'), + 'NotEqual' : _broadcast('not_equal'), 'Pack' : _pack(), - 'Slice' : _slice(), - 'LeakyRelu' : AttrCvt('leaky_relu'), + 'Pad' : _pad('Pad'), + 'PadV2' : _pad('PadV2'), + 'Pow' : _elemwise('power'), + 'Range' : _range(), + 'Rank' : _rank(), + 'RealDiv' : _elemwise('div'), 'Relu' : AttrCvt('relu'), + 'Relu6' : _relu6(), 'Reshape' : _reshape(), 'ResizeBilinear' : _resize_bilinear(), - 'Selu' : _selu(), - 'Softmax' : _softmax(), + 'ReverseV2' : _reverse_v2(), + 'Round' : AttrCvt('round'), 'Rsqrt' : _rsqrt(), - 'Squeeze' : _squeeze(), - 'FusedBatchNorm' : _fused_batch_norm(), - 'FusedBatchNormV2' : _fused_batch_norm(), - 'Relu6' : _relu6(), - 'DepthwiseConv2dNative' : _conv('depthwise'), + 'Select' : _where(), + 'Selu' : _selu(), 'Shape' : _shape(), 'Sigmoid' : AttrCvt('sigmoid'), - 'Select' : _where(), - 'Fill' : _fill(), - 'GatherV2' : _gather(), - 'Gather' : _gather(), - 'StridedSlice' : _stridedSlice(), - 'LRN' : _lrn(), - 'Pad' : _pad('Pad'), - 'PadV2' : _pad('PadV2'), - 'Range' : _range(), - 'Rank' : _rank(), - 'Transpose' : _transpose(), - 'Tanh' : AttrCvt('tanh'), - 'Mean' : _mean(), - 'LogicalAnd' : _logical('logical_and'), - 'LogicalOr' : _logical('logical_or'), - 'LogicalNot' : _logical('logical_not'), - 'Less' : _broadcast('less'), - 'Greater' : _broadcast('greater'), - 'LessEqual' : _broadcast('less_equal'), - 'GreaterEqual' : _broadcast('greater_equal'), - 'Equal' : _broadcast('equal'), - 'NotEqual' : _broadcast('not_equal'), + 'Sign' : AttrCvt('sign'), + 'Slice' : _slice(), + 'Softmax' : _softmax(), 'Split' : _split(False), 'SplitV' : _split(True), + 'Square' : _square(), + 'Squeeze' : _squeeze(), + 'StridedSlice' : _stridedSlice(), + 'Sub' : _elemwise('subtract'), + 'Sum' : _sum(), + 'Tanh' : AttrCvt('tanh'), + 'Tile' : _tile(), + 'Transpose' : _transpose(), 'Unpack' : _unpack(), 'SpaceToBatchND' : _space_to_batch_nd(), 'BatchToSpaceND' : _batch_to_space_nd(), diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 7e7c1510c60b..6894c5d46210 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -762,6 +762,24 @@ def 
test_forward_unstack(): _test_unstack((3, 6, 4), -2, 'float32') +####################################################################### +# Tile +# ---- + +def _test_tile(in_shape, multiples, dtype): + np_data = np.random.uniform(-5, 5, size=in_shape).astype(dtype) + tf.reset_default_graph() + in_data = tf.placeholder(dtype, in_shape, name="in_data") + tf.tile(in_data, multiples=multiples, name="tile") + compare_tf_with_tvm([np_data], ['in_data:0'], 'tile:0') + +def test_forward_tile(): + '''test Tile''' + _test_tile((2, ), (3, ), "int32") + _test_tile((2, 2), (2, 3), "float32") + _test_tile((2, 4, 6), (6, 7, 8), "float64") + + ####################################################################### # Multi Input to graph # -------------------- @@ -1353,6 +1371,53 @@ def test_forward_tanh(): tf.nn.tanh(in1) compare_tf_with_tvm(inp_array, 'Placeholder:0', 'Tanh:0') +####################################################################### +# Tensor +# ------ + +def test_forward_round(): + """test Round""" + np_data = np.random.uniform(-10, 10, size=(5, 7)).astype(np.float32) + tf.reset_default_graph() + in_data = tf.placeholder(tf.float32, (5, 7), name="in_data") + tf.round(in_data, name="round") + compare_tf_with_tvm([np_data], ['in_data:0'], 'round:0') + +def _test_forward_reverse_v2(in_shape, axis, dtype): + np_data = np.random.uniform(-10, 10, size=in_shape).astype(dtype) + tf.reset_default_graph() + in_data = tf.placeholder(dtype, in_shape, name="in_data") + tf.reverse(in_data, axis=[axis], name="reverse") + compare_tf_with_tvm([np_data], ['in_data:0'], 'reverse:0') + +def test_forward_reverse_v2(): + """test ReverseV2""" + _test_forward_reverse_v2((2, 3), 0, "int32") + _test_forward_reverse_v2((2, 3, 5), 2, "float32") + _test_forward_reverse_v2((2, 3, 5, 7), 1, "float32") + _test_forward_reverse_v2((2, 3, 5), -1, "float64") + _test_forward_reverse_v2((2, 3, 5), -3, "float64") + +def test_forward_sign(): + """test Sign""" + np_data = np.random.uniform(-10, 10, size=(5, 7, 11)).astype(np.float32) + tf.reset_default_graph() + in_data = tf.placeholder(tf.float32, (5, 7, 11), name="in_data") + tf.sign(in_data, name="sign") + compare_tf_with_tvm([np_data], ['in_data:0'], 'sign:0') + +def test_forward_pow_exp(): + """test Pow""" + np_in1 = np.random.uniform(-10, 10, size=(5, 7, 11)).astype(np.float32) + np_in2 = np.random.uniform(-10, 10, size=(5, 7, 11)).astype(np.float32) + tf.reset_default_graph() + in1 = tf.placeholder(tf.float32, (5, 7, 11), name="in1") + in2 = tf.placeholder(tf.float32, (5, 7, 11), name="in2") + out1 = tf.pow(in1, in2, name="pow") + out = tf.exp(out1, name='exp') + compare_tf_with_tvm([np_in1, np_in2], ['in1:0', 'in2:0'], 'pow:0') + compare_tf_with_tvm([np_in1, np_in2], ['in1:0', 'in2:0'], 'exp:0') + ####################################################################### # Mean # ---- @@ -1394,6 +1459,7 @@ def test_forward_rel_ops(): # Main # ---- if __name__ == '__main__': + # Transforms test_forward_transpose() test_forward_reshape() @@ -1407,6 +1473,7 @@ def test_forward_rel_ops(): test_forward_stridedslice() test_forward_split() test_forward_unstack() + test_forward_tile() # Activations test_forward_sigmoid() @@ -1416,6 +1483,12 @@ def test_forward_rel_ops(): test_forward_selu() test_forward_tanh() + # Tensor + test_forward_round() + test_forward_reverse_v2() + test_forward_pow_exp() + test_forward_sign() + # Reductions test_forward_argminmax() test_forward_reduce() From ccd4160f5c11ef579e324d6c4671ac4c94c2fb69 Mon Sep 17 00:00:00 2001 From: lixiaoquan Date: Sat, 20 
Apr 2019 00:19:22 +0800 Subject: [PATCH 030/106] [RELAY] Avoid unnecessarily reconstructing FunctionNode. (#3047) --- src/relay/ir/expr_functor.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc index 7a6250cd6580..e09d79082227 100644 --- a/src/relay/ir/expr_functor.cc +++ b/src/relay/ir/expr_functor.cc @@ -83,27 +83,27 @@ Expr ExprMutator::VisitExpr_(const TupleNode* op) { Expr ExprMutator::VisitExpr_(const FunctionNode* op) { tvm::Array ty_params; - bool all_ty_params_changed = true; + bool all_ty_params_unchanged = true; for (auto ty_param : op->type_params) { TypeVar new_ty_param = Downcast(VisitType(ty_param)); ty_params.push_back(new_ty_param); - all_ty_params_changed &= new_ty_param.same_as(ty_param); + all_ty_params_unchanged &= new_ty_param.same_as(ty_param); } tvm::Array params; - bool all_params_changed = true; + bool all_params_unchanged = true; for (auto param : op->params) { Var new_param = Downcast(this->Mutate(param)); params.push_back(new_param); - all_params_changed &= param.same_as(new_param); + all_params_unchanged &= param.same_as(new_param); } auto ret_type = this->VisitType(op->ret_type); auto body = this->Mutate(op->body); - if (ty_params.same_as(op->type_params) && - params.same_as(op->params) && + if (all_ty_params_unchanged && + all_params_unchanged && ret_type.same_as(op->ret_type) && body.same_as(op->body)) { return GetRef(op); From 34600642226f47c0d74868a547aefc45d831d1eb Mon Sep 17 00:00:00 2001 From: masahi Date: Sat, 20 Apr 2019 01:20:19 +0900 Subject: [PATCH 031/106] fix PostOrderVisit signature (#3048) --- include/tvm/relay/expr_functor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h index 3b179f8e5330..27c0aa08ea61 100644 --- a/include/tvm/relay/expr_functor.h +++ b/include/tvm/relay/expr_functor.h @@ -235,7 +235,7 @@ class ExprMutator * \param node The ir to be visited. * \param fvisit The visitor function to be applied. */ -void PostOrderVisit(const NodeRef& node, std::function fvisit); +void PostOrderVisit(const Expr& node, std::function fvisit); /* * \brief Bind function parameters or free variables. 
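The rewrite in patch 030 restores the standard copy-on-write discipline for AST mutators: rebuild a node only when at least one child actually changed, and otherwise return the original node so pointer identity and sharing are preserved. The old condition compared the freshly built `ty_params`/`params` arrays to the originals with `same_as`, a pointer comparison that in practice never matches, so every visited FunctionNode was reconstructed. A minimal sketch of the rule on a generic tree (plain Python, not the TVM API):

    class Node:
        def __init__(self, children=()):
            self.children = list(children)

    def mutate(node, rewrite_leaf):
        if not node.children:                   # leaf: delegate to the rewriter
            return rewrite_leaf(node)
        new_children = [mutate(c, rewrite_leaf) for c in node.children]
        all_unchanged = all(n is o for n, o in zip(new_children, node.children))
        if all_unchanged:
            return node                         # fast path: reuse the original node
        return Node(new_children)               # rebuild only on a real change

Running `mutate(root, lambda leaf: leaf)` returns `root` itself; that is exactly the fast path the renamed `all_ty_params_unchanged`/`all_params_unchanged` flags re-enable.
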
From 1f537a61000fffa5596e5baa18081ea7d69b113b Mon Sep 17 00:00:00 2001 From: hlu1 <14827759+hlu1@users.noreply.github.com> Date: Fri, 19 Apr 2019 17:50:34 -0700 Subject: [PATCH 032/106] [Bugfix] Fix winograd nnpack fp16 (#3046) --- topi/python/topi/arm_cpu/conv2d.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py index 463214fa5bfd..f84a1eec0815 100644 --- a/topi/python/topi/arm_cpu/conv2d.py +++ b/topi/python/topi/arm_cpu/conv2d.py @@ -569,7 +569,7 @@ def conv2d_arm_cpu_winograd_nnpack( assert N == 1 with tvm.tag_scope("winograd_nnpack_conv2d_weight_transform"): transformed_kernel = tvm.contrib.nnpack.convolution_inference_weight_transform( - kernel, algorithm=tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8) + kernel, algorithm=cfg['winograd_nnpack_algorithm'].val) if autotvm.GLOBAL_SCOPE.in_tuning: transformed_kernel = tvm.compute(transformed_kernel.shape, lambda *args: 0.0) @@ -653,7 +653,7 @@ def conv2d_winograd_nnpack_ww(cfg, data, transformed_kernel, bias, strides, bias=bias, padding=[HPAD, HPAD, WPAD, WPAD], stride=[HSTR, WSTR], - algorithm=tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8) + algorithm=cfg['winograd_nnpack_algorithm'].val) # we have to manually assign effective GFLOP for winograd cfg.add_flop(2 * N * CI * H * W * KH * KW * CO) From 5b903ea87b74466284662840680baebe100819fb Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 21 Apr 2019 07:20:03 +0800 Subject: [PATCH 033/106] [TOPI] Rename output tensors for better readability (#3006) --- topi/include/topi/broadcast.h | 54 +++++++++++++++++----------------- topi/include/topi/elemwise.h | 20 ++++++------- topi/include/topi/nn.h | 18 ++++++------ topi/include/topi/nn/pooling.h | 4 +-- topi/include/topi/transform.h | 44 +++++++++++++-------------- topi/python/topi/nn/dense.py | 2 +- topi/python/topi/nn/softmax.py | 8 +++-- 7 files changed, 76 insertions(+), 74 deletions(-) diff --git a/topi/include/topi/broadcast.h b/topi/include/topi/broadcast.h index c1675caf9e5b..1bbb6e1587ce 100644 --- a/topi/include/topi/broadcast.h +++ b/topi/include/topi/broadcast.h @@ -46,7 +46,7 @@ namespace topi { */ inline tvm::Tensor broadcast_to(const tvm::Tensor& t, const tvm::Array& output_shape, - std::string name = "tensor", + std::string name = "T_broadcast_to", std::string tag = kBroadcast) { CHECK_GE(output_shape.size(), t->shape.size()) << "Not a broadcast, output dimensionality smaller than input.\noutput: " @@ -66,35 +66,35 @@ inline tvm::Tensor broadcast_to(const tvm::Tensor& t, tag); } -#define TOPI_DEFINE_BCAST_OP(Name, ComputeRule) \ - inline tvm::Expr Name(const tvm::Expr& a, \ - const tvm::Expr& b) { \ - ComputeRule; \ - } \ - inline tvm::Tensor Name(const tvm::Tensor& A, \ - const tvm::Tensor& B, \ - std::string name = "tensor", \ - std::string tag = kBroadcast) { \ - auto l = [](tvm::Expr a, tvm::Expr b) { ComputeRule; }; \ - return detail::WithBroadcast(l, A, B, name, tag); \ - } \ - inline tvm::Tensor Name(const tvm::Tensor& A, \ - const tvm::Expr& B, \ - std::string name = "tensor", \ - std::string tag = kElementWise) { \ +#define TOPI_DEFINE_BCAST_OP(Name, ComputeRule) \ + inline tvm::Expr Name(const tvm::Expr& a, \ + const tvm::Expr& b) { \ + ComputeRule; \ + } \ + inline tvm::Tensor Name(const tvm::Tensor& A, \ + const tvm::Tensor& B, \ + std::string name = "T_" #Name, \ + std::string tag = kBroadcast) { \ + auto l = [](tvm::Expr a, tvm::Expr b) { ComputeRule; }; \ + return detail::WithBroadcast(l, A, B, name, tag); \ + } \ + 
inline tvm::Tensor Name(const tvm::Tensor& A, \ + const tvm::Expr& B, \ + std::string name = "T_" #Name, \ + std::string tag = kElementWise) { \ auto l = [](tvm::Expr a, tvm::Expr b) { ComputeRule; }; \ return compute(A->shape, [&](const ::tvm::Array<::tvm::Var>& i) { \ - return l(A(i), B); \ - }, name, tag); \ - } \ - inline tvm::Tensor Name(const tvm::Expr& A, \ - const tvm::Tensor& B, \ - std::string name = "tensor", \ - std::string tag = kElementWise) { \ - auto l = [&](tvm::Expr a, tvm::Expr b) { ComputeRule; }; \ + return l(A(i), B); \ + }, name, tag); \ + } \ + inline tvm::Tensor Name(const tvm::Expr& A, \ + const tvm::Tensor& B, \ + std::string name = "T_" #Name, \ + std::string tag = kElementWise) { \ + auto l = [&](tvm::Expr a, tvm::Expr b) { ComputeRule; }; \ return compute(B->shape, [&](const ::tvm::Array<::tvm::Var>& i) { \ - return l(A, B(i)); \ - }, name, tag); \ + return l(A, B(i)); \ + }, name, tag); \ } diff --git a/topi/include/topi/elemwise.h b/topi/include/topi/elemwise.h index 9a32e34eda73..5b6a96dd44ae 100644 --- a/topi/include/topi/elemwise.h +++ b/topi/include/topi/elemwise.h @@ -38,7 +38,7 @@ using namespace tvm; // Unary intrinsic operators #define TOPI_DECLARE_UNARY_OP(OpName) \ inline Tensor OpName(const Tensor& x, \ - std::string name = "tensor", \ + std::string name = "T_" #OpName, \ std::string tag = kElementWise) { \ return compute(x->shape, [&](const Array& i) { \ return ::tvm::OpName(x(i)); \ @@ -66,7 +66,7 @@ TOPI_DECLARE_UNARY_OP(abs); * \return A Tensor whose op member is the identity operation */ inline Tensor identity(const Tensor& x, - std::string name = "tensor", + std::string name = "T_identity", std::string tag = kElementWise) { return compute(x->shape, [&](const Array& i) { return x(i); @@ -83,7 +83,7 @@ inline Tensor identity(const Tensor& x, * \return A Tensor whose op member is the negation operation */ inline Tensor negative(const Tensor& x, - std::string name = "tensor", + std::string name = "T_negative", std::string tag = kElementWise) { return compute(x->shape, [&](const Array& i) { return -x(i); @@ -100,7 +100,7 @@ inline Tensor negative(const Tensor& x, * \return A Tensor whose op member is the logical NOT operation */ inline Tensor logical_not(const Tensor& x, - std::string name = "tensor", + std::string name = "T_logical_not", std::string tag = kElementWise) { return compute(x->shape, [&](const Array& i) { return !x(i); @@ -117,7 +117,7 @@ inline Tensor logical_not(const Tensor& x, * \return A Tensor whose op member is the sign */ inline Tensor sign(const Tensor& x, - std::string name = "tensor", + std::string name = "T_sign", std::string tag = kElementWise) { return compute(x->shape, [&](const Array& i) { Expr zero = make_zero(x->dtype); @@ -144,7 +144,7 @@ inline Tensor sign(const Tensor& x, inline Tensor clip(const Tensor& x, const Expr& a_min, const Expr& a_max, - std::string name = "tensor", + std::string name = "T_clip", std::string tag = kElementWise) { return compute(x->shape, [&](const Array& i) { auto min_val = tvm::cast(x->dtype, a_min); @@ -167,7 +167,7 @@ inline Tensor clip(const Tensor& x, */ inline Tensor cast(const Tensor& x, Type type, - std::string name = "tensor", + std::string name = "T_cast", std::string tag = kElementWise) { return compute(x->shape, [&](const Array& i) { auto expr = x(i); @@ -193,7 +193,7 @@ inline Tensor cast(const Tensor& x, * \return A Tensor whose op member is the sum operation */ inline Tensor elemwise_sum(const Array& xs, - std::string name = "tensor", + std::string name = "T_elemwise_sum", 
std::string tag = kElementWise) { CHECK_GT(xs.size(), 0) << "elemwise sum must have at least one input tensor."; return compute(xs[0]->shape, [&](const Array& i) { @@ -219,7 +219,7 @@ inline Tensor elemwise_sum(const Array& xs, inline Tensor full(const Array& shape, Type dtype, const Expr fill_value, - std::string name = "tensor", + std::string name = "T_full", std::string tag = kElementWise) { Expr ev = cast(dtype, fill_value); if (!ev.defined()) { @@ -243,7 +243,7 @@ inline Tensor full(const Array& shape, */ inline Tensor full_like(const Tensor& x, const Expr fill_value, - std::string name = "tensor", + std::string name = "T_full_like", std::string tag = kElementWise) { Expr ev = cast(x->dtype, fill_value); return compute(x->shape, [&](const Array& i) { diff --git a/topi/include/topi/nn.h b/topi/include/topi/nn.h index 10f5d4992f6a..dbeed4ac9436 100644 --- a/topi/include/topi/nn.h +++ b/topi/include/topi/nn.h @@ -63,7 +63,7 @@ tvm::Expr Map(const tvm::Array& exprs, T op) { template inline tvm::Tensor relu(const tvm::Tensor& t, T threshold = static_cast(0), - std::string name = "tensor", + std::string name = "T_relu", std::string tag = kElementWise) { return tvm::compute( t->shape, @@ -87,7 +87,7 @@ inline tvm::Tensor relu(const tvm::Tensor& t, */ inline tvm::Tensor leaky_relu(const tvm::Tensor& t, double alpha = 0.1, - std::string name = "tensor", + std::string name = "T_leaky_relu", std::string tag = kElementWise) { return tvm::compute( t->shape, @@ -114,7 +114,7 @@ inline tvm::Tensor leaky_relu(const tvm::Tensor& t, inline tvm::Tensor prelu(const tvm::Tensor &x, const tvm::Tensor &slope, const int axis = 1, - std::string name = "tensor", + std::string name = "T_prelu", std::string tag = kBroadcast) { CHECK((size_t)axis < x->shape.size()) << "Wrong axis (" << axis << ")value. 
"; @@ -171,7 +171,7 @@ inline tvm::Tensor pad(const tvm::Tensor& t, const tvm::Array& pad_before, tvm::Array pad_after = tvm::Array(), Expr pad_value = Expr(), - std::string name = "tensor", + std::string name = "T_pad", std::string tag = kElementWise) { if (pad_after.size() < pad_before.size()) { for (size_t i = pad_after.size(); i < pad_before.size(); ++i) { @@ -247,7 +247,7 @@ inline tvm::Tensor conv2d_nchw(const tvm::Tensor& I, int pad_w = 0, int stride_h = 1, int stride_w = 1, - std::string name = "tensor", + std::string name = "T_conv2d_nchw", std::string tag = kConv2dNCHW) { CHECK_EQ(4, I->shape.size()); CHECK_EQ(4, W->shape.size()); @@ -298,7 +298,7 @@ inline tvm::Tensor conv2d_hwcn(const tvm::Tensor& I, int pad_w = 0, int stride_h = 1, int stride_w = 1, - std::string name = "tensor", + std::string name = "T_conv2d_hwcn", std::string tag = kConv2dHWCN) { CHECK_EQ(4, I->shape.size()); CHECK_EQ(4, W->shape.size()); @@ -349,7 +349,7 @@ inline tvm::Tensor depthwise_conv2d_nchw(const tvm::Tensor& I, int pad_w = 0, int stride_h = 1, int stride_w = 1, - std::string name = "tensor", + std::string name = "T_depthwise_conv2d_nchw", std::string tag = kDepthwiseConv2dNCHW) { CHECK_EQ(4, I->shape.size()); CHECK_EQ(4, W->shape.size()); @@ -382,7 +382,7 @@ inline tvm::Tensor depthwise_conv2d_nhwc(const tvm::Tensor& I, int pad_w = 0, int stride_h = 1, int stride_w = 1, - std::string name = "tensor", + std::string name = "T_depthwise_conv2d_nhwc", std::string tag = kDepthwiseConv2dNHWC) { CHECK_EQ(4, I->shape.size()); CHECK_EQ(4, W->shape.size()); @@ -435,7 +435,7 @@ inline tvm::Tensor group_conv2d_ngchw(const tvm::Tensor& I, int pad_w = 0, int stride_h = 1, int stride_w = 1, - std::string name = "tensor", + std::string name = "T_group_conv2d_ngchw", std::string tag = kGroupConv2d) { CHECK_EQ(5, I->shape.size()); CHECK_EQ(5, W->shape.size()); diff --git a/topi/include/topi/nn/pooling.h b/topi/include/topi/nn/pooling.h index 86d5aff9ec35..8648c01fdb3a 100644 --- a/topi/include/topi/nn/pooling.h +++ b/topi/include/topi/nn/pooling.h @@ -272,8 +272,8 @@ inline Tensor global_pool(const Tensor& x, auto height = x->shape[height_axis]; auto width = x->shape[width_axis]; - auto dheight = tvm::reduce_axis(Range(0, height)); - auto dwidth = tvm::reduce_axis(Range(0, width)); + auto dheight = tvm::reduce_axis(Range(0, height), "rv1"); + auto dwidth = tvm::reduce_axis(Range(0, width), "rv2"); if (pool_type == kMaxPool) { return tvm::compute(out_shape, diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index abb8455d45b5..a658ba3cf995 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -57,7 +57,7 @@ using namespace topi::detail; inline Tensor expand_dims(const Tensor& x, int axis, int num_newaxis = 1, - std::string name = "tensor", + std::string name = "T_expand_dims", std::string tag = kBroadcast) { int ndim = static_cast(x->shape.size()); CHECK(-ndim - 1 <= axis && axis <= ndim) @@ -108,7 +108,7 @@ inline Tensor expand_dims(const Tensor& x, */ inline Tensor transpose(const Tensor& x, Array axes, - std::string name = "tensor", + std::string name = "T_transpose", std::string tag = kInjective) { if (!axes.defined() || axes.size() == 0) { axes = Array(); @@ -164,7 +164,7 @@ inline Tensor transpose(const Tensor& x, */ inline Tensor flip(const Tensor& x, int axis = 0, - std::string name = "tensor", + std::string name = "T_flip", std::string tag = kInjective) { size_t src_tensor_dim = x->shape.size(); int axis_inp = axis; @@ -204,7 +204,7 @@ inline Tensor 
flip(const Tensor& x, */ inline Tensor reshape(const Tensor& x, Array newshape, - std::string name = "tensor", + std::string name = "T_reshape", std::string tag = kInjective) { auto x_shape = x->shape; return compute( @@ -229,7 +229,7 @@ inline Tensor reshape(const Tensor& x, inline Tensor squeeze(const Tensor& x, Array axis, bool atleast1d = false, - std::string name = "tensor", + std::string name = "T_squeeze", std::string tag = kInjective) { auto ndim = x->shape.size(); std::vector axis_val; @@ -291,7 +291,7 @@ inline Tensor squeeze(const Tensor& x, */ inline Tensor concatenate(const Array& inputs, int axis = 0, - std::string name = "tensor", + std::string name = "T_concat", std::string tag = kInjective) { int ndim = static_cast(inputs[0]->shape.size()); CHECK(-ndim <= axis && axis < ndim) @@ -355,7 +355,7 @@ inline Tensor concatenate(const Array& inputs, */ inline Tensor stack(const Array& inputs, int axis = 0, - std::string name = "tensor", + std::string name = "T_stack", std::string tag = kInjective) { int ndim = static_cast(inputs[0]->shape.size()); CHECK(-ndim - 1 <= axis && axis <= ndim) @@ -408,7 +408,7 @@ inline Tensor stack(const Array& inputs, inline Array split(const Tensor& x, Array split_indices, int axis, - std::string name = "tensor", + std::string name = "T_split", std::string tag = kInjective) { if (axis < 0) { axis += static_cast(x->shape.size()); @@ -486,7 +486,7 @@ inline Tensor strided_slice(const Tensor& x, const Array& begin, const Array& end, const Array& strides, - std::string name = "tensor", + std::string name = "T_strided_slice", std::string tag = kInjective) { size_t src_tensor_dim = static_cast(x->shape.size()); // Setup the ranges. @@ -585,7 +585,7 @@ inline Tensor strided_slice(const Tensor& x, inline Array split_sections(const Tensor& x, int num_sections, int axis, - std::string name = "tensor", + std::string name = "T_split_sections", std::string tag = kInjective) { if (axis < 0) { axis += static_cast(x->shape.size()); @@ -624,7 +624,7 @@ inline Array split_sections(const Tensor& x, inline Tensor take(const Tensor& a, const Tensor& indices, std::string mode = "clip", - std::string name = "tensor", + std::string name = "T_take", std::string tag = kInjective) { Array a_shape = a->shape; Array out_shape = indices->shape; @@ -664,7 +664,7 @@ inline Tensor take(const Tensor& a, const Tensor& indices, int axis, std::string mode = "clip", - std::string name = "tensor", + std::string name = "T_take", std::string tag = kInjective) { if (axis < 0) { axis += static_cast(a->shape.size()); @@ -738,7 +738,7 @@ inline Tensor take(const Tensor& a, inline Tensor where(const Tensor& condition, const Tensor& x, const Tensor& y, - std::string name = "tensor", + std::string name = "T_where", std::string tag = kInjective) { CHECK_EQ(x->shape.size(), y->shape.size()) << "x and y must have the same shape.Got different number of dimension: " @@ -786,7 +786,7 @@ inline Tensor where(const Tensor& condition, inline Tensor repeat(const Tensor& x, int repeats, int axis, - std::string name = "tensor", + std::string name = "T_repeat", std::string tag = kBroadcast) { int ndim = static_cast(x->shape.size()); CHECK(-ndim - 1 <= axis && axis <= ndim) @@ -835,7 +835,7 @@ inline Tensor repeat(const Tensor& x, */ inline Tensor tile(const Tensor& x, Array reps, - std::string name = "tensor", + std::string name = "T_tile", std::string tag = kBroadcast) { size_t ndim = x->shape.size(); size_t rdim = reps.size(); @@ -892,7 +892,7 @@ inline Tensor tile(const Tensor& x, */ inline Tensor 
gather_nd(const Tensor& data, const Tensor& indices, - std::string name = "tensor", + std::string name = "T_gather_nd", std::string tag = kInjective) { size_t ndim_d = data->shape.size(); size_t ndim_i = indices->shape.size(); @@ -953,7 +953,7 @@ inline tvm::Tensor matmul(const tvm::Tensor& A, const tvm::Tensor& B, bool trans_a = false, bool trans_b = false, - std::string name = "tensor", + std::string name = "T_matmul", std::string tag = kMatMul) { tvm::Array output_shape{A->shape[trans_a ? 1 : 0], B->shape[trans_b ? 0 : 1]}; @@ -979,7 +979,7 @@ inline tvm::Tensor matmul(const tvm::Tensor& A, inline Tensor tensordot(const Tensor& A, const tvm::Tensor& B, int axes = 2, - std::string name = "tensor", + std::string name = "T_tensordot", std::string tag = kMatMul) { CHECK_GE(A->shape.size(), axes); CHECK_GE(B->shape.size(), axes); @@ -1035,7 +1035,7 @@ inline Tensor tensordot(const Tensor& A, const tvm::Tensor& B, Array A_axes, Array B_axes, - std::string name = "tensor", + std::string name = "T_tensordot", std::string tag = kMatMul) { CHECK_EQ(A_axes.size(), B_axes.size()); @@ -1084,7 +1084,7 @@ inline Tensor arange(const Expr start, const Expr stop, const Expr step, Type dtype, - std::string name = "tensor", + std::string name = "T_arange", std::string tag = kInjective) { Expr num_elem = tvm::cast(tvm::Int(32), tvm::ceil( tvm::cast(tvm::Float(32), stop - start) / step)); @@ -1106,7 +1106,7 @@ inline Tensor arange(const Expr start, inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, const std::string& dst_layout, - const std::string name = "layout_transform", + const std::string name = "T_layout_trans", const std::string tag = kInjective) { Layout src_layout_struct = LayoutNode::make(src_layout); Layout dst_layout_struct = LayoutNode::make(dst_layout); @@ -1142,7 +1142,7 @@ inline Tensor layout_transform(const Tensor& src, */ inline Tensor shape(const Tensor& src, Type dtype, - const std::string name = "shape", + const std::string name = "T_shape", const std::string tag = kInjective) { int ndim = static_cast(src->shape.size()); Array out_shape{ndim}; diff --git a/topi/python/topi/nn/dense.py b/topi/python/topi/nn/dense.py index 6c4aba1e84ef..f116e7c4a31c 100644 --- a/topi/python/topi/nn/dense.py +++ b/topi/python/topi/nn/dense.py @@ -47,7 +47,7 @@ def dense_default(data, weight, bias=None): k = tvm.reduce_axis((0, in_dim), name='k') matmul = tvm.compute((batch, out_dim), \ lambda i, j: tvm.sum(data[i, k] * weight[j, k], axis=k), \ - tag='dense') + name='T_dense', tag='dense') if bias is not None: matmul = tvm.compute((batch, out_dim), \ lambda i, j: matmul[i, j] + bias[j], \ diff --git a/topi/python/topi/nn/softmax.py b/topi/python/topi/nn/softmax.py index c02fd67f12bb..00bbe55e8d70 100644 --- a/topi/python/topi/nn/softmax.py +++ b/topi/python/topi/nn/softmax.py @@ -61,9 +61,11 @@ def _normalize(max_elem, expsum, *indices): return tvm.exp(x[indices] - max_elem[non_reduce_indices]) / expsum[non_reduce_indices] reduced_shape = tuple([dim for (i, dim) in enumerate(shape) if i != axis]) - max_elem = tvm.compute(reduced_shape, _compute_max) - expsum = tvm.compute(reduced_shape, lambda *indices: _compute_expsum(max_elem, *indices)) - return tvm.compute(shape, lambda *indices: _normalize(max_elem, expsum, *indices)) + max_elem = tvm.compute(reduced_shape, _compute_max, name='T_softmax_maxelem') + expsum = tvm.compute(reduced_shape, lambda *indices: _compute_expsum(max_elem, *indices), + name='T_softmax_expsum') + return tvm.compute(shape, lambda *indices: 
_normalize(max_elem, expsum, *indices), + name='T_softmax_norm') @tvm.tag_scope(tag='log_softmax_output') From 64569a3c8a68cf4feb7ab90267e463fae0c1b0c3 Mon Sep 17 00:00:00 2001 From: Yong Wu <55wuyong@163.com> Date: Sat, 20 Apr 2019 23:59:22 -0700 Subject: [PATCH 034/106] [Frontend][TF] Fix Placeholder issue (#2834) * [Frontend][TF] Fix Placeholder issue * Add test cases --- nnvm/python/nnvm/frontend/tensorflow.py | 28 +++++++----- .../frontend/tensorflow/test_forward.py | 24 +++++++++++ python/tvm/relay/frontend/tensorflow.py | 43 +++++++++---------- .../frontend/tensorflow/test_forward.py | 22 ++++++++++ 4 files changed, 84 insertions(+), 33 deletions(-) diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py index 84649038e82f..2f91cad8143a 100644 --- a/nnvm/python/nnvm/frontend/tensorflow.py +++ b/nnvm/python/nnvm/frontend/tensorflow.py @@ -127,7 +127,7 @@ def _impl(inputs, attr, params): def _elemwise(name): def _impl(inputs, attr, *args): - assert len(inputs) == 2, "Math op take 2 inputs, {} given".format(len(inputs)) + assert len(inputs) == 2, "{} take 2 inputs, {} given".format(name, len(inputs)) op_name = _math_name_picker(name)(attr) return get_nnvm_op(op_name)(*inputs) return _impl @@ -1237,16 +1237,24 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): for node in graph.node: if node.op == 'Placeholder' or node.op == 'PlaceholderWithDefault': + # Give priority to user argument. if shape and node.name in shape: self._input_shapes[node.name] = list(shape[node.name]) - continue - self._input_shapes[node.name] = \ - tensor_util.TensorShapeProtoToList(node.attr['shape'].shape) - for idx, dim in enumerate(self._input_shapes[node.name]): - if dim < 0: - self._input_shapes[node.name][idx] = 1 - warnings.warn("Use 1 instead of -1 in shape of operator %s." - % node.name) + else: + self._input_shapes[node.name] = \ + tensor_util.TensorShapeProtoToList(node.attr['shape'].shape) + for idx, dim in enumerate(self._input_shapes[node.name]): + if dim < 0: + self._input_shapes[node.name][idx] = 1 + warnings.warn("Use 1 instead of -1 in shape of operator %s." 
+ % node.name) + + self._nodes[node.name] = _sym.Variable(name=node.name, + shape=self._input_shapes[node.name]) + self._output_shapes[node.name] = [self._input_shapes[node.name]] + self._outputs_are_0d[node.name] = [ \ + not tshape if isinstance(tshape, list) else False \ + for tshape in self._output_shapes[node.name]] # Ignore user's input shape for Non placeholder elif node.op == 'Const': @@ -1304,7 +1312,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): attr = self._parse_attr(node.attr) - else: + elif node.op != "Placeholder": # Pass the parsed shapes instead attr["_output_shapes"] = output_shapes = self._output_shapes[node.name] diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py index 54bbe085f66d..4df31ddf7bdd 100644 --- a/nnvm/tests/python/frontend/tensorflow/test_forward.py +++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py @@ -941,6 +941,29 @@ def test_forward_resnetv2(): tvm_output = run_tvm_graph(graph_def, data, 'input_tensor', len(tf_output), target=device) tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tf_output[0]), rtol=1e-5, atol=1e-5) +####################################################################### +# Placeholder +# ----------- +def test_forward_placeholder(): + '''test a simple pb with Placeholder node in the end of GraphDef''' + with tf.Graph().as_default(): + graph_def = tf_testing.get_workload("Custom/placeholder.pb") + + # Call the utility to import the graph definition into default graph. + graph_def = tf_testing.ProcessGraphDefParam(graph_def) + + + data = np.random.uniform(size=(1, 224, 224, 3)).astype('float32') + out_node = 'mul' + + with tf.Session() as sess: + # Add shapes to the graph. + graph_def = tf_testing.AddShapesToGraphDef(sess, out_node) + tf_output = run_tf_graph(sess, data, 'Placeholder:0', out_node + ':0') + tvm_output = run_tvm_graph(graph_def, data, 'Placeholder') + print("tf_output is {}\ntvm_output is {}".format(tf_output, tvm_output)) + tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tf_output[0]), rtol=1e-5, atol=1e-5) + ####################################################################### # PTB # --- @@ -1262,6 +1285,7 @@ def test_forward_rel_ops(): test_forward_inception_v1() test_forward_mobilenet() test_forward_resnetv2() + test_forward_placeholder() test_forward_ptb() # RNN diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 43e770c301d2..38903b055216 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -239,7 +239,7 @@ def _impl(inputs, attr, params): def _elemwise(name): def _impl(inputs, attr, *args): - assert len(inputs) == 2, "Math op take 2 inputs, {} given".format(len(inputs)) + assert len(inputs) == 2, "{} take 2 inputs, {} given".format(name, len(inputs)) return _get_relay_op(name)(*inputs) return _impl @@ -1704,16 +1704,23 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): node_name_prefix = node.name.rsplit('/', 1)[0] control_flow_node_map[node_name_prefix].add(node.op) if node.op == 'Placeholder': + # Give priority to user argument. 
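                    # e.g. shape={'Placeholder': (1, 224, 224, 3)} overrides whatever
                    # shape the GraphDef records; unknown (-1) dims are only clamped
                    # to 1 (with a warning) when no user shape is given.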
if shape and node.name in shape: self._input_shapes[node.name] = list(shape[node.name]) - continue - self._input_shapes[node.name] = \ - tensor_util.TensorShapeProtoToList(node.attr['shape'].shape) - for idx, dim in enumerate(self._input_shapes[node.name]): - if dim < 0: - self._input_shapes[node.name][idx] = 1 - warnings.warn("Use 1 instead of -1 in shape of operator %s." - % node.name) + else: + self._input_shapes[node.name] = \ + tensor_util.TensorShapeProtoToList(node.attr['shape'].shape) + for idx, dim in enumerate(self._input_shapes[node.name]): + if dim < 0: + self._input_shapes[node.name][idx] = 1 + warnings.warn("Use 1 instead of -1 in shape of operator %s." + % node.name) + + self._output_shapes[node.name] = [self._input_shapes[node.name]] + attr = self._parse_attr(node.attr) + self._nodes[node.name] = [_expr.var(node.name, + shape=self._input_shapes[node.name], + dtype=attr['dtype'].name)] # Ignore user's input shape for Non placeholder elif node.op == 'Const': @@ -1736,11 +1743,6 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): # Variable converted to Const will not have only value attr if 'value' in attr and node.op == 'Const': self._output_shapes[node.name] = [self._input_shapes[node.name]] - elif shape and node.name in shape: - # Give priority to user argument. - self._output_shapes[node.name] = [shape[node.name]] - elif node.op == 'Placeholder': - self._output_shapes[node.name] = [self._input_shapes[node.name]] elif '_output_shapes' in attr: self._output_shapes[node.name] = \ [tensor_util.TensorShapeProtoToList(tshape) \ @@ -1755,13 +1757,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): not shape if isinstance(tshape, list) else False \ for tshape in self._output_shapes[node.name]] - if node.op == "Placeholder": - self._output_shapes[node.name] = [self._input_shapes[node.name]] - self._nodes[node.name] = [_expr.var(node.name, - shape=self._input_shapes[node.name], - dtype=attr['dtype'].name)] - - elif node.op == "Const": + if node.op == "Const": # All Const nodes are Param nodes, lets parse self._num_param += 1 for key, value in node.attr.items(): @@ -1772,7 +1768,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): attr = self._parse_attr(node.attr) - else: + elif node.op != "Placeholder": # Pass the parsed shapes instead attr["_output_shapes"] = output_shapes = self._output_shapes[node.name] @@ -1816,7 +1812,8 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): input_shapes[in_sym[0]] = input_shape # This means the node is 1d in Relay and 0d in TF. # See `_expand_dims_0d_aware`. 
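                    # Placeholders are now materialized early (above) and skip the
                    # generic shape pass, so they never enter _outputs_are_0d; the
                    # lookup below therefore needs a membership test before indexing.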
- if self._outputs_are_0d[node_name][tensor_slot] and input_shape: + if node_name in self._outputs_are_0d \ + and self._outputs_are_0d[node_name][tensor_slot] and input_shape: input_0d_mismatch.add(in_sym[0]) attr['_input_shapes'] = input_shapes diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 6894c5d46210..0430b8cbfd27 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -1133,6 +1133,27 @@ def test_forward_resnetv2(): tvm_output = run_tvm_graph(graph_def, data, 'input_tensor', len(tf_output), target=device) tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tf_output[0]), rtol=1e-5, atol=1e-5) +####################################################################### +# Placeholder +# ----------- +def test_forward_placeholder(): + '''test a simple pb with Placeholder node in the end of GraphDef''' + with tf.Graph().as_default(): + graph_def = tf_testing.get_workload("Custom/placeholder.pb") + # Call the utility to import the graph definition into default graph. + graph_def = tf_testing.ProcessGraphDefParam(graph_def) + + data = np.random.uniform(size=(1, 224, 224, 3)).astype('float32') + out_node = 'mul' + + with tf.Session() as sess: + # Add shapes to the graph. + graph_def = tf_testing.AddShapesToGraphDef(sess, out_node) + tf_output = run_tf_graph(sess, data, 'Placeholder:0', out_node + ':0') + tvm_output = run_tvm_graph(graph_def, data, 'Placeholder') + print("tf_output is {}\ntvm_output is {}".format(tf_output, tvm_output)) + tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tf_output[0]), rtol=1e-5, atol=1e-5) + ####################################################################### # PTB # --- @@ -1514,6 +1535,7 @@ def test_forward_rel_ops(): test_forward_inception_v1() test_forward_mobilenet() test_forward_resnetv2() + test_forward_placeholder() test_forward_ptb() # RNN From 55b014dc23506bc014ebd5a0992b6804b6bb4a1d Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 22 Apr 2019 08:17:50 +0800 Subject: [PATCH 035/106] Fix code comment and typos. (#3063) --- include/tvm/arithmetic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/tvm/arithmetic.h b/include/tvm/arithmetic.h index fe3dca483acb..9a8d9d372956 100644 --- a/include/tvm/arithmetic.h +++ b/include/tvm/arithmetic.h @@ -327,9 +327,9 @@ class Analyzer { ConstIntBoundAnalyzer const_int_bound; /*! \brief sub-analyzer: modular set */ ModularSetAnalyzer modular_set; - /*! \brief sub-analyzer rewrite simplfy */ + /*! \brief sub-analyzer rewrite simplify */ RewriteSimplifier rewrite_simplify; - /*! \brief sub-analyzer rewrite simplfy */ + /*! \brief sub-analyzer canonical simplify */ CanonicalSimplifier canonical_simplify; /*! 
\brief constructor */ Analyzer(); From dde0f8b49d49730bcf1750954de646b6f5cc5e0e Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Mon, 22 Apr 2019 16:21:47 -0700 Subject: [PATCH 036/106] [Relay] fix target string (#3071) --- src/relay/backend/graph_runtime_codegen.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/relay/backend/graph_runtime_codegen.cc b/src/relay/backend/graph_runtime_codegen.cc index beb13032ce55..7f16891da8a7 100644 --- a/src/relay/backend/graph_runtime_codegen.cc +++ b/src/relay/backend/graph_runtime_codegen.cc @@ -429,11 +429,11 @@ class GraphRuntimeCodegen } CCacheKey key = (*pf0)(func, target); CachedFunc lowerd_func = (*pf1)(compile_engine_, key); - if (!lowered_funcs_.count(target->target_name)) { - lowered_funcs_[target->target_name] = {}; + if (!lowered_funcs_.count(target->str())) { + lowered_funcs_[target->str()] = {}; } for (auto f : lowerd_func->funcs) { - lowered_funcs_[target->target_name].insert(f); + lowered_funcs_[target->str()].insert(f); } std::vector inputs; From 3759281a731611130cdf153be49761f0861b710e Mon Sep 17 00:00:00 2001 From: Gemfield Date: Tue, 23 Apr 2019 11:50:55 +0800 Subject: [PATCH 037/106] Enhance upsample operator to adapt onnx opset version 9 for nnvm comp… (#2968) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Enhance upsample operator to adapt onnx opset version 9 for nnvm compiler * Add upsample test case for newer opset in nnvm * re-trigger the CI --- nnvm/python/nnvm/frontend/onnx.py | 8 ++++- .../python/frontend/onnx/test_forward.py | 32 +++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py index c3c4768a97db..eb78b7845c23 100644 --- a/nnvm/python/nnvm/frontend/onnx.py +++ b/nnvm/python/nnvm/frontend/onnx.py @@ -404,8 +404,14 @@ class Upsample(OnnxOpConverter): """ @classmethod - def _impl_v7(cls, inputs, attr, params): + def _impl_v9(cls, inputs, attr, params): scales = attr.get('scales') + if not scales: + # Here we are dealing with a higher opset version.
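+                # (In opset 9, 'scales' moved from an attribute to a constant
+                # second input, so it is read from params and the extra input
+                # is dropped before the usual handling resumes.)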
+ assert len(inputs) == 2, "Upsample op take 2 inputs, {} given".format(len(inputs)) + input_name = inputs[1].list_input_names()[0] + scales = params[input_name].asnumpy() + inputs = inputs[:1] assert len(scales) == 4 and scales[0] == 1.0 and scales[1] == 1.0 and scales[2] == scales[3] mode = attr.get('mode') if mode == b'nearest': diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py index 24f84ee73efe..941a275a8045 100644 --- a/nnvm/tests/python/frontend/onnx/test_forward.py +++ b/nnvm/tests/python/frontend/onnx/test_forward.py @@ -438,9 +438,41 @@ def _test_upsample_bilinear(): tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, 'float32') tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5) +def _test_upsample_bilinear_opset9(): + scale = 2 + in_shape = (1, 1, 3, 3) + out_shape = (1, 1, 3*scale, 3*scale) + y = helper.make_node("Upsample", ['in','scales'], ['out'], mode='linear') + scales=[1.0, 1.0, 2.0, 2.0] + in_array = np.random.uniform(size=in_shape).astype(np.float32) + out_array = topi.testing.bilinear_resize_python(in_array, (3*scale, 3*scale), "NCHW") + + ref_array = np.array(scales) + ref_node = helper.make_node('Constant', + inputs=[], + outputs=['scales'], + value=onnx.helper.make_tensor(name = 'const_tensor', + data_type = TensorProto.FLOAT, + dims = ref_array.shape, + vals = ref_array.flatten().astype(float))) + + graph = helper.make_graph([ref_node, y], + 'upsample_bilinear_opset9_test', + inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))]) + + model = helper.make_model(graph, producer_name='upsample_bilinear_opset9_test') + inputs = [] + inputs.append(in_array) + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, inputs, target, ctx, out_shape, 'float32') + tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5) + def test_upsample(): _test_upsample_nearest() _test_upsample_bilinear() + _test_upsample_bilinear_opset9() def _test_softmax(inshape, axis): opname = 'Softmax' From 65fe95c9ab3275bee0064ffb63971c43374edf16 Mon Sep 17 00:00:00 2001 From: OuHangKresnik Date: Thu, 25 Apr 2019 06:17:40 +0800 Subject: [PATCH 038/106] Fix UnboundLocalError: local variable 'tensor' referenced before assignment (#3074) --- topi/python/topi/opengl/pooling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/opengl/pooling.py b/topi/python/topi/opengl/pooling.py index 95dc561f6b08..99c4decd9960 100644 --- a/topi/python/topi/opengl/pooling.py +++ b/topi/python/topi/opengl/pooling.py @@ -107,8 +107,8 @@ def traverse(OP): if tag.is_broadcast(OP.tag): if OP not in s.outputs: s[OP].compute_inline() - for tensor in OP.input_tensors and tensor.op not in scheduled_ops: - if tensor.op.input_tensors: + for tensor in OP.input_tensors: + if tensor.op not in scheduled_ops and tensor.op.input_tensors: traverse(tensor.op) # schedule pool elif OP.tag.startswith('pool'): From cbc194b2da9c2244b2d4183697606f0b3bca95c7 Mon Sep 17 00:00:00 2001 From: eqy Date: Wed, 24 Apr 2019 20:16:59 -0700 Subject: [PATCH 039/106] check in (#3089) --- src/relay/pass/pattern_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index 87160b2cd130..b44bb682d317 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -300,7 +300,7 @@ inline Expr 
Add(Expr lhs, Expr rhs) { } -inline Expr Substract(Expr lhs, Expr rhs) { +inline Expr Subtract(Expr lhs, Expr rhs) { static const Op& op = Op::Get("subtract"); return CallNode::make(op, {lhs, rhs}, Attrs(), {}); } From 84da99a4c4fb9961839d260e9e4e5ef86b401850 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 26 Apr 2019 01:49:50 +0800 Subject: [PATCH 040/106] Use bridge network and expose port on macOS when launching the docker image (#3086) --- docker/bash.sh | 8 +++++++- docs/install/docker.rst | 13 ++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/docker/bash.sh b/docker/bash.sh index 23b2e209e01e..401c892e1a92 100755 --- a/docker/bash.sh +++ b/docker/bash.sh @@ -35,7 +35,13 @@ DOCKER_IMAGE_NAME=("$1") if [ "$#" -eq 1 ]; then COMMAND="bash" - CI_DOCKER_EXTRA_PARAMS=("-it --net=host") + if [[ $(uname) == "Darwin" ]]; then + # Docker's host networking driver isn't supported on macOS. + # Use the default bridge network and expose a port for jupyter notebook. + CI_DOCKER_EXTRA_PARAMS=("-it -p 8888:8888") + else + CI_DOCKER_EXTRA_PARAMS=("-it --net=host") + fi else shift 1 COMMAND=("$@") diff --git a/docs/install/docker.rst b/docs/install/docker.rst index 47503e70be97..f4236d7a29cd 100644 --- a/docs/install/docker.rst +++ b/docs/install/docker.rst @@ -46,7 +46,8 @@ This auxiliary script does the following things: - Mount current directory to /workspace - Switch user to be the same user that calls the bash.sh (so you can read/write host system) -- Use the host-side network (so you can use jupyter notebook) +- Use the host-side network on Linux. Use the bridge network and expose port 8888 on macOS, + because the host networking driver isn't supported there (so you can use jupyter notebook) Then you can start a jupyter notebook by typing .. code:: bash jupyter notebook +You might see an error ``OSError: [Errno 99] Cannot assign requested address`` when starting +a jupyter notebook on macOS. You can change the binding IP address by + +.. code:: bash + + jupyter notebook --ip=0.0.0.0 + +Note that on macOS, because we use the bridge network, jupyter notebook will be reported as running +at a URL like ``http://{container_hostname}:8888/?token=...``. You should replace the ``container_hostname`` +with ``localhost`` when pasting it into the browser. Docker Source ------------- From fc301183decd90da1bd4a67cf7c73d866f96047a Mon Sep 17 00:00:00 2001 From: Josh Pollock Date: Thu, 25 Apr 2019 10:56:46 -0700 Subject: [PATCH 041/106] [Relay][Text Format] Fix Pretty Printing Annotations (#3041) --- src/relay/ir/pretty_printer.cc | 24 ++++++++++++---------- tests/python/relay/test_ir_text_printer.py | 2 +- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/relay/ir/pretty_printer.cc b/src/relay/ir/pretty_printer.cc index 71502614abd6..6a3e5f85c9b2 100644 --- a/src/relay/ir/pretty_printer.cc +++ b/src/relay/ir/pretty_printer.cc @@ -156,14 +156,19 @@ class PrettyPrinter : */ Doc PrintOptionalInfo(const Expr& expr) { Doc doc; // additional information in comment.
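  // By default only constants and calls get a " /* ty=... */ " comment;
  // a user-supplied annotate_ callback replaces the default entirely, and
  // its output is emitted only when non-empty.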
- if (annotate_ != nullptr) { - return doc << " /* " << annotate_(expr) << " */"; - } else if (expr->checked_type_.defined()) { - return doc << " /* ty=" << Print(expr->checked_type()) << " */"; + // default annotations + if (annotate_ == nullptr) { + if ((expr.as() || expr.as()) && expr->checked_type_.defined()) { + doc << " /* ty=" << Print(expr->checked_type()) << " */"; + } } else { - return doc; + std::string annotated_expr = annotate_(expr); + if (annotated_expr != "") { + doc << annotated_expr; + } } + + return doc; } // indent a new body @@ -361,9 +366,7 @@ class PrettyPrinter : printed_expr = VisitExpr(expr); } - if (expr.as()) { - printed_expr << PrintOptionalInfo(expr); - } + printed_expr << PrintOptionalInfo(expr); // add expr to doc if (expr.as()) { @@ -409,8 +412,7 @@ class PrettyPrinter : } // default fall-back, record it as meta node. Doc doc; - return doc << Print(GetRef(op), true) - << PrintOptionalInfo(GetRef(op)); + return doc << Print(GetRef(op), true); } Doc VisitExpr_(const TupleNode* op) final { diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py index f10b258ff3cf..1924de006b7b 100644 --- a/tests/python/relay/test_ir_text_printer.py +++ b/tests/python/relay/test_ir_text_printer.py @@ -52,7 +52,7 @@ def test_env(): assert "def @myf" in str(env) assert "add(%0, %0) /* ty=float32 */" in text assert "add(%0, %0) /* ty=float32 */" in str(env) - show(env.astext(annotate=lambda x: str(x.checked_type.dtype))) + show(env.astext(annotate=lambda x: str(x.checked_type.dtype) if type(x) == relay.Call else "")) show(text) From c48c6e5ef8503028aa44ed5a10d227bbf7d93fd4 Mon Sep 17 00:00:00 2001 From: Hiroyuki Makino Date: Fri, 26 Apr 2019 03:05:42 +0900 Subject: [PATCH 042/106] [Relay][TOPI] Add rsqrt operator (#2949) --- docs/api/python/topi.rst | 2 ++ docs/langref/relay_op.rst | 2 ++ include/tvm/expr_operator.h | 1 + python/tvm/hybrid/runtime.py | 17 +++++++++++++++++ python/tvm/intrin.py | 18 +++++++++++++++++- python/tvm/relay/op/_tensor.py | 1 + python/tvm/relay/op/tensor.py | 20 ++++++++++++++++++++ src/codegen/intrin_rule.cc | 10 ++++++++++ src/relay/op/tensor/unary.cc | 11 ++++++++++- tests/python/relay/test_ir_op.py | 2 +- tests/python/relay/test_op_level1.py | 5 +++++ topi/include/topi/elemwise.h | 18 ++++++++++++++++++ topi/python/topi/math.py | 17 +++++++++++++++++ topi/src/topi.cc | 5 +++++ topi/tests/python/test_topi_basic.py | 1 + topi/tests/python/test_topi_math.py | 1 + 16 files changed, 128 insertions(+), 3 deletions(-) diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst index ce6d9e519ee5..222b3347d08e 100644 --- a/docs/api/python/topi.rst +++ b/docs/api/python/topi.rst @@ -36,6 +36,7 @@ List of operators topi.tanh topi.log topi.sqrt + topi.rsqrt topi.sigmoid topi.clip topi.cast @@ -122,6 +123,7 @@ topi .. autofunction:: topi.tanh .. autofunction:: topi.log .. autofunction:: topi.sqrt +.. autofunction:: topi.rsqrt .. autofunction:: topi.sigmoid .. autofunction:: topi.clip .. autofunction:: topi.cast diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index 9bdac71b6ee4..c45e9b92ab6f 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -41,6 +41,7 @@ This level enables fully connected multi-layer perceptron. tvm.relay.log tvm.relay.sqrt + tvm.relay.rsqrt tvm.relay.exp tvm.relay.sigmoid tvm.relay.add @@ -186,6 +187,7 @@ Level 1 Definitions ------------------- .. autofunction:: tvm.relay.log .. autofunction:: tvm.relay.sqrt +.. autofunction:: tvm.relay.rsqrt .. 
autofunction:: tvm.relay.exp .. autofunction:: tvm.relay.sigmoid .. autofunction:: tvm.relay.add diff --git a/include/tvm/expr_operator.h b/include/tvm/expr_operator.h index 25da5d0e4bf4..4ef3effaf251 100644 --- a/include/tvm/expr_operator.h +++ b/include/tvm/expr_operator.h @@ -486,6 +486,7 @@ TVM_DECLARE_INTRIN_UNARY(exp); TVM_DECLARE_INTRIN_UNARY(tanh); TVM_DECLARE_INTRIN_UNARY(sigmoid); TVM_DECLARE_INTRIN_UNARY(sqrt); +TVM_DECLARE_INTRIN_UNARY(rsqrt); TVM_DECLARE_INTRIN_UNARY(log); TVM_DECLARE_INTRIN_UNARY(popcount); diff --git a/python/tvm/hybrid/runtime.py b/python/tvm/hybrid/runtime.py index 7e4217c69c67..aa00b4b80251 100644 --- a/python/tvm/hybrid/runtime.py +++ b/python/tvm/hybrid/runtime.py @@ -52,6 +52,22 @@ def allocate(shape, dtype='float32', scope='global'): #pylint: disable=unused-ar return numpy.zeros(shape).astype(dtype) +def rsqrt(x): + """ + Computes reciprocal of square root of x element-wise + + Parameters + ---------- + x: Tensor + + Returns + ------- + res: Tensor + The result of reciprocal of square root of x + """ + return numpy.ones_like(x) / numpy.sqrt(x) + + def popcount(x): """ Count ones in the binary representation of number x @@ -103,6 +119,7 @@ def max_num_threads(allow_none=True): 'allocate' : allocate, 'output_tensor' : allocate, 'sqrt' : numpy.sqrt, + 'rsqrt' : rsqrt, 'log' : numpy.log, 'tanh' : numpy.tanh, 'power' : numpy.power, diff --git a/python/tvm/intrin.py b/python/tvm/intrin.py index 71739ad4321b..df854e270e9d 100644 --- a/python/tvm/intrin.py +++ b/python/tvm/intrin.py @@ -260,7 +260,7 @@ def log(x): def sqrt(x): - """Take log of input x. + """Take square root of input x. Parameters ---------- @@ -275,6 +275,22 @@ def sqrt(x): return call_pure_intrin(x.dtype, "sqrt", x) +def rsqrt(x): + """Take reciprocal of square root of input x. + + Parameters + ---------- + x : Expr + Input argument. + + Returns + ------- + y : Expr + The result. + """ + return call_pure_intrin(x.dtype, "rsqrt", x) + + def floor(x): """Take floor of float input x. diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index ef34ae9af8c8..8b3dd72db043 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -27,6 +27,7 @@ register_schedule("log", schedule_broadcast) register_schedule("exp", schedule_broadcast) register_schedule("sqrt", schedule_broadcast) +register_schedule("rsqrt", schedule_broadcast) register_schedule("sigmoid", schedule_broadcast) register_schedule("floor", schedule_broadcast) register_schedule("ceil", schedule_broadcast) diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py index eef4ec9c5e48..bcbbe0c55377 100644 --- a/python/tvm/relay/op/tensor.py +++ b/python/tvm/relay/op/tensor.py @@ -79,6 +79,26 @@ def sqrt(data): return _make.sqrt(data) +def rsqrt(data): + """Compute elementwise rsqrt of data. + + .. math:: + + 1/sqrt(x) + + Parameters + ---------- + data : relay.Expr + The input data + + Returns + ------- + result : relay.Expr + The computed result. + """ + return _make.rsqrt(data) + + def sigmoid(data): """Compute elementwise sigmoid of data. 
diff --git a/src/codegen/intrin_rule.cc b/src/codegen/intrin_rule.cc index 5bc5c518bb5b..230a3bccab3c 100644 --- a/src/codegen/intrin_rule.cc +++ b/src/codegen/intrin_rule.cc @@ -40,6 +40,16 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.tanh") TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.sqrt") .set_body(DispatchExtern); +TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.rsqrt") +.set_body([](const TVMArgs& args, TVMRetValue* rv){ + Expr e = args[0]; + const Call* call = e.as(); + CHECK(call != nullptr); + + auto one = make_const(call->args[0].type(), 1); + *rv = one / sqrt(call->args[0]); + }); + TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.pow") .set_body(DispatchExtern); diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc index 781d99488448..b723137a3a8e 100644 --- a/src/relay/op/tensor/unary.cc +++ b/src/relay/op/tensor/unary.cc @@ -64,7 +64,7 @@ RELAY_REGISTER_UNARY_OP("exp") .set_attr("FTVMCompute", RELAY_UNARY_COMPUTE(topi::exp)); RELAY_REGISTER_UNARY_OP("sqrt") -.describe(R"code(Returns the rsqrt input array, computed element-wise. +.describe(R"code(Returns the sqrt input array, computed element-wise. .. math:: sqrt(x) @@ -73,6 +73,15 @@ RELAY_REGISTER_UNARY_OP("sqrt") .set_support_level(1) .set_attr("FTVMCompute", RELAY_UNARY_COMPUTE(topi::sqrt)); +RELAY_REGISTER_UNARY_OP("rsqrt") +.describe(R"code(Returns the rsqrt input array, computed element-wise. + +.. math:: + 1/sqrt(x) + +)code" TVM_ADD_FILELINE) +.set_support_level(1) +.set_attr("FTVMCompute", RELAY_UNARY_COMPUTE(topi::rsqrt)); RELAY_REGISTER_UNARY_OP("zeros_like") .describe(R"code(Returns an array of zeros, with same type and shape as the input. diff --git a/tests/python/relay/test_ir_op.py b/tests/python/relay/test_ir_op.py index 8873e4d1c6b1..2f3109d36172 100644 --- a/tests/python/relay/test_ir_op.py +++ b/tests/python/relay/test_ir_op.py @@ -30,7 +30,7 @@ def test(x): def test_op_level1(): x = relay.Var("x") - for op_name in ["log", "exp", "sqrt", "tanh"]: + for op_name in ["log", "exp", "sqrt", "rsqrt","tanh"]: y = getattr(relay, op_name)(x) assert y.op.name == op_name assert y.op.support_level == 1 diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index d83f25db1b77..202464493d4b 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -30,6 +30,10 @@ def relu(x): np.maximum(x_copy, 0, x_copy) return x_copy +def rsqrt(x): + one = np.ones_like(x) + return one / np.sqrt(x) + def test_unary_op(): def check_single_op(opfunc, ref): shape = (10, 4) @@ -57,6 +61,7 @@ def check_single_op(opfunc, ref): for opfunc, ref in [(tvm.relay.log, np.log), (tvm.relay.exp, np.exp), (tvm.relay.sqrt, np.sqrt), + (tvm.relay.rsqrt, rsqrt), (tvm.relay.sigmoid, sigmoid), (tvm.relay.tanh, np.tanh), (relay.nn.relu, relu)]: diff --git a/topi/include/topi/elemwise.h b/topi/include/topi/elemwise.h index 5b6a96dd44ae..a9f8f630471f 100644 --- a/topi/include/topi/elemwise.h +++ b/topi/include/topi/elemwise.h @@ -129,6 +129,24 @@ inline Tensor sign(const Tensor& x, }, name, tag); } +/*! 
+* \brief Creates an operation that returns rsqrt of a given tensor +* +* \param x The input tensor +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the rsqrt operation +*/ +inline Tensor rsqrt(const Tensor& x, + std::string name = "tensor", + std::string tag = kElementWise) { + return compute(x->shape, [&](const Array& i) { + Expr one = make_const(x->dtype, 1); + return one/tvm::sqrt(x(i)); + }, name, tag); +} + /*! * \brief Creates an operation that clips each element of a tensor to * the interval [a_min, a_max] diff --git a/topi/python/topi/math.py b/topi/python/topi/math.py index c63f041b959e..5a1742b12c56 100644 --- a/topi/python/topi/math.py +++ b/topi/python/topi/math.py @@ -224,6 +224,23 @@ def sqrt(x): return tvm.compute(x.shape, lambda *i: tvm.sqrt(x(*i))) +@tvm.tag_scope(tag=tag.ELEMWISE) +def rsqrt(x): + """Take inverse square root of input x. + + Parameters + ---------- + x : tvm.Tensor + Input argument. + + Returns + ------- + y : tvm.Tensor + The result. + """ + return tvm.compute(x.shape, lambda *i: tvm.rsqrt(x(*i))) + + @tvm.tag_scope(tag=tag.ELEMWISE) def sigmoid(x): """Take sigmoid tanh of input x. diff --git a/topi/src/topi.cc b/topi/src/topi.cc index c583f1c115c2..2ab9a4235f38 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -163,6 +163,11 @@ TVM_REGISTER_GLOBAL("topi.sqrt") *rv = sqrt(args[0]); }); +TVM_REGISTER_GLOBAL("topi.rsqrt") +.set_body([](TVMArgs args, TVMRetValue *rv) { +*rv = rsqrt(args[0]); +}); + TVM_REGISTER_GLOBAL("topi.log") .set_body([](TVMArgs args, TVMRetValue *rv) { *rv = log(args[0]); diff --git a/topi/tests/python/test_topi_basic.py b/topi/tests/python/test_topi_basic.py index d18bbe537da1..ab485c853a2e 100644 --- a/topi/tests/python/test_topi_basic.py +++ b/topi/tests/python/test_topi_basic.py @@ -40,6 +40,7 @@ def test_apply(func, name): test_apply(topi.sigmoid, "sigmoid") test_apply(topi.log, "log") test_apply(topi.sqrt, "sqrt") + test_apply(topi.rsqrt, "rsqrt") if __name__ == "__main__": diff --git a/topi/tests/python/test_topi_math.py b/topi/tests/python/test_topi_math.py index 42e3e68a2db9..c180bc77e829 100644 --- a/topi/tests/python/test_topi_math.py +++ b/topi/tests/python/test_topi_math.py @@ -75,6 +75,7 @@ def check_device(device): test_apply(topi.sigmoid, "sigmoid", lambda x:1/(1+np.exp(-x)), -1, 1) test_apply(topi.log, "log", np.log, 0, 100) test_apply(topi.sqrt, "sqrt", np.sqrt, 0, 100) + test_apply(topi.rsqrt, "rsqrt", lambda x:np.ones_like(x)/np.sqrt(x), 0, 100, skip_name_check=True) if __name__ == "__main__": test_util() From 72d73abea2f51eaef43fe106b15931afbb424745 Mon Sep 17 00:00:00 2001 From: Logan Weber <36520469+weberlo@users.noreply.github.com> Date: Thu, 25 Apr 2019 15:38:40 -0700 Subject: [PATCH 043/106] Add VSCode directories to gitignore (#3095) --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 04dad2039860..e44aa6e21464 100644 --- a/.gitignore +++ b/.gitignore @@ -196,6 +196,9 @@ tvm_t.* .pytest_cache .local +# Visual Studio Code +.vscode + # tmp file .nfs* From 5e858bba127d8ee9021f5f5eaa773f2dade2d65c Mon Sep 17 00:00:00 2001 From: lixiaoquan Date: Fri, 26 Apr 2019 13:57:37 +0800 Subject: [PATCH 044/106] [Relay][TensorFlow] Remove 'input_0d_mismatch' special handling (#3087) * [Relay][TensorFlow] Remove 'input_0d_mismatch' special handling * Add more tests. 
* Cover the case that strided_slice outputs a scalar --- python/tvm/relay/frontend/tensorflow.py | 35 ++++--------------- .../frontend/tensorflow/test_forward.py | 17 +++++++++ 2 files changed, 23 insertions(+), 29 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 38903b055216..0f8b19bfb45f 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -99,7 +99,6 @@ def __call__(self, inputs, attrs, *args): self._ignores.append('_node_name') self._ignores.append('is_training') self._ignores.append('_target_layout') - self._ignores.append('_input_0d_mismatch') # apply custom check if self._custom_check: @@ -458,9 +457,9 @@ def _impl(inputs, attr, params): def _expand_dims(): def _impl(inputs, attr, params): dim_input = inputs.pop(1) - axis = params[dim_input.name_hint] - params.pop(dim_input.name_hint) - return _expand_dims_0d_aware(inputs[0], attr, axis=axis.asnumpy()[0]) + axis = params.pop(_get_name_hint(dim_input)).asnumpy()[0] + return AttrCvt(op_name="expand_dims", ignores=['Tdim', 'N'], + extras={'axis': int(axis), 'num_newaxis': 1})(inputs, attr) return _impl def _resize_bilinear(): @@ -528,7 +527,7 @@ def _impl(inputs, attr, params): def _pack(): def _impl(inputs, attr, params): axis = int(attr["axis"]) - inputs_reshaped = [_expand_dims_0d_aware(i, attr, axis=axis, num_newaxis=1) for i in inputs] + inputs_reshaped = [_op.expand_dims(i, axis=axis, num_newaxis=1) for i in inputs] return _op.concatenate(inputs_reshaped, axis) return _impl @@ -820,9 +819,9 @@ def _transform_mask(stride_dim, ellipsis_mask): pass else: final_output.append(out_shape[gather_index]) - # Prevent 0-dim tensors which are not accepted by Relay + if not final_output: - final_output.append(1) + return out return _op.reshape(out, newshape=tuple(final_output)) return _impl @@ -984,16 +983,6 @@ def _impl(inputs, attr, params): for split_item in splitted]), len(splitted)) return _impl -def _expand_dims_0d_aware(data, attr, axis, num_newaxis=1): - if data in attr['_input_0d_mismatch']: - return data if num_newaxis == 1 else \ - AttrCvt(op_name="expand_dims", ignores=['Tdim', 'N'], - extras={'axis': int(axis), 'num_newaxis': int(num_newaxis-1)})([data], attr) - - return AttrCvt(op_name="expand_dims", ignores=['Tdim', 'N'], - extras={'axis': int(axis), 'num_newaxis': int(num_newaxis)})([data], attr) - - def _softmax(): def _impl(inputs, attr, params): return AttrCvt(op_name='softmax', @@ -1647,7 +1636,6 @@ def __init__(self): self._output_shapes = {} self._num_param = 0 self._num_rnn_layer = False - self._outputs_are_0d = {} self._input_shapes = {} self._loops = {} self._branches = {} @@ -1737,7 +1725,6 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): # Operator name 'Const' is treated as a parameter to build params dict. 
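            # (The input_0d_mismatch set removed just below is obsolete:
            # expand_dims now lifts 0-d tensors uniformly, so no per-edge
            # scalar tracking is required.)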
input_shapes = {} - input_0d_mismatch = set() attr = self._parse_attr(node.attr) # Variable converted to Const will not have only value attr @@ -1753,10 +1740,6 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): # Will infer shapes if the graph is not frozen with add_shapes=True self._output_shapes[node.name] = [None] - self._outputs_are_0d[node.name] = [ \ - not shape if isinstance(tshape, list) else False \ - for tshape in self._output_shapes[node.name]] - if node.op == "Const": # All Const nodes are Param nodes, lets parse self._num_param += 1 @@ -1810,14 +1793,8 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): input_shape = self._output_shapes[node_name][0] inputs.append(in_sym[0]) input_shapes[in_sym[0]] = input_shape - # This means the node is 1d in Relay and 0d in TF. - # See `_expand_dims_0d_aware`. - if node_name in self._outputs_are_0d \ - and self._outputs_are_0d[node_name][tensor_slot] and input_shape: - input_0d_mismatch.add(in_sym[0]) attr['_input_shapes'] = input_shapes - attr['_input_0d_mismatch'] = input_0d_mismatch if node.op in _control_flow_nodes: op = self._convert_control_flow_operator(node, inputs, diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 0430b8cbfd27..84f40a19968b 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -580,6 +580,7 @@ def _test_stridedslice(ip_shape, begin, end, stride, dtype, def test_forward_stridedslice(): '''test StridedSlice''' + _test_stridedslice((2), [1], [1], [1], 'float32', shrink_axis_mask=1) _test_stridedslice((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], 'float32') _test_stridedslice((3, 4, 3), [1, 0], [4, 3], [2, 1], 'float32', ellipsis_mask=8) _test_stridedslice((3, 4, 3), [1, 0], [4, 2], [2, 1], 'float32', ellipsis_mask=2) @@ -1475,6 +1476,21 @@ def test_forward_rel_ops(): _test_forward_rel_op([t1, t2], math_ops.equal) _test_forward_rel_op([t1, t2], math_ops.not_equal) +####################################################################### +# ExpandDims +# ---------- +def _test_forward_expand_dims(data, axis): + in1 = tf.placeholder(shape=data.shape, dtype=data.dtype, name='in1') + out = tf.expand_dims(in1, axis) + compare_tf_with_tvm([data], [in1.name], out.name) + +def test_forward_expand_dims(): + _test_forward_expand_dims(np.int32(1), 0) + _test_forward_expand_dims(np.array([1]), 0) + _test_forward_expand_dims(np.array([1]), -1) + _test_forward_expand_dims(np.array([[1], [2]]), 0) + _test_forward_expand_dims(np.array([[1], [2]]), 1) + _test_forward_expand_dims(np.array([[1], [2]]), -1) ####################################################################### # Main @@ -1509,6 +1525,7 @@ def test_forward_rel_ops(): test_forward_reverse_v2() test_forward_pow_exp() test_forward_sign() + test_forward_expand_dims() # Reductions test_forward_argminmax() From 9ff03ba28e4cdbd79f454db741cdbfee07415fb9 Mon Sep 17 00:00:00 2001 From: Siva Date: Fri, 26 Apr 2019 20:57:30 +0530 Subject: [PATCH 045/106] [TEST][FLAKY] fix for #3099 (#3101) --- tests/python/frontend/tensorflow/test_forward.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 84f40a19968b..9b6dc573901e 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -1429,16 +1429,16 @@ def 
test_forward_sign(): compare_tf_with_tvm([np_data], ['in_data:0'], 'sign:0') def test_forward_pow_exp(): - """test Pow""" - np_in1 = np.random.uniform(-10, 10, size=(5, 7, 11)).astype(np.float32) - np_in2 = np.random.uniform(-10, 10, size=(5, 7, 11)).astype(np.float32) + """test Pow and Exp """ + np_in1 = np.random.uniform(-2, 2, size=(5, 7, 11)).astype(np.float32) + np_in2 = np.random.uniform(-2, 2, size=(5, 7, 11)).astype(np.float32) tf.reset_default_graph() in1 = tf.placeholder(tf.float32, (5, 7, 11), name="in1") in2 = tf.placeholder(tf.float32, (5, 7, 11), name="in2") out1 = tf.pow(in1, in2, name="pow") - out = tf.exp(out1, name='exp') + out = tf.exp(in1, name='exp') compare_tf_with_tvm([np_in1, np_in2], ['in1:0', 'in2:0'], 'pow:0') - compare_tf_with_tvm([np_in1, np_in2], ['in1:0', 'in2:0'], 'exp:0') + compare_tf_with_tvm([np_in1], ['in1:0'], 'exp:0') ####################################################################### # Mean From 5200c47f5c13ef928dd235d94bca3b9d5cdd89fa Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 26 Apr 2019 09:09:11 -0700 Subject: [PATCH 046/106] [COMMUNITY] @vinx13 -> committer (#3100) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index bd9b9c1c3c55..7cc913190db4 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -44,6 +44,7 @@ We do encourage everyone to work anything they are interested in. - [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei - topi, frontends - [Nick Hynes](https://github.com/nhynes): @nhynes: - sgx, rust - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler +- [Wuwei Lin](https://github.com/vinx13): @vinx13 - relay, topi - [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay - [Masahiro Masuda](https://github.com/masahi): @masahi - topi, relay - [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta From e057e2a357b05f1fe043d37dec435972d4a3840e Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Sat, 27 Apr 2019 00:41:54 +0800 Subject: [PATCH 047/106] [Relay, Quantization, TOPI] int8 dense on CUDA & Dense op quantization (#2877) * Quantize dense layers * Add out_dtype arggument to dense; Add dense_int8 on CUDA * Add topi unittest of dense int8 * Fix relay * Fix topi integration * Fix quantization * Update dense_rewrite * Triger CI * Change qconfig quantize_dense to quantize_op * Fix * Remove quantize_op from qconfig --- include/tvm/relay/attrs/nn.h | 6 + python/tvm/autotvm/task/topi_integration.py | 2 +- python/tvm/relay/op/nn/_nn.py | 4 +- python/tvm/relay/op/nn/nn.py | 7 +- python/tvm/relay/quantize/_annotate.py | 20 +++ src/relay/op/nn/nn.cc | 10 +- src/relay/pass/quantize.cc | 33 +++++ topi/include/topi/cuda/dense.h | 7 +- topi/include/topi/nn/dense.h | 9 +- topi/include/topi/rocm/dense.h | 7 +- topi/python/topi/cuda/__init__.py | 4 +- topi/python/topi/cuda/dense.py | 128 +++++++++++++++++++- topi/python/topi/nn/dense.py | 19 ++- topi/python/topi/rocm/dense.py | 10 +- topi/python/topi/x86/dense.py | 24 ++-- topi/src/topi.cc | 17 +-- topi/tests/python/common.py | 2 +- topi/tests/python/test_topi_conv2d_int8.py | 4 +- topi/tests/python/test_topi_dense.py | 58 ++++++++- topi/tests/python/test_topi_group_conv2d.py | 4 +- 20 files changed, 326 insertions(+), 49 deletions(-) diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index 431b6032c8cd..2049a8f869f8 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -336,10 +336,16 @@ struct GlobalPool2DAttrs : public 
tvm::AttrsNode { /*! \brief Attributes for dense operator */ struct DenseAttrs : public tvm::AttrsNode { IndexExpr units; + DataType out_dtype; TVM_DECLARE_ATTRS(DenseAttrs, "relay.attrs.DenseAttrs") { TVM_ATTR_FIELD(units) .describe("Number of hidden units of the dense transformation."); + + // use 0 bits to indicate none. + TVM_ATTR_FIELD(out_dtype) + .set_default(NullValue()) + .describe("Output data type, set to explicit type under mixed precision setting"); } }; diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index ee34c6da8607..ed1a2b75c979 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -188,7 +188,7 @@ def _topi_nn_conv2d_transpose_nchw(*args, **kwargs): def _topi_nn_dense(*args, **kwargs): assert not kwargs, "Do not support kwargs in template function call" args = deserialize_args(args) - data, weight, bias = args + data, weight, bias, _ = args C = topi.nn.dense(*args, **kwargs) s = topi.generic.schedule_dense([C]) if bias is not None: diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index e60c01cfb3ff..272b7511b9ed 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -51,7 +51,9 @@ def schedule_log_softmax(_, outputs, target): @reg.register_compute("nn.dense") def compute_dense(attrs, inputs, out_type, target): """Compute definition of dense""" - return [topi.nn.dense(inputs[0], inputs[1])] + out_dtype = attrs.out_dtype + out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype + return [topi.nn.dense(inputs[0], inputs[1], out_dtype=out_dtype)] @reg.register_schedule("nn.dense") def schedule_dense(attrs, outputs, target): diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 2d13f53f17fd..b772c43e11cd 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -475,7 +475,7 @@ def bias_add(data, bias, axis=1): return _make.bias_add(data, bias, axis) -def dense(data, weight, units=None): +def dense(data, weight, units=None, out_dtype=""): """Dense operator. Applies a linear transformation @@ -494,12 +494,15 @@ def dense(data, weight, units=None): units : int, optional Number of hidden units of the dense transformation. + out_dtype : str, optional + Specifies the output data type for mixed precision dense. + Returns ------- result : tvm.relay.Expr The computed result. """ - return _make.dense(data, weight, units) + return _make.dense(data, weight, units, out_dtype) def relu(data): diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 2fe1cb81675b..e52ce142e5c3 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -171,6 +171,26 @@ def conv2d_rewrite(ref_call, new_args, ctx): return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) +@register_annotate_function("nn.dense") +def dense_rewrite(ref_call, new_args, ctx): + """Rewrite function for dense. Lhs of dense will be quantized to input field, and rhs of + dense will be quantized to weight field. 
Output would be in activation field.""" + cnt = _conv_counter() + if cnt < current_qconfig().skip_k_conv: + return None + lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) + rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) + + if lhs_kind is None or lhs_kind != QAnnotateKind.INPUT: + lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT) + + assert rhs_kind is None + rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT) + + expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) + + @register_annotate_function("multiply") def multiply_rewrite(ref_call, new_args, ctx): """Rewrite function for multiply.""" diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 4141e602d6bc..b8749013867f 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -131,8 +131,12 @@ bool DenseRel(const Array& types, oshape.Set((oshape.size() - 1), wshape[0]); } + DataType out_dtype = param->out_dtype; + if (out_dtype.bits() == 0) { + out_dtype = data->dtype; + } // assign output type - reporter->Assign(types[2], TensorTypeNode::make(oshape, data->dtype)); + reporter->Assign(types[2], TensorTypeNode::make(oshape, out_dtype)); return true; } @@ -140,9 +144,11 @@ bool DenseRel(const Array& types, // Positional relay function to create dense operator used by frontend FFI. Expr MakeDense(Expr data, Expr weight, - IndexExpr units) { + IndexExpr units, + DataType out_dtype) { auto attrs = make_node(); attrs->units = units; + attrs->out_dtype = out_dtype; static const Op& op = Op::Get("nn.dense"); return CallNode::make(op, {data, weight}, Attrs(attrs), {}); } diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index 5fa30535b002..7fd27b46ad6a 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -296,6 +296,39 @@ RELAY_REGISTER_OP("nn.conv2d") .set_attr("FQRealizeRewrite", Conv2dRealize); +Expr DenseRealize(const Call& ref_call, + const Array& new_args, + const NodeRef& ctx) { + const QConfig& cfg = QConfig::Current(); + CHECK_EQ(new_args.size(), 2); + if (!new_args[0]->derived_from() || !new_args[1]->derived_from()) { + return Expr(nullptr); + } + const auto* lhs = new_args[0].as(); + const auto* rhs = new_args[1].as(); + + Expr ldata = lhs->data; + if (lhs->dtype != cfg->dtype_input) { + ldata = Cast(ldata, cfg->dtype_input); + } + Expr rdata = Cast(rhs->data, cfg->dtype_weight); + + const auto ref_attrs = ref_call->attrs.as(); + auto attrs = make_node(); + *attrs = *ref_attrs; + DataType out_dtype = cfg->dtype_activation; + attrs->out_dtype = out_dtype; + + Expr ret = CallNode::make(ref_call->op, + {ldata, rdata}, Attrs(attrs), ref_call->type_args); + Expr dom_scale = FoldConstant(Multiply(lhs->dom_scale, rhs->dom_scale)); + return QRealizeIntExprNode::make(ret, dom_scale, out_dtype); +} + +RELAY_REGISTER_OP("nn.dense") +.set_attr("FQRealizeRewrite", DenseRealize); + + Expr MulRealize(const Call& ref_call, const Array& new_args, const NodeRef& ctx) { diff --git a/topi/include/topi/cuda/dense.h b/topi/include/topi/cuda/dense.h index 5cfdce000ede..f640ee7c75a7 100644 --- a/topi/include/topi/cuda/dense.h +++ b/topi/include/topi/cuda/dense.h @@ -44,13 +44,15 @@ namespace cuda { * \param data Tensor with shape [batch, in_dim] * \param weight Tensor with shape [out_dim, in_dim] * \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor() +* \param out_dtype Output data type. Used for mixed precision. 
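* (With cuBLAS enabled, out_dtype must match the input dtype; the
* fallback path casts to out_dtype inside topi::nn::dense.)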
* * \return Tensor with shape [batch, out_dim] */ inline tvm::Tensor dense_cuda(const Target& target, const tvm::Tensor& data, const tvm::Tensor& weight, - const tvm::Tensor& bias) { + const tvm::Tensor& bias, + const Type& out_dtype) { CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data"; CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight"; if (bias.defined()) { @@ -62,6 +64,7 @@ inline tvm::Tensor dense_cuda(const Target& target, auto out_dim = weight->shape[0]; if (target->libs().count("cublas")) { + CHECK_EQ(data->dtype, out_dtype) << "Mixed precision not supported."; auto mm = topi::contrib::cublas_matmul(data, weight, false, true); if (bias.defined()) { mm = tvm::compute({ batch, out_dim }, @@ -72,7 +75,7 @@ inline tvm::Tensor dense_cuda(const Target& target, return mm; } else { - return topi::nn::dense(data, weight, bias); + return topi::nn::dense(data, weight, bias, out_dtype); } } diff --git a/topi/include/topi/nn/dense.h b/topi/include/topi/nn/dense.h index 755aad6b784d..d4a8b8963b6e 100644 --- a/topi/include/topi/nn/dense.h +++ b/topi/include/topi/nn/dense.h @@ -40,12 +40,14 @@ using namespace tvm; * \param data Tensor with shape [batch, in_dim] * \param weight Tensor with shape [out_dim, in_dim] * \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor() +* \param out_dtype Output data type. Used for mixed precision. * * \return Tensor with shape [batch, out_dim] */ inline tvm::Tensor dense(const tvm::Tensor& data, const tvm::Tensor& weight, - const tvm::Tensor& bias) { + const tvm::Tensor& bias, + const Type& out_dtype) { CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data"; CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight"; if (bias.defined()) { @@ -60,14 +62,15 @@ inline tvm::Tensor dense(const tvm::Tensor& data, auto matmul = tvm::compute( { batch, out_dim }, [&](Var i, Var j) { - return tvm::sum(data(i, k) * weight(j, k), { k }); + return tvm::sum(tvm::cast(out_dtype, data(i, k)) * + tvm::cast(out_dtype, weight(j, k)), { k }); }, "tensor", "dense"); if (bias.defined()) { matmul = tvm::compute( { batch, out_dim }, [&](Var i, Var j) { - return matmul(i, j) + bias(j); + return matmul(i, j) + tvm::cast(out_dtype, bias(j)); }, "tensor", kBroadcast); } diff --git a/topi/include/topi/rocm/dense.h b/topi/include/topi/rocm/dense.h index 67e60492a12d..58badfab446b 100644 --- a/topi/include/topi/rocm/dense.h +++ b/topi/include/topi/rocm/dense.h @@ -45,13 +45,15 @@ namespace rocm { * \param data Tensor with shape [batch, in_dim] * \param weight Tensor with shape [out_dim, in_dim] * \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor() +* \param out_dtype Output data type. Used for mixed precision. 
* * \return Tensor with shape [batch, out_dim] */ inline tvm::Tensor dense_rocm(const Target& target, const tvm::Tensor& data, const tvm::Tensor& weight, - const tvm::Tensor& bias) { + const tvm::Tensor& bias, + const Type& out_dtype) { CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data"; CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight"; if (bias.defined()) { @@ -63,6 +65,7 @@ inline tvm::Tensor dense_rocm(const Target& target, auto out_dim = weight->shape[0]; if (target->libs().count("rocblas")) { + CHECK_EQ(data->dtype, out_dtype) << "Mixed precision not supported."; auto mm = topi::contrib::rocblas_matmul(data, weight, false, true); if (bias.defined()) { mm = tvm::compute({ batch, out_dim }, @@ -73,7 +76,7 @@ inline tvm::Tensor dense_rocm(const Target& target, return mm; } else { - return topi::nn::dense(data, weight, bias); + return topi::nn::dense(data, weight, bias, out_dtype); } } diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index 706ecfb7f4bc..aca410b93276 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -2,7 +2,8 @@ """CUDA specific declaration and schedules.""" from __future__ import absolute_import as _abs -from . import conv2d, depthwise_conv2d, conv2d_transpose_nchw, deformable_conv2d, group_conv2d_nchw +from . import conv2d, depthwise_conv2d, conv2d_transpose_nchw, deformable_conv2d, \ + group_conv2d_nchw, dense from .conv2d_hwcn import schedule_conv2d_hwcn from .depthwise_conv2d import schedule_depthwise_conv2d_backward_input_nhwc from .depthwise_conv2d import schedule_depthwise_conv2d_backward_weight_nhwc @@ -10,7 +11,6 @@ from .reduction import schedule_reduce from .softmax import schedule_softmax from .injective import schedule_injective, schedule_elemwise, schedule_broadcast -from .dense import dense_cuda, schedule_dense from .pooling import schedule_pool, schedule_global_pool from .extern import schedule_extern from .nn import schedule_lrn, schedule_l2_normalize diff --git a/topi/python/topi/cuda/dense.py b/topi/python/topi/cuda/dense.py index 680567cf5791..d8c1f3303362 100644 --- a/topi/python/topi/cuda/dense.py +++ b/topi/python/topi/cuda/dense.py @@ -18,13 +18,17 @@ """Schedule for dense operator""" from __future__ import absolute_import as _abs import tvm +import tvm.autotvm as autotvm from tvm.contrib import cublas +from .tensor_intrin import dp4a from ..nn.dense import dense, dense_default from .. import tag from .. import generic +from ..util import traverse_inline, get_const_tuple -@dense.register("cuda") -def dense_cuda(data, weight, bias=None): + +@autotvm.register_topi_compute(dense, ["cuda", "gpu"], "direct") +def dense_cuda(cfg, data, weight, bias=None, out_dtype=None): """Dense operator for cuda backend. Parameters @@ -43,25 +47,29 @@ def dense_cuda(data, weight, bias=None): output : tvm.Tensor 2-D with shape [batch, out_dim] """ + # pylint: disable=unused-argument assert len(data.shape) == 2 and len(weight.shape) == 2, \ "only support 2-dim dense" if bias is not None: assert len(bias.shape) == 1 + if out_dtype is None: + out_dtype = data.dtype batch, in_dim = data.shape out_dim, _ = weight.shape target = tvm.target.current_target() if "cublas" in target.libs: + assert out_dtype == data.dtype, "Mixed precision not supported." 
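        # cuBLAS computes in the input dtype, hence the assert above; a true
        # mixed-precision dense (e.g. int8 x int8 -> int32) is served by the
        # dense_int8 template defined further down instead.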
matmul = cublas.matmul(data, weight, False, True) if bias is not None: matmul = tvm.compute((batch, out_dim), \ lambda i, j: matmul[i, j] + bias[j], \ tag=tag.BROADCAST) return matmul - return dense_default(data, weight, bias) + return dense_default(data, weight, bias, out_dtype) -@generic.schedule_dense.register(["cuda", "gpu"]) -def schedule_dense(outs): +@autotvm.register_topi_schedule(generic.schedule_dense, ["cuda", "gpu"], "direct") +def schedule_dense(cfg, outs): """Schedule for dense operator. Parameters @@ -75,6 +83,7 @@ def schedule_dense(outs): s: Schedule The computation schedule for dense. """ + # pylint: disable=unused-argument target = tvm.target.current_target() if target.target_name == "cuda" and "cublas" in target.libs: return generic.schedule_extern(outs) @@ -124,3 +133,112 @@ def traverse(OP): traverse(outs[0].op) return s + + +@autotvm.register_topi_compute(dense, ['cuda'], ['int8']) +def dense_int8(cfg, data, weight, bias=None, out_dtype=None): + """Dense operator for int8 on CUDA""" + if out_dtype is None: + out_dtype = data.dtype + batch, in_dim = get_const_tuple(data.shape) + out_dim, _ = get_const_tuple(weight.shape) + + k = tvm.reduce_axis((0, in_dim), name='k') + + matmul = tvm.compute((batch, out_dim), + lambda i, j: tvm.sum(data[i, k].astype(out_dtype) * + weight[j, k].astype(out_dtype), axis=[k]), + tag="dense_int8") + + cfg.add_flop(batch * in_dim * out_dim * 2) + + if bias is not None: + matmul = tvm.compute((batch, out_dim), + lambda i, j: matmul[i, j] + bias[j].astype(out_dtype), + tag=tag.BROADCAST) + cfg.add_flop(batch * out_dim) + + return matmul + + +@autotvm.register_topi_schedule(generic.schedule_dense, ['cuda', 'gpu'], ['int8']) +def schedule_dense_int8(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + def _callback(op): + if "dense_int8" in op.tag: + _schedule_dense_int8(cfg, s, op.output(0)) + traverse_inline(s, outs[0].op, _callback) + return s + + +_dp4a = dp4a('shared', 'shared', 'local') + +def _schedule_dense_int8(cfg, s, output): + data, weight = s[output].op.input_tensors + + batch, in_dim = get_const_tuple(data.shape) + out_dim, _ = get_const_tuple(weight.shape) + + in_dim_factor = 4 + assert in_dim % in_dim_factor == 0, "Input dimension must divide {}".format(in_dim_factor) + if in_dim % 16 == 0: + in_dim_factor = 16 + + # create tuning space + cfg.define_split("tile_y", batch, num_outputs=4) + cfg.define_split("tile_x", out_dim, num_outputs=4) + cfg.define_split("tile_k", in_dim // in_dim_factor, num_outputs=2) + cfg.define_knob('auto_unroll_max_step', [0, 512, 1500]) + + # create cache stage + AA = s.cache_read(data, 'shared', [output]) + WW = s.cache_read(weight, 'shared', [output]) + CC = s.cache_write(output, 'local') + + # handle bias + if output.op not in s.outputs: + s[output].compute_inline() + output = s.outputs[0].output(0) + + n, x = s[output].op.axis + + # this is the scope to attach global config inside this kernel + kernel_scope, n = s[output].split(n, nparts=1) + + ko = CC.op.reduce_axis[0] + ko, ki = s[CC].split(ko, factor=4) + ko, kt = cfg['tile_k'].apply(s, CC, ko) + s[CC].tensorize(ki, _dp4a) + by, vy, ty, yi = cfg['tile_y'].apply(s, output, n) + bx, vx, tx, xi = cfg['tile_x'].apply(s, output, x) + + s[output].reorder(by, bx, vy, vx, ty, tx, yi, xi) + s[output].bind(by, tvm.thread_axis('blockIdx.y')) + s[output].bind(bx, tvm.thread_axis('blockIdx.x')) + s[output].bind(vy, tvm.thread_axis('vthread')) + s[output].bind(vx, tvm.thread_axis('vthread')) + s[output].bind(ty, tvm.thread_axis('threadIdx.y')) + 
s[output].bind(tx, tvm.thread_axis('threadIdx.x')) + n_ty = cfg['tile_y'].size[2] + n_tx = cfg['tile_x'].size[2] + + s[CC].compute_at(s[output], tx) + yo, xo = CC.op.axis[:2] + s[CC].reorder(ko, kt, yo, xo, ki) + + for load in [AA, WW]: + s[load].compute_at(s[CC], ko) + + outer, inner = s[load].split(s[load].op.axis[-1], factor=in_dim_factor) + s[load].vectorize(inner) + fused = s[load].op.axis[:-1] + [outer] + fused = s[load].fuse(*fused) + + fused, tx = s[load].split(fused, factor=n_tx) + fused, ty = s[load].split(fused, factor=n_ty) + s[load].bind(tx, tvm.thread_axis('threadIdx.x')) + s[load].bind(ty, tvm.thread_axis('threadIdx.y')) + + s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) + s[output].pragma(kernel_scope, 'unroll_explicit', False) + return s diff --git a/topi/python/topi/nn/dense.py b/topi/python/topi/nn/dense.py index f116e7c4a31c..671b602edc30 100644 --- a/topi/python/topi/nn/dense.py +++ b/topi/python/topi/nn/dense.py @@ -19,7 +19,7 @@ import tvm from .. import tag -def dense_default(data, weight, bias=None): +def dense_default(data, weight, bias=None, out_dtype=None): """The default implementation of dense in topi. Parameters @@ -33,6 +33,9 @@ def dense_default(data, weight, bias=None): bias : tvm.Tensor, optional 1-D with shape [out_dim] + out_dtype : str + The output type. This is used for mixed precision. + Returns ------- output : tvm.Tensor @@ -42,21 +45,24 @@ def dense_default(data, weight, bias=None): "only support 2-dim dense" if bias is not None: assert len(bias.shape) == 1 + if out_dtype is None: + out_dtype = data.dtype batch, in_dim = data.shape out_dim, _ = weight.shape k = tvm.reduce_axis((0, in_dim), name='k') matmul = tvm.compute((batch, out_dim), \ - lambda i, j: tvm.sum(data[i, k] * weight[j, k], axis=k), \ + lambda i, j: tvm.sum(data[i, k].astype(out_dtype) * \ + weight[j, k].astype(out_dtype), axis=k), \ name='T_dense', tag='dense') if bias is not None: matmul = tvm.compute((batch, out_dim), \ - lambda i, j: matmul[i, j] + bias[j], \ + lambda i, j: matmul[i, j] + bias[j].astype(out_dtype), \ tag=tag.BROADCAST) return matmul @tvm.target.override_native_generic_func("dense") -def dense(data, weight, bias=None): +def dense(data, weight, bias=None, out_dtype=None): """Applies a linear transformation: :math:`Y = XW^T + b`. Parameters @@ -70,9 +76,12 @@ def dense(data, weight, bias=None): bias : tvm.Tensor, optional 1-D with shape [out_dim] + out_dtype : str + The output type. This is used for mixed precision. + Returns ------- output : tvm.Tensor 2-D with shape [batch, out_dim] """ - return dense_default(data, weight, bias) + return dense_default(data, weight, bias, out_dtype) diff --git a/topi/python/topi/rocm/dense.py b/topi/python/topi/rocm/dense.py index f0ed1cd4b70c..a8c033f0bd73 100644 --- a/topi/python/topi/rocm/dense.py +++ b/topi/python/topi/rocm/dense.py @@ -25,7 +25,7 @@ from .. import generic @dense.register("rocm") -def dense_rocm(data, weight, bias=None): +def dense_rocm(data, weight, bias=None, out_dtype=None): """Dense operator for rocm backend. Parameters @@ -39,6 +39,9 @@ def dense_rocm(data, weight, bias=None): bias : tvm.Tensor, optional 1-D with shape [out_dim] + out_dtype : str + The output type. This is used for mixed precision. 
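+        (For the rocBLAS path this must match the input dtype.)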
+ Returns ------- output : tvm.Tensor @@ -48,17 +51,20 @@ def dense_rocm(data, weight, bias=None): "only support 2-dim dense" if bias is not None: assert len(bias.shape) == 1 + if out_dtype is None: + out_dtype = data.dtype batch, in_dim = data.shape out_dim, _ = weight.shape target = tvm.target.current_target() if "rocblas" in target.libs: + assert out_dtype == data.dtype, "Mixed precision not supported." matmul = rocblas.matmul(data, weight, False, True) if bias is not None: matmul = tvm.compute((batch, out_dim), \ lambda i, j: matmul[i, j] + bias[j], \ tag=tag.BROADCAST) return matmul - return dense_default(data, weight, bias) + return dense_default(data, weight, bias, out_dtype) @generic.schedule_dense.register(["rocm"]) diff --git a/topi/python/topi/x86/dense.py b/topi/python/topi/x86/dense.py index 01f3a735f30b..2525ba0129ef 100644 --- a/topi/python/topi/x86/dense.py +++ b/topi/python/topi/x86/dense.py @@ -26,20 +26,22 @@ from ..util import traverse_inline, get_const_tuple @autotvm.register_topi_compute(nn.dense, "cpu", "direct") -def _declaration_dense(cfg, data, weight, bias=None): +def _declaration_dense(cfg, data, weight, bias=None, out_dtype=None): batch, _ = get_const_tuple(data.shape) # For small batch sizes, don't pack weight into cache-friendly layout # because of overhead in packing and limited reuse from batch dimension # TODO(icemelon9): use a more systematic way to determine which schedule to use if batch <= 16: - return _declaration_dense_nopack(cfg, data, weight, bias) - return _declaration_dense_pack(cfg, data, weight, bias) + return _declaration_dense_nopack(cfg, data, weight, bias, out_dtype) + return _declaration_dense_pack(cfg, data, weight, bias, out_dtype) # Declare dense compute with packing weight into cache-friendly layout @autotvm.register_topi_compute(nn.dense, "cpu", "direct_pack") -def _declaration_dense_pack(cfg, data, weight, bias=None): +def _declaration_dense_pack(cfg, data, weight, bias=None, out_dtype=None): + if out_dtype is None: + out_dtype = data.dtype batch, in_dim = get_const_tuple(data.shape) out_dim, _ = get_const_tuple(weight.shape) # create tuning space @@ -57,18 +59,21 @@ def _declaration_dense_pack(cfg, data, weight, bias=None): k = tvm.reduce_axis((0, in_dim), name="k") C = tvm.compute((batch, out_dim), lambda y, x: tvm.sum( - data[y, k] * packw[x // packw_bn, k, x % packw_bn], + data[y, k].astype(out_dtype) * + packw[x // packw_bn, k, x % packw_bn].astype(out_dtype), axis=k), tag="dense_pack") if bias is not None: - C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], + C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j].astype(out_dtype), tag=tag.BROADCAST) return C # Declare dense compute without packing weight @autotvm.register_topi_compute(nn.dense, "cpu", "direct_nopack") -def _declaration_dense_nopack(cfg, data, weight, bias=None): +def _declaration_dense_nopack(cfg, data, weight, bias=None, out_dtype=None): + if out_dtype is None: + out_dtype = data.dtype batch, in_dim = get_const_tuple(data.shape) out_dim, _ = get_const_tuple(weight.shape) # create tuning space @@ -82,14 +87,15 @@ def _declaration_dense_nopack(cfg, data, weight, bias=None): k = tvm.reduce_axis((0, in_dim // vec), "k") CC = tvm.compute((batch, out_dim, vec), lambda z, y, x: tvm.sum( - data[z, k * vec + x] * weight[y, k * vec + x], axis=k)) + data[z, k * vec + x].astype(out_dtype) * + weight[y, k * vec + x].astype(out_dtype), axis=k)) kk = tvm.reduce_axis((0, vec), "kk") C = tvm.compute((batch, out_dim), lambda y, x: tvm.sum(CC[y, x, 
kk], axis=kk), tag="dense_nopack") if bias is not None: - C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], + C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j].astype(out_dtype), tag=tag.BROADCAST) return C diff --git a/topi/src/topi.cc b/topi/src/topi.cc index 2ab9a4235f38..d486e7b831bc 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -403,7 +403,7 @@ TVM_REGISTER_GLOBAL("topi.nn.binary_dense") /* Ops from nn/dense.h */ TVM_REGISTER_GLOBAL("topi.nn.dense") .set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = nn::dense(args[0], args[1], args[2]); + *rv = nn::dense(args[0], args[1], args[2], args[3]); }); /* Ops from nn/bias_add.h */ @@ -544,7 +544,7 @@ TVM_REGISTER_GLOBAL("topi.x86.schedule_injective") /* ROCm schedules */ TVM_REGISTER_GLOBAL("topi.rocm.dense_cuda") .set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = rocm::dense_rocm(args[0], args[1], args[2], args[3]); + *rv = rocm::dense_rocm(args[0], args[1], args[2], args[3], args[4]); }); TVM_REGISTER_GLOBAL("topi.rocm.schedule_dense") @@ -565,7 +565,7 @@ TVM_REGISTER_GLOBAL("topi.rocm.schedule_l2_normalize") /* CUDA schedules */ TVM_REGISTER_GLOBAL("topi.cuda.dense_cuda") .set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = cuda::dense_cuda(args[0], args[1], args[2], args[3]); + *rv = cuda::dense_cuda(args[0], args[1], args[2], args[3], args[4]); }); TVM_REGISTER_GLOBAL("topi.cuda.schedule_dense") @@ -686,7 +686,8 @@ TVM_REGISTER_GENERIC_FUNC(schedule_binary_dense) using FTVMDenseOpBuilder = std::function; + const tvm::Tensor& bias, + const Type& out_dtype)>; /*! * \brief Helper function for registering dense ops matching the @@ -703,8 +704,9 @@ inline PackedFunc WrapDenseOp(FTVMDenseOpBuilder builder) { Tensor data = args[0]; Tensor weight = args[1]; Tensor bias = args[2]; + Type out_dtype = args[3]; - *ret = builder(target, data, weight, bias); + *ret = builder(target, data, weight, bias, out_dtype); }); } @@ -712,8 +714,9 @@ TVM_REGISTER_GENERIC_FUNC(dense) .set_default(WrapDenseOp([](const Target& target, const tvm::Tensor& data, const tvm::Tensor& weight, - const tvm::Tensor& bias) { - return topi::nn::dense(data, weight, bias); + const tvm::Tensor& bias, + const Type& out_dtype) { + return topi::nn::dense(data, weight, bias, out_dtype); })) .register_func({ "cuda", "gpu" }, WrapDenseOp(topi::cuda::dense_cuda)) .register_func({ "rocm" }, WrapDenseOp(topi::rocm::dense_rocm)); diff --git a/topi/tests/python/common.py b/topi/tests/python/common.py index d814c7a4012a..0bacd61129b7 100644 --- a/topi/tests/python/common.py +++ b/topi/tests/python/common.py @@ -32,7 +32,7 @@ def get_all_backend(): 'llvm -device=arm_cpu', 'opencl -device=mali', 'aocl_sw_emu'] -class NCHWcInt8Fallback(autotvm.FallbackContext): +class Int8Fallback(autotvm.FallbackContext): def _query_inside(self, target, workload): key = (target, workload) if key in self.memory: diff --git a/topi/tests/python/test_topi_conv2d_int8.py b/topi/tests/python/test_topi_conv2d_int8.py index b76c1af1353c..09adbcecefc3 100644 --- a/topi/tests/python/test_topi_conv2d_int8.py +++ b/topi/tests/python/test_topi_conv2d_int8.py @@ -25,7 +25,7 @@ from tvm.contrib.pickle_memoize import memoize from topi.util import get_const_tuple -from common import get_all_backend, NCHWcInt8Fallback +from common import get_all_backend, Int8Fallback oc_block_factor = 4 @@ -105,7 +105,7 @@ def check_device(device): def test_conv2d_nchw(): - with NCHWcInt8Fallback(): + with Int8Fallback(): # ResNet18 workloads where channels in / out are multiple of oc_block_factor 
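        # (oc_block_factor is 4 in this file, so the 64-channel workloads
        #  below satisfy the int8 channel-packing requirement)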
verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1) verify_conv2d_NCHWc_int8(1, 64, 56, 64, 1, 1, 0) diff --git a/topi/tests/python/test_topi_dense.py b/topi/tests/python/test_topi_dense.py index cf386e45aa71..412eb30501bd 100644 --- a/topi/tests/python/test_topi_dense.py +++ b/topi/tests/python/test_topi_dense.py @@ -22,7 +22,7 @@ from topi.util import get_const_tuple from tvm.contrib.pickle_memoize import memoize -from common import get_all_backend +from common import get_all_backend, Int8Fallback def verify_dense(batch, in_dim, out_dim, use_bias=True): A = tvm.placeholder((batch, in_dim), name='A') @@ -65,6 +65,55 @@ def check_device(device): for device in get_all_backend(): check_device(device) + +def verify_dense_int8(batch, in_dim, out_dim, use_bias=True): + dtype = 'int8' + out_dtype = 'int32' + A = tvm.placeholder((batch, in_dim), name='A', dtype=dtype) + B = tvm.placeholder((out_dim, in_dim), name='B', dtype=dtype) + C = tvm.placeholder((out_dim,), name='C', dtype=out_dtype) + + # use memoize to pickle the test data for next time use + @memoize("topi.tests.test_topi_dense_int8") + def get_ref_data(): + a_np = np.random.randint(low=-128, high=127, size=(batch, in_dim)).astype(dtype) + b_np = np.random.randint(low=-128, high=127, size=(out_dim, in_dim)).astype(dtype) + c_np = np.random.randint(low=-128, high=127, size=(out_dim,)).astype(out_dtype) + d_np = np.dot(a_np.astype(out_dtype), b_np.T.astype(out_dtype)) + if use_bias: + d_np += c_np + d_np = np.maximum(d_np, 0.0) + return (a_np, b_np, c_np, d_np) + + # get the test data + a_np, b_np, c_np, d_np = get_ref_data() + + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): + print("Skip because int8 intrinsics are not available") + return + + print("Running on target: %s" % device) + with tvm.target.create(device): + D = topi.nn.dense(A, B, C if use_bias else None, out_dtype=out_dtype) + D = topi.nn.relu(D) + s = topi.generic.schedule_dense([D]) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(c_np, ctx) + d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=out_dtype), ctx) + f = tvm.build(s, [A, B, C, D], device, name="dense") + f(a, b, c, d) + tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5) + + for device in ['cuda']: + check_device(device) + + def test_dense(): verify_dense(1, 1024, 1000, use_bias=True) verify_dense(1, 1024, 1000, use_bias=False) @@ -72,5 +121,12 @@ def test_dense(): verify_dense(2, 1024, 1000, use_bias=True) +def test_dense_int8(): + with Int8Fallback(): + verify_dense_int8(2, 1024, 1000, use_bias=True) + verify_dense_int8(2, 1024, 1000, use_bias=False) + + if __name__ == "__main__": test_dense() + test_dense_int8() diff --git a/topi/tests/python/test_topi_group_conv2d.py b/topi/tests/python/test_topi_group_conv2d.py index 5116db29e740..4189ac0a76f5 100644 --- a/topi/tests/python/test_topi_group_conv2d.py +++ b/topi/tests/python/test_topi_group_conv2d.py @@ -25,7 +25,7 @@ from tvm.contrib.pickle_memoize import memoize from topi.util import get_const_tuple -from common import get_all_backend, NCHWcInt8Fallback +from common import get_all_backend, Int8Fallback def verify_group_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups, add_bias=False, add_relu=False): @@ -203,7 +203,7 @@ def test_group_conv2d_nchw(): def test_group_conv2d_NCHWc_int8(): - with 
NCHWcInt8Fallback(): + with Int8Fallback(): # ResNeXt-50 workload verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32) verify_group_conv2d_NCHWc_int8(1, 256, 56, 256, 3, 2, 1, 1, 32) From 06c60bfaa421156626959d25f8cd1aa501d5cfd2 Mon Sep 17 00:00:00 2001 From: Salem Derisavi Date: Fri, 26 Apr 2019 12:49:29 -0400 Subject: [PATCH 048/106] [TVM][ARITH] Teach BoundDeduce to handle the case in which target var can appear in rhs of expression (#2795) * target variable can now appear in either lhs or rhs of the expression to be analyzed * removed extra spaces --- src/arithmetic/bound_deducer.cc | 57 ++++++++++++++++------ tests/python/unittest/test_arith_intset.py | 36 ++++++++++++-- 2 files changed, 75 insertions(+), 18 deletions(-) diff --git a/src/arithmetic/bound_deducer.cc b/src/arithmetic/bound_deducer.cc index c35dffb848d6..89e556c6f75f 100644 --- a/src/arithmetic/bound_deducer.cc +++ b/src/arithmetic/bound_deducer.cc @@ -207,24 +207,53 @@ void BoundDeducer::Init() { } void BoundDeducer::Transform() { + // We will ensure to set expr_ such that it contains target_ if (const LT* op = expr_.as()) { - is_greater = false; - expr_ = op->a; - // a < b -> a <= b - 1 - result = op->b - 1; + if (GetPath(target_, op->a).empty()) { + // a < b -> b >= a + 1 + is_greater = true; + expr_ = op->b; + result = op->a + 1; + } else { + // a < b -> a <= b - 1 + is_greater = false; + expr_ = op->a; + result = op->b - 1; + } } else if (const LE* op = expr_.as()) { - is_greater = false; - expr_ = op->a; - result = op->b; + if (GetPath(target_, op->a).empty()) { + // a <= b -> b >= a + is_greater = true; + expr_ = op->b; + result = op->a; + } else { + is_greater = false; + expr_ = op->a; + result = op->b; + } } else if (const GT* op = expr_.as()) { - is_greater = true; - expr_ = op->a; - // a > b -> a >= b + 1 - result = op->b + 1; + if (GetPath(target_, op->a).empty()) { + // a > b -> b <= a - 1 + is_greater = false; + expr_ = op->b; + result = op->a - 1; + } else { + // a > b -> a >= b + 1 + is_greater = true; + expr_ = op->a; + result = op->b + 1; + } } else if (const GE* op = expr_.as()) { - is_greater = true; - expr_ = op->a; - result = op->b; + if (GetPath(target_, op->a).empty()) { + // a >= b -> b <= a + is_greater = false; + expr_ = op->b; + result = op->a; + } else { + is_greater = true; + expr_ = op->a; + result = op->b; + } } else { success = false; } diff --git a/tests/python/unittest/test_arith_intset.py b/tests/python/unittest/test_arith_intset.py index 650efdfb2f0a..a74162ec07f2 100644 --- a/tests/python/unittest/test_arith_intset.py +++ b/tests/python/unittest/test_arith_intset.py @@ -38,32 +38,56 @@ def test_deduce(): b_s = tvm.arith.intset_interval(2, 3) c_s = tvm.arith.intset_interval(10, 15) d_s = tvm.arith.intset_interval(-3, -1) + zero = tvm.const(0, "int32") e0 = (-b)*a+c-d res0 = tvm.arith.DeduceBound(a, e0>=0, {b: b_s, c: c_s, d: d_s}, {}) ans0 = ((d - c) /(b*-1)) assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0) + # expression containing variable a is on rhs + res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {}) + assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0) + e0 = d*a+c-d res0 = tvm.arith.DeduceBound(a, e0>=0, {b: b_s, c: c_s, d: d_s}, {}) ans0 = ((0-c)/d + 1) assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0) + # expression containing variable a is on rhs + res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {}) + assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0) + e1 = (a*4+b < c) res1 = 
tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) ans1 = (((c - b) + -1)/4) assert str(tvm.ir_pass.Simplify(res1.max())) == str(ans1) + # expression containing variable a is on rhs + e1 = (c > a*4+b) + res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) + assert str(tvm.ir_pass.Simplify(res1.max())) == str(ans1) + e2 = (tvm.max(5, a * 4) < 0) res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) assert str(res2.max()) == "neg_inf" assert str(res2.min()) == "pos_inf" + # expression containing variable a is on rhs + e2 = (zero < tvm.max(5, a * 4)) + res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) + assert str(res2.max()) == "neg_inf" + assert str(res2.min()) == "pos_inf" + + e3 = (-b)+a*c-d res3 = tvm.arith.DeduceBound(a, e3>=0, {b: b_s, c: c_s, d: d_s}, {b: b_s, d: d_s}) ans3 = 2/c+1 assert str(tvm.ir_pass.Simplify(res3.min())) == str(ans3) + res3 = tvm.arith.DeduceBound(a, zero <= e3, {b: b_s, c: c_s, d: d_s}, {b: b_s, d: d_s}) + assert str(tvm.ir_pass.Simplify(res3.min())) == str(ans3) + def test_check(): a = tvm.var('a') b = tvm.var('b') @@ -97,11 +121,13 @@ def test_basic(a1, a2, coff): [x, y] = [res1.max(), b_s.max()] if coff > 0 else [res1.min(), b_s.min()] assert (tvm.ir_pass.Simplify((x * coff + 3 + y) < 17)).value == 1 - res1 = tvm.arith.DeduceBound(a, e0>17, {b: b_s}, {b: b_s}) + # expression containing variable a is on rhs + res1 = tvm.arith.DeduceBound(a, tvm.const(17, "int32") < e0, {b: b_s}, {b: b_s}) [x, y] = [res1.max(), b_s.max()] if coff < 0 else [res1.min(), b_s.min()] assert (tvm.ir_pass.Simplify((x * coff + 3 + y) > 17)).value == 1 - res1 = tvm.arith.DeduceBound(a, e0<=17, {b: b_s}, {b: b_s}) + # expression containing variable a is on rhs + res1 = tvm.arith.DeduceBound(a, tvm.const(17, "int32")>= e0, {b: b_s}, {b: b_s}) [x, y] = [res1.max(), b_s.max()] if coff > 0 else [res1.min(), b_s.min()] assert (tvm.ir_pass.Simplify((x * coff + 3 + y) <= 17)).value == 1 @@ -127,7 +153,8 @@ def test_complex(a1, a2, coff): [t, x] = [res1.max(), b_s.max()] if coff > 0 else [res1.min(), b_s.min()] assert (tvm.ir_pass.Simplify(((x*3 + t* coff) * 4) < 63)).value == 1 - res1 = tvm.arith.DeduceBound(a, e0<=63, {b: b_s}, {b: b_s}) + # expression containing variable a is on rhs + res1 = tvm.arith.DeduceBound(a, tvm.const(63, "int32")>= e0, {b: b_s}, {b: b_s}) [t, x] = [res1.max(), b_s.max()] if coff > 0 else [res1.min(), b_s.min()] assert (tvm.ir_pass.Simplify(((x*3 + t* coff) * 4) <= 63)).value == 1 @@ -135,7 +162,8 @@ def test_complex(a1, a2, coff): [t, x] = [res1.max(), b_s.max()] if coff < 0 else [res1.min(), b_s.min()] assert (tvm.ir_pass.Simplify(((x*3 + t* coff) * 4) > 63)).value == 1 - res1 = tvm.arith.DeduceBound(a, e0>=63, {b: b_s}, {b: b_s}) + # expression containing variable a is on rhs + res1 = tvm.arith.DeduceBound(a, tvm.const(63, "int32") <= e0, {b: b_s}, {b: b_s}) [t, x] = [res1.max(), b_s.max()] if coff < 0 else [res1.min(), b_s.min()] assert (tvm.ir_pass.Simplify(((x*3 + t* coff) * 4) >= 63)).value == 1 From 3560e7a7e70211f23ec42a3ba58c3dc040bb8cd7 Mon Sep 17 00:00:00 2001 From: Salem Derisavi Date: Fri, 26 Apr 2019 17:10:42 -0400 Subject: [PATCH 049/106] 1) fixed a functional bug in loop partitioning algorithm that is exposed when double splitting with indivisible factors 2) added a testcase (#2956) --- src/pass/loop_partition.cc | 294 ++++++++++++++---- .../unittest/test_pass_loop_partition.py | 41 +++ 2 files changed, 277 insertions(+), 58 deletions(-) diff --git a/src/pass/loop_partition.cc 
b/src/pass/loop_partition.cc index 1869b39ff5a9..04bb9385b156 100644 --- a/src/pass/loop_partition.cc +++ b/src/pass/loop_partition.cc @@ -38,12 +38,20 @@ using arith::IntSet; using arith::DeduceBound; using arith::Intersect; -// a partition means the expr is equal to true in the interval -struct Partition { - Expr expr; - IntSet interval; +using PartitionKey = std::pair; +struct PartitionKeyHash { + std::size_t operator()(PartitionKey const& k) const noexcept { + std::size_t h1 = std::hash{}(k.first); + std::size_t h2 = std::hash{}(k.second); + return h1 ^ h2; + } }; +// Each mapping (cond, cond_value) -> interval represents the fact that +// condition cond is proven to have value cond_value (true or false) in interval. +using Partition = std::unordered_map; + + bool ExprUseVars(Expr expr, const std::unordered_set& vars) { bool success = false; PostOrderVisit(expr, [&vars, &success](const NodeRef& node) { @@ -140,7 +148,9 @@ class CandidateSelector final : public IRVisitor { std::unordered_map record_; }; -// Find valid partition for specific variable +// Populate partitions data structure, i.e., for a specific variable, +// find an interval in which each condition +// (currently, "likely" conditions) has fixed true or false value class PartitionFinder : public IRVisitor { public: explicit PartitionFinder(VarExpr current_var, @@ -188,10 +198,23 @@ class PartitionFinder : public IRVisitor { Expr cond = op->args[0]; if (ExprUseVars(cond, std::unordered_set({current_var_.get()}))) { + // For cond, find out the interval, if exists, in which we can prove that cond is + // true. Also find the interval, if exists, in which we can prove that cond is + // false. IntSet interval = - DeduceBound(current_var_, cond, hint_map_, relax_map_); + DeduceBound(current_var_, cond, hint_map_, relax_map_); if (!interval.is_nothing()) { - partitions[cond.get()] = Partition{cond, interval}; + // cond is true within interval + partitions[{cond.get(), true}] = interval; + } + Expr inverse_cond = InverseCond(cond); + if (inverse_cond.defined()) { + IntSet interval = + DeduceBound(current_var_, inverse_cond, hint_map_, relax_map_); + if (!interval.is_nothing()) { + // cond is false within interval + partitions[{cond.get(), false}] = interval; + } } } } else { @@ -199,36 +222,59 @@ class PartitionFinder : public IRVisitor { } } - std::unordered_map partitions; + Partition partitions; private: + Expr InverseCond(const Expr& cond) { + // We expect most condition not to be of EQ or NE form. + // Currently we do not handle inversing EQ or NE. 
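+    // (Illustration: inverting EQ would yield NE, whose solution set is in
+    //  general not a single interval, so DeduceBound could not represent it.)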
+ Expr inverse_cond; + if (const LT* op = cond.as()) { + // a < b -> a >= b + inverse_cond = GE::make(op->a, op->b); + } else if (const GT* op = cond.as()) { + // a > b -> a <= b + inverse_cond = LE::make(op->a, op->b); + } else if (const LE* op = cond.as()) { + // a <= b -> a > b + inverse_cond = GT::make(op->a, op->b); + } else if (const GE* op = cond.as()) { + // a >= b -> a < b + inverse_cond = LT::make(op->a, op->b); + } + return inverse_cond; + } + VarExpr current_var_; std::unordered_set out_vars_; std::unordered_map hint_map_; std::unordered_map relax_map_; }; -// Eliminate the condition expressions by partitions +// Replace the set of conditions given by ps with cond_value (true or false) class ConditionEliminator : public IRMutator { public: - explicit ConditionEliminator(const std::unordered_map& ps) - : ps_(ps) {} + explicit ConditionEliminator(const std::unordered_set& ps, bool cond_value = true) + : ps_(ps), cond_value_(cond_value) {} using IRMutator::Mutate; Expr Mutate(Expr e) final { - if (ps_.count(e.get())) return Mutate(const_true()); + if (ps_.find(e.get()) != ps_.end()) { + return Mutate(cond_value_ ? const_true() : const_false()); + } return IRMutator::Mutate(e); } private: - const std::unordered_map& ps_; + std::unordered_set ps_; + bool cond_value_; }; // Insert the partition branch at the innermost thread scope class ThreadPartitionInserter : public IRMutator { public: - explicit ThreadPartitionInserter(const std::unordered_map& ps, + explicit ThreadPartitionInserter(const std::unordered_set& ps, Expr cond) : ps_(ps), cond_(cond), innermost_thread_scope_(false) {} Stmt Mutate_(const AttrStmt* op, const Stmt& s) final { @@ -250,12 +296,13 @@ class ThreadPartitionInserter : public IRMutator { } private: - const std::unordered_map& ps_; + const std::unordered_set& ps_; Expr cond_; bool innermost_thread_scope_; }; -// Try to do partition at the candidate IRs +// Try to partition range of iteration variables in order to remove (some) +// likely conditions class LoopPartitioner : public IRMutator { public: explicit LoopPartitioner(bool split_const_loop) @@ -273,7 +320,7 @@ class LoopPartitioner : public IRMutator { if (s.defined()) return s; } - // normal path when loop parittion fails + // normal path when loop partition fails // normal loop variable can be put into hint map. 
hint_map_.insert({op->loop_var.get(), IntSet::interval(op->min, op->min + op->extent - 1)}); @@ -316,6 +363,12 @@ private: Stmt TryPartition(const Node* op, const Stmt& stmt, VarExpr var, Expr min, Expr max, Stmt body, bool partition_thread_scope); + + std::pair> + GetIntervalAndCondset(const Partition &partitions, + const arith::Interval &for_interval, + bool cond_value); + inline Stmt MakeFor(const Node* op, Expr extent, Stmt body); /* Candidate IRs that may be partitioned potentially */ @@ -324,6 +377,98 @@ class LoopPartitioner : public IRMutator { CandidateSelector selector; }; +// Returns an interval (in the first component) in which all the conditions +// given in the second component provably have value given by cond_value +std::pair> +LoopPartitioner::GetIntervalAndCondset(const Partition &partitions, + const arith::Interval &for_interval, + bool cond_value) { + Array sets; + std::unordered_set cond_set; + + for (const auto &kv : partitions) { + if (kv.first.second == cond_value) { + arith::Interval interval = kv.second.as()->i; + auto intersection = arith::Interval::make_intersection(interval, for_interval); + + // TODO(derisavi): the following if statement needs to be removed as soon as + // TVM uses commit a768f2f0 of HalideIR repo + if (intersection.min.same_as(arith::Interval::pos_inf) || + intersection.max.same_as(arith::Interval::neg_inf)) { + intersection = arith::Interval::nothing(); + } else if (intersection.min.type() == intersection.max.type() && + (intersection.min.type().is_int() || + intersection.min.type().is_uint()) && + can_prove(intersection.min > intersection.max)) { + intersection = arith::Interval::nothing(); + } + + if (!intersection.is_empty()) { + sets.push_back(kv.second); + cond_set.insert(kv.first.first); + } + } + } + IntSet interval = sets.empty() ? IntSet::nothing() : Intersect(sets); + return std::make_pair(interval, cond_set); +} + +Stmt AppendStmts(const Stmt& a, const Stmt& b) { + if (!a.defined()) { + return b; + } else if (!b.defined()) { + return a; + } else { + return Block::make(a, b); + } +} + +/* + * Tries to recursively partition the range of the variable (given by var) of + * the for loop (given by node and stmt) into a + * number of disjoint ranges such that in some ranges one or more predicates + * in the loopnest are provably true or false in each range. For example, given the + * following loop to partition: + * for (i = 0; i < 4; i++) + * for (j = 0; j < 10; j++) + * if (likely(i*10 + j < 36)) + * A[10*i+j] = B[10*i+j] + * + * We first partition range of i, i.e., [0,3] into subranges [0,2] and [3,3] because the + * likely condition is always true for the first subrange but not always true for the + * second subrange. Therefore, we'll have + * for (i = 0; i < 3; i++) + * for (j = 0; j < 10; j++) + * if (likely(1)) + * A[10*i+j] = B[10*i+j] + * for (i = 0; i < 1; i++) + * for (j = 0; j < 10; j++) + * if (likely((i+3)*10 + j < 36)) + * A[10*(i+3)+j] = B[10*(i+3)+j] + * Which is simplified as: + * for (i = 0; i < 3; i++) + * for (j = 0; j < 10; j++) + * A[10*i+j] = B[10*i+j] + * for (j = 0; j < 10; j++) // loopnest 1 + * if (likely(j < 6)) + * A[30+j] = B[30+j] + * Now, we recursively partition j in loopnest 1 into subranges [0,5] and [6,9] where the + * condition is true for the first subrange and not always true for the second subrange.
+ * for (j = 0; j < 6; j++) + * if (likely(1)) + * A[30+j] = B[30+j] + * for (j = 0; j < 4; j++) // loop 2 + * if (likely(j < 0)) + * A[36+j] = B[36+j] + * Finally we recursively partition loop 2 above into subrange [0,3] where the + * condition is false and empty interval where the condition is not false, + * therefore we generate + * for (j = 0; j < 4; j++) + * if (likely(0)) + * A[36+j] = B[36+j] + * which will eventually be simplified to empty code. And because only one loop was generated + * from loop 2 we stop recursing. + */ Stmt LoopPartitioner::TryPartition(const Node* node, const Stmt& stmt, VarExpr var, @@ -333,29 +478,51 @@ Stmt LoopPartitioner::TryPartition(const Node* node, bool partition_thread_scope) { PartitionFinder finder(var, hint_map_, relax_map_); finder.Visit(body); - const auto& partitions = finder.partitions; - if (partitions.empty()) return Stmt(); - - Array sets; - // merge partitions (take their intersect) - for (const auto& kv : partitions) { - sets.push_back(kv.second.interval); + if (finder.partitions.empty()) return Stmt(); + + arith::Interval for_interval(min, max); + bool cond_value; + IntSet middle_interval; + std::unordered_set cond_set; + // find an interval in which all conditions on var are true + std::tie(middle_interval, cond_set) = + GetIntervalAndCondset(finder.partitions, for_interval, true); + if (middle_interval.is_nothing()) { + // if such an interval doesn't exist, find an interval in which all + // conditions on var are false + std::tie(middle_interval, cond_set) = + GetIntervalAndCondset(finder.partitions, for_interval, false); + if (middle_interval.is_nothing()) + // we couldn't find an interval in which the conditions are provably true or false + // Therefore, we can't partition the loop based on those conds + return Stmt(); + cond_value = false; + } else { + cond_value = true; } - IntSet true_itrv = Intersect(sets); + arith::Interval middle_interval_i = middle_interval.as()->i; + // middle_interval is the subrange of the loop variable range for which a + // set of conditions are true (or false resp.) + // The part of the loop variable range that is before (after resp.) that + // subrange is prefixed with pre- (post- resp.) + + // Calculating pre-subrange and generating code for it. + // pre-subrange = [min, body_begin) Expr body_begin; Stmt pre_stmt; - arith::Interval true_itrv_i = true_itrv.as()->i; - if (true_itrv_i.has_lower_bound()) { - body_begin = ir::Simplify(true_itrv.min()); + bool pre_stmt_recurse = true; + if (middle_interval_i.has_lower_bound()) { + body_begin = ir::Simplify(middle_interval.min()); if (!can_prove(body_begin == min)) { Expr cond = (body_begin - min >= 0); if (!can_prove(cond)) { LOG(WARNING) << "Cannot prove: " << cond << ", when generating the pre doubt loop"; body_begin = Max::make(body_begin, min); + // stop recursing on this interval if we can't prove it has non-negative length + pre_stmt_recurse = false; } - // [min, body_begin) if (!partition_thread_scope) { Stmt pre_body = Substitute(body, {{Var{var}, var + min}}); pre_stmt = MakeFor(node, body_begin - min, pre_body); @@ -365,31 +532,27 @@ Stmt LoopPartitioner::TryPartition(const Node* node, body_begin = min; } + // Calculating post-subrange and generating code for it.
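+  // (Sketch of the overall code layout generated below:
+  //    [min, body_begin)               -> pre_stmt,  conditions unproven
+  //    [body_begin, post_doubt_begin)  -> mid_stmt,  conditions == cond_value
+  //    [post_doubt_begin, max]         -> post_stmt, conditions unproven)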
+ // post-subrange = [post_doubt_begin, max] Expr post_doubt_begin; Stmt post_stmt; - if (true_itrv_i.has_upper_bound()) { - post_doubt_begin = ir::Simplify(true_itrv.max() + 1); - if (!can_prove(true_itrv.max() == max)) { + bool post_stmt_recurse = true; + if (middle_interval_i.has_upper_bound()) { + post_doubt_begin = ir::Simplify(middle_interval.max() + 1); + if (!can_prove(middle_interval.max() == max)) { // require the extent to be non-negative Expr cond = (max - post_doubt_begin + 1 >= 0); if (!can_prove(cond)) { LOG(WARNING) << "Cannot prove: " << cond << ", when generating the post doubt loop"; post_doubt_begin = Min::make(post_doubt_begin, max); + // stop recursing on this interval if we can't prove it has non-negative length + post_stmt_recurse = false; } - // [post_doubt_begin, max] if (!partition_thread_scope) { - Stmt post_body; - // If the loop is going from 0 to 1, replace the loop var with min value - if (as_const_int(max) && as_const_int(post_doubt_begin)) { - if (*as_const_int(max) == *as_const_int(post_doubt_begin)) { - post_body = Substitute(body, {{Var{var}, post_doubt_begin}}); - post_stmt = post_body; - } - } else { - post_body = Substitute(body, {{Var{var}, var + post_doubt_begin}}); - post_stmt = MakeFor(node, max - post_doubt_begin + 1, post_body); - } + Stmt post_body = + Substitute(body, {{Var{var}, var + post_doubt_begin}}); + post_stmt = MakeFor(node, max - post_doubt_begin + 1, post_body); } } } else { @@ -397,25 +560,35 @@ Stmt LoopPartitioner::TryPartition(const Node* node, } Stmt s; + + // Generating code for middle subrange if (!partition_thread_scope) { - // [body_begin, post_doubt_begin) - Stmt simplified_body = ConditionEliminator(partitions).Mutate(body); - Stmt new_body = Substitute(simplified_body, {{Var{var}, var + body_begin}}); - s = MakeFor(node, post_doubt_begin - body_begin, new_body); - - if (!(pre_stmt.defined() && post_stmt.defined())) s = VisitAndMutate(s); - if (pre_stmt.defined()) s = Block::make(pre_stmt, s); - if (post_stmt.defined()) { - if (as_const_int(max) && as_const_int(post_doubt_begin)) { - post_stmt = VisitAndMutate(post_stmt); + Stmt mid_stmt; + if (!can_prove(body_begin >= post_doubt_begin)) { + // [body_begin, post_doubt_begin) + Stmt simplified_body = ConditionEliminator(cond_set, cond_value).Mutate(body); + Stmt new_body = Substitute(simplified_body, {{Var{var}, var + body_begin}}); + mid_stmt = MakeFor(node, post_doubt_begin - body_begin, new_body); + + // Recurse for each non-empty subrange only if there are at least + // two non-empty subranges + if (pre_stmt.defined() || post_stmt.defined()) { + mid_stmt = VisitAndMutate(mid_stmt); + if (pre_stmt.defined() && pre_stmt_recurse) { + pre_stmt = VisitAndMutate(pre_stmt); + } + if (post_stmt.defined() && post_stmt_recurse) { + post_stmt = VisitAndMutate(post_stmt); + } } - s = Block::make(s, post_stmt); } + s = AppendStmts(pre_stmt, mid_stmt); + s = AppendStmts(s, post_stmt); } else { Expr cond = const_true(); if (!can_prove(body_begin == min)) cond = cond && (var >= body_begin); if (!can_prove(post_doubt_begin == (max + 1))) cond = cond && (var < post_doubt_begin); - s = ThreadPartitionInserter(partitions, cond).Mutate(stmt); + s = ThreadPartitionInserter(cond_set, cond).Mutate(stmt); } s = ConvertSSA(s); return s; @@ -424,8 +597,13 @@ Stmt LoopPartitioner::TryPartition(const Node* node, inline Stmt LoopPartitioner::MakeFor(const Node *node, Expr extent, Stmt body) { const For *for_node = static_cast(node); CHECK(for_node); - return For::make(for_node->loop_var, 0, 
extent, - for_node->for_type, for_node->device_api, body); + if (can_prove(extent == make_const(Int(32), 1))) { + // If the loop extent is 1, do not create the loop anymore + return Substitute(body, {{Var{for_node->loop_var}, make_const(Int(32), 0)}}); + } else { + return For::make(for_node->loop_var, 0, extent, + for_node->for_type, for_node->device_api, body); + } } class RemoveLikelyTags : public IRMutator { diff --git a/tests/python/unittest/test_pass_loop_partition.py index 80b4f9232c50..85cb9b9acd16 100644 --- a/tests/python/unittest/test_pass_loop_partition.py +++ b/tests/python/unittest/test_pass_loop_partition.py @@ -15,12 +15,21 @@ # specific language governing permissions and limitations # under the License. import tvm +import numpy def collect_visit(stmt, f): ret = [] tvm.ir_pass.PostOrderVisit(stmt, lambda x : ret.append(f(x))) return ret +def find_top_produce(stmt): + def f(x, ret): + if isinstance(x, tvm.stmt.ProducerConsumer): + ret.append(x) + ret = [] + tvm.ir_pass.PostOrderVisit(stmt, lambda x : f(x, ret)) + return ret[-1] + def lower(sch, args): binds = {} arg_list = [] @@ -344,6 +353,37 @@ def test_conv_tiling(): stmt = tvm.ir_pass.Simplify(stmt) assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse)))) +def test_double_splitting_with_indivisible_factors(): + m = 48 + dtype="float32" + A = tvm.placeholder((m,), name='A', dtype=dtype) + C = tvm.compute((m,), lambda i: A[i], name='C') + D = tvm.compute((m,), lambda i: C[i], name='D') + + s = tvm.create_schedule(D.op) + co, ci = s[C].split(C.op.axis[0], factor=10) + do, di = s[D].split(D.op.axis[0], 32) + s[C].compute_at(s[D], do) + + target = 'llvm' + with tvm.build_config(partition_const_loop=True): + f = tvm.lower(s, [A, C, D], name="fadd1", simple_mode=False) + func = tvm.build(f, target=target) + + # Find the beginning of the Halide IR corresponding to kernel code + # and make sure it doesn't have any if statements left + top_produce = find_top_produce(f.body) + assert(not any(collect_visit(top_produce, lambda x: isinstance(x, tvm.stmt.IfThenElse)))) + + # check functional correctness of generated code + ctx = tvm.context(target, 0) + a = tvm.nd.array(numpy.ones(m,).astype(dtype), ctx) + c = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx) + d = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx) + func(a, c, d) + tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy(), rtol=1e-5) + tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy(), rtol=1e-5) + if __name__ == "__main__": test_basic() test_const_loop() @@ -361,3 +401,4 @@ def test_conv_tiling(): test_cce_loop_2() test_cce_loop_3() test_conv_tiling() + test_double_splitting_with_indivisible_factors() From 992f918e16ca504989c9ed6578e726baeea612e7 Mon Sep 17 00:00:00 2001 From: "Ruizhe Zhao (Vincent)" Date: Sat, 27 Apr 2019 03:15:21 +0100 Subject: [PATCH 050/106] Fixed issue #3069 by checking op tag (#3070) * Fixed issue #3069 by adding in_channels * Registered group_conv2d_nchw as topi compute * Improved by checking tag value * Removed group_conv2d_nchw topi registration * Added test for relay group_conv2d_nchw * Added assertions to forbid small group size * Removed hard-coded oc_block_factor * Added explanatory comments to group_conv2d_nchw_cuda * Updated group_conv2d_nchw_cuda schedule Removed 'direct' CUDA tests * Reverted an accidental change in a conv2d test * Fixed indentation problems * Fixed a mis-commented line * Reverted change in group_conv2d_nchw tag * Removed commented int8
group_conv2d test * Fixed group size assertions in group_conv2d_nchw_cuda --- python/tvm/relay/frontend/onnx.py | 1 - python/tvm/relay/op/nn/_nn.py | 89 ++++++++++++++++++---- python/tvm/target.py | 2 +- tests/python/relay/test_op_level2.py | 29 ++++++- topi/python/topi/cuda/group_conv2d_nchw.py | 37 +++++++-- topi/python/topi/generic/nn.py | 2 +- topi/python/topi/nn/conv2d.py | 2 +- 7 files changed, 135 insertions(+), 27 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index ebedc20375e5..53f104ce48cf 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -169,7 +169,6 @@ class Conv(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): - # get number of channels out = AttrCvt(op_name=dimension_picker('conv'), transforms={ 'kernel_shape': 'kernel_size', diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 272b7511b9ed..5e9d5d74498d 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -#pylint: disable=invalid-name, unused-argument +# pylint: disable=invalid-name, unused-argument """Backend compiler related feature registration""" from __future__ import absolute_import @@ -34,16 +34,19 @@ def schedule_softmax(_, outputs, target): with target: return topi.generic.schedule_softmax(outputs) + reg.register_pattern("nn.softmax", OpPattern.OPAQUE) schedule_broadcast = schedule_injective + @reg.register_schedule("nn.log_softmax") def schedule_log_softmax(_, outputs, target): """Schedule definition of log_softmax""" with target: return topi.generic.schedule_softmax(outputs) + reg.register_pattern("nn.log_softmax", OpPattern.OPAQUE) @@ -55,12 +58,14 @@ def compute_dense(attrs, inputs, out_type, target): out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype return [topi.nn.dense(inputs[0], inputs[1], out_dtype=out_dtype)] + @reg.register_schedule("nn.dense") def schedule_dense(attrs, outputs, target): """Schedule definition of dense""" with target: return topi.generic.schedule_dense(outputs) + reg.register_pattern("nn.dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE) @@ -70,16 +75,29 @@ def compute_batch_matmul(attrs, inputs, out_type, target): """Compute definition of batch_matmul""" return [topi.nn.batch_matmul(inputs[0], inputs[1])] + @reg.register_schedule("nn.batch_matmul") def schedule_batch_matmul(attrs, outputs, target): """Schedule definition of batch_matmul""" with target: return topi.generic.schedule_batch_matmul(outputs) + reg.register_pattern("nn.batch_matmul", reg.OpPattern.OUT_ELEMWISE_FUSABLE) # conv2d +def _find_conv2d_op(op): + """Find the op with conv2d in its tag by traversing.""" + if 'conv2d' in op.tag: + return op + for tensor in op.input_tensors: + op_ = _find_conv2d_op(tensor.op) + if op_ is not None: + return op_ + return None + + @reg.register_compute("nn.conv2d") def compute_conv2d(attrs, inputs, out_type, target): """Compute definition of conv2d""" @@ -103,14 +121,14 @@ def compute_conv2d(attrs, inputs, out_type, target): inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype=out_dtype) elif layout == "NCHW" and \ - get_const_int(inputs[1].shape[0]) == groups and \ - get_const_int(inputs[1].shape[1]) == 1: + get_const_int(inputs[1].shape[0]) == groups and \ + get_const_int(inputs[1].shape[1]) == 1: out = topi.nn.depthwise_conv2d_nchw( inputs[0], 
inputs[1], strides, padding, dilation, out_dtype=out_dtype) elif layout == "NHWC" and \ - kernel_layout == "HWOI" and\ - get_const_int(inputs[1].shape[2]) == groups and \ - get_const_int(inputs[1].shape[3]) == 1: + kernel_layout == "HWOI" and\ + get_const_int(inputs[1].shape[2]) == groups and \ + get_const_int(inputs[1].shape[3]) == 1: out = topi.nn.depthwise_conv2d_nhwc( inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype) elif layout in ['NCHW', 'NCHW4c']: @@ -127,6 +145,7 @@ def schedule_conv2d(attrs, outs, target): groups = attrs.groups layout = attrs.data_layout kernel_layout = attrs.kernel_layout + with target: if groups == 1 and layout == "NCHW": return topi.generic.schedule_conv2d_nchw(outs) @@ -135,13 +154,20 @@ def schedule_conv2d(attrs, outs, target): if groups == 1 and layout == "NHWC": return topi.generic.schedule_conv2d_nhwc(outs) if groups != 1: - if layout == "NCHW": - # TODO(leyuan, merrymercy, Huyuwei): fold depthwise topi into conv2d. - return topi.generic.schedule_depthwise_conv2d_nchw(outs) - if layout == "NHWC" and kernel_layout == "HWOI": - return topi.generic.schedule_depthwise_conv2d_nhwc(outs) - if layout == "NCHW4c": - return topi.generic.schedule_group_conv2d_nchw(outs) + # collect in_channels to distinguish depthwise and group conv2d + op = _find_conv2d_op(outs[0].op) + assert op is not None + + is_depthwise = 'depthwise' in op.tag + if is_depthwise: + if layout == "NCHW": + # TODO(leyuan, merrymercy, Huyuwei): fold depthwise topi into conv2d. + return topi.generic.schedule_depthwise_conv2d_nchw(outs) + if layout == "NHWC" and kernel_layout == "HWOI": + return topi.generic.schedule_depthwise_conv2d_nhwc(outs) + else: + if layout in ["NCHW", "NCHW4c"]: + return topi.generic.schedule_group_conv2d_nchw(outs) raise ValueError("No compatible schedule") @@ -151,6 +177,7 @@ def alter_op_layout_conv2d(attrs, inputs, tinfos): from ... 
import op return topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, op) + reg.register_pattern("nn.conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) @@ -169,18 +196,21 @@ def compute_conv2d_transpose(attrs, inputs, out_dtype, target): assert layout == "NCHW", "only support nchw for now" assert dilation == (1, 1), "not support dilate now" assert groups == 1, "only support groups == 1 for now" - out = topi.nn.conv2d_transpose_nchw(inputs[0], inputs[1], strides, padding, out_dtype) + out = topi.nn.conv2d_transpose_nchw( + inputs[0], inputs[1], strides, padding, out_dtype) output_padding = get_const_tuple(attrs.output_padding) out = topi.nn.pad(out, [0, 0, 0, 0], [0, 0, output_padding[0], output_padding[1]]) return [out] + @reg.register_schedule("nn.conv2d_transpose") def schedule_conv2d_transpose(attrs, outs, target): """Schedule definition of conv2d_transpose""" with target: return topi.generic.schedule_conv2d_transpose_nchw(outs) + reg.register_pattern("nn.conv2d_transpose", OpPattern.OUT_ELEMWISE_FUSABLE) # bias_add @@ -196,6 +226,7 @@ def schedule_max_pool2d(attrs, outs, target): with target: return topi.generic.schedule_pool(outs, layout) + reg.register_pattern("nn.max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) @@ -207,6 +238,7 @@ def schedule_avg_pool2d(attrs, outs, target): with target: return topi.generic.schedule_pool(outs, layout) + reg.register_pattern("nn.avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) @@ -217,6 +249,7 @@ def schedule_global_max_pool2d(_, outs, target): with target: return topi.generic.schedule_global_pool(outs) + reg.register_pattern("nn.global_max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) @@ -227,6 +260,7 @@ def schedule_global_avg_pool2d(_, outs, target): with target: return topi.generic.schedule_global_pool(outs) + reg.register_pattern("nn.global_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # leaky_relu @@ -250,12 +284,14 @@ def compute_lrn(attrs, inputs, out_dtype, target): return [topi.nn.lrn(inputs[0], attrs.size, attrs.axis, attrs.alpha, attrs.beta, attrs.bias)] + @reg.register_schedule("nn.lrn") def schedule_lrn(attrs, outs, target): """Schedule definition of lrn""" with target: return topi.generic.schedule_lrn(outs) + reg.register_pattern("nn.lrn", OpPattern.OPAQUE) @@ -265,20 +301,26 @@ def compute_l2_normalize(attrs, inputs, out_dtype, target): """Compute definition of l2 normalize""" return [topi.nn.l2_normalize(inputs[0], attrs.eps, attrs.axis)] + @reg.register_schedule("nn.l2_normalize") def schedule_l2_normalize(attrs, outs, target): """Schedule definition of l2 normalize""" with target: return topi.generic.schedule_l2_normalize(outs) + reg.register_pattern("nn.l2_normalize", OpPattern.OUT_ELEMWISE_FUSABLE) # upsampling reg.register_schedule("nn.upsampling", reg.schedule_injective) + + def schedule_upsampling(_, outs, target): """Schedule definition of upsampling""" with target: return topi.generic.schedule_injective(outs) + + # pad reg.register_schedule("nn.pad", schedule_broadcast) @@ -304,12 +346,14 @@ def compute_contrib_conv2d_winograd_without_weight_transform(attrs, inputs, out_ return [out] + @reg.register_schedule("nn.contrib_conv2d_winograd_without_weight_transform") def schedule_contrib_conv2d_winograd_without_weight_transform(attrs, outs, target): """Schedule definition of conv2d_winograd_without_weight_transform""" with target: return topi.generic.schedule_conv2d_winograd_without_weight_transform(outs) + reg.register_pattern("nn.contrib_conv2d_winograd_without_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE) @@ -317,15 +361,18 @@ def 
schedule_contrib_conv2d_winograd_without_weight_transform(attrs, outs, targe @reg.register_compute("nn.contrib_conv2d_winograd_weight_transform") def compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, out_dtype, target): """Compute definition of contrib_conv2d_winograd_weight_transform""" - out = topi.nn.conv2d_winograd_weight_transform(inputs[0], attrs.get_int('tile_size')) + out = topi.nn.conv2d_winograd_weight_transform( + inputs[0], attrs.get_int('tile_size')) return [out] + @reg.register_schedule("nn.contrib_conv2d_winograd_weight_transform") def schedule_contrib_conv2d_winograd_weight_transform(attrs, outs, target): """Schedule definition of contrib_conv2d_winograd_weight_transform""" with target: return topi.generic.schedule_conv2d_winograd_weight_transform(outs) + reg.register_pattern("nn.contrib_conv2d_winograd_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE) @@ -353,12 +400,14 @@ def compute_contrib_conv2d_winograd_nnpack_without_weight_transform( return [out] + @reg.register_schedule("nn.contrib_conv2d_winograd_nnpack_without_weight_transform") def schedule_contrib_conv2d_winograd_nnpack_without_weight_transform(attrs, outs, target): """Schedule definition of conv2d_winograd_nnpack_without_weight_transform""" with target: return topi.generic.schedule_conv2d_winograd_nnpack_without_weight_transform(outs) + reg.register_pattern("nn.contrib_conv2d_winograd_nnpack_without_weight_transform", OpPattern.OPAQUE) @@ -371,12 +420,14 @@ def compute_contrib_conv2d_winograd_nnpack_weight_transform(attrs, inputs, out_d inputs[0], convolution_algorithm, out_dtype) return [out] + @reg.register_schedule("nn.contrib_conv2d_winograd_nnpack_weight_transform") def schedule_contrib_conv2d_winograd_nnpack_weight_transform(attrs, outs, target): """Schedule definition of contrib_conv2d_winograd_nnpack_weight_transform""" with target: return topi.generic.schedule_conv2d_winograd_nnpack_weight_transform(outs) + reg.register_pattern("nn.contrib_conv2d_winograd_nnpack_weight_transform", OpPattern.OPAQUE) @@ -397,15 +448,18 @@ def compute_contrib_conv2d_NCHWc(attrs, inputs, out_dtype, target): data_layout, out_layout, out_dtype) return [out] + @reg.register_schedule("nn.contrib_conv2d_NCHWc") def schedule_contrib_conv2d_NCHWc(attrs, outs, target): """Schedule definition of contrib_conv2d_NCHWc""" with target: return topi.generic.schedule_conv2d_NCHWc(outs) + reg.register_pattern("nn.contrib_conv2d_NCHWc", OpPattern.OUT_ELEMWISE_FUSABLE) + @reg.register_compute("nn.contrib_depthwise_conv2d_NCHWc") def compute_contrib_depthwise_conv2d_NCHWc(attrs, inputs, out_dtype, target): """Compute definition of depthwise conv2d NCHWc""" @@ -422,15 +476,18 @@ def compute_contrib_depthwise_conv2d_NCHWc(attrs, inputs, out_dtype, target): data_layout, out_layout, out_dtype) return [out] + @reg.register_schedule("nn.contrib_depthwise_conv2d_NCHWc") def schedule_contrib_depthwise_conv2d_NCHWc(attrs, outs, target): """Schedule definition of contrib_conv2d_NCHWc""" with target: return topi.generic.schedule_depthwise_conv2d_NCHWc(outs) + reg.register_pattern("nn.contrib_depthwise_conv2d_NCHWc", OpPattern.OUT_ELEMWISE_FUSABLE) + @reg.register_compute("nn.deformable_conv2d") def compute_deformable_conv2d(attrs, inputs, out_dtype, target): """Compute definition of deformable_conv2d""" @@ -446,10 +503,12 @@ def compute_deformable_conv2d(attrs, inputs, out_dtype, target): dilation, deformable_groups, groups, out_dtype) return [out] + @reg.register_schedule("nn.deformable_conv2d") def schedule_deformable_conv2d(attrs, 
outs, target): """Schedule definition of deformable_conv2d""" with target: return topi.generic.schedule_deformable_conv2d_nchw(outs) + reg.register_pattern("nn.deformable_conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) diff --git a/python/tvm/target.py b/python/tvm/target.py index d3df3d705cb8..eff0088b37ce 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -296,7 +296,7 @@ def dispatch_func(func, *args, **kwargs): def generic_func(fdefault): """Wrap a target generic function. - Generic function allows registeration of further functions + Generic function allows registration of further functions that can be dispatched on current target context. If no registered dispatch is matched, the fdefault will be called. diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index a6efd8cf0971..88963a63c770 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -86,9 +86,13 @@ def run_test_conv2d(dtype, out_dtype, scale, dshape, kshape, fref=None, groups=1, dilation=(1, 1), + except_targets=None, **attrs): - x = relay.var("x", shape=dshape) - w = relay.var("w") + if except_targets is None: + except_targets = [] + + x = relay.var("x", shape=dshape, dtype=dtype) + w = relay.var("w", dtype=dtype) y = relay.nn.conv2d(x, w, padding=padding, dilation=dilation, @@ -100,11 +104,15 @@ def run_test_conv2d(dtype, out_dtype, scale, dshape, kshape, dkernel = topi.testing.dilate_python(kernel, (1, 1) + dilation) if fref is None: ref_res = topi.testing.conv2d_nchw_python( - data.astype(out_dtype), dkernel.astype(out_dtype), 1, padding) + data.astype(out_dtype), dkernel.astype(out_dtype), 1, padding, + groups=groups) else: ref_res = fref(data.astype(out_dtype), dkernel.astype(out_dtype)) + for target, ctx in ctx_list(): + if target in except_targets: + continue intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -117,6 +125,21 @@ def run_test_conv2d(dtype, out_dtype, scale, dshape, kshape, fref=lambda x, w: topi.testing.depthwise_conv2d_python_nchw( x, w, (1, 1), "SAME")) + # CUDA is disabled for 'direct' schedule: + # https://github.com/dmlc/tvm/pull/3070#issuecomment-486597553 + # group conv2d + dshape = (1, 32, 18, 18) + kshape = (32, 4, 3, 3) + run_test_conv2d("float32", "float32", 1, dshape, kshape, + padding=(1, 1), channels=32, groups=8, kernel_size=(3 ,3), + except_targets=['cuda']) + # also group conv2d + dshape = (1, 32, 18, 18) + kshape = (64, 1, 3, 3) + run_test_conv2d("float32", "float32", 1, dshape, kshape, + padding=(1, 1), channels=64, groups=32, kernel_size=(3 ,3), + except_targets=['cuda']) + # normal conv2d dshape = (1, 3, 224, 224) kshape = (10, 3, 3, 3) diff --git a/topi/python/topi/cuda/group_conv2d_nchw.py b/topi/python/topi/cuda/group_conv2d_nchw.py index 601b9b6e062c..be4ae3554e33 100644 --- a/topi/python/topi/cuda/group_conv2d_nchw.py +++ b/topi/python/topi/cuda/group_conv2d_nchw.py @@ -27,10 +27,13 @@ from .. import nn, generic -@autotvm.register_topi_compute(nn.group_conv2d_nchw, ['cuda', 'gpu'], ['direct', 'int8']) +autotvm.register_topi_compute(nn.group_conv2d_nchw, ['cuda', 'gpu'], 'direct', + nn.group_conv2d_nchw.fdefault) + +@autotvm.register_topi_compute(nn.group_conv2d_nchw, ['cuda', 'gpu'], ['int8']) def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups, out_dtype='float32'): - """Group convolution operator in NCHW layout. 
+ """Group convolution operator for 'group_conv2d_NCHWc_int8'. Parameters ---------- @@ -76,7 +79,7 @@ def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups, assert out_channels % groups == 0, "output channels must divide group size" assert channels % ic_block_factor == 0, \ "Number of input channels per group must divide {}".format(ic_block_factor) - assert out_channels % 4 == 0, \ + assert out_channels % oc_block_factor == 0, \ "Number of output channels per group must divide {}".format(oc_block_factor) packed_data = tvm.compute((batch, channels // ic_block_factor, height, width, @@ -99,6 +102,17 @@ def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups, oc_chunk, _, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple( packed_kernel.shape) + # TODO(kumasento): these assertions ensure that the number of groups + # should be smaller or equal to the number of blocks, so that each + # group will have at least one block. + # Shall we pad the channels to avoid raising assertions? + assert groups <= oc_chunk, \ + ('Number of groups {} should be less than ' + 'output channel chunk size {}'.format(groups, oc_chunk)) + assert groups <= ic_chunk, \ + ('Number of groups {} should be less than ' + 'input channel chunk size {}'.format(groups, ic_chunk)) + if isinstance(stride, int): stride_h = stride_w = stride else: @@ -109,9 +123,9 @@ def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups, else: dilation_h, dilation_w = dilation + # pad the input data pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (kernel_h, kernel_w)) - # compute graph pad_before = [0, 0, pad_top, pad_left, 0] pad_after = [0, 0, pad_down, pad_right, 0] pad_data = pad(packed_data, pad_before, pad_after, name="pad_data") @@ -129,6 +143,17 @@ def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups, kh = tvm.reduce_axis((0, kernel_h), name='kh') kw = tvm.reduce_axis((0, kernel_w), name='kw') + # NOTE(kumasento): explanation of this snippet - + # oc_chunk//groups and ic_chunk//groups give you the number of blocks, + # i.e., chunk, per group. + # occ is the ID of the output channel block, so that occ//(oc_chunk//groups) + # produces the ID of the group. + # Multiplying that result with ic_chunk//groups resulting in the ID + # of the beginning block of the corresponding input group. + # Adding the block offset (icc) will give you the exact block ID. + # + # Compared with a normal convolution, group convolution only sums + # input channels from the group that an output channel resides in. 
conv = tvm.compute(oshape, lambda n, occ, oh, ow, ocb: tvm.sum(pad_data[n, occ//(oc_chunk//groups)*(ic_chunk//groups)+icc, oh*stride_h+kh*dilation_h, ow*stride_w+kw*dilation_w, icb] @@ -138,8 +163,10 @@ def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups, .astype('int32'), axis=[icc, kh, kw, icb])) + # Type conversion output = tvm.compute(oshape, lambda *index: conv(*index).astype(out_dtype), tag='group_conv2d_NCHWc_int8') + num_flop = batch * oc_chunk * oc_block * out_height * out_width * \ ic_chunk * ic_block * kernel_h * kernel_w * 2 // groups cfg.add_flop(num_flop) @@ -295,7 +322,7 @@ def schedule_group_conv2d_NCHWc_int8(cfg, s, output): @autotvm.register_topi_schedule(generic.schedule_group_conv2d_nchw, - ["cuda", "gpu"], ["direct", "int8"]) + ["cuda", "gpu"], ["int8"]) def schedule_conv2d_nchw_cuda(cfg, outs): """TOPI schedule callback of group conv2d for cuda gpu diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 70ce5791d905..7bd95688b75d 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -242,7 +242,7 @@ def schedule_depthwise_conv2d_NCHWc(outs): @tvm.target.generic_func def schedule_group_conv2d_nchw(outs): - """Schedule for conv2d_nchw + """Schedule for group_conv2d_nchw Parameters ---------- diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 49c0bd79eacc..06d4074147c1 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -603,4 +603,4 @@ def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtyp yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w].astype(out_dtype) * Filter[ff, rc, ry, rx].astype(out_dtype), - axis=[rc, ry, rx]), tag="conv2d_nchw") + axis=[rc, ry, rx]), tag='group_conv2d_nchw') From a8fe6e88100362aabd99e517faf34cebb435464c Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Fri, 26 Apr 2019 19:49:24 -0700 Subject: [PATCH 051/106] [ROCM] Fix conv2d (#3107) --- topi/python/topi/rocm/conv2d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topi/python/topi/rocm/conv2d.py b/topi/python/topi/rocm/conv2d.py index 7acc1b8e16a7..aacdb90286a6 100644 --- a/topi/python/topi/rocm/conv2d.py +++ b/topi/python/topi/rocm/conv2d.py @@ -71,7 +71,7 @@ def conv2d_rocm(cfg, data, kernel, strides, padding, dilation, layout='NCHW', ou ((KW - 1) * dilation_w + 1)) return miopen.conv2d_forward(data, - kernel_before_dilation, + kernel, stride_h, stride_w, pad_h, From 23e7e7d9621da03f59589761453977c509285544 Mon Sep 17 00:00:00 2001 From: Meghan Cowan Date: Fri, 26 Apr 2019 19:55:22 -0700 Subject: [PATCH 052/106] [TOPI] Bitserial dense operators for CPU (#3051) --- python/tvm/autotvm/task/topi_integration.py | 21 +- topi/python/topi/arm_cpu/__init__.py | 1 + topi/python/topi/arm_cpu/bitserial_conv2d.py | 7 +- topi/python/topi/arm_cpu/bitserial_dense.py | 184 ++++++++++++++++++ topi/python/topi/generic/nn.py | 16 ++ topi/python/topi/nn/__init__.py | 1 + topi/python/topi/nn/bitserial_conv2d.py | 94 ++------- topi/python/topi/nn/bitserial_dense.py | 138 +++++++++++++ topi/python/topi/nn/bitserial_util.py | 91 +++++++++ topi/python/topi/x86/__init__.py | 1 + topi/python/topi/x86/bitserial_dense.py | 94 +++++++++ .../tests/python/test_topi_bitserial_dense.py | 74 +++++++ 12 files changed, 635 insertions(+), 87 deletions(-) create mode 100644 topi/python/topi/arm_cpu/bitserial_dense.py create mode 100644 topi/python/topi/nn/bitserial_dense.py create mode 100644 topi/python/topi/nn/bitserial_util.py 
create mode 100644 topi/python/topi/x86/bitserial_dense.py create mode 100644 topi/tests/python/test_topi_bitserial_dense.py diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index ed1a2b75c979..3c983768ab3e 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -87,6 +87,7 @@ def __init__(self): topi.nn.dense: "topi_nn_dense", topi.nn.bitserial_conv2d_nchw: "topi_nn_bitserial_conv2d_nchw", topi.nn.bitserial_conv2d_nhwc: "topi_nn_bitserial_conv2d_nhwc", + topi.nn.bitserial_dense: "topi_nn_bitserial_dense", topi.nn.deformable_conv2d_nchw: "topi_nn_deformable_conv2d_nchw", } @@ -101,6 +102,7 @@ def __init__(self): topi.nn.dense: [topi.generic.schedule_dense], topi.nn.bitserial_conv2d_nchw: [topi.generic.schedule_bitserial_conv2d_nchw], topi.nn.bitserial_conv2d_nhwc: [topi.generic.schedule_bitserial_conv2d_nhwc], + topi.nn.bitserial_dense: [topi.generic.schedule_bitserial_dense], topi.nn.deformable_conv2d_nchw: [topi.generic.schedule_deformable_conv2d_nchw], } @@ -200,18 +202,25 @@ def _topi_bitserial_conv2d_nhwc(*args, **kwargs): args = deserialize_args(args) C = topi.nn.bitserial_conv2d_nhwc(*args, **kwargs) s = topi.generic.nn.schedule_bitserial_conv2d_nhwc([C]) - data = args[0] - kernel = args[1] - return s, [data, kernel, C] + A, W = args[:2] + return s, [A, W, C] @register("topi_nn_bitserial_conv2d_nchw") def _topi_bitserial_conv2d_nchw(*args, **kwargs): args = deserialize_args(args) C = topi.nn.bitserial_conv2d_nchw(*args, **kwargs) s = topi.generic.nn.schedule_bitserial_conv2d_nchw([C]) - data = args[0] - kernel = args[1] - return s, [data, kernel, C] + A, W = args[:2] + return s, [A, W, C] + + @register("topi_nn_bitserial_dense") + def _topi_nn_bitserial_dense(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + A, W = args[:2] + C = topi.nn.bitserial_dense(*args, **kwargs) + s = topi.generic.schedule_bitserial_dense([C]) + return s, [A, W, C] @register("topi_nn_deformable_conv2d_nchw") def _topi_nn_deformable_conv2d_nchw(*args, **kwargs): diff --git a/topi/python/topi/arm_cpu/__init__.py b/topi/python/topi/arm_cpu/__init__.py index 3e888de55fec..6cf4d9139343 100644 --- a/topi/python/topi/arm_cpu/__init__.py +++ b/topi/python/topi/arm_cpu/__init__.py @@ -4,4 +4,5 @@ from . import depthwise_conv2d from . import conv2d_transpose from . import bitserial_conv2d +from . import bitserial_dense from . import injective diff --git a/topi/python/topi/arm_cpu/bitserial_conv2d.py b/topi/python/topi/arm_cpu/bitserial_conv2d.py index 1a4087168fc5..fed3e3fa4a84 100644 --- a/topi/python/topi/arm_cpu/bitserial_conv2d.py +++ b/topi/python/topi/arm_cpu/bitserial_conv2d.py @@ -21,7 +21,8 @@ from tvm import autotvm from .. import tag from ..nn.pad import pad -from ..nn.bitserial_conv2d import bitpack, bitserial_conv2d_nhwc +from ..nn.bitserial_conv2d import bitserial_conv2d_nhwc +from ..nn.bitserial_util import bitpack, binary_op_multiplier from ..nn.util import get_pad_tuple from ..util import get_const_int, get_const_tuple from .. 
import generic @@ -93,7 +94,8 @@ def spatial_pack_nhwc(cfg, data, kernel, stride, padding, activation_bits, weigh policy='candidate', candidate=[ [n, oh, ow, co, vh, vw, kh, kw, ci_o, kb, ib, vc, ci_i], [n, oh, ow, co, vh, vw, kw, kh, ci_o, kb, ib, vc, ci_i],]) - cfg.add_flop(2 * N * OH * OW * CO * CI * 8 * KH * KW) # these are actually binary ops + # binary ops + cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW * binary_op_multiplier(pack_dtype)) # ==================== VC = cfg["tile_co"].size[-1] @@ -310,7 +312,6 @@ def _schedule_spatial_conv2d_nhwc(cfg, s, data_pad, data_vec, kernel_vec, s[conv_out].compute_at(s[last], co) s[last].parallel(oh) - s = s.normalize() return s @autotvm.register_topi_schedule(generic.nn.schedule_bitserial_conv2d_nhwc, 'arm_cpu', 'direct') diff --git a/topi/python/topi/arm_cpu/bitserial_dense.py b/topi/python/topi/arm_cpu/bitserial_dense.py new file mode 100644 index 000000000000..ab1053df5430 --- /dev/null +++ b/topi/python/topi/arm_cpu/bitserial_dense.py @@ -0,0 +1,184 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, invalid-name, too-many-locals, too-many-arguments +"""Schedule for bitserial dense operator.""" +from __future__ import absolute_import as _abs +import tvm +from tvm import autotvm +from topi.util import get_const_tuple +from .. import tag +from .. import generic +from .bitserial_conv2d import _intrin_popcount +from ..nn.pad import pad +from ..nn.bitserial_dense import bitserial_dense +from ..nn.bitserial_util import bitpack, binary_op_multiplier + +@autotvm.register_topi_compute(bitserial_dense, ['arm_cpu'], 'direct') +def bitserial_dense_generic(cfg, data, weight, data_bits, weight_bits, pack_dtype, out_dtype, + unipolar): + """The default implementation of bitserial dense in topi. 
+ + Parameters + ---------- + data : tvm.Tensor + 2-D with shape [batch, in_dim] + + weight : tvm.Tensor + 2-D with shape [out_dim, in_dim] + + Returns + ------- + output : tvm.Tensor + 2-D with shape [batch, out_dim] + """ + data_packed = bitpack(data, data_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) + if len(weight.shape) == 2: + weight_packed = bitpack(weight, weight_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) + else: + weight_packed = weight + + batch, DB, in_dim = get_const_tuple(data_packed.shape) + out_dim, WB, in_dim = get_const_tuple(weight_packed.shape) + + # Pad Inputs so that microkernel can be used + # out_dim and in_dim need to be multiples of 8 + if out_dim % 8 != 0: + out_dim_pad = out_dim % 8 + data_packed = pad(data_packed, [0, 0, 0], [out_dim_pad, 0, 0], name='PaddedInput') + out_dim += out_dim_pad + + ######## Search space + + x, y = cfg.axis(batch), cfg.axis(out_dim) + db, wb, k = cfg.reduce_axis(DB), cfg.reduce_axis(WB), cfg.reduce_axis(in_dim) + + ko, ki = cfg.define_split('tile_k', k, policy='all', num_outputs=2, + filter=lambda xx: xx.size[-1] == 8 or xx.size[-1] == 16) + xo, xi = cfg.define_split('tile_x', x, policy='all', num_outputs=2) + yo, yi = cfg.define_split('tile_y', y, policy='all', num_outputs=2, + filter=lambda xx: xx.size[-1] == 8) + + cfg.define_reorder('reorder_0', [yo, xo, ko, xi, wb, db, yi, ki], + policy='candidate', candidate=[ + [yo, xo, ko, xi, wb, db, yi, ki], + [yo, xo, xi, ko, wb, db, yi, ki], + [yo, xo, ko, xi, wb, db, yi, ki]]) + + ###### Compute rule + VY = cfg['tile_y'].size[-1] + VK = cfg['tile_k'].size[-1] + + wvshape = (out_dim//VY, in_dim//VK, WB, VY, VK) + oshape = (batch, out_dim) + + k = tvm.reduce_axis((0, in_dim), name='k') + db = tvm.reduce_axis((0, DB), name='db') + wb = tvm.reduce_axis((0, WB), name='wb') + + # Tile data and weights + weight_vec = tvm.compute(wvshape, lambda yo, ko, wb, vy, vk: + weight_packed[yo*VY+vy][wb][ko*VK+vk], name='weight_vec') + matmul_unipolar = tvm.compute(oshape, lambda x, y: tvm.sum( + (tvm.popcount(weight_vec[y//VY, k//VK, wb, y%VY, k%VK].astype(out_dtype) & + data_packed[x, db, k].astype(out_dtype)) - + tvm.popcount(~weight_vec[y//VY, k//VK, wb, y%VY, k%VK].astype(out_dtype) & + data_packed[x, db, k].astype(out_dtype))) + << (wb+db).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense_unipolar') + + matmul = tvm.compute(oshape, lambda x, y: tvm.sum( + tvm.popcount(weight_vec[y//VY, k//VK, wb, y%VY, k%VK].astype(out_dtype) & + data_packed[x, db, k].astype(out_dtype)) + << (wb+db).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense') + + cfg.add_flop(batch * out_dim * in_dim * binary_op_multiplier(pack_dtype)) + + if unipolar: + return matmul_unipolar + return matmul + + +@autotvm.register_topi_schedule(generic.nn.schedule_bitserial_dense, ['arm_cpu'], 'direct') +def schedule_bitserial_dense(cfg, outs): + """Schedule for binary_dense. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of bitserial dense operator. + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for bitserial_dense. 
+ """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _schedule(cfg, s, data_vec, weight_vec, output, unipolar): + + z, k, _, y, x = s[weight_vec].op.axis + s[weight_vec].parallel(z) + s[weight_vec].vectorize(x) + + x, y = s[output].op.axis + wb, db, k = s[output].op.reduce_axis + _, DB, _ = get_const_tuple(data_vec.shape) + _, _, WB, _, _ = get_const_tuple(weight_vec.shape) + + yo, yi = cfg["tile_y"].apply(s, output, y) + xo, xi = cfg["tile_x"].apply(s, output, x) + ko, ki = cfg["tile_k"].apply(s, output, k) + + cfg["reorder_0"].apply(s, output, [yo, xo, ko, xi, wb, db, yi, ki]) + + fused = s[output].fuse(xo, yo) + s[output].parallel(fused) + + nfactor = cfg['tile_y'].size[-1] + kfactor = cfg['tile_k'].size[-1] + if nfactor % 8 == 0: + pc = _intrin_popcount(nfactor, kfactor, WB, DB, unipolar) + s[output].tensorize(wb, pc) + + return s + + def traverse(op): + """Internal travserse function""" + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag) or 'elemwise' in op.tag: + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + elif op.tag == 'bitserial_dense' or 'bitserial_dense_unipolar': + output = op.output(0) + weight_vec = op.input_tensors[0] + + data_vec = op.input_tensors[1] + data = data_vec.op.input_tensors[0] + if "QuantizeInput" in data.op.name: + data = data.op.input_tensors[0] + unipolar = (output.op.tag == 'bitserial_dense_unipolar') + _schedule(cfg, s, data_vec, weight_vec, output, unipolar) + else: + raise RuntimeError("Unsupported operator: %s" % op.tag) + + traverse(outs[0].op) + return s diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 7bd95688b75d..db77f37e36a9 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -312,6 +312,22 @@ def schedule_bitserial_conv2d_nhwc(outs): return _default_schedule(outs, False) +@tvm.target.generic_func +def schedule_bitserial_dense(outs): + """Schedule for bitserial_dense + Parameters + ---------- + outs: Array of Tensor + The computation graph description of bitserial_dense + in the format of an array of tensors. + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + + @tvm.target.override_native_generic_func("schedule_reduce") def schedule_reduce(outs): """Schedule for reduction diff --git a/topi/python/topi/nn/__init__.py b/topi/python/topi/nn/__init__.py index 65eb7341babd..e817aa401689 100644 --- a/topi/python/topi/nn/__init__.py +++ b/topi/python/topi/nn/__init__.py @@ -17,5 +17,6 @@ from .upsampling import * from .local_response_norm import * from .bitserial_conv2d import * +from .bitserial_dense import * from .l2_normalize import * from .batch_matmul import * diff --git a/topi/python/topi/nn/bitserial_conv2d.py b/topi/python/topi/nn/bitserial_conv2d.py index 5da90f134984..99cac889deea 100644 --- a/topi/python/topi/nn/bitserial_conv2d.py +++ b/topi/python/topi/nn/bitserial_conv2d.py @@ -14,16 +14,15 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-# pylint: disable=invalid-name, unused-variable, too-many-locals, too-many-arguments, unused-argument +# pylint: disable=invalid-name, too-many-locals, too-many-arguments """Bitserial Conv2D operators""" from __future__ import absolute_import as _abs -import numpy as np import tvm from tvm import autotvm -from topi.transform import concatenate from .pad import pad from .util import get_pad_tuple -from ..util import get_const_tuple, get_const_int +from .bitserial_util import bitpack, binary_op_multiplier +from ..util import get_const_tuple @tvm.target.generic_func def bitserial_conv2d_nchw(data, kernel, stride, padding, activation_bits, weight_bits, @@ -68,7 +67,7 @@ def bitserial_conv2d_nchw(data, kernel, stride, padding, activation_bits, weight Input_q = bitpack(data, activation_bits, pack_axis=1, bit_axis=2, pack_type=pack_dtype) Filter_q = bitpack(filter, weight_bits, pack_axis=1, bit_axis=4, pack_type=pack_dtype) batch, in_channel, activation_bits, in_height, in_width = Input_q.shape - num_filter, channel, kernel_h, kernel_w, weight_bits = Filter_q.shape + num_filter, _, kernel_h, kernel_w, weight_bits = Filter_q.shape if isinstance(padding, int) or (isinstance(padding, (tuple, list)) and len(padding) == 2): TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel) @@ -259,10 +258,11 @@ def spatial_pack_nchw(cfg, data, kernel, stride, padding, in_bits, weight_bits, filter=lambda x: max(x.size[1:]) <= 16) cfg.define_annotate('ann_reduce', [ib, kb, kh, kw], policy='try_unroll') - re_axes = cfg.define_reorder("reorder_0", - [n, co, oh, ow, vc, vh, vw, kh, kw, kb, ib, ci], - policy='interval_all', interval=(6, 11)) - cfg.add_flop(2 * N * OH * OW * CO * CI * 8 * KH * KW) # these are actually binary ops + cfg.define_reorder("reorder_0", + [n, co, oh, ow, vc, vh, vw, kh, kw, kb, ib, ci], + policy='interval_all', interval=(6, 11)) + # binary ops + cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW * binary_op_multiplier(pack_dtype)) # ==================== VC = cfg["tile_co"].size[-1] @@ -275,7 +275,7 @@ def spatial_pack_nchw(cfg, data, kernel, stride, padding, in_bits, weight_bits, oshape = (1, CO, OH, OW) if (TPAD != 0 and RPAD != 0): - data_pad = pad(data_q, (0, 0, 0, TPAD, LPAD), (0, 0, 0, DPAD, RPAD), name="data_pad") + data_pad = pad(data_q, pad_before, pad_after, name="data_pad") else: data_pad = data_q @@ -361,10 +361,11 @@ def spatial_pack_nhwc(cfg, data, kernel, stride, padding, in_bits, weight_bits, ow, vw = cfg.define_split('tile_ow', ow, policy='all', num_outputs=2, filter=lambda x: max(x.size[1:]) <= 16) cfg.define_annotate('ann_reduce', [ib, kb, kh, kw], policy='try_unroll') - re_axes = cfg.define_reorder("reorder_0", - [n, oh, ow, co, vh, vw, kh, kw, kb, ib, vc, ci], - policy='interval_all', interval=(3, 7)) - cfg.add_flop(2 * N * OH * OW * CO * CI * 8 * KH * KW) # these are actually binary ops + cfg.define_reorder("reorder_0", + [n, oh, ow, co, vh, vw, kh, kw, kb, ib, vc, ci], + policy='interval_all', interval=(3, 7)) + # binary ops + cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW * binary_op_multiplier(pack_dtype)) # ==================== VC = cfg["tile_co"].size[-1] @@ -377,7 +378,7 @@ def spatial_pack_nhwc(cfg, data, kernel, stride, padding, in_bits, weight_bits, oshape = (1, OH, OW, CO) if (DPAD != 0 and RPAD != 0): - data_pad = pad(data_q, (0, TPAD, LPAD, 0, 0), (0, DPAD, RPAD, 0, 0), name="data_pad") + data_pad = pad(data_q, pad_before, pad_after, name="data_pad") else: data_pad = data_q @@ -413,66 +414,3 @@ def _conv(n, h, w, co, vh, vw, vc): return tvm.compute(oshape, 
lambda n, h, w, co: conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC], name='output_unpack', tag='spatial_bitserial_conv_nhwc') - - -def bitpack(data, bits, pack_axis, bit_axis, pack_type, name="QuantizeInput"): - """Packs data into format necessary for bitserial computation - pack_axis : int - index of the axis to pack in data - bit_axis : int - index of axis to place bit axis in resulting packed data""" - ishape = data.shape - n = len(ishape) - if pack_type == 'uint8': - data_width = 8 - elif pack_type == 'uint16': - data_width = 16 - elif pack_type == 'uint32': - data_width = 32 - elif pack_type == 'uint64': - data_width = 64 - - # Data must be in multiples of the data_width - assert get_const_int(ishape[pack_axis]) % data_width == 0, "Not a multiple of word size" - - shape_vec = list(ishape) - shape_vec[pack_axis] = (shape_vec[pack_axis] // data_width) - shape_vec.insert(bit_axis, 1) - bitserial_oshape = tuple(shape_vec) - masks = np.array([0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80]) - - # pack axis shifts if bit axis comes before - if bit_axis <= pack_axis: - pack_axis += 1 - - def _bitpack(*indices): - packed_data = [tvm.const(0, pack_type)] * bits - for k in range(data_width): - # Translate indices for packed data back to original - idx = [0] * n - j = 0 - for i in range(n+1): - if i == bit_axis: - continue - elif i == pack_axis: - idx[j] = indices[i] * data_width + k - else: - idx[j] = indices[i] - j += 1 - - element = data(*idx) - for b in range(bits): - extracted_bit = ((element & tvm.const(masks[b], "int32")) >> b).astype(pack_type) - packed_data[b] = (packed_data[b] | extracted_bit) - if k < data_width - 1: - packed_data[b] = packed_data[b] << 1 - - if k == data_width - 1: - return tuple(packed_data) - return tuple(packed_data) - - output_tuple = tvm.compute(bitserial_oshape, _bitpack, name=name, tag='bitpack') - - if bits > 1: - return concatenate(output_tuple, axis=bit_axis) - return output_tuple diff --git a/topi/python/topi/nn/bitserial_dense.py b/topi/python/topi/nn/bitserial_dense.py new file mode 100644 index 000000000000..5d47b2974a7c --- /dev/null +++ b/topi/python/topi/nn/bitserial_dense.py @@ -0,0 +1,138 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, too-many-locals, too-many-arguments +"""Bitserial Dense operator.""" +from __future__ import absolute_import +import tvm +from tvm import autotvm +from topi.util import get_const_tuple +from .bitserial_util import bitpack, binary_op_multiplier + +@tvm.target.generic_func +def bitserial_dense(data, weight, data_bits, weight_bits, pack_dtype='uint32', + out_dtype='int16', unipolar=True): + """The default implementation of bitserial dense in topi. 
+ + Parameters + ---------- + data : tvm.Tensor + 2-D with shape [batch, in_dim] + weight : tvm.Tensor + 2-D with shape [out_dim, in_dim] or + 3-D with shape [out_dim, weight_bits, in_dim] + Returns + ------- + output : tvm.Tensor + 2-D with shape [batch, out_dim] + """ + data_packed = bitpack(data, data_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) + if len(weight.shape) == 2: + weight_packed = bitpack(weight, weight_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) + else: + weight_packed = weight + Y, DB, K = get_const_tuple(data_packed.shape) + X, WB, _ = get_const_tuple(weight_packed.shape) + + oshape = (Y, X) + k = tvm.reduce_axis((0, K), name='k') + db = tvm.reduce_axis((0, DB), name='db') + wb = tvm.reduce_axis((0, WB), name='wb') + + matmul_unipolar = tvm.compute(oshape, lambda i, j: tvm.sum( + (tvm.popcount(weight_packed[j, wb, k] & data_packed[i, db, k]) - + tvm.popcount(~weight_packed[j, wb, k] & data_packed[i, db, k])).astype(out_dtype) + << (db+wb).astype(out_dtype), axis=[wb, db, k]), + tag='bitserial_dense_unipolar') + + matmul = tvm.compute(oshape, lambda i, j: tvm.sum( + tvm.popcount(weight_packed[j, wb, k] & data_packed[i, db, k]).astype(out_dtype) + << (db+wb).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense') + + + if unipolar: + return matmul_unipolar + return matmul + + +@autotvm.register_topi_compute(bitserial_dense, ['cpu'], 'direct') +def bitserial_dense_default(cfg, data, weight, data_bits, weight_bits, pack_dtype='uint32', + out_dtype='int16', unipolar=True): + """Bitserial dense implementation. TODO: Why are these separate + + Parameters + ---------- + data : tvm.Tensor + 2-D with shape [batch, in_dim] + weight : tvm.Tensor + 2-D with shape [out_dim, in_dim] or + 3-D with shape [out_dim, weight_bits, in_dim] + Returns + ------- + output : tvm.Tensor + 2-D with shape [batch, out_dim] + """ + data_packed = bitpack(data, data_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) + if len(weight.shape) == 2: + weight_packed = bitpack(weight, weight_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) + else: + weight_packed = weight + Y, DB, K = get_const_tuple(data_packed.shape) + X, WB, _ = get_const_tuple(weight_packed.shape) + ######## Search space + x, y = cfg.axis(X), cfg.axis(Y) + db, wb, k = cfg.reduce_axis(DB), cfg.reduce_axis(WB), cfg.reduce_axis(K) + ko, ki = cfg.define_split('tile_k', k, policy='all', num_outputs=2) + yo, yi = cfg.define_split('tile_y', y, policy='all', num_outputs=2) + xo, xi = cfg.define_split('tile_x', x, policy='all', num_outputs=2) + + cfg.define_reorder('reorder_0', [yo, xo, ko, yi, wb, db, ki, xi], + policy='candidate', candidate=[ + [yo, xo, ko, yi, wb, db, ki, xi], + [yo, xo, yi, ko, wb, db, ki, xi]]) + + cfg.define_annotate('ann_reduce', [db, wb], policy='try_unroll') + cfg.define_annotate('ann_spatial', [yi, xi], policy='try_unroll_vec') + + ###### Compute rule + VX = cfg['tile_x'].size[-1] + + wvshape = (X//VX, WB, VX, K) + oshape = (Y, X) + + k = tvm.reduce_axis((0, K), name='k') + db = tvm.reduce_axis((0, DB), name='db') + wb = tvm.reduce_axis((0, WB), name='wb') + + # Tile data and weights + weight_vec = tvm.compute(wvshape, lambda xo, wb, vx, k: + weight_packed[xo*VX+vx][wb][k], name='weight_vec') + + matmul_unipolar = tvm.compute(oshape, lambda i, j: tvm.sum( + (tvm.popcount(weight_vec[j//VX, wb, j%VX, k] & data_packed[i, db, k]) - + tvm.popcount(~weight_vec[j//VX, wb, j%VX, k] & data_packed[i, db, k])).astype(out_dtype) + << (db+wb).astype(out_dtype), axis=[wb, db, k]), 
tag='bitserial_dense_unipolar') + + matmul = tvm.compute(oshape, lambda i, j: tvm.sum( + tvm.popcount(weight_vec[j//VX, wb, j%VX, k] & data_packed[i, db, k]).astype(out_dtype) + << (db+wb).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense') + + # binary ops + cfg.add_flop(2 * Y * X * K * binary_op_multiplier(pack_dtype)) + + if unipolar: + return matmul_unipolar + return matmul diff --git a/topi/python/topi/nn/bitserial_util.py b/topi/python/topi/nn/bitserial_util.py new file mode 100644 index 000000000000..09a301f7c962 --- /dev/null +++ b/topi/python/topi/nn/bitserial_util.py @@ -0,0 +1,91 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, too-many-locals, too-many-arguments +"""Utility functions for bitserial operators""" +import numpy as np +import tvm +from topi.transform import concatenate +from ..util import get_const_int + +def bitpack(data, bits, pack_axis, bit_axis, pack_type, name="QuantizeInput"): + """Packs data into format necessary for bitserial computation + pack_axis : int + index of the axis to pack in data + bit_axis : int + index of axis to place bit axis in resulting packed data""" + ishape = data.shape + n = len(ishape) + if pack_type == 'uint8': + data_width = 8 + elif pack_type == 'uint16': + data_width = 16 + elif pack_type == 'uint32': + data_width = 32 + elif pack_type == 'uint64': + data_width = 64 + + # Data must be in multiples of the data_width + assert get_const_int(ishape[pack_axis]) % data_width == 0, "Not a multiple of word size" + + shape_vec = list(ishape) + shape_vec[pack_axis] = (shape_vec[pack_axis] // data_width) + shape_vec.insert(bit_axis, 1) + bitserial_oshape = tuple(shape_vec) + masks = np.array([0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80]) + + # pack axis shifts if bit axis comes before + if bit_axis <= pack_axis: + pack_axis += 1 + + def _bitpack(*indices): + packed_data = [tvm.const(0, pack_type)] * bits + for k in range(data_width): + # Translate indices for packed data back to original + idx = [0] * n + j = 0 + for i in range(n+1): + if i == bit_axis: + continue + elif i == pack_axis: + idx[j] = indices[i] * data_width + k + else: + idx[j] = indices[i] + j += 1 + + element = data(*idx) + for b in range(bits): + extracted_bit = ((element & tvm.const(masks[b], "int32")) >> b).astype(pack_type) + packed_data[b] = (packed_data[b] | extracted_bit) + if k < data_width - 1: + packed_data[b] = packed_data[b] << 1 + + if k == data_width - 1: + return tuple(packed_data) + return tuple(packed_data) + + output_tuple = tvm.compute(bitserial_oshape, _bitpack, name=name, tag='bitpack') + + if bits > 1: + return concatenate(output_tuple, axis=bit_axis) + return output_tuple + +def binary_op_multiplier(pack_dtype): + """"Returns number of bits packed into + pack_dtype: string + pack 
type for the operator (must be a uint)""" + return int(pack_dtype[4:]) + \ No newline at end of file diff --git a/topi/python/topi/x86/__init__.py b/topi/python/topi/x86/__init__.py index 638d428ec28a..a414e3f7a5b7 100644 --- a/topi/python/topi/x86/__init__.py +++ b/topi/python/topi/x86/__init__.py @@ -9,6 +9,7 @@ from .injective import * from .pooling import schedule_pool, schedule_global_pool from .bitserial_conv2d import schedule_bitserial_conv2d +from .bitserial_dense import schedule_bitserial_dense from .depthwise_conv2d import schedule_depthwise_conv2d_NCHWc from .dense import _schedule_dense, _schedule_dense_pack, _schedule_dense_nopack from .batch_matmul import schedule_batch_matmul diff --git a/topi/python/topi/x86/bitserial_dense.py b/topi/python/topi/x86/bitserial_dense.py new file mode 100644 index 000000000000..db9ec27ea879 --- /dev/null +++ b/topi/python/topi/x86/bitserial_dense.py @@ -0,0 +1,94 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, too-many-locals, too-many-arguments +"""Schedule for bitserial dense operator.""" +from __future__ import absolute_import as _abs +import tvm +from tvm import autotvm +from topi.util import get_const_int +from .. import tag +from .. import generic + +@autotvm.register_topi_schedule(generic.nn.schedule_bitserial_dense, ['cpu'], 'direct') +def schedule_bitserial_dense(cfg, outs): + """Schedule for bitserial_dense. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of bitserial dense operator. + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for bitserial_dense. 
+ """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _schedule(cfg, s, data_vec, weight_vec, output): + s[data_vec].parallel(s[data_vec].op.axis[0]) + s[weight_vec].parallel(s[weight_vec].op.axis[0]) + + y, x = s[output].op.axis + wb, db, k = s[output].op.reduce_axis + + yo, yi = cfg["tile_y"].apply(s, output, y) + xo, xi = cfg["tile_x"].apply(s, output, x) + ko, ki = cfg["tile_k"].apply(s, output, k) + + + cfg["reorder_0"].apply(s, output, [yo, xo, ko, yi, wb, db, ki, xi]) + cfg["ann_reduce"].apply(s, output, [db, wb], + axis_lens=[get_const_int(db.dom.extent), + get_const_int(wb.dom.extent)], + max_unroll=8, + cfg=cfg) + cfg["ann_spatial"].apply(s, output, [yi, xi], + axis_lens=[cfg['tile_y'].size[-1], + cfg['tile_x'].size[-1]], + max_unroll=8, + cfg=cfg) + s[output].vectorize(xi) + s[output].parallel(yo) + return s + + def traverse(op): + """Internal travserse function""" + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag) or 'elemwise' in op.tag: + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + elif op.tag == 'bitserial_dense' or 'bitserial_dense_unipolar': + output = op.output(0) + weight_vec = op.input_tensors[0] + + data_vec = op.input_tensors[1] + data = data_vec.op.input_tensors[0] + if "QuantizeInput" in data.op.name: + data = data.op.input_tensors[0] + _schedule(cfg, s, data_vec, weight_vec, output) + else: + raise RuntimeError("Unsupported operator: %s" % op.tag) + + traverse(outs[0].op) + return s diff --git a/topi/tests/python/test_topi_bitserial_dense.py b/topi/tests/python/test_topi_bitserial_dense.py new file mode 100644 index 000000000000..f1bd02357796 --- /dev/null +++ b/topi/tests/python/test_topi_bitserial_dense.py @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Test code for bitserial_dense operator""" +import numpy as np +import tvm +import topi +import topi.testing +from topi.util import get_const_tuple +from tvm.contrib.pickle_memoize import memoize + +def generate_quantized_np(shape, bits, out_dtype): + min_val = 0 + max_val = 1 << bits + return np.random.randint(min_val, max_val, size=shape).astype(out_dtype) + +def verify_bitserial_dense(batch, in_dim, out_dim, activation_bits, weight_bits, unipolar): + input_dtype = 'uint32' + out_dtype = 'int16' + + with tvm.target.create('llvm'): + A = tvm.placeholder((batch, in_dim), dtype=input_dtype, name='A') + B = tvm.placeholder((out_dim, in_dim), dtype=input_dtype, name='B') + C = topi.nn.bitserial_dense(A, B, activation_bits, weight_bits, out_dtype=out_dtype, + unipolar=unipolar) + s = topi.generic.schedule_bitserial_dense([C]) + + a_shape = get_const_tuple(A.shape) + b_shape = get_const_tuple(B.shape) + + @memoize("topi.tests.test_topi_bitseral_dense") + def get_ref_data(): + a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_dtype) + b_np = generate_quantized_np(get_const_tuple(b_shape), weight_bits, input_dtype) + if unipolar: + b_ = np.copy(b_np).astype(out_dtype) + for x in np.nditer(b_, op_flags=['readwrite']): + x[...] = 1 if x == 1 else -1 + c_np = np.dot(a_np, b_.T) + else: + c_np = np.dot(a_np, b_np.T) + return a_np, b_np, c_np + a_np, b_np, c_np = get_ref_data() + + ctx = tvm.cpu(0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + func = tvm.build(s, [A, B, C], "llvm") + func(a, b, c) + tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + +def test_bitserial_dense(): + verify_bitserial_dense(1, 1024, 1000, 1, 1, True) + verify_bitserial_dense(1, 1024, 1000, 2, 1, True) + + verify_bitserial_dense(1, 1024, 1000, 1, 1, False) + verify_bitserial_dense(1, 1024, 1000, 2, 1, False) + +if __name__ == "__main__": + test_bitserial_dense() From f2041c8c3e7b6a5e1abe71f37b235ad5561fbbd7 Mon Sep 17 00:00:00 2001 From: Pedro Larroy Date: Fri, 26 Apr 2019 19:56:34 -0700 Subject: [PATCH 053/106] Check that the node is not null, add contains to OpMap (#3037) --- 3rdparty/dmlc-core | 2 +- nnvm/include/nnvm/graph.h | 10 +++++++--- nnvm/include/nnvm/op.h | 22 ++++++++++++++++++++-- nnvm/src/core/graph.cc | 1 + nnvm/src/pass/gradient.cc | 32 ++++++++++++++++++-------------- 5 files changed, 47 insertions(+), 20 deletions(-) diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 3ffea8694adf..82bf4c2e2af3 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 3ffea8694adf9c0363f9abbf162dc0e4a45b22c5 +Subproject commit 82bf4c2e2af312b3d52513aa727483803a2f8734 diff --git a/nnvm/include/nnvm/graph.h b/nnvm/include/nnvm/graph.h index f67db5d1a7a4..9efee161fdba 100644 --- a/nnvm/include/nnvm/graph.h +++ b/nnvm/include/nnvm/graph.h @@ -315,12 +315,16 @@ inline void DFSVisit(const std::vector& heads, }); PostOrderDFSVisit( head_nodes, - [fvisit](GNode n) { fvisit(*n); }, // FVisit - [](GNode n)->Node* { return n->get(); }, // HashFunc + [fvisit](GNode n) { + fvisit(*n); + }, // FVisit + [](GNode n)->Node* { + return n->get(); + }, // HashFunc [](GNode n)->uint32_t { // InDegree if (!(*n)) return 0; return (*n)->inputs.size() + (*n)->control_deps.size(); - }, + }, [](GNode n, uint32_t index)->GNode { // GetInput if (index < (*n)->inputs.size()) { return &(*n)->inputs.at(index).node; diff --git a/nnvm/include/nnvm/op.h b/nnvm/include/nnvm/op.h index 
e1d596089a88..eba9d2a3c728 100644 --- a/nnvm/include/nnvm/op.h +++ b/nnvm/include/nnvm/op.h @@ -368,6 +368,13 @@ class OpMap { */ inline int count(const Op* op) const; + /*! + * \brief Check if the map has op as key. + * \param op The key to the map + * \return true if op is contained in map, false otherwise. + */ + inline bool contains(const Op* op) const; + private: friend class Op; // internal attribute name @@ -578,9 +585,20 @@ inline Op& Op::set_attr_parser(std::function fn) { // // member functions of OpMap template inline int OpMap::count(const Op* op) const { - if (op == nullptr) return 0; + if (contains(op)) { + return 1; + } else { + return 0; + } +} + +template +inline bool OpMap::contains(const Op* op) const { + if (op == nullptr) { + return false; + } const uint32_t idx = op->index_; - return idx < data_.size() ? (data_[idx].second != 0) : 0; + return idx < data_.size() ? (data_[idx].second != 0) : false; } template diff --git a/nnvm/src/core/graph.cc b/nnvm/src/core/graph.cc index 0aae7edd9dd6..92ff98618ec8 100644 --- a/nnvm/src/core/graph.cc +++ b/nnvm/src/core/graph.cc @@ -78,6 +78,7 @@ IndexedGraph::IndexedGraph(const Graph &g) { (const NodePtr& n) { CHECK_LT(nodes_.size(), std::numeric_limits::max()); uint32_t nid = static_cast(nodes_.size()); + CHECK(n); for (const auto &subgraph : n->attrs.subgraphs) subgraphs.push_back(subgraph); // nodes_ diff --git a/nnvm/src/pass/gradient.cc b/nnvm/src/pass/gradient.cc index b29d24654c03..b1b1e3a506e3 100644 --- a/nnvm/src/pass/gradient.cc +++ b/nnvm/src/pass/gradient.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -143,13 +143,13 @@ Graph Gradient(Graph src) { << "because it is unreachable from the outputs."; } - // construct mirror reduece memory strategy if needed + // construct mirror as memory reduction strategy if needed std::unordered_map mirror_map; if (mirror_fun != nullptr) { - for (const NodePtr& n : topo_order) { - if (mirror_fun(*n)) { + for (const NodePtr& node_ptr : topo_order) { + if (mirror_fun(*node_ptr)) { NodePtr new_node = Node::Create(); - *new_node = *n; + *new_node = *node_ptr; new_node->attrs.name += "_mirror"; for (auto& e : new_node->inputs) { e.node = mirror_map.at(e.node.get()); @@ -157,9 +157,9 @@ Graph Gradient(Graph src) { for (auto& n : new_node->control_deps) { n = mirror_map.at(n.get()); } - mirror_map[n.get()] = std::move(new_node); + mirror_map[node_ptr.get()] = std::move(new_node); } else { - mirror_map[n.get()] = n; + mirror_map[node_ptr.get()] = node_ptr; } } } @@ -185,7 +185,8 @@ Graph Gradient(Graph src) { if ((*rit)->inputs.size() != 0) { NodePtr fwd_node = (mirror_map.size() == 0 ? 
ptr : mirror_map.at(ptr.get())); std::vector input_grads; - if (grad_fun_map.count(ptr->op())) { + // Check for FGradient + if (grad_fun_map.contains(ptr->op())) { input_grads = grad_fun_map[ptr->op()](fwd_node, out_agg_grads); CHECK_EQ((*rit)->inputs.size(), input_grads.size()) << "Gradient function not returning enough gradient"; @@ -205,20 +206,23 @@ Graph Gradient(Graph src) { if (p->op()->attr_parser != nullptr) { p->op()->attr_parser(&(p->attrs)); } - input_grads.emplace_back(nnvm::NodeEntry{p, 0, 0}); + input_grads.emplace_back(p, 0, 0); } } else { LOG(FATAL) << "Operator " << fwd_node->op()->name << " is non-differentiable " << "because it didn't register FGradient attribute."; } + for (const auto& nodeEntry : input_grads) + CHECK(nodeEntry.node); auto git = input_grads.begin(); + CHECK((*rit)->inputs.size() <= input_grads.size()); for (auto it = (*rit)->inputs.begin(); it != (*rit)->inputs.end(); ++it, ++git) { - auto& ge = output_grads[it->node.get()][it->index]; + auto& output_grad_entry = output_grads[it->node.get()][it->index]; // if any of the backward op can do shape inference, the hint is not necessary. - if (finfer_shape.count(git->node->op())) { - ge.need_attr_hint = false; + if (finfer_shape.contains(git->node->op())) { + output_grad_entry.need_attr_hint = false; } - ge.grads.emplace_back(std::move(*git)); + output_grad_entry.grads.emplace_back(std::move(*git)); } } } From fe23d28168dde3d8ad24c418f9a2e9efa585a112 Mon Sep 17 00:00:00 2001 From: MaxXing Date: Sun, 28 Apr 2019 11:20:38 +0800 Subject: [PATCH 054/106] fixed some typos (#3112) --- docs/dev/debugger.rst | 2 +- nnvm/include/nnvm/base.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/dev/debugger.rst b/docs/dev/debugger.rst index 254a82def6b8..65f206f0cd5e 100644 --- a/docs/dev/debugger.rst +++ b/docs/dev/debugger.rst @@ -155,7 +155,7 @@ folder specified while creating the runtime. Sample Output *************************************** -The below is the output of running ``tvm/nnvm/tutorials/from_onnnx.py`` with debugger. +The below is the output of running ``tvm/nnvm/tutorials/from_onnx.py`` with debugger. :: diff --git a/nnvm/include/nnvm/base.h b/nnvm/include/nnvm/base.h index 43e0fb9b0c59..b6a1d0e54135 100644 --- a/nnvm/include/nnvm/base.h +++ b/nnvm/include/nnvm/base.h @@ -38,7 +38,7 @@ namespace nnvm { /*! \brief any type */ using dmlc::any; -/*! \brief array_veiw type */ +/*! 
\brief array_view type */ using dmlc::array_view; /*!\brief getter function of any type */ From b63435a73833093f1ef526fa5ac3ad468950a3da Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Sun, 28 Apr 2019 11:48:34 +0800 Subject: [PATCH 055/106] [TOPI] Fix group_conv2d unit test (#3113) --- topi/tests/python/test_topi_group_conv2d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topi/tests/python/test_topi_group_conv2d.py b/topi/tests/python/test_topi_group_conv2d.py index 4189ac0a76f5..e80999977e5b 100644 --- a/topi/tests/python/test_topi_group_conv2d.py +++ b/topi/tests/python/test_topi_group_conv2d.py @@ -194,7 +194,7 @@ def test_group_conv2d_nchw(): add_bias=True) # dilation - verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 2, 32) + verify_group_conv2d_nchw(1, 128, 56, 128, 3, 1, 1, 2, 32) # batch size verify_group_conv2d_nchw(2, 128, 56, 128, 3, 1, 1, 1, 32) From 9ac4922b773f5dd511aa6729d46c149a1fcf68c6 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 28 Apr 2019 12:04:19 -0700 Subject: [PATCH 056/106] [CI] Add file type check (#3116) --- Jenkinsfile | 2 +- docker/Dockerfile.ci_lint | 10 +- docker/install/ubuntu_install_rat.sh | 27 ++++ docker/install/ubuntu_install_vulkan.sh | 4 +- tests/lint/check_file_type.py | 163 ++++++++++++++++++++++++ tests/scripts/task_lint.sh | 7 +- 6 files changed, 207 insertions(+), 6 deletions(-) create mode 100755 docker/install/ubuntu_install_rat.sh create mode 100644 tests/lint/check_file_type.py diff --git a/Jenkinsfile b/Jenkinsfile index 4765538a3806..57a1fd791c1c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -38,7 +38,7 @@ // - Tag the new version as the lates // - Periodically cleanup the old versions on local workers // -ci_lint = "tvmai/ci-lint:v0.50" +ci_lint = "tvmai/ci-lint:v0.51" ci_gpu = "tvmai/ci-gpu:v0.51" ci_cpu = "tvmai/ci-cpu:v0.50" ci_i386 = "tvmai/ci-i386:v0.50" diff --git a/docker/Dockerfile.ci_lint b/docker/Dockerfile.ci_lint index 7fa23adc99dc..adb766e4cb81 100644 --- a/docker/Dockerfile.ci_lint +++ b/docker/Dockerfile.ci_lint @@ -22,5 +22,13 @@ FROM ubuntu:16.04 RUN apt-get update && apt-get install -y sudo wget COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh RUN bash /install/ubuntu_install_python.sh -RUN apt-get install -y doxygen graphviz + +RUN apt-get install -y doxygen graphviz git RUN pip3 install cpplint pylint==1.9.4 mypy + +# java deps for rat +COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh +RUN bash /install/ubuntu_install_java.sh + +COPY install/ubuntu_install_rat.sh /install/ubuntu_install_rat.sh +RUN bash /install/ubuntu_install_rat.sh diff --git a/docker/install/ubuntu_install_rat.sh b/docker/install/ubuntu_install_rat.sh new file mode 100755 index 000000000000..e2f3244e6859 --- /dev/null +++ b/docker/install/ubuntu_install_rat.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u +set -o pipefail + +cd /tmp +wget -q http://www.trieuvan.com/apache//creadur/apache-rat-0.12/apache-rat-0.12-bin.tar.gz +tar xf apache-rat-0.12-bin.tar.gz +mv apache-rat-0.12/apache-rat-0.12.jar /bin/apache-rat.jar +rm -rf apache-rat-0.12-bin.tar.gz apache-rat-0.12 diff --git a/docker/install/ubuntu_install_vulkan.sh b/docker/install/ubuntu_install_vulkan.sh index 2a53f8c50803..5fb40829e0bc 100755 --- a/docker/install/ubuntu_install_vulkan.sh +++ b/docker/install/ubuntu_install_vulkan.sh @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py new file mode 100644 index 000000000000..4214d5d21e6c --- /dev/null +++ b/tests/lint/check_file_type.py @@ -0,0 +1,163 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Helper tool to check file types that are allowed to checkin.""" +import os +import sys +import subprocess + +# List of file types we allow +ALLOW_EXTENSION = { + # source code + "cc", + "c", + "h", + "rs", + "m", + "mm", + "g4", + "gradle", + "js", + "tcl", + "scala", + "java", + "go", + "sh", + "py", + "pyi", + "pxi", + "pyd", + "pyx", + # configurations + "mk", + "in", + "cmake", + "xml", + "toml", + "yml", + "yaml", + "json", + # docs + "txt", + "md", + "rst", + # sgx + "edl", + "lds", + # ios + "pbxproj", + "plist", + "xcworkspacedata", + "storyboard", + } + +# List of file names allowed +ALLOW_FILE_NAME = { + ".gitignore", + "README", + "Makefile", + "Doxyfile", + "pylintrc", + "rat-excludes", + "log4j.properties", + ".clang-format", + ".gitmodules", + "CODEOWNERS", + } + +# List of specific files allowed in relpath to +ALLOW_SPECIFIC_FILE = { + "docker/with_the_same_user", + "LICENSE", + "NOTICE", + "Jenkinsfile", + # sgx file + "apps/sgx/enclave/sgx-deps.diff", + # html for demo purposes + "nnvm/tutorials/web/resnet.html", + "tests/webgl/test_static_webgl_library.html", + "web/example_rpc.html", + # images are normally not allowed + # discuss with committers before add more images + "apps/android_rpc/app/src/main/res/mipmap-hdpi/ic_launcher.png", + "apps/android_rpc/app/src/main/res/mipmap-mdpi/ic_launcher.png", + # documentation related files + "docs/_static/css/tvm_theme.css", + "docs/_static/img/tvm-logo-small.png", + } + + +def filename_allowed(name): + """Check if name is allowed by the current policy. + + Paramaters + ---------- + name : str + Input name + + Returns + ------- + allowed : bool + Whether the filename is allowed. + """ + arr = name.rsplit(".", 1) + if arr[-1] in ALLOW_EXTENSION: + return True + + if os.path.basename(name) in ALLOW_FILE_NAME: + return True + + if os.path.basename(name).startswith("Dockerfile"): + return True + + if name.startswith("3rdparty"): + return True + + if name in ALLOW_SPECIFIC_FILE: + return True + + return False + +def main(): + cmd = ["git", "ls-files"] + proc = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + (out, _) = proc.communicate() + assert proc.returncode == 0 + res = out.decode("utf-8") + + error_list = [] + + for fname in res.split(): + if not filename_allowed(fname): + error_list.append(fname) + + if error_list: + report = "====File type check report=====\n" + report += "\n".join(error_list) + report += "\nFound %d files that are now allowed\n" % len(error_list) + report += ("We do not check in binary files into the repo.\n" + "If necessary, please discuss with committers and" + "modify tests/scripts/check_file_type.py to enable the file you need.\n") + sys.stderr.write(report) + sys.stderr.flush() + sys.exit(-1) + + print("check_file_type.py: all checks passed..") + +if __name__ == "__main__": + main() + diff --git a/tests/scripts/task_lint.sh b/tests/scripts/task_lint.sh index 99836748b572..757b059e18d2 100755 --- a/tests/scripts/task_lint.sh +++ b/tests/scripts/task_lint.sh @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -26,6 +26,9 @@ cleanup() } trap cleanup 0 +echo "Check file types..." 
+python3 tests/lint/check_file_type.py + echo "Check codestyle of c++ code..." make cpplint echo "Check codestyle of python code..." From 1cd1f3522e4d10aacccc60a83a5c6e7aba992b38 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 28 Apr 2019 13:21:08 -0700 Subject: [PATCH 057/106] [LINT] recover lint error, add asf header check (#3117) --- docs/dev/relay_add_pass.rst | 17 ++++++++++++++++ src/relay/pass/dependency_graph.cc | 19 ++++++++++++++++++ src/relay/pass/dependency_graph.h | 19 ++++++++++++++++++ tests/lint/add_asf_header.py | 5 +++++ tests/lint/rat-excludes | 1 + tests/python/relay/test_pass_partial_eval.py | 17 ++++++++++++++++ tests/scripts/task_lint.sh | 20 ++++++++++++++++++- topi/include/topi/nn/bias_add.h | 2 +- topi/include/topi/transform.h | 6 ++++-- tutorials/frontend/deploy_model_on_android.py | 17 ++++++++++++++++ 10 files changed, 119 insertions(+), 4 deletions(-) diff --git a/docs/dev/relay_add_pass.rst b/docs/dev/relay_add_pass.rst index a394fe061697..1a264ef9c012 100644 --- a/docs/dev/relay_add_pass.rst +++ b/docs/dev/relay_add_pass.rst @@ -1,3 +1,20 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + .. _relay-add-pass: Adding a Compiler Pass to Relay diff --git a/src/relay/pass/dependency_graph.cc b/src/relay/pass/dependency_graph.cc index 6e25086fe826..a9018266589a 100644 --- a/src/relay/pass/dependency_graph.cc +++ b/src/relay/pass/dependency_graph.cc @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! * Copyright (c) 2019 by Contributors * \file tvm/relay/pass/dependency_graph.cc diff --git a/src/relay/pass/dependency_graph.h b/src/relay/pass/dependency_graph.h index 91cef1ce7cde..7f53918ebcb7 100644 --- a/src/relay/pass/dependency_graph.h +++ b/src/relay/pass/dependency_graph.h @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! * Copyright (c) 2019 by Contributors. * \file tvm/relay/pass/dependency_graph.h diff --git a/tests/lint/add_asf_header.py b/tests/lint/add_asf_header.py index 3d5cc719ef15..7e0352f4bc2c 100644 --- a/tests/lint/add_asf_header.py +++ b/tests/lint/add_asf_header.py @@ -117,6 +117,9 @@ """.strip() FMT_MAP = { + "cc" : header_cstyle, + "h" : header_cstyle, + "py" : header_pystyle, "toml" : header_pystyle, "yml": header_pystyle, "yaml": header_pystyle, @@ -149,6 +152,8 @@ def main(args): print("Usage: python add_asf_header.py ") for l in open(args[1]): + if l.startswith("-----"): + continue if l.find("File:") != -1: l = l.split(":")[-1] fname = l.strip() diff --git a/tests/lint/rat-excludes b/tests/lint/rat-excludes index 235514b83073..f449c5ee68b9 100644 --- a/tests/lint/rat-excludes +++ b/tests/lint/rat-excludes @@ -39,6 +39,7 @@ MANIFEST .gitignore .gitmodules .clang-format +.bash_history rat-excludes __init__.py pylintrc diff --git a/tests/python/relay/test_pass_partial_eval.py b/tests/python/relay/test_pass_partial_eval.py index a00cebd244b3..9e0545021512 100644 --- a/tests/python/relay/test_pass_partial_eval.py +++ b/tests/python/relay/test_pass_partial_eval.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import numpy as np import tvm from tvm import relay diff --git a/tests/scripts/task_lint.sh b/tests/scripts/task_lint.sh index 757b059e18d2..e4b20a2f4b40 100755 --- a/tests/scripts/task_lint.sh +++ b/tests/scripts/task_lint.sh @@ -26,15 +26,31 @@ cleanup() } trap cleanup 0 + echo "Check file types..." python3 tests/lint/check_file_type.py +echo "Check ASF license header..." +java -jar /bin/apache-rat.jar -E tests/lint/rat-excludes -d . |grep "== File" > /tmp/$$.apache-rat.txt || true +if grep --quiet -E "File" /tmp/$$.apache-rat.txt; then + echo "Need to add ASF header to the following files." 
+ echo "----------------File List----------------" + cat /tmp/$$.apache-rat.txt + echo "-----------------------------------------" + echo "Use the following steps to add the headers:" + echo "- Create file_list.txt in your text editor" + echo "- Copy paste the above content in file-list into file_list.txt" + echo "- python3 tests/lint/add_asf_header.py file_list.txt" + exit -1 +fi + echo "Check codestyle of c++ code..." make cpplint echo "Check codestyle of python code..." make pylint echo "Check codestyle of jni code..." make jnilint + echo "Check documentations of c++ code..." make doc 2>/tmp/$$.log.txt @@ -42,4 +58,6 @@ grep -v -E "ENABLE_PREPROCESSING|unsupported tag" < /tmp/$$.log.txt > /tmp/$$.lo echo "---------Error Log----------" cat /tmp/$$.logclean.txt echo "----------------------------" -grep -E "warning|error" < /tmp/$$.logclean.txt || true +if grep --quiet -E "warning|error" < /tmp/$$.logclean.txt; then + exit -1 +fi diff --git a/topi/include/topi/nn/bias_add.h b/topi/include/topi/nn/bias_add.h index fb4ae30ca404..2a7afd8b6714 100644 --- a/topi/include/topi/nn/bias_add.h +++ b/topi/include/topi/nn/bias_add.h @@ -40,7 +40,7 @@ namespace nn { * * \param data Tensor with shape [batch, in_dim] * \param bias Tensor with shape [batch]. -* +* \param axis The axis to add the bias to. * \return Tensor with shape [batch, in_dim] */ inline tvm::Tensor bias_add(const tvm::Tensor& data, const tvm::Tensor& bias, int axis) { diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index a658ba3cf995..946240352076 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -617,6 +617,7 @@ inline Array split_sections(const Tensor& x, * \param a The source array. * \param indices The indices of the values to extract. * \param name The name of the operation. +* \param mode The mode of to handle out of bound indices. * \param tag The tag to mark the operation. * * \return A Tensor whose op member is the take operation @@ -655,6 +656,7 @@ inline Tensor take(const Tensor& a, * \param indices The indices of the values to extract. * \param axis The axis over which to select values. By default, * the flattened input array is used. +* \param mode The mode of to handle out of bound indices. * \param name The name of the operation. * \param tag The tag to mark the operation. * diff --git a/tutorials/frontend/deploy_model_on_android.py b/tutorials/frontend/deploy_model_on_android.py index 4ec72f6c4c5a..6985e3ad793d 100644 --- a/tutorials/frontend/deploy_model_on_android.py +++ b/tutorials/frontend/deploy_model_on_android.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """ .. _tutorial-deploy-model-on-android: From 5b5ff5110066e74a2d3810039164ede6db23f8eb Mon Sep 17 00:00:00 2001 From: masahi Date: Mon, 29 Apr 2019 11:18:41 +0900 Subject: [PATCH 058/106] [Relay, OpFusion] Better tuple fusion implementation (#3092) --- include/tvm/relay/op_attr_types.h | 3 + python/tvm/relay/op/op.py | 2 + src/relay/pass/fuse_ops.cc | 54 +++-- .../relay/test_backend_compile_engine.py | 10 +- tests/python/relay/test_pass_fuse_ops.py | 212 ++++++++++++++---- 5 files changed, 214 insertions(+), 67 deletions(-) diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h index 464bc1cc0b64..ca7f6e5d3908 100644 --- a/include/tvm/relay/op_attr_types.h +++ b/include/tvm/relay/op_attr_types.h @@ -49,6 +49,9 @@ enum OpPatternKind { // Complex operation, can still fuse elemwise operations into its output. // but cannot chain another complex op kOutEWiseFusable = 4, + // The pattern for tuple nodes. Can fuse into subsequent injective ops, + // but treated specially + kTuple = 7, // Opaque operation, cannot fuse anything. kOpaque = 8 }; diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index 6312f023df0d..6ba207934d1b 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -112,6 +112,8 @@ class OpPattern(object): COMM_REDUCE = 3 # Complex op, can still fuse ewise into it OUT_ELEMWISE_FUSABLE = 4 + # Represents tuple node + TUPLE = 7 # Not fusable opaque op OPAQUE = 8 diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index 12e3174dcade..55d609872929 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -267,7 +267,7 @@ class IndexedForwardGraph::Creator : private ExprVisitor { void VisitExpr_(const TupleNode* op) final { CHECK(graph_.node_map.count(op)); Node* tuple_node = graph_.node_map.at(op); - tuple_node->pattern = kInjective; + tuple_node->pattern = kTuple; for (const Expr& field : op->fields) { if (field->checked_type().as()) { this->Update(field, tuple_node, kInjective); @@ -661,12 +661,36 @@ class GraphPartitioner { // no actions needed if the current node have no dominator if (dom_node->parent == nullptr) continue; CHECK(!graph_node->extern_ref); - // Skip if current node is already fused to the parent. 
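+      // Tuple handling is deferred to a new third phase (phase == 2) below:
+      // a node is fused across an intermediate tuple only when the tuple's
+      // group root is at most injective, so complex ops such as conv2d
+      // never end up absorbing a tuple node.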
size_t dom_parent_gindex = dom_node->parent->gnode->index; + + if (phase == 2) { + // Fuse injective ops into intermediate tuples, if any + if (group_node->pattern > kInjective) continue; + Group* dom_parent_group = groups_[dom_parent_gindex]; + Group* dom_root_group = dom_parent_group->FindRoot(); + // If dom node group has a tuple as its root, we do not fuse tuple fields into it + if (dom_root_group->pattern == kTuple) continue; + if (dom_parent_group->pattern == kTuple && dom_root_group->pattern <= kInjective) { + // Now we know the tuple has been fused into subsequent injective ops + auto fcond = [](OpPatternKind kind, bool is_sink) { + return kind <= kInjective; + }; + // dom_root_group can also be tuple, as in inception layers + // CheckPath is needed to avoid fusing two intermediate tuples + if (CheckPath(graph_node, dom_node->parent->gnode, fcond)) { + CommitFuse(graph_node, dom_node->parent->gnode); + } + } + continue; + } + + // Skip if current node is already fused to the parent. if (groups_[dom_parent_gindex] != nullptr && group_node->FindRoot() == groups_[dom_parent_gindex]->FindRoot()) { continue; } + // Do not fuse into tuple for now + if (groups_[dom_parent_gindex]->pattern == kTuple) continue; // Try to fuse current node to its post-dominator. if (group_node->pattern == kOutEWiseFusable) { if (phase != 0) continue; @@ -702,7 +726,7 @@ class GraphPartitioner { CommitFuse(graph_node, dom_node->parent->gnode); } } - } else if (group_node->pattern == kInjective) { + } else if (group_node->pattern == kInjective || group_node->pattern == kTuple) { // defer injective fusion to second phase. // so conv2d always finishes fusing. if (phase != 1) continue; @@ -728,7 +752,7 @@ GraphPartitioner::Partition(const IndexedForwardGraph& graph) { // get post dominator tree auto post_dom_tree = DominatorTree::PostDom(arena_, graph); // run fusion algorithm. - for (int phase = 0; phase < 2; ++phase) { + for (int phase = 0; phase < 3; ++phase) { this->RunFuse(graph, post_dom_tree, phase); } return std::move(groups_); @@ -821,29 +845,11 @@ class FuseMutator : private ExprMutator { Expr VisitExpr_(const TupleNode* tuple) { auto* ret_group = gmap_.at(tuple)->FindRoot(); - Array new_fields = GetNewArguments(tuple->fields, ret_group); if (ret_group == gmap_.at(tuple)) { - // This tuple is the root of its group. Check if all fields come from other groups. 
- bool isolated = new_fields.size() == ginfo_[ret_group].params.size(); - for (size_t i = 0; i < new_fields.size() && isolated; ++i) { - isolated &= (new_fields[i].same_as(ginfo_[ret_group].params[i])); - } - if (isolated) { - // Do not put a isolated tuple into a function - return ExprMutator::VisitExpr_(tuple); - } - // This tuple has been fused with other ops before it - for (size_t i = 0; i < new_fields.size(); i++) { - // Copy function arguments to tuple field of the output because currently graph memory - // planer doesn't support inplace operations - if (new_fields[i].as()) { - auto copy = Copy(new_fields[i]); - new_fields.Set(i, copy); - } - } - return MakeNewFunction(ret_group, tuple->checked_type(), TupleNode::make(new_fields)); + return ExprMutator::VisitExpr_(tuple); } // This tuple is an intermediate node in the group + Array new_fields = GetNewArguments(tuple->fields, ret_group); return TupleNode::make(new_fields); } diff --git a/tests/python/relay/test_backend_compile_engine.py b/tests/python/relay/test_backend_compile_engine.py index 3b479b847619..ca4619c97886 100644 --- a/tests/python/relay/test_backend_compile_engine.py +++ b/tests/python/relay/test_backend_compile_engine.py @@ -69,8 +69,16 @@ def test_compile_injective_with_tuple(): relay.build(func, 'llvm') +def test_compile_tuple_dup(): + x = relay.var("data", shape=(16, 16)) + log = relay.log(x) + output = relay.Tuple([log, log]) + f = relay.Function([x], output) + relay.build(f, 'llvm') + + if __name__ == "__main__": test_compile_engine() test_compile_placeholder_bypass() test_compile_injective_with_tuple() - + test_compile_tuple_dup() diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py index baafbeebd560..bdffdf7c129f 100644 --- a/tests/python/relay/test_pass_fuse_ops.py +++ b/tests/python/relay/test_pass_fuse_ops.py @@ -176,16 +176,14 @@ def expected(dshape): f0 = relay.Function([x], pooled) p0 = relay.var("p0", shape=(dshape[0], dshape[1], dshape[2]//2, dshape[3]//2)) - p1 = relay.var("p1", shape=(dshape[0], dshape[1], dshape[2], dshape[3])) - p1_copy = relay.copy(p1) upsampled = relay.nn.upsampling(p0, scale=2, layout="NCHW") - out = relay.Tuple((upsampled, p1_copy)) - f1 = relay.Function([p0, p1], out) + f1 = relay.Function([p0], upsampled) x = relay.var("x", shape=dshape) y = relay.Call(f0, [x]) - z = relay.Call(f1, [y, x]) - return relay.Function([x], z) + z = relay.Call(f1, [y]) + tup = relay.Tuple((z, x)) + return relay.Function([x], tup) dshape = (1, 16, 64, 64) z = before(dshape) @@ -199,41 +197,6 @@ def expected(dshape): assert relay.ir_pass.alpha_equal(zz, after) -def test_tuple_strided_slice(): - """ - Test fusion case where the number of fields of tuple and - the number of parameters to the function containing the tuple are different - """ - - def before(dshape): - x = relay.var("x", shape=dshape) - slice1 = relay.strided_slice(x, begin=[0, 0], end=[dshape[1]//2, dshape[1]], strides=[1,1]) - slice2 = relay.strided_slice(x, begin=[dshape[1]//2, 0], end=[dshape[0], dshape[1]], strides=[1,1]) - out = relay.Tuple((slice1, slice2)) - return relay.Function([x], out) - - def expected(dshape): - x = relay.var("x", shape=dshape) - slice1 = relay.strided_slice(x, begin=[0, 0], end=[dshape[1]//2, dshape[1]], strides=[1,1]) - slice2 = relay.strided_slice(x, begin=[dshape[1]//2, 0], end=[dshape[0], dshape[1]], strides=[1,1]) - out = relay.Tuple((slice1, slice2)) - f0 = relay.Function([x], out) - - x = relay.var("x", shape=dshape) - y = relay.Call(f0, [x]) - return 
relay.Function([x], y) - - dshape = (64, 64) - z = before(dshape) - z = relay.ir_pass.infer_type(z) - zz = relay.ir_pass.fuse_ops(z, opt_level=0) - assert not relay.ir_pass.free_vars(zz) - zz = relay.ir_pass.fuse_ops(z, opt_level=2) - zz = relay.ir_pass.infer_type(zz) - assert not relay.ir_pass.free_vars(zz) - after = relay.ir_pass.infer_type(expected(dshape)) - assert relay.ir_pass.alpha_equal(zz, after) - def test_stop_fusion(): def before(dshape): @@ -377,13 +340,178 @@ def expected(dim): assert relay.ir_pass.alpha_equal(zz, after) +def test_tuple_intermediate(): + def before(x): + inj = relay.squeeze(x) + y1 = relay.add(inj, relay.const(1, "float32")) + tmp = relay.squeeze(inj) + tmp = relay.add(tmp, relay.const(1, "float32")) + y2 = relay.add(tmp, relay.const(1, "float32")) + y3 = relay.add(inj, relay.const(1, "float32")) + concat = relay.concatenate((y1, y2, y3), axis=1) + out_inj = relay.squeeze(concat) + out = relay.add(out_inj, relay.const(1, "float32")) + return relay.Function(relay.ir_pass.free_vars(out), out) + + def expected(p0): + f0 = before(p0) + x = relay.var("x", shape=dshape) + y = relay.Call(f0, [x]) + return relay.Function([x], y) + + dshape = (1, 16, 64, 64) + x = relay.var("x", shape=dshape) + z = before(x) + z = relay.ir_pass.infer_type(z) + zz = relay.ir_pass.fuse_ops(z, opt_level=0) + assert not relay.ir_pass.free_vars(zz) + zz = relay.ir_pass.fuse_ops(z, opt_level=2) + relay.build(zz, 'llvm') + zz = relay.ir_pass.infer_type(zz) + assert not relay.ir_pass.free_vars(zz) + after = relay.ir_pass.infer_type(expected(x)) + assert relay.ir_pass.alpha_equal(zz, after) + + +def test_tuple_consecutive(): + def gen_intermediate_tuple(x): + y1 = relay.add(x, relay.const(1, "float32")) + y2 = relay.add(x, relay.const(1, "float32")) + y3 = relay.add(x, relay.const(1, "float32")) + concat = relay.concatenate((y1, y2, y3), axis=1) + out = relay.add(concat, relay.const(1, "float32")) + return out + + def gen_consecutive_tuple(x): + y1 = gen_intermediate_tuple(x) + y2 = gen_intermediate_tuple(x) + y3 = gen_intermediate_tuple(x) + concat = relay.concatenate((y1, y2, y3), axis=1) + return concat + + def before(x): + concat = gen_consecutive_tuple(x) + pooled = relay.nn.max_pool2d(concat, pool_size=(2, 2), strides=(2, 2), padding=(0, 0)) + out = relay.add(pooled, relay.const(1, "float32")) + out2 = relay.add(out, relay.const(1, "float32")) + out_tup = relay.Tuple((out, out2)) + return relay.Function(relay.ir_pass.free_vars(out_tup), out_tup) + + def expected(dshape): + p0 = relay.var("p0", shape=dshape) + concat = gen_consecutive_tuple(p0) + f0 = relay.Function([p0], concat) + + p01 = relay.var("p01", shape=(1, dshape[1]*9, dshape[2], dshape[3])) + pooled = relay.nn.max_pool2d(p01, pool_size=(2, 2), strides=(2, 2), padding=(0, 0)) + out = relay.add(pooled, relay.const(1, "float32")) + f1 = relay.Function([p01], out) + + p02 = relay.var("p02", shape=(1, dshape[1]*9, dshape[2]//2, dshape[3]//2)) + out = relay.add(p02, relay.const(1, "float32")) + f2 = relay.Function([p02], out) + + x = relay.var("x", shape=dshape) + y = relay.Call(f0, [x]) + z = relay.Call(f1, [y]) + z2 = relay.Call(f2, [z]) + + return relay.Function([x], relay.Tuple((z, z2))) + + dshape = (1, 16, 64, 64) + x = relay.var("x", shape=dshape) + z = before(x) + z = relay.ir_pass.infer_type(z) + zz = relay.ir_pass.fuse_ops(z, opt_level=0) + assert not relay.ir_pass.free_vars(zz) + zz = relay.ir_pass.fuse_ops(z, opt_level=2) + relay.build(zz, 'llvm') + zz = relay.ir_pass.infer_type(zz) + assert not 
relay.ir_pass.free_vars(zz) + after = relay.ir_pass.infer_type(expected(dshape)) + assert relay.ir_pass.alpha_equal(zz, after) + + +def test_inception_like(): + def conv(data): + y = relay.nn.conv2d(data, relay.var("w"), + kernel_size=(3, 3), + padding=(1, 1), + channels=16) + return relay.nn.relu(data=y) + + def inception_like(data): + c0 = conv(data) + c1 = conv(data) + return relay.concatenate((c0, c1), axis=1) + + def before(dshape): + x = relay.var("x", shape=dshape) + in1 = inception_like(x) + in2 = inception_like(in1) + return relay.Function(relay.ir_pass.free_vars(in2), in2) + + def expected(dshape): + p0 = relay.var("p0", shape=dshape) + c = conv(p0) + f0 = relay.Function(relay.ir_pass.free_vars(c), c) + + p01 = relay.var("p01", shape=dshape) + c = conv(p01) + f1 = relay.Function(relay.ir_pass.free_vars(c), c) + + p02 = relay.var("p02", shape=dshape) + p12 = relay.var("p12", shape=dshape) + concat1 = relay.concatenate((p02, p12), axis=1) + f_concat1 = relay.Function([p02, p12], concat1) + + dshape2 = (dshape[0], dshape[1]*2, dshape[2], dshape[3]) + + p03 = relay.var("p03", shape=dshape2) + c = conv(p03) + f2 = relay.Function(relay.ir_pass.free_vars(c), c) + + p04 = relay.var("p04", shape=dshape2) + c = conv(p04) + f3 = relay.Function(relay.ir_pass.free_vars(c), c) + + p05 = relay.var("p05", shape=dshape) + p15 = relay.var("p15", shape=dshape) + concat2 = relay.concatenate((p05, p15), axis=1) + f_concat2 = relay.Function([p05, p15], concat2) + + x = relay.var("x", shape=dshape) + c1 = relay.Call(f0, [x, relay.var("w1")]) + c2 = relay.Call(f1, [x, relay.var("w2")]) + concat = relay.Call(f_concat1, [c1, c2]) + c3 = relay.Call(f2, [concat, relay.var("w3")]) + c4 = relay.Call(f3, [concat, relay.var("w4")]) + out = relay.Call(f_concat2, [c3, c4]) + + return relay.Function(relay.ir_pass.free_vars(out), out) + + dshape = (1, 16, 64, 64) + z = before(dshape) + z = relay.ir_pass.infer_type(z) + zz = relay.ir_pass.fuse_ops(z, opt_level=0) + assert not relay.ir_pass.free_vars(zz) + zz = relay.ir_pass.fuse_ops(z, opt_level=2) + relay.build(zz, 'llvm') + zz = relay.ir_pass.infer_type(zz) + assert not relay.ir_pass.free_vars(zz) + after = relay.ir_pass.infer_type(expected(dshape)) + assert relay.ir_pass.alpha_equal(zz, after) + + if __name__ == "__main__": test_fuse_simple() test_conv2d_fuse() test_concatenate() test_tuple_root() - test_tuple_strided_slice() test_stop_fusion() test_fuse_myia_regression() test_fuse_tuple_get_elemwise() test_tuple_get_root() + test_tuple_intermediate() + test_tuple_consecutive() + test_inception_like() From ceeefc4eaeaab171de6f00741e4d59f9209ec538 Mon Sep 17 00:00:00 2001 From: Gemfield Date: Mon, 29 Apr 2019 10:24:28 +0800 Subject: [PATCH 059/106] porting new upsample test case from nnvm to relay (#3115) --- tests/python/frontend/onnx/test_forward.py | 30 ++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 1e89b9ddaa8c..2564d83b1fc2 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -427,9 +427,39 @@ def _test_upsample_bilinear(): tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, 'float32') tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5) +def _test_upsample_bilinear_opset9(): + scale = 2 + in_shape = (1, 1, 3, 3) + out_shape = (1, 1, 3*scale, 3*scale) + y = helper.make_node("Upsample", ['in','scales'], ['out'], mode='linear') + scales=[1.0, 1.0, 2.0, 2.0] + 
in_array = np.random.uniform(size=in_shape).astype(np.float32) + out_array = topi.testing.bilinear_resize_python(in_array, (3*scale, 3*scale), "NCHW") + + ref_array = np.array(scales) + ref_node = helper.make_node('Constant', + inputs=[], + outputs=['scales'], + value=onnx.helper.make_tensor(name = 'const_tensor', + data_type = TensorProto.FLOAT, + dims = ref_array.shape, + vals = ref_array.flatten().astype(float))) + + graph = helper.make_graph([ref_node, y], + 'upsample_bilinear_opset9_test', + inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))]) + + model = helper.make_model(graph, producer_name='upsample_bilinear_opset9_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, 'float32') + tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5) + def test_upsample(): _test_upsample_nearest() _test_upsample_bilinear() + _test_upsample_bilinear_opset9() def _test_softmax(inshape, axis): opname = 'Softmax' From 69f0a93a92950418242d5caaddedb23c24e700ca Mon Sep 17 00:00:00 2001 From: Yizhi Liu Date: Sun, 28 Apr 2019 19:25:38 -0700 Subject: [PATCH 060/106] [Lang] Fix undef BijectiveLayout and add scalar layout support (#3105) --- include/tvm/data_layout.h | 6 ++++-- src/lang/data_layout.cc | 9 ++++++++- src/relay/pass/alter_op_layout.h | 12 +++++++---- .../python/relay/test_pass_alter_op_layout.py | 20 +++++++++---------- .../python/unittest/test_lang_data_layout.py | 6 ++++++ 5 files changed, 36 insertions(+), 17 deletions(-) diff --git a/include/tvm/data_layout.h b/include/tvm/data_layout.h index ed61fd3903b1..ff5f8e37dbb6 100644 --- a/include/tvm/data_layout.h +++ b/include/tvm/data_layout.h @@ -94,12 +94,13 @@ class Layout; // Internal node container Buffer class LayoutNode : public Node { public: - /*! \brief string representation of layout */ + /*! \brief string representation of layout, "" for scalar. */ std::string name; /*! \brief specify each axis of the layout, * in which the variable name is the name of the axis. * The IterVar's extent indicates the size of the axis, * it is a variable for a primal axis, but a constant for a subordinate axis. + * Empty for scalar's layout. */ Array axes; @@ -122,6 +123,7 @@ class LayoutNode : public Node { * For example, NCHW16c can describe a 5-D tensor of * [batch_size, channel, height, width, channel_block]. * Here subordinate axis channel_block=16 is the factor size of the primal axis C (channel). + * Layout for scalar is defined, while both its name and axes have size 0. */ class Layout : public NodeRef { public: @@ -175,7 +177,7 @@ class Layout : public NodeRef { * that starts at dimension \p pos and spans \p len dimensions * (or until the end of the layout, whichever comes first). * \param pos The start position. - * \param len The length of the sub-layout. + * \param len The length of the sub-layout. if 0, return layout of scalar * \return A newly constructed Layout object. 
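+   * For example, Layout("NCHW").SubLayout(1, 2) yields the layout "CH",
+   * while a zero-length request such as SubLayout(0, 0) yields the scalar
+   * layout, whose name is the empty string.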
*/ Layout SubLayout(size_t pos, size_t len) const; diff --git a/src/lang/data_layout.cc b/src/lang/data_layout.cc index 71b323c57daa..9f3d0fe494a2 100644 --- a/src/lang/data_layout.cc +++ b/src/lang/data_layout.cc @@ -88,12 +88,14 @@ Layout::Layout(const Array& axes) { } Layout::Layout(const std::string& name) { // NOLINT(*) - if (name.empty() || name == "__undef__") return; + if (name == "__undef__") return; node_ = make_node(); LayoutNode *node = operator->(); node->name = name; + if (name.empty()) return; // scalar + // parse layout string int32_t factor = 0; for (char c : name) { @@ -146,6 +148,7 @@ Layout LayoutNode::make(const std::string& layout) { Layout Layout::SubLayout(size_t pos, size_t len) const { if (!defined() || pos > ndim()) return Layout::Undef(); + if (len == 0) return Layout(Array()); if (pos + len > ndim()) len = ndim() - pos; Array new_layout; const auto axes = operator->()->axes; @@ -195,6 +198,10 @@ int32_t Layout::FactorOf(const LayoutAxis& axis) const { inline bool GetStoreRule(Array* rule, const Layout& src_layout, const Layout& dst_layout) { + if (!src_layout.defined() || src_layout.name().empty() || + !dst_layout.defined() || dst_layout.name().empty()) { + return false; + } for (size_t i = 0; i < dst_layout.ndim(); ++i) { const auto& store_axis = dst_layout[i]; const IterVar& store_axis_impl = dst_layout->axes[i]; diff --git a/src/relay/pass/alter_op_layout.h b/src/relay/pass/alter_op_layout.h index e5040259a5c4..80593a521f25 100644 --- a/src/relay/pass/alter_op_layout.h +++ b/src/relay/pass/alter_op_layout.h @@ -97,15 +97,19 @@ inline Array > BinaryBroadcastLayout(const Attrs& attrs, if (old_in_shapes[defined_idx].size() >= old_in_shapes[undef_idx].size()) { layouts.Set(undef_idx, layouts[defined_idx].SubLayout( - old_in_shapes[defined_idx].size() - old_in_shapes[undef_idx].size(), - old_in_shapes[undef_idx].size())); - return Array > {layouts, {layouts[defined_idx]}}; + old_in_shapes[defined_idx].size() - old_in_shapes[undef_idx].size(), + old_in_shapes[undef_idx].size())); + return Array >{layouts, {layouts[defined_idx]}}; } else { // only know the tensor with smaller dimensions, // so we cannot infer the final broadcasted output. // fails in this case. - return Array > {{Layout::Undef()}, {Layout::Undef()}}; + return Array >{{Layout::Undef()}, {Layout::Undef()}}; } + } else if (layouts[0].defined() && layouts[1].defined() && + (layouts[0].ndim() == 0 || layouts[1].ndim() == 0)) { + int scalar = layouts[0].ndim() == 0 ? 0 : 1; + return Array >{layouts, {layouts[1-scalar]}}; } else { // try to broadcast the tensors to the larger dimension int large_idx = layouts[0].ndim_primal() >= layouts[1].ndim_primal() ? 
0 : 1; diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index f7a1c83ddff1..b000bae031d1 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -57,7 +57,7 @@ def expected(): b = expected() b = infer_type(b) - assert(alpha_equal(a, b)) + assert alpha_equal(a, b), "Actual = \n" + str(a) def test_alter_return_none(): @@ -81,7 +81,7 @@ def alter_conv2d(attrs, inputs, tinfos): b = before() b = infer_type(b) - assert(alpha_equal(a, b)) + assert alpha_equal(a, b), "Actual = \n" + str(a) assert(called[0]) @@ -147,7 +147,7 @@ def expected(): b = expected() b = infer_type(b) - assert(alpha_equal(a, b)) + assert alpha_equal(a, b), "Actual = \n" + str(a) def test_alter_layout_dual_path(): @@ -213,7 +213,7 @@ def expected(): b = expected() b = infer_type(b) - assert(alpha_equal(a, b)) + assert alpha_equal(a, b), "Actual = \n" + str(a) def test_alter_layout_resnet(): """Test alternating the layout of a residual block @@ -273,7 +273,7 @@ def expected(): b = expected() b = infer_type(b) - assert(alpha_equal(a, b)) + assert alpha_equal(a, b), "Actual = \n" + str(a) def test_alter_layout_broadcast_op(): @@ -323,7 +323,7 @@ def expected(): b = expected() b = infer_type(b) - assert(alpha_equal(a, b)) + assert alpha_equal(a, b), "Actual = \n" + str(a) def test_alter_layout_scalar(): """Test alternating the layout of a conv2d. @@ -370,7 +370,7 @@ def expected(): b = expected() b = infer_type(b) - assert(alpha_equal(a, b)) + assert alpha_equal(a, b), "Actual = \n" + str(a) def test_alter_layout_concatenate(): """ """ @@ -425,7 +425,7 @@ def expected(): b = expected() b = infer_type(b) - assert(alpha_equal(a, b)) + assert alpha_equal(a, b), "Actual = \n" + str(a) def test_alter_layout_nchw_upsamping_op(): @@ -469,7 +469,7 @@ def expected(): b = expected() b = infer_type(b) - assert(alpha_equal(a, b)) + assert alpha_equal(a, b), "Actual = \n" + str(a) def test_alter_layout_strided_slice(): @@ -511,7 +511,7 @@ def expected(): b = expected() b = infer_type(b) - assert(alpha_equal(a, b)) + assert alpha_equal(a, b), "Actual = \n" + str(a) if __name__ == "__main__": diff --git a/tests/python/unittest/test_lang_data_layout.py b/tests/python/unittest/test_lang_data_layout.py index 164fe1a23098..cde4a813d89a 100644 --- a/tests/python/unittest/test_lang_data_layout.py +++ b/tests/python/unittest/test_lang_data_layout.py @@ -52,6 +52,12 @@ def test_layout(): def test_bilayout_convertible(): # not convertible assert tvm.bijective_layout("NCHW", "ABCD") is None + assert tvm.bijective_layout("__undef__", "NCHW") is None + assert tvm.bijective_layout("NCHW", "__undef__") is None + assert tvm.bijective_layout("__undef__", "__undef__") is None + assert tvm.bijective_layout("", "NCHW") is None + assert tvm.bijective_layout("NCHW", "") is None + assert tvm.bijective_layout("", "") is None # convertible assert tvm.bijective_layout("NCHW", "NCHW16c") is not None From afcb3d900bdc9b6ba0efc9984d0cd97b972490f1 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Sun, 28 Apr 2019 20:47:21 -0700 Subject: [PATCH 061/106] [Relay][TOPI] Gluncv SSD support on the GPU (#2784) * ssd gluoncv gpu op updated * ssd gluoncv gpu op updated * tutorials and testes modified * tutorials and testes modified * fix lint * fix lint * address comment * multibox bug fixed * space line added * use less threads per block * use less threads per block * less threads per block for get valid count * less threads per block for get valid count * 
merge with master * Revert "less threads per block for get valid count" This reverts commit 08896cfccc34b0b2a1646d01d01ea4cad73941c4. * Revert "less threads per block for get valid count" This reverts commit 08896cfccc34b0b2a1646d01d01ea4cad73941c4. * typo fixed * elem length made to a variable * fix lint error * fix lint error * lint fixed * bug fixed * bug fixed * lint fixed * error fixed * error fixed * test ci * test ci * separate argsort to be an independent op * separate argsort to be an independent op * fix lint * fix lint * remove unsupported models * typo fixed * argsort added to relay * solve conflicts with master * fix lint * fix lint * test push * Revert "test push" This reverts commit 6db00883fab6cc06bddf564c926bb27c874397d8. * fix lint error * fix more lint * cpu test_sort updated * debug ci * nms fixed * expose argsort to relay frontend * test ci * fix lint * sort register error fixed * fix nnvm * nms type fixed * adaptive pooling added to relay * Revert "adaptive pooling added to relay" This reverts commit 1119f1f2c055753e0cc5611627597749134c5c8c. * fix lint * expose argsort op * fix lint * fix lint * fix lint * sort test updated * sort bug fixed * nnvm error fixed * fix argsort default data type returned to be float instead of int * fix lint * fix lint * test fixed * fix valid count * fix titanx bug * tutorial add both targets * titanx error fixed * try to fix CI old gpu error * try to solve CI GPU error * get_valid_count added * reverse get_valid_count * get valid count optimized * address comments * fix ci error * remove unnecessary block sync * add back one sync * address comments * address more comments * more comments * move sort to be independent algorithm * typo fixed * more typos * comments addressed * doc updated * fix pylint * address final comments * apache license added --- docs/langref/relay_op.rst | 13 + include/tvm/relay/attrs/algorithm.h | 53 ++ include/tvm/relay/attrs/vision.h | 6 + nnvm/include/nnvm/top/nn.h | 6 + nnvm/python/nnvm/top/vision.py | 10 +- nnvm/tests/python/compiler/test_top_level4.py | 51 +- python/tvm/relay/__init__.py | 1 + python/tvm/relay/frontend/mxnet.py | 29 +- python/tvm/relay/op/__init__.py | 2 + python/tvm/relay/op/_algorithm.py | 45 ++ python/tvm/relay/op/algorithm.py | 47 ++ python/tvm/relay/op/tensor.py | 24 + python/tvm/relay/op/transform.py | 23 - python/tvm/relay/op/vision/_vision.py | 5 +- python/tvm/relay/op/vision/nms.py | 11 +- src/contrib/sort/sort.cc | 75 +- src/relay/op/algorithm/sort.cc | 78 ++ src/relay/op/vision/nms.cc | 4 + tests/python/contrib/test_sort.py | 12 +- tests/python/relay/test_op_level5.py | 25 +- tests/python/relay/test_op_level6.py | 49 ++ topi/python/topi/__init__.py | 1 + topi/python/topi/cuda/nms.py | 747 ++++++++++++++---- topi/python/topi/cuda/sort.py | 249 ++++++ topi/python/topi/cuda/ssd/multibox.py | 219 ++--- topi/python/topi/cuda/vision.py | 37 +- topi/python/topi/generic/__init__.py | 1 + topi/python/topi/generic/sort.py | 38 + topi/python/topi/sort.py | 105 +++ topi/python/topi/vision/nms.py | 41 +- topi/python/topi/vision/ssd/multibox.py | 6 +- topi/tests/python/test_topi_sort.py | 59 ++ topi/tests/python/test_topi_vision.py | 8 +- tutorials/frontend/deploy_ssd_gluoncv.py | 23 +- 34 files changed, 1731 insertions(+), 372 deletions(-) create mode 100644 include/tvm/relay/attrs/algorithm.h create mode 100644 python/tvm/relay/op/_algorithm.py create mode 100644 python/tvm/relay/op/algorithm.py create mode 100644 src/relay/op/algorithm/sort.cc create mode 100644
tests/python/relay/test_op_level6.py create mode 100644 topi/python/topi/cuda/sort.py create mode 100644 topi/python/topi/generic/sort.py create mode 100644 topi/python/topi/sort.py create mode 100644 topi/tests/python/test_topi_sort.py diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index c45e9b92ab6f..4719aba6d3f9 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -165,6 +165,14 @@ This level enables additional math and transform operators. tvm.relay.vision.yolo_reorg +**Level 6: Algorithm Operators** + +.. autosummary:: + :nosignatures: + + tvm.relay.argsort + + **Level 10: Temporary Operators** This level support backpropagation of broadcast operators. It is temporary. @@ -294,6 +302,11 @@ Level 5 Definitions .. autofunction:: tvm.relay.vision.yolo_reorg +Level 6 Definitions +------------------- +.. autofunction:: tvm.relay.argsort + + Level 10 Definitions -------------------- .. autofunction:: tvm.relay.broadcast_to_like diff --git a/include/tvm/relay/attrs/algorithm.h b/include/tvm/relay/attrs/algorithm.h new file mode 100644 index 000000000000..20f135c11bba --- /dev/null +++ b/include/tvm/relay/attrs/algorithm.h @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/relay/attrs/vision.h + * \brief Auxiliary attributes for vision operators. + */ +#ifndef TVM_RELAY_ATTRS_ALGORITHM_H_ +#define TVM_RELAY_ATTRS_ALGORITHM_H_ + +#include +#include + +namespace tvm { +namespace relay { + +/*! \brief Attributes used in argsort operators */ +struct ArgsortAttrs : public tvm::AttrsNode { + int axis; + bool is_ascend; + DataType dtype; + + TVM_DECLARE_ATTRS(ArgsortAttrs, "relay.attrs.ArgsortAttrs") { + TVM_ATTR_FIELD(axis).set_default(-1) + .describe("Axis along which to sort the input tensor." + "If not given, the flattened array is used."); + TVM_ATTR_FIELD(is_ascend).set_default(true) + .describe("Whether to sort in ascending or descending order." + "By default, sort in ascending order"); + TVM_ATTR_FIELD(dtype).set_default(NullValue()) + .describe("DType of the output indices."); + } +}; + +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_ATTRS_ALGORITHM_H_ diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h index 2b3eb4f32b45..11b4ebfcfaad 100644 --- a/include/tvm/relay/attrs/vision.h +++ b/include/tvm/relay/attrs/vision.h @@ -92,6 +92,8 @@ struct NonMaximumSuppressionAttrs : public tvm::AttrsNode& lhs, } -// Argsort implemented C library sort. +// Argsort implemented C library sort for nms. // Return indices of sorted tensor. // By default, the last axis will be used to sort. // sort_num specify the number of elements to be sorted. 
// If input tensor has dimension (d0, d1, ..., d(k-1), dk, d(k+1), ..., d(n-1)) // and sort axis is dk. sort_num should have dimension of // (d1, d2, ..., d(k-1), d(k+1), ..., dn). -TVM_REGISTER_GLOBAL("tvm.contrib.sort.argsort") +TVM_REGISTER_GLOBAL("tvm.contrib.sort.argsort_nms") .set_body([](TVMArgs args, TVMRetValue *ret) { DLTensor *input = args[0]; DLTensor *sort_num = args[1]; DLTensor *output = args[2]; int32_t axis = args[3]; - bool is_descend = args[4]; + bool is_ascend = args[4]; auto dtype = input->dtype; auto data_ptr = static_cast(input->data); @@ -97,10 +97,10 @@ TVM_REGISTER_GLOBAL("tvm.contrib.sort.argsort") int64_t full_idx = base_idx + k * axis_mul_after; sorter.emplace_back(std::make_pair(k, *(data_ptr + full_idx))); } - if (is_descend) { - std::stable_sort(sorter.begin(), sorter.end(), CompareDescend); - } else { + if (is_ascend) { std::stable_sort(sorter.begin(), sorter.end(), CompareAscend); + } else { + std::stable_sort(sorter.begin(), sorter.end(), CompareDescend); } for (int32_t k = 0; k < input->shape[axis]; ++k) { *(static_cast(output->data) + base_idx + k * axis_mul_after) @@ -110,5 +110,68 @@ TVM_REGISTER_GLOBAL("tvm.contrib.sort.argsort") } }); + +// Argsort implemented C library sort. +// Return indices of sorted tensor. +// By default, the last axis will be used to sort. +// sort_num specify the number of elements to be sorted. +// If input tensor has dimension (d0, d1, ..., d(k-1), dk, d(k+1), ..., d(n-1)) +// and sort axis is dk. sort_num should have dimension of +// (d1, d2, ..., d(k-1), d(k+1), ..., dn). +TVM_REGISTER_GLOBAL("tvm.contrib.sort.argsort") +.set_body([](TVMArgs args, TVMRetValue *ret) { + DLTensor *input = args[0]; + DLTensor *output = args[1]; + int32_t axis = args[2]; + bool is_ascend = args[3]; + + auto dtype = input->dtype; + auto data_ptr = static_cast(input->data); + std::vector> sorter; + int64_t axis_mul_before = 1; + int64_t axis_mul_after = 1; + + if (axis < 0) { + axis = input->ndim + axis; + } + + // Currently only supports input dtype to be float32. + CHECK_EQ(dtype.code, 2) << "Currently only supports input dtype " + "to be float32."; + CHECK_EQ(dtype.bits, 32) << "Currently only supports input dtype " + "to be float32."; + CHECK_LT(axis, input->ndim) << "Axis out of boundary for " + "input ndim " << input->ndim; + + for (int i = 0; i < input->ndim; ++i) { + if (i < axis) { + axis_mul_before *= input->shape[i]; + } else if (i > axis) { + axis_mul_after *= input->shape[i]; + } + } + + int32_t current_sort_num = input->shape[axis]; + for (int64_t i = 0 ; i < axis_mul_before; ++i) { + for (int64_t j = 0 ; j < axis_mul_after; ++j) { + sorter.clear(); + int64_t base_idx = i * input->shape[axis] * axis_mul_after + j; + for (int64_t k = 0; k < current_sort_num; ++k) { + int64_t full_idx = base_idx + k * axis_mul_after; + sorter.emplace_back(std::make_pair(k, *(data_ptr + full_idx))); + } + if (is_ascend) { + std::stable_sort(sorter.begin(), sorter.end(), CompareAscend); + } else { + std::stable_sort(sorter.begin(), sorter.end(), CompareDescend); + } + for (int32_t k = 0; k < input->shape[axis]; ++k) { + *(static_cast(output->data) + base_idx + k * axis_mul_after) + = k < static_cast(sorter.size()) ? 
sorter[k].first : k; + } + } + } +}); + } // namespace contrib } // namespace tvm diff --git a/src/relay/op/algorithm/sort.cc b/src/relay/op/algorithm/sort.cc new file mode 100644 index 000000000000..5777b79699b1 --- /dev/null +++ b/src/relay/op/algorithm/sort.cc @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file nms.cc + * \brief Non-maximum suppression operators + */ +#include +#include + +namespace tvm { +namespace relay { + +TVM_REGISTER_NODE_TYPE(ArgsortAttrs); + +bool ArgsortRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + // `types` contains: [data, result] + const ArgsortAttrs* param = attrs.as(); + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + if (data == nullptr) { + CHECK(types[0].as()) + << "Argsort: expect input type to be TensorType but get " + << types[0]; + return false; + } + CHECK_EQ(param->dtype, Float(32)); + reporter->Assign(types[1], TensorTypeNode::make(data->shape, param->dtype)); + return true; +} + +Expr MakeArgsort(Expr data, + int axis, + bool is_ascend, + DataType dtype) { + auto attrs = make_node(); + attrs->axis = axis; + attrs->is_ascend = is_ascend; + attrs->dtype = dtype; + static const Op& op = Op::Get("argsort"); + return CallNode::make(op, {data}, Attrs(attrs), {}); +} + + +TVM_REGISTER_API("relay.op._make.argsort") +.set_body_typed(MakeArgsort); + +RELAY_REGISTER_OP("argsort") +.describe(R"doc(Returns the indices that would sort an +input array along the given axis. 
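+
+A minimal Python-level sketch (relay.argsort is the frontend wrapper for
+this op; the shape below is illustrative only):
+
+  x = relay.var("x", relay.TensorType((3, 5), "float32"))
+  y = relay.argsort(x, axis=-1, is_ascend=True)  # indices, same shape as x
+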
+)doc" TVM_ADD_FILELINE) +.set_num_inputs(1) +.set_attrs_type_key("relay.attrs.ArgsortAttrs") +.add_argument("data", "Tensor", "Input data.") +.set_support_level(6) +.add_type_rel("Argsort", ArgsortRel); +} // namespace relay +} // namespace tvm diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc index 5344bce3d641..2e5661cdc4dc 100644 --- a/src/relay/op/vision/nms.cc +++ b/src/relay/op/vision/nms.cc @@ -106,6 +106,8 @@ Expr MakeNMS(Expr data, double iou_threshold, bool force_suppress, int top_k, + int coord_start, + int score_index, int id_index, bool return_indices, bool invalid_to_bottom) { @@ -114,6 +116,8 @@ Expr MakeNMS(Expr data, attrs->iou_threshold = iou_threshold; attrs->force_suppress = force_suppress; attrs->top_k = top_k; + attrs->coord_start = coord_start; + attrs->score_index = score_index; attrs->id_index = id_index; attrs->return_indices = return_indices; attrs->invalid_to_bottom = invalid_to_bottom; diff --git a/tests/python/contrib/test_sort.py b/tests/python/contrib/test_sort.py index 856d3fa9cf83..87cdac01ce3a 100644 --- a/tests/python/contrib/test_sort.py +++ b/tests/python/contrib/test_sort.py @@ -24,11 +24,11 @@ def test_sort(): data = tvm.placeholder((n, l, m), name='data') sort_num = tvm.placeholder((n, m), name="sort_num", dtype="int32") axis = 1 - is_descend = True + is_ascend = False out = tvm.extern(data.shape, [data, sort_num], lambda ins, outs: tvm.call_packed( - "tvm.contrib.sort.argsort", ins[0], - ins[1], outs[0], axis, is_descend), + "tvm.contrib.sort.argsort_nms", ins[0], + ins[1], outs[0], axis, is_ascend), dtype='int32', name="sort_tensor") input = [[[1, 2, 3], [2, 4.5, 3.5], [1.1, 0.5, 1], [3.2, -5, 0.5], [1.5, 0, 0]], [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]]] @@ -50,13 +50,13 @@ def test_sort_np(): dshape = (1, 2, 3, 4, 5, 6) axis = 4 reduced_shape = (1, 2, 3, 4, 6) - is_descend = False + is_ascend = True data = tvm.placeholder(dshape, name='data') sort_num = tvm.placeholder(reduced_shape, name="sort_num", dtype="int32") out = tvm.extern(data.shape, [data, sort_num], lambda ins, outs: tvm.call_packed( - "tvm.contrib.sort.argsort", ins[0], - ins[1], outs[0], axis, is_descend), + "tvm.contrib.sort.argsort_nms", ins[0], + ins[1], outs[0], axis, is_ascend), dtype='int32', name="sort_tensor") ctx = tvm.cpu(0) diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 7e1c37169978..e6d99c765c87 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -177,12 +177,13 @@ def verify_get_valid_counts(dshape, score_threshold): assert "score_threshold" in z.astext() func = relay.Function([x], z.astuple()) func = relay.ir_pass.infer_type(func) - ctx_list = [("llvm", tvm.cpu(0))] - for target, ctx in ctx_list: + for target, ctx in ctx_list(): + if target == 'cuda': + return intrp = relay.create_executor("debug", ctx=ctx, target=target) out = intrp.evaluate(func)(np_data) - tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3) - tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3) + tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04) + tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04) verify_get_valid_counts((1, 2500, 6), 0) verify_get_valid_counts((1, 2500, 6), -1) @@ -195,9 +196,13 @@ def verify_nms(x0_data, x1_data, dshape, ref_res, ref_indices_res, iou_threshold=0.5, force_suppress=False, top_k=-1, check_type_only=False): x0 = relay.var("x0", relay.ty.TensorType(dshape, 
"float32")) - x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int")) - z = relay.vision.non_max_suppression(x0, x1, -1, iou_threshold, force_suppress, top_k, return_indices=False) - z_indices = relay.vision.non_max_suppression(x0, x1, -1, iou_threshold, force_suppress, top_k) + x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int32")) + z = relay.vision.non_max_suppression(x0, x1, max_output_size = -1, \ + iou_threshold = iou_threshold, force_suppress = force_suppress, \ + top_k = top_k, return_indices=False) + z_indices = relay.vision.non_max_suppression(x0, x1, max_output_size = -1, \ + iou_threshold = iou_threshold, force_suppress = force_suppress, \ + top_k = top_k) assert "iou_threshold" in z.astext() assert "iou_threshold" in z_indices.astext() zz = relay.ir_pass.infer_type(z) @@ -212,8 +217,7 @@ def verify_nms(x0_data, x1_data, dshape, ref_res, ref_indices_res, func = relay.ir_pass.infer_type(func) func_indices = relay.Function([x0, x1], z_indices) func_indices = relay.ir_pass.infer_type(func_indices) - ctx_list = [("llvm", tvm.cpu(0))] - for target, ctx in ctx_list: + for target, ctx in ctx_list(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x0_data, x1_data) op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data) @@ -296,8 +300,7 @@ def test_default_value(): nms = relay.vision.non_max_suppression(mtl[0], mtl[1], return_indices=False) func = relay.Function([cls_prob, loc_pred, anchors], nms) func = relay.ir_pass.infer_type(func) - ctx_list = [("llvm", tvm.cpu(0))] - for target, ctx in ctx_list: + for target, ctx in ctx_list(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(np_cls_prob, np_loc_preds, np_anchors) diff --git a/tests/python/relay/test_op_level6.py b/tests/python/relay/test_op_level6.py new file mode 100644 index 000000000000..983a9154df34 --- /dev/null +++ b/tests/python/relay/test_op_level6.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" Support level6 operator test cases. 
+""" +import math +import numpy as np +import tvm +from tvm import relay +from tvm.relay.testing import ctx_list +import topi.testing + +def test_argsort(): + def verify_argsort(shape, axis, is_ascend): + x = relay.var("x", relay.TensorType(shape, "float32")) + z = relay.argsort(x, axis=axis, is_ascend=is_ascend) + zz = relay.ir_pass.infer_type(z) + func = relay.Function([x], z) + x_data = np.random.uniform(size=shape).astype("float32") + if is_ascend: + ref_res = np.argsort(x_data, axis=axis) + else: + ref_res = np.argsort(-x_data, axis=axis) + + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.astype("float"), rtol=1e-5) + verify_argsort((2, 3, 4), axis=0, is_ascend=False) + verify_argsort((1, 4, 6), axis=1, is_ascend=True) + verify_argsort((3, 5, 6), axis=-1, is_ascend=False) + + +if __name__ == "__main__": + test_argsort() diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py index 2eb460d151ae..a9984148d5d3 100644 --- a/topi/python/topi/__init__.py +++ b/topi/python/topi/__init__.py @@ -21,6 +21,7 @@ from .reduction import * from .transform import * from .broadcast import * +from .sort import * from . import nn from . import x86 from . import cuda diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index e6377fa40c52..5d04d72a7eca 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -20,77 +20,380 @@ import tvm from tvm import api -from topi.vision import non_max_suppression -from ..util import get_const_tuple +from tvm.generic import cast +from tvm.intrin import if_then_else, log, power +from topi.vision import non_max_suppression, get_valid_counts +from .sort import argsort -def sort_ir(data, index, output): - """Low level IR to do sorting on the GPU, same usage as tvm.contrib.sort.argsort on the CPU. + +def get_valid_counts_pre(data, flag, idx, score_threshold): + """Low level IR to Prepare get valid count of bounding boxes + given a score threshold. Also moves valid boxes to the + top of input data. Parameters ---------- data: Buffer - 2D Buffer of input boxes' score with shape [batch_size, num_anchors]. + 3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms. + + flag : Buffer + 2D Buffer of flag indicating valid data with shape [batch_size, num_anchors]. - index : Buffer - 1D Buffer of number of valid number of boxes. + idx : Buffer + 2D Buffer of valid data indices with shape [batch_size, num_anchors]. - output : Buffer - 2D Output buffer of indicies of sorted tensor with shape [batch_size, num_anchors]. + score_threshold : float32 + Lower limit of score for valid bounding boxes. Returns ------- stmt : Stmt The result IR statement. 
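+
+    Each box contributes flag = idx = 1 when its score data[i, j, 1]
+    exceeds score_threshold and 0 otherwise; the prefix-sum phases that
+    follow turn idx into each valid box's position in the compacted output.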
""" + batch_size = data.shape[0] + num_anchors = data.shape[1] + box_data_length = data.shape[2] + + ib = tvm.ir_builder.create() + + data = ib.buffer_ptr(data) + flag = ib.buffer_ptr(flag) + idx = ib.buffer_ptr(idx) + score_threshold = tvm.make.node("FloatImm", dtype="float32", value=score_threshold) - assert data.dtype == "float32", "Currently only supports input dtype to be float32" - batch, num_anchors = get_const_tuple(data.shape) max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads) + nthread_tx = max_threads + nthread_bx = batch_size * num_anchors // max_threads + 1 + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + + with ib.if_scope(tid < batch_size * num_anchors): + with ib.if_scope(data[tid * box_data_length + 1] > score_threshold): + flag[tid] = 1 + idx[tid] = 1 + with ib.else_scope(): + flag[tid] = 0 + idx[tid] = 0 + + return ib.get() + +def get_valid_counts_upsweep(data, idx_in, idx, partial): + """Low level IR of first step of scan: unsweep. + + Parameters + ---------- + data: Buffer + 3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms. + + idx_in : Buffer + 2D Buffer of valid data indices with shape [batch_size, num_anchors]. + + idx : Buffer + 2D Buffer of valid data indices with shape [batch_size, num_anchors]. + + partial : Buffer + 2D Buffer of valid data indices with shape [batch_size, new_range]. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + batch_size = data.shape[0] + num_anchors = data.shape[1] ib = tvm.ir_builder.create() - p_data = ib.buffer_ptr(data) - p_index = ib.buffer_ptr(index) - p_out = ib.buffer_ptr(output) + data = ib.buffer_ptr(data) + idx_in = ib.buffer_ptr(idx_in) + idx = ib.buffer_ptr(idx) + partial = ib.buffer_ptr(partial) + max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads) + elem_per_thread = num_anchors // max_threads + 1 nthread_tx = max_threads - nthread_bx = num_anchors // max_threads + 1 + nthread_bx = batch_size tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("vthread") + bx = tvm.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) - ib.scope_attr(bx, "virtual_thread", nthread_bx) - tid = bx * nthread_tx + tx - temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local") - temp_index = ib.allocate("int32", (1,), name="temp_index", scope="local") - - with ib.for_range(0, batch, for_type="unroll") as b: - start = b * num_anchors - with ib.if_scope(tid < num_anchors): - p_out[start + tid] = tid - # OddEvenTransposeSort - with ib.for_range(0, p_index[b]) as k: - with ib.if_scope(tid < (p_index[b] + 1) // 2): - offset = start + 2 * tid + (k % 2) - with ib.if_scope( \ - tvm.all(offset + 1 < p_index[0], p_data[offset] < p_data[offset + 1])): - temp_data[0] = p_data[offset] - p_data[offset] = p_data[offset + 1] - p_data[offset + 1] = temp_data[0] - temp_index[0] = p_out[offset] - p_out[offset] = p_out[offset + 1] - p_out[offset + 1] = temp_index[0] + ib.scope_attr(bx, "thread_extent", nthread_bx) + new_range = num_anchors // elem_per_thread + 1 + # Scan: Upsweep: + with ib.if_scope(tvm.all(bx < batch_size, tx < new_range)): + with ib.for_range(0, elem_per_thread) as i: + with ib.if_scope(bx * num_anchors + \ + tx * elem_per_thread + i < batch_size * num_anchors): + with ib.if_scope(i == 0): + partial[bx * new_range + tx] = idx_in[bx * num_anchors + tx * 
elem_per_thread] + idx[bx * num_anchors + tx * elem_per_thread] = \ + idx_in[bx * num_anchors + tx * elem_per_thread] + with ib.else_scope(): + partial[bx * new_range + tx] += \ + idx_in[bx * num_anchors + tx * elem_per_thread + i] + idx[bx * num_anchors + tx * elem_per_thread + i] = \ + idx[bx * num_anchors + tx * elem_per_thread + i - 1] + \ + idx_in[bx * num_anchors + tx * elem_per_thread + i] + return ib.get() + +def get_valid_counts_scan(data, partial_in, partial): + """Low level IR to do scan. + + Parameters + ---------- + data: Buffer + 3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms. + + idx_in : Buffer + 2D Buffer of valid data indices with shape [batch_size, num_anchors]. + + idx : Buffer + 2D Buffer of valid data indices with shape [batch_size, num_anchors]. + + partial : Buffer + 2D Buffer of valid data indices with shape [batch_size, new_range]. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + ib = tvm.ir_builder.create() + partial_in = ib.buffer_ptr(partial_in) + partial = ib.buffer_ptr(partial) + max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads) + elem_per_thread = num_anchors // max_threads + 1 + nthread_tx = max_threads + nthread_bx = batch_size + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + var = tvm.make.node("FloatImm", dtype="float32", value=2) + new_range = num_anchors // elem_per_thread + 1 + iteration = log(cast(new_range, "float32")) // math.log(2) + # Scan: Kogge-Stone adder + with ib.if_scope(tvm.all(bx < batch_size, tx < tvm.min(new_range, num_anchors))): + with ib.for_range(0, iteration) as k: + with ib.if_scope(k == 0): + with ib.if_scope(tvm.all(tx > 0, tx < tvm.min(new_range, num_anchors))): + partial[bx * new_range + tx] = \ + partial_in[bx * new_range + tx] + partial_in[bx * new_range + tx - 1] + with ib.else_scope(): + partial[bx * new_range] = partial_in[bx * new_range] + with ib.else_scope(): + with ib.if_scope(tvm.all(tx >= cast(power(var, k), "int32"), \ + tx < tvm.min(new_range, num_anchors))): + partial[bx * new_range + tx] += \ + partial[bx * new_range + tx - cast(power(var, k), "int32")] ib.emit(tvm.make.Call(None, 'tvm_storage_sync', tvm.convert(['shared']), tvm.expr.Call.Intrinsic, None, 0)) + return ib.get() + +def get_valid_counts_downsweep(data, idx_in, partial, idx): + """Low level IR to do downsweep of scan. + + Parameters + ---------- + data: Buffer + 3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms. + + idx_in : Buffer + 2D Buffer of valid data indices with shape [batch_size, num_anchors]. + + partial : Buffer + 2D Buffer of valid data indices with shape [batch_size, new_range]. + + idx : Buffer + 2D Buffer of valid data indices with shape [batch_size, num_anchors]. + + Returns + ------- + stmt : Stmt + The result IR statement. 
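+
+    Taken together, the upsweep/scan/downsweep phases compute an inclusive
+    prefix sum of the per-box valid flags within each batch row. A NumPy
+    reference sketch of the combined result for one row (names are
+    illustrative only; scores is that row's score vector):
+
+        flags = (scores > score_threshold).astype("int32")
+        idx = np.cumsum(flags)     # 1-based output slot of each valid box
+        valid_count = idx[-1]      # number of valid boxes in the row
+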
+ """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + ib = tvm.ir_builder.create() + idx_in = ib.buffer_ptr(idx_in) + idx = ib.buffer_ptr(idx) + partial = ib.buffer_ptr(partial) + max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads) + elem_per_thread = num_anchors // max_threads + 1 + nthread_tx = max_threads + nthread_bx = batch_size * num_anchors // max_threads + 1 + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + new_range = num_anchors // elem_per_thread + 1 + # Scan: Downsweep: + with ib. if_scope(tid < batch_size * num_anchors): + i = tid / num_anchors # number of batches + j = tid % num_anchors # number of anchors + with ib.if_scope(j < elem_per_thread): + idx[tid] = idx_in[tid] + with ib.else_scope(): + idx[tid] = idx_in[tid] + partial[i * new_range + j // elem_per_thread - 1] return ib.get() -def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, nms_topk): +def get_valid_counts_ir(data, flag, idx, valid_count, out): + """Low level IR to get valid count of bounding boxes + given a score threshold. Also moves valid boxes to the + top of input data. + + Parameters + ---------- + data : Buffer + Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length]. + + flag : Buffer + 2D Buffer of flag indicating valid data with shape [batch_size, num_anchors]. + + idx : Buffer + 2D Buffer of valid data indices with shape [batch_size, num_anchors]. + + valid_count : Buffer + 1-D buffer for valid number of boxes. + + out : Buffer + Rearranged data buffer. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + elem_length = data.shape[2] + size = batch_size * num_anchors * elem_length + + ib = tvm.ir_builder.create() + + data = ib.buffer_ptr(data) + flag = ib.buffer_ptr(flag) + idx = ib.buffer_ptr(idx) + valid_count = ib.buffer_ptr(valid_count) + out = ib.buffer_ptr(out) + + max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads) + nthread_tx = max_threads + nthread_bx = batch_size * num_anchors * elem_length // max_threads + 1 + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + + with ib.if_scope(tid < batch_size * num_anchors): + i = tid / num_anchors + j = tid % num_anchors + base_idx = i * num_anchors * elem_length + with ib.if_scope(flag[tid] > 0): + with ib.for_range(0, elem_length) as k: + with ib.if_scope(base_idx + (idx[tid] - 1) * elem_length + k < size): + out[base_idx + (idx[tid] - 1) * elem_length + k] =\ + data[base_idx + j * elem_length + k] + with ib.if_scope(j == 0): + valid_count[i] = idx[tid + num_anchors - 1] + with ib.if_scope(j >= idx[i * num_anchors + num_anchors - 1]): + with ib.for_range(0, elem_length) as l: + with ib.if_scope(tid * elem_length + l < size): + out[tid * elem_length + l] = -1.0 + return ib.get() + + +@get_valid_counts.register(["cuda", "gpu"]) +def get_valid_counts_gpu(data, score_threshold=0): + """Get valid count of bounding boxes given a score threshold. + Also moves valid boxes to the top of input data. + + Parameters + ---------- + data : tvm.Tensor + Input data. 3-D tensor with shape [batch_size, num_anchors, elem_length]. 
+ + score_threshold : optional, float + Lower limit of score for valid bounding boxes. + + Returns + ------- + valid_count : tvm.Tensor + 1-D tensor for valid number of boxes. + + out_tensor : tvm.Tensor + Rearranged data tensor. + """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads) + elem_per_thread = num_anchors // max_threads + 1 + new_range = num_anchors // elem_per_thread + 1 + temp_flag_buf = api.decl_buffer( + (batch_size, num_anchors,), "int32", "temp_flag", data_alignment=8) + temp_idx_buf = api.decl_buffer( + (batch_size, num_anchors,), "int32", "temp_idx", data_alignment=8) + temp_partial_buf = api.decl_buffer( + (batch_size, new_range), "int32", "temp_partial", data_alignment=8) + data_buf = api.decl_buffer( + data.shape, data.dtype, "data_buf", data_alignment=8) + + temp_flag, temp_idx = \ + tvm.extern([(batch_size, num_anchors,), (batch_size, num_anchors,)], [data], + lambda ins, outs: get_valid_counts_pre( + ins[0], outs[0], outs[1], score_threshold), + dtype=["int32", "int32"], + out_buffers=[temp_flag_buf, temp_idx_buf], + name="get_valid_counts_phase_one") + temp_idx_new, temp_partial = \ + tvm.extern([(batch_size, num_anchors,), (batch_size, new_range)], [data, temp_idx], + lambda ins, outs: get_valid_counts_upsweep( + ins[0], ins[1], outs[0], outs[1]), + dtype=["int32", "int32"], + out_buffers=[temp_idx_buf, temp_partial_buf], + name="get_valid_counts_phase_two") + temp_partial_new = \ + tvm.extern([(batch_size, new_range)], [data, temp_partial], + lambda ins, outs: get_valid_counts_scan( + ins[0], ins[1], outs[0]), + dtype=["int32"], + out_buffers=[temp_partial_buf], + name="get_valid_counts_phase_three") + temp_idx_final = \ + tvm.extern([(batch_size, num_anchors)], [data, temp_idx_new, temp_partial_new], + lambda ins, outs: get_valid_counts_downsweep( + ins[0], ins[1], ins[2], outs[0]), + dtype=["int32"], + out_buffers=[temp_idx_buf], + name="get_valid_counts_phase_four") + valid_count, out_tensor = \ + tvm.extern([(batch_size,), data.shape], [data, temp_flag, temp_idx_final], + lambda ins, outs: get_valid_counts_ir( + ins[0], ins[1], ins[2], outs[0], outs[1]), + dtype=["int32", data.dtype], + in_buffers=[data_buf, temp_flag_buf, temp_idx_buf], + name="get_valid_counts_phase_five", + tag="get_valid_counts_gpu") + + return [valid_count, out_tensor] + + +def nms_ir(data, sorted_index, valid_count, out, box_indices, + max_output_size, iou_threshold, force_suppress, + top_k, coord_start, id_index): """Low level IR routing for transform location in multibox_detection operator. Parameters ---------- - data: Buffer + data : Buffer Buffer of output boxes with class and score. - sort_result : Buffer + sort_index : Buffer Buffer of output box indexes sorted by score. valid_count : Buffer @@ -99,15 +402,25 @@ def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, n out : Buffer Output buffer. - nms_threshold : float - Non-maximum suppression threshold. + max_output_size : int + Max number of output valid boxes for each instance. + By default all valid boxes are returned. + + iou_threshold : float + Overlapping(IoU) threshold to suppress object with smaller score. force_suppress : boolean Whether to suppress all detections regardless of class_id. - nms_topk : int + top_k : int Keep maximum top k detections before nms, -1 for no limit. + coord_start : int + Start index of the consecutive 4 coordinates. 
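+        E.g. 2 for the common [id, score, x1, y1, x2, y2] box layout.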
+ + id_index : int + index of the class categories, -1 to disable. + Returns ------- stmt : Stmt @@ -127,100 +440,232 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i return tvm.expr.Select(u <= 0.0, 0.0, i / u) + batch_size = data.shape[0] + num_anchors = data.shape[1] + box_data_length = data.shape[2] + + ib = tvm.ir_builder.create() + + data = ib.buffer_ptr(data) + sorted_index = ib.buffer_ptr(sorted_index) + valid_count = ib.buffer_ptr(valid_count) + out = ib.buffer_ptr(out) + box_indices = ib.buffer_ptr(box_indices) + num_valid_boxes = ib.allocate("int32", (1,), name="num_valid_boxes", scope="local") + max_threads = int(math.sqrt( tvm.target.current_target(allow_none=False).max_num_threads)) - ib = tvm.ir_builder.create() - p_data = ib.buffer_ptr(data) - p_sort_result = ib.buffer_ptr(sort_result) - p_valid_count = ib.buffer_ptr(valid_count) - p_out = ib.buffer_ptr(out) - batch_size = out.shape[0] - num_anchors = out.shape[1] nthread_tx = max_threads nthread_bx = num_anchors // max_threads + 1 tx = tvm.thread_axis("threadIdx.x") bx = tvm.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) - i = bx * max_threads + tx - - nms_threshold_node = tvm.make.node( - "FloatImm", dtype="float32", value=nms_threshold) - nms_topk_node = tvm.make.node("IntImm", dtype="int32", value=nms_topk) - force_suppress_node = tvm.make.node( - "IntImm", dtype="int32", value=1 if force_suppress else 0) - with ib.for_range(0, batch_size, for_type="unroll") as b: - base_idx = b * num_anchors * 6 - with ib.if_scope( \ - tvm.all(nms_threshold_node > 0, nms_threshold_node < 1, - p_valid_count[0] > 0)): + k = bx * max_threads + tx + + iou_threshold = tvm.make.node("FloatImm", dtype="float32", value=iou_threshold) + top_k = tvm.make.node("IntImm", dtype="int32", value=top_k) + coord_start = tvm.make.node("IntImm", dtype="int32", value=coord_start) + id_index = tvm.make.node("IntImm", dtype="int32", value=id_index) + force_suppress = tvm.make.node("IntImm", dtype="int32", value=1 if force_suppress else 0) + + with ib.for_range(0, batch_size, for_type="unroll") as i: + base_idx = i * num_anchors * box_data_length + with ib.if_scope(tvm.all(iou_threshold > 0, valid_count[i] > 0)): # Reorder output - nkeep = tvm.if_then_else( \ - tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[b]), - nms_topk, p_valid_count[b]) - with ib.for_range(0, nkeep) as l: - with ib.if_scope(i < 6): - p_out[(base_idx + l * 6 + i)] = \ - p_data[(base_idx + p_sort_result[b * num_anchors + l] * 6 + i)] - with ib.if_scope(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[b])): - with ib.for_range(0, p_valid_count[b] - nkeep) as l: - with ib.if_scope(i < 6): - p_out[(base_idx + (l + nkeep) * 6 + i)] = -1.0 + nkeep = if_then_else( \ + tvm.all(top_k > 0, top_k < valid_count[i]), + top_k, valid_count[i]) + with ib.for_range(0, nkeep) as j: + with ib.if_scope(k < box_data_length): + out[(base_idx + j * box_data_length + k)] = \ + data[(base_idx + sorted_index[i * num_anchors + j] \ + * box_data_length + k)] + box_indices[i * num_anchors + j] = sorted_index[i * num_anchors + j] + with ib.if_scope(tvm.all(top_k > 0, top_k < valid_count[i])): + with ib.for_range(0, valid_count[i] - nkeep) as j: + with ib.if_scope(k < box_data_length): + out[(base_idx + (j + nkeep) * box_data_length + k)] = -1.0 + box_indices[i * num_anchors + (j + nkeep)] = -1 # Apply nms - with ib.for_range(0, p_valid_count[b]) as l: - offset_l = l * 
6 - with ib.if_scope(p_out[base_idx + offset_l] >= 0): - with ib.if_scope(i < p_valid_count[b]): - offset_i = i * 6 - with ib.if_scope(tvm.all(i > l, p_out[base_idx - + offset_i] >= 0)): - with ib.if_scope(tvm.any(force_suppress_node > 0, - p_out[base_idx + offset_l] == - p_out[base_idx + offset_i])): - # When force_suppress == True or class_id equals - iou = calculate_overlap(p_out, base_idx + offset_l + 2, - base_idx + offset_i + 2) - with ib.if_scope(iou >= nms_threshold): - p_out[base_idx + offset_i] = -1.0 + with ib.for_range(0, valid_count[i]) as j: + offset_j = j * box_data_length + with ib.if_scope(out[base_idx + offset_j] >= 0): + with ib.if_scope(k < valid_count[i]): + offset_k = k * box_data_length + with ib.if_scope(tvm.all(k > j, out[base_idx + offset_k] >= 0, \ + tvm.any(force_suppress > 0, id_index < 0, \ + out[base_idx + offset_j] == \ + out[base_idx + offset_k]))): + iou = calculate_overlap(out, base_idx + offset_k + coord_start, + base_idx + offset_j + coord_start) + with ib.if_scope(iou >= iou_threshold): + out[base_idx + offset_k] = -1.0 + box_indices[i * num_anchors + k] = -1 ib.emit(tvm.make.Call(None, 'tvm_storage_sync', tvm.convert(['shared']), tvm.expr.Call.Intrinsic, None, 0)) with ib.else_scope(): - with ib.for_range(0, p_valid_count[b]) as c: - with ib.if_scope(i < 6): - p_out[(base_idx + c * 6 + i)] = p_data[base_idx + c * 6 + i] + with ib.for_range(0, valid_count[i]) as j: + offset_j = j * box_data_length + with ib.if_scope(k < box_data_length): + out[(base_idx + offset_j + k)] = data[base_idx + offset_j + k] + box_indices[i * num_anchors + j] = j # Set invalid entry to be -1 - with ib.for_range(0, num_anchors - p_valid_count[b]) as c: - with ib.if_scope(i < 6): - p_out[base_idx + (c + p_valid_count[b]) * 6 + i] = -1.0 - body = ib.get() - return body + with ib.for_range(0, num_anchors - valid_count[i]) as j: + with ib.if_scope(k < box_data_length): + out[base_idx + (j + valid_count[i]) * box_data_length + k] = -1.0 + box_indices[i * num_anchors + j + valid_count[i]] = -1 + # Only return max_output_size number of valid boxes + num_valid_boxes[0] = 0 + with ib.if_scope(max_output_size > 0): + with ib.for_range(0, valid_count[i]) as j: + offset_j = j * box_data_length + with ib.if_scope(out[base_idx + offset_j] >= 0): + with ib.if_scope(num_valid_boxes[0] == max_output_size): + with ib.if_scope(k < box_data_length): + out[base_idx + offset_j + k] = -1.0 + box_indices[i * num_anchors + j] = -1 + with ib.else_scope(): + num_valid_boxes[0] += 1 + ib.emit(tvm.make.Call(None, 'tvm_storage_sync', + tvm.convert(['shared']), + tvm.expr.Call.Intrinsic, None, 0)) + + return ib.get() + + +def invalid_to_bottom_pre(data, flag, idx): + """Low level IR to rearrange nms output to move all valid entries to top. + + Parameters + ---------- + data: Buffer + 3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms. + + flag : Buffer + 1D Buffer of flag indicating valid data with [num_anchors]. + + idx : Buffer + 1D Buffer of valid data indices with [num_anchors]. + + Returns + ------- + stmt : Stmt + The result IR statement. 
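# The suppression loop in nms_ir above, modeled sequentially in NumPy; the
# GPU version parallelizes the inner k loop across threads. Function and
# variable names here are illustrative only.
import numpy as np

def nms_reference(boxes, iou_threshold, force_suppress, id_index, coord_start):
    # boxes is one batch, already sorted by descending score; the class id is
    # assumed to sit in column 0 and is set to -1 to mark suppression
    def iou(a, b):
        tl = np.maximum(a[coord_start:coord_start + 2], b[coord_start:coord_start + 2])
        br = np.minimum(a[coord_start + 2:coord_start + 4], b[coord_start + 2:coord_start + 4])
        inter = np.prod(np.maximum(br - tl, 0.0))
        union = np.prod(a[coord_start + 2:coord_start + 4] - a[coord_start:coord_start + 2]) + \
                np.prod(b[coord_start + 2:coord_start + 4] - b[coord_start:coord_start + 2]) - inter
        return 0.0 if union <= 0.0 else inter / union

    for j in range(len(boxes)):
        if boxes[j, 0] < 0:
            continue                       # j itself was already suppressed
        for k in range(j + 1, len(boxes)):
            same_class = force_suppress or id_index < 0 or boxes[j, 0] == boxes[k, 0]
            if boxes[k, 0] >= 0 and same_class and iou(boxes[j], boxes[k]) >= iou_threshold:
                boxes[k, 0] = -1.0         # suppress the lower-scored box
    return boxes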
+ """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + elem_length = data.shape[2] + + ib = tvm.ir_builder.create() + + data = ib.buffer_ptr(data) + flag = ib.buffer_ptr(flag) + idx = ib.buffer_ptr(idx) + + max_threads = int(math.sqrt( + tvm.target.current_target(allow_none=False).max_num_threads)) + nthread_tx = max_threads + nthread_bx = num_anchors // max_threads + 1 + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + j = bx * max_threads + tx + + with ib.for_range(0, batch_size, for_type="unroll") as i: + base_idx = i * num_anchors * elem_length + with ib.if_scope(j < num_anchors): + with ib.if_scope(data[base_idx + j * elem_length] >= 0): + flag[i * num_anchors + j] = 1 + idx[i * num_anchors + j] = 1 + with ib.else_scope(): + flag[i * num_anchors + j] = 0 + idx[i * num_anchors + j] = 0 + + with ib.if_scope(j < batch_size): + with ib.for_range(0, num_anchors) as k: + with ib.if_scope(k > 0): + idx[j * num_anchors + k] += idx[j * num_anchors + k - 1] + return ib.get() + + +def invalid_to_bottom_ir(data, flag, idx, out): + """Low level IR to rearrange nms output to move all valid entries to top. + + Parameters + ---------- + data: Buffer + 3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms. + + flag : Buffer + 1D Buffer of flag indicating valid data with [num_anchors]. + + idx : Buffer + 1D Buffer of valid data indices with [num_anchors]. + + out : Buffer + 3D Buffer of rearranged nms output with shape [batch_size, num_anchors, elem_length]. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + elem_length = data.shape[2] + + ib = tvm.ir_builder.create() + + data = ib.buffer_ptr(data) + flag = ib.buffer_ptr(flag) + idx = ib.buffer_ptr(idx) + out = ib.buffer_ptr(out) + + max_threads = int(math.sqrt( + tvm.target.current_target(allow_none=False).max_num_threads)) + nthread_tx = max_threads + nthread_bx = num_anchors // max_threads + 1 + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + j = bx * max_threads + tx + + with ib.for_range(0, batch_size, for_type="unroll") as i: + base_idx = i * num_anchors * elem_length + with ib.if_scope(j < num_anchors): + with ib.for_range(0, elem_length) as k: + out[base_idx + j * elem_length + k] = -1.0 + with ib.if_scope(flag[i * num_anchors + j] > 0): + with ib.for_range(0, elem_length) as k: + out[base_idx + (idx[i * num_anchors + j] - 1) * elem_length + k] \ + = data[base_idx + j * elem_length + k] + return ib.get() @non_max_suppression.register(["cuda", "gpu"]) -def nms_gpu(data, - valid_count, - max_output_size=-1, - iou_threshold=0.5, - force_suppress=False, - top_k=-1, - id_index=0, - return_indices=True, - invalid_to_bottom=False): +def non_max_suppression_gpu(data, valid_count, max_output_size=-1, + iou_threshold=0.5, force_suppress=False, top_k=-1, + coord_start=2, score_index=1, id_index=0, + return_indices=True, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. Parameters ---------- data : tvm.Tensor - 3-D tensor with shape [batch_size, num_anchors, 6]. + 3-D tensor with shape [batch_size, num_anchors, elem_length]. The last dimension should be in format of [class_id, score, box_left, box_top, box_right, box_bottom]. 
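# invalid_to_bottom_pre and invalid_to_bottom_ir above are the same
# flag / prefix-sum / scatter pattern once more; in NumPy terms
# (illustrative sketch, the function name is not part of the patch):
import numpy as np

def invalid_to_bottom_reference(nms_out):
    rearranged = np.full(nms_out.shape, -1.0, dtype=nms_out.dtype)
    for i in range(nms_out.shape[0]):
        valid = nms_out[i, :, 0] >= 0                 # pre: per-box flag
        idx = np.cumsum(valid.astype("int32"))        # pre: running sum
        rearranged[i, :idx[-1]] = nms_out[i, valid]   # ir: scatter to top
    return rearranged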
valid_count : tvm.Tensor 1-D tensor for valid number of boxes. - return_indices : boolean - Whether to return box indices in input data. + max_output_size : optional, int + Max number of output valid boxes for each instance. + By default all valid boxes are returned. iou_threshold : optional, float Non-maximum suppression threshold. @@ -231,16 +676,25 @@ def nms_gpu(data, top_k : optional, int Keep maximum top k detections before nms, -1 for no limit. + coord_start : required, int + Start index of the consecutive 4 coordinates. + + score_index : optional, int + Index of the scores/confidence of boxes. + id_index : optional, int index of the class categories, -1 to disable. + return_indices : boolean + Whether to return box indices in input data. + invalid_to_bottom : optional, boolean Whether to move all valid bounding boxes to the top. Returns ------- out : tvm.Tensor - 3-D tensor with shape [batch_size, num_anchors, 6]. + 3-D tensor with shape [batch_size, num_anchors, elem_length]. Example -------- @@ -253,12 +707,13 @@ def nms_gpu(data, iou_threshold = 0.7 force_suppress = True top_k = -1 - out = nms(data, valid_count, iou_threshold, force_suppress, topk) + out = non_max_suppression(data=data, valid_count=valid_count, iou_threshold=iou_threshold, + force_suppress=force_supress, top_k=top_k, return_indices=False) np_data = np.random.uniform(dshape) np_valid_count = np.array([4]) s = topi.generic.schedule_nms(out) - f = tvm.build(s, [data, valid_count, out], "llvm") - ctx = tvm.cpu() + f = tvm.build(s, [data, valid_count, out], "cuda") + ctx = tvm.gpu(0) tvm_data = tvm.nd.array(np_data, ctx) tvm_valid_count = tvm.nd.array(np_valid_count, ctx) tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) @@ -266,38 +721,62 @@ def nms_gpu(data, """ batch_size = data.shape[0] num_anchors = data.shape[1] + valid_count_dtype = "int32" valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype, "valid_count_buf", data_alignment=4) - data_buf = api.decl_buffer( - data.shape, data.dtype, "data_buf", data_alignment=8) + score_axis = score_index score_shape = (batch_size, num_anchors) - score_tensor = tvm.compute( - score_shape, lambda i, j: data[i, j, 1], name="score_tensor") - score_tensor_buf = api.decl_buffer(score_tensor.shape, data.dtype, - "score_tensor_buf", data_alignment=8) + score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis]) + sort_tensor = argsort(score_tensor, valid_count=valid_count, axis=1, is_ascend=False, flag=True) - sort_tensor_dtype = "int32" - sort_tensor_buf = api.decl_buffer(score_shape, sort_tensor_dtype, + sort_tensor_buf = api.decl_buffer(sort_tensor.shape, sort_tensor.dtype, "sort_tensor_buf", data_alignment=8) - sort_tensor = \ - tvm.extern(score_shape, - [score_tensor, valid_count], - lambda ins, outs: sort_ir( - ins[0], ins[1], outs[0]), - dtype=sort_tensor_dtype, - in_buffers=[score_tensor_buf, valid_count_buf], - out_buffers=sort_tensor_buf, - name="nms_sort") + data_buf = api.decl_buffer( + data.shape, data.dtype, "data_buf", data_alignment=8) - out = \ - tvm.extern(data.shape, + out_buf = api.decl_buffer( + data.shape, data.dtype, "out_buf", data_alignment=8) + + out, box_indices = \ + tvm.extern([data.shape, score_shape], [data, sort_tensor, valid_count], lambda ins, outs: nms_ir( - ins[0], ins[1], ins[2], outs[0], iou_threshold, - force_suppress, top_k), - dtype="float32", + ins[0], ins[1], ins[2], outs[0], outs[1], + max_output_size, iou_threshold, force_suppress, + top_k, coord_start, id_index), + dtype=[data.dtype, 
"int32"], in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], + name="nms", tag="nms") + + if return_indices: + return box_indices + + if invalid_to_bottom: + output_buf = api.decl_buffer( + data.shape, data.dtype, "output_buf", data_alignment=8) + temp_flag_buf = api.decl_buffer( + score_shape, valid_count_dtype, "temp_flag", data_alignment=8) + temp_idx_buf = api.decl_buffer( + score_shape, valid_count_dtype, "temp_idx", data_alignment=8) + temp_flag, temp_idx = tvm.extern([score_shape, score_shape], [out], + lambda ins, outs: invalid_to_bottom_pre( + ins[0], outs[0], outs[1]), + dtype=["int32", "int32"], + in_buffers=[out_buf], + out_buffers=[temp_flag_buf, temp_idx_buf], + name="invalid_to_bottom_phase_one") + + output = tvm.extern([data.shape], [out, temp_flag, temp_idx], + lambda ins, outs: invalid_to_bottom_ir( + ins[0], ins[1], ins[2], outs[0]), + dtype=[data.dtype], + in_buffers=[out_buf, temp_flag_buf, temp_idx_buf], + out_buffers=[output_buf], + name="invalid_to_bottom", + tag="invalid_to_bottom") + return output + return out diff --git a/topi/python/topi/cuda/sort.py b/topi/python/topi/cuda/sort.py new file mode 100644 index 000000000000..99ba8527cdfb --- /dev/null +++ b/topi/python/topi/cuda/sort.py @@ -0,0 +1,249 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison, unused-argument +"""Argsort operator """ +import tvm + +from tvm import api +from topi.sort import argsort + +def sort_ir(data, output, axis, is_ascend): + """Low level IR to do nms sorting on the GPU, same usage as tvm.contrib.sort.argsort on the CPU. + + Parameters + ---------- + data: Buffer + Buffer of input data. + + output : Buffer + Output buffer of indicies of sorted tensor with same shape as data. + + axis : Int + Axis long which to sort the input tensor. + + is_ascend : Boolean + Whether to sort in ascending or descending order. + + Returns + ------- + stmt : Stmt + The result IR statement. 
+    """
+    size = 1
+    axis_mul_before = 1
+    axis_mul_after = 1
+    shape = data.shape
+    if axis < 0:
+        axis = len(shape) + axis
+    for i, value in enumerate(shape, 0):
+        size *= value
+        if i < axis:
+            axis_mul_before *= value
+        elif i > axis:
+            axis_mul_after *= value
+    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
+    ib = tvm.ir_builder.create()
+    data = ib.buffer_ptr(data)
+    output = ib.buffer_ptr(output)
+    nthread_tx = max_threads
+    nthread_bx = size // max_threads + 1
+    tx = tvm.thread_axis("threadIdx.x")
+    bx = tvm.thread_axis("vthread")
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "virtual_thread", nthread_bx)
+    tid = bx * nthread_tx + tx
+    temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local")
+    temp_index = ib.allocate("float32", (1,), name="temp_index", scope="local")
+    is_ascend = tvm.make.node("IntImm", dtype="int32", value=is_ascend)
+
+    with ib.for_range(0, axis_mul_before) as i:
+        with ib.for_range(0, axis_mul_after) as j:
+            current_sort_num = shape[axis]
+            base_idx = i * shape[axis] * axis_mul_after + j
+            with ib.if_scope(tid < shape[axis]):
+                output[base_idx + tid * axis_mul_after] = tid.astype("float32")
+            # OddEvenTransposeSort
+            with ib.for_range(0, current_sort_num) as k:
+                with ib.if_scope(tid < (current_sort_num + 1) // 2):
+                    offset = base_idx + (2 * tid + (k % 2)) * axis_mul_after
+                    with ib.if_scope(tvm.all(is_ascend == 1, \
+                                             2 * tid + (k % 2) + 1 < current_sort_num, \
+                                             data[offset] > data[offset + axis_mul_after])):
+                        temp_data[0] = data[offset]
+                        data[offset] = data[offset + axis_mul_after]
+                        data[offset + axis_mul_after] = temp_data[0]
+                        temp_index[0] = output[offset]
+                        output[offset] = output[offset + axis_mul_after]
+                        output[offset + axis_mul_after] = temp_index[0]
+                    with ib.if_scope(tvm.all(is_ascend == 0, \
+                                             2 * tid + (k % 2) + 1 < current_sort_num, \
+                                             data[offset] < data[offset + axis_mul_after])):
+                        temp_data[0] = data[offset]
+                        data[offset] = data[offset + axis_mul_after]
+                        data[offset + axis_mul_after] = temp_data[0]
+                        temp_index[0] = output[offset]
+                        output[offset] = output[offset + axis_mul_after]
+                        output[offset + axis_mul_after] = temp_index[0]
+            ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
+                                  tvm.convert(['shared']),
+                                  tvm.expr.Call.Intrinsic, None, 0))
+
+    return ib.get()
+
+
+def sort_nms_ir(data, valid_count, output, axis, is_ascend):
+    """Low level IR to do nms sorting on the GPU, same usage as tvm.contrib.sort.argsort on the CPU.
+
+    Parameters
+    ----------
+    data : Buffer
+        Buffer of input data.
+
+    valid_count : Buffer
+        1D Buffer of number of valid boxes.
+
+    output : Buffer
+        Output buffer of indices of sorted tensor with same shape as data.
+
+    axis : Int
+        Axis along which to sort the input tensor.
+
+    is_ascend : Boolean
+        Whether to sort in ascending or descending order.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
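# Plain-Python model of the OddEvenTransposeSort loop used by sort_ir and
# sort_nms_ir: n phases of independent compare-swaps, which is what lets
# each pair be handled by a separate CUDA thread. Illustrative only.
def odd_even_argsort(values, is_ascend=True):
    n = len(values)
    indices = list(range(n))
    for k in range(n):                    # phase k pairs up items from k % 2
        for left in range(k % 2, n - 1, 2):
            out_of_order = (values[left] > values[left + 1]) if is_ascend \
                else (values[left] < values[left + 1])
            if out_of_order:
                values[left], values[left + 1] = values[left + 1], values[left]
                indices[left], indices[left + 1] = indices[left + 1], indices[left]
    return indices

# e.g. odd_even_argsort([3.0, 1.0, 2.0]) == [1, 2, 0]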
+ """ + + size = 1 + axis_mul_before = 1 + axis_mul_after = 1 + shape = data.shape + if axis < 0: + axis = len(shape) + axis + for i, value in enumerate(shape, 0): + size *= value + if i < axis: + axis_mul_before *= value + elif i > axis: + axis_mul_after *= value + max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads) + ib = tvm.ir_builder.create() + data = ib.buffer_ptr(data) + valid_count = ib.buffer_ptr(valid_count) + output = ib.buffer_ptr(output) + nthread_tx = max_threads + nthread_bx = size // max_threads + 1 + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("vthread") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "virtual_thread", nthread_bx) + tid = bx * nthread_tx + tx + temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local") + temp_index = ib.allocate("int32", (1,), name="temp_index", scope="local") + is_ascend = tvm.make.node("IntImm", dtype="int32", value=is_ascend) + + with ib.for_range(0, axis_mul_before) as i: + with ib.for_range(0, axis_mul_after) as j: + current_sort_num = valid_count[i * axis_mul_after + j] + base_idx = i * shape[axis] * axis_mul_after + j + with ib.if_scope(tid < shape[axis]): + output[base_idx + tid * axis_mul_after] = tid + # OddEvenTransposeSort + with ib.for_range(0, current_sort_num) as k: + with ib.if_scope(tid < (current_sort_num + 1) // 2): + offset = base_idx + (2 * tid + (k % 2)) * axis_mul_after + with ib.if_scope(tvm.all(is_ascend == 1, \ + 2 * tid + (k % 2) + 1 < current_sort_num, \ + data[offset] > data[offset + axis_mul_after])): + temp_data[0] = data[offset] + data[offset] = data[offset + axis_mul_after] + data[offset + axis_mul_after] = temp_data[0] + temp_index[0] = output[offset] + output[offset] = output[offset + axis_mul_after] + output[offset + axis_mul_after] = temp_index[0] + with ib.if_scope(tvm.all(is_ascend == 0, \ + 2 * tid + (k % 2) + 1 < current_sort_num, \ + data[offset] < data[offset + axis_mul_after])): + temp_data[0] = data[offset] + data[offset] = data[offset + axis_mul_after] + data[offset + axis_mul_after] = temp_data[0] + temp_index[0] = output[offset] + output[offset] = output[offset + axis_mul_after] + output[offset + axis_mul_after] = temp_index[0] + ib.emit(tvm.make.Call(None, 'tvm_storage_sync', + tvm.convert(['shared']), + tvm.expr.Call.Intrinsic, None, 0)) + + return ib.get() + +@argsort.register(["cuda", "gpu"]) +def argsort_gpu(data, valid_count, axis=-1, is_ascend=1, dtype="float32", flag=0): + """Performs sorting along the given axis and returns an array of indicies + having same shape as an input array that index data in sorted order. + + Parameters + ---------- + data: tvm.Tensor + The input array. + + valid_count : tvm.Tensor + The number of valid elements to be sorted. + + axis : int + Axis long which to sort the input tensor. + + is_ascend : boolean + Whether to sort in ascending or descending order. + + flag : boolean + Whether this argsort is used in nms operator + + Returns + ------- + out : tvm.Tensor + The output of this function. 
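# Behavioral difference between the two kernels, for one batch row with
# num_anchors = 5 and valid_count = [3] (illustrative values):
#
#   scores                 = [0.4, 0.9, 0.1, 0.8, 0.7]
#   sort_ir   (descending) -> [1, 3, 4, 0, 2]  # the whole row is argsorted
#   sort_nms_ir            -> [1, 0, 2, 3, 4]  # only the first valid_count
#                                              # entries are sorted; the tail
#                                              # keeps its initial tid order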
+ """ + data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) + if flag: + valid_count_buf = api.decl_buffer(valid_count.shape, valid_count.dtype, + "valid_count_buf", data_alignment=4) + out_buf = api.decl_buffer(data.shape, "int32", "out_buf", data_alignment=4) + out = tvm.extern([data.shape], + [data, valid_count], + lambda ins, outs: sort_nms_ir( + ins[0], ins[1], outs[0], axis, is_ascend), + dtype="int32", + in_buffers=[data_buf, valid_count_buf], + out_buffers=[out_buf], + name="argsort_nms_gpu", + tag="argsort_nms_gpu") + else: + out_buf = api.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8) + out = tvm.extern([data.shape], + [data], + lambda ins, outs: sort_ir( + ins[0], outs[0], axis, is_ascend), + dtype=dtype, + in_buffers=[data_buf], + out_buffers=[out_buf], + name="argsort_gpu", + tag="argsort_gpu") + return out diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py index 38b76f36801e..f7e5f94a5655 100644 --- a/topi/python/topi/cuda/ssd/multibox.py +++ b/topi/python/topi/cuda/ssd/multibox.py @@ -21,6 +21,7 @@ import tvm from tvm import api +from tvm.intrin import if_then_else, exp import topi @@ -93,12 +94,11 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): center_w = (j + offset_w) * steps_w for k in range(num_sizes + num_ratios - 1): - w = tvm.if_then_else(k < num_sizes, - size_ratio_concat[ - k] * in_height / in_width / 2.0, - size_ratio_concat[0] * in_height / in_width * - math.sqrt(size_ratio_concat[k + 1]) / 2.0) - h = tvm.if_then_else( + w = if_then_else(k < num_sizes, + size_ratio_concat[k] * in_height / in_width / 2.0, + size_ratio_concat[0] * in_height / in_width * + math.sqrt(size_ratio_concat[k + 1]) / 2.0) + h = if_then_else( k < num_sizes, size_ratio_concat[k] / 2.0, size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0) count = (i * in_width * (num_sizes + num_ratios - 1) + @@ -154,8 +154,7 @@ def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1), out = topi.clip(out, 0, 1) return out - -def transform_loc_pre(cls_prob, valid_count, temp_flag, temp_id, temp_score_out, threshold): +def transform_loc_pre(cls_prob, valid_count, temp_valid_count, temp_cls_id, temp_score, threshold): """Low level IR routing for transform location data preparation. Parameters @@ -166,13 +165,13 @@ def transform_loc_pre(cls_prob, valid_count, temp_flag, temp_id, temp_score_out, valid_count : Buffer Buffer of number of valid output boxes. 
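# For reference, the if_then_else pair introduced in multibox_prior_ir above
# computes the half-extents of each prior box; written out in plain Python
# (sketch, the helper name is assumed):
import math

def prior_box_half_wh(size_ratio_concat, k, num_sizes, in_height, in_width):
    if k < num_sizes:   # one box per size, aspect ratio 1
        w = size_ratio_concat[k] * in_height / in_width / 2.0
        h = size_ratio_concat[k] / 2.0
    else:               # one box per extra ratio, based on the first size
        ratio = math.sqrt(size_ratio_concat[k + 1])
        w = size_ratio_concat[0] * in_height / in_width * ratio / 2.0
        h = size_ratio_concat[0] / ratio / 2.0
    return w, h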
- temp_flag : Buffer + temp_valid_count : Buffer Output intermediate result buffer - temp_id : Buffer + temp_cls_id : Buffer Output intermediate result buffer - temp_score_out : Buffer + temp_score : Buffer Output buffer threshold : float @@ -187,53 +186,53 @@ def transform_loc_pre(cls_prob, valid_count, temp_flag, temp_id, temp_score_out, num_classes = cls_prob.shape[1] num_anchors = cls_prob.shape[2] - max_threads = int( - tvm.target.current_target(allow_none=False).max_num_threads) ib = tvm.ir_builder.create() - score = ib.buffer_ptr(temp_score_out) - cls_id = ib.buffer_ptr(temp_id) - flag = ib.buffer_ptr(temp_flag) + + cls_prob = ib.buffer_ptr(cls_prob) + cls_id = ib.buffer_ptr(temp_cls_id) + valid_count = ib.buffer_ptr(valid_count) + temp_valid_count = ib.buffer_ptr(temp_valid_count) + score = ib.buffer_ptr(temp_score) + + threshold = tvm.make.node("FloatImm", dtype="float32", value=threshold) + + max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads) + nthread_tx = max_threads + nthread_bx = (batch_size * num_anchors) // max_threads + 1 tx = tvm.thread_axis("threadIdx.x") bx = tvm.thread_axis("blockIdx.x") - nthread_tx = max_threads - nthread_bx = (batch_size * num_anchors * num_classes) // max_threads + 1 ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx - p_cls_prob = ib.buffer_ptr(cls_prob) - p_valid_count = ib.buffer_ptr(valid_count) with ib.if_scope(tid < batch_size * num_anchors): - n = tid / num_anchors # number of batches - i = tid % num_anchors # number of anchors - score[i] = -1.0 - cls_id[i] = 0 - p_valid_count[n] = 0 - with ib.for_range(0, num_classes-1, name="k") as k: - temp = p_cls_prob[n * num_anchors * num_classes + (k + 1) * num_anchors + i] - with ib.if_scope(temp > score[i]): - cls_id[i] = k + 1 - score[i] = temp - with ib.if_scope(tvm.all(cls_id[i] > 0, score[i] < threshold)): - cls_id[i] = 0 - with ib.if_scope(cls_id[i] > 0): - flag[i] = 1 + i = tid / num_anchors + j = tid % num_anchors + valid_count[i] = 0 + score[tid] = -1.0 + cls_id[tid] = 0 + with ib.for_range(0, num_classes - 1) as k: + temp = cls_prob[i * num_classes * num_anchors + (k + 1) * num_anchors + j] + cls_id[tid] = if_then_else(temp > score[tid], k + 1, cls_id[tid]) + score[tid] = tvm.max(temp, score[tid]) + with ib.if_scope(tvm.all(cls_id[tid] > 0, score[tid] < threshold)): + cls_id[tid] = 0 + with ib.if_scope(cls_id[tid] > 0): + temp_valid_count[tid] = 1 with ib.else_scope(): - flag[i] = 0 + temp_valid_count[tid] = 0 with ib.if_scope(tid < batch_size): - with ib.for_range(0, num_anchors, name="k") as k: + with ib.for_range(0, num_anchors) as k: with ib.if_scope(k > 0): - flag[tid * num_anchors + - k] += flag[tid * num_anchors + k - 1] - p_valid_count[n] = flag[tid * num_anchors + num_anchors - 1] + temp_valid_count[tid * num_anchors + k] += \ + temp_valid_count[tid * num_anchors + k - 1] + valid_count[i] = temp_valid_count[tid * num_anchors + num_anchors - 1] - body = ib.get() - return body + return ib.get() - -def transform_loc_ir(loc_pred, anchor, temp_flag, temp_id, temp_score_in, \ - out, clip, variances, batch_size, num_classes, num_anchors): +def transform_loc_ir(loc_pred, anchor, temp_valid_count, temp_cls_id, temp_score, out, \ + clip, variances, batch_size, num_anchors): """Low level IR routing for transform location in multibox_detection operator. 
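# NumPy model of the two transform_loc phases; the helper names are
# assumptions, and the variance defaults mirror the usual SSD values.
import numpy as np

def transform_loc_pre_reference(cls_prob, threshold):
    # phase one: per anchor, pick the best non-background class and score
    scores = cls_prob[:, 1:, :]                  # class 0 is background
    cls_id = scores.argmax(axis=1) + 1
    score = scores.max(axis=1)
    cls_id[score < threshold] = 0                # drop low-confidence anchors
    flag = (cls_id > 0).astype("int32")
    # the running sum assigns each kept anchor its output slot; its last
    # element per batch is valid_count
    return cls_id, score, np.cumsum(flag, axis=1)

def decode_box(loc, anchor, variances=(0.1, 0.1, 0.2, 0.2)):
    # phase two: the transform_loc helper, offsets + anchor -> corners
    # (clipping to [0, 1] omitted for brevity)
    al, at, ar, ab = anchor
    aw, ah = ar - al, ab - at
    ax, ay = (al + ar) / 2.0, (at + ab) / 2.0
    px, py, pw, ph = loc
    ox = px * variances[0] * aw + ax
    oy = py * variances[1] * ah + ay
    ow = np.exp(pw * variances[2]) * aw / 2.0
    oh = np.exp(ph * variances[3]) * ah / 2.0
    return ox - ow, oy - oh, ox + ow, oy + oh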
Parameters @@ -244,13 +243,13 @@ def transform_loc_ir(loc_pred, anchor, temp_flag, temp_id, temp_score_in, \ anchor : Buffer Buffer of prior anchor boxes. - temp_flag : Buffer + temp_valid_count : Buffer Intermediate result buffer. - temp_id : Buffer + temp_cls_id : Buffer Intermediate result buffer. - temp_score_in : Buffer + temp_score : Buffer Input buffer which stores intermediate results. out : Buffer @@ -265,9 +264,6 @@ def transform_loc_ir(loc_pred, anchor, temp_flag, temp_id, temp_score_in, \ batch_size : int Batch size - num_classes : int - Number of classes - num_anchors : int Number of anchors @@ -293,47 +289,55 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, ph = loc[loc_base_idx + 3] ox = px * vx * aw + ax oy = py * vy * ah + ay - ow = tvm.exp(pw * vw) * aw / 2.0 - oh = tvm.exp(ph * vh) * ah / 2.0 - return tvm.if_then_else(clip, tvm.make.Max(0.0, tvm.make.Min(1.0, ox - ow)), ox - ow), \ - tvm.if_then_else(clip, tvm.make.Max(0.0, tvm.make.Min(1.0, oy - oh)), oy - oh), \ - tvm.if_then_else(clip, tvm.make.Max(0.0, tvm.make.Min(1.0, ox + ow)), ox + ow), \ - tvm.if_then_else(clip, tvm.make.Max(0.0, tvm.make.Min(1.0, oy + oh)), oy + oh) - - max_threads = int( - tvm.target.current_target(allow_none=False).max_num_threads) + ow = exp(pw * vw) * aw / 2.0 + oh = exp(ph * vh) * ah / 2.0 + return tvm.if_then_else(clip, tvm.max(0.0, tvm.min(1.0, ox - ow)), ox - ow), \ + tvm.if_then_else(clip, tvm.max(0.0, tvm.min(1.0, oy - oh)), oy - oh), \ + tvm.if_then_else(clip, tvm.max(0.0, tvm.min(1.0, ox + ow)), ox + ow), \ + tvm.if_then_else(clip, tvm.max(0.0, tvm.min(1.0, oy + oh)), oy + oh) + ib = tvm.ir_builder.create() - score = ib.buffer_ptr(temp_score_in) - cls_id = ib.buffer_ptr(temp_id) - flag = ib.buffer_ptr(temp_flag) + + loc_pred = ib.buffer_ptr(loc_pred) + anchor = ib.buffer_ptr(anchor) + temp_valid_count = ib.buffer_ptr(temp_valid_count) + cls_id = ib.buffer_ptr(temp_cls_id) + score = ib.buffer_ptr(temp_score) + out_loc = ib.buffer_ptr(out) + + max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads) + nthread_tx = max_threads + nthread_bx = (batch_size * num_anchors) // max_threads + 1 tx = tvm.thread_axis("threadIdx.x") bx = tvm.thread_axis("blockIdx.x") - nthread_tx = max_threads - nthread_bx = (batch_size * num_anchors * num_classes) // max_threads + 1 ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx - p_loc_pred = ib.buffer_ptr(loc_pred) - p_anchor = ib.buffer_ptr(anchor) - p_out = ib.buffer_ptr(out) with ib.if_scope(tid < batch_size * num_anchors): - n = tid / num_anchors # number of batches - i = tid % num_anchors # number of anchors + i = tid / num_anchors + j = tid % num_anchors with ib.if_scope(cls_id[tid] > 0): with ib.if_scope(tid == 0): - out_base_idx = n * num_anchors * 6 + out_base_idx = i * num_anchors * 6 + out_loc[out_base_idx] = cls_id[tid] - 1.0 + out_loc[out_base_idx + 1] = score[tid] + out_loc[out_base_idx + 2], out_loc[out_base_idx + 3], out_loc[out_base_idx + 4], \ + out_loc[out_base_idx + 5] = transform_loc(loc_pred, tid * 4, + anchor, j * 4, clip, variances[0], + variances[1], variances[2], + variances[3]) with ib.else_scope(): - out_base_idx = n * num_anchors * 6 + flag[tid - 1] * 6 - p_out[out_base_idx] = cls_id[tid] - 1.0 - p_out[out_base_idx + 1] = score[tid] - p_out[out_base_idx + 2], p_out[out_base_idx + 3], p_out[out_base_idx + 4], \ - p_out[out_base_idx + 5] = transform_loc(p_loc_pred, tid * 4, - p_anchor, i*4, clip, 
variances[0], - variances[1], variances[2], variances[3]) + out_base_idx = i * num_anchors * 6 + temp_valid_count[tid - 1] * 6 + out_loc[out_base_idx] = cls_id[tid] - 1.0 + out_loc[out_base_idx + 1] = score[tid] + out_loc[out_base_idx + 2], out_loc[out_base_idx + 3], out_loc[out_base_idx + 4], \ + out_loc[out_base_idx + 5] = transform_loc(loc_pred, tid * 4, + anchor, j * 4, clip, variances[0], + variances[1], variances[2], + variances[3]) - body = ib.get() - return body + return ib.get() @multibox_transform_loc.register(["cuda", "gpu"]) @@ -372,44 +376,48 @@ def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, \ 1-D tensor with shape (batch_size,), number of valid anchor boxes. """ batch_size = cls_prob.shape[0] - num_classes = cls_prob.shape[1] num_anchors = cls_prob.shape[2] oshape = (batch_size, num_anchors, 6) # Define data alignment for intermediate buffer valid_count_dtype = "int32" + out_loc_dtype = loc_pred.dtype + valid_count_buf = api.decl_buffer((batch_size,), valid_count_dtype, "valid_count_buf", data_alignment=4) - out_buf = api.decl_buffer( - oshape, cls_prob.dtype, "out_buf", data_alignment=8) - size = num_anchors - temp_flag_buf = api.decl_buffer( - (size,), valid_count_dtype, "flag", data_alignment=8) - temp_id_buf = api.decl_buffer( - (size,), valid_count_dtype, "cls_id", data_alignment=8) + loc_pred_buf = api.decl_buffer(loc_pred.shape, loc_pred.dtype, + "loc_pred_buf", data_alignment=8) + anchor_buf = api.decl_buffer(anchor.shape, anchor.dtype, + "anchor_buf", data_alignment=8) + + temp_valid_count_buf = api.decl_buffer( + (batch_size, num_anchors,), valid_count_dtype, "temp_valid_count", data_alignment=8) + temp_cls_id_buf = api.decl_buffer( + (batch_size, num_anchors,), valid_count_dtype, "temp_cls_id", data_alignment=8) temp_score_buf = api.decl_buffer( - (size,), cls_prob.dtype, "score", data_alignment=8) + (batch_size, num_anchors,), cls_prob.dtype, "temp_score", data_alignment=8) - valid_count, temp_flag, temp_id, temp_score = \ - tvm.extern([(batch_size,), (size,), (size,), (size,)], - [cls_prob], + valid_count, temp_valid_count, temp_cls_id, temp_score = \ + tvm.extern([(batch_size,), (batch_size, num_anchors,), (batch_size, num_anchors,), \ + (batch_size, num_anchors,)], [cls_prob], lambda ins, outs: transform_loc_pre( ins[0], outs[0], outs[1], outs[2], outs[3], threshold), - dtype=[valid_count_dtype, - valid_count_dtype, valid_count_dtype, cls_prob.dtype], - out_buffers=[valid_count_buf, - temp_flag_buf, temp_id_buf, temp_score_buf], - tag="multibox_transform_loc_first_step") + dtype=[valid_count_dtype, valid_count_dtype, valid_count_dtype, cls_prob.dtype], + out_buffers=[valid_count_buf, temp_valid_count_buf, \ + temp_cls_id_buf, temp_score_buf], + tag="multibox_transform_loc_phase_one") - out = \ + out_loc = \ tvm.extern([oshape], - [loc_pred, anchor, temp_flag, temp_id, temp_score], + [loc_pred, anchor, temp_valid_count, temp_cls_id, temp_score], lambda ins, outs: transform_loc_ir( - ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], clip, \ - variances, batch_size, num_classes, num_anchors), - dtype=[cls_prob.dtype], - out_buffers=[out_buf], + ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], clip, variances, \ + batch_size, num_anchors), + in_buffers=[loc_pred_buf, anchor_buf, temp_valid_count_buf, \ + temp_cls_id_buf, temp_score_buf], + dtype=[out_loc_dtype], tag="multibox_transform_loc") - return [out, valid_count] + + return [out_loc, valid_count] @multibox_detection.register(["cuda", "gpu"]) @@ -453,6 +461,7 @@ def 
multibox_detection_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances) - out = non_max_suppression( - inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk) + out = non_max_suppression(inter_out[0], inter_out[1], max_output_size=-1, + iou_threshold=nms_threshold, force_suppress=force_suppress, + top_k=nms_topk, return_indices=False) return out diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index 5d7bc9e00da6..78f5c1f51ec6 100644 --- a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -32,11 +32,15 @@ def _default_schedule(outs): def traverse(op): """inline all one-to-one-mapping operators except the last stage (output)""" - if "nms" in op.tag: - sort = op.input_tensors[1] + if op.tag in ["nms", "invalid_to_bottom"]: + if op.tag == "nms": + sort = op.input_tensors[1] + else: + out = op.input_tensors[0] + sort = s[out].op.input_tensors[1] score = s[sort].op.input_tensors[0] fused = s[score].fuse(*s[score].op.axis) - num_thread = tvm.target.current_target(allow_none=False).max_num_threads + num_thread = int(tvm.target.current_target(allow_none=False).max_num_threads) bx, tx = s[score].split(fused, factor=num_thread) s[score].bind(bx, tvm.thread_axis("blockIdx.x")) s[score].bind(tx, tvm.thread_axis("threadIdx.x")) @@ -199,3 +203,30 @@ def schedule_get_valid_counts(outs): The computation schedule for the op. """ return _default_schedule(outs) + +@generic.schedule_argsort.register(["cuda", "gpu"]) +def schedule_argsort(outs): + """Schedule for argsort operator. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of argsort + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for the op. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + scheduled_ops = [] + from .injective import _schedule_injective + def traverse(op): + for tensor in op.input_tensors: + if tensor.op.input_tensors and tensor.op not in scheduled_ops: + traverse(tensor.op) + scheduled_ops.append(op) + traverse(outs[0].op) + return s diff --git a/topi/python/topi/generic/__init__.py b/topi/python/topi/generic/__init__.py index 8450e2d4c4e2..6bf5f3a053c9 100644 --- a/topi/python/topi/generic/__init__.py +++ b/topi/python/topi/generic/__init__.py @@ -19,3 +19,4 @@ from .injective import * from .extern import * from .vision import * +from .sort import * diff --git a/topi/python/topi/generic/sort.py b/topi/python/topi/generic/sort.py new file mode 100644 index 000000000000..1ad088c50d04 --- /dev/null +++ b/topi/python/topi/generic/sort.py @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, no-member +"""Generic vision operators""" +from __future__ import absolute_import as _abs +import tvm +from .vision import _default_schedule + +@tvm.target.generic_func +def schedule_argsort(outs): + """Schedule for argsort operator. + + Parameters + ---------- + outs: Array of Tensor + The indices that would sort an input array along + the given axis. + + Returns + ------- + s: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) diff --git a/topi/python/topi/sort.py b/topi/python/topi/sort.py new file mode 100644 index 000000000000..84fff8d8f0cd --- /dev/null +++ b/topi/python/topi/sort.py @@ -0,0 +1,105 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=too-many-arguments +"""Argsort operator""" +import tvm +from tvm import api + +@tvm.target.generic_func +def argsort(data, valid_count, axis=-1, is_ascend=1, dtype="float32", flag=0): + """Performs sorting along the given axis and returns an array + of indices having the same shape as an input array that index + data in sorted order. + + Parameters + ---------- + data : tvm.Tensor + The input tensor. + + valid_count : tvm.Tensor + 1-D tensor for valid number of boxes only for ssd. + + axis : optional, int + Axis along which to sort the input tensor. + By default the flattened array is used. + + is_ascend : optional, boolean + Whether to sort in ascending or descending order. + + dtype : optional, string + DType of the output indices. + + flag : optional, boolean + Whether valid_count is valid. + + Returns + ------- + out : tvm.Tensor + Sorted index tensor. + + Example + -------- + .. 
code-block:: python + + # An example to use argsort + dshape = (1, 5, 6) + data = tvm.placeholder(dshape, name="data") + valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") + axis = 0 + is_ascend = False + flag = False + out = argsort(data, valid_count, axis, is_ascend, flag) + np_data = np.random.uniform(dshape) + np_valid_count = np.array([4]) + s = topi.generic.schedule_argsort(out) + f = tvm.build(s, [data, valid_count, out], "llvm") + ctx = tvm.cpu() + tvm_data = tvm.nd.array(np_data, ctx) + tvm_valid_count = tvm.nd.array(np_valid_count, ctx) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) + f(tvm_data, tvm_valid_count, tvm_out) + """ + data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) + if flag: + valid_count_buf = api.decl_buffer(valid_count.shape, valid_count.dtype, + "valid_count_buf", data_alignment=4) + out_buf = api.decl_buffer(data.shape, "int32", "out_buf", data_alignment=8) + out = \ + tvm.extern(data.shape, + [data, valid_count], + lambda ins, outs: tvm.call_packed( + "tvm.contrib.sort.argsort_nms", ins[0], ins[1], + outs[0], axis, is_ascend), + dtype="int32", + in_buffers=[data_buf, valid_count_buf], + out_buffers=out_buf, + name="argsort_nms_cpu", + tag="argsort_nms_cpu") + else: + out_buf = api.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8) + out = \ + tvm.extern(data.shape, + [data], + lambda ins, outs: tvm.call_packed( + "tvm.contrib.sort.argsort", ins[0], + outs[0], axis, is_ascend), + dtype=dtype, + in_buffers=[data_buf], + out_buffers=out_buf, + name="argsort_cpu", + tag="argsort_cpu") + return out diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index d8b15aac42c6..979565d31662 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -18,7 +18,8 @@ """Non-maximum suppression operator""" import tvm -from tvm import api, hybrid +from tvm import hybrid +from ..sort import argsort @hybrid.script def hybrid_rearrange_out(data): @@ -129,7 +130,7 @@ def get_valid_counts(data, score_threshold=0): @hybrid.script def hybrid_nms(data, sorted_index, valid_count, max_output_size, iou_threshold, force_suppress, - top_k, id_index): + top_k, coord_start, id_index): """Hybrid routing for non-maximum suppression. Parameters @@ -158,6 +159,9 @@ def hybrid_nms(data, sorted_index, valid_count, top_k : tvm.const Keep maximum top k detections before nms, -1 for no limit. + coord_start : tvm.const + Start index of the consecutive 4 coordinates. + id_index : tvm.const index of the class categories, -1 to disable. @@ -208,7 +212,7 @@ def hybrid_nms(data, sorted_index, valid_count, batch_idx = i box_a_idx = j box_b_idx = k - box_start_idx = 2 + box_start_idx = coord_start a_t = output[batch_idx, box_a_idx, box_start_idx + 1] a_b = output[batch_idx, box_a_idx, box_start_idx + 3] a_l = output[batch_idx, box_a_idx, box_start_idx] @@ -252,7 +256,8 @@ def hybrid_nms(data, sorted_index, valid_count, @tvm.target.generic_func def non_max_suppression(data, valid_count, max_output_size=-1, iou_threshold=0.5, force_suppress=False, top_k=-1, - id_index=0, return_indices=True, invalid_to_bottom=False): + coord_start=2, score_index=1, id_index=0, + return_indices=True, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. Parameters @@ -278,6 +283,12 @@ def non_max_suppression(data, valid_count, max_output_size=-1, top_k : optional, int Keep maximum top k detections before nms, -1 for no limit. 
+ coord_start : required, int + Start index of the consecutive 4 coordinates. + + score_index: optional, int + Index of the scores/confidence of boxes. + id_index : optional, int index of the class categories, -1 to disable. @@ -317,32 +328,16 @@ def non_max_suppression(data, valid_count, max_output_size=-1, """ batch_size = data.shape[0] num_anchors = data.shape[1] - valid_count_dtype = "int32" - valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype, - "valid_count_buf", data_alignment=4) - score_axis = 1 + score_axis = score_index score_shape = (batch_size, num_anchors) score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis]) - score_tensor_buf = api.decl_buffer(score_tensor.shape, data.dtype, - "score_tensor_buf", data_alignment=8) - sort_tensor_dtype = "int32" - sort_tensor_buf = api.decl_buffer(score_shape, sort_tensor_dtype, - "sort_tensor_buf", data_alignment=8) - sort_tensor = \ - tvm.extern(score_shape, - [score_tensor, valid_count], - lambda ins, outs: tvm.call_packed( - "tvm.contrib.sort.argsort", ins[0], ins[1], - outs[0], score_axis, True), - dtype=sort_tensor_dtype, - in_buffers=[score_tensor_buf, valid_count_buf], - out_buffers=sort_tensor_buf, - name="nms_sort") + sort_tensor = argsort(score_tensor, valid_count=valid_count, axis=1, is_ascend=False, flag=True) out, box_indices = hybrid_nms(data, sort_tensor, valid_count, tvm.const(max_output_size, dtype="int32"), tvm.const(iou_threshold, dtype="float32"), tvm.const(force_suppress, dtype="bool"), tvm.const(top_k, dtype="int32"), + tvm.const(coord_start, dtype="int32"), tvm.const(id_index, dtype="int32")) if not return_indices and invalid_to_bottom: out = hybrid_rearrange_out(out) diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py index 799669003753..ca1b4a9eb268 100644 --- a/topi/python/topi/vision/ssd/multibox.py +++ b/topi/python/topi/vision/ssd/multibox.py @@ -308,7 +308,7 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances) - out = non_max_suppression(inter_out[0], inter_out[1], -1, - nms_threshold, force_suppress, nms_topk, - return_indices=False) + out = non_max_suppression(inter_out[0], inter_out[1], max_output_size=-1, + iou_threshold=nms_threshold, force_suppress=force_suppress, + top_k=nms_topk, return_indices=False) return out diff --git a/topi/tests/python/test_topi_sort.py b/topi/tests/python/test_topi_sort.py new file mode 100644 index 000000000000..3a2c9c2e4980 --- /dev/null +++ b/topi/tests/python/test_topi_sort.py @@ -0,0 +1,59 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
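# Recapping the extended generic non_max_suppression signature above, a
# hedged usage sketch; shapes and values are illustrative.
import tvm
import topi

data = tvm.placeholder((1, 5, 6), name="data")
valid_count = tvm.placeholder((1,), dtype="int32", name="valid_count")
# coord_start/score_index/id_index describe the [id, score, l, t, r, b] layout
out = topi.vision.non_max_suppression(data, valid_count, iou_threshold=0.7,
                                      coord_start=2, score_index=1,
                                      id_index=0, return_indices=False)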
+"""Test code for vision package""" +from __future__ import print_function +import math +import numpy as np +import tvm +import topi +import topi.testing + +from tvm.contrib.pickle_memoize import memoize +from topi.util import get_const_tuple +from topi import argsort + +def test_argsort(): + dshape = (1, 8) + valid_count_shape = (2,) + data = tvm.placeholder(dshape, name="data", dtype="float32") + valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") + np_data = np.random.rand(dshape[0], dshape[1]).astype(data.dtype) + np_valid_count = np.array([4]).astype(valid_count.dtype) + np_result = np.argsort(-np_data) + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + out = argsort(data, valid_count, axis = -1, is_ascend = False, flag=False) + s = topi.generic.schedule_argsort(out) + + tvm_data = tvm.nd.array(np_data, ctx) + tvm_valid_count = tvm.nd.array(np_valid_count, ctx) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype="float32"), ctx) + f = tvm.build(s, [data, valid_count, out], device) + f(tvm_data, tvm_valid_count, tvm_out) + tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result.astype("float32"), rtol=1e0) + + for device in ['llvm', 'cuda', 'opencl']: + check_device(device) + + +if __name__ == "__main__": + test_argsort() diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 6bb57b541c88..483f3a641c70 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -66,7 +66,7 @@ def check_device(device): tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3) tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3) - for device in ['llvm']: + for device in ['llvm', 'cuda', 'opencl']: check_device(device) @@ -124,7 +124,7 @@ def check_device(device): f(tvm_data, tvm_valid_count, tvm_indices_out) tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4) - for device in ['llvm']: + for device in ['llvm', 'cuda', 'opencl']: check_device(device) @@ -231,7 +231,7 @@ def check_device(device): f(tvm_cls_prob, tvm_loc_preds, tvm_anchors, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), expected_np_out, rtol=1e-4) - for device in ['llvm', 'opencl']: + for device in ['llvm', 'opencl', 'cuda']: check_device(device) @@ -275,7 +275,7 @@ def check_device(device): f(tvm_a, tvm_rois, tvm_b) tvm.testing.assert_allclose(tvm_b.asnumpy(), b_np, rtol=1e-3) - for device in ['llvm', 'cuda']: + for device in ['llvm', 'cuda', 'opencl']: check_device(device) diff --git a/tutorials/frontend/deploy_ssd_gluoncv.py b/tutorials/frontend/deploy_ssd_gluoncv.py index fe84283ad191..ff7691c7bf55 100644 --- a/tutorials/frontend/deploy_ssd_gluoncv.py +++ b/tutorials/frontend/deploy_ssd_gluoncv.py @@ -18,6 +18,7 @@ Deploy Single Shot Multibox Detector(SSD) model =============================================== **Author**: `Yao Wang `_ +`Leyuan Wang `_ This article is an introductory tutorial to deploy SSD models with TVM. We will use GluonCV pre-trained SSD model and convert it to Relay IR @@ -37,30 +38,29 @@ # ------------------------------ # .. note:: # -# Currently we support compiling SSD on CPU only. -# GPU support is in progress. +# We support compiling SSD on bot CPUs and GPUs now. 
# # To get best inference performance on CPU, change # target argument according to your device and # follow the :ref:`tune_relay_x86` to tune x86 CPU and # :ref:`tune_relay_arm` for arm cpu. # +# To get best performance fo SSD on Intel graphics, +# change target argument to 'opencl -device=intel_graphics' +# # SSD with VGG as body network is not supported yet since # x86 conv2d schedule doesn't support dilation. supported_model = [ - 'ssd_512_resnet18_v1_voc', - 'ssd_512_resnet18_v1_coco', 'ssd_512_resnet50_v1_voc', 'ssd_512_resnet50_v1_coco', 'ssd_512_resnet101_v2_voc', - 'ssd_512_mobilenet1_0_voc', - 'ssd_512_mobilenet1_0_coco', + 'ssd_512_mobilenet1.0_voc', + 'ssd_512_mobilenet1.0_coco', ] -model_name = "ssd_512_resnet50_v1_voc" +model_name = supported_model[0] dshape = (1, 3, 512, 512) -dtype = "float32" target_list = ctx_list() ###################################################################### @@ -76,7 +76,7 @@ block = model_zoo.get_model(model_name, pretrained=True) -def compile(target): +def build(target): net, params = relay.frontend.from_mxnet(block, {"data": dshape}) with relay.build_config(opt_level=3): graph, lib, params = relay.build(net, target, params=params) @@ -98,10 +98,7 @@ def run(graph, lib, params, ctx): return class_IDs, scores, bounding_boxs for target, ctx in target_list: - if target == "cuda": - print("GPU not supported yet, skip.") - continue - graph, lib, params = compile(target) + graph, lib, params = build(target) class_IDs, scores, bounding_boxs = run(graph, lib, params, ctx) ###################################################################### From e88f46a7a368c0b0ff38afb477470f22a7d4c31f Mon Sep 17 00:00:00 2001 From: Jared Roesch Date: Mon, 29 Apr 2019 12:54:16 -0700 Subject: [PATCH 062/106] Fix bug in ONNX importer (#3084) --- python/tvm/relay/frontend/onnx.py | 5 ++++- tests/python/frontend/onnx/test_forward.py | 9 +++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 53f104ce48cf..d91ee4b8c5d7 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -944,7 +944,10 @@ def from_onnx(self, graph, opset): dtype=self._params[i_name].dtype) else: self._num_input += 1 - tshape = self._shape[i_name] if i_name in self._shape else () + if i_name in self._shape: + tshape = self._shape[i_name] + else: + raise ValueError("Must provide an input shape for `{0}`.".format(i_name)) if isinstance(self._dtype, dict): dtype = self._dtype[i_name] if i_name in self._dtype else d_type else: diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 2564d83b1fc2..7be6bb611e9a 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -724,10 +724,15 @@ def verify_constantfill(is_shape, input_dim, out_dim, value, dtype, **kwargs): else: fill_node = helper.make_node("ConstantFill", ["input_a"], ["out"], value=value, dtype=dtype, **kwargs) + if is_shape == True: + inputs = [] + else: + inputs = [helper.make_tensor_value_info("input_a", + TensorProto.FLOAT, list(input_dim))] + graph = helper.make_graph([fill_node], "fill_test", - inputs = [helper.make_tensor_value_info("input_a", - TensorProto.FLOAT, list(input_dim))], + inputs, outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out.shape))]) From 0e333338737af4b14e5ab6a1a90f43fb8cc3fac1 Mon Sep 17 00:00:00 2001 From: Rahul Date: Tue, 30 Apr 2019 15:43:46 -0700 Subject: [PATCH 
063/106] Fixing a doc nit (#3123) URLs to the authors repo for these tutorials had an extra `https://`, this patch removes that. --- nnvm/tutorials/tune_nnvm_cuda.py | 2 +- nnvm/tutorials/tune_nnvm_mobile_gpu.py | 2 +- tutorials/autotvm/tune_conv2d_cuda.py | 2 +- tutorials/autotvm/tune_relay_mobile_gpu.py | 2 +- tutorials/autotvm/tune_simple_template.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/nnvm/tutorials/tune_nnvm_cuda.py b/nnvm/tutorials/tune_nnvm_cuda.py index c6dec422c2aa..aae5c00e32c1 100644 --- a/nnvm/tutorials/tune_nnvm_cuda.py +++ b/nnvm/tutorials/tune_nnvm_cuda.py @@ -17,7 +17,7 @@ """ Auto-tuning a convolutional network for NVIDIA GPU (NNVM) ========================================================= -**Author**: `Lianmin Zheng `_ +**Author**: `Lianmin Zheng `_ Auto-tuning for specific devices and workloads is critical for getting the best performance. This is a tutorial on how to tune a whole convolutional diff --git a/nnvm/tutorials/tune_nnvm_mobile_gpu.py b/nnvm/tutorials/tune_nnvm_mobile_gpu.py index 293339c224e7..f6da7eafa112 100644 --- a/nnvm/tutorials/tune_nnvm_mobile_gpu.py +++ b/nnvm/tutorials/tune_nnvm_mobile_gpu.py @@ -17,7 +17,7 @@ """ Auto-tuning a convolutional network for Mobile GPU (NNVM) ========================================================= -**Author**: `Lianmin Zheng `_ +**Author**: `Lianmin Zheng `_ Auto-tuning for a specific device is critical for getting the best performance. This is a tutorial about how to tune a whole convolutional diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py index 1783a16258a9..7124ad0a8fbb 100644 --- a/tutorials/autotvm/tune_conv2d_cuda.py +++ b/tutorials/autotvm/tune_conv2d_cuda.py @@ -17,7 +17,7 @@ """ Tuning High Performance Convolution on NVIDIA GPUs ========================================================================= -**Author**: `Lianmin Zheng `_ +**Author**: `Lianmin Zheng `_ This is an advanced tutorial for writing high performance tunable template for NVIDIA GPU. By running auto-tuner on this template, we can outperform the diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 57aea56aa993..5b231064e2ac 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -17,7 +17,7 @@ """ Auto-tuning a convolutional network for Mobile GPU ================================================== -**Author**: `Lianmin Zheng `_, `Eddie Yan `_ +**Author**: `Lianmin Zheng `_, `Eddie Yan `_ Auto-tuning for a specific device is critical for getting the best performance. This is a tutorial about how to tune a whole convolutional diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py index 832e060312d4..3608e9a84427 100644 --- a/tutorials/autotvm/tune_simple_template.py +++ b/tutorials/autotvm/tune_simple_template.py @@ -17,7 +17,7 @@ """ Writing tunable template and Using auto-tuner ============================================= -**Author**: `Lianmin Zheng `_ +**Author**: `Lianmin Zheng `_ This is an introduction tutorial to the auto-tuning module in tvm. 
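To make the effect of the ONNX importer fix in patch 062 above concrete, here is a hedged usage sketch: graph inputs that are not initializers now need an explicit entry in the shape dict, otherwise from_onnx raises a ValueError instead of silently assuming an empty shape. The model file and input name are made up.

import onnx
from tvm import relay

model = onnx.load("model.onnx")
shape_dict = {"input_a": (1, 3, 224, 224)}   # required for every graph input
sym, params = relay.frontend.from_onnx(model, shape_dict)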
From 1e2748b0726e7e173e773e72d36d4aaf7212cb88 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Tue, 30 Apr 2019 17:10:19 -0700 Subject: [PATCH 064/106] [Bugfix] Fix type code error for StringImm (#3050) --- src/pass/lower_tvm_builtin.cc | 7 +++++-- tests/python/unittest/test_runtime_measure.py | 1 - 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/pass/lower_tvm_builtin.cc b/src/pass/lower_tvm_builtin.cc index a11e3ceec1c5..69618985d50c 100644 --- a/src/pass/lower_tvm_builtin.cc +++ b/src/pass/lower_tvm_builtin.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -251,6 +251,9 @@ class BuiltinLower : public IRMutator { stack_value_, static_cast(arg_stack_begin + i - 1), intrinsic::kTVMValueContent, arg)); int arg_tcode = api_type.code(); + if (api_type.is_handle() && arg.as()) { + arg_tcode = kStr; + } if (IsArrayHandle(arg)) arg_tcode = kArrayHandle; prep_seq_.emplace_back( Store::make(stack_tcode_, diff --git a/tests/python/unittest/test_runtime_measure.py b/tests/python/unittest/test_runtime_measure.py index 0fc72d06ec37..7413a3732086 100644 --- a/tests/python/unittest/test_runtime_measure.py +++ b/tests/python/unittest/test_runtime_measure.py @@ -29,7 +29,6 @@ def test_min_repeat_ms(): def my_debug(filename): """one call lasts for 100 ms and writes one character to a file""" time.sleep(0.1) - filename = ctypes.c_char_p(filename.value).value with open(filename, "a") as fout: fout.write("c") From 7cbcf7eca1998aeeb07096772d1abd522cb39ac4 Mon Sep 17 00:00:00 2001 From: Zhi <5145158+zhiics@users.noreply.github.com> Date: Wed, 1 May 2019 11:42:27 +0800 Subject: [PATCH 065/106] [RELAY][FUSION] Enhance fusion rule that starts from elemwise and broadcast (#2932) * [relay][bugfix] fuse injective to elemwise and broadcast * enhance fusion for prarllel injectiveOD * check if tensor in schedule * fix codegen * fix lint * update * lint --- include/tvm/schedule.h | 16 ++++++++++ src/relay/backend/compile_engine.cc | 4 ++- src/relay/pass/fuse_ops.cc | 5 ++- src/schedule/schedule_lang.cc | 4 +++ tests/python/relay/test_pass_fuse_ops.py | 39 ++++++++++++++++++++++-- 5 files changed, 64 insertions(+), 4 deletions(-) diff --git a/include/tvm/schedule.h b/include/tvm/schedule.h index 9a556b6ce960..6c2a759db471 100644 --- a/include/tvm/schedule.h +++ b/include/tvm/schedule.h @@ -551,6 +551,22 @@ class ScheduleNode : public Node { /*! \brief Invalidate temp cache. */ void InvalidateCache(); + /*! + * \brief Check if the schedule contains an Operation. + * \param op The candidate Operation. + * \return true if the schedule has the Operation. Otherwise, false. + */ + EXPORT bool Contain(const Operation& op) const; + + /*! + * \brief Check if the schedule contains a Tensor. + * \param tensor The candidate tensor. + * \return true if the schedule has the tensor. Otherwise, false. + */ + EXPORT bool Contain(const Tensor& tensor) const { + return Contain(tensor->op); + } + /*! * \brief Create a schedule for array of ops(and their dependencies). * \param ops The ops to be scheduled. 
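The Contain() helpers declared above let the compile engine below skip compute_inline for scalar stages that never made it into the master schedule. For orientation, a self-contained sketch of what compute_inline does, using only stock API of this era and a toy workload:

    import tvm

    n = tvm.var("n")
    A = tvm.placeholder((n,), name="A")
    B = tvm.compute((n,), lambda i: A[i] + 1.0, name="B")
    C = tvm.compute((n,), lambda i: B[i] * 2.0, name="C")

    s = tvm.create_schedule(C.op)
    s[B].compute_inline()   # fold B's expression into C's loop body
    # Indexing s with a stage the schedule does not contain is an error,
    # which is the situation the new Contain() check guards against.
    print(tvm.lower(s, [A, C], simple_mode=True))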
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 43515105bd94..4b5842c36020 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -127,7 +127,9 @@ class ScheduleGetter : schedule = fschedule[master_op_](master_attrs_, tensor_outs, target_); for (const auto& scalar : scalars_) { - schedule[scalar].compute_inline(); + if (schedule->Contain(scalar)) { + schedule[scalar].compute_inline(); + } } } return std::make_pair(schedule, cfunc); diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index 55d609872929..fc7aad6ce515 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -715,10 +715,13 @@ class GraphPartitioner { // The final terminal node can already be fused to a OutEWiseFusable group. auto fcond = [](OpPatternKind kind, bool is_sink) { if (!is_sink) { - return kind <= kBroadcast; + // Elemwise, broadcast, and injective ops on the parallel branches + // are allowed be fused to the elemwise/broadcast master. + return kind <= kInjective; } else { return (kind <= kBroadcast || kind == kCommReduce || + kind == kInjective || kind == kOutEWiseFusable); } }; diff --git a/src/schedule/schedule_lang.cc b/src/schedule/schedule_lang.cc index ffee804198b6..e1cb4c5f9bdc 100644 --- a/src/schedule/schedule_lang.cc +++ b/src/schedule/schedule_lang.cc @@ -712,6 +712,10 @@ void ScheduleNode::InitCache() { CHECK_EQ(op2stage_cache_.size(), stages.size()); } +bool ScheduleNode::Contain(const Operation& op) const { + return stage_map.find(op) != stage_map.end(); +} + Schedule ScheduleNode::make(Array ops) { auto n = make_node(); Schedule sch(n); diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py index bdffdf7c129f..6d6781046a10 100644 --- a/tests/python/relay/test_pass_fuse_ops.py +++ b/tests/python/relay/test_pass_fuse_ops.py @@ -23,13 +23,15 @@ def before(): x = relay.var("x", shape=(10, 20)) y = relay.add(x, relay.const(1, "float32")) z = relay.exp(y) - return relay.Function([x], z) + w = relay.squeeze(z) + return relay.Function([x], w) def expected(): x = relay.var("p", shape=(10, 20)) y = relay.add(x, relay.const(1, "float32")) z = relay.exp(y) - f1 = relay.Function([x], z) + w = relay.squeeze(z) + f1 = relay.Function([x], w) x = relay.var("x", shape=(10, 20)) y = relay.Call(f1, [x]) return relay.Function([x], y) @@ -503,6 +505,38 @@ def expected(dshape): assert relay.ir_pass.alpha_equal(zz, after) +def test_fuse_parallel_injective(): + """Test fusing parallel injective ops to an elemwise op.""" + def before(): + x = relay.var("x", shape=(10, 20)) + y = relay.add(x, relay.const(1, "float32")) + z = relay.squeeze(y) + u = relay.transpose(y, axes=[0, 1]) + w = relay.left_shift(z, u) + return relay.Function([x], w) + + def expected(): + x = relay.var("p", shape=(10, 20)) + y = relay.add(x, relay.const(1, "float32")) + z = relay.squeeze(y) + u = relay.transpose(y, axes=[0, 1]) + w = relay.left_shift(z, u) + f1 = relay.Function([x], w) + x = relay.var("x", shape=(10, 20)) + y = relay.Call(f1, [x]) + return relay.Function([x], y) + + z = before() + z = relay.ir_pass.infer_type(z) + zz = relay.ir_pass.fuse_ops(z, opt_level=0) + assert not relay.ir_pass.free_vars(zz) + zz = relay.ir_pass.fuse_ops(z, opt_level=2) + zz = relay.ir_pass.infer_type(zz) + assert not relay.ir_pass.free_vars(zz) + after = relay.ir_pass.infer_type(expected()) + assert relay.ir_pass.alpha_equal(zz, after) + + if __name__ == "__main__": test_fuse_simple() test_conv2d_fuse() 
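The relaxed predicate in fuse_ops.cc above reads more easily as a small decision function. A Python paraphrase only, not the actual implementation; the constants mirror relay's OpPatternKind ordering:

    # kElemWise < kBroadcast < kInjective < kCommReduce < kOutEWiseFusable
    K_ELEMWISE, K_BROADCAST, K_INJECTIVE, K_COMM_REDUCE, K_OUT_EWISE_FUSABLE = range(5)

    def fcond(kind, is_sink):
        """Can an op of `kind` join a group whose master is elemwise/broadcast?"""
        if not is_sink:
            # Parallel branches: injective ops (squeeze, transpose, ...)
            # are now admitted, not just elemwise/broadcast.
            return kind <= K_INJECTIVE
        # The terminal node may additionally be a reduction or an
        # out-elemwise-fusable op such as conv2d.
        return kind <= K_INJECTIVE or kind in (K_COMM_REDUCE, K_OUT_EWISE_FUSABLE)

The test added above exercises exactly the non-sink branch: squeeze and transpose sit on parallel paths off a single add.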
@@ -515,3 +549,4 @@ def expected(dshape): test_tuple_intermediate() test_tuple_consecutive() test_inception_like() + test_fuse_parallel_injective() From 057e796822205a29f6a691f30b5a1c89294c5bf1 Mon Sep 17 00:00:00 2001 From: lixiaoquan Date: Wed, 1 May 2019 23:02:12 +0800 Subject: [PATCH 066/106] [Relay][Tensorflow] Allow an op as loop var. (#3056) --- python/tvm/relay/frontend/tensorflow.py | 35 ++++++++++++++++--- .../frontend/tensorflow/test_control_flow.py | 18 ++++++++++ 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 0f8b19bfb45f..9c312990c379 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -30,6 +30,7 @@ from .. import ir_pass from .. import expr as _expr from .. import op as _op +from ..expr_functor import ExprMutator __all__ = ['from_tensorflow'] @@ -1414,6 +1415,27 @@ def _get_abs_layer_name(node): # 1.x. _control_flow_nodes = ['Merge', 'Switch', 'NextIteration', 'Exit', 'Enter', 'LoopCond'] +class RewriteSubgraph(ExprMutator): + """ + A helper class to rewrite expressions in a while-loop function into variables. + + Parameters + ---------- + rewrite_map : Dict[expr, expr] + A dictionary that maps expressions to variables. + """ + def __init__(self, rewrite_map): + ExprMutator.__init__(self) + self.rewrite_map = rewrite_map + + def visit(self, expr): + if expr in self.rewrite_map: + return self.rewrite_map[expr] + return super().visit(expr) + +def rewrite_subgraph(expr, rewrites): + return RewriteSubgraph(rewrites).visit(expr) + def _in_while_loop(control_flow_node_map, op_name): """ Check if a given control flow operator is part of a while loop execution @@ -1594,14 +1616,17 @@ def _while_loop(self): loop_vars = [] bind_map = {} for i, var in enumerate(self.loop_vars): - assert isinstance(var, _expr.Var), repr(var) - v = tvm.relay.var("loop_var" + str(i), - type_annotation=var.type_annotation) + if not isinstance(var, _expr.Var): + var_type = ir_pass.infer_type(var).checked_type + else: + var_type = var.type_annotation + + v = tvm.relay.var("loop_var" + str(i), type_annotation=var_type) loop_vars.append(v) bind_map[var] = v - self.cond = tvm.relay.bind(self.cond, bind_map) - self.body = [tvm.relay.bind(b, bind_map) for b in self.body] + self.cond = rewrite_subgraph(self.cond, bind_map) + self.body = [rewrite_subgraph(b, bind_map) for b in self.body] cond = tvm.relay.op.min(self.cond) diff --git a/tests/python/frontend/tensorflow/test_control_flow.py b/tests/python/frontend/tensorflow/test_control_flow.py index e76a849ae8c3..b1860658a961 100644 --- a/tests/python/frontend/tensorflow/test_control_flow.py +++ b/tests/python/frontend/tensorflow/test_control_flow.py @@ -51,6 +51,23 @@ def b(i): return tf.add(i, 1) check_equal(graph, tf_out) +def test_callnode_loop_vars(): + graph = tf.Graph() + with graph.as_default(): + i = tf.add(tf.constant(0), 1) + + def c(i): return tf.less(i, 10) + + def b(i): return tf.add(i, 1) + + r = tf.while_loop(c, b, [i]) + + with tf.Session() as sess: + tf_out = sess.run(r) + + check_equal(graph, tf_out) + + def test_loop_2_vars(): graph = tf.Graph() with graph.as_default(): @@ -288,6 +305,7 @@ def condition(x): test_loop_3_vars() test_loop_conditions() test_loop_bodies() + test_callnode_loop_vars() # tf.cond test_vanilla_cond() From b7599cc3fdbe1a35d6ffa15157ba1dc9ebf75ad4 Mon Sep 17 00:00:00 2001 From: songqun Date: Wed, 1 May 2019 08:03:52 -0700 Subject: [PATCH 067/106] [FRONTEND][TFLITE] Add
FULLY_CONNECTED op into tflite frontend, support Inception V4 (#3019) * Add FULLY_CONNECTED op into tflite frontend, support Inception V4 * Fix comment style in TF Lite tests. --- python/tvm/relay/frontend/tflite.py | 68 +++++++++++++++++- tests/python/frontend/tflite/test_forward.py | 76 ++++++++++++++++++-- 2 files changed, 139 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 2225d6c82f8d..ff62d89412e9 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -63,7 +63,8 @@ def __init__(self, model, subgraph, exp_tab): 'SQUEEZE': self.convert_squeeze, 'MAX_POOL_2D': self.convert_max_pool2d, 'CONCATENATION': self.convert_concatenation, - 'ADD': self.convert_add + 'ADD': self.convert_add, + 'FULLY_CONNECTED': self.convert_fully_connected, } def check_unsupported_ops(self): @@ -352,6 +353,71 @@ def convert_add(self, op): out = _op.add(lhs_expr, rhs_expr) return out + def convert_fully_connected(self, op): + """Convert TFLite fully connected""" + try: + from tflite.Operator import Operator + from tflite.FullyConnectedOptions import FullyConnectedOptions + from tflite.BuiltinOptions import BuiltinOptions + from tflite.TensorType import TensorType + from tflite.ActivationFunctionType import ActivationFunctionType + except ImportError: + raise ImportError("The tflite package must be installed") + + assert isinstance(op, Operator) + input_tensors = self.get_input_tensors(op) + assert len(input_tensors) >= 2, "input tensors length should be >= 2" + + input_tensor = input_tensors[0] + input_tensor_idx = input_tensor.tensor_idx + weight_tensor = input_tensors[1] + + input_tensor_shape = input_tensor.tensor.ShapeAsNumpy() + weight_tensor_shape = weight_tensor.tensor.ShapeAsNumpy() + + # reshape input tensor from N H W C to N H*W*C + input_size_per_batch = 1 + for s in range(1, len(input_tensor_shape)): + input_size_per_batch *= input_tensor_shape[s] + assert input_size_per_batch == weight_tensor_shape[1], \ + "input size and weight size are mismatched" + target_shape = tuple((input_tensor_shape[0], input_size_per_batch)) + in_expr = self.get_expr(input_tensor_idx) + in_expr = _op.reshape(in_expr, target_shape) + + assert op.BuiltinOptionsType() == BuiltinOptions.FullyConnectedOptions + op_options = op.BuiltinOptions() + fully_connected_options = FullyConnectedOptions() + fully_connected_options.Init(op_options.Bytes, op_options.Pos) + fused_activation_fn = fully_connected_options.FusedActivationFunction() + + # weight tensor type should be UINT8 (quantization) or FLOAT32 + weight_tensor_type = weight_tensor.tensor.Type() + assert weight_tensor_type in (TensorType.UINT8, TensorType.FLOAT32) + weight_tensor_type_str = self.get_tensor_type_str(weight_tensor_type) + + weight_value = self.get_tensor_value(weight_tensor) + weight_expr = self.exp_tab.new_const(weight_value, dtype=weight_tensor_type_str) + + out = _op.nn.dense(in_expr, weight_expr) + + # if we have bias + if len(input_tensors) == 3: + bias_tensor = input_tensors[2] + bias_tensor_type = bias_tensor.tensor.Type() + # bias tensor type should be INT32 (quantization) or FLOAT32 + assert bias_tensor_type in (TensorType.INT32, TensorType.FLOAT32) + bias_tensor_type_str = self.get_tensor_type_str(bias_tensor_type) + bias_expr = self.exp_tab.new_const(self.get_tensor_value(bias_tensor), + dtype=bias_tensor_type_str) + out = _op.nn.bias_add(out, bias_expr) + + # If we have fused activations + if fused_activation_fn != 
ActivationFunctionType.NONE: + out = self.convert_fused_activation_function(out, fused_activation_fn) + + return out + def convert_squeeze(self, op): """Convert TFLite squeeze""" try: diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 5f95d869ae74..63a345a5a6d5 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -459,12 +459,63 @@ def test_forward_softmax(): """ Softmax """ _test_softmax(np.arange(6.0, dtype=np.float32).reshape((1, 6))) + +####################################################################### +# Fully Connected +# ------- + +def _test_fully_connected(tensor_in_sizes, filter_in_sizes, bias_in_size=None): + """ One iteration of fully connected """ + + total_size_1 = 1 + total_size_2 = 1 + for s in tensor_in_sizes: + total_size_1 *= s + for s in filter_in_sizes: + total_size_2 *= s + # Initializes the input tensor with array containing incrementing + # numbers from 1. + data_array = [f * 1.0 for f in range(1, total_size_1 + 1)] + filter_array = [f * 1.0 for f in range(1, total_size_2 + 1)] + assert int(total_size_1 / tensor_in_sizes[0]) == filter_in_sizes[0], \ + "input size and filter size are mismatched" + + with tf.Graph().as_default(): + in_data = array_ops.placeholder(shape=tensor_in_sizes, dtype='float32') + in_filter = constant_op.constant(filter_array, shape=filter_in_sizes, dtype='float32') + + # reshape N H W C into N H*W*C + in_data_reshape = array_ops.reshape(in_data, [tensor_in_sizes[0], -1]) + + out = math_ops.mat_mul(in_data_reshape, in_filter) + + # if we have bias + if bias_in_size: + assert bias_in_size[0] == filter_in_sizes[1], "bias and filter size are mismatched" + bias_array = [f * 1.0 for f in range(1, bias_in_size[0] + 1)] + in_bias = constant_op.constant(bias_array, shape=bias_in_size, dtype='float32') + out = nn_ops.bias_add(out, in_bias) + + tflite_data_array = np.reshape(data_array, tensor_in_sizes).astype('float32') + tvm_data_array = np.transpose(tflite_data_array, axes=(0, 3, 1, 2)) + compare_tflite_with_tvm(tflite_data_array, tvm_data_array, + 'Placeholder:0', [in_data], [out]) + + +def test_forward_fully_connected(): + """ Fully Connected """ + _test_fully_connected([1, 1, 1, 150], [150, 100]) + _test_fully_connected([1, 1, 1, 150], [150, 100], [100]) + _test_fully_connected([5, 1, 1, 150], [150, 100]) + _test_fully_connected([5, 1, 1, 150], [150, 100], [100]) + + ####################################################################### # Mobilenet # --------- def test_forward_mobilenet_v1(): - '''test mobilenet v1 tflite model''' + """Test the Mobilenet V1 TF Lite model.""" # MobilenetV1 tflite_model_file = tf_testing.get_workload_official( "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz", @@ -479,7 +530,7 @@ def test_forward_mobilenet_v1(): rtol=1e-5, atol=1e-5) def test_forward_mobilenet_v2(): - '''test mobilenet v2 tflite model''' + """Test the Mobilenet V2 TF Lite model.""" # MobilenetV2 tflite_model_file = tf_testing.get_workload_official( "http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224.tgz", @@ -494,11 +545,11 @@ def test_forward_mobilenet_v2(): rtol=1e-5, atol=1e-5) ####################################################################### -# Inception V3 +# Inception # ------------ def test_forward_inception_v3_net(): - '''test inception v3 tflite model''' + """Test the Inception V3 TF Lite model.""" # InceptionV3 tflite_model_file = 
tf_testing.get_workload_official( "https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v3_2018_04_27.tgz", @@ -512,6 +563,21 @@ def test_forward_inception_v3_net(): tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-5) +def test_forward_inception_v4_net(): + """Test the Inception V4 TF Lite model.""" + # InceptionV4 + tflite_model_file = tf_testing.get_workload_official( + "https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v4_2018_04_27.tgz", + "inception_v4.tflite") + with open(tflite_model_file, "rb") as f: + tflite_model_buf = f.read() + data = np.random.uniform(size=(1, 299, 299, 3)).astype('float32') + tvm_data = np.transpose(data, axes=(0, 3, 1, 2)) + tflite_output = run_tflite_graph(tflite_model_buf, data) + tvm_output = run_tvm_graph(tflite_model_buf, tvm_data, 'input') + tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), + rtol=1e-5, atol=1e-5) + ####################################################################### # Main # ---- @@ -525,6 +591,7 @@ def test_forward_inception_v3_net(): test_forward_convolution() test_forward_pooling() test_forward_softmax() + test_forward_fully_connected() # Math test_forward_add() @@ -533,3 +600,4 @@ def test_forward_inception_v3_net(): test_forward_mobilenet_v1() test_forward_mobilenet_v2() test_forward_inception_v3_net() + test_forward_inception_v4_net() From 6750d952665c7a0dbd84e53f98f4974720fa1860 Mon Sep 17 00:00:00 2001 From: Marcus Shawcroft Date: Wed, 1 May 2019 17:05:18 +0100 Subject: [PATCH 068/106] [DOC] various assorted grammar fixes (#3127) * Correct spelling of 'inavlid' * [DOC] correct spelling of 'schdule'. * [DOC] clean up use of abbreviation "interop" * [DOC] capitalize API abbreviation consistently * [DOC] correct spelling of 'peformed'. * [DOC] correct spelling of 'intermidiate' * Remove trailing white space. * Correct spelling of 'parametrization'. * [DOC] minor improvements to Range documentation. --- docs/dev/codebase_walkthrough.rst | 12 ++++++------ python/tvm/container.py | 6 +++--- src/codegen/build_module.cc | 4 ++-- src/codegen/codegen_c_host.cc | 4 ++-- src/codegen/codegen_c_host.h | 4 ++-- src/codegen/llvm/llvm_common.cc | 2 +- tutorials/autotvm/tune_simple_template.py | 2 +- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/dev/codebase_walkthrough.rst b/docs/dev/codebase_walkthrough.rst index 956519b764a4..b7c849554994 100644 --- a/docs/dev/codebase_walkthrough.rst +++ b/docs/dev/codebase_walkthrough.rst @@ -40,10 +40,10 @@ Relay is the new IR for deep networks that is intended to replace NNVM. If you h When a user invokes graph compilation by ``relay.build(...)`` (or ``nnvm.compiler.build(...)`` for the older API), the following sequence of actions happens for each node in the graph: - Look up an operator implementation by querying the operator registry -- Generate a compute expression and a schdule for the operator +- Generate a compute expression and a schedule for the operator - Compile the operator into object code -One of the interesting aspects of TVM codebase is that interop between C++ and Python is not unidirectional. Typically, all code that do heavy liftings are implemented in C++, and Python bindings are provided for user interface. This is also true in TVM, but in TVM codebase, C++ code also call into functions defined in a Python module. 
For example, the convolution operator is implemented in Python, and its implementation is invoked from C++ code in Relay. +One of the interesting aspects of TVM codebase is that interoperability between C++ and Python is not unidirectional. Typically, all code that do heavy liftings are implemented in C++, and Python bindings are provided for user interface. This is also true in TVM, but in TVM codebase, C++ code also call into functions defined in a Python module. For example, the convolution operator is implemented in Python, and its implementation is invoked from C++ code in Relay. ******************************************* Vector Add Example @@ -71,7 +71,7 @@ Here, types of ``A``, ``B``, ``C`` are ``tvm.tensor.Tensor``, defined in ``pytho The Node system is the basis of exposing C++ types to frontend languages, including Python. The way TVM implements Python wrapping is not straightforward. It is briefly covered in `this document `_, and details are in ``python/tvm/_ffi/`` if you are interested. -``Tensor`` is created by functions in ``python/tvm/api.py``, which in turn calls into C++ functions exposed in ``src/api/api_lang.cc``. All C++ functions that are callable from Python are exposed in the ``src/api`` subdirectory. For example, the ``tvm.compute()`` function above calls into ``_ComputeOp`` api exposed in ``src/api/api_lang.cc``: +``Tensor`` is created by functions in ``python/tvm/api.py``, which in turn calls into C++ functions exposed in ``src/api/api_lang.cc``. All C++ functions that are callable from Python are exposed in the ``src/api`` subdirectory. For example, the ``tvm.compute()`` function above calls into ``_ComputeOp`` API exposed in ``src/api/api_lang.cc``: :: @@ -84,7 +84,7 @@ The Node system is the basis of exposing C++ types to frontend languages, includ args[4]); }); -We use ``TVM_REGISTER_*`` macro to expose C++ functions to frontend languages, in the form of `PackedFunc `_. ``PackedFunc`` is another mechanism by which TVM implements C++ and Python interop. In particular, this is what makes calling Python functions from the C++ codebase very easy. +We use ``TVM_REGISTER_*`` macro to expose C++ functions to frontend languages, in the form of `PackedFunc `_. ``PackedFunc`` is another mechanism by which TVM implements interoperability between C++ and Python. In particular, this is what makes calling Python functions from the C++ codebase very easy. A ``Tensor`` object has an ``Operation`` object associated with it, defined in ``python/tvm/tensor.py``, ``include/tvm/operation.h``, and ``src/tvm/op`` subdirectory. A ``Tensor`` is an output of its ``Operation`` object. Each ``Operation`` object has in turn ``input_tensors()`` method, which returns a list of input ``Tensor`` to it. This way we can keep track of dependencies between ``Operation``. @@ -122,7 +122,7 @@ The process of ``tvm.build()`` can be divided into two steps: - Lowering, where a high level, initial loop nest structures are transformed into a final, low level IR - Code generation, where target machine code is generated from the low level IR -Lowering is done by ``tvm.lower()`` function, defined in ``python/tvm/build_module.py``. First, bound inference is peformed, and an initial loop nest structure is created. +Lowering is done by ``tvm.lower()`` function, defined in ``python/tvm/build_module.py``. First, bound inference is performed, and an initial loop nest structure is created. 
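The same entry point is easy to inspect interactively. A minimal sketch of the two-step pipeline just described, using only stock API and vector doubling as a stand-in workload:

    import tvm

    n = tvm.var("n")
    A = tvm.placeholder((n,), name="A")
    B = tvm.compute((n,), lambda i: A[i] * 2.0, name="B")
    s = tvm.create_schedule(B.op)
    # simple_mode=True stops after lowering and returns the IR statement,
    # so the loop nest produced by bound inference can be read directly.
    print(tvm.lower(s, [A, B], simple_mode=True))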
:: @@ -136,7 +136,7 @@ Lowering is done by ``tvm.lower()`` function, defined in ``python/tvm/build_modu stmt = schedule.ScheduleOps(sch, bounds) ... -Bound inference is the process where all loop bounds and sizes of intermidiate buffers are inferred. If you target the CUDA backend and you use shared memory, its required minimum size is automatically determined here. Bound inference is implemented in ``src/schedule/bound.cc``, ``src/schedule/graph.cc`` and ``src/schedule/message_passing.cc``. +Bound inference is the process where all loop bounds and sizes of intermediate buffers are inferred. If you target the CUDA backend and you use shared memory, its required minimum size is automatically determined here. Bound inference is implemented in ``src/schedule/bound.cc``, ``src/schedule/graph.cc`` and ``src/schedule/message_passing.cc``. ``stmt``, which is the output of ``ScheduleOps()``, represents an initial loop nest structure. If you have applied ``reorder`` or ``split`` primitives to your schedule, then the initial loop nest already reflects that changes. ``ScheduleOps()`` is defined in ``src/schedule/schedule_ops.cc``. diff --git a/python/tvm/container.py b/python/tvm/container.py index ebc8376b1872..aedbe95b01b2 100644 --- a/python/tvm/container.py +++ b/python/tvm/container.py @@ -101,10 +101,10 @@ def items(self): @register_node class Range(NodeBase): - """Represent range in TVM. + """Represent a range in TVM. - You do not need to create Range explicitly. - Python list and tuple will be converted automatically to Range in api functions. + You do not need to create a Range explicitly. + Python lists and tuples will be converted automatically to a Range in API functions. """ diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index 65542dd50810..ddda5a93cd36 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/src/codegen/codegen_c_host.cc b/src/codegen/codegen_c_host.cc index 5e33f11d5877..ca7b070a97c7 100644 --- a/src/codegen/codegen_c_host.cc +++ b/src/codegen/codegen_c_host.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/src/codegen/codegen_c_host.h b/src/codegen/codegen_c_host.h index 4f20df4c85e3..23ae185512e1 100644 --- a/src/codegen/codegen_c_host.h +++ b/src/codegen/codegen_c_host.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/src/codegen/llvm/llvm_common.cc b/src/codegen/llvm/llvm_common.cc index f763d7147dd6..0afe1f364f81 100644 --- a/src/codegen/llvm/llvm_common.cc +++ b/src/codegen/llvm/llvm_common.cc @@ -84,7 +84,7 @@ void ParseLLVMTargetOptions(const std::string& target_str, size_t pos = key.find('='); if (pos != std::string::npos) { CHECK_GE(key.length(), pos + 1) - << "inavlid argument " << key; + << "invalid argument " << key; value = key.substr(pos + 1, key.length() - 1); key = key.substr(0, pos); } else { diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py index 3608e9a84427..45f95947341f 100644 --- a/tutorials/autotvm/tune_simple_template.py +++ b/tutorials/autotvm/tune_simple_template.py @@ -63,7 +63,7 @@ # -------------------------------- # In this section, we will rewrite a deterministic tvm schedule code to a # tunable schedule template. You can regard the process of search space definition -# as the parametrization of our existing schedule code. +# as the parameterization of our existing schedule code. # # To begin with, here is how we implement a blocked matrix multiplication in tvm. From 00dd7a0d173fb4b2294d5e0887691e65db78370f Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Thu, 2 May 2019 02:18:15 +0800 Subject: [PATCH 069/106] Fix PRelu layout in Relay (#3013) * Fix PRelu layout in Relay * Fix cpplint * Add PRelu test case --- nnvm/src/top/nn/nn.cc | 1 - src/relay/op/nn/nn.cc | 19 +++++++- .../python/relay/test_pass_alter_op_layout.py | 47 +++++++++++++++++++ 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/nnvm/src/top/nn/nn.cc b/nnvm/src/top/nn/nn.cc index da73d3bcca73..93b01f5c6b50 100644 --- a/nnvm/src/top/nn/nn.cc +++ b/nnvm/src/top/nn/nn.cc @@ -541,7 +541,6 @@ inline bool PReluInferShape(const nnvm::NodeAttrs &attrs, NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, 0, dshape); // The case of parametric relu - CHECK_EQ(dshape.ndim(), 4) << "Input data should be 4D, but got " << dshape.ndim(); CHECK(size_t(param.axis) < dshape.Size()) << "Wrong axis (" << param.axis << ")value."; diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index b8749013867f..c0f36bfa2915 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -238,6 +238,23 @@ bool PReluRel(const Array& types, return true; } +template +Array > PReluInferCorrectLayout( + const Attrs& attrs, + const Array& new_in_layouts, + const Array& old_in_layouts, + const Array> &old_in_shapes) { + + CHECK_EQ(old_in_layouts.size(), 2U); + CHECK_EQ(old_in_shapes.size(), 2U); + Layout data_layout = old_in_layouts[0]; + if (new_in_layouts.defined()) { + CHECK_EQ(new_in_layouts.size(), 2U); + } + return Array >{{data_layout, Layout("C")}, + {data_layout}}; +} + // Positional relay function to create prelu operator used by frontend FFI. Expr MakePRelu(Expr data, Expr alpha, @@ -265,7 +282,7 @@ where :math:`*` is an channelwise multiplication for each sample in the batch. 
.add_argument("alpha", "Tensor", "Input channelwise alpha.") .set_support_level(3) .add_type_rel("PRelu", PReluRel) -.set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) +.set_attr("FInferCorrectLayout", PReluInferCorrectLayout) .set_attr( "FTVMCompute", [](const Attrs& attrs, const Array& inputs, diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index b000bae031d1..2eea1c4ca87a 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -513,6 +513,52 @@ def expected(): assert alpha_equal(a, b), "Actual = \n" + str(a) +def test_alter_layout_prelu(): + """Test PRelu operator""" + def before(): + x = relay.var("x", shape=(1, 64, 56, 56)) + weight = relay.var("weight") + alpha = relay.var("alpha", relay.IncompleteType()) + y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1)) + y = relay.nn.prelu(y, alpha) + y = relay.Function(free_vars(y), y) + return y + + @register_alter_op_layout("nn.conv2d", level=110) + def alter_conv2d(attrs, inputs, tinfos): + data, weight = inputs + new_attrs = dict(attrs) + new_attrs['data_layout'] = 'NCHW16c' + return relay.nn.conv2d(data, weight, **new_attrs) + + def expected(): + x = relay.var("x", shape=(1, 64, 56, 56)) + w = relay.var("weight") + alpha = relay.var("alpha", relay.IncompleteType()) + + y = relay.layout_transform(x, "NCHW", "NCHW16c") + y = relay.nn.conv2d(y, w, + channels=64, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NCHW16c") + y = relay.layout_transform(y, "NCHW16c", "NCHW") + y = relay.nn.prelu(y, alpha) + y = relay.Function(free_vars(y), y) + return y + + a = before() + a = infer_type(a) + a = canonicalize_ops(a) + a = infer_type(a) + a = alter_op_layout(a) + a = infer_type(a) + + b = expected() + b = infer_type(b) + + assert(alpha_equal(a, b)) + if __name__ == "__main__": test_alter_op() @@ -525,3 +571,4 @@ def expected(): test_alter_layout_concatenate() test_alter_layout_nchw_upsamping_op() test_alter_layout_strided_slice() + test_alter_layout_prelu() From d7199433a65b3ea6735924775540f87eb739545f Mon Sep 17 00:00:00 2001 From: Yida Wang Date: Wed, 1 May 2019 13:28:01 -0700 Subject: [PATCH 070/106] Minor addition to graph runtime debug (#3129) * print op names in graph runtime debug * fix lint --- src/runtime/graph/debug/graph_runtime_debug.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc index 23bb3109bf4f..560bf3da238e 100644 --- a/src/runtime/graph/debug/graph_runtime_debug.cc +++ b/src/runtime/graph/debug/graph_runtime_debug.cc @@ -110,7 +110,8 @@ class GraphRuntimeDebug : public GraphRuntime { for (size_t index = 0; index < time_per_op.size(); index++) { if (op_execs_[index]) { time_per_op[index] /= number; - LOG(INFO) << "Op #" << op++ << ": " << time_per_op[index] << " ms/iter"; + LOG(INFO) << "Op #" << op++ << " " << GetNodeName(index) << ": " + << time_per_op[index] << " ms/iter"; } } } From 34e82c0624ba7f21d5aaee177a2cf3738fe6f36c Mon Sep 17 00:00:00 2001 From: Marcus Shawcroft Date: Wed, 1 May 2019 22:13:36 +0100 Subject: [PATCH 071/106] [DOC] Add missing targets to target_name documentation. 
(#3128) --- src/codegen/build_module.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index ddda5a93cd36..92a12a0da1b7 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -45,7 +45,8 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) /*! * \brief Construct a Target node from the given name and options. * \param target_name The major target name. Should be one of -* {"llvm", "cuda", "opencl", "metal", "rocm", "stackvm", "opengl", "ext_dev"} +* {"aocl", "aocl_sw_emu", "c", "cuda", "ext_dev", "hybrid", "llvm", "metal", +* "nvptx", "opencl", "opengl", "rocm", "sdaccel", "stackvm", "vulkan"} * \param options Additional options appended to the target * \return The constructed Target */ From 098b6c0cae212343eaec5e3a6fdd15117d14b416 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 2 May 2019 12:30:28 +0800 Subject: [PATCH 072/106] Update CONTRIBUTORS.md (#3130) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 7cc913190db4..4d4515e09410 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -65,6 +65,7 @@ We do encourage everyone to work anything they are interested in. - [Liangfu Chen](https://github.com/liangfu): @liangfu - [Wei Chen](https://github.com/wweic): @wweic - [Zhi Chen](https://github.com/zhiics): @zhiics +- [Meghan Cowan](https://github.com/cowanmeg): @cowanmeg - [Sergei Grechanik](https://github.com/sgrechanik-h): @sgrechanik-h - [Hao Lu](https://github.com/hlu1): @hlu1 - [Nick Hynes](https://github.com/nhynes): @nhynes From 9e43187f1f07074e6eb5e06054a0235848286aba Mon Sep 17 00:00:00 2001 From: Jared Roesch Date: Thu, 2 May 2019 09:54:52 -0400 Subject: [PATCH 073/106] [Relay][Runtime] Add support for virtual machine Objects (#3120) --- 3rdparty/HalideIR | 2 +- include/tvm/runtime/c_runtime_api.h | 3 +- include/tvm/runtime/object.h | 391 +++++++++++++++++++++++++++ include/tvm/runtime/packed_func.h | 25 ++ python/setup.py | 2 +- python/tvm/_ffi/_ctypes/function.py | 14 + python/tvm/_ffi/_cython/base.pxi | 2 + python/tvm/_ffi/_cython/function.pxi | 33 +++ python/tvm/_ffi/function.py | 15 +- python/tvm/_ffi/runtime_ctypes.py | 2 + src/api/dsl_api.cc | 13 +- src/lang/reflection.cc | 34 ++- src/relay/ir/pretty_printer.cc | 3 + src/runtime/vm/object.cc | 94 +++++++ 14 files changed, 623 insertions(+), 10 deletions(-) create mode 100644 include/tvm/runtime/object.h create mode 100644 src/runtime/vm/object.cc diff --git a/3rdparty/HalideIR b/3rdparty/HalideIR index 55ba1778fd26..a768f2f06279 160000 --- a/3rdparty/HalideIR +++ b/3rdparty/HalideIR @@ -1 +1 @@ -Subproject commit 55ba1778fd264c7507953552d8e51212ed11f748 +Subproject commit a768f2f0627917659a4d7167eee3190469b9d164 diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index 788eaf2019f3..735eb1be11c2 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -112,7 +112,8 @@ typedef enum { kNNVMLast = 20U, // The following section of code is used for non-reserved types. kExtReserveEnd = 64U, - kExtEnd = 128U + kExtEnd = 128U, + kObject = 14U, } TVMTypeCode; /*! diff --git a/include/tvm/runtime/object.h b/include/tvm/runtime/object.h new file mode 100644 index 000000000000..bba9e14aeeb1 --- /dev/null +++ b/include/tvm/runtime/object.h @@ -0,0 +1,391 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file tvm/runtime/object.h + * \brief A managed object in the TVM runtime. + */ +#ifndef TVM_RUNTIME_OBJECT_H_ +#define TVM_RUNTIME_OBJECT_H_ + +#include +#include +#include +#include + +namespace tvm { +namespace runtime { + +template +class ObjectPtr; +class Object; + +enum struct ObjectTag { + /*! \brief The tag of a tensor. */ + kTensor = 0U, + /*! \brief The tag of a closure. */ + kClosure = 1U, + /*! \brief The tag of a structure. */ + kDatatype = 2U, +}; + +std::ostream& operator<<(std::ostream& os, const ObjectTag&); + +struct ObjectCell { + public: + /*! + * \brief The type of object deleter. + * \param The self pointer to the ObjectCell. + */ + typedef void (*FDeleter)(ObjectCell* self); + + /*! \brief The tag of the object. + * + * Describes which type of value + * is represented by this object. + */ + ObjectTag tag; + + /*! + * \brief Increment the reference count. + */ + void IncRef() { ref_counter_.fetch_add(1, std::memory_order_relaxed); } + + /*! + * \brief Decrement the reference count. + */ + void DecRef() { + if (ref_counter_.fetch_sub(1, std::memory_order_release) == 1) { + std::atomic_thread_fence(std::memory_order_acquire); + if (this->deleter_ != nullptr) { + (*this->deleter_)(this); + } + } + } + + protected: + // default constructor and copy constructor + ObjectCell() {} + + explicit ObjectCell(ObjectTag tag) : tag(tag) {} + + // override the copy and assign constructors to do nothing. + // This is to make sure only contents, but not deleter and ref_counter + // are copied when a child class copies itself. + ObjectCell(const ObjectCell& other) { // NOLINT(*) + } + + ObjectCell(ObjectCell&& other) { // NOLINT(*) + } + + ObjectCell& operator=(const ObjectCell& other) { // NOLINT(*) + return *this; + } + + ObjectCell& operator=(ObjectCell&& other) { // NOLINT(*) + return *this; + } + + private: + /*! \brief Internal reference counter */ + std::atomic ref_counter_{0}; + /*! + * \brief deleter of this object to enable customized allocation. + * If the deleter is nullptr, no deletion will be performed. + * The creator of the Node must always set the deleter field properly. + */ + FDeleter deleter_ = nullptr; + + int use_count() const { return ref_counter_.load(std::memory_order_relaxed); } + + // friend declaration + template + friend class ObjectPtr; + + template + friend ObjectPtr MakeObject(Args&&...); +}; + +/*! + * \brief A custom smart pointer for Object. + * must be subclass of NodeBase + * \tparam T the content data type. + */ +template +class ObjectPtr { + public: + /*! \brief default constructor */ + ObjectPtr() {} + /*! \brief default constructor */ + ObjectPtr(std::nullptr_t) {} // NOLINT(*) + /*! 
+ * \brief copy constructor + * \param other The value to be moved + */ + ObjectPtr(const ObjectPtr& other) // NOLINT(*) + : ObjectPtr(other.data_) {} + /*! + * \brief copy constructor + * \param other The value to be moved + */ + template + ObjectPtr(const ObjectPtr& other) // NOLINT(*) + : ObjectPtr(other.data_) { + static_assert(std::is_base_of::value, + "can only assign of child class ObjectPtr to parent"); + } + /*! + * \brief move constructor + * \param other The value to be moved + */ + ObjectPtr(ObjectPtr&& other) // NOLINT(*) + : data_(other.data_) { + other.data_ = nullptr; + } + + /*! + * \brief move constructor + * \param other The value to be moved + */ + template + ObjectPtr(ObjectPtr&& other) // NOLINT(*) + : data_(other.data_) { + static_assert(std::is_base_of::value, + "can only assign of child class ObjectPtr to parent"); + other.data_ = nullptr; + } + + /*! \brief destructor */ + ~ObjectPtr() { this->reset(); } + + /*! + * \brief Swap this array with another Object + * \param other The other Object + */ + void swap(ObjectPtr& other) { // NOLINT(*) + std::swap(data_, other.data_); + } + + /*! + * \return Get the content of the pointer + */ + T* get() const { return static_cast(data_); } + + /*! + * \return The pointer + */ + T* operator->() const { return get(); } + + /*! + * \return The reference + */ + T& operator*() const { // NOLINT(*) + return *get(); + } + + /*! + * \brief copy assignmemt + * \param other The value to be assigned. + * \return reference to self. + */ + ObjectPtr& operator=(const ObjectPtr& other) { // NOLINT(*) + // takes in plane operator to enable copy elison. + // copy-and-swap idiom + ObjectPtr(other).swap(*this); // NOLINT(*) + return *this; + } + + /*! + * \brief move assignmemt + * \param other The value to be assigned. + * \return reference to self. + */ + ObjectPtr& operator=(ObjectPtr&& other) { // NOLINT(*) + // copy-and-swap idiom + ObjectPtr(std::move(other)).swap(*this); // NOLINT(*) + return *this; + } + + /*! \brief reset the content of ptr to be nullptr */ + void reset() { + if (data_ != nullptr) { + data_->DecRef(); + data_ = nullptr; + } + } + + /*! \return The use count of the ptr, for debug purposes */ + int use_count() const { return data_ != nullptr ? data_->use_count() : 0; } + + /*! \return whether the reference is unique */ + bool unique() const { return data_ != nullptr && data_->use_count() == 1; } + + /*! \return Whether two ObjectPtr do not equal each other */ + bool operator==(const ObjectPtr& other) const { return data_ == other.data_; } + + /*! \return Whether two ObjectPtr equals each other */ + bool operator!=(const ObjectPtr& other) const { return data_ != other.data_; } + + /*! \return Whether the pointer is nullptr */ + bool operator==(std::nullptr_t null) const { return data_ == nullptr; } + + /*! \return Whether the pointer is not nullptr */ + bool operator!=(std::nullptr_t null) const { return data_ != nullptr; } + + /* ObjectPtr's support custom allocators. + * + * The below allocator represents the simplest + * possible impl. It can be easily swapped + * for customized executor's, different allocation + * strategies, and so on. + * + * See memory.h for more discussion on NodePtr's + * allocator. + */ + class StdAllocator { + public: + template + static T* New(Args&&... 
args) { + return new T(std::forward(args)...); + } + + static ObjectCell::FDeleter Deleter() { return Deleter_; } + + private: + static void Deleter_(ObjectCell* ptr) { delete static_cast(ptr); } + }; + + template + ObjectPtr As() const { + auto ptr = reinterpret_cast(get()); + return ObjectPtr(ptr); + } + + private: + /*! \brief internal pointer field */ + ObjectCell* data_{nullptr}; + /*! + * \brief constructor from NodeBase + * \param data The node base pointer + */ + // TODO(jroesch): NodePtr design doesn't really work here due to the passing. + public: + explicit ObjectPtr(ObjectCell* data) : data_(data) { + if (data != nullptr) { + data_->IncRef(); + } + } + + private: + template + friend ObjectPtr MakeObject(Args&&...); + template + friend class ObjectPtr; + friend class NDArray; + friend class TVMPODValue_; + friend class TVMArgValue; + friend class TVMRetValue; + friend class RPCWrappedFunc; +}; + +struct TensorCell; +struct DatatypeCell; +struct ClosureCell; + +/*! + * \brief A managed object in the TVM runtime. + * + * For example a tuple, list, closure, and so on. + * + * Maintains a reference count for the object. + */ +class Object { + public: + ObjectPtr ptr_; + explicit Object(ObjectPtr ptr) : ptr_(ptr) {} + explicit Object(ObjectCell* ptr) : ptr_(ptr) {} + Object() : ptr_() {} + Object(const Object& obj) : ptr_(obj.ptr_) {} + ObjectCell* operator->() { return this->ptr_.operator->(); } + + /*! \brief Construct a tensor object. */ + static Object Tensor(const NDArray& data); + /*! \brief Construct a datatype object. */ + static Object Datatype(size_t tag, const std::vector& fields); + /*! \brief Construct a tuple object. */ + static Object Tuple(const std::vector& fields); + /*! \brief Construct a closure object. */ + static Object Closure(size_t func_index, const std::vector& free_vars); + + ObjectPtr AsTensor() const; + ObjectPtr AsDatatype() const; + ObjectPtr AsClosure() const; +}; + +/*! \brief An object containing an NDArray. */ +struct TensorCell : public ObjectCell { + /*! \brief The NDArray. */ + NDArray data; + explicit TensorCell(const NDArray& data) : ObjectCell(ObjectTag::kTensor), data(data) {} +}; + +/*! \brief An object representing a structure or enumeration. */ +struct DatatypeCell : public ObjectCell { + /*! \brief The tag representing the constructor used. */ + size_t tag; + /*! \brief The fields of the structure. */ + std::vector fields; + + DatatypeCell(size_t tag, const std::vector& fields) + : ObjectCell(ObjectTag::kDatatype), tag(tag), fields(fields) {} +}; + +/*! \brief An object representing a closure. */ +struct ClosureCell : public ObjectCell { + /*! \brief The index into the VM function table. */ + size_t func_index; + /*! \brief The free variables of the closure. */ + std::vector free_vars; + + ClosureCell(size_t func_index, const std::vector& free_vars) + : ObjectCell(ObjectTag::kClosure), func_index(func_index), free_vars(free_vars) {} +}; + +/*! \brief Extract the NDArray from a tensor object. */ +NDArray ToNDArray(const Object& obj); + +/*! + * \brief Allocate a node object. + * \param args arguments to the constructor. + * \tparam T the node type. + * \return The NodePtr to the allocated object. + */ +template +inline ObjectPtr MakeObject(Args&&... 
args) { + using Allocator = typename ObjectPtr::StdAllocator; + static_assert(std::is_base_of::value, "MakeObject can only be used to create "); + T* node = Allocator::New(std::forward(args)...); + node->deleter_ = Allocator::Deleter(); + return ObjectPtr(node); +} + +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_OBJECT_H_ diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 654f4e5efe50..9fcefcbbe4b1 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -39,6 +39,7 @@ #include "c_runtime_api.h" #include "module.h" #include "ndarray.h" +#include "object.h" #include "node_base.h" namespace HalideIR { @@ -48,6 +49,7 @@ struct Type; struct Expr; } + // Whether use TVM runtime in header only mode. #ifndef TVM_RUNTIME_HEADER_ONLY #define TVM_RUNTIME_HEADER_ONLY 0 @@ -470,6 +472,11 @@ class TVMPODValue_ { TVM_CHECK_TYPE_CODE(type_code_, kNDArrayContainer); return NDArray(static_cast(value_.v_handle)); } + operator Object() const { + if (type_code_ == kNull) return Object(); + TVM_CHECK_TYPE_CODE(type_code_, kObject); + return Object(static_cast(value_.v_handle)); + } operator TVMContext() const { TVM_CHECK_TYPE_CODE(type_code_, kTVMContext); return value_.v_ctx; @@ -542,6 +549,7 @@ class TVMArgValue : public TVMPODValue_ { using TVMPODValue_::operator DLTensor*; using TVMPODValue_::operator NDArray; using TVMPODValue_::operator TVMContext; + using TVMPODValue_::operator Object; // conversion operator. operator std::string() const { @@ -637,6 +645,7 @@ class TVMRetValue : public TVMPODValue_ { using TVMPODValue_::operator DLTensor*; using TVMPODValue_::operator TVMContext; using TVMPODValue_::operator NDArray; + using TVMPODValue_::operator Object; TVMRetValue(const TVMRetValue& other) : TVMPODValue_() { this->Assign(other); } @@ -733,6 +742,13 @@ class TVMRetValue : public TVMPODValue_ { other.data_ = nullptr; return *this; } + TVMRetValue& operator=(Object other) { + this->Clear(); + type_code_ = kObject; + value_.v_handle = other.ptr_.data_; + other.ptr_.data_ = nullptr; + return *this; + } TVMRetValue& operator=(PackedFunc f) { this->SwitchToClass(kFuncHandle, f); return *this; @@ -828,6 +844,10 @@ class TVMRetValue : public TVMPODValue_ { kNodeHandle, *other.template ptr >()); break; } + case kObject: { + *this = other.operator Object(); + break; + } default: { if (other.type_code() < kExtBegin) { SwitchToPOD(other.type_code()); @@ -875,6 +895,10 @@ class TVMRetValue : public TVMPODValue_ { static_cast(value_.v_handle)->DecRef(); break; } + case kObject: { + static_cast(value_.v_handle)->DecRef(); + break; + } } if (type_code_ > kExtBegin) { #if TVM_RUNTIME_HEADER_ONLY @@ -904,6 +928,7 @@ inline const char* TypeCode2Str(int type_code) { case kFuncHandle: return "FunctionHandle"; case kModuleHandle: return "ModuleHandle"; case kNDArrayContainer: return "NDArrayContainer"; + case kObject: return "Object"; default: LOG(FATAL) << "unknown type_code=" << static_cast(type_code); return ""; } diff --git a/python/setup.py b/python/setup.py index 11cd6642459d..37d4cec3177c 100644 --- a/python/setup.py +++ b/python/setup.py @@ -96,7 +96,7 @@ def config_cython(): library_dirs=library_dirs, libraries=libraries, language="c++")) - return cythonize(ret) + return cythonize(ret, compiler_directives={"language_level": 3}) except ImportError: print("WARNING: Cython is not installed, will compile without cython module") return [] diff --git a/python/tvm/_ffi/_ctypes/function.py b/python/tvm/_ffi/_ctypes/function.py 
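The Python plumbing below mirrors the C++ registration above: a PackedFunc return value tagged with type code 14 (kObject) is wrapped into the new Object class, and Object arguments are unwrapped the same way. A hedged round-trip sketch; both global function names are hypothetical, standing in for any C++ PackedFunc that produces or consumes a runtime::Object:

    import tvm
    from tvm._ffi.function import Object

    f = tvm.get_global_func("vm._make_object")   # hypothetical registration
    obj = f()                                    # returned via TypeCode.OBJECT (14)
    assert isinstance(obj, Object)
    g = tvm.get_global_func("vm._use_object")    # hypothetical consumer
    g(obj)                                       # handle passed back as kObject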
index a000c27a5f60..b674d34bd8fd 100644 --- a/python/tvm/_ffi/_ctypes/function.py +++ b/python/tvm/_ffi/_ctypes/function.py @@ -162,6 +162,9 @@ def _make_tvm_args(args, temp_args): values[i].v_handle = arg.handle type_codes[i] = TypeCode.FUNC_HANDLE temp_args.append(arg) + elif isinstance(arg, ObjectBase): + values[i].v_handle = arg.handle + type_codes[i] = TypeCode.OBJECT else: raise TypeError("Don't know how to handle type %s" % type(arg)) return values, type_codes, num_args @@ -240,12 +243,18 @@ def _handle_return_func(x): handle = FunctionHandle(handle) return _CLASS_FUNCTION(handle, False) +class ObjectBase(object): + __slots__ = ["handle"] + + def __init__(self, handle): + self.handle = handle # setup return handle for function type _node.__init_by_constructor__ = __init_handle_by_constructor__ RETURN_SWITCH[TypeCode.FUNC_HANDLE] = _handle_return_func RETURN_SWITCH[TypeCode.MODULE_HANDLE] = _return_module RETURN_SWITCH[TypeCode.NDARRAY_CONTAINER] = lambda x: _make_array(x.v_handle, False, True) +RETURN_SWITCH[TypeCode.OBJECT] = lambda x: _CLASS_OBJECT(x.v_handle) C_TO_PY_ARG_SWITCH[TypeCode.FUNC_HANDLE] = _wrap_arg_func( _handle_return_func, TypeCode.FUNC_HANDLE) C_TO_PY_ARG_SWITCH[TypeCode.MODULE_HANDLE] = _wrap_arg_func( @@ -255,6 +264,7 @@ def _handle_return_func(x): _CLASS_MODULE = None _CLASS_FUNCTION = None +_CLASS_OBJECT = None def _set_class_module(module_class): """Initialize the module.""" @@ -264,3 +274,7 @@ def _set_class_module(module_class): def _set_class_function(func_class): global _CLASS_FUNCTION _CLASS_FUNCTION = func_class + +def _set_class_object(obj_class): + global _CLASS_OBJECT + _CLASS_OBJECT = obj_class diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi index 44e379de69bb..0734d651e119 100644 --- a/python/tvm/_ffi/_cython/base.pxi +++ b/python/tvm/_ffi/_cython/base.pxi @@ -37,6 +37,7 @@ cdef enum TVMTypeCode: kStr = 11 kBytes = 12 kNDArrayContainer = 13 + kObject = 14 kExtBegin = 15 cdef extern from "tvm/runtime/c_runtime_api.h": @@ -76,6 +77,7 @@ ctypedef DLTensor* DLTensorHandle ctypedef void* TVMStreamHandle ctypedef void* TVMRetValueHandle ctypedef void* TVMFunctionHandle +ctypedef void* ObjectHandle ctypedef void* NodeHandle ctypedef struct TVMNDArrayContainer: diff --git a/python/tvm/_ffi/_cython/function.pxi b/python/tvm/_ffi/_cython/function.pxi index 514d1b7b3dd4..d0e0c2b5a612 100644 --- a/python/tvm/_ffi/_cython/function.pxi +++ b/python/tvm/_ffi/_cython/function.pxi @@ -44,6 +44,7 @@ cdef int tvm_callback(TVMValue* args, if (tcode == kNodeHandle or tcode == kFuncHandle or tcode == kModuleHandle or + tcode == kObject or tcode > kExtBegin): CALL(TVMCbArgToReturn(&value, tcode)) @@ -157,6 +158,9 @@ cdef inline int make_arg(object arg, elif isinstance(arg, _CLASS_MODULE): value[0].v_handle = c_handle(arg.handle) tcode[0] = kModuleHandle + elif isinstance(arg, _CLASS_OBJECT): + value[0].v_handle = c_handle(arg.handle) + tcode[0] = kObject elif isinstance(arg, FunctionBase): value[0].v_handle = (arg).chandle tcode[0] = kFuncHandle @@ -208,6 +212,8 @@ cdef inline object make_ret(TVMValue value, int tcode): fobj = _CLASS_FUNCTION(None, False) (fobj).chandle = value.v_handle return fobj + elif tcode == kObject: + return _CLASS_OBJECT(ctypes_handle(value.v_handle)) elif tcode in _TVM_EXT_RET: return _TVM_EXT_RET[tcode](ctypes_handle(value.v_handle)) @@ -304,8 +310,31 @@ cdef class FunctionBase: FuncCall(self.chandle, args, &ret_val, &ret_tcode) return make_ret(ret_val, ret_tcode) +cdef class ObjectBase: + cdef ObjectHandle 
chandle + + cdef inline _set_handle(self, handle): + if handle is None: + self.chandle = NULL + else: + self.chandle = c_handle(handle) + + property handle: + def __get__(self): + if self.chandle == NULL: + return None + else: + return ctypes.cast(self.chandle, ctypes.c_void_p) + def __set__(self, value): + self._set_handle(value) + + def __init__(self, handle): + self._set_handle(handle) + + _CLASS_FUNCTION = None _CLASS_MODULE = None +_CLASS_OBJECT = None def _set_class_module(module_class): """Initialize the module.""" @@ -315,3 +344,7 @@ def _set_class_module(module_class): def _set_class_function(func_class): global _CLASS_FUNCTION _CLASS_FUNCTION = func_class + +def _set_class_object(obj_class): + global _CLASS_OBJECT + _CLASS_OBJECT = obj_class diff --git a/python/tvm/_ffi/function.py b/python/tvm/_ffi/function.py index 60e7aeb9aec5..509407ae4c58 100644 --- a/python/tvm/_ffi/function.py +++ b/python/tvm/_ffi/function.py @@ -30,19 +30,28 @@ if _FFI_MODE == "ctypes": raise ImportError() if sys.version_info >= (3, 0): - from ._cy3.core import _set_class_function, _set_class_module + from ._cy3.core import _set_class_function, _set_class_module, _set_class_object from ._cy3.core import FunctionBase as _FunctionBase + from ._cy3.core import ObjectBase as _ObjectBase from ._cy3.core import convert_to_tvm_func else: - from ._cy2.core import _set_class_function, _set_class_module + from ._cy2.core import _set_class_function, _set_class_module, _set_class_object from ._cy2.core import FunctionBase as _FunctionBase + from ._cy2.core import ObjectBase as _ObjectBase from ._cy2.core import convert_to_tvm_func except IMPORT_EXCEPT: # pylint: disable=wrong-import-position - from ._ctypes.function import _set_class_function, _set_class_module + from ._ctypes.function import _set_class_function, _set_class_module, _set_class_object + from ._ctypes.function import ObjectBase as _ObjectBase from ._ctypes.function import FunctionBase as _FunctionBase from ._ctypes.function import convert_to_tvm_func +class Object(_ObjectBase): + # TODO(@jroesch): Eventually add back introspection functionality. + pass + +_set_class_object(Object) + FunctionHandle = ctypes.c_void_p class Function(_FunctionBase): diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py index cb30457b64c7..4ede33a63936 100644 --- a/python/tvm/_ffi/runtime_ctypes.py +++ b/python/tvm/_ffi/runtime_ctypes.py @@ -42,8 +42,10 @@ class TypeCode(object): STR = 11 BYTES = 12 NDARRAY_CONTAINER = 13 + OBJECT = 14 EXT_BEGIN = 15 + class TVMByteArray(ctypes.Structure): """Temp data structure for byte array.""" _fields_ = [("data", ctypes.POINTER(ctypes.c_byte)), diff --git a/src/api/dsl_api.cc b/src/api/dsl_api.cc index 5e01383e9cb2..9b91d4fc91dd 100644 --- a/src/api/dsl_api.cc +++ b/src/api/dsl_api.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -92,6 +92,12 @@ struct APIAttrGetter : public AttrVisitor { found_ref_object = true; } } + void Visit(const char* key, runtime::Object* value) final { + if (skey == key) { + *ret = value[0]; + found_ref_object = true; + } + } }; struct APIAttrDir : public AttrVisitor { @@ -127,6 +133,9 @@ struct APIAttrDir : public AttrVisitor { void Visit(const char* key, runtime::NDArray* value) final { names->push_back(key); } + void Visit(const char* key, runtime::Object* value) final { + names->push_back(key); + } }; class DSLAPIImpl : public DSLAPI { diff --git a/src/lang/reflection.cc index 32d521e80621..bc3d2895b811 100644 --- a/src/lang/reflection.cc +++ b/src/lang/reflection.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -53,6 +53,8 @@ inline Type String2Type(std::string s) { return TVMType2Type(runtime::String2TVMType(s)); } +using runtime::Object; +using runtime::ObjectCell; // indexer to index all the nodes class NodeIndexer : public AttrVisitor { public: std::vector<Node*> node_list{nullptr}; std::unordered_map<DLTensor*, size_t> tensor_index; std::vector<DLTensor*> tensor_list; + std::unordered_map<ObjectCell*, size_t> vm_obj_index; + std::vector<ObjectCell*> vm_obj_list; void Visit(const char* key, double* value) final {} void Visit(const char* key, int64_t* value) final {} @@ -73,6 +77,7 @@ class NodeIndexer : public AttrVisitor { void Visit(const char* key, NodeRef* value) final { MakeIndex(value->node_.get()); } + void Visit(const char* key, runtime::NDArray* value) final { DLTensor* ptr = const_cast<DLTensor*>((*value).operator->()); if (tensor_index.count(ptr)) return; @@ -80,6 +85,15 @@ tensor_index[ptr] = tensor_list.size(); tensor_list.push_back(ptr); } + + void Visit(const char* key, Object* value) final { + ObjectCell* ptr = value->ptr_.get(); + if (vm_obj_index.count(ptr)) return; + CHECK_EQ(vm_obj_index.size(), vm_obj_list.size()); + vm_obj_index[ptr] = vm_obj_list.size(); + vm_obj_list.push_back(ptr); + } + // make index of all the children of node void MakeIndex(Node* node) { if (node == nullptr) return; @@ -163,6 +177,7 @@ class JSONAttrGetter : public AttrVisitor { public: const std::unordered_map<Node*, size_t>* node_index_; const std::unordered_map<DLTensor*, size_t>* tensor_index_; + const std::unordered_map<ObjectCell*, size_t>* vm_obj_index_; JSONNode* node_; void Visit(const char* key, double* value) final { @@ -197,6 +212,10 @@ node_->attrs[key] = std::to_string( tensor_index_->at(const_cast<DLTensor*>((*value).operator->()))); } + void Visit(const char* key, Object* value) final { + node_->attrs[key] = std::to_string( + vm_obj_index_->at(value->ptr_.get())); + } // Get the node void Get(Node* node) { if (node == nullptr) {
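These visitors back ``tvm.save_json`` and ``tvm.load_json``: ``NodeIndexer`` assigns each ``ObjectCell`` a stable integer id, ``JSONAttrGetter`` serializes object-valued attributes as that id, and ``JSONAttrSetter`` (next hunk) resolves ids back to cells when deserializing. The node path of the same machinery can be exercised directly from Python; a small round-trip sketch:

.. code:: python

    import tvm

    x = tvm.var("x")
    json_str = tvm.save_json(x + 1)  # graph is walked by the indexer/getter
    expr = tvm.load_json(json_str)   # and rebuilt through the setter
    print(expr)

@@ -250,6 +269,8 @@ class JSONAttrSetter : public AttrVisitor { public: const std::vector<std::shared_ptr<Node> >* node_list_; const std::vector<DLTensor*>* tensor_list_; + const std::vector<ObjectCell*>* vm_obj_list_; +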
JSONNode* node_; std::string GetValue(const char* key) const { @@ -304,6 +325,12 @@ class JSONAttrSetter : public AttrVisitor { CHECK_LE(index, tensor_list_->size()); *value = tensor_list_->at(index); } + void Visit(const char* key, Object* value) final { + size_t index; + ParseValue(key, &index); + CHECK_LE(index, vm_obj_list_->size()); + *value = vm_obj_list_->at(index); + } // set node to be current JSONNode void Set(Node* node) { if (node == nullptr) return; @@ -481,6 +508,9 @@ class NodeAttrSetter : public AttrVisitor { void Visit(const char* key, runtime::NDArray* value) final { *value = GetAttr(key).operator runtime::NDArray(); } + void Visit(const char* key, Object* value) final { + *value = GetAttr(key).operator Object(); + } private: runtime::TVMArgValue GetAttr(const char* key) { diff --git a/src/relay/ir/pretty_printer.cc b/src/relay/ir/pretty_printer.cc index 6a3e5f85c9b2..7a61079204ed 100644 --- a/src/relay/ir/pretty_printer.cc +++ b/src/relay/ir/pretty_printer.cc @@ -775,6 +775,9 @@ class PrettyPrinter::AttrPrinter : public AttrVisitor { void Visit(const char* key, runtime::NDArray* value) final { LOG(FATAL) << "do not allow NDarray as argument"; } + void Visit(const char* key, runtime::Object* obj) final { + LOG(FATAL) << "do not allow Object as argument"; + } private: Doc& doc_; diff --git a/src/runtime/vm/object.cc b/src/runtime/vm/object.cc new file mode 100644 index 000000000000..566e5b032f85 --- /dev/null +++ b/src/runtime/vm/object.cc @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file object.cc + * \brief A managed object in the TVM runtime. 
+ */ + +#include <tvm/logging.h> +#include <tvm/runtime/vm/object.h> +#include <iostream> + +namespace tvm { +namespace runtime { + +std::ostream& operator<<(std::ostream& os, const ObjectTag& tag) { + switch (tag) { + case ObjectTag::kClosure: + os << "Closure"; + break; + case ObjectTag::kDatatype: + os << "Datatype"; + break; + case ObjectTag::kTensor: + os << "Tensor"; + break; + case ObjectTag::kExternalFunc: + os << "ExternalFunction"; + break; + default: + LOG(FATAL) << "Invalid object tag: found " << static_cast<int>(tag); + } + return os; +} + +Object Object::Tensor(const NDArray& data) { + ObjectPtr<ObjectCell> ptr = MakeObject<TensorCell>(data); + return Object(ptr); +} + +Object Object::Datatype(size_t tag, const std::vector<Object>& fields) { + ObjectPtr<ObjectCell> ptr = MakeObject<DatatypeCell>(tag, fields); + return Object(ptr); +} + +Object Object::Tuple(const std::vector<Object>& fields) { return Object::Datatype(0, fields); } + +Object Object::Closure(size_t func_index, const std::vector<Object>& free_vars) { + ObjectPtr<ObjectCell> ptr = MakeObject<ClosureCell>(func_index, free_vars); + return Object(ptr); +} + +ObjectPtr<TensorCell> Object::AsTensor() const { + CHECK(ptr.get()); + CHECK(ptr.get()->tag == ObjectTag::kTensor); + return ptr.As<TensorCell>(); +} + +ObjectPtr<DatatypeCell> Object::AsDatatype() const { + CHECK(ptr.get()); + CHECK(ptr.get()->tag == ObjectTag::kDatatype); + return ptr.As<DatatypeCell>(); +} + +ObjectPtr<ClosureCell> Object::AsClosure() const { + CHECK(ptr.get()); + CHECK(ptr.get()->tag == ObjectTag::kClosure); + return ptr.As<ClosureCell>(); +} + +NDArray ToNDArray(const Object& obj) { + auto tensor = obj.AsTensor(); + return tensor->data; +} + +} // namespace runtime +} // namespace tvm From 85bf2368d16926a1f01f1a87b7588c97a9e8ef23 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 2 May 2019 11:52:13 -0400 Subject: [PATCH 074/106] [LINT] Add more allowed file type --- tests/lint/check_file_type.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/lint/check_file_type.py index 4214d5d21e6c..69322f5efaaa 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -62,6 +62,10 @@ "plist", "xcworkspacedata", "storyboard", + # hw/chisel + "sbt", + "properties", + "v", } # List of file names allowed From 224c0ec7ce8e805c3a55c87e01300683b3b46d58 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Thu, 2 May 2019 08:59:22 -0700 Subject: [PATCH 075/106] Add MXNet converter for RNN layer ops (#3125) --- python/tvm/relay/build_module.py | 10 +- python/tvm/relay/frontend/mxnet.py | 129 +++++++++++++++++--- tests/python/frontend/mxnet/test_forward.py | 49 ++++++++ 3 files changed, 173 insertions(+), 15 deletions(-)
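This commit does two things: ``GraphExecutor._make_executor`` learns to return every field of a tuple-typed result instead of only output 0, and the MXNet frontend gains an ``RNN`` converter that unravels the fused layer into per-timestep cell computations. A quick sketch of the executor change, assuming the usual Relay build environment of this era:

.. code:: python

    import numpy as np
    import tvm
    from tvm import relay

    x = relay.var("x", shape=(2, 2))
    func = relay.Function([x], relay.Tuple([x, x * relay.const(2.0)]))
    intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
    res = intrp.evaluate(func)(np.ones((2, 2), dtype="float32"))
    # res is now a list of two NDArrays rather than a single output
    print(res[0].asnumpy(), res[1].asnumpy())

And a plain-NumPy reference of the LSTM cell the converter emits (same gate order as MXNet: in, forget, transform, out), which can be used to cross-check ``_lstm_cell`` below:

.. code:: python

    import numpy as np

    def lstm_cell_ref(x, h, c, w_i2h, w_h2h, b_i2h, b_h2h):
        sigmoid = lambda v: 1.0 / (1.0 + np.exp(-v))
        gates = x.dot(w_i2h.T) + b_i2h + h.dot(w_h2h.T) + b_h2h
        i, f, g, o = np.split(gates, 4, axis=1)
        next_c = sigmoid(f) * c + sigmoid(i) * np.tanh(g)
        next_h = sigmoid(o) * np.tanh(next_c)
        return next_h, next_c

diff --git a/python/tvm/relay/build_module.py index b16b5e28bf34..a4929d0b839d 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -26,6 +26,7 @@ from ..contrib import graph_runtime as _graph_rt from . import ir_pass from . import expr as _expr +from . import ty as _ty from .backend import interpreter as _interpreter from .backend import graph_runtime_codegen as _graph_gen @@ -427,6 +428,8 @@ def __init__(self, mod, ctx, target): self.target = target def _make_executor(self, func): + ret_type = ir_pass.infer_type(func).ret_type + num_outputs = len(ret_type.fields) if isinstance(ret_type, _ty.TupleType) else 1 graph_json, mod, params = build(func, target=self.target) gmodule = _graph_rt.create(graph_json, mod, self.ctx) if params: @@ -440,7 +443,12 @@ def _graph_wrapper(*args, **kwargs): # Run the module, and fetch the output. gmodule.run() # make a copy so multiple invocation won't hurt perf.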
- return gmodule.get_output(0).copyto(_nd.cpu(0)) + if num_outputs == 1: + return gmodule.get_output(0).copyto(_nd.cpu(0)) + outputs = [] + for i in range(num_outputs): + outputs.append(gmodule.get_output(i).copyto(_nd.cpu(0))) + return outputs return _graph_wrapper diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index f1bf6788ea20..b93bd5b244eb 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -34,6 +34,12 @@ __all__ = ['from_mxnet'] +_activation_map = { + "sigmoid": _op.sigmoid, + "tanh" : _op.tanh, + "relu" : _op.nn.relu +} + def _mx_fully_connected(inputs, attrs): import mxnet as mx units = attrs.get_int("num_hidden") @@ -66,12 +72,6 @@ def _get_channel_axis(layout, op_name): def _mx_activations(inputs, attrs): act_type = attrs.get_str("act_type") assert len(inputs) == 1 - if act_type == "sigmoid": - return _op.sigmoid(inputs[0]) - if act_type == "tanh": - return _op.tanh(inputs[0]) - if act_type == "relu": - return _op.nn.relu(inputs[0]) if act_type == "softrelu": def _stable_softrelu(x): # log(1 + exp(-abs(x))) + relu(x) @@ -80,8 +80,10 @@ def _stable_softrelu(x): return _op.add(_op.log(_op.add(one, exp_neg_abs_x)), _op.nn.relu(x)) return _stable_softrelu(inputs[0]) - raise tvm.error.OpNotImplemented( - 'Operator {} is not supported for frontend MXNet.'.format(act_type)) + if act_type not in _activation_map: + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported for frontend MXNet.'.format(act_type)) + return _activation_map[act_type](inputs[0]) def _mx_compare(new_op, wrapper): @@ -189,7 +191,8 @@ def _pool2d(new_op, is_avg): def _mx_adaptive_avg_pooling(inputs, attrs): output_size = attrs.get_int_tuple("output_size", []) if output_size != (1,): - raise RuntimeError("AdaptiveAvgPooling with output_size other than 1 is not supported yet.") + raise tvm.error.OpAttributeUnimplemented( + "AdaptiveAvgPooling with output_size other than 1 is not supported yet.") return _op.nn.global_avg_pool2d(inputs[0]) @@ -471,7 +474,7 @@ def _mx_take(inputs, attrs): assert len(inputs) == 2 mode = attrs.get_str("mode", "clip") if mode == "raise": - raise RuntimeError("take doesn't support raise mode") + raise tvm.error.OpAttributeUnimplemented("take with raise mode is not supported yet") axis = attrs.get_int("axis", 0) return _op.take(inputs[0], inputs[1].astype("int32"), axis, mode) @@ -571,13 +574,13 @@ def _mx_l2_normalize(inputs, attrs): def _mx_shape_array(inputs, attrs): assert len(inputs) == 1 if attrs.get_int("lhs_begin", None) is not None: - raise RuntimeError("shape_array doesn't support lhs_begin") + raise tvm.error.OpAttributeUnimplemented("shape_array doesn't support lhs_begin") if attrs.get_int("lhs_end", None) is not None: - raise RuntimeError("shape_array doesn't support lhs_end") + raise tvm.error.OpAttributeUnimplemented("shape_array doesn't support lhs_end") if attrs.get_int("rhs_begin", None) is not None: - raise RuntimeError("shape_array doesn't support rhs_begin") + raise tvm.error.OpAttributeUnimplemented("shape_array doesn't support rhs_begin") if attrs.get_int("rhs_end", None) is not None: - raise RuntimeError("shape_array doesn't support rhs_end") + raise tvm.error.OpAttributeUnimplemented("shape_array doesn't support rhs_end") return _op.shape_of(inputs[0], dtype='int64') @@ -657,6 +660,101 @@ def _mx_argsort(inputs, attrs): return _op.argsort(inputs[0], **new_attrs) +def _mx_rnn_param_concat(inputs, _): + # We don't need to concatenate RNN params because we will unravel the RNN op + 
return [inputs] + + +def _mx_rnn_layer(inputs, attrs): + def _rnn_cell(data, states, i2h_weight, h2h_weight, i2h_bias, h2h_bias, activation): + i2h = _op.nn.bias_add(_op.nn.dense(data, i2h_weight), i2h_bias, axis=-1) + h2h = _op.nn.bias_add(_op.nn.dense(states[0], h2h_weight), h2h_bias, axis=-1) + out = _activation_map[activation](i2h + h2h) + return out, [out] + + def _gru_cell(data, states, i2h_weight, h2h_weight, i2h_bias, h2h_bias): + dtype = ir_pass.infer_type(data).checked_type.dtype + i2h = _op.nn.bias_add(_op.nn.dense(data, i2h_weight), i2h_bias, axis=-1) + h2h = _op.nn.bias_add(_op.nn.dense(states[0], h2h_weight), h2h_bias, axis=-1) + i2h_r, i2h_z, i2h = _op.split(i2h, indices_or_sections=3, axis=1) + h2h_r, h2h_z, h2h = _op.split(h2h, indices_or_sections=3, axis=1) + reset_gate = _activation_map["sigmoid"](i2h_r + h2h_r) + update_gate = _activation_map["sigmoid"](i2h_z + h2h_z) + next_h_tmp = _activation_map["tanh"](reset_gate * h2h + i2h) + next_h = (_expr.const(1, dtype) - update_gate) * next_h_tmp + update_gate * states[0] + return next_h, [next_h] + + def _lstm_cell(data, states, i2h_weight, h2h_weight, i2h_bias, h2h_bias): + i2h = _op.nn.bias_add(_op.nn.dense(data, i2h_weight), i2h_bias, axis=-1) + h2h = _op.nn.bias_add(_op.nn.dense(states[0], h2h_weight), h2h_bias, axis=-1) + gates = i2h + h2h + slice_gates = _op.split(gates, indices_or_sections=4, axis=1) + in_gate = _activation_map["sigmoid"](slice_gates[0]) + forget_gate = _activation_map["sigmoid"](slice_gates[1]) + in_transform = _activation_map["tanh"](slice_gates[2]) + out_gate = _activation_map["sigmoid"](slice_gates[3]) + next_c = forget_gate * states[1] + in_gate * in_transform + next_h = out_gate * _activation_map["tanh"](next_c) + return next_h, [next_h, next_c] + + num_layers = attrs.get_int("num_layers", 1) + mode = attrs.get_str("mode") + if mode.startswith("rnn"): + mode, activation = mode.split('_') + assert mode in ["rnn", "gru", "lstm"] + bidirectional = attrs.get_bool("bidirectional", False) + if bidirectional: + raise tvm.error.OpAttributeUnimplemented( + "Bidirectional RNN op is not supported yet") + layout = attrs.get_str("layout", "TNC") + if layout != "TNC": + raise tvm.error.OpAttributeUnimplemented( + "RNN with layout other than TNC is not supported yet") + num_states = 2 if mode == 'lstm' else 1 + assert len(inputs) == num_states + 2 + + seq_data = inputs[0] + concat_weight = inputs[1] + concat_states = inputs[2:] + seq_len = int(ir_pass.infer_type(seq_data).checked_type.shape[0]) + assert len(concat_weight) == num_layers * 4 + + weights = [] + bias = [] + states = [] + for i in range(num_layers): + w = [] + b = [] + s = [] + for j in range(2): + w.append(concat_weight[i*2 + j].args[0]) + b.append(concat_weight[num_layers*2 + i*2 + j].args[0]) + for state in concat_states: + s.append(_op.take(state, _expr.const(i, "int32"), axis=0)) + weights.append(w) + bias.append(b) + states.append(s) + + seq_output = [] + for t in range(seq_len): + data = _op.take(seq_data, _expr.const(t, "int32"), axis=0) + for l in range(num_layers): + if mode == "rnn": + out, new_states = _rnn_cell(data, states[l], *weights[l], *bias[l], activation) + elif mode == "gru": + out, new_states = _gru_cell(data, states[l], *weights[l], *bias[l]) + else: # mode == "lstm" + out, new_states = _lstm_cell(data, states[l], *weights[l], *bias[l]) + states[l] = new_states + data = out + seq_output.append(out) + + outputs = [_op.stack(seq_output, axis=0)] + for i in range(num_states): + outputs.append(_op.stack([s[i] for s in states], 
axis=0)) + return outputs + + # Note: due to attribute conversion constraint # ops in the identity set must be attribute free _identity_list = [ @@ -807,6 +905,9 @@ def _mx_argsort(inputs, attrs): "_contrib_box_nms" : _mx_box_nms, "_contrib_DeformableConvolution" : _mx_deformable_convolution, "_contrib_AdaptiveAvgPooling2D" : _mx_adaptive_avg_pooling, + # NLP + "RNN" : _mx_rnn_layer, + "_rnn_param_concat" : _mx_rnn_param_concat, # List of missing operators that are present in NNVMv1 # TODO(tvm-tvm): support all operators. # diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index d00efb39e16f..067c356830bb 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -527,6 +527,54 @@ def test_forward_bilinear_resize(): mx_sym = mx.sym.contrib.BilinearResize2D(data, height=5, width=10) verify_mxnet_frontend_impl(mx_sym, (1, 2, 3, 4), (1, 2, 5, 10)) +def test_forward_rnn_layer(): + def verify(mode, input_size, seq_len, hidden_size, num_layers, batch=1): + if mode == "rnn": + layer = gluon.rnn.RNN(hidden_size, num_layers) + elif mode == "gru": + layer = gluon.rnn.GRU(hidden_size, num_layers) + else: # mode == "lstm" + layer = gluon.rnn.LSTM(hidden_size, num_layers) + num_states = 2 if mode == "lstm" else 1 + layer.initialize() + + dtype = "float32" + data_np = np.random.uniform(size=(seq_len, batch, input_size)).astype(dtype) + states_np = [] + states_mx = [] + shape_dict = {'data0': data_np.shape} + inputs = {'data0': data_np} + for i in range(num_states): + s = np.random.uniform(size=(num_layers, batch, hidden_size)).astype(dtype) + states_np.append(s) + states_mx.append(mx.nd.array(s)) + shape_dict['data%s' % (i+1)] = s.shape + inputs['data%s' % (i+1)] = s + + layer.hybridize() + mx_out, mx_states = layer(mx.nd.array(data_np), states_mx) + mx_res = [mx_out] + mx_states + mx_sym = layer._cached_graph[1] + mx_params = {} + for name, param in layer.collect_params().items(): + mx_params[name] = param._reduce() + + new_sym, params = relay.frontend.from_mxnet( + mx_sym, shape=shape_dict, arg_params=mx_params) + for target, ctx in ctx_list(): + # only test graph runtime because debug runtime is too slow + for kind in ["graph"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(new_sym)(**inputs, **params) + assert len(op_res) == len(mx_res) + for i, val in enumerate(op_res): + tvm.testing.assert_allclose(val.asnumpy(), mx_res[i].asnumpy(), rtol=1e-3) + + for mode in ["rnn", "gru", "lstm"]: + verify(mode, 64, 10, 64, 1) + verify(mode, 64, 10, 64, 2) + verify(mode, 64, 10, 32, 2) + if __name__ == '__main__': test_forward_mlp() @@ -566,3 +614,4 @@ def test_forward_bilinear_resize(): test_forward_take() test_forward_gather_nd() test_forward_bilinear_resize() + test_forward_rnn_layer() From 4326acc69047de5e66e8698a5cf39f2c84f80275 Mon Sep 17 00:00:00 2001 From: Jared Roesch Date: Thu, 2 May 2019 12:10:34 -0400 Subject: [PATCH 076/106] [Relay][Runtime] Add memory manager for NDArray (#3121) * Add support for custom NDArray memory management Credit to @icemelon9 and @wweic * Fix copy-paste issue * Fix naive allocator.h * Remove buffer field * Apply Wei's suggestions. Co-Authored-By: jroesch * Fix Wei's suggestion * Fix go rts * Break MM dependency * Add docs and clean up diff * Add more docs * Move to VM folder * Fix lint * Remove Go dep. 
* Rename to Empty * Address Haichen's comments --- golang/src/tvm_runtime_pack.cc | 4 +- include/tvm/runtime/ndarray.h | 18 ++++- src/runtime/vm/memory_manager.cc | 75 ++++++++++++++++++++ src/runtime/vm/memory_manager.h | 114 ++++++++++++++++++++++++++++++ src/runtime/vm/naive_allocator.h | 69 ++++++++++++++++++ src/runtime/vm/pooled_allocator.h | 104 +++++++++++++++++++++++++++ 6 files changed, 381 insertions(+), 3 deletions(-) create mode 100644 src/runtime/vm/memory_manager.cc create mode 100644 src/runtime/vm/memory_manager.h create mode 100644 src/runtime/vm/naive_allocator.h create mode 100644 src/runtime/vm/pooled_allocator.h diff --git a/golang/src/tvm_runtime_pack.cc index 70376cb166da..cfbe237fd31c 100644 --- a/golang/src/tvm_runtime_pack.cc +++ b/golang/src/tvm_runtime_pack.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/include/tvm/runtime/ndarray.h index 5133d2861922..9e7814b7f620 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -19,7 +19,7 @@ /*! * \file tvm/runtime/ndarray.h - * \brief Abstract device memory management API + * \brief A device-independent managed NDArray abstraction. */ #ifndef TVM_RUNTIME_NDARRAY_H_ #define TVM_RUNTIME_NDARRAY_H_ @@ -32,6 +32,7 @@ namespace tvm { namespace runtime { + /*! * \brief Managed NDArray. * The array is backed by reference counted blocks. @@ -248,6 +249,7 @@ class NDArray::Container { * The head ptr of this struct can be viewed as DLTensor*. */ DLTensor dl_tensor; + /*! * \brief additional context, reserved for recycling * \note We can attach additional content here @@ -281,6 +283,7 @@ class NDArray::Container { int32_t array_type_code_{0}; /*! \brief The internal reference counter */ std::atomic<int> ref_counter_{0}; + /*! * \brief The shape container, * can be used for shape data. @@ -296,6 +299,19 @@ dl_tensor.strides = nullptr; dl_tensor.byte_offset = 0; } + + Container(void* data, + std::vector<int64_t> shape, + DLDataType dtype, + DLContext ctx) { + dl_tensor.data = data; + shape_ = std::move(shape); + dl_tensor.shape = dmlc::BeginPtr(shape_); + dl_tensor.ndim = static_cast<int>(shape_.size()); + dl_tensor.dtype = dtype; + dl_tensor.ctx = ctx; + } + /*! \brief developer function, increases reference counter */ void IncRef() { ref_counter_.fetch_add(1, std::memory_order_relaxed); diff --git a/src/runtime/vm/memory_manager.cc new file mode 100644 index 000000000000..c2bad38831ec --- /dev/null +++ b/src/runtime/vm/memory_manager.cc @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file tvm/runtime/memory_manager.cc + * \brief Allocate and manage memory for the runtime. + */ +#include <memory> +#include <utility> +#include "memory_manager.h" +#include "naive_allocator.h" +#include "pooled_allocator.h" + +namespace tvm { +namespace runtime { +namespace vm { + +MemoryManager* MemoryManager::Global() { + static MemoryManager memory_manager; + return &memory_manager; +} + +Allocator* MemoryManager::GetAllocator(TVMContext ctx) { + std::lock_guard<std::mutex> lock(mu_); + if (allocators_.find(ctx) == allocators_.end()) { + // LOG(INFO) << "New allocator for " << DeviceName(ctx.device_type) << "(" + // << ctx.device_id << ")"; + std::unique_ptr<Allocator> alloc(new NaiveAllocator(ctx)); + allocators_.emplace(ctx, std::move(alloc)); + } + return allocators_.at(ctx).get(); +} + +static void BufferDeleter(NDArray::Container* ptr) { + CHECK(ptr->manager_ctx != nullptr); + Buffer* buffer = reinterpret_cast<Buffer*>(ptr->manager_ctx); + MemoryManager::Global()->GetAllocator(buffer->ctx)-> + Free(*(buffer)); + delete buffer; + delete ptr; +} + +NDArray Allocator::Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx) { + VerifyDataType(dtype); + NDArray::Container* container = new NDArray::Container(nullptr, shape, dtype, ctx); + container->deleter = BufferDeleter; + size_t size = GetDataSize(container->dl_tensor); + size_t alignment = GetDataAlignment(container->dl_tensor); + Buffer *buffer = new Buffer; + *buffer = this->Alloc(size, alignment, dtype); + container->manager_ctx = reinterpret_cast<void*>(buffer); + container->dl_tensor.data = buffer->data; + return NDArray(container); +} + +} // namespace vm +} // namespace runtime +} // namespace tvm
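``Allocator::Empty`` is the allocator-backed counterpart of the ordinary empty-array path: it fills in the same shape/dtype/ctx fields of ``NDArray::Container`` but sources the storage from an ``Allocator`` and installs ``BufferDeleter`` so the buffer is handed back to its allocator when the reference count drops to zero. The user-visible analogue of the fields it populates, as a sketch using the default (non-VM) allocation path:

.. code:: python

    import tvm

    # Same Container fields (shape, dtype, ctx) that Allocator::Empty fills in,
    # here allocated through the ordinary DeviceAPI path.
    a = tvm.nd.empty((2, 3), dtype="float32", ctx=tvm.cpu(0))
    print(a.shape, a.dtype, a.ctx)

diff --git a/src/runtime/vm/memory_manager.h new file mode 100644 index 000000000000..2fd1f4995c44 --- /dev/null +++ b/src/runtime/vm/memory_manager.h @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*!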
+ * Copyright (c) 2019 by Contributors + * \file src/runtime/memory_manager.h + * \brief Abstract device memory management API + */ +#ifndef TVM_RUNTIME_VM_MEMORY_MANAGER_H_ +#define TVM_RUNTIME_VM_MEMORY_MANAGER_H_ + +#include <tvm/runtime/c_runtime_api.h> +#include <tvm/runtime/ndarray.h> +#include <functional> +#include <memory> +#include <mutex> +#include <unordered_map> + +namespace std { +template <> +struct hash<TVMContext> { + std::size_t operator()(const TVMContext& ctx) const { + return ((ctx.device_id << 8) | ctx.device_type); + } +}; + +template <> +struct equal_to<TVMContext> { + bool operator()(const TVMContext& lhs, const TVMContext& rhs) const { + return (lhs.device_type == rhs.device_type && lhs.device_id == rhs.device_id); + } +}; + +} // namespace std + +namespace tvm { +namespace runtime { +namespace vm { + +struct Buffer { + /*! \brief The pointer to the allocated block of memory. */ + void* data{nullptr}; + /*! \brief The size of the block. */ + size_t size{0}; + /*! \brief The context of the allocated buffers. */ + TVMContext ctx; +}; + +class Allocator { + public: + Allocator() {} + + /*! \brief Allocate an empty NDArray using the allocator. + * \param shape The shape of the NDArray. + * \param dtype The datatype of the NDArray. + * \param ctx The context where the array is allocated. + * \return The empty NDArray. + */ + NDArray Empty(std::vector<int64_t> shape, + DLDataType dtype, + DLContext ctx); + /*! \brief Allocate a buffer given a size, alignment and type. + * \param nbytes The size of the buffer. + * \param alignment The alignment of the buffer. + * \param type_hint A type hint to the allocator. + * \return A sized allocation in the form of a buffer. + */ + virtual Buffer Alloc(size_t nbytes, size_t alignment, TVMType type_hint) = 0; + /*! \brief Free a buffer allocated by the allocator. + * \param buffer The buffer to free. + */ + virtual void Free(const Buffer& buffer) = 0; + /*! \brief The amount of memory currently allocated. + * \return The amount of memory currently allocated. + */ + virtual size_t UsedMemory() const = 0; + virtual ~Allocator() = default; +}; + +class MemoryManager { + public: + static MemoryManager* Global(); + + Allocator* GetAllocator(TVMContext ctx); + + private: + MemoryManager() {} + + private: + std::mutex mu_; + std::unordered_map<TVMContext, std::unique_ptr<Allocator> > allocators_; +}; + +} // namespace vm +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_VM_MEMORY_MANAGER_H_ diff --git a/src/runtime/vm/naive_allocator.h new file mode 100644 index 000000000000..b4e2ee5d4890 --- /dev/null +++ b/src/runtime/vm/naive_allocator.h @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*!
+ * Copyright (c) 2019 by Contributors + * \file src/runtime/naive_allocator.h + */ +#ifndef TVM_RUNTIME_VM_NAIVE_ALLOCATOR_H_ +#define TVM_RUNTIME_VM_NAIVE_ALLOCATOR_H_ + +#include <tvm/runtime/device_api.h> +#include <atomic> + +#include "memory_manager.h" + +namespace tvm { +namespace runtime { +namespace vm { + +class NaiveAllocator final : public Allocator { + public: + explicit NaiveAllocator(TVMContext ctx) : Allocator(), used_memory_(0), ctx_(ctx) {} + + Buffer Alloc(size_t nbytes, size_t alignment, TVMType type_hint) override { + Buffer buf; + buf.ctx = ctx_; + buf.size = nbytes; + buf.data = DeviceAPI::Get(ctx_)->AllocDataSpace(ctx_, nbytes, alignment, type_hint); + used_memory_.fetch_add(nbytes, std::memory_order_relaxed); + DLOG(INFO) << "allocate " << nbytes << " B, used memory " << used_memory_ << " B"; + return buf; + } + + void Free(const Buffer& buffer) override { + DeviceAPI::Get(ctx_)->FreeDataSpace(buffer.ctx, buffer.data); + used_memory_.fetch_sub(buffer.size, std::memory_order_relaxed); + DLOG(INFO) << "free " << buffer.size << " B, used memory " << used_memory_ << " B"; + } + + size_t UsedMemory() const override { + return used_memory_.load(std::memory_order_relaxed); + } + + private: + std::atomic<size_t> used_memory_; + TVMContext ctx_; +}; + +} // namespace vm +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_VM_NAIVE_ALLOCATOR_H_ diff --git a/src/runtime/vm/pooled_allocator.h new file mode 100644 index 000000000000..4a9dc6ab9952 --- /dev/null +++ b/src/runtime/vm/pooled_allocator.h @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*!
+ * Copyright (c) 2019 by Contributors + * \file runtime/pooled_allocator.h + */ +#ifndef TVM_RUNTIME_VM_POOLED_ALLOCATOR_H_ +#define TVM_RUNTIME_VM_POOLED_ALLOCATOR_H_ + +#include <tvm/runtime/device_api.h> +#include <atomic> +#include <mutex> +#include <unordered_map> +#include <vector> + +#include "memory_manager.h" + +namespace tvm { +namespace runtime { +namespace vm { + +class PooledAllocator final : public Allocator { + public: + static constexpr size_t kDefaultPageSize = 4096; + + explicit PooledAllocator(TVMContext ctx, size_t page_size = kDefaultPageSize) + : Allocator(), page_size_(page_size), used_memory_(0), ctx_(ctx) {} + + ~PooledAllocator() { ReleaseAll(); } + + Buffer Alloc(size_t nbytes, size_t alignment, TVMType type_hint) override { + std::lock_guard<std::mutex> lock(mu_); + size_t size = ((nbytes + page_size_ - 1) / page_size_) * page_size_; + auto&& it = memory_pool_.find(size); + if (it != memory_pool_.end() && !it->second.empty()) { + auto&& pool = it->second; + auto ret = pool.back(); + pool.pop_back(); + return ret; + } + Buffer buf; + buf.ctx = ctx_; + buf.size = size; + buf.data = DeviceAPI::Get(ctx_)->AllocDataSpace(ctx_, size, alignment, type_hint); + used_memory_.fetch_add(size, std::memory_order_relaxed); + DLOG(INFO) << "allocate " << size << " B, used memory " << used_memory_ << " B"; + return buf; + } + + void Free(const Buffer& buffer) override { + std::lock_guard<std::mutex> lock(mu_); + if (memory_pool_.find(buffer.size) == memory_pool_.end()) { + memory_pool_.emplace(buffer.size, std::vector<Buffer>{}); + } + memory_pool_.at(buffer.size).push_back(buffer); + DLOG(INFO) << "reclaim buffer " << buffer.size; + } + + size_t UsedMemory() const override { return used_memory_.load(std::memory_order_relaxed); } + + private: + void ReleaseAll() { + std::lock_guard<std::mutex> lock(mu_); + for (auto const& it : memory_pool_) { + auto const& pool = it.second; + for (auto const& buf : pool) { + DeviceAPI::Get(buf.ctx)->FreeDataSpace(buf.ctx, buf.data); + } + } + memory_pool_.clear(); + used_memory_ = 0; + DLOG(INFO) << "release all buffers"; + } + + private: + size_t page_size_; + std::atomic<size_t> used_memory_; + std::unordered_map<size_t, std::vector<Buffer> > memory_pool_; + std::mutex mu_; + TVMContext ctx_; +}; + +} // namespace vm +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_VM_POOLED_ALLOCATOR_H_
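``PooledAllocator`` rounds every request up to a whole number of pages and recycles freed buffers by size class, trading some memory slack for fewer device allocations. A quick check of the rounding expression used in ``Alloc``:

.. code:: python

    # Mirrors size = ((nbytes + page_size_ - 1) / page_size_) * page_size_
    page = 4096
    for nbytes in (1, 4095, 4096, 4097):
        size = ((nbytes + page - 1) // page) * page
        print(nbytes, "->", size)  # 1 -> 4096, 4095 -> 4096, 4096 -> 4096, 4097 -> 8192

From 8e077461029fc6926cb2c83129de617c5592f3df Mon Sep 17 00:00:00 2001 From: Marcus Shawcroft Date: Thu, 2 May 2019 17:11:37 +0100 Subject: [PATCH 077/106] [DOC] Various documentation improvements (#3133) --- docs/contribute/code_guide.rst | 4 ++-- docs/install/docker.rst | 4 ++-- docs/install/from_source.rst | 10 +++++----- docs/install/index.rst | 2 +- include/tvm/schedule.h | 10 +++++----- python/tvm/tensor.py | 6 +++--- tutorials/autotvm/tune_conv2d_cuda.py | 2 +- tutorials/autotvm/tune_relay_arm.py | 12 ++++++------ tutorials/autotvm/tune_relay_cuda.py | 4 ++-- tutorials/autotvm/tune_relay_mobile_gpu.py | 10 +++++----- tutorials/autotvm/tune_relay_x86.py | 8 ++++---- tutorials/autotvm/tune_simple_template.py | 8 ++++---- tutorials/cross_compilation_and_rpc.py | 6 +++--- tutorials/frontend/deploy_model_on_android.py | 6 +++--- tutorials/frontend/deploy_model_on_rasp.py | 2 +- tutorials/frontend/deploy_ssd_gluoncv.py | 2 +- tutorials/frontend/from_caffe2.py | 4 ++-- tutorials/frontend/from_tensorflow.py | 2 +- tutorials/frontend/from_tflite.py | 2 +- tutorials/language/extern_op.py | 6 +++--- tutorials/language/scan.py | 2 +- tutorials/tensor_expr_get_started.py | 8 ++++---- 22 files changed, 60 insertions(+), 60 deletions(-) diff --git a/docs/contribute/code_guide.rst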
b/docs/contribute/code_guide.rst index f9bd61c375d3..6a426b8277a0 100644 --- a/docs/contribute/code_guide.rst +++ b/docs/contribute/code_guide.rst @@ -20,7 +20,7 @@ Code Guide and Tips =================== -This is a document used to record tips in tvm codebase for reviewers and contributors. +This is a document used to record tips in TVM codebase for reviewers and contributors. Most of them are summarized through lessons during the contributing and process. @@ -42,7 +42,7 @@ Python Code Styles Handle Integer Constant Expression ---------------------------------- -We often need to handle constant integer expressions in tvm. Before we do so, the first question we want to ask is that is it really necessary to get a constant integer. If symbolic expression also works and let the logic flow, we should use symbolic expression as much as possible. So the generated code works for shapes that are not known ahead of time. +We often need to handle constant integer expressions in TVM. Before we do so, the first question we want to ask is that is it really necessary to get a constant integer. If symbolic expression also works and let the logic flow, we should use symbolic expression as much as possible. So the generated code works for shapes that are not known ahead of time. Note that in some cases we cannot know certain information, e.g. sign of symbolic variable, it is ok to make assumptions in certain cases. While adding precise support if the variable is constant. diff --git a/docs/install/docker.rst b/docs/install/docker.rst index f4236d7a29cd..eb7331c0a1b7 100644 --- a/docs/install/docker.rst +++ b/docs/install/docker.rst @@ -19,13 +19,13 @@ Docker Images ============= -We provide several prebuilt docker images to quickly try out tvm. +We provide several prebuilt docker images to quickly try out TVM. These images are also helpful run through TVM demo and tutorials. You can get the docker images via the following steps. We need `docker `_ and `nvidia-docker `_ if we want to use cuda. -First, clone tvm repo to get the auxiliary scripts +First, clone TVM repo to get the auxiliary scripts .. code:: bash diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst index 62f669ec77b4..3a769dee2dce 100644 --- a/docs/install/from_source.rst +++ b/docs/install/from_source.rst @@ -19,13 +19,13 @@ Install from Source =================== -This page gives instructions on how to build and install the tvm package from +This page gives instructions on how to build and install the TVM package from scratch on various systems. It consists of two steps: 1. First build the shared library from the C++ codes (`libtvm.so` for linux, `libtvm.dylib` for macOS and `libtvm.dll` for windows). 2. Setup for the language packages (e.g. Python Package). -To get started, clone tvm repo from github. It is important to clone the submodules along, with ``--recursive`` option. +To get started, clone TVM repo from github. It is important to clone the submodules along, with ``--recursive`` option. .. code:: bash @@ -63,7 +63,7 @@ The minimal building requirements are - If you want to use the NNVM compiler, then LLVM is required We use cmake to build the library. -The configuration of tvm can be modified by `config.cmake`. +The configuration of TVM can be modified by `config.cmake`. - First, check the cmake in your system. If you do not have cmake, @@ -111,7 +111,7 @@ Building on Windows TVM support build via MSVC using cmake. The minimum required VS version is **Visual Studio Community 2015 Update 3**. 
In order to generate the VS solution file using cmake, -make sure you have a recent version of cmake added to your path and then from the tvm directory: +make sure you have a recent version of cmake added to your path and then from the TVM directory: .. code:: bash @@ -159,7 +159,7 @@ Method 1 Method 2 - Install tvm python bindings by `setup.py`: + Install TVM python bindings by `setup.py`: .. code:: bash diff --git a/docs/install/index.rst b/docs/install/index.rst index 560811b5f78e..f1caec14e68b 100644 --- a/docs/install/index.rst +++ b/docs/install/index.rst @@ -19,7 +19,7 @@ Installation ============ To install TVM, please read :ref:`install-from-source`. If you are interested in deploying to mobile/embedded devices, -you do not need to install the entire tvm stack on your device, +you do not need to install the entire TVM stack on your device, instead, you only need the runtime, please read :ref:`deploy-and-integration`. If you would like to quickly try out TVM or do demo/tutorials, checkout :ref:`docker-images` diff --git a/include/tvm/schedule.h b/include/tvm/schedule.h index 6c2a759db471..774d7cd9a40a 100644 --- a/include/tvm/schedule.h +++ b/include/tvm/schedule.h @@ -94,10 +94,10 @@ class Stage : public NodeRef { */ EXPORT Stage& compute_root(); // NOLINT(*) /*! - * \brief Bind the ivar to thread index. + * \brief Bind the IterVar to thread index. * - * \param ivar The IterVar to be binded. - * \param thread_ivar The thread axis to be binded. + * \param ivar The IterVar to be bound. + * \param thread_ivar The thread axis to be bound. * \return reference to self. */ EXPORT Stage& bind(IterVar ivar, IterVar thread_ivar); @@ -107,7 +107,7 @@ class Stage : public NodeRef { * need one of them to do the store. * * \note This is a dangerous scheduling primitive that can change behavior of program. - * Only do when we are certain that thare are duplicated store. + * Only do when we are certain that thare are duplicated stores. * \param predicate The condition to be checked. * \return reference to self. */ @@ -155,7 +155,7 @@ class Stage : public NodeRef { * \param p_target The result target domain. * * \note axes can be an empty array, - * in that case, a singleton itervar is created and + * in that case, a singleton IterVar is created and * inserted to the outermost loop. * The fuse of empty array is used to support zero-dimension tensors. 
* diff --git a/python/tvm/tensor.py b/python/tvm/tensor.py index 1e297a471863..ce7cbae385d9 100644 --- a/python/tvm/tensor.py +++ b/python/tvm/tensor.py @@ -110,7 +110,7 @@ def op(self): @property def value_index(self): - """The output value index the tensor corressponds to.""" + """The output value index the tensor corresponds to.""" return self.__getattr__("value_index") @property @@ -128,7 +128,7 @@ def name(self): class Operation(NodeBase): - """Represent an operation that generate a tensor""" + """Represent an operation that generates a tensor""" def output(self, index): """Get the index-th output of the operation @@ -197,7 +197,7 @@ def scan_axis(self): @register_node class ExternOp(Operation): - """Extern operation.""" + """External operation.""" @register_node diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py index 7124ad0a8fbb..a367c9925900 100644 --- a/tutorials/autotvm/tune_conv2d_cuda.py +++ b/tutorials/autotvm/tune_conv2d_cuda.py @@ -34,7 +34,7 @@ # # pip3 install --user psutil xgboost tornado # -# To make tvm run faster in tuning, it is recommended to use cython +# To make TVM run faster in tuning, it is recommended to use cython # as FFI of tvm. In the root directory of tvm, execute # # .. code-block:: bash diff --git a/tutorials/autotvm/tune_relay_arm.py b/tutorials/autotvm/tune_relay_arm.py index 0f5ab8237461..2c1dca9921eb 100644 --- a/tutorials/autotvm/tune_relay_arm.py +++ b/tutorials/autotvm/tune_relay_arm.py @@ -27,7 +27,7 @@ The template has many tunable knobs (tile factor, vectorization, unrolling, etc). We will tune all convolution and depthwise convolution operators in the neural network. After tuning, we produce a log file which stores -the best knob values for all required operators. When the tvm compiler compiles +the best knob values for all required operators. When the TVM compiler compiles these operators, it will query this log file to get the best knob values. We also released pre-tuned parameters for some arm devices. You can go to @@ -45,8 +45,8 @@ # # pip3 install --user psutil xgboost tornado # -# To make tvm run faster during tuning, it is recommended to use cython -# as FFI of tvm. In the root directory of tvm, execute +# To make TVM run faster during tuning, it is recommended to use cython +# as FFI of TVM. In the root directory of TVM, execute # (change "3" to "2" if you use python2): # # .. code-block:: bash @@ -134,11 +134,11 @@ def get_network(name, batch_size): # Register devices to RPC Tracker # ----------------------------------- # Now we can register our devices to the tracker. The first step is to -# build tvm runtime for the ARM devices. +# build the TVM runtime for the ARM devices. # # * For Linux: # Follow this section :ref:`build-tvm-runtime-on-device` to build -# tvm runtime on the device. Then register the device to tracker by +# the TVM runtime on the device. Then register the device to tracker by # # .. code-block:: bash # @@ -148,7 +148,7 @@ def get_network(name, batch_size): # # * For Android: # Follow this `readme page `_ to -# install tvm rpc apk on the android device. Make sure you can pass the android rpc test. +# install the TVM RPC APK on the android device. Make sure you can pass the android rpc test. # Then you have already registred your device. During tuning, you have to go to developer option # and enable "Keep screen awake during changing" and charge your phone to make it stable. 
# diff --git a/tutorials/autotvm/tune_relay_cuda.py b/tutorials/autotvm/tune_relay_cuda.py index f8ef71996ff4..571334e8c106 100644 --- a/tutorials/autotvm/tune_relay_cuda.py +++ b/tutorials/autotvm/tune_relay_cuda.py @@ -27,7 +27,7 @@ The template has many tunable knobs (tile factor, unrolling, etc). We will tune all convolution and depthwise convolution operators in the neural network. After tuning, we produce a log file which stores -the best knob values for all required operators. When the tvm compiler compiles +the best knob values for all required operators. When the TVM compiler compiles these operators, it will query this log file to get the best knob values. We also released pre-tuned parameters for some NVIDIA GPUs. You can go to @@ -45,7 +45,7 @@ # # pip3 install --user psutil xgboost tornado # -# To make tvm run faster during tuning, it is recommended to use cython +# To make TVM run faster during tuning, it is recommended to use cython # as FFI of tvm. In the root directory of tvm, execute: # # .. code-block:: bash diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 5b231064e2ac..1e4cf6d52ade 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -27,7 +27,7 @@ The template has many tunable knobs (tile factor, vectorization, unrolling, etc). We will tune all convolution, depthwise convolution and dense operators in the neural network. After tuning, we produce a log file which stores -the best knob values for all required operators. When the tvm compiler compiles +the best knob values for all required operators. When the TVM compiler compiles these operators, it will query this log file to get the best knob values. We also released pre-tuned parameters for some arm devices. You can go to @@ -45,7 +45,7 @@ # # pip3 install --user psutil xgboost tornado # -# To make tvm run faster during tuning, it is recommended to use cython +# To make TVM run faster during tuning, it is recommended to use cython # as FFI of tvm. In the root directory of tvm, execute # (change "3" to "2" if you use python2): # @@ -135,11 +135,11 @@ def get_network(name, batch_size): # Register devices to RPC Tracker # ----------------------------------- # Now we can register our devices to the tracker. The first step is to -# build tvm runtime for the ARM devices. +# build the TVM runtime for the ARM devices. # # * For Linux: # Follow this section :ref:`build-tvm-runtime-on-device` to build -# tvm runtime on the device. Then register the device to tracker by +# the TVM runtime on the device. Then register the device to tracker by # # .. code-block:: bash # @@ -149,7 +149,7 @@ def get_network(name, batch_size): # # * For Android: # Follow this `readme page `_ to -# install tvm rpc apk on the android device. Make sure you can pass the android rpc test. +# install TVM RPC APK on the android device. Make sure you can pass the android RPC test. # Then you have already registred your device. During tuning, you have to go to developer option # and enable "Keep screen awake during changing" and charge your phone to make it stable. # diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index 0fa4e31f2b19..f100a35e5770 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -20,7 +20,7 @@ **Author**: `Yao Wang `_, `Eddie Yan `_ This is a tutorial about how to tune convolution neural network -for x86 cpu. +for x86 CPU. 
""" import os import numpy as np @@ -70,7 +70,7 @@ def get_network(name, batch_size): return net, params, input_shape, output_shape -# Replace "llvm" with the correct target of your cpu. +# Replace "llvm" with the correct target of your CPU. # For example, for AWS EC2 c5 instance with Intel Xeon # Platinum 8000 series, the target should be "llvm -mcpu=skylake-avx512". # For AWS EC2 c4 instance with Intel Xeon E5-2666 v3, it should be @@ -83,7 +83,7 @@ def get_network(name, batch_size): log_file = "%s.log" % model_name # Set number of threads used for tuning based on the number of -# physical cpu cores on your machine. +# physical CPU cores on your machine. num_threads = 1 os.environ["TVM_NUM_THREADS"] = str(num_threads) @@ -91,7 +91,7 @@ def get_network(name, batch_size): ################################################################# # Configure tensor tuning settings and create tasks # ------------------------------------------------- -# To get better kernel execution performance on x86 cpu, +# To get better kernel execution performance on x86 CPU, # we need to change data layout of convolution kernel from # "NCHW" to "NCHWc". To deal with this situation, we define # conv2d_NCHWc operator in topi. We will tune this operator diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py index 45f95947341f..c7eea7f42c0b 100644 --- a/tutorials/autotvm/tune_simple_template.py +++ b/tutorials/autotvm/tune_simple_template.py @@ -38,8 +38,8 @@ # # pip3 install --user psutil xgboost # -# To make tvm run faster in tuning, it is recommended to use cython -# as FFI of tvm. In the root directory of tvm, execute +# To make TVM run faster in tuning, it is recommended to use cython +# as FFI of TVM. In the root directory of TVM, execute # (change "3" to "2" if you use python2): # # .. code-block:: bash @@ -61,7 +61,7 @@ ###################################################################### # Step 1: Define the search space # -------------------------------- -# In this section, we will rewrite a deterministic tvm schedule code to a +# In this section, we will rewrite a deterministic TVM schedule code to a # tunable schedule template. You can regard the process of search space definition # as the parameterization of our existing schedule code. # @@ -288,7 +288,7 @@ def matmul(N, L, M, dtype): logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout)) # There are two steps for measuring a config: build and run. -# By default, we use all cpu cores to compile program. Then measure them sequentially. +# By default, we use all CPU cores to compile program. Then measure them sequentially. # We measure 5 times and take average to reduce variance. measure_option = autotvm.measure_option( builder='local', diff --git a/tutorials/cross_compilation_and_rpc.py b/tutorials/cross_compilation_and_rpc.py index 1872b7dafe74..ea1b88cbf96a 100644 --- a/tutorials/cross_compilation_and_rpc.py +++ b/tutorials/cross_compilation_and_rpc.py @@ -35,7 +35,7 @@ # Build TVM Runtime on Device # --------------------------- # -# The first step is to build tvm runtime on the remote device. +# The first step is to build the TVM runtime on the remote device. # # .. note:: # @@ -43,8 +43,8 @@ # executed on the target device, e.g. Raspberry Pi. And we assume it # has Linux running. # -# Since we do compilation on local machine, the remote device is only used -# for running the generated code. 
We only need to build tvm runtime on +# Since we do compilation on the local machine, the remote device is only used +# for running the generated code. We only need to build the TVM runtime on # the remote device. # # .. code-block:: bash diff --git a/tutorials/frontend/deploy_model_on_android.py b/tutorials/frontend/deploy_model_on_android.py index 6985e3ad793d..a3ea8651b110 100644 --- a/tutorials/frontend/deploy_model_on_android.py +++ b/tutorials/frontend/deploy_model_on_android.py @@ -52,7 +52,7 @@ # docker run --pid=host -h tvm -v $PWD:/workspace \ # -w /workspace -p 9190:9190 --name tvm -it tvm.demo_android bash # -# You are now inside the container. The cloned tvm directory is mounted on /workspace. +# You are now inside the container. The cloned TVM directory is mounted on /workspace. # At this time, mount the 9190 port used by RPC described later. # # .. note:: @@ -74,7 +74,7 @@ # .. # make -j10 # -# After building tvm successfully, Please set PYTHONPATH. +# After building TVM successfully, Please set PYTHONPATH. # # .. code-block:: bash # @@ -106,7 +106,7 @@ # Now we can register our Android device to the tracker. # # Follow this `readme page `_ to -# install tvm rpc apk on the android device. +# install TVM RPC APK on the android device. # # Here is an example of config.mk. I enabled OpenCL and Vulkan. # diff --git a/tutorials/frontend/deploy_model_on_rasp.py b/tutorials/frontend/deploy_model_on_rasp.py index 8015b0b1c89e..c471e8228840 100644 --- a/tutorials/frontend/deploy_model_on_rasp.py +++ b/tutorials/frontend/deploy_model_on_rasp.py @@ -38,7 +38,7 @@ # Build TVM Runtime on Device # --------------------------- # -# The first step is to build tvm runtime on the remote device. +# The first step is to build the TVM runtime on the remote device. # # .. note:: # diff --git a/tutorials/frontend/deploy_ssd_gluoncv.py b/tutorials/frontend/deploy_ssd_gluoncv.py index ff7691c7bf55..f536679183c8 100644 --- a/tutorials/frontend/deploy_ssd_gluoncv.py +++ b/tutorials/frontend/deploy_ssd_gluoncv.py @@ -43,7 +43,7 @@ # To get best inference performance on CPU, change # target argument according to your device and # follow the :ref:`tune_relay_x86` to tune x86 CPU and -# :ref:`tune_relay_arm` for arm cpu. +# :ref:`tune_relay_arm` for arm CPU. # # To get best performance fo SSD on Intel graphics, # change target argument to 'opencl -device=intel_graphics' diff --git a/tutorials/frontend/from_caffe2.py b/tutorials/frontend/from_caffe2.py index 8185767cb038..ceec8c0ad119 100644 --- a/tutorials/frontend/from_caffe2.py +++ b/tutorials/frontend/from_caffe2.py @@ -86,7 +86,7 @@ def transform_image(image): func, params = relay.frontend.from_caffe2(resnet50.init_net, resnet50.predict_net, shape_dict, dtype_dict) # compile the model -# target x86 cpu +# target x86 CPU target = 'llvm' with relay.build_config(opt_level=3): graph, lib, params = relay.build(func, target, params=params) @@ -97,7 +97,7 @@ def transform_image(image): # The process is no different from other examples. 
import tvm from tvm.contrib import graph_runtime -# context x86 cpu, use tvm.gpu(0) if you run on GPU +# context x86 CPU, use tvm.gpu(0) if you run on GPU ctx = tvm.cpu(0) # create a runtime executor module m = graph_runtime.create(graph, lib, ctx) diff --git a/tutorials/frontend/from_tensorflow.py b/tutorials/frontend/from_tensorflow.py index 58f63a0b7e78..8d402820377e 100644 --- a/tutorials/frontend/from_tensorflow.py +++ b/tutorials/frontend/from_tensorflow.py @@ -135,7 +135,7 @@ # Results: # graph: Final graph after compilation. # params: final params after compilation. -# lib: target library which can be deployed on target with tvm runtime. +# lib: target library which can be deployed on target with TVM runtime. with relay.build_config(opt_level=3): graph, lib, params = relay.build(sym, target=target, target_host=target_host, params=params) diff --git a/tutorials/frontend/from_tflite.py b/tutorials/frontend/from_tflite.py index 52ecb65b3689..67edeb8a38de 100644 --- a/tutorials/frontend/from_tflite.py +++ b/tutorials/frontend/from_tflite.py @@ -151,7 +151,7 @@ def extract(path): shape_dict={input_tensor: input_shape}, dtype_dict={input_tensor: input_dtype}) -# targt x86 cpu +# target x86 CPU target = "llvm" with relay.build_module.build_config(opt_level=3): graph, lib, params = relay.build(func, target, params=params) diff --git a/tutorials/language/extern_op.py b/tutorials/language/extern_op.py index 071968ce2b1f..2ad3e3063415 100644 --- a/tutorials/language/extern_op.py +++ b/tutorials/language/extern_op.py @@ -25,7 +25,7 @@ some of the convolution kernels and define the rest of the stages. TVM supports these black box function calls natively. -Specfically, tvm support all the tensor functions that are DLPack compatible. +Specfically, TVM support all the tensor functions that are DLPack compatible. Which means we can call any function with POD types(pointer, int, float) or pointer to DLTensor as argument. """ @@ -46,7 +46,7 @@ # The compute function takes list of symbolic placeholder for the inputs, # list of symbolic placeholder for the outputs and returns the executing statement. # -# In this case we simply call a registered tvm function, which invokes a CBLAS call. +# In this case we simply call a registered TVM function, which invokes a CBLAS call. # TVM does not control internal of the extern array function and treats it as blackbox. # We can further mix schedulable TVM calls that add a bias term to the result. # @@ -95,7 +95,7 @@ # Since we can call into any PackedFunc in TVM. We can use the extern # function to callback into python. # -# The following example registers a python function into tvm runtime system +# The following example registers a python function into TVM runtime system # and use it to complete one stage of the computation. # This makes TVM much more flexible. For example, we can insert front-end # callbacks to inspect the intermediate results or mix customized code diff --git a/tutorials/language/scan.py b/tutorials/language/scan.py index be637fba0f70..2fa9c210ead2 100644 --- a/tutorials/language/scan.py +++ b/tutorials/language/scan.py @@ -77,7 +77,7 @@ ###################################################################### # Build and Verify # ---------------- -# We can build the scan kernel like other tvm kernels, here we use +# We can build the scan kernel like other TVM kernels, here we use # numpy to verify the correctness of the result. 
# fscan = tvm.build(s, [X, s_scan], "cuda", name="myscan") diff --git a/tutorials/tensor_expr_get_started.py b/tutorials/tensor_expr_get_started.py index b066fbad57c6..cdd07d466a37 100644 --- a/tutorials/tensor_expr_get_started.py +++ b/tutorials/tensor_expr_get_started.py @@ -143,10 +143,10 @@ # We provide a minimal array API in Python to aid quick testing and prototyping. # The array API is based on the `DLPack `_ standard. # -# - We first create a gpu context. -# - Then tvm.nd.array copies the data to gpu. +# - We first create a GPU context. +# - Then tvm.nd.array copies the data to the GPU. # - fadd runs the actual computation. -# - asnumpy() copies the gpu array back to cpu and we can use this to verify correctness +# - asnumpy() copies the GPU array back to the CPU, and we can use this to verify correctness # ctx = tvm.context(tgt, 0) @@ -161,7 +161,7 @@ # Inspect the Generated Code # -------------------------- # You can inspect the generated code in TVM. The result of tvm.build -# is a tvm Module. fadd is the host module that contains the host wrapper, +# is a TVM Module. fadd is the host module that contains the host wrapper; # it also contains a device module for the CUDA (GPU) function. # # The following code fetches the device module and prints the content code. From 5775d1a8fc653f0639d0def91c1290ce3a841f39 Mon Sep 17 00:00:00 2001 From: Jessica Davies <46725573+jdavies-huawei@users.noreply.github.com> Date: Fri, 3 May 2019 11:48:18 +0200 Subject: [PATCH 078/106] [DOC] Developer documentation for InferBound pass. (#3126) * Developer documentation for InferBound pass. --- docs/dev/codebase_walkthrough.rst | 5 +- docs/dev/index.rst | 1 + docs/dev/inferbound.rst | 772 ++++++++++++++++++++++++++++++ 3 files changed, 777 insertions(+), 1 deletion(-) create mode 100644 docs/dev/inferbound.rst diff --git a/docs/dev/codebase_walkthrough.rst b/docs/dev/codebase_walkthrough.rst index b7c849554994..788f1f8b50a3 100644 --- a/docs/dev/codebase_walkthrough.rst +++ b/docs/dev/codebase_walkthrough.rst @@ -136,7 +136,10 @@ Lowering is done by ``tvm.lower()`` function, defined in ``python/tvm/build_modu stmt = schedule.ScheduleOps(sch, bounds) ... -Bound inference is the process where all loop bounds and sizes of intermediate buffers are inferred. If you target the CUDA backend and you use shared memory, its required minimum size is automatically determined here. Bound inference is implemented in ``src/schedule/bound.cc``, ``src/schedule/graph.cc`` and ``src/schedule/message_passing.cc``. +Bound inference is the process where all loop bounds and sizes of intermediate buffers are inferred. If you target the CUDA backend and you use shared memory, its required minimum size is automatically determined here. Bound inference is implemented in ``src/schedule/bound.cc``, ``src/schedule/graph.cc`` and ``src/schedule/message_passing.cc``. For more information on how bound inference works, see `InferBound Pass`_. + +.. _InferBound Pass: http://docs.tvm.ai/dev/inferbound.html + ``stmt``, which is the output of ``ScheduleOps()``, represents an initial loop nest structure. If you have applied ``reorder`` or ``split`` primitives to your schedule, then the initial loop nest already reflects those changes. ``ScheduleOps()`` is defined in ``src/schedule/schedule_ops.cc``. 
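+For a quick look at what these two calls produce, bound inference can be replayed directly from Python. The following is a minimal sketch (the tensor names are illustrative only):
+
+::
+
+   import tvm
+
+   n = tvm.var("n")
+   A = tvm.placeholder((n,), name="A")
+   B = tvm.compute((n,), lambda i: A[i] + 1.0, name="B")
+   sch = tvm.create_schedule(B.op).normalize()
+   bounds = tvm.schedule.InferBound(sch)       # map from IterVar to Range
+   stmt = tvm.schedule.ScheduleOps(sch, bounds)
+   print(stmt)                                 # the initial loop nest, with extents filled in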
diff --git a/docs/dev/index.rst b/docs/dev/index.rst index a76e8ec7e8cc..b5818277ca1e 100644 --- a/docs/dev/index.rst +++ b/docs/dev/index.rst @@ -33,3 +33,4 @@ In this part of documentation, we share the rationale for the specific choices m relay_add_op relay_add_pass codebase_walkthrough + inferbound diff --git a/docs/dev/inferbound.rst b/docs/dev/inferbound.rst new file mode 100644 index 000000000000..e16871e3b7cf --- /dev/null +++ b/docs/dev/inferbound.rst @@ -0,0 +1,772 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +******************************************* +InferBound Pass +******************************************* + +The InferBound pass is run after normalize, and before ScheduleOps (see `build_module.py `_). The main job of InferBound is to create the bounds map, which specifies a Range for each IterVar in the program. These bounds are then passed to ScheduleOps, where they are used to set the extents of For loops (see `MakeLoopNest `_) and to set the sizes of allocated buffers (`BuildRealize `_), among other uses. + +The output of InferBound is a map from IterVar to Range: + +.. code:: cpp + + Map<IterVar, Range> InferBound(const Schedule& sch); + +Therefore, let's review the Range and IterVar classes: + +.. code:: cpp + + namespace HalideIR { + namespace IR { + class RangeNode : public Node { + public: + Expr min; + Expr extent; + // remainder omitted + }; + }} + + namespace tvm { + class IterVarNode : public Node { + public: + Range dom; + Var var; + // remainder omitted + }; + } + +Note that IterVarNode also contains a Range ``dom``. This ``dom`` may or may not have a meaningful value, depending on when the IterVar was created. For example, when ``tvm.compute`` is called, an `IterVar is created `_ for each axis and reduce axis, with ``dom`` equal to the shape supplied in the call to ``tvm.compute``. + +On the other hand, when ``tvm.split`` is called, `IterVars are created `_ for the inner and outer axes, but these IterVars are not given a meaningful ``dom`` value. + +In any case, the ``dom`` member of an IterVar is never modified during InferBound. However, keep in mind that the ``dom`` member of an IterVar is sometimes used as the default value for the Ranges InferBound computes. + +We next review some TVM codebase concepts that are required to understand the InferBound pass. + +Recall that InferBound takes one argument, a Schedule. This schedule object, and its members, contains all information about the program being compiled. + +A TVM schedule is composed of Stages. Each stage has exactly one Operation, e.g., a ComputeOp or a TensorComputeOp. Each operation has a list of root_iter_vars, which in the case of ComputeOp, are composed of the axis IterVars and the reduce axis IterVars. Each operation can also contain many other IterVars, but all of them are related by the operation's list of IterVarRelations. Each IterVarRelation represents either a split, fuse or rebase in the schedule. For example, in the case of split, the IterVarRelation specifies the parent IterVar that was split, and the two children IterVars: inner and outer.
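+These objects can be inspected directly from Python, which is a convenient way to follow along with the rest of this document. The snippet below is a sketch only, and the values shown in the comments are approximate:
+
+::
+
+   import tvm
+
+   A = tvm.placeholder((16,), name="A")
+   B = tvm.compute((16,), lambda i: A[i] + 1.0, name="B")
+   s = tvm.create_schedule(B.op)
+   xo, xi = s[B].split(B.op.axis[0], factor=4)
+   print(s[B].all_iter_vars)   # roughly: [i, i.outer, i.inner]
+   print(s[B].leaf_iter_vars)  # roughly: [i.outer, i.inner]
+   print(s[B].relations)       # roughly: [split(parent=i, outer=i.outer, inner=i.inner)]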
+ +.. code:: cpp + + namespace tvm { + class ScheduleNode : public Node { + public: + Array<Operation> outputs; + Array<Stage> stages; + Map<Operation, Stage> stage_map; + // remainder omitted + }; + + class StageNode : public Node { + public: + Operation op; + Operation origin_op; + Array<IterVar> all_iter_vars; + Array<IterVar> leaf_iter_vars; + Array<IterVarRelation> relations; + // remainder omitted + }; + + class OperationNode : public Node { + public: + virtual Array<IterVar> root_iter_vars(); + virtual Array<Tensor> InputTensors(); + // remainder omitted + }; + + class ComputeOpNode : public OperationNode { + public: + Array<IterVar> axis; + Array<IterVar> reduce_axis; + Array<Expr> body; + Array<IterVar> root_iter_vars(); + // remainder omitted + }; + } + +Tensors haven't been mentioned yet, but in the context of TVM, a Tensor represents the output of an operation. + +.. code:: cpp + + class TensorNode : public Node { + public: + // The source operation, can be None + // This Tensor is output by this op + Operation op; + // The output index from the source operation + int value_index; + }; + +In the Operation class declaration above, we can see that each operation also has a list of InputTensors. Thus the stages of the schedule form a DAG, where each stage is a node in the graph. There is an edge in the graph from Stage A to Stage B, if the operation of Stage B has an input tensor whose source operation is the op of Stage A. Put simply, there is an edge from A to B, if B consumes a tensor produced by A. See the diagram below. This graph is created at the beginning of InferBound, by a call to `CreateReadGraph `_. + +.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/stage_graph.png + :align: center + :scale: 70% + +InferBound makes one pass through the graph, visiting each stage exactly once. InferBound starts from the output stages (i.e., the solid blue nodes in the graph above), and moves upwards (in the opposite direction of the edges). This is achieved by performing a reverse topological sort on the nodes of the graph. Therefore, when InferBound visits a stage, each of its consumer stages has already been visited. + +.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/inferbound_traversal.png + :align: center + :scale: 70% + +The InferBound pass is shown in the following pseudo-code: + +.. code:: cpp + + Map<IterVar, Range> InferBound(const Schedule& sch) { + Array<Operation> outputs = sch->get_outputs(); + G = CreateGraph(outputs); + stage_list = sch->reverse_topological_sort(G); + Map<IterVar, Range> rmap; + for (Stage s in stage_list) { + InferRootBound(s, &rmap); + PassDownDomain(s, &rmap); + } + return rmap; + } + +The InferBound pass has two interesting properties that are not immediately obvious: + +1. After InferBound visits a stage, the ranges of all IterVars in the stage will be set in ``rmap``. +2. The Range of each IterVar is only set once in ``rmap``, and then never changed. + +So it remains to explain what InferBound does when it visits a stage. As can be seen in the pseudo-code above, InferBound calls two functions on each stage: InferRootBound and PassDownDomain. The purpose of InferRootBound is to set the Range (in ``rmap``) of each root_iter_var of the stage (note that InferRootBound does not set the Range of any other IterVar, only those belonging to the root_iter_vars).
The purpose of PassDownDomain is to propagate this information to the rest of the stage's IterVars. When PassDownDomain returns, all IterVars of the stage have known Ranges in ``rmap``. + +The remainder of the document dives into the details of InferRootBound and PassDownDomain. Since PassDownDomain is simpler to describe, we will cover it first. + +.. _IterVarHyperGraph: + +IterVar Hyper-graph +------------------- + +The InferBound pass traverses the stage graph, as described above. However, within each stage is another graph, whose nodes are IterVars. InferRootBound and PassDownDomain perform message-passing on these IterVar graphs. + +Recall that all IterVars of the stage are related by IterVarRelations. The IterVarRelations of a stage form a directed acyclic hyper-graph, where each node of the graph corresponds to an IterVar, and each hyper-edge corresponds to an IterVarRelation. We can also represent this hyper-graph as a DAG, which is simpler to visualize as shown below. + +.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/relations.png + :align: center + :scale: 70% + + +The above diagram shows the IterVar hyper-graph for one stage. The stage has one root_iter_var, ``i``. It has been split, and the resulting inner axis, ``i.inner``, has been split again. The leaf_iter_vars of the stage are shown in green: ``i.outer``, ``i.inner.outer``, and ``i.inner.inner``. + +Message passing functions are named "PassUp" or "PassDown", depending on whether messages are passed from children to their parent in the DAG ("PassUp"), or from the parent to its children ("PassDown"). For example, the large arrow on the left-hand side of the diagram above shows that PassDownDomain sends messages from the root IterVar ``i`` to its children ``i.outer`` and ``i.inner``. + +.. _PassDownDomain: + +PassDownDomain +-------------- +The purpose of PassDownDomain is to take the Ranges produced by InferRootBound for the root_iter_vars, and set the Ranges of all other IterVars in the stage. + +PassDownDomain iterates through the stage's IterVarRelations. There are three possible types of IterVarRelation: split, fuse, and rebase. The most interesting case (since it offers an opportunity for improvement) is IterVarRelations representing splits. + +The Ranges of the inner and outer IterVars of the split are set based on the parent IterVar's known Range, as follows: + +.. code:: cpp + + rmap[split->inner] = Range::make_by_min_extent(0, split->factor) + rmap[split->outer] = Range::make_by_min_extent(0, DivCeil(rmap[split->parent]->extent, split->factor)) + +There is an opportunity here to tighten the bounds produced by InferBound, when ``split->factor`` does not evenly divide the parent's extent. Suppose the parent's extent is 20, and the split factor is 16. Then on the second iteration of the outer loop, the inner loop only needs to perform 4 iterations, not 16. If PassDownDomain could set the extent of ``split->inner`` to ``min(split->factor, rmap[split->parent]->extent - (split->outer * split->factor))``, then the extent of the inner variable would properly adapt, based on which iteration of the outer loop is being executed.
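+To make the split rules above concrete, consider an axis of extent 20 split by a factor of 16 (a sketch using the Python API; the IR in the comments is approximate). InferBound gives ``i.outer`` extent ``DivCeil(20, 16) = 2`` and ``i.inner`` extent 16, so the generated code guards the body with a ``likely`` predicate instead of tightening the inner bound:
+
+::
+
+   import tvm
+
+   A = tvm.compute((20,), lambda i: tvm.const(5, "int32"), name="A")
+   s = tvm.create_schedule(A.op)
+   xo, xi = s[A].split(A.op.axis[0], factor=16)
+   print(tvm.lower(s, [A], simple_mode=True))
+   # produce A {
+   #   for (i.outer, 0, 2) {
+   #     for (i.inner, 0, 16) {
+   #       if (likely(((i.outer*16) + i.inner) < 20)) {
+   #         A[((i.outer*16) + i.inner)] = 5
+   #       }
+   #     }
+   #   }
+   # }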
For Fuse relations, the Range of the fused IterVar is set based on the known Ranges of the inner and outer IterVars, as follows: + +.. code:: cpp + + rmap[fuse->fused] = Range::make_by_min_extent(0, rmap[fuse->outer]->extent * rmap[fuse->inner]->extent) + + +InferRootBound +-------------- + +Recall that InferBound calls InferRootBound, followed by :ref:`PassDownDomain` on each stage in the stage graph. The purpose of InferRootBound is to set the Range of each root_iter_var of the Stage's operation. These Ranges will be propagated to the rest of the stage's IterVars using :ref:`PassDownDomain`. Note that InferRootBound does not set the Range of any other IterVar, only those belonging to the stage's root_iter_vars. + +If the stage is an output stage or placeholder, InferRootBound simply sets the root_iter_var Ranges to their default values. The default Range for a root_iter_var is taken from the ``dom`` member of the IterVar (see the IterVarNode class declaration above). + +Otherwise, InferRootBound iterates through the consumers of the stage. IntSets are created for each of the consumer's IterVars, as follows. IntSets are initialized for the consumer's leaf_iter_vars (Phase 1), and propagated to the consumer's root_iter_vars by PassUpDomain (Phase 2). These IntSets are used to create the TensorDoms of the input tensors of the consumer stage (Phase 3). Finally, once all of the consumers have been processed, InferRootBound calls GatherBound to set the Ranges of the stage's root_iter_vars, based on the TensorDoms (Phase 4). + +This process can seem complicated. One reason is that a stage can have more than one consumer. Each consumer has different requirements, and these must somehow be consolidated. Similarly, the stage may output more than one tensor, and each consumer only uses a particular subset of these tensors. Furthermore, even if a consumer uses a particular tensor, it may not use all elements of the tensor. + +As mentioned above, a consumer may only require a small number of elements from each tensor. The consumers can be thought of as making requests to the stage, for certain regions of its output tensors. The job of Phases 1-3 is to establish the regions of each output tensor that are required by each consumer. + +.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/inferbound_phases.png + :align: center + :scale: 70% + +IntSets +~~~~~~~ + +During InferRootBound, Ranges are converted to IntSets, and message passing is performed over IntSets. Therefore, it is important to understand the difference between Ranges and IntSets. The name "IntSet" suggests it can represent an arbitrary set of integers, e.g., A = \{-10, 0, 10, 12, 13\}. This would certainly be more expressive than a Range, which only represents a set of contiguous integers, e.g., B = \{10,11,12\}. + +However, currently IntSets come in only three varieties: IntervalSets, StrideSets, and ModularSets. IntervalSets, similarly to Ranges, only represent sets of contiguous integers. A StrideSet is defined by a base IntervalSet, a list of strides, and a list of extents. However, StrideSet is unused, and ModularSet is only used by the frontend. + +Therefore, not all sets of integers can be represented by an IntSet in TVM currently. For example, set A in the example above cannot be represented by an IntSet. However, in the future the functionality of IntSet can be extended to handle more general kinds of integer sets, without requiring modification to users of IntSet. + +*InferBound is more complicated for schedules that contain compute_at. 
Therefore, we first explain InferBound for schedules that do not contain compute_at.* + +.. _Phase1: + +Phase 1: Initialize IntSets for consumer's leaf_iter_vars +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: cpp + + /* + * Input: Map<IterVar, Range> rmap: contains the Range for each IterVar of the consumer stage + * Output: Map<IterVar, IntSet> up_state: contains an IntSet for each leaf_iter_var of the consumer + */ + +In Phase 1, IntSets for each of the consumer's leaf_iter_vars are created, based on the Ranges of the leaf_iter_vars from ``rmap``. Recall that the consumer has already been visited by InferBound, so all of its IterVars have known Ranges in ``rmap``. + +There are three cases: + +- Case 1: Extent of leaf var's Range is 1. In this case, the up_state for the leaf is just a single point, equal to the Range's min. +- Case 2: *No relaxation is needed. In this case, the up_state for the leaf is just a single point, defined by the leaf var itself.* +- Case 3: Relaxation is needed. In this case, the leaf's Range is simply converted to an IntSet. + +For simplicity, we assume the schedule does not contain thread axes. In this case, Case 2 is only relevant if the schedule contains compute_at. Please refer to the section :ref:`InferBoundCA` for further explanation. + +.. _Phase2: + +Phase 2: Propagate IntSets from consumer's leaves to consumer's roots +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: cpp + + /* + * Input: Map<IterVar, IntSet> up_state: consumer leaf -> IntSet + * Output: Map<Var, IntSet> dom_map: consumer root -> IntSet + */ + +The purpose of Phase 2 is to propagate the IntSet information from the consumer's leaf_iter_vars to the consumer's root_iter_vars. The result of Phase 2 is another map, ``dom_map``, that contains an IntSet for each of the consumer's root_iter_vars. + +Phase 2 begins by calling PassUpDomain, which visits the IterVarRelations of the consumer stage. In the case of a Split relation, PassUpDomain sets the up_state of the parent IterVar, based on the inner and outer IntSets, as follows: + +- Case 1: The Ranges of outer and inner IterVars match their ``up_state`` domains. In this case, set the parent's ``up_state`` by simply converting the parent's Range to an IntSet. +- Case 2: *Otherwise, the parent's* ``up_state`` *is defined by evaluating* ``outer*f + inner + rmap[parent]->min``, *with respect to the* ``up_state`` *of outer and inner. Here, instead of using the Split relation's factor, TVM uses* ``f = rmap[inner]->extent``. + +Case 2 is only needed if the schedule contains compute_at. Please refer to the section :ref:`InferBoundCA` below, for further explanation. + +After PassUpDomain has finished propagating up_state to all IterVars of the consumer, a fresh map, from root_iter_vars to IntSet, is created. If the schedule does not contain compute_at, the IntSet for root_iter_var ``iv`` is created by the following code: + +.. code:: cpp + + dom_map[iv->var.get()] = IntSet::range(up_state.at(iv).cover_range(iv->dom)); + +Note that if the schedule does not contain compute_at, Phases 1-2 are actually unnecessary. dom_map can be built directly from the known Ranges in rmap. Ranges simply need to be converted to IntSets, which involves no loss of information. + +.. _Phase3: + +Phase 3: Propagate IntSets to consumer's input tensors +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code:: cpp + + /* + * Input: Map<Var, IntSet> dom_map: consumer root -> IntSet + * Output: Map<Tensor, vector<vector<IntSet>>> tmap: output tensor -> vector<vector<IntSet>> + */ + +Note that the consumer's input tensors are output tensors of the stage InferBound is working on. So by establishing information about the consumer's input tensors, we actually obtain information about the stage's output tensors too: the consumers require certain regions of these tensors to be computed. This information can then be propagated through the rest of the stage, eventually obtaining Ranges for the stage's root_iter_vars by the end of Phase 4. + +The output of Phase 3 is tmap, which is a map containing all of the stage's output tensors. Recall that a Tensor is multi-dimensional, with a number of different axes. For each output tensor, and each of that tensor's axes, tmap contains a list of IntSets. Each IntSet in the list is a request from a different consumer. + +Phase 3 is accomplished by calling PropBoundToInputs on the consumer. PropBoundToInputs adds IntSets to tmap's lists for all input Tensors of the consumer. + +The exact behavior of PropBoundToInputs depends on the type of the consumer's operation: ComputeOp, TensorComputeOp, PlaceholderOp, ExternOp, etc. Consider the case of TensorComputeOp. A TensorComputeOp already has a Region for each of its Tensor inputs, defining the slice of the tensor that the operation depends on. For each input tensor i, and dimension j, a request is added to tmap, based on the corresponding dimension in the Region: + +.. code:: cpp + + for (size_t j = 0; j < t.ndim(); ++j) { + // i selects the Tensor t + tmap[i][j].push_back(EvalSet(region[j], dom_map)); + } + +.. _Phase4: + +Phase 4: Consolidate across all consumers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: cpp + + /* + * Input: Map<Tensor, vector<vector<IntSet>>> tmap: output tensor -> vector<vector<IntSet>> + * Output: Map<IterVar, Range> rmap: rmap is populated for all of the stage's root_iter_vars + */ + +Phase 4 is performed by GatherBound, whose behavior depends on the type of operation of the stage. We discuss the ComputeOp case only, but TensorComputeOp is the same. + +A ComputeOp has only a single output Tensor, whose axes correspond to the axis variables of the ComputeOp. The root_iter_vars of a ComputeOp include these axis variables, as well as the reduce_axis variables. If the root IterVar is an axis var, it corresponds to one of the axes of the output Tensor. GatherBound sets the Range of such a root IterVar to the union of all IntSets (i.e., union of all consumer requests) for the corresponding axis of the tensor. If the root IterVar is a reduce_axis, its Range is just set to its default (i.e., the ``dom`` member of IterVarNode). + +.. code:: cpp + + // 'output' selects the output tensor + // i is the dimension + rmap[axis[i]] = arith::Union(tmap[output][i]).cover_range(axis[i]->dom); + +.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/gatherbound.png + :align: center + :scale: 70% + + +The union of IntSets is computed by converting each IntSet to an Interval, and then taking the minimum of all minimums, and the maximum of all of these intervals' maximums. + +.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/union.png + :align: center + :scale: 70% + + +This clearly results in some unnecessary computation, i.e., tensor elements will be computed that are never used. 
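+In pseudo-Python, this union is simply the following (a sketch only; an ``IntSet`` is reduced here to a ``(min, max)`` pair):
+
+::
+
+   def union(intervals):
+       # Keep only the smallest min and the largest max. Any gaps
+       # between the consumer requests are filled in conservatively.
+       lo, hi = intervals[0]
+       for l, h in intervals[1:]:
+           lo, hi = min(lo, l), max(hi, h)
+       return (lo, hi)
+
+   print(union([(0, 1), (3, 4)]))  # (0, 4): the gap 2 is covered too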
+ +Unfortunately, even if we're lucky and the IntervalSet unions do not produce unnecessary computation, the fact that GatherBound considers each dimension of the tensor separately can also cause unnecessary computation. For example, in the diagram below the two consumers A and B require disjoint regions of the 2D tensor: consumer A requires T[0:2, 0:2], and consumer B requires T[2:4, 2:4]. GatherBound operates on each dimension of the tensor separately. For the first dimension of the tensor, GatherBound takes the union of intervals 0:2 and 2:4, producing 0:4 (note that no approximation was required here). Similarly for the second dimension of the tensor. Therefore, the dimension-wise union of these two requests is T[0:4, 0:4]. So GatherBound will cause all 16 elements of tensor T to be computed, even though only half of those elements will ever be used. + + +.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/gatherbound_problem.png + :align: center + :scale: 70% + +.. _InferBoundCA: + +InferBound with compute_at +-------------------------- + +If the schedule contains compute_at, Phases 1-2 of InferRootBound become more complex. + +Motivation +~~~~~~~~~~ + +**Ex. 1** + +Consider the following snippet of a TVM program: + +:: + + C = tvm.compute((5, 16), lambda i, j : tvm.const(5, "int32"), name='C') + D = tvm.compute((5, 16), lambda i, j : C[i, j]*2, name='D') + +This produces the following (simplified) IR: + +:: + + for i 0, 5 + for j 0, 16 + C[i, j] = 5 + for i 0, 5 + for j 0, 16 + D[i, j] = C[i, j]*2 + +It's easy to see that stage D requires all (5,16) elements of C to be computed. + +**Ex. 2** + +However, suppose C is computed at axis j of D: + +:: + + s = tvm.create_schedule(D.op) + s[C].compute_at(s[D], D.op.axis[1]) + +Then only a single element of C is needed at a time: + +:: + + for i 0, 5 + for j 0, 16 + C[0] = 5 + D[i, j] = C[0]*2 + +**Ex. 3** + +Similarly, if C is computed at axis i of D, only a vector of 16 elements of C is needed at a time: + +:: + + for i 0, 5 + for j 0, 16 + C[j] = 5 + for j 0, 16 + D[i, j] = C[j]*2 + +Based on the above examples, it is clear that InferBound should give different answers for stage C depending on where in its consumer D it is "attached". + +.. _AttachPaths: + +Attach Paths +~~~~~~~~~~~~ + +If stage C is computed at axis j of stage D, we say that C is *attached* to axis j of stage D. This is reflected in the Stage object by setting the following three member variables: + +.. code:: cpp + + class StageNode : public Node { + public: + // omitted + + // For compute_at, attach_type = kScope + AttachType attach_type; + + // For compute_at, this is the axis + // passed to compute_at, e.g., D.op.axis[1] + IterVar attach_ivar; + + // The stage passed to compute_at, e.g., D + Stage attach_stage; + + // omitted + }; + +Consider the above examples again. In order for InferBound to determine how many elements of C must be computed, it is important to know whether the computation of C occurs within the scope of a leaf variable of D, or above that scope. For example, in Ex. 1, the computation of C occurs *above* the scopes of all of D's leaf variables. In Ex. 2, the computation of C occurs *within* the scope of all of D's leaf variables. In Ex. 3, C occurs within the scope of D's i, but above the scope of D's j. + +CreateAttachPath is responsible for figuring out which scopes contain a stage C. These scopes are ordered from innermost scope to outermost. 
Thus for each stage CreateAttachPath produces an "attach path", which lists the scopes containing the stage, from innermost to outermost scope. In Ex. 1, the attach path of C is empty. In Ex. 2, the attach path of C contains {j, i}. In Ex. 3, the attach path of C is {i}. + +The following example clarifies the concept of an attach path, for a more complicated case. + +**Ex. 4** + +:: + + C = tvm.compute((5, 16), lambda i, j : tvm.const(5, "int32"), name='C') + D = tvm.compute((4, 5, 16), lambda di, dj, dk : C[dj, dk]*2, name='D') + s = tvm.create_schedule(D.op) + s[C].compute_at(s[D], D.op.axis[2]) + +Here is the IR after ScheduleOps (note that loops with extent 1 have been preserved, using the ``debug_keep_trivial_loop`` argument of ScheduleOps): + +:: + + // attr [compute(D, 0x2c070b0)] realize_scope = "" + realize D([0, 4], [0, 5], [0, 16]) { + produce D { + for (di, 0, 4) { + for (dj, 0, 5) { + for (dk, 0, 16) { + // attr [compute(C, 0x2c29990)] realize_scope = "" + realize C([dj, 1], [dk, 1]) { + produce C { + for (i, 0, 1) { + for (j, 0, 1) { + C((i + dj), (j + dk)) =5 + } + } + } + D(di, dj, dk) =(C(dj, dk)*2) + } + } + } + } + } + } + +In this case, the attach path of C is {dk, dj, di}. Note that C does not use di, but di still appears in C's attach path. + +**Ex. 5** + +Compute_at is commonly applied after splitting, but this can be handled very naturally given the above definitions. In the example below, the attachment point of C is j_inner of D. The attach path of C is {j_inner, j_outer, i}. + +:: + + C = tvm.compute((5, 16), lambda i, j : tvm.const(5, "int32"), name='C') + D = tvm.compute((5, 16), lambda i, j : C[i, j]*2, name='D') + s = tvm.create_schedule(D.op) + d_o, d_i = s[D].split(D.op.axis[1], factor=8) + s[C].compute_at(s[D], d_i) + +The IR in this case looks like: + +:: + + for i 0, 5 + for j_outer 0, 2 + for j_inner 0, 8 + C[0] = 5 + D[i, j_outer*8 + j_inner] = C[0]*2 + +Building an Attach Path +~~~~~~~~~~~~~~~~~~~~~~~ + +We continue to refer to stages C and D, as introduced in the previous section. The CreateAttachPath algorithm builds the attach path of a stage C as follows. If C does not have attach_type ``kScope``, then C has no attachment, and C's attach path is empty. Otherwise, C is attached at attach_stage=D. We iterate through D's leaf variables in top-down order. All leaf variables starting from C.attach_ivar and lower are added to C's attach path. Then, if D is also attached somewhere, e.g., to stage E, the process is repeated for E's leaves. Thus CreateAttachPath continues to add variables to C's attach path until a stage with no attachment is encountered. + +In the example below, C is attached at D, and D is attached at E. 
+ +:: + + C = tvm.compute((5, 16), lambda ci, cj : tvm.const(5, "int32"), name='C') + D = tvm.compute((5, 16), lambda di, dj : C[di, dj]*2, name='D') + E = tvm.compute((5, 16), lambda ei, ej : D[ei, ej]*4, name='E') + s = tvm.create_schedule(E.op) + s[C].compute_at(s[D], D.op.axis[1]) + s[D].compute_at(s[E], E.op.axis[1]) + +With ``debug_keep_trivial_loop=True``, the attach path of C is {dj, di, ej, ei}, and the attach path of D is {ej, ei}: + +:: + + // attr [D] storage_scope = "global" + allocate D[int32 * 1] + // attr [C] storage_scope = "global" + allocate C[int32 * 1] + produce E { + for (ei, 0, 5) { + for (ej, 0, 16) { + produce D { + for (di, 0, 1) { + for (dj, 0, 1) { + produce C { + for (ci, 0, 1) { + for (cj, 0, 1) { + C[(ci + cj)] = 5 + } + } + } + D[(di + dj)] = (C[(di + dj)]*2) + } + } + } + E[((ei*16) + ej)] = (D[0]*4) + } + } + } + +InferBound with compute_at +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Now that the concept of an attach path has been introduced, we return to how InferBound differs if the schedule contains compute_at. The only difference is in InferRootBound, :ref:`Phase1` and :ref:`Phase2`. + +In InferRootBound, the goal is to determine Ranges for the root_iter_vars of a particular stage, C. Phases 1-2 of InferRootBound assign IntSets to the leaf IterVars of C's consumers, and then propagate those IntSets up to the consumers' root_iter_vars. + +If there are no attachments, the Ranges already computed for the consumer's variables define how much of C is needed by the consumer. However, if the stage is actually inside the scope of one of the consumer's variables j, then only a single point within the Range of j is needed at a time. + +.. _Phase1CA: + +Phase 1: Initialize IntSets for consumer's leaf_iter_vars +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: cpp + + /* + * Input: Map<IterVar, Range> rmap: contains the Range for each IterVar of the consumer stage + * Output: Map<IterVar, IntSet> up_state: contains an IntSet for each leaf_iter_var of the consumer + */ + +In Phase 1, IntSets for each of the consumer's leaf_iter_vars are created, based on the Ranges of the leaf_iter_vars from rmap. Recall that the consumer has already been visited by InferBound, so all of its IterVars have known Ranges in rmap. + +There are three cases: + +- Case 1: Extent of leaf var's Range is 1. In this case, the up_state for the leaf is just a single point, equal to the Range's min. +- Case 2: No relaxation is needed. In this case, the up_state for the leaf is just a single point, defined by the leaf var itself. +- Case 3: Relaxation is needed. In this case, the leaf's Range is simply converted to an IntSet. + +Case 2 occurs if we encounter the attachment point of stage C in the consumer. For this attach_ivar, and all higher leaf variables of the consumer, Case 2 will be applied. This ensures that only a single point within the Range of the leaf variable will be requested, if C is inside the leaf variable's scope. + +.. _Phase2CA: + +Phase 2: Propagate IntSets from consumer's leaves to consumer's roots +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: cpp + + /* + * Input: Map<IterVar, IntSet> up_state: consumer leaf -> IntSet + * Output: Map<Var, IntSet> dom_map: consumer root -> IntSet + */ + +Phase 2 begins by calling PassUpDomain, which visits the IterVarRelations of the consumer stage. 
In the case of a Split relation, PassUpDomain sets the up_state of the parent IterVar, based on the inner and outer IntSets, as follows: + +- Case 1: The Ranges of outer and inner IterVars match their ``up_state`` domains. In this case, set the parent's ``up_state`` by simply converting the parent's Range to an IntSet. +- Case 2: Otherwise, the parent's ``up_state`` is defined by evaluating ``outer*f + inner + rmap[parent]->min``, with respect to the ``up_state`` of outer and inner. Here, instead of using the Split relation's factor, TVM uses ``f = rmap[inner]->extent``. + + +Now, because the schedule contains compute_at, it is possible for Case 2 to apply. This is because the leaf IntSets may now be initialized to a single point within their Range (Case 2 of :ref:`Phase1CA`), so the IntSets will no longer always match the Ranges. + +After PassUpDomain has finished propagating up_state to all IterVars of the consumer, a fresh map, from root_iter_vars to IntSet, is created. If the stage is not attached to the current consumer, then for each variable iv in the consumer's attach_path, iv's Range is added to a ``relax_set``. The root variables of the stage are evaluated with respect to this ``relax_set``. + +This is to handle cases like the following example, where C is not attached anywhere, but its consumer D is attached in stage E. In this case, D's attach_path {ej, ei} must be considered when determining how much of C must be computed. + +:: + + C = tvm.compute((5, 16), lambda ci, cj : tvm.const(5, "int32"), name='C') + D = tvm.compute((5, 16), lambda di, dj : C[di, dj]*2, name='D') + E = tvm.compute((5, 16), lambda ei, ej : D[ei, ej]*4, name='E') + s = tvm.create_schedule(E.op) + s[D].compute_at(s[E], E.op.axis[1]) + + +:: + + for ci 0, 5 + for cj 0, 16 + C[ci, cj] = 5 + for ei 0, 5 + for ej 0, 16 + D[0] = C[ei, ej]*2 + E[ei, ej] = D[0]*4 + +Limitations of PassUpDomain +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This section describes known limitations of PassUpDomain. These limitations affect the Ranges produced by InferBound, as well as other users of PassUpDomain such as ``tensorize``. + +**Ex. 6** + +Above, we discussed the behavior of PassUpDomain on Split relations only. In the following example, the schedule contains ``fuse`` in addition to ``split``. In the TVM program below, the operation C has two axes that are fused, and then the fused axis is split. Note that all tensors are originally of shape ``(4, 4)`` and the fused axis is split by factor ``4`` as well. Therefore, it would be natural to assume that the effect of the fuse is simply undone by the split. However, this is not the case in TVM, as explained below. + +:: + + import tvm + + n = 4 + m = 4 + + A = tvm.placeholder((n, m), name='A') + B = tvm.compute((n, m), lambda bi, bj: A[bi, bj]+2, name='B') + C = tvm.compute((n, m), lambda ci, cj: B[ci, cj]*3, name='C') + + s = tvm.create_schedule(C.op) + + fused_axes = s[C].fuse(C.op.axis[0], C.op.axis[1]) + xo, xi = s[C].split(fused_axes, 4) + + s[B].compute_at(s[C], xo) + + print(tvm.lower(s, [A, C], simple_mode=True)) + +The output of this program is shown below. Notice that all 16 elements of B are computed every time through the outer loop, even though C only uses 4 of them. 
+ +:: + + // attr [B] storage_scope = "global" + allocate B[float32 * 16] + produce C { + for (ci.cj.fused.outer, 0, 4) { + produce B { + for (bi, 0, 4) { + for (bj, 0, 4) { + B[((bi*4) + bj)] = (A[((bi*4) + bj)] + 2.000000f) + } + } + } + for (ci.cj.fused.inner, 0, 4) { + C[((ci.cj.fused.outer*4) + ci.cj.fused.inner)] = (B[((ci.cj.fused.outer*4) + ci.cj.fused.inner)]*3.000000f) + } + } + } + +This is in contrast to the following IR, which is produced by modifying the above program by deleting the fuse and split, and replacing the compute_at with ``s[B].compute_at(s[C], C.op.axis[0])``. Note that in the IR below, only 4 elements of B are computed at a time, as desired. The size of buffer B is also smaller. + +:: + + // attr [B] storage_scope = "global" + allocate B[float32 * 4] + produce C { + for (ci, 0, 4) { + produce B { + for (bj, 0, 4) { + B[bj] = (A[((ci*4) + bj)] + 2.000000f) + } + } + for (cj, 0, 4) { + C[((ci*4) + cj)] = (B[cj]*3.000000f) + } + } + } + +This example demonstrates that contrary to what we expect, the split does not simply undo the fuse. So what causes the difference? Why is the entire tensor B re-computed 4 times, when only a single row is actually needed at a time? + +Determining the amount of B that must be computed is the responsibility of InferBound. However, the Ranges returned by InferBound for B's root_iter_vars are too large in this case: ``[0, 4]`` for both ``bi`` and ``bj``. This occurs because of a limitation in PassUpDomain on Fuse relations, which we explain next. + +When InferRootBound is working on stage B, it visits B's consumer stage C to find out how much of B is requested by C. C has root_iter_vars ci and cj, which have been fused and then split. This results in the following :ref:`IterVarHyperGraph` for stage C. + + +.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/passupdomain_problem.png + :align: center + :scale: 70% + + + +We trace the execution of InferRootBound on stage B. Recall that :ref:`Phase1CA` of InferRootBound involves setting the IntSets for all leaf_iter_vars of B's consumer stage C. In this case, C's leaf_iter_vars are ``ci.cj.fused.outer`` and ``ci.cj.fused.inner``. Since B is attached at ``ci.cj.fused.outer``, ``ci.cj.fused.inner`` must be relaxed but ``ci.cj.fused.outer`` is a single point. The IntSets of C's leaf_iter_vars, after :ref:`Phase1CA`, are shown in the following table. + ++----------------------+---------------------------------------------------+ +| IterVar | IntSet after Phase 1 | ++======================+===================================================+ +| ``ci.cj.fused.inner``|``[0, (min(4, (16 - (ci.cj.fused.outer*4))) - 1)]``| ++----------------------+---------------------------------------------------+ +| ``ci.cj.fused.outer``| ``[ci.cj.fused.outer, ci.cj.fused.outer]`` | ++----------------------+---------------------------------------------------+ + +In :ref:`Phase2CA` of InferRootBound, PassUpDomain is called on all of C's IterVarRelations in bottom-up order. + +PassUpDomain is called on C's Split node first. Case 2 of PassUpDomain applies, because the IntSet of ``ci.cj.fused.outer`` is just a single point, and doesn't equal its Range (as previously computed by InferBound on stage C). PassUpDomain therefore sets the IntSet of ``ci.cj.fused`` based on the IntSets of ``ci.cj.fused.inner`` and ``ci.cj.fused.outer``, as shown in row 3 of the following table. 
+ ++----------------------+--------------------------------------------------------------------------------------------------+ +| IterVar | IntSet after PassUpDomain on SplitNode | ++======================+==================================================================================================+ +| ``ci.cj.fused.inner``| ``[0, (min(4, (16 - (ci.cj.fused.outer*4))) - 1)]`` | ++----------------------+--------------------------------------------------------------------------------------------------+ +| ``ci.cj.fused.outer``| ``[ci.cj.fused.outer, ci.cj.fused.outer]`` | ++----------------------+--------------------------------------------------------------------------------------------------+ +| ``ci.cj.fused`` | ``[(ci.cj.fused.outer*4), ((ci.cj.fused.outer*4) + (min(4, (16 - (ci.cj.fused.outer*4))) - 1))]``| ++----------------------+--------------------------------------------------------------------------------------------------+ + +After PassUpDomain is called on the Split node, it is called on the Fuse node. + +- Case 1: the Range of IterVar ``fused`` (i.e., as previously calculated by InferBound) is equal to its IntSet +- Case 2: the IntSet of IterVar ``fused`` is a single point +- Case 3: otherwise + +In our case, the Range of ``ci.cj.fused``, is [0, 16). This is not equal to the IntSet of ``ci.cj.fused``, which has extent at most 4 (see row 3 of the table above). Therefore Case 1 does not apply. Case 2 doesn't apply either, since the IntSet of ``ci.cj.fused`` is not a single point. Therefore, only the default Case 3 applies. + +Unfortunately in Case 3, PassUpDomain conservatively applies a "fallback inference rule", i.e., it just returns IntSets equal to the Ranges of ``ci`` and ``cj``. Since C is the output stage of the schedule, we know that InferBound will have set the Ranges of the root_iter_vars of C (i.e., ``ci`` and ``cj``) to their original dimensions (i.e., the ``dom`` value of their IterVars). The resulting output of PassUpDomain for ``ci`` and ``cj`` is shown in the last two rows of the table below. 
+ ++----------------------+--------------------------------------------------------------------------------------------------+ +| IterVar | IntSet after PassUpDomain on FuseNode | ++======================+==================================================================================================+ +| ``ci.cj.fused.inner``| ``[0, (min(4, (16 - (ci.cj.fused.outer*4))) - 1)]`` | ++----------------------+--------------------------------------------------------------------------------------------------+ +| ``ci.cj.fused.outer``| ``[ci.cj.fused.outer, ci.cj.fused.outer]`` | ++----------------------+--------------------------------------------------------------------------------------------------+ +| ``ci.cj.fused`` |``[(ci.cj.fused.outer*4), ((ci.cj.fused.outer*4) + (min(4, (16 - (ci.cj.fused.outer*4))) - 1))]`` | ++----------------------+--------------------------------------------------------------------------------------------------+ +| ``ci`` | ``[0, 4]`` | ++----------------------+--------------------------------------------------------------------------------------------------+ +| ``cj`` | ``[0, 4]`` | ++----------------------+--------------------------------------------------------------------------------------------------+ + +This is enough to guarantee that consumer C requests *all* elements of B: the IntSets of ``ci`` and ``cj`` become requests from consumer C to the output tensors of stage B (via PropBoundToInputs in :ref:`Phase3` and GatherBound in :ref:`Phase4`). + +This example shows that schedules containing a split of fused axes are difficult to handle in TVM. The source of the difficulty is similar to the limitations of GatherBound. The region of tensor B requested by a consumer C must be a single rectangular region of B. Or, if B has more than two dimensions, the region of B must be expressible as an independent Range for each of its axes. + +If the split factor is 4, or 8, in the above example, the region of B needed in each iteration of the outer loop is rectangular. + +.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/passupdomain_div.png + :align: center + :scale: 70% + +However, if the split factor is changed from 4 to 3 in the example above, it is easy to see that the region of B that C needs can no longer be described by an independent Range for each of its axes. + + +.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/passupdomain_nodiv.png + :align: center + :scale: 70% + +The best that can be done with rectangular regions is shown in the following diagram. The orange regions are the minimum rectangular regions covering the region of B that needs to be computed, at each iteration of the outer loop. + +.. 
image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/passupdomain_min.png + :align: center + :scale: 70% From ef78f81a263f782c2d96fb910710462add6c5dab Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 3 May 2019 21:07:14 -0400 Subject: [PATCH 079/106] [ARITH] Constraint-aware ConstIntBound, Enhance CanonicalSimplify (#3132) --- src/arithmetic/canonical_simplify.cc | 17 +++- src/arithmetic/const_int_bound.cc | 77 ++++++++++++++++++- src/arithmetic/modular_set.cc | 4 +- src/arithmetic/rewrite_simplify.cc | 66 +++++++++++++--- .../unittest/test_arith_canonical_simplify.py | 33 +++++++- 5 files changed, 180 insertions(+), 17 deletions(-) diff --git a/src/arithmetic/canonical_simplify.cc b/src/arithmetic/canonical_simplify.cc index d9b528291211..0feb00fc904b 100644 --- a/src/arithmetic/canonical_simplify.cc +++ b/src/arithmetic/canonical_simplify.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -453,6 +453,9 @@ class CanonicalSimplifier::Impl : public RewriteSimplifier::Impl { if (const auto* op = expr.as()) { return GetRef(op); } + if (const auto* op = expr.as()) { + if (op->base == 0 && op->args.size() == 1) return op->args[0]; + } if (const auto* op = expr.as_derived()) { expr = op->Normalize(); } @@ -764,6 +767,16 @@ Mutate_(const Mod* op, const Expr& self) { } } } + // Simplify the offset constant if necessary. + // (x - 5) % 3 => (x - 2) % 3 if x - 5 >= 0 + auto cbound = parent_->const_int_bound(Normalize(a)); + int64_t new_base = psum->base % cval; + if (cbound->min_value >= 0 && + cbound->min_value - psum->base + new_base >= 0) { + SumExpr sum_expr(std::move(a.node_)); + sum_expr.CopyOnWrite()->base = new_base; + return SplitModConst(ToSplitExpr(std::move(sum_expr)), cval); + } } else { // if a >= 0 && a < cval, then result == 0 auto cbound = parent_->const_int_bound(Normalize(a)); diff --git a/src/arithmetic/const_int_bound.cc b/src/arithmetic/const_int_bound.cc index c591e58aa542..bfd06c8ba255 100644 --- a/src/arithmetic/const_int_bound.cc +++ b/src/arithmetic/const_int_bound.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -25,6 +25,7 @@ #include #include #include "int_op_overflow.h" +#include "pattern_match.h" namespace tvm { namespace arith { @@ -65,6 +66,19 @@ struct ConstIntBoundAnalyzer::Entry { class ConstIntBoundAnalyzer::Impl : public ExprFunctor { public: + /*! \brief additional bound info about expr \in bound */ + struct BoundInfo { + /*! \brief The expr */ + Expr expr; + /*! 
\brief The additional bound */ + Entry bound; + + BoundInfo() {} + BoundInfo(Expr expr, Entry bound) + : expr(expr), bound(bound) { + } + }; + void Bind(const Var& var, const Range& range) { Entry a = VisitExpr(range->min); Entry b = VisitExpr(range->extent); @@ -99,6 +113,18 @@ static_cast(op)->type); } + Entry VisitExpr(const Expr& expr) final { + Entry res = ExprFunctor::VisitExpr(expr); + // a linear search over additional info + // assume we won't have a lot of conditions + for (const BoundInfo& info : additional_info_) { + if (ir::Equal(expr, info.expr)) { + res = Intersect(res, info.bound); + } + } + return res; + } + Entry VisitExpr_(const Cast* op) final { Entry a = VisitExpr(op->value); Entry b = Everything(op->type); @@ -243,9 +269,24 @@ } } + std::function<void()> EnterConstraint(const Expr& constraint) { + std::vector<BoundInfo> info = DetectBoundInfo(constraint); + if (info.size() == 0) return nullptr; + size_t old_size = additional_info_.size(); + additional_info_.insert(additional_info_.end(), info.begin(), info.end()); + size_t new_size = old_size + info.size(); + auto frecover = [old_size, new_size, this]() { + CHECK_EQ(additional_info_.size(), new_size); + additional_info_.resize(old_size); + }; + return frecover; + } + private: // internal variable map std::unordered_map<const Variable*, Entry> var_map_; + // additional bound info + std::vector<BoundInfo> additional_info_; // constants: the limit value means unlimited // NOTE: kNegInf/kPosInf are used to represent infinity. static const constexpr int64_t kNegInf = ConstIntBoundNode::kNegInf; @@ -387,6 +428,36 @@ } return ret; } + + /*! + * \brief Detect additional constant bound from cond, if any + * \param cond The constraint condition. + * \return List of detected bounds. + */ + static std::vector<BoundInfo> DetectBoundInfo(const Expr& cond) { + PVar<Expr> x, y; + PVar<Integer> c; + // NOTE: canonical form always use <= or < + if ((c <= x).Match(cond)) { + return {BoundInfo(x.Eval(), MakeBound(c.Eval()->value, kPosInf))}; + } + if ((c < x).Match(cond)) { + return {BoundInfo(x.Eval(), MakeBound(c.Eval()->value + 1, kPosInf))}; + } + if ((x <= c).Match(cond)) { + return {BoundInfo(x.Eval(), MakeBound(kNegInf, c.Eval()->value))}; + } + if ((x < c).Match(cond)) { + return {BoundInfo(x.Eval(), MakeBound(kNegInf, c.Eval()->value - 1))}; + } + if ((x && y).Match(cond)) { + auto ret1 = DetectBoundInfo(x.Eval()); + auto ret2 = DetectBoundInfo(y.Eval()); + ret1.insert(ret1.end(), ret2.begin(), ret2.end()); + return ret1; + } + return {}; + } }; ConstIntBound ConstIntBoundAnalyzer::operator()(const Expr& expr) { @@ -405,7 +476,7 @@ void ConstIntBoundAnalyzer::Bind(const Var& var, const Range& range) { } std::function<void()> ConstIntBoundAnalyzer::EnterConstraint(const Expr& constraint) { - return nullptr; + return impl_->EnterConstraint(constraint); } ConstIntBoundAnalyzer::ConstIntBoundAnalyzer(Analyzer* parent) diff --git a/src/arithmetic/modular_set.cc b/src/arithmetic/modular_set.cc index 5958233d6d52..7701e04844fa 100644 --- a/src/arithmetic/modular_set.cc +++ b/src/arithmetic/modular_set.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/src/arithmetic/rewrite_simplify.cc b/src/arithmetic/rewrite_simplify.cc index 6098faa44846..58d2b83a223a 100644 --- a/src/arithmetic/rewrite_simplify.cc +++ b/src/arithmetic/rewrite_simplify.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -1197,14 +1197,32 @@ Mutate_(const Or* op, const Expr& self) { Expr RewriteSimplifier::Impl:: Mutate_(const Select* op, const Expr& self) { - Expr ret = IRMutator::Mutate_(op, self); - op = ret.as(); // Pattern var to match any expression PVar x, y; TVM_TRY_REWRITE(select(x, y, y), y); @@ -1213,7 +1231,37 @@ Mutate_(const Select* op, const Expr& self) { Expr RewriteSimplifier::Impl:: Mutate_(const Call* op, const Expr& self) { - Expr ret = IRMutator::Mutate_(op, self); + // add condition context to if_then_else + Expr ret; + if (op->is_intrinsic(ir::intrinsic::tvm_if_then_else)) { + Expr cond = Mutate(op->args[0]); + Expr true_value, false_value; + { + ConstraintContext constraint(parent_, cond); + true_value = Mutate(op->args[1]); + } + { + ConstraintContext constraint(parent_, Mutate(Not::make(cond))); + false_value = Mutate(op->args[2]); + } + if (is_zero(cond)) { + return false_value; + } + if (is_one(cond)) { + return true_value; + } + if (cond.same_as(op->args[0]) && + true_value.same_as(op->args[1]) && + false_value.same_as(op->args[2])) { + ret = self; + } else { + ret = Call::make(op->type, op->name, + {cond, true_value, false_value}, + op->call_type); + } + } else { + ret = IRMutator::Mutate_(op, self); + } op = ret.as(); if (op->is_intrinsic(Call::likely) && is_const(op->args[0])) { return op->args[0]; diff --git a/tests/python/unittest/test_arith_canonical_simplify.py b/tests/python/unittest/test_arith_canonical_simplify.py index 6af058523cd8..3e69f21fa2b2 100644 --- a/tests/python/unittest/test_arith_canonical_simplify.py +++ b/tests/python/unittest/test_arith_canonical_simplify.py @@ -22,7 +22,7 @@ def __init__(self): def verify(self, data, expected): res = self.analyzer.canonical_simplify(data) - assert tvm.ir_pass.Equal(res, expected), "data={}, res={}, expected={}".format(data, res, expected) + assert tvm.ir_pass.Equal(res, expected), "\ndata={}\nres={}\nexpected={}".format(data, res, expected) def test_mul_sum_simplify(): @@ -157,7 +157,38 @@ def test_reduce_simplify(): ck.verify(tvm.sum(k / 10, k), tvm.sum(tvm.const(0, "int32"), k)) +def test_simplify_if_then_else(): + ck = CanonicalChecker() + x = tvm.var("x") + y = tvm.var("y") + # simplification that takes condition into account. 
+ res = tvm.if_then_else((x * 4 + y) >= 466036, + tvm.if_then_else(24512 <= ((((x*4) + y) - 466036) % 24528), + (((((x*4) + y) - 466036) % 24528) -24512) % 16, + x), y) + expected = tvm.if_then_else( + tvm.expr.LE(466036, (x * 4 + y)), + tvm.if_then_else(tvm.expr.LE(24512, ((((x*4) + y) - 4) % 24528)), + (((x*4) + y) - 4) % 16, + x), y) + ck.verify(res, expected) + # can only simplify if condition + res = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 100) % 3, (x + 100) % 3) + expected = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 1) % 3, (x + 100) % 3) + ck.verify(res, ck.analyzer.canonical_simplify(expected)) + + res = tvm.expr.Select(x >= 10, + tvm.if_then_else(x / 3 > 2, x, 0), 0) + expected = tvm.expr.Select(x >= 10, x, 0) + ck.verify(res, ck.analyzer.canonical_simplify(expected)) + + res = tvm.expr.Select(x >= 10, + tvm.if_then_else(x / 3 < 2, x, 0), 0) + ck.verify(res, 0) + + if __name__ == "__main__": + test_simplify_if_then_else() test_div_simplify() test_reduce_simplify() test_reduce_combiner_simplify() From 0a570eddeb806a88117d2e78dd64c2683c5f124e Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 5 May 2019 16:05:18 +0800 Subject: [PATCH 080/106] [TOPI] Fix mali conv2d performance regression (#3131) * [TOPI] fix mali conv * fix typo * address comments --- python/tvm/autotvm/tophub.py | 2 +- topi/python/topi/mali/conv2d.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py index 1fa0409def58..850f501cb1fc 100644 --- a/python/tvm/autotvm/tophub.py +++ b/python/tvm/autotvm/tophub.py @@ -42,7 +42,7 @@ 'cuda': "v0.04", 'rocm': "v0.02", 'opencl': "v0.02", - 'mali': "v0.04", + 'mali': "v0.05", 'vta': "v0.04", } diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py index 1d89d7e7cef6..9a333353f41d 100644 --- a/topi/python/topi/mali/conv2d.py +++ b/topi/python/topi/mali/conv2d.py @@ -352,9 +352,11 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt # unpack output output = tvm.compute((N, CO, H, W), lambda n, co, h, w: Y[co][n * nH * nW + (h//m) * nW + w//m][h % m][w % m] - # thw following term is used to make the padding effective, - # otherwise the padding will be eliminated by bound inference - + tvm.const(0, out_dtype) * M[alpha-1][alpha-1][CO-1][P_round-1], + # The following hack term is used to make the padding in batch gemm ("M") + # effective, otherwise the padding will be eliminated by bound inference. + # Use `tvm.expr.Mul` instead of `*` to avoid issues in const folding. 
+ + tvm.expr.Mul(tvm.const(0, out_dtype), + M[alpha-1][alpha-1][CO-1][P_round-1]), name='output', tag='winograd_conv2d_output') # we have to manually assign effective GFLOP for winograd From f1b9b727f5f25cd2e7621f5869c4569fc949649d Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Sun, 5 May 2019 01:08:10 -0700 Subject: [PATCH 081/106] [Relay][Frontend] add log op in tf frontend (#3111) * [Relay][Frontend] add log op in tf frontend * address comment --- python/tvm/relay/frontend/tensorflow.py | 6 ++++-- .../frontend/tensorflow/test_forward.py | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 9c312990c379..bbbb0a2feaec 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -1094,6 +1094,7 @@ def _impl(inputs, attr, params): 'ArgMin' : _argx(_op.argmin, 'argmin'), 'AvgPool' : _pooling('avg_pool'), 'BatchNormWithGlobalNormalization' : _batch_norm(), + 'BatchToSpaceND' : _batch_to_space_nd(), 'BiasAdd' : _bias_add(), 'Cast' : _cast(), 'Ceil' : AttrCvt('ceil'), @@ -1119,6 +1120,7 @@ def _impl(inputs, attr, params): 'LeakyRelu' : AttrCvt('leaky_relu'), 'Less' : _broadcast('less'), 'LessEqual' : _broadcast('less_equal'), + 'Log' : AttrCvt('log'), 'LogicalAnd' : _logical('logical_and'), 'LogicalOr' : _logical('logical_or'), 'LogicalNot' : _logical('logical_not'), @@ -1151,6 +1153,7 @@ def _impl(inputs, attr, params): 'Sign' : AttrCvt('sign'), 'Slice' : _slice(), 'Softmax' : _softmax(), + 'SpaceToBatchND' : _space_to_batch_nd(), 'Split' : _split(False), 'SplitV' : _split(True), 'Square' : _square(), @@ -1162,8 +1165,7 @@ def _impl(inputs, attr, params): 'Tile' : _tile(), 'Transpose' : _transpose(), 'Unpack' : _unpack(), - 'SpaceToBatchND' : _space_to_batch_nd(), - 'BatchToSpaceND' : _batch_to_space_nd(), + } def _LSTMBlockCell(): diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 9b6dc573901e..8dd538aa859c 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -1440,6 +1440,23 @@ def test_forward_pow_exp(): compare_tf_with_tvm([np_in1, np_in2], ['in1:0', 'in2:0'], 'pow:0') compare_tf_with_tvm([np_in1], ['in1:0'], 'exp:0') +def test_forward_log(): + """test Log """ + np_data = np.random.uniform(1, 100, size=(2, 3, 5)).astype(np.float32) + tf.reset_default_graph() + in_data = tf.placeholder(tf.float32, (2, 3, 5), name="in_data") + tf.log(in_data, name="log") + compare_tf_with_tvm([np_data], ['in_data:0'], 'log:0') + +def test_forward_rsqrt(): + """test Rsqrt """ + np_data = np.random.uniform(1, 100, size=(5, 7, 11)).astype(np.float32) + tf.reset_default_graph() + in_data = tf.placeholder(tf.float32, (5, 7, 11), name="in_data") + tf.rsqrt(in_data, name="rsqrt") + print(tf.get_default_graph().as_graph_def()) + compare_tf_with_tvm([np_data], ['in_data:0'], 'rsqrt:0') + ####################################################################### # Mean # ---- @@ -1525,6 +1542,8 @@ def test_forward_expand_dims(): test_forward_reverse_v2() test_forward_pow_exp() test_forward_sign() + test_forward_log() + test_forward_rsqrt() test_forward_expand_dims() # Reductions From 87374d11aa159514ae916d9dda8c25725262bd2e Mon Sep 17 00:00:00 2001 From: masahi Date: Sun, 5 May 2019 21:17:29 +0900 Subject: [PATCH 082/106] [ROCm] Fix dense autotvm template registration (#3136) * Fix rocm dense autotvm template * suppres lint 
warning --- topi/python/topi/cuda/__init__.py | 1 + topi/python/topi/rocm/dense.py | 13 +++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index aca410b93276..65ed0ff10dad 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -11,6 +11,7 @@ from .reduction import schedule_reduce from .softmax import schedule_softmax from .injective import schedule_injective, schedule_elemwise, schedule_broadcast +from .dense import schedule_dense from .pooling import schedule_pool, schedule_global_pool from .extern import schedule_extern from .nn import schedule_lrn, schedule_l2_normalize diff --git a/topi/python/topi/rocm/dense.py b/topi/python/topi/rocm/dense.py index a8c033f0bd73..6fca7cd79656 100644 --- a/topi/python/topi/rocm/dense.py +++ b/topi/python/topi/rocm/dense.py @@ -14,18 +14,19 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name, unused-variable +# pylint: disable=invalid-name, unused-variable, unused-argument """Schedule for dense operator""" from __future__ import absolute_import as _abs import tvm +from tvm import autotvm from tvm.contrib import rocblas import topi from ..nn.dense import dense, dense_default from .. import tag from .. import generic -@dense.register("rocm") -def dense_rocm(data, weight, bias=None, out_dtype=None): +@autotvm.register_topi_compute(dense, "rocm", "direct") +def dense_rocm(cfg, data, weight, bias=None, out_dtype=None): """Dense operator for rocm backend. Parameters @@ -67,8 +68,8 @@ def dense_rocm(data, weight, bias=None, out_dtype=None): return dense_default(data, weight, bias, out_dtype) -@generic.schedule_dense.register(["rocm"]) -def schedule_dense(outs): +@autotvm.register_topi_schedule(generic.schedule_dense, "rocm", "direct") +def schedule_dense(cfg, outs): """Schedule for dense operator. 
Parameters @@ -85,4 +86,4 @@ def schedule_dense(outs): target = tvm.target.current_target() if target.target_name == "rocm" and "rocblas" in target.libs: return generic.schedule_extern(outs) - return topi.cuda.schedule_dense(outs) + return topi.cuda.schedule_dense(cfg, outs) From f6c52fa9df97c4aead6774f8104ab6827e2a90b5 Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Tue, 7 May 2019 20:52:24 -0700 Subject: [PATCH 083/106] Handle vectorize for LE statement (#3137) * Handle vectorize for LE statement Fix a new cases introduced by commit 7afbca5691fdb599cd90b043d5a5036e55cae2d6 * Add test --- src/pass/vectorize_loop.cc | 3 +++ tests/python/unittest/test_pass_vectorize.py | 24 ++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/src/pass/vectorize_loop.cc b/src/pass/vectorize_loop.cc index bd0a91ce4a99..f87e80c2d030 100644 --- a/src/pass/vectorize_loop.cc +++ b/src/pass/vectorize_loop.cc @@ -166,6 +166,9 @@ class Vectorizer : public IRMutator { Expr Mutate_(const LT* op, const Expr &e) final { return BinaryVec(op, e); } + Expr Mutate_(const LE* op, const Expr &e) final { + return BinaryVec(op, e); + } Expr Mutate_(const GT* op, const Expr &e) final { return BinaryVec(op, e); } diff --git a/tests/python/unittest/test_pass_vectorize.py b/tests/python/unittest/test_pass_vectorize.py index 03516872e835..fca22a1eca30 100644 --- a/tests/python/unittest/test_pass_vectorize.py +++ b/tests/python/unittest/test_pass_vectorize.py @@ -69,6 +69,28 @@ def test_vectorize_with_if(): assert stmt.then_case.value.dtype == "float32x4" assert isinstance(stmt.else_case, tvm.stmt.For) +def test_vectorize_with_le_cond(): + n = tvm.var('n') + ib = tvm.ir_builder.create() + A = ib.pointer("float32", name="A") + with ib.for_range(0, 4, for_type="vectorize") as i: + with ib.if_scope(i <= n): + A[i] = A[i] + 1 + stmt = ib.get() + stmt = tvm.ir_pass.VectorizeLoop(stmt) + assert isinstance(stmt, tvm.stmt.For) + +def test_vectorize_with_ge_cond(): + n = tvm.var('n') + ib = tvm.ir_builder.create() + A = ib.pointer("float32", name="A") + with ib.for_range(0, 4, for_type="vectorize") as i: + with ib.if_scope(i >= n): + A[i] = A[i] + 1 + stmt = ib.get() + stmt = tvm.ir_pass.VectorizeLoop(stmt) + assert isinstance(stmt, tvm.stmt.For) + def test_vectorize_if_then_else(): n = tvm.var('n') x = tvm.var('x') @@ -102,3 +124,5 @@ def test_vectorize_if_then_else(): test_vectorize_with_if() test_vectorize_loop() test_vectorize_if_then_else() + test_vectorize_with_le_cond() + test_vectorize_with_ge_cond() From aab1f6f8492eedc38d40209c8c8bfd073dab4073 Mon Sep 17 00:00:00 2001 From: Marcus Shawcroft Date: Wed, 8 May 2019 05:50:44 +0100 Subject: [PATCH 084/106] [DOC] fix :code: markup syntax (#3140) --- tutorials/topi/intro_topi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/topi/intro_topi.py b/tutorials/topi/intro_topi.py index 0b0d1c8d8ec3..6c7bb6a95251 100644 --- a/tutorials/topi/intro_topi.py +++ b/tutorials/topi/intro_topi.py @@ -50,8 +50,8 @@ ###################################################################### # However, for such a common operation we had to define the reduce axis ourselves as well as explicit computation with -# :code: `tvm.compute`. Imagine for more complicated operations how much details we need to provide. -# Fortunately, we can replace those two lines with simple :code:`topi.sum` much like :code`numpy.sum` +# :code:`tvm.compute`. Imagine for more complicated operations how much details we need to provide. 
+# Fortunately, we can replace those two lines with simple :code:`topi.sum` much like :code:`numpy.sum` # C = topi.sum(A, axis=1) ts = tvm.create_schedule(C.op) From 84a4def17916a8689c4da74994085c00c31cc3ab Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Tue, 7 May 2019 21:51:00 -0700 Subject: [PATCH 085/106] [Bugfix][TOPI] conv2d_transpose bugfix (#3138) * deconv tests * deconv bug fixed for certain cases tests added --- topi/python/topi/cuda/conv2d_transpose_nchw.py | 9 ++++----- topi/tests/python/test_topi_conv2d_transpose_nchw.py | 1 + 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_transpose_nchw.py b/topi/python/topi/cuda/conv2d_transpose_nchw.py index 4bbb135376a3..534a1695cd55 100644 --- a/topi/python/topi/cuda/conv2d_transpose_nchw.py +++ b/topi/python/topi/cuda/conv2d_transpose_nchw.py @@ -174,7 +174,6 @@ def _callback(op): by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) - bf = s[output].fuse(n, bf) s[output].bind(bf, tvm.thread_axis("blockIdx.z")) s[output].bind(by, tvm.thread_axis("blockIdx.y")) s[output].bind(bx, tvm.thread_axis("blockIdx.x")) @@ -184,7 +183,7 @@ def _callback(op): s[output].bind(tf, tvm.thread_axis("threadIdx.z")) s[output].bind(ty, tvm.thread_axis("threadIdx.y")) s[output].bind(tx, tvm.thread_axis("threadIdx.x")) - s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) + s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) s[OL].compute_at(s[output], tx) # tile reduction axes @@ -193,13 +192,13 @@ def _callback(op): rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc) s[OL].reorder(rco, rcm, ry, rx, rci, n, f, y, x) - s[AA].compute_at(s[OL], rcm) - s[WW].compute_at(s[OL], rcm) + s[AA].compute_at(s[OL], rx) + s[WW].compute_at(s[OL], rx) # cooperative fetching for load in [AA, WW]: n, f, y, x = s[load].op.axis - fused = s[load].fuse(n, f, y, x) + fused = s[load].fuse(f, y, x) tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) diff --git a/topi/tests/python/test_topi_conv2d_transpose_nchw.py b/topi/tests/python/test_topi_conv2d_transpose_nchw.py index 1a2d779ac28b..0960760a89de 100644 --- a/topi/tests/python/test_topi_conv2d_transpose_nchw.py +++ b/topi/tests/python/test_topi_conv2d_transpose_nchw.py @@ -74,6 +74,7 @@ def check_device(device): def test_conv2d_transpose_nchw(): verify_conv2d_transpose_nchw(1, 3, 224, 32, 3, 1, 0) verify_conv2d_transpose_nchw(1, 3, 224, 32, 3, 2, 1) + verify_conv2d_transpose_nchw(1, 3, 224, 32, 2, 2, 0) verify_conv2d_transpose_nchw(1, 32, 32, 128, 5, 1, 0) verify_conv2d_transpose_nchw(1, 32, 32, 128, 5, 2, 1) From f4bf1631c01208f4714d8a090cd2a2e117546943 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Wed, 8 May 2019 00:16:15 -0700 Subject: [PATCH 086/106] Relay C++ Build Module (#3082) * [Relay] C++ Build module * asdf --- include/tvm/build_module.h | 13 + src/codegen/build_module.cc | 19 +- src/relay/backend/build_module.cc | 713 ++++++++++++++++++++ src/relay/backend/compile_engine.cc | 4 +- src/relay/backend/graph_runtime_codegen.cc | 7 +- tests/cpp/relay_build_module_test.cc | 104 +++ tests/python/relay/test_cpp_build_module.py | 106 +++ 7 files changed, 960 insertions(+), 6 deletions(-) create mode 100644 src/relay/backend/build_module.cc create mode 100644 tests/cpp/relay_build_module_test.cc create mode 100644 tests/python/relay/test_cpp_build_module.py diff --git 
a/include/tvm/build_module.h b/include/tvm/build_module.h index 3c136444229b..334fe169ad41 100644 --- a/include/tvm/build_module.h +++ b/include/tvm/build_module.h @@ -344,6 +344,19 @@ TVM_DLL Array lower(Schedule sch, const std::string& name, const std::unordered_map& binds, const BuildConfig& config); +/*! +* \brief Split host/device function and running necessary pass before build +* \param funcs The functions to be built. +* \param target The target device to build for. +* \param target_host The target for building host code. To use the default, pass Target() +* \param config The build configuration. +* \return The Array> with 2 elements. First is host function Array, + second is device function array +*/ +TVM_DLL Array > split_dev_host_funcs(const Array& funcs, + const Target& target, + const Target& target_host, + const BuildConfig& config); /*! * \brief Build a device and host module for a specific target from an array of lowered functions. diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index 92a12a0da1b7..01ebcacf6180 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -423,10 +423,10 @@ Array lower(Schedule sch, return Array({ ir::MakeAPI(stmt, name, out_arg_list, 0, config->restricted_func) }); } -runtime::Module build(const Array& funcs, - const Target& target, - const Target& target_host, - const BuildConfig& config) { +Array > split_dev_host_funcs(const Array& funcs, + const Target& target, + const Target& target_host, + const BuildConfig& config) { std::unordered_set all_names; for (const auto &x : funcs) { CHECK(all_names.count(x->name) == 0) << "Duplicate function name " << x->name; @@ -493,6 +493,17 @@ runtime::Module build(const Array& funcs, func = ir::CombineContextCall(func); fhost.Set(i, func); } + return {fhost, fdevice}; +} + +runtime::Module build(const Array& funcs, + const Target& target, + const Target& target_host, + const BuildConfig& config) { + auto target_host_val = target_host.defined() ? target_host : DefaultTargetHost(target); + auto host_dev_funcs = split_dev_host_funcs(funcs, target, target_host, config); + auto& fhost = host_dev_funcs[0]; + auto& fdevice = host_dev_funcs[1]; auto mhost = codegen::Build(fhost, target_host_val->str()); diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc new file mode 100644 index 000000000000..b60a048e638a --- /dev/null +++ b/src/relay/backend/build_module.cc @@ -0,0 +1,713 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file relay/backend/build_module.cc + * \brief Code generation for TVM's graph runtime. 
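 * Specifically, it defines RelayBuildModule, which drives Relay
 * optimization, graph codegen and per-target compilation behind a
 * PackedFunc interface.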
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" + +namespace tvm { +namespace relay { +namespace backend { + +/*! + * \brief Context name / index + * See: python/tvm/_ffi/runtime_ctypes.py + */ +struct ContextMap { + static const std::unordered_map mask2str; + static const std::unordered_map str2mask; + static std::string Mask2Str(int mask) { + CHECK_GT(mask2str.count(mask), 0) << "Unknown mask."; + return mask2str.at(mask); + } + static int Str2Mask(const std::string& str) { + CHECK_GT(str2mask.count(str), 0) << "Unknown context."; + return str2mask.at(str); + } +}; + +const std::unordered_map ContextMap::mask2str = { + {1, "cpu"}, + {2, "gpu"}, + {4, "opencl"}, + {5, "aocl"}, + {6, "sdaccel"}, + {7, "vulkan"}, + {8, "metal"}, + {9, "vpi"}, + {10, "rocm"}, + {11, "opengl"}, + {12, "ext_dev"} +}; + +const std::unordered_map ContextMap::str2mask = { + {"llvm", 1}, + {"cpu", 1}, + {"c", 1}, + {"gpu", 2}, + {"cuda", 2}, + {"nvptx", 2}, + {"cl", 4}, + {"opencl", 4}, + {"aocl", 5}, + {"aocl_sw_emu", 5}, + {"vulkan", 7}, + {"metal", 8}, + {"vpi", 9}, + {"rocm", 10}, + {"opengl", 11}, + {"ext_dev", 12} +}; + +/*! + * \brief A data structure to map the names of specific optimizations to + * numeric optimization levels + * + */ +struct OptPassLevel { + static const std::unordered_map _data; + /*! + * \brief Get level for an optimization pass + * + * \param key pass name + * \return int level + */ + int operator[](const std::string& key) const { + auto it = _data.find(key); + if (it == _data.end()) { + return -1; + } + return it->second; + } +}; + +const std::unordered_map OptPassLevel::_data = { + {"SimplifyInference", 0}, + {"OpFusion", 1}, + {"FoldConstant", 2}, + {"CombineParallelConv2D", 3}, + {"FoldScaleAxis", 3}, + {"AlterOpLayout", 3}, + {"CanonicalizeOps", 3}, + {"EliminateCommonSubexpr", 3} +}; + +/*! + * \brief Output of building module + * + */ +struct BuildOutput { + std::string graph_json; + runtime::Module mod; + std::unordered_map params; +}; + +/*! + * \brief Relay building config + * + */ +struct RelayBuildConfig { + int opt_level{2}; + std::string fallback_device{"llvm"}; + std::unordered_set enabled_pass; + std::unordered_set disabled_pass; + OptPassLevel OPT_PASS_LEVEL; + inline bool pass_enabled(const std::string& pass_name) const { + if (disabled_pass.count(pass_name)) { + return false; + } + if (enabled_pass.count(pass_name)) { + return true; + } + return opt_level >= OPT_PASS_LEVEL[pass_name]; + } +}; + +/*! + * \brief GraphCodegen module wrapper + * + */ +struct GraphCodegen { + public: + GraphCodegen() { + auto pf = GetPackedFunc("relay.build_module._GraphRuntimeCodegen"); + mod = (*pf)(); + } + ~GraphCodegen() {} + + void Init(runtime::Module* m, + Map targets) { + Array tgts; + for (auto kv : targets) { + tgts.push_back(kv.first); + tgts.push_back(kv.second); + } + CallFunc("init", m, tgts); + } + + void Codegen(const Function& func) { + CallFunc("codegen", func); + } + + std::string GetJSON() { + return CallFunc("get_graph_json", nullptr); + } + + Map > GetLoweredFunc() { + return CallFunc > >("get_lowered_funcs", nullptr); + } + + std::unordered_map GetParams() { + std::unordered_map ret; + auto names = CallFunc >("list_params_name", nullptr); + for (auto expr : names) { + auto key = expr.as()->value; + ret[key] = CallFunc("get_param_by_name", key); + } + return ret; + } + + protected: + tvm::runtime::Module mod; + template + R CallFunc(const std::string &name, Args... 
args) { + auto pf = mod.GetFunction(name, false); + return pf(std::forward(args)...); + } + template + void CallFunc(const std::string &name, Args... args) { + auto pf = mod.GetFunction(name, false); + pf(std::forward(args)...); + return; + } +}; + +template +R CallPackedFunc(const std::string &name, Args... args) { + auto pf = GetPackedFunc(name); + return (*pf)(std::forward(args)...); +} + +template +Function CallPackedFunc(const std::string &name, Args... args) { + auto pf = GetPackedFunc(name); + return (*pf)(std::forward(args)...); +} + +/*! + * \brief Relay build module + * + */ +class RelayBuildModule : public runtime::ModuleNode { + public: + /*! + * \brief Get member function to front-end + * \param name The name of the function. + * \param sptr_to_self The pointer to the module node. + * \return The corresponding member function. + */ + PackedFunc GetFunction(const std::string& name, + const std::shared_ptr& sptr_to_self) final { + if (name == "get_graph_json") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->GetGraphJSON(); + }); + } else if (name == "get_module") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->GetModule(); + }); + } else if (name == "build") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + CHECK_EQ(args.num_args, 3); + Array tmp = args[1]; + std::unordered_map targets; + for (size_t i = 0; i < tmp.size(); i += 2) { + auto k = tmp[i].as()->value; + auto v = tmp[i + 1].as()->value; + targets[k] = v; + } + this->Build(args[0], targets, args[2]); + }); + } else if (name == "list_params") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->ListParamNames(); + }); + } else if (name == "get_params") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->GetParams(); + }); + } else if (name == "set_opt_level") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + CHECK_EQ(args.num_args, 1); + int level = args[0]; + this->SetOptLevel(level); + }); + } else if (name == "set_fallback_device") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + std::string dev = args[0]; + this->SetFallBackDev(dev); + }); + } else if (name == "add_pass") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + std::string pass_name = args[0]; + this->AddPass(pass_name); + }); + } else if (name == "disable_pass") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + std::string pass_name = args[0]; + this->DisablePass(pass_name); + }); + } else if (name == "set_params") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + Map params = args[0]; + for (const auto& kv : params) { + this->SetParam(kv.first, kv.second->data); + } + }); + } else { + LOG(FATAL) << "Unknown packed function: " << name; + return PackedFunc([sptr_to_self, name](TVMArgs args, TVMRetValue* rv) {}); + } + } + + /*! + * \brief Get the GraphJSON for runtime + * + * \return const std::string graph_json + */ + const std::string& GetGraphJSON() { + return ret_.graph_json; + } + /*! + * \brief Add extra pass into build cfg + * + * \param pass_name name of pass + */ + void AddPass(const std::string& pass_name) { + cfg_.enabled_pass.insert(pass_name); + } + /*! 
+ * \brief Disable a specific pass in cfg + * + * \param pass_name name of pass + */ + void DisablePass(const std::string& pass_name) { + cfg_.disabled_pass.insert(pass_name); + } + /*! + * \brief Set the Fallback device + * + * \param device name + */ + void SetFallBackDev(const std::string& dev) { + cfg_.fallback_device = dev; + } + /*! + * \brief Get the Module object + * + * \return runtime::Module + */ + runtime::Module GetModule() { + return ret_.mod; + } + + /*! + * \brief List all paramter names + * + * \return Array names of params + */ + Array ListParamNames() { + Array ret; + for (const auto& kv : params_) { + ret.push_back(ir::StringImm::make(kv.first)); + } + return ret; + } + + /*! + * \brief Get params dictionary + * + * \return Map params dictionary + */ + Map GetParams() { + Map ret; + for (const auto& kv : ret_.params) { + ret.Set(kv.first, ConstantNode::make(kv.second)); + } + return ret; + } + + /*! + * \brief Set the parameters + * + * \param name name of parameter + * \param data_in input DLTensor + */ + void SetParam(const std::string& name, runtime::NDArray data_in) { + params_[name] = data_in; + } + + /*! + * \brief Set the optimization level + * + * \param level + */ + void SetOptLevel(char level) { + cfg_.opt_level = level; + } + + /*! + * \brief type key + * + * \return const char* + */ + const char* type_key() const final { + return "RelayBuildModule"; + } + + /*! + * \brief Build relay function for graph runtime + * + * \param func Relay Function + * \param target Target device + * \param target_host Host target device + */ + void Build(Function func, + const std::unordered_map& targets, + const std::string& target_host) { + targets_ = targets; + target_host_ = target_host; + BuildRelay(func, cfg_, params_); + } + + protected: + /*! + * \brief Bind params to function by using name + * \param func Relay function + * \param params params dict + * \return relay::Function + */ + relay::Function BindParamsByName(relay::Function func, + const std::unordered_map& params) { + std::unordered_map name_dict; + std::unordered_set repeat_var; + for (auto arg : func->params) { + const auto &name = arg->name_hint(); + if (name_dict.count(name)) { + repeat_var.insert(arg); + } else { + name_dict[name] = arg; + } + } + + std::unordered_map bind_dict; + for (auto &kv : params) { + if (name_dict.count(kv.first) == 0) { + continue; + } + auto arg = name_dict.at(kv.first); + if (repeat_var.count(arg)) { + LOG(FATAL) << "Multiple args in the function have name " << kv.first; + } + auto e = CallPackedFunc("relay._make.Constant", kv.second); + bind_dict[arg] = e; + } + return CallPackedFunc("relay._expr.Bind", func, tvm::Map(bind_dict)); + } + + /*! 
+ * \brief Optimize Relay function + * + * \param func Input function + * \param target target device + * \param cfg Relay build config + * \param params params dict + * \return relay::Function + */ + relay::Function Optimize(relay::Function func, + const std::unordered_map& targets, + const RelayBuildConfig& cfg, + const std::unordered_map& params) { + if (params.size()) { + func = BindParamsByName(func, params); + } + if (cfg.pass_enabled("SimplifyInference")) { + func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); + func = CallPackedFunc("relay._ir_pass.simplify_inference", func); + } + if (cfg.pass_enabled("EliminateCommonSubexpr")) { + auto fskip = PackedFunc([](TVMArgs args, TVMRetValue* rv) { + Expr expr = args[0]; + if (expr.as()) { + auto call_node = expr.as(); + auto op_node = call_node->op.as(); + if (op_node->name == "cast") { + auto attrs = call_node->attrs.as(); + if (attrs->dtype == HalideIR::Int(32)) { + *rv = true; + } + } + } + *rv = false; + }); + func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); + func = CallPackedFunc("relay._ir_pass.eliminate_common_subexpr", func, fskip); + } + if (cfg.pass_enabled("CombineParallelConv2D")) { + const int min_num_branches = 3; + func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); + func = CallPackedFunc("relay._ir_pass.CombineParallelConv2D", func, min_num_branches); + } + if (cfg.pass_enabled("FoldConstant")) { + func = CallPackedFunc("relay._ir_pass.FoldConstant", func); + } + if (cfg.pass_enabled("FoldScaleAxis")) { + func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); + func = CallPackedFunc("relay._ir_pass.backward_fold_scale_axis", func); + func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); + func = CallPackedFunc("relay._ir_pass.forward_fold_scale_axis", func); + func = CallPackedFunc("relay._ir_pass.FoldConstant", func); + } + if (cfg.pass_enabled("CanonicalizeOps")) { + func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); + func = CallPackedFunc("relay._ir_pass.canonicalize_ops", func); + } + if (cfg.pass_enabled("AlterOpLayout")) { + if (targets.size() == 1) { + func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); + func = CallPackedFunc("relay._ir_pass.AlterOpLayout", func); + } else { + LOG(WARNING) << "AlterOpLayout pass is not enabled for heterogeneous" + << " execution yet."; + } + } + if (cfg.pass_enabled("FoldConstant")) { + func = CallPackedFunc("relay._ir_pass.FoldConstant", func); + } + return func; + } + /*! + * \brief Update the target and fallback device required for heterogeneous + * compilation. CPU is used as the fallback device if it wasn't provided. + * Meanwhile, a CPU device type and "llvm" pair will be added to the target + * dictionary in this case. + * + * \param targets dictionary + * \param cfg + * \return Map + */ + Map UpdateHeterogeneousInputs( + const std::unordered_map& targets, + const RelayBuildConfig& cfg) { + Map device_target; + std::unordered_map tmp_map; + auto fallback_idx = ContextMap::Str2Mask(cfg.fallback_device); + + for (const auto& kv : targets) { + tmp_map[ContextMap::Str2Mask(kv.first)] = kv.second; + } + if (tmp_map.count(fallback_idx) == 0) { + tmp_map[fallback_idx] = cfg.fallback_device; + } + for (const auto& kv : tmp_map) { + device_target.Set( + ir::IntImm::make(HalideIR::Int(64), kv.first), + ir::StringImm::make(kv.second)); + } + return device_target; + } + /*! + * \brief Execute the device annotation passes to update the input program and + * target information. 
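 * If neither device-copy nodes nor annotation ops are found, the body below
 * falls back to mapping the whole function onto cfg.fallback_device.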
+ * + * \param func + * \param cfg + * \param targets_map_ptr + * \return Function + */ + Function RunDeviceAnnotationPass( + Function func, + const RelayBuildConfig& cfg, + Map* targets_map_ptr) { + auto fallback_idx = ContextMap::Str2Mask(cfg.fallback_device); + func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); + func = CallPackedFunc("relay._ir_pass.RewriteDeviceAnnotation", func, fallback_idx); + auto device_map = CallPackedFunc >("relay._ir_pass.CollectDeviceInfo", + func, + nullptr); + if (device_map.size() == 0) { + auto annotation_map = + CallPackedFunc >("relay._ir_pass.CollectDeviceAnnotationOps", + func, + nullptr); + if (annotation_map.size() == 0) { + targets_map_ptr->Set( + ir::IntImm::make(HalideIR::Int(64), 0), + ir::StringImm::make(cfg.fallback_device)); + } else { + int64_t dev_type = -1; + for (auto kv : annotation_map) { + dev_type = kv.second->value; + break; + } + for (auto kv : annotation_map) { + CHECK_EQ(kv.second->value, dev_type) + << "Expressions in the function are " + << "annotated with various device types," + << "but not device copy operators " + << "found. Please check the " + << "RewriteAnnotation pass."; + } + targets_map_ptr->Set( + ir::IntImm::make(HalideIR::Int(64), 0), + ir::StringImm::make(ContextMap::Mask2Str(dev_type))); + } + } + return func; + } + /*! + * \brief Build module given lowered functions for each target + * + * \param lowered_funcs target_str -> Array map + * \param targets Targets map + * \param cfg Building configuration + */ + void BuildModule(const Map >& lowered_funcs, + const Map& targets, + const BuildConfig& cfg) { + auto target_host = Target::create(cfg_.fallback_device); + for (const auto& kv : lowered_funcs) { + std::unordered_set fname_set; + for (auto f : kv.second) { + if (fname_set.count(f->name)) { + LOG(FATAL) << "Duplicate function name " + << f->name; + } + fname_set.insert(f->name); + } + } + std::unordered_map target_map; + for (const auto& kv : lowered_funcs) { + target_map[kv.first] = Target::create(kv.first); + } + Array fhost_all; + std::vector device_module; + for (const auto& kv : lowered_funcs) { + auto target = target_map[kv.first]; + auto host_dev_funcs = split_dev_host_funcs(kv.second, target, target_host, cfg); + for (auto f : host_dev_funcs[0]) { + fhost_all.push_back(f); + } + if (host_dev_funcs[1].size()) { + auto mdev = codegen::Build(host_dev_funcs[1], target->str()); + device_module.push_back(mdev); + } + } + + auto mhost = codegen::Build(fhost_all, target_host->str()); + + for (auto mdev : device_module) { + mhost.Import(mdev); + } + ret_.mod = mhost; + } + + /*! 
+ * \brief Build relay function to runtime module + * + * \param func Relay Function + * \param cfg Relay build config + * \param params parameters + */ + void BuildRelay(Function func, + const RelayBuildConfig& cfg, + const std::unordered_map ¶ms) { + // convert + tvm_cfg_ = build_config(); + Map device_target; + if (targets_.size() > 1) { + device_target = UpdateHeterogeneousInputs(targets_, cfg); + } else { + for (auto &kv : targets_) { + device_target.Set( + ir::IntImm::make(HalideIR::Int(64), ContextMap::Str2Mask(kv.first)), + ir::StringImm::make(kv.second)); + } + } + func = Optimize(func, targets_, cfg, params); + if (device_target.size() > 1) { + func = RunDeviceAnnotationPass(func, cfg, &device_target); + } + func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); + func = CallPackedFunc("relay._ir_pass.FuseOps", func, cfg.opt_level); + func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); + + graph_codegen_ = std::unique_ptr(new GraphCodegen()); + graph_codegen_->Init(nullptr, device_target); + graph_codegen_->Codegen(func); + + ret_.graph_json = graph_codegen_->GetJSON(); + ret_.params = graph_codegen_->GetParams(); + + BuildModule(graph_codegen_->GetLoweredFunc(), + device_target, + tvm_cfg_); + } + + protected: + std::unique_ptr graph_codegen_; + /*! \brief target device */ + std::unordered_map targets_; + /*! \brief target host device */ + std::string target_host_; + /*! \brief frontend optimization configure */ + RelayBuildConfig cfg_; + /*! \brief parameters */ + std::unordered_map params_; + /*! \brief building output */ + BuildOutput ret_; + /*! \brief tvm building cfg */ + BuildConfig tvm_cfg_; +}; + +runtime::Module RelayBuildCreate() { + std::shared_ptr exec = std::make_shared(); + return runtime::Module(exec); +} + +TVM_REGISTER_GLOBAL("relay.build_module._BuildModule").set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = RelayBuildCreate(); +}); + +} // namespace backend +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 4b5842c36020..a824c457107a 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -371,7 +371,9 @@ class CompileEngineImpl : public CompileEngineNode { cache_node->funcs = (*f)( spair.first, all_args, cache_node->func_name, key->source_func); } else { - LOG(FATAL) << "relay.backend.lower is not registred"; + tvm::BuildConfig bcfg = tvm::build_config(); + std::unordered_map binds; + cache_node->funcs = tvm::lower(spair.first, all_args, cache_node->func_name, binds, bcfg); } value->cached_func = CachedFunc(cache_node); return value; diff --git a/src/relay/backend/graph_runtime_codegen.cc b/src/relay/backend/graph_runtime_codegen.cc index 7f16891da8a7..415e0ec9c2a5 100644 --- a/src/relay/backend/graph_runtime_codegen.cc +++ b/src/relay/backend/graph_runtime_codegen.cc @@ -416,7 +416,12 @@ class GraphRuntimeCodegen } else { // heterogeneous execution. 
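          // Note: device type 0 is the placeholder for unannotated
          // expressions and has no runtime::DeviceName() entry, so the
          // change below resolves it to the host backend ("llvm") instead.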
const auto call_dev_key = std::to_string(call_dev_type); - const auto call_dev_name = runtime::DeviceName(call_dev_type); + std::string call_dev_name; + if (call_dev_type == 0) { + call_dev_name = "llvm"; + } else { + call_dev_name = runtime::DeviceName(call_dev_type); + } if (targets_.count(call_dev_name) == 0 && targets_.count(call_dev_key) == 0) { LOG(FATAL) << "No target is provided for device " << call_dev_name; diff --git a/tests/cpp/relay_build_module_test.cc b/tests/cpp/relay_build_module_test.cc new file mode 100644 index 000000000000..38481bfb8204 --- /dev/null +++ b/tests/cpp/relay_build_module_test.cc @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +TVM_REGISTER_GLOBAL("test.sch") +.set_body([](tvm::TVMArgs args, tvm::TVMRetValue *rv) { + *rv = topi::generic::schedule_injective(args[0], args[1]); + }); + +TEST(Relay, BuildModule) { + using namespace tvm; + auto tensor_type = relay::TensorTypeNode::make({2, 3}, ::tvm::Float(32)); + auto a = relay::VarNode::make("a", tensor_type); + auto b = relay::VarNode::make("b", tensor_type); + auto add_op = relay::Op::Get("add"); + auto x = relay::CallNode::make(add_op, {a, b}, tvm::Attrs(), {}); + auto c = relay::VarNode::make("c", tensor_type); + auto y = relay::CallNode::make(add_op, {x, c}, tvm::Attrs(), {}); + auto func = relay::FunctionNode::make(relay::FreeVars(y), y, relay::Type(), {}); + auto A = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto B = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto C = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0}); + + auto pA = (float*)A.ToDLPack()->dl_tensor.data; + auto pB = (float*)B.ToDLPack()->dl_tensor.data; + auto pC = (float*)C.ToDLPack()->dl_tensor.data; + + for (int i = 0; i < 6; ++i) { + pA[i] = i; + pB[i] = i + 1; + pC[i] = i + 2; + } + // get schedule + auto reg = tvm::runtime::Registry::Get("relay.op._Register"); + auto s_i = tvm::runtime::Registry::Get("test.sch"); + if (!reg) { + LOG(FATAL) << "no _Register"; + } + if (!s_i) { + LOG(FATAL) << "no _Register"; + } + (*reg)("add", "FTVMSchedule", *s_i, 10); + // build + auto pfb = tvm::runtime::Registry::Get("relay.build_module._BuildModule"); + tvm::runtime::Module build_mod = (*pfb)(); + auto build_f = build_mod.GetFunction("build", false); + auto json_f = build_mod.GetFunction("get_graph_json", false); + auto mod_f = build_mod.GetFunction("get_module", false); + Array target_pair; + target_pair.push_back(ir::StringImm::make("cpu")); + target_pair.push_back(ir::StringImm::make("llvm")); + build_f(func, target_pair, "llvm"); + std::string json = json_f(); + tvm::runtime::Module 
mod = mod_f(); + // run + auto ctx = A->ctx; + auto pfr = tvm::runtime::Registry::Get("tvm.graph_runtime.create"); + tvm::runtime::Module run_mod = (*pfr)(json, mod, (int)ctx.device_type, (int)ctx.device_id); + auto set_input_f = run_mod.GetFunction("set_input", false); + auto run_f = run_mod.GetFunction("run", false); + auto get_output_f = run_mod.GetFunction("get_output", false); + set_input_f("a", A); + set_input_f("b", B); + set_input_f("c", C); + run_f(); + tvm::runtime::NDArray Y = get_output_f(0); + auto pY = (float*)Y.ToDLPack()->dl_tensor.data; + for (int i = 0; i < 6; ++i) { + CHECK_LT(fabs(pY[i] - (i + (i + 1) + (i + 2))), 1e-4); + } +} + +int main(int argc, char ** argv) { + testing::InitGoogleTest(&argc, argv); + testing::FLAGS_gtest_death_test_style = "threadsafe"; + return RUN_ALL_TESTS(); +} diff --git a/tests/python/relay/test_cpp_build_module.py b/tests/python/relay/test_cpp_build_module.py new file mode 100644 index 000000000000..c69d877d3b09 --- /dev/null +++ b/tests/python/relay/test_cpp_build_module.py @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
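# This test drives the new C++ RelayBuildModule end to end from Python: the
# BuildModule wrapper below fetches each capability ("build",
# "get_graph_json", "set_params", ...) as a PackedFunc from the module
# returned by relay.build_module._BuildModule(), then checks the compiled
# graph's output against a NumPy reference.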
+import numpy as np + +import tvm +from tvm import relay + +from tvm._ffi.function import _init_api +_init_api("tvm.relay.build_module") + +class BuildModule(object): + def __init__(self): + self.mod = relay.build_module._BuildModule() + self._get_graph_json = self.mod["get_graph_json"] + self._get_module = self.mod["get_module"] + self._build = self.mod["build"] + self._set_opt_level = self.mod["set_opt_level"] + self._set_params_func = self.mod["set_params"] + self._get_params_func = self.mod["get_params"] + + + def build(self, func, target, target_host, params): + tgts = [] + for kv in target.items(): + tgts.append(kv[0]) + tgts.append(kv[1]) + self._set_params(params) + self._build(func, tgts, target_host) + + def get_json(self): + return self._get_graph_json() + + def get_module(self): + return self._get_module() + + def set_opt_level(self, level): + self._set_opt_level(level) + + def _set_params(self, params): + inputs = {} + for name, param in params.items(): + inputs[name] = relay.Constant(param) + self._set_params_func(inputs) + + def get_params(self): + params = self._get_params_func() + ret = {} + for key, value in params.items(): + ret[key] = value.data + return ret + + +def test_build(): + m_bld = BuildModule() + tgt_name = "llvm" + tgt = "llvm" + ctx = tvm.cpu() + # func + a = relay.var("a", dtype="float32", shape=(16, 8)) + b = relay.var("b", dtype="float32", shape=(8, 8)) + c = relay.var("c", dtype="float32", shape=(16, 8)) + x = relay.nn.dense(a, b) + y = relay.nn.relu(x) + z = y + c + func = relay.Function([a, b, c], z) + A = tvm.nd.array(np.random.uniform(-1, 1, (16, 8)).astype("float32"), ctx=ctx) + B = tvm.nd.array(np.random.uniform(-1, 1, (8, 8)).astype("float32"), ctx=ctx) + C = tvm.nd.array(np.random.uniform(-1, 1, (16, 8)).astype("float32"), ctx=ctx) + params = { + "b" : B, + "c" : C + } + # build + targets = { + tgt: tgt + } + m_bld.set_opt_level(3) + m_bld.build(func, targets, "llvm -mcpu=sse3", params=params) + g_json = m_bld.get_json() + mmod = m_bld.get_module() + params = m_bld.get_params() + + # test + rt = tvm.contrib.graph_runtime.create(g_json, mmod, ctx) + rt.set_input("a", A) + rt.load_params(relay.save_param_dict(params)) + rt.run() + out = rt.get_output(0) + + np.testing.assert_allclose(out.asnumpy(), + np.maximum(np.dot(A.asnumpy(), B.asnumpy().T), 0) + C.asnumpy(), atol=1e-5, rtol=1e-5) + From 78b61f71ba470d50a9ff5077e3d3bfa8db76ffe6 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 8 May 2019 08:14:08 -0700 Subject: [PATCH 087/106] [CI] Always run cpptest during build to ensure library correctness (#3147) --- Jenkinsfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 57a1fd791c1c..af5a2ce3eb42 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -92,10 +92,13 @@ def make(docker_type, path, make_flag) { timeout(time: max_time, unit: 'MINUTES') { try { sh "${docker_run} ${docker_type} ./tests/scripts/task_build.sh ${path} ${make_flag}" + // always run cpp test when build + sh "${docker_run} ${docker_type} ./tests/scripts/task_cpp_unittest.sh" } catch (exc) { echo 'Incremental compilation failed. 
Fall back to build from scratch' sh "${docker_run} ${docker_type} ./tests/scripts/task_clean.sh ${path}" sh "${docker_run} ${docker_type} ./tests/scripts/task_build.sh ${path} ${make_flag}" + sh "${docker_run} ${docker_type} ./tests/scripts/task_cpp_unittest.sh" } } } @@ -181,7 +184,6 @@ stage('Build') { make(ci_cpu, 'build', '-j4') pack_lib('cpu', tvm_lib) timeout(time: max_time, unit: 'MINUTES') { - sh "${docker_run} ${ci_cpu} ./tests/scripts/task_cpp_unittest.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_rust.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh" From 560a21e6fe9bc11c73a673363371db8e2a34d55d Mon Sep 17 00:00:00 2001 From: Zhi <5145158+zhiics@users.noreply.github.com> Date: Wed, 8 May 2019 08:23:10 -0700 Subject: [PATCH 088/106] fix python lint warnings (#3145) --- python/tvm/relay/frontend/coreml.py | 2 +- python/tvm/relay/frontend/keras.py | 2 +- python/tvm/relay/frontend/onnx.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/frontend/coreml.py b/python/tvm/relay/frontend/coreml.py index f4cfb09bb330..653df92b71fc 100644 --- a/python/tvm/relay/frontend/coreml.py +++ b/python/tvm/relay/frontend/coreml.py @@ -17,8 +17,8 @@ # pylint: disable=invalid-name, import-self, unused-argument, unused-variable, inconsistent-return-statements """CoreML frontend.""" from __future__ import absolute_import as _abs -import tvm import numpy as np +import tvm from .. import ir_pass from .. import expr as _expr from .. import op as _op diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py index cf712f793177..2648a5a6637b 100644 --- a/python/tvm/relay/frontend/keras.py +++ b/python/tvm/relay/frontend/keras.py @@ -18,8 +18,8 @@ """Keras frontend.""" from __future__ import absolute_import as _abs import sys -import tvm import numpy as np +import tvm from .. import ir_pass from .. import expr as _expr from .. import op as _op diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index d91ee4b8c5d7..b4d36306c85d 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -19,8 +19,8 @@ from __future__ import absolute_import as _abs import logging -import tvm import numpy as np +import tvm from ... import nd as _nd from .. import ir_pass from .. 
import expr as _expr From 82dfab1b8bcafbdae01ce84088191f0bc6b8bf86 Mon Sep 17 00:00:00 2001 From: Luis Vega Date: Wed, 8 May 2019 09:59:00 -0700 Subject: [PATCH 089/106] [RFC] [VTA] [TSIM] Enabling Cycle-Accurate Hardware Simulation for VTA #3009 (#3010) * merge files * move verilator to the right place * change name to tsim * add default rule to be build and run * add README for tsim * Update README.md * add some structural feedback * change name of VTASim to VTADPISim * more renaming * update comment * add license * fix indentation * add switch for vta-tsim * add more licenses * update readme * address some of the new feedback * add some feedback from cpplint * add one more whitespace * pass pointer so linter is happy * pass pointer so linter is happy * README moved to vta documentation * create types for dpi functions, so they can be handle easily * fix pointer style * add feedback from docs * parametrize width data and pointers * fix comments * fix comment * add comment to class * add missing parameters * move README back to tsim example * add feedback * add more comments and remove un-necessary argument in finish * update comments * fix cpplint * fix doc --- cmake/config.cmake | 3 + cmake/modules/VTA.cmake | 7 + vta/apps/tsim_example/CMakeLists.txt | 39 ++ vta/apps/tsim_example/Makefile | 45 +++ vta/apps/tsim_example/README.md | 71 ++++ .../tsim_example/cmake/modules/driver.cmake | 24 ++ .../tsim_example/cmake/modules/tsim.cmake | 152 +++++++ .../tsim_example/hardware/chisel/Makefile | 19 + .../tsim_example/hardware/chisel/build.sbt | 69 ++++ .../hardware/chisel/project/build.properties | 20 + .../hardware/chisel/project/plugins.sbt | 20 + .../chisel/src/main/scala/accel/Accel.scala | 52 +++ .../chisel/src/main/scala/accel/Compute.scala | 114 ++++++ .../chisel/src/main/scala/accel/RegFile.scala | 105 +++++ .../chisel/src/test/scala/dut/TestAccel.scala | 70 ++++ .../tsim_example/hardware/verilog/Accel.v | 124 ++++++ .../tsim_example/hardware/verilog/Compute.v | 159 ++++++++ .../tsim_example/hardware/verilog/RegFile.v | 149 +++++++ .../tsim_example/hardware/verilog/TestAccel.v | 117 ++++++ vta/apps/tsim_example/python/tsim/__init__.py | 0 vta/apps/tsim_example/python/tsim/config.json | 7 + vta/apps/tsim_example/python/tsim/config.py | 61 +++ vta/apps/tsim_example/python/tsim/load.py | 56 +++ vta/apps/tsim_example/src/driver.cc | 92 +++++ .../tsim_example/tests/python/test_tsim.py | 38 ++ vta/hardware/chisel/Makefile | 19 + vta/hardware/chisel/build.sbt | 68 ++++ vta/hardware/chisel/project/build.properties | 20 + vta/hardware/chisel/project/plugins.sbt | 20 + .../src/main/resources/verilog/VTAHostDPI.v | 120 ++++++ .../src/main/resources/verilog/VTAMemDPI.v | 106 +++++ .../src/main/scala/dpi/VTAHostDPI.scala | 72 ++++ .../chisel/src/main/scala/dpi/VTAMemDPI.scala | 73 ++++ vta/hardware/dpi/tsim_device.cc | 132 ++++++ vta/include/vta/dpi/module.h | 65 +++ vta/include/vta/dpi/tsim.h | 113 ++++++ vta/src/dpi/module.cc | 376 ++++++++++++++++++ 37 files changed, 2797 insertions(+) create mode 100644 vta/apps/tsim_example/CMakeLists.txt create mode 100644 vta/apps/tsim_example/Makefile create mode 100644 vta/apps/tsim_example/README.md create mode 100644 vta/apps/tsim_example/cmake/modules/driver.cmake create mode 100644 vta/apps/tsim_example/cmake/modules/tsim.cmake create mode 100644 vta/apps/tsim_example/hardware/chisel/Makefile create mode 100644 vta/apps/tsim_example/hardware/chisel/build.sbt create mode 100644 vta/apps/tsim_example/hardware/chisel/project/build.properties create mode 
100644 vta/apps/tsim_example/hardware/chisel/project/plugins.sbt create mode 100644 vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Accel.scala create mode 100644 vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Compute.scala create mode 100644 vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/RegFile.scala create mode 100644 vta/apps/tsim_example/hardware/chisel/src/test/scala/dut/TestAccel.scala create mode 100644 vta/apps/tsim_example/hardware/verilog/Accel.v create mode 100644 vta/apps/tsim_example/hardware/verilog/Compute.v create mode 100644 vta/apps/tsim_example/hardware/verilog/RegFile.v create mode 100644 vta/apps/tsim_example/hardware/verilog/TestAccel.v create mode 100644 vta/apps/tsim_example/python/tsim/__init__.py create mode 100644 vta/apps/tsim_example/python/tsim/config.json create mode 100644 vta/apps/tsim_example/python/tsim/config.py create mode 100644 vta/apps/tsim_example/python/tsim/load.py create mode 100644 vta/apps/tsim_example/src/driver.cc create mode 100644 vta/apps/tsim_example/tests/python/test_tsim.py create mode 100644 vta/hardware/chisel/Makefile create mode 100644 vta/hardware/chisel/build.sbt create mode 100644 vta/hardware/chisel/project/build.properties create mode 100644 vta/hardware/chisel/project/plugins.sbt create mode 100644 vta/hardware/chisel/src/main/resources/verilog/VTAHostDPI.v create mode 100644 vta/hardware/chisel/src/main/resources/verilog/VTAMemDPI.v create mode 100644 vta/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala create mode 100644 vta/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala create mode 100644 vta/hardware/dpi/tsim_device.cc create mode 100644 vta/include/vta/dpi/module.h create mode 100644 vta/include/vta/dpi/tsim.h create mode 100644 vta/src/dpi/module.cc diff --git a/cmake/config.cmake b/cmake/config.cmake index f71353619e04..448fb25bd519 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -134,3 +134,6 @@ set(USE_SORT ON) set(USE_TENSORRT OFF) # Build ANTLR parser for Relay text format set(USE_ANTLR OFF) + +# Build TSIM for VTA +set(USE_VTA_TSIM OFF) diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index 3c6390688f9a..1adb0aaf387a 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -60,6 +60,13 @@ elseif(PYTHON) find_library(__cma_lib NAMES cma PATH /usr/lib) target_link_libraries(vta ${__cma_lib}) endif() + + if(NOT USE_VTA_TSIM STREQUAL "OFF") + include_directories("vta/include") + file(GLOB RUNTIME_DPI_SRCS vta/src/dpi/module.cc) + list(APPEND RUNTIME_SRCS ${RUNTIME_DPI_SRCS}) + endif() + else() message(STATUS "Cannot found python in env, VTA build is skipped..") endif() diff --git a/vta/apps/tsim_example/CMakeLists.txt b/vta/apps/tsim_example/CMakeLists.txt new file mode 100644 index 000000000000..4163c88ce3b8 --- /dev/null +++ b/vta/apps/tsim_example/CMakeLists.txt @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +cmake_minimum_required(VERSION 3.2) +project(tsim C CXX) + +set(TVM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../) +set(VTA_DIR ${TVM_DIR}/vta) + +include_directories("${TVM_DIR}/include") +include_directories("${TVM_DIR}/3rdparty/dlpack/include") +include_directories("${TVM_DIR}/3rdparty/dmlc-core/include") +include_directories("${TVM_DIR}/vta/src/dpi") + +set(CMAKE_C_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden") +set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden -std=c++11") + +if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(CMAKE_CXX_FLAGS "-faligned-new ${CMAKE_CXX_FLAGS}") +endif() + +# Module rules +include(cmake/modules/tsim.cmake) +include(cmake/modules/driver.cmake) diff --git a/vta/apps/tsim_example/Makefile b/vta/apps/tsim_example/Makefile new file mode 100644 index 000000000000..e4911ceda419 --- /dev/null +++ b/vta/apps/tsim_example/Makefile @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +export PYTHONPATH:=$(PWD)/python:$(PYTHONPATH) + +BUILD_DIR = $(shell python python/tsim/config.py --get-build-name) + +TVM_DIR = $(abspath ../../../) + +TSIM_TARGET = verilog +TSIM_TOP_NAME = TestAccel +TSIM_BUILD_NAME = build + +# optional +TSIM_TRACE_NAME = trace.vcd + +default: cmake run + +.PHONY: cmake + +cmake: | $(BUILD_DIR) + cd $(BUILD_DIR) && cmake .. && make + +$(BUILD_DIR): + mkdir -p $@ + +run: + python3 tests/python/test_tsim.py | grep PASS + +clean: + -rm -rf $(BUILD_DIR) diff --git a/vta/apps/tsim_example/README.md b/vta/apps/tsim_example/README.md new file mode 100644 index 000000000000..4cde4242dc28 --- /dev/null +++ b/vta/apps/tsim_example/README.md @@ -0,0 +1,71 @@ + + + + + + + + + + + + + + + + + +VTA TSIM Installation +====================== + +*TSIM* is a cycle-accurate hardware simulation environment that can be invoked and managed directly from TVM. It aims to enable cycle accurate simulation of deep learning accelerators including VTA. +This simulation environment can be used in both OSX and Linux. +There are two dependencies required to make *TSIM* works: [Verilator](https://www.veripool.org/wiki/verilator) and [sbt](https://www.scala-sbt.org/) for accelerators designed in [Chisel3](https://github.com/freechipsproject/chisel3). + +## OSX Dependencies + +Install `sbt` and `verilator` using [Homebrew](https://brew.sh/). 
+ +```bash +brew install verilator sbt +``` + +## Linux Dependencies + +Add `sbt` to package manager (Ubuntu). + +```bash +echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list +sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823 +sudo apt-get update +``` + +Install `sbt` and `verilator`. + +```bash +sudo apt install verilator sbt +``` + +## Setup in TVM + +1. Install `verilator` and `sbt` as described above +2. Enable VTA TSIM by turning on the switch `USE_VTA_TSIM` in config.cmake +3. Build tvm + +## How to run VTA TSIM examples + +There are two sample VTA accelerators (add-by-one) designed in Chisel3 and Verilog to show how *TSIM* works. +These examples are located at `/vta/apps/tsim_example`. + +* Instructions + * Open `/vta/apps/tsim_example/python/tsim/config.json` + * Change `TARGET` from `verilog` to `chisel`, depending on what language backend you would like to test + * Go to `tvm/vta/apps/tsim` + * Run `make` + +* Some pointers + * Build cmake script for driver `/vta/apps/tsim_example/cmake/modules/driver.cmake` + * Build cmake script for tsim `/vta/apps/tsim_example/cmake/modules/tsim.cmake` + * Software driver that handles the VTA accelerator `/vta/apps/tsim_example/src/driver.cc` + * VTA add-by-one accelerator (Verilog) `/vta/apps/tsim_example/hardware/verilog` + * VTA add-by-one accelerator (Chisel) `/vta/apps/tsim_example/hardware/chisel` diff --git a/vta/apps/tsim_example/cmake/modules/driver.cmake b/vta/apps/tsim_example/cmake/modules/driver.cmake new file mode 100644 index 000000000000..c4c80637918f --- /dev/null +++ b/vta/apps/tsim_example/cmake/modules/driver.cmake @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +file(GLOB TSIM_SW_SRC src/driver.cc) +add_library(driver SHARED ${TSIM_SW_SRC}) +target_include_directories(driver PRIVATE ${VTA_DIR}/include) + +if(APPLE) + set_target_properties(driver PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") +endif(APPLE) diff --git a/vta/apps/tsim_example/cmake/modules/tsim.cmake b/vta/apps/tsim_example/cmake/modules/tsim.cmake new file mode 100644 index 000000000000..4c81f288e45a --- /dev/null +++ b/vta/apps/tsim_example/cmake/modules/tsim.cmake @@ -0,0 +1,152 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
diff --git a/vta/apps/tsim_example/cmake/modules/driver.cmake b/vta/apps/tsim_example/cmake/modules/driver.cmake
new file mode 100644
index 000000000000..c4c80637918f
--- /dev/null
+++ b/vta/apps/tsim_example/cmake/modules/driver.cmake
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+file(GLOB TSIM_SW_SRC src/driver.cc)
+add_library(driver SHARED ${TSIM_SW_SRC})
+target_include_directories(driver PRIVATE ${VTA_DIR}/include)
+
+if(APPLE)
+  set_target_properties(driver PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
+endif(APPLE)
diff --git a/vta/apps/tsim_example/cmake/modules/tsim.cmake b/vta/apps/tsim_example/cmake/modules/tsim.cmake
new file mode 100644
index 000000000000..4c81f288e45a
--- /dev/null
+++ b/vta/apps/tsim_example/cmake/modules/tsim.cmake
@@ -0,0 +1,152 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+if(MSVC)
+  message(STATUS "TSIM build is skipped on Windows...")
+else()
+  find_program(PYTHON NAMES python python3 python3.6)
+  find_program(VERILATOR NAMES verilator)
+
+  if (VERILATOR AND PYTHON)
+
+    if (TSIM_TOP_NAME STREQUAL "")
+      message(FATAL_ERROR "TSIM_TOP_NAME should be defined")
+    endif()
+
+    if (TSIM_BUILD_NAME STREQUAL "")
+      message(FATAL_ERROR "TSIM_BUILD_NAME should be defined")
+    endif()
+
+    set(TSIM_CONFIG ${PYTHON} ${CMAKE_CURRENT_SOURCE_DIR}/python/tsim/config.py)
+
+    execute_process(COMMAND ${TSIM_CONFIG} --get-target OUTPUT_VARIABLE __TSIM_TARGET)
+    execute_process(COMMAND ${TSIM_CONFIG} --get-top-name OUTPUT_VARIABLE __TSIM_TOP_NAME)
+    execute_process(COMMAND ${TSIM_CONFIG} --get-build-name OUTPUT_VARIABLE __TSIM_BUILD_NAME)
+    execute_process(COMMAND ${TSIM_CONFIG} --get-use-trace OUTPUT_VARIABLE __TSIM_USE_TRACE)
+    execute_process(COMMAND ${TSIM_CONFIG} --get-trace-name OUTPUT_VARIABLE __TSIM_TRACE_NAME)
+
+    string(STRIP ${__TSIM_TARGET} TSIM_TARGET)
+    string(STRIP ${__TSIM_TOP_NAME} TSIM_TOP_NAME)
+    string(STRIP ${__TSIM_BUILD_NAME} TSIM_BUILD_NAME)
+    string(STRIP ${__TSIM_USE_TRACE} TSIM_USE_TRACE)
+    string(STRIP ${__TSIM_TRACE_NAME} TSIM_TRACE_NAME)
+
+    set(TSIM_BUILD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/${TSIM_BUILD_NAME})
+
+    if (TSIM_TARGET STREQUAL "chisel")
+
+      find_program(SBT NAMES sbt)
+
+      if (SBT)
+
+        # Install Chisel VTA package for DPI modules
+        set(VTA_CHISEL_DIR ${VTA_DIR}/hardware/chisel)
+
+        execute_process(WORKING_DIRECTORY ${VTA_CHISEL_DIR}
+          COMMAND ${SBT} publishLocal RESULT_VARIABLE RETCODE)
+
+        if (NOT RETCODE STREQUAL "0")
+          message(FATAL_ERROR "[TSIM] sbt failed to install VTA scala package")
+        endif()
+
+        # Chisel - Scala to Verilog compilation
+        set(TSIM_CHISEL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/hardware/chisel)
+        set(CHISEL_TARGET_DIR ${TSIM_BUILD_DIR}/chisel)
+        set(CHISEL_OPT "test:runMain test.Elaborate --target-dir ${CHISEL_TARGET_DIR} --top-name ${TSIM_TOP_NAME}")
+
+        execute_process(WORKING_DIRECTORY ${TSIM_CHISEL_DIR} COMMAND ${SBT} ${CHISEL_OPT} RESULT_VARIABLE RETCODE)
+
+        if (NOT RETCODE STREQUAL "0")
+          message(FATAL_ERROR "[TSIM] sbt failed to compile from Chisel to Verilog.")
+        endif()
+
+        file(GLOB VERILATOR_RTL_SRC ${CHISEL_TARGET_DIR}/*.v)
+
+      else()
+        message(FATAL_ERROR "[TSIM] sbt should be installed for Chisel")
+      endif() # sbt
+
+    elseif (TSIM_TARGET STREQUAL "verilog")
+
+      set(VTA_VERILOG_DIR ${VTA_DIR}/hardware/chisel/src/main/resources/verilog)
+      set(TSIM_VERILOG_DIR ${CMAKE_CURRENT_SOURCE_DIR}/hardware/verilog)
+      file(GLOB VERILATOR_RTL_SRC ${VTA_VERILOG_DIR}/*.v ${TSIM_VERILOG_DIR}/*.v)
+
+    else()
+      message(STATUS "[TSIM] target language can only be verilog or chisel...")
+    endif() # TSIM_TARGET
+
+    if (TSIM_TARGET STREQUAL "chisel" OR TSIM_TARGET STREQUAL "verilog")
+
+      # Check if tracing can be enabled
+      if (NOT TSIM_USE_TRACE STREQUAL "OFF")
+        message(STATUS "[TSIM] Verilator tracing enabled")
+      else()
+        message(STATUS "[TSIM] Verilator tracing disabled")
+      endif()
+
+      # Verilator - Verilog to C++ compilation
+      set(VERILATOR_TARGET_DIR ${TSIM_BUILD_DIR}/verilator)
+      set(VERILATOR_OPT
+define+RANDOMIZE_GARBAGE_ASSIGN +define+RANDOMIZE_REG_INIT) + list(APPEND VERILATOR_OPT +define+RANDOMIZE_MEM_INIT --x-assign unique) + list(APPEND VERILATOR_OPT --output-split 20000 --output-split-cfuncs 20000) + list(APPEND VERILATOR_OPT --top-module ${TSIM_TOP_NAME} -Mdir ${VERILATOR_TARGET_DIR}) + list(APPEND VERILATOR_OPT --cc ${VERILATOR_RTL_SRC}) + + if (NOT TSIM_USE_TRACE STREQUAL "OFF") + list(APPEND VERILATOR_OPT --trace) + endif() + + execute_process(COMMAND ${VERILATOR} ${VERILATOR_OPT} RESULT_VARIABLE RETCODE) + + if (NOT RETCODE STREQUAL "0") + message(FATAL_ERROR "[TSIM] Verilator failed to compile Verilog to C++...") + endif() + + # Build shared library (.so) + set(VTA_HW_DPI_DIR ${VTA_DIR}/hardware/dpi) + set(VERILATOR_INC_DIR /usr/local/share/verilator/include) + set(VERILATOR_LIB_SRC ${VERILATOR_INC_DIR}/verilated.cpp ${VERILATOR_INC_DIR}/verilated_dpi.cpp) + + if (NOT TSIM_USE_TRACE STREQUAL "OFF") + list(APPEND VERILATOR_LIB_SRC ${VERILATOR_INC_DIR}/verilated_vcd_c.cpp) + endif() + + file(GLOB VERILATOR_GEN_SRC ${VERILATOR_TARGET_DIR}/*.cpp) + file(GLOB VERILATOR_SRC ${VTA_HW_DPI_DIR}/tsim_device.cc) + add_library(tsim SHARED ${VERILATOR_LIB_SRC} ${VERILATOR_GEN_SRC} ${VERILATOR_SRC}) + + set(VERILATOR_DEF VL_TSIM_NAME=V${TSIM_TOP_NAME} VL_PRINTF=printf VM_COVERAGE=0 VM_SC=0) + if (NOT TSIM_USE_TRACE STREQUAL "OFF") + list(APPEND VERILATOR_DEF VM_TRACE=1 TSIM_TRACE_FILE=${TSIM_BUILD_DIR}/${TSIM_TRACE_NAME}.vcd) + else() + list(APPEND VERILATOR_DEF VM_TRACE=0) + endif() + target_compile_definitions(tsim PRIVATE ${VERILATOR_DEF}) + target_compile_options(tsim PRIVATE -Wno-sign-compare -include V${TSIM_TOP_NAME}.h) + target_include_directories(tsim PRIVATE ${VERILATOR_TARGET_DIR} ${VERILATOR_INC_DIR} ${VERILATOR_INC_DIR}/vltstd ${VTA_DIR}/include) + + if(APPLE) + set_target_properties(tsim PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") + endif(APPLE) + + endif() # TSIM_TARGET STREQUAL "chisel" OR TSIM_TARGET STREQUAL "verilog" + + else() + message(STATUS "[TSIM] could not find Python or Verilator, build is skipped...") + endif() # VERILATOR +endif() # MSVC diff --git a/vta/apps/tsim_example/hardware/chisel/Makefile b/vta/apps/tsim_example/hardware/chisel/Makefile new file mode 100644 index 000000000000..65a9ed13c989 --- /dev/null +++ b/vta/apps/tsim_example/hardware/chisel/Makefile @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
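+
+# Chisel elaboration is driven by sbt from cmake/modules/tsim.cmake;
+# this Makefile only provides a clean rule for sbt build artifacts.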
+ +clean: + -rm -rf target project/target project/project diff --git a/vta/apps/tsim_example/hardware/chisel/build.sbt b/vta/apps/tsim_example/hardware/chisel/build.sbt new file mode 100644 index 000000000000..a2afc0d9d362 --- /dev/null +++ b/vta/apps/tsim_example/hardware/chisel/build.sbt @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +name := "accel" +version := "0.1.0-SNAPSHOT" +organization := "edu.washington.cs" + +def scalacOptionsVersion(scalaVersion: String): Seq[String] = { + Seq() ++ { + // If we're building with Scala > 2.11, enable the compile option + // switch to support our anonymous Bundle definitions: + // https://github.com/scala/bug/issues/10047 + CrossVersion.partialVersion(scalaVersion) match { + case Some((2, scalaMajor: Long)) if scalaMajor < 12 => Seq() + case _ => Seq( + "-Xsource:2.11", + "-language:reflectiveCalls", + "-language:implicitConversions", + "-deprecation", + "-Xlint", + "-Ywarn-unused", + ) + } + } +} + +def javacOptionsVersion(scalaVersion: String): Seq[String] = { + Seq() ++ { + // Scala 2.12 requires Java 8. We continue to generate + // Java 7 compatible code for Scala 2.11 + // for compatibility with old clients. + CrossVersion.partialVersion(scalaVersion) match { + case Some((2, scalaMajor: Long)) if scalaMajor < 12 => + Seq("-source", "1.7", "-target", "1.7") + case _ => + Seq("-source", "1.8", "-target", "1.8") + } + } +} + +scalaVersion := "2.11.12" + +resolvers ++= Seq( + Resolver.sonatypeRepo("snapshots"), + Resolver.sonatypeRepo("releases")) + +libraryDependencies ++= Seq( + "edu.berkeley.cs" %% "chisel3" % "3.1.7", + "edu.washington.cs" %% "vta" % "0.1.0-SNAPSHOT", +) + +scalacOptions ++= scalacOptionsVersion(scalaVersion.value) +javacOptions ++= javacOptionsVersion(scalaVersion.value) diff --git a/vta/apps/tsim_example/hardware/chisel/project/build.properties b/vta/apps/tsim_example/hardware/chisel/project/build.properties new file mode 100644 index 000000000000..7e2b74b51a4f --- /dev/null +++ b/vta/apps/tsim_example/hardware/chisel/project/build.properties @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +sbt.version = 1.1.1 diff --git a/vta/apps/tsim_example/hardware/chisel/project/plugins.sbt b/vta/apps/tsim_example/hardware/chisel/project/plugins.sbt new file mode 100644 index 000000000000..79ffb2245d52 --- /dev/null +++ b/vta/apps/tsim_example/hardware/chisel/project/plugins.sbt @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +logLevel := Level.Warn diff --git a/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Accel.scala b/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Accel.scala new file mode 100644 index 000000000000..9225f83b0821 --- /dev/null +++ b/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Accel.scala @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package accel + +import chisel3._ +import vta.dpi._ + +/** Add-by-one accelerator. 
+ * + * ___________ ___________ + * | | | | + * | HostDPI | <--> | RegFile | <->| + * |_________| |_________| | + * | + * ___________ ___________ | + * | | | | | + * | MemDPI | <--> | Compute | <->| + * |_________| |_________| + * + */ +class Accel extends Module { + val io = IO(new Bundle { + val host = new VTAHostDPIClient + val mem = new VTAMemDPIMaster + }) + val rf = Module(new RegFile) + val ce = Module(new Compute) + rf.io.host <> io.host + io.mem <> ce.io.mem + ce.io.launch := rf.io.launch + rf.io.finish := ce.io.finish + ce.io.length := rf.io.length + ce.io.inp_baddr := rf.io.inp_baddr + ce.io.out_baddr := rf.io.out_baddr +} diff --git a/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Compute.scala b/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Compute.scala new file mode 100644 index 000000000000..fb7a2f396cb0 --- /dev/null +++ b/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Compute.scala @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package accel + +import chisel3._ +import chisel3.util._ +import vta.dpi._ + +/** Compute + * + * Add-by-one procedure: + * + * 1. Wait for launch to be asserted + * 2. Issue a read request for 8-byte value at inp_baddr address + * 3. Wait for the value + * 4. Issue a write request for 8-byte value at out_baddr address + * 5. Increment read-address and write-address for next value + * 6. Check if counter (cnt) is equal to length to assert finish, + * otherwise go to step 2. 
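+ *
+ * For example, with length = 2 the module reads the 8-byte words at
+ * inp_baddr and inp_baddr + 8, and writes each value incremented by one
+ * to out_baddr and out_baddr + 8 before asserting finish.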
+ */ +class Compute extends Module { + val io = IO(new Bundle { + val launch = Input(Bool()) + val finish = Output(Bool()) + val length = Input(UInt(32.W)) + val inp_baddr = Input(UInt(64.W)) + val out_baddr = Input(UInt(64.W)) + val mem = new VTAMemDPIMaster + }) + val sIdle :: sReadReq :: sReadData :: sWriteReq :: sWriteData :: Nil = Enum(5) + val state = RegInit(sIdle) + val reg = Reg(chiselTypeOf(io.mem.rd.bits)) + val cnt = Reg(chiselTypeOf(io.length)) + val raddr = Reg(chiselTypeOf(io.inp_baddr)) + val waddr = Reg(chiselTypeOf(io.out_baddr)) + + switch (state) { + is (sIdle) { + when (io.launch) { + state := sReadReq + } + } + is (sReadReq) { + state := sReadData + } + is (sReadData) { + when (io.mem.rd.valid) { + state := sWriteReq + } + } + is (sWriteReq) { + state := sWriteData + } + is (sWriteData) { + when (cnt === (io.length - 1.U)) { + state := sIdle + } .otherwise { + state := sReadReq + } + } + } + + // calculate next address + when (state === sIdle) { + raddr := io.inp_baddr + waddr := io.out_baddr + } .elsewhen (state === sWriteData) { // increment by 8-bytes + raddr := raddr + 8.U + waddr := waddr + 8.U + } + + // create request + io.mem.req.valid := state === sReadReq | state === sWriteReq + io.mem.req.opcode := state === sWriteReq + io.mem.req.len := 0.U // one-word-per-request + io.mem.req.addr := Mux(state === sReadReq, raddr, waddr) + + // read + when (state === sReadData && io.mem.rd.valid) { + reg := io.mem.rd.bits + 1.U + } + io.mem.rd.ready := state === sReadData + + // write + io.mem.wr.valid := state === sWriteData + io.mem.wr.bits := reg + + // count read/write + when (state === sIdle) { + cnt := 0.U + } .elsewhen (state === sWriteData) { + cnt := cnt + 1.U + } + + // done when read/write are equal to length + io.finish := state === sWriteData && cnt === (io.length - 1.U) +} diff --git a/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/RegFile.scala b/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/RegFile.scala new file mode 100644 index 000000000000..e636afdfb2e1 --- /dev/null +++ b/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/RegFile.scala @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package accel + +import chisel3._ +import chisel3.util._ +import vta.dpi._ + +/** Register File. + * + * Six 32-bit register file. 
+ * + * ------------------------------- + * Register description | addr + * -------------------------|----- + * Control status register | 0x00 + * Length value register | 0x04 + * Input pointer lsb | 0x08 + * Input pointer msb | 0x0c + * Output pointer lsb | 0x10 + * Output pointer msb | 0x14 + * ------------------------------- + + * ------------------------------ + * Control status register | bit + * ------------------------------ + * Launch | 0 + * Finish | 1 + * ------------------------------ + */ +class RegFile extends Module { + val io = IO(new Bundle { + val launch = Output(Bool()) + val finish = Input(Bool()) + val length = Output(UInt(32.W)) + val inp_baddr = Output(UInt(64.W)) + val out_baddr = Output(UInt(64.W)) + val host = new VTAHostDPIClient + }) + val sIdle :: sRead :: Nil = Enum(2) + val state = RegInit(sIdle) + + switch (state) { + is (sIdle) { + when (io.host.req.valid && !io.host.req.opcode) { + state := sRead + } + } + is (sRead) { + state := sIdle + } + } + + io.host.req.deq := state === sIdle & io.host.req.valid + + val reg = Seq.fill(6)(RegInit(0.U.asTypeOf(chiselTypeOf(io.host.req.value)))) + val addr = Seq.tabulate(6)(_ * 4) + val reg_map = (addr zip reg) map { case (a, r) => a.U -> r } + + (reg zip addr).foreach { case(r, a) => + if (a == 0) { // control status register + when (io.finish) { + r := "b_10".U + } .elsewhen (state === sIdle && io.host.req.valid && + io.host.req.opcode && a.U === io.host.req.addr) { + r := io.host.req.value + } + } else { + when (state === sIdle && io.host.req.valid && + io.host.req.opcode && a.U === io.host.req.addr) { + r := io.host.req.value + } + } + } + + val rdata = RegInit(0.U.asTypeOf(chiselTypeOf(io.host.req.value))) + when (state === sIdle && io.host.req.valid && !io.host.req.opcode) { + rdata := MuxLookup(io.host.req.addr, 0.U, reg_map) + } + + io.host.resp.valid := state === sRead + io.host.resp.bits := rdata + + io.launch := reg(0)(0) + io.length := reg(1) + io.inp_baddr := Cat(reg(3), reg(2)) + io.out_baddr := Cat(reg(5), reg(4)) +} diff --git a/vta/apps/tsim_example/hardware/chisel/src/test/scala/dut/TestAccel.scala b/vta/apps/tsim_example/hardware/chisel/src/test/scala/dut/TestAccel.scala new file mode 100644 index 000000000000..45f81d50a50b --- /dev/null +++ b/vta/apps/tsim_example/hardware/chisel/src/test/scala/dut/TestAccel.scala @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package test + +import chisel3._ +import chisel3.experimental.{RawModule, withClockAndReset} +import vta.dpi._ +import accel._ + +/** VTA simulation shell. + * + * Instantiate Host and Memory DPI modules. 
+ * + */ +class VTASimShell extends RawModule { + val io = IO(new Bundle { + val clock = Input(Clock()) + val reset = Input(Bool()) + val host = new VTAHostDPIMaster + val mem = new VTAMemDPIClient + }) + val host = Module(new VTAHostDPI) + val mem = Module(new VTAMemDPI) + mem.io.reset := io.reset + mem.io.clock := io.clock + host.io.reset := io.reset + host.io.clock := io.clock + io.mem <> mem.io.dpi + io.host <> host.io.dpi +} + +/** Test accelerator. + * + * Instantiate and connect the simulation-shell and the accelerator. + * + */ +class TestAccel extends RawModule { + val clock = IO(Input(Clock())) + val reset = IO(Input(Bool())) + + val sim_shell = Module(new VTASimShell) + val vta_accel = withClockAndReset(clock, reset) { Module(new Accel) } + + sim_shell.io.clock := clock + sim_shell.io.reset := reset + vta_accel.io.host <> sim_shell.io.host + sim_shell.io.mem <> vta_accel.io.mem +} + +/** Generate TestAccel as top module */ +object Elaborate extends App { + chisel3.Driver.execute(args, () => new TestAccel) +} diff --git a/vta/apps/tsim_example/hardware/verilog/Accel.v b/vta/apps/tsim_example/hardware/verilog/Accel.v new file mode 100644 index 000000000000..b025aad22ab7 --- /dev/null +++ b/vta/apps/tsim_example/hardware/verilog/Accel.v @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** Add-by-one accelerator. 
+ * + * ___________ ___________ + * | | | | + * | HostDPI | <--> | RegFile | <->| + * |_________| |_________| | + * | + * ___________ ___________ | + * | | | | | + * | MemDPI | <--> | Compute | <->| + * |_________| |_________| + * + */ +module Accel # +( parameter HOST_ADDR_BITS = 8, + parameter HOST_DATA_BITS = 32, + parameter MEM_LEN_BITS = 8, + parameter MEM_ADDR_BITS = 64, + parameter MEM_DATA_BITS = 64 +) +( + input clock, + input reset, + + input host_req_valid, + input host_req_opcode, + input [HOST_ADDR_BITS-1:0] host_req_addr, + input [HOST_DATA_BITS-1:0] host_req_value, + output host_req_deq, + output host_resp_valid, + output [HOST_DATA_BITS-1:0] host_resp_bits, + + output mem_req_valid, + output mem_req_opcode, + output [MEM_LEN_BITS-1:0] mem_req_len, + output [MEM_ADDR_BITS-1:0] mem_req_addr, + output mem_wr_valid, + output [MEM_DATA_BITS-1:0] mem_wr_bits, + input mem_rd_valid, + input [MEM_DATA_BITS-1:0] mem_rd_bits, + output mem_rd_ready +); + + logic launch; + logic finish; + logic [HOST_DATA_BITS-1:0] length; + logic [MEM_ADDR_BITS-1:0] inp_baddr; + logic [MEM_ADDR_BITS-1:0] out_baddr; + + RegFile # + ( + .MEM_ADDR_BITS(MEM_ADDR_BITS), + .HOST_ADDR_BITS(HOST_ADDR_BITS), + .HOST_DATA_BITS(HOST_DATA_BITS) + ) + rf + ( + .clock (clock), + .reset (reset), + + .host_req_valid (host_req_valid), + .host_req_opcode (host_req_opcode), + .host_req_addr (host_req_addr), + .host_req_value (host_req_value), + .host_req_deq (host_req_deq), + .host_resp_valid (host_resp_valid), + .host_resp_bits (host_resp_bits), + + .launch (launch), + .finish (finish), + .length (length), + .inp_baddr (inp_baddr), + .out_baddr (out_baddr) + ); + + Compute # + ( + .MEM_LEN_BITS(MEM_LEN_BITS), + .MEM_ADDR_BITS(MEM_ADDR_BITS), + .MEM_DATA_BITS(MEM_DATA_BITS), + .HOST_DATA_BITS(HOST_DATA_BITS) + ) + comp + ( + .clock (clock), + .reset (reset), + + .mem_req_valid (mem_req_valid), + .mem_req_opcode (mem_req_opcode), + .mem_req_len (mem_req_len), + .mem_req_addr (mem_req_addr), + .mem_wr_valid (mem_wr_valid), + .mem_wr_bits (mem_wr_bits), + .mem_rd_valid (mem_rd_valid), + .mem_rd_bits (mem_rd_bits), + .mem_rd_ready (mem_rd_ready), + + .launch (launch), + .finish (finish), + .length (length), + .inp_baddr (inp_baddr), + .out_baddr (out_baddr) + ); + +endmodule diff --git a/vta/apps/tsim_example/hardware/verilog/Compute.v b/vta/apps/tsim_example/hardware/verilog/Compute.v new file mode 100644 index 000000000000..a5660ac8bc7d --- /dev/null +++ b/vta/apps/tsim_example/hardware/verilog/Compute.v @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** Compute + * + * Add-by-one procedure: + * + * 1. Wait for launch to be asserted + * 2. Issue a read request for 8-byte value at inp_baddr address + * 3. Wait for the value + * 4. 
Issue a write request for 8-byte value at out_baddr address + * 5. Increment read-address and write-address for next value + * 6. Check if counter (cnt) is equal to length to assert finish, + * otherwise go to step 2. + */ +module Compute # +( + parameter MEM_LEN_BITS = 8, + parameter MEM_ADDR_BITS = 64, + parameter MEM_DATA_BITS = 64, + parameter HOST_DATA_BITS = 32 +) +( + input clock, + input reset, + + output mem_req_valid, + output mem_req_opcode, + output [MEM_LEN_BITS-1:0] mem_req_len, + output [MEM_ADDR_BITS-1:0] mem_req_addr, + output mem_wr_valid, + output [MEM_DATA_BITS-1:0] mem_wr_bits, + input mem_rd_valid, + input [MEM_DATA_BITS-1:0] mem_rd_bits, + output mem_rd_ready, + + input launch, + output finish, + input [HOST_DATA_BITS-1:0] length, + input [MEM_ADDR_BITS-1:0] inp_baddr, + input [MEM_ADDR_BITS-1:0] out_baddr +); + + typedef enum logic [2:0] {IDLE, + READ_REQ, + READ_DATA, + WRITE_REQ, + WRITE_DATA} state_t; + + state_t state_n, state_r; + + logic [31:0] cnt; + logic [MEM_DATA_BITS-1:0] data; + logic [MEM_ADDR_BITS-1:0] raddr; + logic [MEM_ADDR_BITS-1:0] waddr; + + always_ff @(posedge clock) begin + if (reset) begin + state_r <= IDLE; + end else begin + state_r <= state_n; + end + end + + always_comb begin + state_n = IDLE; + case (state_r) + IDLE: begin + if (launch) begin + state_n = READ_REQ; + end + end + + READ_REQ: begin + state_n = READ_DATA; + end + + READ_DATA: begin + if (mem_rd_valid) begin + state_n = WRITE_REQ; + end else begin + state_n = READ_DATA; + end + end + + WRITE_REQ: begin + state_n = WRITE_DATA; + end + + WRITE_DATA: begin + if (cnt == (length - 1'b1)) begin + state_n = IDLE; + end else begin + state_n = READ_REQ; + end + end + + default: begin + end + endcase + end + + // calculate next address + always_ff @(posedge clock) begin + if (reset | state_r == IDLE) begin + raddr <= inp_baddr; + waddr <= out_baddr; + end else if (state_r == WRITE_DATA) begin + raddr <= raddr + 'd8; + waddr <= waddr + 'd8; + end + end + + // create request + assign mem_req_valid = (state_r == READ_REQ) | (state_r == WRITE_REQ); + assign mem_req_opcode = state_r == WRITE_REQ; + assign mem_req_len = 'd0; // one-word-per-request + assign mem_req_addr = (state_r == READ_REQ)? raddr : waddr; + + // read + always_ff @(posedge clock) begin + if ((state_r == READ_DATA) & mem_rd_valid) begin + data <= mem_rd_bits + 1'b1; + end + end + assign mem_rd_ready = state_r == READ_DATA; + + // write + assign mem_wr_valid = state_r == WRITE_DATA; + assign mem_wr_bits = data; + + // count read/write + always_ff @(posedge clock) begin + if (reset | state_r == IDLE) begin + cnt <= 'd0; + end else if (state_r == WRITE_DATA) begin + cnt <= cnt + 1'b1; + end + end + + // done when read/write are equal to length + assign finish = (state_r == WRITE_DATA) & (cnt == (length - 1'b1)); +endmodule diff --git a/vta/apps/tsim_example/hardware/verilog/RegFile.v b/vta/apps/tsim_example/hardware/verilog/RegFile.v new file mode 100644 index 000000000000..28edf9672f48 --- /dev/null +++ b/vta/apps/tsim_example/hardware/verilog/RegFile.v @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** Register File. + * + * Six 32-bit register file. + * + * ------------------------------- + * Register description | addr + * -------------------------|----- + * Control status register | 0x00 + * Length value register | 0x04 + * Input pointer lsb | 0x08 + * Input pointer msb | 0x0c + * Output pointer lsb | 0x10 + * Output pointer msb | 0x14 + * ------------------------------- + + * ------------------------------ + * Control status register | bit + * ------------------------------ + * Launch | 0 + * Finish | 1 + * ------------------------------ + */ +module RegFile # + (parameter MEM_ADDR_BITS = 64, + parameter HOST_ADDR_BITS = 8, + parameter HOST_DATA_BITS = 32 +) +( + input clock, + input reset, + + input host_req_valid, + input host_req_opcode, + input [HOST_ADDR_BITS-1:0] host_req_addr, + input [HOST_DATA_BITS-1:0] host_req_value, + output host_req_deq, + output host_resp_valid, + output [HOST_DATA_BITS-1:0] host_resp_bits, + + output launch, + input finish, + output [HOST_DATA_BITS-1:0] length, + output [MEM_ADDR_BITS-1:0] inp_baddr, + output [MEM_ADDR_BITS-1:0] out_baddr +); + + typedef enum logic {IDLE, READ} state_t; + state_t state_n, state_r; + + always_ff @(posedge clock) begin + if (reset) begin + state_r <= IDLE; + end else begin + state_r <= state_n; + end + end + + always_comb begin + state_n = IDLE; + case (state_r) + IDLE: begin + if (host_req_valid & ~host_req_opcode) begin + state_n = READ; + end + end + + READ: begin + state_n = IDLE; + end + endcase + end + + assign host_req_deq = (state_r == IDLE) ? host_req_valid : 1'b0; + + logic [HOST_DATA_BITS-1:0] rf [5:0]; + + genvar i; + for (i = 0; i < 6; i++) begin + logic wen = (state_r == IDLE)? 
host_req_valid & host_req_opcode & i*4 == host_req_addr : 1'b0; + if (i == 0) begin + always_ff @(posedge clock) begin + if (reset) begin + end else if (finish) begin + rf[i] <= 'd2; + end else if (wen) begin + rf[i] <= host_req_value; + end + end + end else begin + always_ff @(posedge clock) begin + if (reset) begin + end else if (wen) begin + rf[i] <= host_req_value; + end + end + end + end + + logic [HOST_DATA_BITS-1:0] rdata; + always_ff @(posedge clock) begin + if (reset) begin + rdata <= 'd0; + end else if ((state_r == IDLE) & host_req_valid & ~host_req_opcode) begin + if (host_req_addr == 'h00) begin + rdata <= rf[0]; + end else if (host_req_addr == 'h04) begin + rdata <= rf[1]; + end else if (host_req_addr == 'h08) begin + rdata <= rf[2]; + end else if (host_req_addr == 'h0c) begin + rdata <= rf[3]; + end else if (host_req_addr == 'h10) begin + rdata <= rf[4]; + end else if (host_req_addr == 'h14) begin + rdata <= rf[5]; + end else begin + rdata <= 'd0; + end + end + end + + assign host_resp_valid = (state_r == READ); + assign host_resp_bits = rdata; + + assign launch = rf[0][0]; + assign length = rf[1]; + assign inp_baddr = {rf[3], rf[2]}; + assign out_baddr = {rf[5], rf[4]}; + +endmodule diff --git a/vta/apps/tsim_example/hardware/verilog/TestAccel.v b/vta/apps/tsim_example/hardware/verilog/TestAccel.v new file mode 100644 index 000000000000..f3bcc862a3ba --- /dev/null +++ b/vta/apps/tsim_example/hardware/verilog/TestAccel.v @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** Test accelerator. + * + * Instantiate host/memory DPI modules and connect them to the accelerator. 
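+ *
+ * VTAHostDPI drives register reads/writes into the accelerator's RegFile,
+ * while the accelerator's Compute unit issues its memory requests through
+ * VTAMemDPI.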
+ * + */ +module TestAccel +( + input clock, + input reset +); + + localparam HOST_ADDR_BITS = 8; + localparam HOST_DATA_BITS = 32; + + logic host_req_valid; + logic host_req_opcode; + logic [HOST_ADDR_BITS-1:0] host_req_addr; + logic [HOST_DATA_BITS-1:0] host_req_value; + logic host_req_deq; + logic host_resp_valid; + logic [HOST_DATA_BITS-1:0] host_resp_bits; + + localparam MEM_LEN_BITS = 8; + localparam MEM_ADDR_BITS = 64; + localparam MEM_DATA_BITS = 64; + + logic mem_req_valid; + logic mem_req_opcode; + logic [MEM_LEN_BITS-1:0] mem_req_len; + logic [MEM_ADDR_BITS-1:0] mem_req_addr; + logic mem_wr_valid; + logic [MEM_DATA_BITS-1:0] mem_wr_bits; + logic mem_rd_valid; + logic [MEM_DATA_BITS-1:0] mem_rd_bits; + logic mem_rd_ready; + + VTAHostDPI host + ( + .clock (clock), + .reset (reset), + + .dpi_req_valid (host_req_valid), + .dpi_req_opcode (host_req_opcode), + .dpi_req_addr (host_req_addr), + .dpi_req_value (host_req_value), + .dpi_req_deq (host_req_deq), + .dpi_resp_valid (host_resp_valid), + .dpi_resp_bits (host_resp_bits) + ); + + VTAMemDPI mem + ( + .clock (clock), + .reset (reset), + + .dpi_req_valid (mem_req_valid), + .dpi_req_opcode (mem_req_opcode), + .dpi_req_len (mem_req_len), + .dpi_req_addr (mem_req_addr), + .dpi_wr_valid (mem_wr_valid), + .dpi_wr_bits (mem_wr_bits), + .dpi_rd_valid (mem_rd_valid), + .dpi_rd_bits (mem_rd_bits), + .dpi_rd_ready (mem_rd_ready) + ); + + Accel # + ( + .HOST_ADDR_BITS(HOST_ADDR_BITS), + .HOST_DATA_BITS(HOST_DATA_BITS), + .MEM_LEN_BITS(MEM_LEN_BITS), + .MEM_ADDR_BITS(MEM_ADDR_BITS), + .MEM_DATA_BITS(MEM_DATA_BITS) + ) + accel + ( + .clock (clock), + .reset (reset), + + .host_req_valid (host_req_valid), + .host_req_opcode (host_req_opcode), + .host_req_addr (host_req_addr), + .host_req_value (host_req_value), + .host_req_deq (host_req_deq), + .host_resp_valid (host_resp_valid), + .host_resp_bits (host_resp_bits), + + .mem_req_valid (mem_req_valid), + .mem_req_opcode (mem_req_opcode), + .mem_req_len (mem_req_len), + .mem_req_addr (mem_req_addr), + .mem_wr_valid (mem_wr_valid), + .mem_wr_bits (mem_wr_bits), + .mem_rd_valid (mem_rd_valid), + .mem_rd_bits (mem_rd_bits), + .mem_rd_ready (mem_rd_ready) + ); +endmodule diff --git a/vta/apps/tsim_example/python/tsim/__init__.py b/vta/apps/tsim_example/python/tsim/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/vta/apps/tsim_example/python/tsim/config.json b/vta/apps/tsim_example/python/tsim/config.json new file mode 100644 index 000000000000..887eaac67d74 --- /dev/null +++ b/vta/apps/tsim_example/python/tsim/config.json @@ -0,0 +1,7 @@ +{ + "TARGET" : "verilog", + "TOP_NAME" : "TestAccel", + "BUILD_NAME" : "build", + "USE_TRACE" : "off", + "TRACE_NAME" : "trace" +} diff --git a/vta/apps/tsim_example/python/tsim/config.py b/vta/apps/tsim_example/python/tsim/config.py new file mode 100644 index 000000000000..6ff4f4234cf0 --- /dev/null +++ b/vta/apps/tsim_example/python/tsim/config.py @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os.path as osp +import sys +import json +import argparse + +cur = osp.abspath(osp.dirname(__file__)) +cfg = json.load(open(osp.join(cur, 'config.json'))) + +def main(): + """Main function""" + parser = argparse.ArgumentParser() + parser.add_argument("--get-target", action="store_true", + help="Get target language, i.e. verilog or chisel") + parser.add_argument("--get-top-name", action="store_true", + help="Get hardware design top name") + parser.add_argument("--get-build-name", action="store_true", + help="Get build folder name") + parser.add_argument("--get-use-trace", action="store_true", + help="Get use trace") + parser.add_argument("--get-trace-name", action="store_true", + help="Get trace filename") + args = parser.parse_args() + + if len(sys.argv) == 1: + parser.print_help() + return + + if args.get_target: + print(cfg['TARGET']) + + if args.get_top_name: + print(cfg['TOP_NAME']) + + if args.get_build_name: + print(cfg['BUILD_NAME']) + + if args.get_use_trace: + print(cfg['USE_TRACE']) + + if args.get_trace_name: + print(cfg['TRACE_NAME']) + +if __name__ == "__main__": + main() diff --git a/vta/apps/tsim_example/python/tsim/load.py b/vta/apps/tsim_example/python/tsim/load.py new file mode 100644 index 000000000000..ef94fa97d206 --- /dev/null +++ b/vta/apps/tsim_example/python/tsim/load.py @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
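+
+# Utilities used by the example test: locate the build directory from
+# config.json, pick the platform-specific shared-library extension, and
+# load the driver and TSIM hardware modules into TVM.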
+ +import tvm +import ctypes +import json +import os.path as osp +from sys import platform + +def get_build_path(): + curr_path = osp.dirname(osp.abspath(osp.expanduser(__file__))) + cfg = json.load(open(osp.join(curr_path, 'config.json'))) + return osp.join(curr_path, "..", "..", cfg['BUILD_NAME']) + +def get_lib_ext(): + if platform == "darwin": + ext = ".dylib" + else: + ext = ".so" + return ext + +def get_lib_path(name): + build_path = get_build_path() + ext = get_lib_ext() + libname = name + ext + return osp.join(build_path, libname) + +def _load_driver_lib(): + lib = get_lib_path("libdriver") + try: + return [ctypes.CDLL(lib, ctypes.RTLD_GLOBAL)] + except OSError: + return [] + +def load_driver(): + return tvm.get_global_func("tvm.vta.driver") + +def load_tsim(): + lib = get_lib_path("libtsim") + return tvm.module.load(lib, "vta-tsim") + +LIBS = _load_driver_lib() diff --git a/vta/apps/tsim_example/src/driver.cc b/vta/apps/tsim_example/src/driver.cc new file mode 100644 index 000000000000..9898537a3f25 --- /dev/null +++ b/vta/apps/tsim_example/src/driver.cc @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/registry.h>
+#include <vta/dpi/module.h>
+
+namespace vta {
+namespace driver {
+
+uint32_t get_half_addr(void *p, bool upper) {
+  if (upper) {
+    return ((uint64_t) ((uint64_t*) p)) >> 32;
+  } else {
+    return ((uint64_t) ((uint64_t*) p));
+  }
+}
+
+using vta::dpi::DPIModuleNode;
+using tvm::runtime::Module;
+
+class TestDriver {
+ public:
+  TestDriver(Module module)
+    : module_(module) {
+    dpi_ = static_cast<DPIModuleNode*>(
+        module.operator->());
+  }
+
+  int Run(uint32_t length, void* inp, void* out) {
+    uint32_t wait_cycles = 100000000;
+    this->Launch(wait_cycles, length, inp, out);
+    this->WaitForCompletion(wait_cycles);
+    dpi_->Finish();
+    return 0;
+  }
+
+ private:
+  void Launch(uint32_t wait_cycles, uint32_t length, void* inp, void* out) {
+    dpi_->Launch(wait_cycles);
+    // write registers
+    dpi_->WriteReg(0x04, length);
+    dpi_->WriteReg(0x08, get_half_addr(inp, false));
+    dpi_->WriteReg(0x0c, get_half_addr(inp, true));
+    dpi_->WriteReg(0x10, get_half_addr(out, false));
+    dpi_->WriteReg(0x14, get_half_addr(out, true));
+    dpi_->WriteReg(0x00, 0x1); // launch
+  }
+
+  void WaitForCompletion(uint32_t wait_cycles) {
+    uint32_t i, val;
+    for (i = 0; i < wait_cycles; i++) {
+      val = dpi_->ReadReg(0x00);
+      if (val == 2) break; // finish
+    }
+  }
+
+ private:
+  DPIModuleNode* dpi_;
+  Module module_;
+};
+
+using tvm::runtime::TVMRetValue;
+using tvm::runtime::TVMArgs;
+
+TVM_REGISTER_GLOBAL("tvm.vta.driver")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    Module dev_mod = args[0];
+    DLTensor* A = args[1];
+    DLTensor* B = args[2];
+    TestDriver dev_(dev_mod);
+    dev_.Run(A->shape[0], A->data, B->data);
+  });
+
+} // namespace driver
+} // namespace vta
diff --git a/vta/apps/tsim_example/tests/python/test_tsim.py b/vta/apps/tsim_example/tests/python/test_tsim.py
new file mode 100644
index 000000000000..fd032f91914e
--- /dev/null
+++ b/vta/apps/tsim_example/tests/python/test_tsim.py
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
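+
+# End-to-end check for the add-by-one accelerator: load the compiled
+# hardware library, drive it through the "tvm.vta.driver" packed function
+# with a random uint64 vector, and verify that b == a + 1.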
+ +import tvm +import numpy as np +from tsim.load import load_driver, load_tsim + +def test_tsim(i): + rmin = 1 # min vector size of 1 + rmax = 64 + n = np.random.randint(rmin, rmax) + ctx = tvm.cpu(0) + a = tvm.nd.array(np.random.randint(rmax, size=n).astype("uint64"), ctx) + b = tvm.nd.array(np.zeros(n).astype("uint64"), ctx) + tsim = load_tsim() + f = load_driver() + f(tsim, a, b) + emsg = "[FAIL] test number:{} n:{}".format(i, n) + np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1, err_msg=emsg) + print("[PASS] test number:{} n:{}".format(i, n)) + +if __name__ == "__main__": + for i in range(10): + test_tsim(i) diff --git a/vta/hardware/chisel/Makefile b/vta/hardware/chisel/Makefile new file mode 100644 index 000000000000..65a9ed13c989 --- /dev/null +++ b/vta/hardware/chisel/Makefile @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +clean: + -rm -rf target project/target project/project diff --git a/vta/hardware/chisel/build.sbt b/vta/hardware/chisel/build.sbt new file mode 100644 index 000000000000..3fa93d2c5cfc --- /dev/null +++ b/vta/hardware/chisel/build.sbt @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +name := "vta" +version := "0.1.0-SNAPSHOT" +organization := "edu.washington.cs" + +def scalacOptionsVersion(scalaVersion: String): Seq[String] = { + Seq() ++ { + // If we're building with Scala > 2.11, enable the compile option + // switch to support our anonymous Bundle definitions: + // https://github.com/scala/bug/issues/10047 + CrossVersion.partialVersion(scalaVersion) match { + case Some((2, scalaMajor: Long)) if scalaMajor < 12 => Seq() + case _ => Seq( + "-Xsource:2.11", + "-language:reflectiveCalls", + "-language:implicitConversions", + "-deprecation", + "-Xlint", + "-Ywarn-unused", + ) + } + } +} + +def javacOptionsVersion(scalaVersion: String): Seq[String] = { + Seq() ++ { + // Scala 2.12 requires Java 8. We continue to generate + // Java 7 compatible code for Scala 2.11 + // for compatibility with old clients. 
+ CrossVersion.partialVersion(scalaVersion) match { + case Some((2, scalaMajor: Long)) if scalaMajor < 12 => + Seq("-source", "1.7", "-target", "1.7") + case _ => + Seq("-source", "1.8", "-target", "1.8") + } + } +} + +scalaVersion := "2.11.12" + +resolvers ++= Seq( + Resolver.sonatypeRepo("snapshots"), + Resolver.sonatypeRepo("releases")) + +libraryDependencies ++= Seq( + "edu.berkeley.cs" %% "chisel3" % "3.1.7", +) + +scalacOptions ++= scalacOptionsVersion(scalaVersion.value) +javacOptions ++= javacOptionsVersion(scalaVersion.value) diff --git a/vta/hardware/chisel/project/build.properties b/vta/hardware/chisel/project/build.properties new file mode 100644 index 000000000000..7e2b74b51a4f --- /dev/null +++ b/vta/hardware/chisel/project/build.properties @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +sbt.version = 1.1.1 diff --git a/vta/hardware/chisel/project/plugins.sbt b/vta/hardware/chisel/project/plugins.sbt new file mode 100644 index 000000000000..79ffb2245d52 --- /dev/null +++ b/vta/hardware/chisel/project/plugins.sbt @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +logLevel := Level.Warn diff --git a/vta/hardware/chisel/src/main/resources/verilog/VTAHostDPI.v b/vta/hardware/chisel/src/main/resources/verilog/VTAHostDPI.v new file mode 100644 index 000000000000..02fcf0d779e1 --- /dev/null +++ b/vta/hardware/chisel/src/main/resources/verilog/VTAHostDPI.v @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +module VTAHostDPI # +( parameter ADDR_BITS = 8, + parameter DATA_BITS = 32 +) +( + input clock, + input reset, + output logic dpi_req_valid, + output logic dpi_req_opcode, + output logic [ADDR_BITS-1:0] dpi_req_addr, + output logic [DATA_BITS-1:0] dpi_req_value, + input dpi_req_deq, + input dpi_resp_valid, + input [DATA_BITS-1:0] dpi_resp_bits +); + + import "DPI-C" function void VTAHostDPI + ( + output byte unsigned exit, + output byte unsigned req_valid, + output byte unsigned req_opcode, + output byte unsigned req_addr, + output int unsigned req_value, + input byte unsigned req_deq, + input byte unsigned resp_valid, + input int unsigned resp_value + ); + + typedef logic dpi1_t; + typedef logic [7:0] dpi8_t; + typedef logic [31:0] dpi32_t; + + dpi1_t __reset; + dpi8_t __exit; + dpi8_t __req_valid; + dpi8_t __req_opcode; + dpi8_t __req_addr; + dpi32_t __req_value; + dpi8_t __req_deq; + dpi8_t __resp_valid; + dpi32_t __resp_bits; + + // reset + always_ff @(posedge clock) begin + __reset <= reset; + end + + // delaying outputs by one-cycle + // since verilator does not support delays + always_ff @(posedge clock) begin + dpi_req_valid <= dpi1_t ' (__req_valid); + dpi_req_opcode <= dpi1_t ' (__req_opcode); + dpi_req_addr <= __req_addr; + dpi_req_value <= __req_value; + end + + assign __req_deq = dpi8_t ' (dpi_req_deq); + assign __resp_valid = dpi8_t ' (dpi_resp_valid); + assign __resp_bits = dpi_resp_bits; + + // evaluate DPI function + always_ff @(posedge clock) begin + if (reset | __reset) begin + __exit = 0; + __req_valid = 0; + __req_opcode = 0; + __req_addr = 0; + __req_value = 0; + end + else begin + VTAHostDPI( + __exit, + __req_valid, + __req_opcode, + __req_addr, + __req_value, + __req_deq, + __resp_valid, + __resp_bits); + end + end + + logic [63:0] cycles; + + always_ff @(posedge clock) begin + if (reset | __reset) begin + cycles <= 'd0; + end + else begin + cycles <= cycles + 1'b1; + end + end + + always_ff @(posedge clock) begin + if (__exit == 'd1) begin + $display("[DONE] at cycle:%016d", cycles); + $finish; + end + end + +endmodule diff --git a/vta/hardware/chisel/src/main/resources/verilog/VTAMemDPI.v b/vta/hardware/chisel/src/main/resources/verilog/VTAMemDPI.v new file mode 100644 index 000000000000..e0ed949bf8cf --- /dev/null +++ b/vta/hardware/chisel/src/main/resources/verilog/VTAMemDPI.v @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +module VTAMemDPI # +( parameter LEN_BITS = 8, + parameter ADDR_BITS = 64, + parameter DATA_BITS = 64 +) +( + input clock, + input reset, + input dpi_req_valid, + input dpi_req_opcode, + input [LEN_BITS-1:0] dpi_req_len, + input [ADDR_BITS-1:0] dpi_req_addr, + input dpi_wr_valid, + input [DATA_BITS-1:0] dpi_wr_bits, + output logic dpi_rd_valid, + output logic [DATA_BITS-1:0] dpi_rd_bits, + input dpi_rd_ready +); + + import "DPI-C" function void VTAMemDPI + ( + input byte unsigned req_valid, + input byte unsigned req_opcode, + input byte unsigned req_len, + input longint unsigned req_addr, + input byte unsigned wr_valid, + input longint unsigned wr_value, + output byte unsigned rd_valid, + output longint unsigned rd_value, + input byte unsigned rd_ready + ); + + typedef logic dpi1_t; + typedef logic [7:0] dpi8_t; + typedef logic [31:0] dpi32_t; + typedef logic [63:0] dpi64_t; + + dpi1_t __reset; + dpi8_t __req_valid; + dpi8_t __req_opcode; + dpi8_t __req_len; + dpi64_t __req_addr; + dpi8_t __wr_valid; + dpi64_t __wr_value; + dpi8_t __rd_valid; + dpi64_t __rd_value; + dpi8_t __rd_ready; + + always_ff @(posedge clock) begin + __reset <= reset; + end + + // delaying outputs by one-cycle + // since verilator does not support delays + always_ff @(posedge clock) begin + dpi_rd_valid <= dpi1_t ' (__rd_valid); + dpi_rd_bits <= __rd_value; + end + + assign __req_valid = dpi8_t ' (dpi_req_valid); + assign __req_opcode = dpi8_t ' (dpi_req_opcode); + assign __req_len = dpi_req_len; + assign __req_addr = dpi_req_addr; + assign __wr_valid = dpi8_t ' (dpi_wr_valid); + assign __wr_value = dpi_wr_bits; + assign __rd_ready = dpi8_t ' (dpi_rd_ready); + + // evaluate DPI function + always_ff @(posedge clock) begin + if (reset | __reset) begin + __rd_valid = 0; + __rd_value = 0; + end + else begin + VTAMemDPI( + __req_valid, + __req_opcode, + __req_len, + __req_addr, + __wr_valid, + __wr_value, + __rd_valid, + __rd_value, + __rd_ready); + end + end +endmodule diff --git a/vta/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala b/vta/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala new file mode 100644 index 000000000000..aab2d630c307 --- /dev/null +++ b/vta/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package vta.dpi + +import chisel3._ +import chisel3.util._ + +/** Host DPI parameters */ +trait VTAHostDPIParams { + val dpiAddrBits = 8 + val dpiDataBits = 32 +} + +/** Host master interface. 
+ *
+ * This interface is typically used by the Host.
+ */
+class VTAHostDPIMaster extends Bundle with VTAHostDPIParams {
+  val req = new Bundle {
+    val valid = Output(Bool())
+    val opcode = Output(Bool())
+    val addr = Output(UInt(dpiAddrBits.W))
+    val value = Output(UInt(dpiDataBits.W))
+    val deq = Input(Bool())
+  }
+  val resp = Flipped(ValidIO(UInt(dpiDataBits.W)))
+}
+
+/** Host client interface.
+ *
+ * This interface is typically used by the Accelerator.
+ */
+class VTAHostDPIClient extends Bundle with VTAHostDPIParams {
+  val req = new Bundle {
+    val valid = Input(Bool())
+    val opcode = Input(Bool())
+    val addr = Input(UInt(dpiAddrBits.W))
+    val value = Input(UInt(dpiDataBits.W))
+    val deq = Output(Bool())
+  }
+  val resp = ValidIO(UInt(dpiDataBits.W))
+}
+
+/** Host DPI module.
+ *
+ * Wrapper for Host Verilog DPI module.
+ */
+class VTAHostDPI extends BlackBox with HasBlackBoxResource {
+  val io = IO(new Bundle {
+    val clock = Input(Clock())
+    val reset = Input(Bool())
+    val dpi = new VTAHostDPIMaster
+  })
+  setResource("/verilog/VTAHostDPI.v")
+}
diff --git a/vta/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala b/vta/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala
new file mode 100644
index 000000000000..090f0459570a
--- /dev/null
+++ b/vta/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package vta.dpi
+
+import chisel3._
+import chisel3.util._
+
+/** Memory DPI parameters */
+trait VTAMemDPIParams {
+  val dpiLenBits = 8
+  val dpiAddrBits = 64
+  val dpiDataBits = 64
+}
+
+/** Memory master interface.
+ *
+ * This interface is typically used by the Accelerator.
+ */
+class VTAMemDPIMaster extends Bundle with VTAMemDPIParams {
+  val req = new Bundle {
+    val valid = Output(Bool())
+    val opcode = Output(Bool())
+    val len = Output(UInt(dpiLenBits.W))
+    val addr = Output(UInt(dpiAddrBits.W))
+  }
+  val wr = ValidIO(UInt(dpiDataBits.W))
+  val rd = Flipped(Decoupled(UInt(dpiDataBits.W)))
+}
+
+/** Memory client interface.
+ *
+ * This interface is typically used by the Host.
+ */
+class VTAMemDPIClient extends Bundle with VTAMemDPIParams {
+  val req = new Bundle {
+    val valid = Input(Bool())
+    val opcode = Input(Bool())
+    val len = Input(UInt(dpiLenBits.W))
+    val addr = Input(UInt(dpiAddrBits.W))
+  }
+  val wr = Flipped(ValidIO(UInt(dpiDataBits.W)))
+  val rd = Decoupled(UInt(dpiDataBits.W))
+}
+
+/** Memory DPI module.
+ *
+ * Wrapper for Memory Verilog DPI module.
+ */
+class VTAMemDPI extends BlackBox with HasBlackBoxResource {
+  val io = IO(new Bundle {
+    val clock = Input(Clock())
+    val reset = Input(Bool())
+    val dpi = new VTAMemDPIClient
+  })
+  setResource("/verilog/VTAMemDPI.v")
+}
diff --git a/vta/hardware/dpi/tsim_device.cc b/vta/hardware/dpi/tsim_device.cc
new file mode 100644
index 000000000000..08954179f1d2
--- /dev/null
+++ b/vta/hardware/dpi/tsim_device.cc
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <vta/dpi/tsim.h>
+
+#if VM_TRACE
+#include <verilated_vcd_c.h>
+#endif
+
+#if VM_TRACE
+#define STRINGIZE(x) #x
+#define STRINGIZE_VALUE_OF(x) STRINGIZE(x)
+#endif
+
+static VTAContextHandle _ctx = nullptr;
+static VTAMemDPIFunc _mem_dpi = nullptr;
+static VTAHostDPIFunc _host_dpi = nullptr;
+
+void VTAHostDPI(dpi8_t* exit,
+                dpi8_t* req_valid,
+                dpi8_t* req_opcode,
+                dpi8_t* req_addr,
+                dpi32_t* req_value,
+                dpi8_t req_deq,
+                dpi8_t resp_valid,
+                dpi32_t resp_value) {
+  assert(_host_dpi != nullptr);
+  (*_host_dpi)(_ctx, exit, req_valid, req_opcode,
+               req_addr, req_value, req_deq,
+               resp_valid, resp_value);
+}
+
+void VTAMemDPI(dpi8_t req_valid,
+               dpi8_t req_opcode,
+               dpi8_t req_len,
+               dpi64_t req_addr,
+               dpi8_t wr_valid,
+               dpi64_t wr_value,
+               dpi8_t* rd_valid,
+               dpi64_t* rd_value,
+               dpi8_t rd_ready) {
+  assert(_mem_dpi != nullptr);
+  (*_mem_dpi)(_ctx, req_valid, req_opcode, req_len,
+              req_addr, wr_valid, wr_value,
+              rd_valid, rd_value, rd_ready);
+}
+
+void VTADPIInit(VTAContextHandle handle,
+                VTAHostDPIFunc host_dpi,
+                VTAMemDPIFunc mem_dpi) {
+  _ctx = handle;
+  _host_dpi = host_dpi;
+  _mem_dpi = mem_dpi;
+}
+
+int VTADPISim(uint64_t max_cycles) {
+  uint64_t trace_count = 0;
+
+#if VM_TRACE
+  uint64_t start = 0;
+#endif
+
+  VL_TSIM_NAME* top = new VL_TSIM_NAME;
+
+#if VM_TRACE
+  Verilated::traceEverOn(true);
+  VerilatedVcdC* tfp = new VerilatedVcdC;
+  top->trace(tfp, 99);
+  tfp->open(STRINGIZE_VALUE_OF(TSIM_TRACE_FILE));
+#endif
+
+  // reset
+  for (int i = 0; i < 10; i++) {
+    top->reset = 1;
+    top->clock = 0;
+    top->eval();
+#if VM_TRACE
+    if (trace_count >= start)
+      tfp->dump(static_cast<vluint64_t>(trace_count * 2));
+#endif
+    top->clock = 1;
+    top->eval();
+#if VM_TRACE
+    if (trace_count >= start)
+      tfp->dump(static_cast<vluint64_t>(trace_count * 2 + 1));
+#endif
+    trace_count++;
+  }
+  top->reset = 0;
+
+  // start simulation
+  while (!Verilated::gotFinish() && trace_count < max_cycles) {
+    top->clock = 0;
+    top->eval();
+#if VM_TRACE
+    if (trace_count >= start)
+      tfp->dump(static_cast<vluint64_t>(trace_count * 2));
+#endif
+    top->clock = 1;
+    top->eval();
+#if VM_TRACE
+    if (trace_count >= start)
+      tfp->dump(static_cast<vluint64_t>(trace_count * 2 + 1));
+#endif
+    trace_count++;
+  }
+
+#if VM_TRACE
+  tfp->close();
+#endif
+
+  delete top;
+
+  return 0;
+}
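The request handshake above is easy to misread from the RTL alone, so here is a minimal Python model of it (illustration only, not part of the patch): a host request stays visible on `req_valid` until the accelerator asserts `req_deq`, which is exactly the peek-then-pop behavior `HostDevice::TryPopRequest(r, pop)` implements in `vta/src/dpi/module.cc` further down.

```python
# Minimal Python model of the host DPI handshake (illustration only).
# Each call to `tick` plays the role of one VTAHostDPI evaluation.
from collections import deque

class HostDeviceModel:
    def __init__(self):
        self.req = deque()    # requests pushed by the host thread
        self.resp = deque()   # responses pushed back by the accelerator

    def push_request(self, opcode, addr, value):
        self.req.append((opcode, addr, value))

    def tick(self, req_deq, resp_valid=False, resp_value=0):
        """One clock cycle: return (req_valid, opcode, addr, value)."""
        if resp_valid:                  # accelerator answered a read
            self.resp.append(resp_value)
        if not self.req:
            return (0, 0, 0, 0)
        opcode, addr, value = self.req[0]
        if req_deq:                     # accelerator consumed the request
            self.req.popleft()
        return (1, opcode, addr, value)

m = HostDeviceModel()
m.push_request(1, 0x04, 42)            # a register write
assert m.tick(req_deq=0)[0] == 1       # request stays valid until dequeued
assert m.tick(req_deq=1)[0] == 1       # consumed this cycle
assert m.tick(req_deq=0)[0] == 0       # queue drained
```

This peek-then-pop discipline is what lets VTAHostDPI.v present a stable request for as many cycles as the accelerator needs.

diff --git a/vta/include/vta/dpi/module.h b/vta/include/vta/dpi/module.h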
new file mode 100644
index 000000000000..d2e4c80129eb
--- /dev/null
+++ b/vta/include/vta/dpi/module.h
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef VTA_DPI_MODULE_H_
+#define VTA_DPI_MODULE_H_
+
+#include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/packed_func.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/registry.h>
+#include <string>
+
+namespace vta {
+namespace dpi {
+
+/*!
+ * \brief DPI driver module for managing the accelerator
+ */
+class DPIModuleNode : public tvm::runtime::ModuleNode {
+ public:
+/*!
+ * \brief Launch accelerator until it finishes or reaches max_cycles
+ * \param max_cycles The maximum number of cycles to wait
+ */
+  virtual void Launch(uint64_t max_cycles) = 0;
+
+/*!
+ * \brief Write an accelerator register
+ * \param addr The register address
+ * \param value The register value
+ */
+  virtual void WriteReg(int addr, uint32_t value) = 0;
+
+/*!
+ * \brief Read an accelerator register
+ * \param addr The register address
+ */
+  virtual uint32_t ReadReg(int addr) = 0;
+
+/*! \brief Kill or exit the accelerator */
+  virtual void Finish() = 0;
+
+  static tvm::runtime::Module Load(std::string dll_name);
+};
+
+}  // namespace dpi
+}  // namespace vta
+#endif  // VTA_DPI_MODULE_H_
+
diff --git a/vta/include/vta/dpi/tsim.h b/vta/include/vta/dpi/tsim.h
new file mode 100644
index 000000000000..6170cde5cdf8
--- /dev/null
+++ b/vta/include/vta/dpi/tsim.h
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef VTA_DPI_TSIM_H_
+#define VTA_DPI_TSIM_H_
+
+#include <tvm/runtime/c_runtime_api.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned char dpi8_t;
+
+typedef unsigned int dpi32_t;
+
+typedef unsigned long long dpi64_t;  // NOLINT(*)
+
+/*! \brief the context handle */
+typedef void* VTAContextHandle;
+
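Before the callback typedefs, a quick sketch of how the module above is reached from Python once the shared library is built. This is hypothetical: the artifact name `libtsim.so` is assumed, and of the four virtuals only `WriteReg` is exported as a packed function by the `GetFunction` implementation later in this patch.

```python
import tvm

# tvm.module.load dispatches on fmt to the "module.loadfile_vta-tsim"
# global registered at the end of vta/src/dpi/module.cc in this patch.
mod = tvm.module.load("libtsim.so", fmt="vta-tsim")  # assumed artifact name

# Only "WriteReg" is returned by GetFunction in the code shown; the other
# DPIModuleNode methods (Launch, ReadReg, Finish) are C++ virtuals here.
write_reg = mod["WriteReg"]
write_reg(0x00, 1)  # e.g., poke a control register of the simulated design
```

+/*!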
+ * \brief Host DPI callback function that is invoked in VTAHostDPI.v every clock cycle
+ * \param exit Host kill simulation
+ * \param req_valid Host has a valid request to read or write a register in Accel
+ * \param req_opcode Host request type, opcode=0 for read and opcode=1 for write
+ * \param req_addr Host request register address
+ * \param req_value Host request value to be written to a register
+ * \param req_deq Accel is ready to dequeue Host request
+ * \param resp_valid Accel has a valid response for Host
+ * \param resp_value Accel response value for Host
+ */
+typedef void (*VTAHostDPIFunc)(
+    VTAContextHandle self,
+    dpi8_t* exit,
+    dpi8_t* req_valid,
+    dpi8_t* req_opcode,
+    dpi8_t* req_addr,
+    dpi32_t* req_value,
+    dpi8_t req_deq,
+    dpi8_t resp_valid,
+    dpi32_t resp_value);
+
+/*!
+ * \brief Memory DPI callback function that is invoked in VTAMemDPI.v every clock cycle
+ * \param req_valid Accel has a valid request for Host
+ * \param req_opcode Accel request type, opcode=0 (read) and opcode=1 (write)
+ * \param req_len Accel request length in 8-byte words, zero-based (a value of n means n+1 words)
+ * \param req_addr Accel request base address
+ * \param wr_valid Accel has a valid value for Host
+ * \param wr_value Accel value to be written to Host
+ * \param rd_valid Host has a valid value for Accel
+ * \param rd_value Host value to be read by Accel
+ * \param rd_ready Accel is ready to consume the Host value
+ */
+typedef void (*VTAMemDPIFunc)(
+    VTAContextHandle self,
+    dpi8_t req_valid,
+    dpi8_t req_opcode,
+    dpi8_t req_len,
+    dpi64_t req_addr,
+    dpi8_t wr_valid,
+    dpi64_t wr_value,
+    dpi8_t* rd_valid,
+    dpi64_t* rd_value,
+    dpi8_t rd_ready);
+
+/*! \brief The type of VTADPIInit function pointer */
+typedef void (*VTADPIInitFunc)(VTAContextHandle handle,
+                               VTAHostDPIFunc host_dpi,
+                               VTAMemDPIFunc mem_dpi);
+
+
+/*! \brief The type of VTADPISim function pointer */
+typedef int (*VTADPISimFunc)(uint64_t max_cycles);
+
+/*!
+ * \brief Set Host and Memory DPI functions
+ * \param handle DPI Context handle
+ * \param host_dpi Host DPI function
+ * \param mem_dpi Memory DPI function
+ */
+TVM_DLL void VTADPIInit(VTAContextHandle handle,
+                        VTAHostDPIFunc host_dpi,
+                        VTAMemDPIFunc mem_dpi);
+
+/*!
+ * \brief Instantiate VTA design and generate clock/reset
+ * \param max_cycles The maximum number of simulation cycles
+ */
+TVM_DLL int VTADPISim(uint64_t max_cycles);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // VTA_DPI_TSIM_H_
diff --git a/vta/src/dpi/module.cc b/vta/src/dpi/module.cc
new file mode 100644
index 000000000000..4839b026d51e
--- /dev/null
+++ b/vta/src/dpi/module.cc
@@ -0,0 +1,376 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <vta/dpi/module.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/registry.h>
+
+#include <condition_variable>
+#include <mutex>
+#if defined(_WIN32)
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#endif
+
+#include <queue>
+#include <thread>
+#include <string>
+#include <memory>
+
+namespace vta {
+namespace dpi {
+
+using namespace tvm::runtime;
+
+typedef void* DeviceHandle;
+
+struct HostRequest {
+  uint8_t opcode;
+  uint8_t addr;
+  uint32_t value;
+};
+
+struct HostResponse {
+  uint32_t value;
+};
+
+struct MemResponse {
+  uint8_t valid;
+  uint64_t value;
+};
+
+template <typename T>
+class ThreadSafeQueue {
+ public:
+  void Push(const T item) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    queue_.push(std::move(item));
+    cond_.notify_one();
+  }
+
+  void WaitPop(T* item) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    cond_.wait(lock, [this]{return !queue_.empty();});
+    *item = std::move(queue_.front());
+    queue_.pop();
+  }
+
+  bool TryPop(T* item, bool pop) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (queue_.empty()) return false;
+    *item = std::move(queue_.front());
+    if (pop) queue_.pop();
+    return true;
+  }
+
+ private:
+  mutable std::mutex mutex_;
+  std::queue<T> queue_;
+  std::condition_variable cond_;
+};
+
+class HostDevice {
+ public:
+  void PushRequest(uint8_t opcode, uint8_t addr, uint32_t value);
+  bool TryPopRequest(HostRequest* r, bool pop);
+  void PushResponse(uint32_t value);
+  void WaitPopResponse(HostResponse* r);
+  void Exit();
+  uint8_t GetExitStatus();
+
+ private:
+  uint8_t exit_{0};
+  mutable std::mutex mutex_;
+  ThreadSafeQueue<HostRequest> req_;
+  ThreadSafeQueue<HostResponse> resp_;
+};
+
+class MemDevice {
+ public:
+  void SetRequest(uint8_t opcode, uint64_t addr, uint32_t len);
+  MemResponse ReadData(uint8_t ready);
+  void WriteData(uint64_t value);
+
+ private:
+  uint64_t* raddr_{0};
+  uint64_t* waddr_{0};
+  uint32_t rlen_{0};
+  uint32_t wlen_{0};
+  std::mutex mutex_;
+};
+
+void HostDevice::PushRequest(uint8_t opcode, uint8_t addr, uint32_t value) {
+  HostRequest r;
+  r.opcode = opcode;
+  r.addr = addr;
+  r.value = value;
+  req_.Push(r);
+}
+
+bool HostDevice::TryPopRequest(HostRequest* r, bool pop) {
+  r->opcode = 0xad;
+  r->addr = 0xad;
+  r->value = 0xbad;
+  return req_.TryPop(r, pop);
+}
+
+void HostDevice::PushResponse(uint32_t value) {
+  HostResponse r;
+  r.value = value;
+  resp_.Push(r);
+}
+
+void HostDevice::WaitPopResponse(HostResponse* r) {
+  resp_.WaitPop(r);
+}
+
+void HostDevice::Exit() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  exit_ = 1;
+}
+
+uint8_t HostDevice::GetExitStatus() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  return exit_;
+}
+
+void MemDevice::SetRequest(uint8_t opcode, uint64_t addr, uint32_t len) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  if (opcode == 1) {
+    wlen_ = len + 1;
+    waddr_ = reinterpret_cast<uint64_t*>(addr);
+  } else {
+    rlen_ = len + 1;
+    raddr_ = reinterpret_cast<uint64_t*>(addr);
+  }
+}
+
+MemResponse MemDevice::ReadData(uint8_t ready) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  MemResponse r;
+  r.valid = rlen_ > 0;
+  r.value = rlen_ > 0 ? *raddr_ : 0xdeadbeefdeadbeef;
+  if (ready == 1 && rlen_ > 0) {
+    raddr_++;
+    rlen_ -= 1;
+  }
+  return r;
+}
+
+void MemDevice::WriteData(uint64_t value) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  if (wlen_ > 0) {
+    *waddr_ = value;
+    waddr_++;
+    wlen_ -= 1;
+  }
+}
+
+class DPIModule final : public DPIModuleNode {
+ public:
+  ~DPIModule() {
+    if (lib_handle_) Unload();
+  }
+
+  const char* type_key() const final {
+    return "vta-tsim";
+  }
+
+  PackedFunc GetFunction(
+      const std::string& name,
+      const std::shared_ptr<ModuleNode>& sptr_to_self) final {
+    if (name == "WriteReg") {
+      return TypedPackedFunc<void(int, int)>(
+          [this](int addr, int value){
+            this->WriteReg(addr, value);
+          });
+    } else {
+      LOG(FATAL) << "Member " << name << " does not exist";
+      return nullptr;
+    }
+  }
+
+  void Init(const std::string& name) {
+    Load(name);
+    VTADPIInitFunc finit = reinterpret_cast<VTADPIInitFunc>(
+        GetSymbol("VTADPIInit"));
+    CHECK(finit != nullptr);
+    finit(this, VTAHostDPI, VTAMemDPI);
+    fvsim_ = reinterpret_cast<VTADPISimFunc>(GetSymbol("VTADPISim"));
+    CHECK(fvsim_ != nullptr);
+  }
+
+  void Launch(uint64_t max_cycles) {
+    auto frun = [this, max_cycles]() {
+      (*fvsim_)(max_cycles);
+    };
+    vsim_thread_ = std::thread(frun);
+  }
+
+  void WriteReg(int addr, uint32_t value) {
+    host_device_.PushRequest(1, addr, value);
+  }
+
+  uint32_t ReadReg(int addr) {
+    uint32_t value;
+    HostResponse* r = new HostResponse;
+    host_device_.PushRequest(0, addr, 0);
+    host_device_.WaitPopResponse(r);
+    value = r->value;
+    delete r;
+    return value;
+  }
+
+  void Finish() {
+    host_device_.Exit();
+    vsim_thread_.join();
+  }
+
+ protected:
+  VTADPISimFunc fvsim_;
+  HostDevice host_device_;
+  MemDevice mem_device_;
+  std::thread vsim_thread_;
+
+  void HostDPI(dpi8_t* exit,
+               dpi8_t* req_valid,
+               dpi8_t* req_opcode,
+               dpi8_t* req_addr,
+               dpi32_t* req_value,
+               dpi8_t req_deq,
+               dpi8_t resp_valid,
+               dpi32_t resp_value) {
+    HostRequest* r = new HostRequest;
+    *exit = host_device_.GetExitStatus();
+    *req_valid = host_device_.TryPopRequest(r, req_deq);
+    *req_opcode = r->opcode;
+    *req_addr = r->addr;
+    *req_value = r->value;
+    if (resp_valid) {
+      host_device_.PushResponse(resp_value);
+    }
+    delete r;
+  }
+
+  void MemDPI(
+      dpi8_t req_valid,
+      dpi8_t req_opcode,
+      dpi8_t req_len,
+      dpi64_t req_addr,
+      dpi8_t wr_valid,
+      dpi64_t wr_value,
+      dpi8_t* rd_valid,
+      dpi64_t* rd_value,
+      dpi8_t rd_ready) {
+    MemResponse r = mem_device_.ReadData(rd_ready);
+    *rd_valid = r.valid;
+    *rd_value = r.value;
+    if (wr_valid) {
+      mem_device_.WriteData(wr_value);
+    }
+    if (req_valid) {
+      mem_device_.SetRequest(req_opcode, req_addr, req_len);
+    }
+  }
+
+  static void VTAHostDPI(
+      VTAContextHandle self,
+      dpi8_t* exit,
+      dpi8_t* req_valid,
+      dpi8_t* req_opcode,
+      dpi8_t* req_addr,
+      dpi32_t* req_value,
+      dpi8_t req_deq,
+      dpi8_t resp_valid,
+      dpi32_t resp_value) {
+    static_cast<DPIModule*>(self)->HostDPI(
+        exit, req_valid, req_opcode, req_addr,
+        req_value, req_deq, resp_valid, resp_value);
+  }
+
+  static void VTAMemDPI(
+      VTAContextHandle self,
+      dpi8_t req_valid,
+      dpi8_t req_opcode,
+      dpi8_t req_len,
+      dpi64_t req_addr,
+      dpi8_t wr_valid,
+      dpi64_t wr_value,
+      dpi8_t* rd_valid,
+      dpi64_t* rd_value,
+      dpi8_t rd_ready) {
+    static_cast<DPIModule*>(self)->MemDPI(
+        req_valid, req_opcode, req_len,
+        req_addr, wr_valid, wr_value,
+        rd_valid, rd_value, rd_ready);
+  }
+
+ private:
+  // Platform dependent handling.
+#if defined(_WIN32)
+  // library handle
+  HMODULE lib_handle_{nullptr};
+  // Load the library
+  void Load(const std::string& name) {
+    // use the wstring version for proper handling of unicode paths
+    std::wstring wname(name.begin(), name.end());
+    lib_handle_ = LoadLibraryW(wname.c_str());
+    CHECK(lib_handle_ != nullptr)
+        << "Failed to load dynamic shared library " << name;
+  }
+  void* GetSymbol(const char* name) {
+    return reinterpret_cast<void*>(
+        GetProcAddress(lib_handle_, (LPCSTR)name));  // NOLINT(*)
+  }
+  void Unload() {
+    FreeLibrary(lib_handle_);
+  }
+#else
+  // Library handle
+  void* lib_handle_{nullptr};
+  // load the library
+  void Load(const std::string& name) {
+    lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
+    CHECK(lib_handle_ != nullptr)
+        << "Failed to load dynamic shared library " << name
+        << " " << dlerror();
+  }
+  void* GetSymbol(const char* name) {
+    return dlsym(lib_handle_, name);
+  }
+  void Unload() {
+    dlclose(lib_handle_);
+  }
+#endif
+};
+
+Module DPIModuleNode::Load(std::string dll_name) {
+  std::shared_ptr<DPIModule> n = std::make_shared<DPIModule>();
+  n->Init(dll_name);
+  return Module(n);
+}
+
+TVM_REGISTER_GLOBAL("module.loadfile_vta-tsim")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = DPIModuleNode::Load(args[0]);
+  });
+}  // namespace dpi
+}  // namespace vta

From c1648d00abe9be93c698df436d56e1fe59939b0b Mon Sep 17 00:00:00 2001
From: Yao Wang
Date: Wed, 8 May 2019 17:21:41 -0700
Subject: [PATCH 090/106] [Relay][Op] Adaptive pooling (#3085)

* Add topi adaptive_pool

* Use adaptive_pool to compute global_pool

* Add relay adaptive pool2d

* Fix lint

* Fix typo

* Minor change

* Change support level to 10

* Add contrib

* Remove global pool schedule

* Add contrib module

* Fix lint

* Update doc

* Update doc
---
 docs/api/python/topi.rst                    |   1 +
 docs/langref/relay_op.rst                   |   4 +
 include/tvm/relay/attrs/nn.h                |  16 ++
 nnvm/python/nnvm/top/nn.py                  |   4 +-
 python/tvm/relay/__init__.py                |   1 +
 python/tvm/relay/contrib.py                 |  20 +++
 python/tvm/relay/frontend/mxnet.py          |   5 +-
 python/tvm/relay/op/__init__.py             |   1 +
 python/tvm/relay/op/contrib/__init__.py     |  21 +++
 python/tvm/relay/op/contrib/_contrib.py     |  43 +++++
 python/tvm/relay/op/contrib/_make.py        |  20 +++
 python/tvm/relay/op/contrib/contrib.py      | 113 +++++++++++++
 python/tvm/relay/op/nn/_nn.py               |   5 +-
 src/relay/op/nn/pooling.cc                  | 167 +++++++++++++++++++-
 tests/python/frontend/mxnet/test_forward.py |   9 ++
 tests/python/relay/test_op_level10.py       |  43 +++++
 tests/python/relay/test_op_level2.py        |   1 -
 topi/include/topi/nn/pooling.h              | 158 +++++++++++++-----
 topi/python/topi/cuda/__init__.py           |   2 +-
 topi/python/topi/cuda/pooling.py            |  15 +-
 topi/python/topi/generic/nn.py              |   9 +-
 topi/python/topi/hls/nn.py                  |  10 +-
 topi/python/topi/nn/pooling.py              |  40 +++++
 topi/python/topi/opengl/__init__.py         |   2 +-
 topi/python/topi/opengl/pooling.py          |  10 +-
 topi/python/topi/x86/__init__.py            |   2 +-
 topi/python/topi/x86/pooling.py             |  10 +-
 topi/src/topi.cc                            |   7 +
 topi/tests/python/test_topi_pooling.py      |  53 ++++++-
 29 files changed, 709 insertions(+), 83 deletions(-)
 create mode 100644 python/tvm/relay/contrib.py
 create mode 100644 python/tvm/relay/op/contrib/__init__.py
 create mode 100644 python/tvm/relay/op/contrib/_contrib.py
 create mode 100644 python/tvm/relay/op/contrib/_make.py
 create mode 100644 python/tvm/relay/op/contrib/contrib.py

diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst
index 222b3347d08e..eaa5dacd678e 100644
--- a/docs/api/python/topi.rst
+++ b/docs/api/python/topi.rst
@@ -57,6 +57,7 @@ List of operators
    topi.nn.dilate
    topi.nn.pool
    topi.nn.global_pool
+   topi.nn.adaptive_pool
    topi.nn.upsampling
    topi.nn.softmax
    topi.nn.dense
diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 4719aba6d3f9..cd5677293571 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -189,6 +189,8 @@ This level support backpropagation of broadcast operators. It is temporary.
    tvm.relay.annotation.on_device
    tvm.relay.reverse_reshape
    tvm.relay.nn.batch_matmul
+   tvm.relay.contrib.adaptive_max_pool2d
+   tvm.relay.contrib.adaptive_avg_pool2d
 
 
 Level 1 Definitions
@@ -318,3 +320,5 @@ Level 10 Definitions
 .. autofunction:: tvm.relay.annotation.on_device
 .. autofunction:: tvm.relay.reverse_reshape
 .. autofunction:: tvm.relay.nn.batch_matmul
+.. autofunction:: tvm.relay.contrib.adaptive_max_pool2d
+.. autofunction:: tvm.relay.contrib.adaptive_avg_pool2d
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index 2049a8f869f8..8a1aca0a4b4a 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -332,6 +332,22 @@ struct GlobalPool2DAttrs : public tvm::AttrsNode<GlobalPool2DAttrs> {
   }
 };
 
+/*! \brief Attributes for adaptive pool operator */
+struct AdaptivePool2DAttrs : public tvm::AttrsNode<AdaptivePool2DAttrs> {
+  Array<IndexExpr> output_size;
+  std::string layout;
+
+  TVM_DECLARE_ATTRS(AdaptivePool2DAttrs, "relay.attrs.AdaptivePool2DAttrs") {
+    TVM_ATTR_FIELD(output_size).set_default(Array<IndexExpr>({}))
+      .describe("Output height and width.");
+    TVM_ATTR_FIELD(layout).set_default("NCHW")
+      .describe("Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc. "
+                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width "
+                "dimensions respectively. Pooling is applied on the 'H' and "
+                "'W' dimensions.");
+  }
+};
+
 /*! \brief Attributes for dense operator */
 struct DenseAttrs : public tvm::AttrsNode<DenseAttrs> {
diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py
index d0d714dcf506..656e3662b492 100644
--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
@@ -432,7 +432,7 @@ def schedule_avg_pool2d(attrs, outs, target):
 def schedule_global_max_pool2d(_, outs, target):
     """Schedule definition of global_max_pool2d"""
     with tvm.target.create(target):
-        return topi.generic.schedule_global_pool(outs)
+        return topi.generic.schedule_adaptive_pool(outs)
 
 reg.register_pattern("global_max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
 
@@ -442,7 +442,7 @@ def schedule_global_max_pool2d(_, outs, target):
 def schedule_global_avg_pool2d(_, outs, target):
     """Schedule definition of global_avg_pool2d"""
     with tvm.target.create(target):
-        return topi.generic.schedule_global_pool(outs)
+        return topi.generic.schedule_adaptive_pool(outs)
 
 reg.register_pattern("global_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
 
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index 80555d3dfbf6..6201681e0294 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -40,6 +40,7 @@
 from . import nn
 from . import annotation
 from . import vision
+from . import contrib
 from . import image
 from . import frontend
 from . import backend
diff --git a/python/tvm/relay/contrib.py b/python/tvm/relay/contrib.py
new file mode 100644
index 000000000000..d22c67614999
--- /dev/null
+++ b/python/tvm/relay/contrib.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=wildcard-import, unused-import, unused-wildcard-import +"""Contrib operators.""" +# Re-export in a specific file name so that autodoc can pick it up +from .op.contrib import * diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index b93bd5b244eb..1a4d52f5b679 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -190,10 +190,7 @@ def _pool2d(new_op, is_avg): def _mx_adaptive_avg_pooling(inputs, attrs): output_size = attrs.get_int_tuple("output_size", []) - if output_size != (1,): - raise tvm.error.OpAttributeUnimplemented( - "AdaptiveAvgPooling with output_size other than 1 is not supported yet.") - return _op.nn.global_avg_pool2d(inputs[0]) + return _op.contrib.adaptive_avg_pool2d(inputs[0], output_size) def _mx_dropout(inputs, attrs): diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py index 3bea795a2c38..a27ab1dc50ff 100644 --- a/python/tvm/relay/op/__init__.py +++ b/python/tvm/relay/op/__init__.py @@ -29,6 +29,7 @@ from . import annotation from . import image from . import vision +from . import contrib from . import op_attrs diff --git a/python/tvm/relay/op/contrib/__init__.py b/python/tvm/relay/op/contrib/__init__.py new file mode 100644 index 000000000000..3159006486b3 --- /dev/null +++ b/python/tvm/relay/op/contrib/__init__.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=wildcard-import +"""Neural network related operators.""" +from __future__ import absolute_import as _abs +from .contrib import * +from . import _contrib diff --git a/python/tvm/relay/op/contrib/_contrib.py b/python/tvm/relay/op/contrib/_contrib.py new file mode 100644 index 000000000000..f0df75648467 --- /dev/null +++ b/python/tvm/relay/op/contrib/_contrib.py @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""Backend compiler related feature registration""" +from __future__ import absolute_import + +import topi +from .. import op as reg +from ..op import OpPattern + + +# adaptive_max_pool2d +@reg.register_schedule("contrib.adaptive_max_pool2d") +def schedule_adaptive_max_pool2d(_, outs, target): + """Schedule definition of adaptive_max_pool2d""" + with target: + return topi.generic.schedule_adaptive_pool(outs) + +reg.register_pattern("contrib.adaptive_max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) + + +# adaptive_avg_pool2d +@reg.register_schedule("contrib.adaptive_avg_pool2d") +def schedule_adaptive_avg_pool2d(_, outs, target): + """Schedule definition of adaptive_avg_pool2d""" + with target: + return topi.generic.schedule_adaptive_pool(outs) + +reg.register_pattern("contrib.adaptive_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) diff --git a/python/tvm/relay/op/contrib/_make.py b/python/tvm/relay/op/contrib/_make.py new file mode 100644 index 000000000000..42d71755abb8 --- /dev/null +++ b/python/tvm/relay/op/contrib/_make.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Constructor APIs""" +from ...._ffi.function import _init_api + +_init_api("relay.op.contrib._make", __name__) diff --git a/python/tvm/relay/op/contrib/contrib.py b/python/tvm/relay/op/contrib/contrib.py new file mode 100644 index 000000000000..1f073d4aae45 --- /dev/null +++ b/python/tvm/relay/op/contrib/contrib.py @@ -0,0 +1,113 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+#pylint: disable=invalid-name, too-many-lines
+"""Contrib operations."""
+from __future__ import absolute_import as _abs
+from . import _make
+
+
+def adaptive_max_pool2d(data,
+                        output_size=None,
+                        layout="NCHW"):
+    r"""2D adaptive max pooling operator. This operator is experimental.
+
+    This operator takes data as input and does 2D max value calculation
+    across each window represented by WxH.
+
+    In the default case, where the data_layout is `NCHW`, the operator takes
+    a data Tensor with shape `(batch_size, in_channels, height, width)` and
+    produces an output Tensor with shape
+    `(batch_size, in_channels, output_height, output_width)`.
+
+    The pooling kernel and stride sizes are automatically chosen for
+    the desired output sizes.
+
+    For output_size:
+        If this argument is not provided, input height and width will be used
+        as output height and width.
+
+        If a single integer is provided for output_size, the output size is
+        (N x C x output_size x output_size) for any input (NCHW).
+
+        If a tuple of integers (height, width) is provided for output_size,
+        the output size is (N x C x height x width) for any input (NCHW).
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    output_size : tuple of int, optional
+        Output height and width.
+
+    layout : str, optional
+        Layout of the input.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    output_size = output_size or []
+    return _make.adaptive_max_pool2d(data, output_size, layout)
+
+
+def adaptive_avg_pool2d(data,
+                        output_size=None,
+                        layout="NCHW"):
+    r"""2D adaptive average pooling operator. This operator is experimental.
+
+    This operator takes data as input and does 2D average value calculation
+    across each window represented by WxH.
+
+    In the default case, where the data_layout is `NCHW`, the operator takes
+    a data Tensor with shape `(batch_size, in_channels, height, width)` and
+    produces an output Tensor with shape
+    `(batch_size, in_channels, output_height, output_width)`.
+
+    The pooling kernel and stride sizes are automatically chosen for
+    the desired output sizes.
+
+    For output_size:
+        If this argument is not provided, input height and width will be used
+        as output height and width.
+
+        If a single integer is provided for output_size, the output size is
+        (N x C x output_size x output_size) for any input (NCHW).
+
+        If a tuple of integers (height, width) is provided for output_size,
+        the output size is (N x C x height x width) for any input (NCHW).
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    output_size : tuple of int, optional
+        Output height and width.
+
+    layout : str, optional
+        Layout of the input.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+ """ + output_size = [] or output_size + return _make.adaptive_avg_pool2d(data, output_size, layout) diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 5e9d5d74498d..6c8f8f88795c 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -247,7 +247,7 @@ def schedule_avg_pool2d(attrs, outs, target): def schedule_global_max_pool2d(_, outs, target): """Schedule definition of global_max_pool2d""" with target: - return topi.generic.schedule_global_pool(outs) + return topi.generic.schedule_adaptive_pool(outs) reg.register_pattern("nn.global_max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) @@ -258,11 +258,12 @@ def schedule_global_max_pool2d(_, outs, target): def schedule_global_avg_pool2d(_, outs, target): """Schedule definition of global_avg_pool2d""" with target: - return topi.generic.schedule_global_pool(outs) + return topi.generic.schedule_adaptive_pool(outs) reg.register_pattern("nn.global_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) + # leaky_relu reg.register_schedule("nn.leaky_relu", schedule_broadcast) reg.register_pattern("nn.leaky_relu", OpPattern.ELEMWISE) diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc index df238b38c9cd..4dd763b45654 100644 --- a/src/relay/op/nn/pooling.cc +++ b/src/relay/op/nn/pooling.cc @@ -72,7 +72,6 @@ bool Pool2DRel(const Array& types, CHECK(data != nullptr); const auto dshape = data->shape; - CHECK_NE(dshape.size(), 0); CHECK_GE(dshape.size(), 2U) << "Pool2D only support input >= 2-D: input must have height and width"; const auto param = attrs.as(); @@ -284,7 +283,6 @@ bool GlobalPool2DRel(const Array& types, const auto* data = types[0].as(); if (data == nullptr) { return false; } const auto dshape = data->shape; - CHECK_NE(dshape.size(), 0); CHECK_GE(dshape.size(), 2U) << "Pool2D only support input >= 2-D: input must have height and width"; const auto param = attrs.as(); @@ -393,5 +391,170 @@ RELAY_REGISTER_OP("nn.global_max_pool2d") Pool2DInferCorrectLayout) .set_attr("FTVMCompute", GlobalPool2DCompute); + +// relay.nn.adaptive_pool_2d +TVM_REGISTER_NODE_TYPE(AdaptivePool2DAttrs); + +bool AdaptivePool2DRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + if (data == nullptr) { return false; } + const auto dshape = data->shape; + CHECK_GE(dshape.size(), 2U) + << "Pool2D only support input >= 2-D: input must have height and width"; + const auto* param = attrs.as(); + CHECK(param != nullptr); + + Layout layout(param->layout); + CHECK(layout.Contains(LayoutAxis::Get('H')) && layout.Contains(LayoutAxis::Get('W')) && + !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) + << "Invalid layout " << layout + << ". 
Pool2D layout must have H and W, which cannot be split"; + + const auto hidx = layout.IndexOf(LayoutAxis::Get('H')); + const auto widx = layout.IndexOf(LayoutAxis::Get('W')); + Array oshape(dshape); + auto output_size = param->output_size; + CHECK_LE(output_size.size(), 2U) + << "output_size can have up to 2 elements."; + IndexExpr output_height, output_width; + if (output_size.empty()) { + output_height = dshape[hidx]; + output_width = dshape[widx]; + } else if (output_size.size() == 1) { + output_height = output_size[0]; + output_width = output_size[0]; + } else { + output_height = output_size[0]; + output_width = output_size[1]; + } + + oshape.Set(hidx, output_height); + oshape.Set(widx, output_width); + + // assign output type + reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype)); + return true; +} + +template +Array AdaptivePool2DCompute(const Attrs& attrs, + const Array& inputs, + const Type& out_type, + const Target& target) { + static const Layout kNCHW("NCHW"); + const auto* param = attrs.as(); + CHECK(param != nullptr); + Layout layout(param->layout); + CHECK(BijectiveLayoutNode::make(layout, kNCHW).defined()) + << "Adaptive pool2d currently only supports layouts that are convertible from NCHW"; + CHECK_EQ(layout.IndexOf(LayoutAxis::Get('h')), -1) + << "Adaptive pool2d does not support input split on height"; + CHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) + << "Adaptive pool2d does not support input split on width"; + + CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U) + << "Pool2D only support 4-D input (e.g., NCHW)" + << " or 5-D input (last dimension is a split of channel)"; + + auto output_size = param->output_size; + const auto hidx = layout.IndexOf(LayoutAxis::Get('H')); + const auto widx = layout.IndexOf(LayoutAxis::Get('W')); + IndexExpr output_height, output_width; + if (output_size.empty()) { + output_height = inputs[0]->shape[hidx]; + output_width = inputs[0]->shape[widx]; + } else if (output_size.size() == 1) { + output_height = output_size[0]; + output_width = output_size[0]; + } else { + output_height = output_size[0]; + output_width = output_size[1]; + } + return Array{ + topi::nn::adaptive_pool(inputs[0], Array{ output_height, output_width }, + mode, layout.name()) }; +} + +// relay.contrib.adaptive_avg_pool2d +Expr MakeAdaptiveAvgPool2D(Expr data, + Array output_size, + std::string layout) { + auto attrs = make_node(); + attrs->output_size = std::move(output_size); + attrs->layout = std::move(layout); + static const Op& op = Op::Get("contrib.adaptive_avg_pool2d"); + return CallNode::make(op, {data}, Attrs(attrs), {}); +} + +TVM_REGISTER_API("relay.op.contrib._make.adaptive_avg_pool2d") +.set_body_typed(MakeAdaptiveAvgPool2D); + +RELAY_REGISTER_OP("contrib.adaptive_avg_pool2d") + .describe(R"code(Adaptive average pooling operation for 2D data. + +- **data**: This depends on the `layout` parameter. Input is 4D array of shape + (batch_size, channels, height, width) if `layout` is `NCHW`. +- **output_size**: If this argument is not provided, input height and width will be used + as output height and width. + If a single integer is provided for output_size, the output size is + (N x C x output_size x output_size) for any input (NCHW). + If a tuple of integers (height, width) are provided for output_size, + the output size is (N x C x height x width) for any input (NCHW). +- **out**: This depends on the `layout` parameter. Output is 4D array of shape + (batch_size, channels, output_height, output_width) if `layout` is `NCHW`. 
+ +)code" TVM_ADD_FILELINE) +.set_attrs_type_key("relay.attrs.AdaptivePool2DAttrs") +.set_num_inputs(1) +.add_argument("data", "Tensor", "The input tensor.") +.set_support_level(10) +.add_type_rel("AdaptiveAvgPool2D", AdaptivePool2DRel) +.set_attr("FInferCorrectLayout", + Pool2DInferCorrectLayout) +.set_attr("FTVMCompute", AdaptivePool2DCompute); + + +// relay.contrib.adaptive_max_pool2d +Expr MakeAdaptiveMaxPool2D(Expr data, + Array output_size, + std::string layout) { + auto attrs = make_node(); + attrs->output_size = std::move(output_size); + attrs->layout = std::move(layout); + static const Op& op = Op::Get("contrib.adaptive_max_pool2d"); + return CallNode::make(op, {data}, Attrs(attrs), {}); +} + +TVM_REGISTER_API("relay.op.contrib._make.adaptive_max_pool2d") +.set_body_typed(MakeAdaptiveMaxPool2D); + +RELAY_REGISTER_OP("contrib.adaptive_max_pool2d") + .describe(R"code(Adaptive max pooling operation for 2D data. + +- **data**: This depends on the `layout` parameter. Input is 4D array of shape + (batch_size, channels, height, width) if `layout` is `NCHW`. +- **output_size**: If this argument is not provided, input height and width will be used + as output height and width. + If a single integer is provided for output_size, the output size is + (N x C x output_size x output_size) for any input (NCHW). + If a tuple of integers (height, width) are provided for output_size, + the output size is (N x C x height x width) for any input (NCHW). +- **out**: This depends on the `layout` parameter. Output is 4D array of shape + (batch_size, channels, output_height, output_width) if `layout` is `NCHW`. + +)code" TVM_ADD_FILELINE) +.set_attrs_type_key("relay.attrs.AdaptivePool2DAttrs") +.set_num_inputs(1) +.add_argument("data", "Tensor", "The input tensor.") +.set_support_level(10) +.add_type_rel("AdaptiveMaxPool2D", AdaptivePool2DRel) +.set_attr("FInferCorrectLayout", + Pool2DInferCorrectLayout) +.set_attr("FTVMCompute", AdaptivePool2DCompute); + } // namespace relay } // namespace tvm diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 067c356830bb..e75e60da5ce4 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -170,6 +170,14 @@ def test_forward_pooling(): mx_sym = mx.sym.Pooling(data, kernel=(3, 3), pad=(1, 1), pool_type='max') verify_mxnet_frontend_impl(mx_sym, (1, 20, 8, 8), (1, 20, 8, 8)) +def test_forward_adaptive_pooling(): + data = mx.sym.var('data') + mx_sym = mx.sym.contrib.AdaptiveAvgPooling2D(data, output_size=(1,)) + verify_mxnet_frontend_impl(mx_sym, (1, 20, 8, 8), (1, 20, 1, 1)) + + mx_sym = mx.sym.contrib.AdaptiveAvgPooling2D(data, output_size=(3, 3)) + verify_mxnet_frontend_impl(mx_sym, (1, 20, 8, 8), (1, 20, 3, 3)) + def test_forward_lrn(): data = mx.sym.var('data') mx_sym = mx.sym.LRN(data, alpha=2, beta=2, knorm=1, nsize=5) @@ -590,6 +598,7 @@ def verify(mode, input_size, seq_len, hidden_size, num_layers, batch=1): test_forward_split_squeeze() test_forward_expand_dims() test_forward_pooling() + test_forward_adaptive_pooling() test_forward_lrn() test_forward_ones() test_forward_zeros() diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index 667dd87a6688..244744c3912c 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -208,7 +208,50 @@ def test_shape_of(): tvm.testing.assert_allclose(op_res.asnumpy(), np.array(shape).astype('int32')) +def verify_adaptive_pool2d(dshape, out_size, 
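To make the output_size rules above concrete, here is a small usage sketch of the two new ops (shapes and variable names are illustrative only; the calls exercise the ops exactly as registered in this patch):

```python
import tvm
from tvm import relay

x = relay.var("x", relay.TensorType((1, 16, 32, 32), "float32"))
y = relay.contrib.adaptive_avg_pool2d(x, output_size=(7, 7))  # -> (1, 16, 7, 7)
z = relay.contrib.adaptive_max_pool2d(x)                      # no output_size:
                                                              # keeps 32x32
func = relay.Function([x], relay.Tuple([y, z]))
print(relay.ir_pass.infer_type(func))
```

With output_size omitted, the type relation keeps the input height and width; with (1, 1) the ops reduce to the existing global pooling.

+def verify_adaptive_pool2d(dshape, out_size,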
pool_type, layout="NCHW", dtype="float32"): + def start_index(index, odim, idim): + return int(np.floor(index * idim / odim)) + + def end_index(index, odim, idim): + return int(np.ceil((index + 1) * idim / odim)) + + np_data = np.random.uniform(low=0, high=255, size=dshape).astype(dtype) + n, c, h, w = dshape + oh, ow = out_size + oshape = (n, c) + out_size + np_out = np.zeros(oshape).astype(dtype) + np_op = np.mean if pool_type == "avg" else np.max + for i in range(n): + for j in range(c): + for k in range(oh): + k_start = start_index(k, oh, h) + k_end = end_index(k, oh, h) + k_sl = slice(k_start, k_end) + for l in range(ow): + l_start = start_index(l, ow, w) + l_end = end_index(l, ow, w) + l_sl = slice(l_start, l_end) + np_out[i, j, k, l] = np_op(np_data[i, j, k_sl, l_sl]) + + opfunc = relay.contrib.adaptive_avg_pool2d if pool_type == "avg" else relay.contrib.adaptive_max_pool2d + x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) + y = opfunc(x, out_size, layout) + func = relay.Function([x], y) + + for target, ctx in ctx_list(): + intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + relay_out = intrp1.evaluate(func)(np_data) + tvm.testing.assert_allclose(relay_out.asnumpy(), np_out, rtol=1e-5, atol=1e-5) + +def test_adaptive_pool2d(): + verify_adaptive_pool2d((1, 9, 224, 224), (1, 1), "max") + verify_adaptive_pool2d((1, 3, 224, 224), (2, 3), "avg") + verify_adaptive_pool2d((1, 14, 56, 78), (34, 13), "max") + verify_adaptive_pool2d((1, 5, 46, 97), (4, 96), "avg") + + if __name__ == "__main__": + test_adaptive_pool2d() test_collapse_sum_like() test_broadcast_to_like() test_slice_like() diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 88963a63c770..a5350450b0a5 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -316,7 +316,6 @@ def test_avg_pool2d_no_count_pad(): op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) - def test_flatten_infer_type(): d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4") x = relay.var("x", relay.TensorType((d1, d2, d3, d4), "float32")) diff --git a/topi/include/topi/nn/pooling.h b/topi/include/topi/nn/pooling.h index 8648c01fdb3a..08cc93572e1a 100644 --- a/topi/include/topi/nn/pooling.h +++ b/topi/include/topi/nn/pooling.h @@ -231,6 +231,120 @@ inline Tensor pool(const Tensor& x, count_include_pad); } + +inline Expr start_index(const Var& out_index, + const Expr& odim, + const Expr& idim) { + return out_index * idim / odim; +} + +inline Expr end_index(const Var& out_index, + const Expr& odim, + const Expr& idim) { + Expr tmp = (out_index + 1) * idim / odim; + return tvm::ir::Select::make((out_index + 1) * idim % odim == 0, + tmp, tmp + 1); +} + +/*! +* \brief Perform adaptive pooling on height and width dimension of data. 
+*
+* \param x The input tensor
+* \param output_size Vector of two ints: {output_height, output_width}
+* \param pool_type The type of pooling operator
+* \param height_axis Index of the height dimension
+* \param width_axis Index of the width dimension
+*
+* \return The output tensor in same layout order
+*/
+inline Tensor adaptive_pool_impl(const Tensor& x,
+                                 const Array<Expr>& output_size,
+                                 PoolType pool_type,
+                                 const size_t height_axis,
+                                 const size_t width_axis) {
+  CHECK_EQ(output_size.size(), 2) << "output_size must have 2 elements";
+
+  auto height = x->shape[height_axis];
+  auto width = x->shape[width_axis];
+
+  auto out_height = output_size[0];
+  auto out_width = output_size[1];
+  Array<Expr> out_shape = x->shape;
+  out_shape.Set(height_axis, out_height);
+  out_shape.Set(width_axis, out_width);
+
+  if (pool_type == kMaxPool) {
+    return tvm::compute(out_shape, [&](const Array<Var>& output) {
+      Array<Expr> indices;
+      for (const Var& var : output) indices.push_back(var);
+      auto i_start_h = start_index(output[height_axis], out_height, height);
+      auto i_end_h = end_index(output[height_axis], out_height, height);
+      auto i_start_w = start_index(output[width_axis], out_width, width);
+      auto i_end_w = end_index(output[width_axis], out_width, width);
+      auto dheight = tvm::reduce_axis(Range(0, i_end_h - i_start_h), "rv1");
+      auto dwidth = tvm::reduce_axis(Range(0, i_end_w - i_start_w), "rv2");
+      indices.Set(height_axis, i_start_h + dheight);
+      indices.Set(width_axis, i_start_w + dwidth);
+      return tvm::max(x(indices), { dheight, dwidth });  // NOLINT(*)
+    }, "tensor", "adaptive_pool_max");
+  } else if (pool_type == kAvgPool) {
+    return tvm::compute(out_shape, [&](const Array<Var>& output) {
+      Array<Expr> indices;
+      for (const Var& var : output) indices.push_back(var);
+      auto i_start_h = start_index(output[height_axis], out_height, height);
+      auto i_end_h = end_index(output[height_axis], out_height, height);
+      auto i_start_w = start_index(output[width_axis], out_width, width);
+      auto i_end_w = end_index(output[width_axis], out_width, width);
+      auto divide_factor = tvm::cast(x->dtype, (i_end_h - i_start_h)
+                                               * (i_end_w - i_start_w));
+      auto dheight = tvm::reduce_axis(Range(0, i_end_h - i_start_h), "rv1");
+      auto dwidth = tvm::reduce_axis(Range(0, i_end_w - i_start_w), "rv2");
+      indices.Set(height_axis, i_start_h + dheight);
+      indices.Set(width_axis, i_start_w + dwidth);
+      return tvm::sum(x(indices) / divide_factor, { dheight, dwidth });
+    }, "tensor", "adaptive_pool_avg");
+  } else {
+    LOG(ERROR) << "Unrecognized pool_type: " << pool_type;
+    return x;
+  }
+}
+
+/*!
+* \brief Adaptively perform pooling on height and width dimension of data.
+* The pooling kernel and stride sizes are automatically chosen for desired output sizes.
+* It decides the height and width dimension according to the layout string,
+* in which 'W' and 'H' means width and height respectively.
+* Width and height dimension cannot be split.
+* For example, NCHW, NCHW16c, etc. are valid for pool,
+* while NCHW16w, NCHW16h are not.
+* See \a layout for more information of the layout string convention.
+*
+* \param x The input tensor
+* \param output_size Vector of two ints: {output_height, output_width}
+* \param pool_type The type of pooling operator
+* \param layout The input layout. Pooling supports any layout as long as 'H' and 'W' appear.
+* The layout is supposed to be composed of upper cases, lower cases and (optional) numbers,
+* where upper case indicates a dimension and
+* the corresponding lower case (with factor size) indicates the split dimension.
+* For example, NCHW16c can describe a 5-D tensor of
+* [batch_size, channel, height, width, channel_block].
+* (in which factor size `16` will not be used in pooling but for other operators,
+* it can be used to decide the output shape).
+* Since pooling does not care about the factor size of dimensions
+* other than `H` and `W`, one can pass `NCHWc` as well.
+*
+* \return The output tensor in same layout order
+*/
+inline Tensor adaptive_pool(const Tensor& x,
+                            const Array<Expr>& output_size,
+                            PoolType pool_type,
+                            const std::string& layout = "NCHW") {
+  int height_axis = -1, width_axis = -1;
+  CHECK(find_height_width(layout, &height_axis, &width_axis))
+    << "Unsupported layout " << layout;
+  return adaptive_pool_impl(x, output_size, pool_type, height_axis, width_axis);
+}
+
 /*!
 * \brief Perform global pooling on height and width dimension of data.
 * It decides the height and width dimension according to the layout string,
@@ -259,49 +373,7 @@ inline Tensor pool(const Tensor& x,
 inline Tensor global_pool(const Tensor& x,
                           PoolType pool_type,
                           const std::string& layout = "NCHW") {
-  CHECK(x->shape.size() >= 2) << "Pooling input must >= 2-D (H, W)";
-
-  int height_axis = -1, width_axis = -1;
-  CHECK(find_height_width(layout, &height_axis, &width_axis))
-    << "Unsupported layout " << layout;
-
-  Array<Expr> out_shape = x->shape;
-  out_shape.Set(height_axis, 1);
-  out_shape.Set(width_axis, 1);
-
-  auto height = x->shape[height_axis];
-  auto width = x->shape[width_axis];
-
-  auto dheight = tvm::reduce_axis(Range(0, height), "rv1");
-  auto dwidth = tvm::reduce_axis(Range(0, width), "rv2");
-
-  if (pool_type == kMaxPool) {
-    return tvm::compute(out_shape,
-      [&](const Array<Var>& output) {
-        Array<Expr> indices;
-        for (const Var& var : output) indices.push_back(var);
-        indices.Set(height_axis, dheight);
-        indices.Set(width_axis, dwidth);
-        return tvm::max(x(indices), { dheight, dwidth });  // NOLINT(*)
-      }, "tensor", "global_pool_max");
-  } else if (pool_type == kAvgPool) {
-    auto tsum = tvm::compute(out_shape,
-      [&](const Array<Var>& output) {
-        Array<Expr> indices;
-        for (const Var& var : output) indices.push_back(var);
-        indices.Set(height_axis, dheight);
-        indices.Set(width_axis, dwidth);
-        return tvm::sum(x(indices), { dheight, dwidth });
-      }, "tensor", "global_pool_sum");
-
-    return tvm::compute(out_shape,
-      [&](const Array<Var>& output) {
-        return tsum(output) / tvm::cast(x->dtype, height * width);
-      }, "tensor", kElementWise);
-  } else {
-    LOG(ERROR) << "Unrecognized pool_type: " << pool_type;
-    return x;
-  }
+  return adaptive_pool(x, Array<Expr>{1, 1}, pool_type, layout);
 }
 
 }  // namespace nn
diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py
index 65ed0ff10dad..526429b91bee 100644
--- a/topi/python/topi/cuda/__init__.py
+++ b/topi/python/topi/cuda/__init__.py
@@ -12,7 +12,7 @@
 from .softmax import schedule_softmax
 from .injective import schedule_injective, schedule_elemwise, schedule_broadcast
 from .dense import schedule_dense
-from .pooling import schedule_pool, schedule_global_pool
+from .pooling import schedule_pool, schedule_adaptive_pool
 from .extern import schedule_extern
 from .nn import schedule_lrn, schedule_l2_normalize
 from .batch_matmul import schedule_batch_matmul
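A worked example of the window arithmetic implemented by `start_index`/`end_index` above (plain Python, illustration only): adjacent windows may overlap, and their union always covers the whole input extent.

```python
# start = floor(i * idim / odim), end = ceil((i + 1) * idim / odim),
# matching the Select-based ceil in the C++ end_index above.
import math

def start_index(i, odim, idim):
    return math.floor(i * idim / odim)

def end_index(i, odim, idim):
    return math.ceil((i + 1) * idim / odim)

idim, odim = 5, 3   # pool an extent of 5 down to 3
windows = [(start_index(i, odim, idim), end_index(i, odim, idim))
           for i in range(odim)]
assert windows == [(0, 2), (1, 4), (3, 5)]   # half-open, overlapping
```

diff --git a/topi/python/topi/cuda/pooling.py b/topi/python/topi/cuda/pooling.py
index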
ac3644ddd1b8..2d12c4a5ee65 100644 --- a/topi/python/topi/cuda/pooling.py +++ b/topi/python/topi/cuda/pooling.py @@ -20,23 +20,26 @@ from .. import tag from .. import generic -@generic.schedule_global_pool.register(["cuda", "gpu"]) -def schedule_global_pool(outs): - """Schedule for global_pool. + + +@generic.schedule_adaptive_pool.register(["cuda", "gpu"]) +def schedule_adaptive_pool(outs): + """Schedule for adaptive_pool. Parameters ---------- outs: Array of Tensor - The computation graph description of global_pool + The computation graph description of adaptive_pool in the format of an array of tensors. Returns ------- s: Schedule - The computation schedule for global_pool. + The computation schedule for adaptive_pool. """ outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) + def _schedule(Pool): num_thread = 8 block_x = tvm.thread_axis("blockIdx.x") @@ -73,7 +76,7 @@ def traverse(OP): if tensor.op.input_tensors and tensor.op not in scheduled_ops: traverse(tensor.op) # schedule global_pool - elif OP.tag.startswith('global_pool'): + elif OP.tag.startswith('adaptive_pool'): Pool = OP.output(0) _schedule(Pool) else: diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index db77f37e36a9..db1c772279e5 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -403,14 +403,14 @@ def schedule_pool(outs, layout): return _default_schedule(outs, False) -@tvm.target.override_native_generic_func("schedule_global_pool") -def schedule_global_pool(outs): - """Schedule for global pool +@tvm.target.override_native_generic_func("schedule_adaptive_pool") +def schedule_adaptive_pool(outs): + """Schedule for adaptive pool Parameters ---------- outs: Array of Tensor - The computation graph description of global pool + The computation graph description of adaptive pool in the format of an array of tensors. Returns @@ -420,6 +420,7 @@ def schedule_global_pool(outs): """ return _default_schedule(outs, False) + @tvm.target.override_native_generic_func("schedule_binarize_pack") def schedule_binarize_pack(outs): """Schedule for binarize_pack diff --git a/topi/python/topi/hls/nn.py b/topi/python/topi/hls/nn.py index 5ce8394de981..adb8862f1b2c 100644 --- a/topi/python/topi/hls/nn.py +++ b/topi/python/topi/hls/nn.py @@ -360,14 +360,14 @@ def traverse(OP): return s -@generic.schedule_global_pool.register(["hls"]) -def schedule_global_pool(outs): - """Schedule for global pool +@generic.schedule_adaptive_pool.register(["hls"]) +def schedule_adaptive_pool(outs): + """Schedule for adaptive_pool Parameters ---------- outs: Array of Tensor - The computation graph description of global pool + The computation graph description of adaptive_pool in the format of an array of tensors. Returns @@ -389,7 +389,7 @@ def traverse(OP): if tensor.op.input_tensors: traverse(tensor.op) # schedule global_pool - elif OP.tag.startswith('global_pool'): + elif OP.tag.startswith('adaptive_pool'): Pool = OP.output(0) if not Pool.op in s.outputs: Out = outs[0].op.output(0) diff --git a/topi/python/topi/nn/pooling.py b/topi/python/topi/nn/pooling.py index 258de99a5842..2ecc3cf1bffd 100644 --- a/topi/python/topi/nn/pooling.py +++ b/topi/python/topi/nn/pooling.py @@ -186,3 +186,43 @@ def global_avg_pool2d_alter_layout(attrs, inputs, tinfos): """ # not to change by default return None +def adaptive_pool(data, + output_size, + pool_type, + layout="NCHW"): + """Perform pooling on height and width dimension of data. 
+ The pooling kernel and stride sizes are automatically chosen for desired + output sizes. + It decides the height and width dimension according to the layout string, + in which 'W' and 'H' means width and height respectively. + Width and height dimension cannot be split. + For example, NCHW, NCHW16c, etc. are valid for pool, + while NCHW16w, NCHW16h are not. + See parameter `layout` for more information of the layout string convention. + + Parameters + ---------- + data : tvm.Tensor + n-D with shape of layout + + output_size : tuple of int + output height and width. + + pool_type : str + Pool type, 'max' or 'avg' + + layout: string + Layout of the input data. + The layout is supposed to be composed of upper cases, lower cases and numbers, + where upper case indicates a dimension and + the corresponding lower case with factor size indicates the split dimension. + For example, NCHW16c can describe a 5-D tensor of + [batch_size, channel, height, width, channel_block], + in which channel_block=16 is a split of dimension channel. + + Returns + ------- + output : tvm.Tensor + n-D in the same layout + """ + return cpp.nn.adaptive_pool(data, output_size, POOL_TYPE_CODE[pool_type], layout) diff --git a/topi/python/topi/opengl/__init__.py b/topi/python/topi/opengl/__init__.py index c8f20b9825a7..37eac441df30 100644 --- a/topi/python/topi/opengl/__init__.py +++ b/topi/python/topi/opengl/__init__.py @@ -6,4 +6,4 @@ from .injective import schedule_injective, schedule_elemwise, schedule_broadcast from .softmax import schedule_softmax from .dense import schedule_dense -from .pooling import schedule_pool, schedule_global_pool +from .pooling import schedule_pool, schedule_adaptive_pool diff --git a/topi/python/topi/opengl/pooling.py b/topi/python/topi/opengl/pooling.py index 99c4decd9960..56f0b088d1e5 100644 --- a/topi/python/topi/opengl/pooling.py +++ b/topi/python/topi/opengl/pooling.py @@ -20,9 +20,9 @@ from .. import tag from .. import generic -@generic.schedule_global_pool.register(["opengl"]) -def schedule_global_pool(outs): - """Schedule for global_pool. +@generic.schedule_adaptive_pool.register(["opengl"]) +def schedule_adaptive_pool(outs): + """Schedule for adaptive pool. Parameters ---------- @@ -33,7 +33,7 @@ def schedule_global_pool(outs): Returns ------- s: Schedule - The computation schedule for global_pool. + The computation schedule for adaptive pool. 
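Before the schedules, the bin arithmetic of adaptive pooling is worth making explicit: along an axis of length n, output element i reduces the input slice [floor(i*n/out), ceil((i+1)*n/out)), so the bins tile the input exactly even when n is not divisible by the output size, and an output size of 1 degenerates to global pooling, which is why global_pool above becomes a one-line call into adaptive_pool. A minimal NumPy sketch of that rule, mirroring the reference implementation in the new test further below:

import numpy as np

def adaptive_pool1d_ref(x, out_size, op=np.max):
    """Reference adaptive pooling along the last axis (illustrative only)."""
    n = x.shape[-1]
    out = np.empty(x.shape[:-1] + (out_size,), dtype=x.dtype)
    for i in range(out_size):
        start = int(np.floor(i * n / out_size))      # bin start, inclusive
        end = int(np.ceil((i + 1) * n / out_size))   # bin end, exclusive
        out[..., i] = op(x[..., start:end], axis=-1)
    return out

# out_size == 1 reduces the whole axis: global pooling as a special case.
assert adaptive_pool1d_ref(np.arange(7.0), 1)[0] == 6.0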
""" outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) @@ -57,7 +57,7 @@ def traverse(OP): if tensor.op.input_tensors and tensor.op not in scheduled_ops: traverse(tensor.op) # schedule global_pool - elif OP.tag.startswith('global_pool'): + elif OP.tag.startswith('adaptive_pool'): Pool = OP.output(0) _schedule(Pool) else: diff --git a/topi/python/topi/x86/__init__.py b/topi/python/topi/x86/__init__.py index a414e3f7a5b7..cce816d43ba1 100644 --- a/topi/python/topi/x86/__init__.py +++ b/topi/python/topi/x86/__init__.py @@ -7,7 +7,7 @@ from .binary_dense import schedule_binary_dense from .nn import * from .injective import * -from .pooling import schedule_pool, schedule_global_pool +from .pooling import schedule_pool, schedule_adaptive_pool from .bitserial_conv2d import schedule_bitserial_conv2d from .bitserial_dense import schedule_bitserial_dense from .depthwise_conv2d import schedule_depthwise_conv2d_NCHWc diff --git a/topi/python/topi/x86/pooling.py b/topi/python/topi/x86/pooling.py index 4cf213aac86d..816e03c3c76a 100644 --- a/topi/python/topi/x86/pooling.py +++ b/topi/python/topi/x86/pooling.py @@ -110,14 +110,14 @@ def traverse(OP): return s -@generic.schedule_global_pool.register(["cpu"]) -def schedule_global_pool(outs): - """Schedule for global pool +@generic.schedule_adaptive_pool.register(["cpu"]) +def schedule_adaptive_pool(outs): + """Schedule for adaptive pool Parameters ---------- outs: Array of Tensor - The computation graph description of pool + The computation graph description of adaptive pool in the format of an array of tensors. Returns @@ -139,7 +139,7 @@ def traverse(OP): if tensor.op.input_tensors and tensor.op not in scheduled_ops: traverse(tensor.op) # schedule pool - elif OP.tag.startswith('global_pool'): + elif OP.tag.startswith('adaptive_pool'): Pool = OP.output(0) _parallel_sch(s[Pool], outs[0].shape) else: diff --git a/topi/src/topi.cc b/topi/src/topi.cc index d486e7b831bc..1585d877b625 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -455,6 +455,13 @@ TVM_REGISTER_GLOBAL("topi.nn.global_pool") static_cast(static_cast(args[1]))); }); +TVM_REGISTER_GLOBAL("topi.nn.adaptive_pool") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = nn::adaptive_pool(args[0], args[1], + static_cast(static_cast(args[2])), + args[3]); +}); + /* Ops from nn/softmax.h */ TVM_REGISTER_GLOBAL("topi.nn.softmax") .set_body([](TVMArgs args, TVMRetValue *rv) { diff --git a/topi/tests/python/test_topi_pooling.py b/topi/tests/python/test_topi_pooling.py index 36dcd0fc2b2e..bba14ec06654 100644 --- a/topi/tests/python/test_topi_pooling.py +++ b/topi/tests/python/test_topi_pooling.py @@ -120,7 +120,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_global_pool(B) + s = topi.generic.schedule_adaptive_pool(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) f = tvm.build(s, [A, B], device) @@ -136,7 +136,58 @@ def test_global_pool(): verify_global_pool(1, 1024, 7, 7, 'max') verify_global_pool(4, 1024, 7, 7, 'max') +def verify_adaptive_pool(dshape, out_size, pool_type, layout="NCHW", dtype="float32"): + def start_index(index, odim, idim): + return int(np.floor(index * idim / odim)) + + def end_index(index, odim, idim): + return int(np.ceil((index + 1) * idim / odim)) + + np_data = np.random.uniform(low=0, high=255, size=dshape).astype(dtype) + n, c, h, w = dshape + oh, ow = out_size + oshape = (n, 
c) + out_size + np_out = np.zeros(oshape).astype(dtype) + np_op = np.mean if pool_type == "avg" else np.max + for i in range(n): + for j in range(c): + for k in range(oh): + k_start = start_index(k, oh, h) + k_end = end_index(k, oh, h) + k_sl = slice(k_start, k_end) + for l in range(ow): + l_start = start_index(l, ow, w) + l_end = end_index(l, ow, w) + l_sl = slice(l_start, l_end) + np_out[i, j, k, l] = np_op(np_data[i, j, k_sl, l_sl]) + + data = tvm.placeholder(dshape, name="data", dtype=dtype) + out = topi.nn.adaptive_pool(data, out_size, pool_type, layout) + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_adaptive_pool(out) + a = tvm.nd.array(np_data, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(oshape), dtype=out.dtype), ctx) + f = tvm.build(s, [data, out], device) + f(a, b) + tvm.testing.assert_allclose(b.asnumpy(), np_out, rtol=1e-5) + + for device in get_all_backend(): + check_device(device) + +def test_adaptive_pool(): + verify_adaptive_pool((1, 3, 224, 224), (1, 1), "max") + verify_adaptive_pool((1, 3, 224, 224), (1, 1), "avg") + verify_adaptive_pool((1, 14, 56, 78), (34, 13), "max") + verify_adaptive_pool((1, 5, 46, 97), (4, 96), "avg") + if __name__ == "__main__": test_pool() test_global_pool() + test_adaptive_pool() From 1d16e83fbaef48b70ea97999ee268c56275cee1d Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Wed, 8 May 2019 21:06:33 -0700 Subject: [PATCH 091/106] [BuildModule] Fix AlterLayout Pass (#3155) --- src/relay/backend/build_module.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index b60a048e638a..67ab7501b9fa 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -504,7 +504,14 @@ class RelayBuildModule : public runtime::ModuleNode { if (cfg.pass_enabled("AlterOpLayout")) { if (targets.size() == 1) { func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); - func = CallPackedFunc("relay._ir_pass.AlterOpLayout", func); + auto enter_pf = GetPackedFunc("_EnterTargetScope"); + auto exit_pf = GetPackedFunc("_ExitTargetScope"); + for (const auto& kv : targets) { + auto target = Target::create(kv.second); + (*enter_pf)(target); + func = CallPackedFunc("relay._ir_pass.AlterOpLayout", func); + (*exit_pf)(); + } } else { LOG(WARNING) << "AlterOpLayout pass is not enabled for heterogeneous" << " execution yet."; From b260969c872396f6c208f489ad7034b20b931692 Mon Sep 17 00:00:00 2001 From: Jared Roesch Date: Thu, 9 May 2019 02:09:15 -0400 Subject: [PATCH 092/106] [Relay][Runtime] Implementation of Relay VM (#2889) * Implement the virtual machine Co-Authored-By: wweic * Fix rebase build issues * Reorganize vm.py and fix allocator bug * Remove compiler * Remove tests * Remove backend/vm/vm.cc too * Fix docs * Fix doc * Fix doc * Add vm docs * Remove change to dead_code.cc * Remove Relay logging * Remove reduce * Update include/tvm/runtime/vm.h Co-Authored-By: jroesch * Reformat * Update include/tvm/runtime/vm.h Co-Authored-By: jroesch * Address feedback * Update include/tvm/runtime/vm.h Co-Authored-By: jroesch * Apply suggestions from code review Co-Authored-By: jroesch * Fix a couple outstanding comments * Last couple comments * Update include/tvm/runtime/vm.h Co-Authored-By: jroesch * Address code review feedback * Fix final comment * Address 
comments * Error reporting and example * add Const * Explicitly delete copy assignment operator * Fix rebase * Pass 3rd arg to fusion --- CMakeLists.txt | 13 +- cmake/config.cmake | 4 + include/tvm/relay/logging.h | 51 -- include/tvm/relay/pass.h | 19 +- include/tvm/runtime/c_runtime_api.h | 2 +- include/tvm/runtime/ndarray.h | 6 +- include/tvm/runtime/vm.h | 424 +++++++++++ python/tvm/relay/backend/_vm.py | 21 + python/tvm/relay/backend/interpreter.py | 6 +- python/tvm/relay/backend/vm.py | 129 ++++ python/tvm/relay/build_module.py | 6 +- python/tvm/relay/expr.py | 28 +- python/tvm/relay/ir_pass.py | 24 +- python/tvm/relay/module.py | 16 +- src/arithmetic/canonical_simplify.cc | 10 +- src/relay/backend/build_module.cc | 55 +- src/relay/backend/compile_engine.h | 1 + src/relay/backend/interpreter.cc | 16 +- src/relay/ir/error.cc | 1 + src/relay/ir/expr.cc | 4 +- src/relay/ir/hash.cc | 6 +- src/relay/ir/module.cc | 19 +- src/relay/ir/type_functor.cc | 4 +- src/relay/ir/type_functor.h | 5 +- src/relay/op/type_relations.cc | 9 +- src/relay/pass/eta_expand.cc | 71 ++ src/relay/pass/fold_constant.cc | 2 +- src/relay/pass/fuse_ops.cc | 40 +- src/relay/pass/kind_check.cc | 4 +- src/relay/pass/partial_eval.cc | 6 +- src/relay/pass/to_a_normal_form.cc | 22 +- src/relay/pass/type_infer.cc | 9 +- src/runtime/vm/memory_manager.cc | 24 +- src/runtime/vm/memory_manager.h | 1 + src/runtime/vm/naive_allocator.h | 2 +- src/runtime/vm/object.cc | 21 +- src/runtime/vm/vm.cc | 670 ++++++++++++++++++ .../relay/test_pass_dead_code_elimination.py | 4 +- tests/python/relay/test_pass_eta_expand.py | 32 + tests/python/relay/test_pass_partial_eval.py | 7 +- topi/include/topi/transform.h | 3 +- 41 files changed, 1627 insertions(+), 170 deletions(-) delete mode 100644 include/tvm/relay/logging.h create mode 100644 include/tvm/runtime/vm.h create mode 100644 python/tvm/relay/backend/_vm.py create mode 100644 python/tvm/relay/backend/vm.py create mode 100644 src/relay/pass/eta_expand.cc create mode 100644 src/runtime/vm/vm.cc create mode 100644 tests/python/relay/test_pass_eta_expand.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 76da288eba9e..dceb9f46568e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,7 @@ tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" O tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF) tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON) tvm_option(USE_GRAPH_RUNTIME_DEBUG "Build with tiny graph runtime debug mode" OFF) +tvm_option(USE_RELAY_DEBUG "Building Relay in debug mode..." 
OFF) tvm_option(USE_SGX "Build with SGX" OFF) tvm_option(USE_RTTI "Build with RTTI" ON) tvm_option(USE_MSVC_MT "Build with MT" OFF) @@ -141,7 +142,10 @@ file(GLOB TOPI_SRCS ) file(GLOB_RECURSE HALIDEIR_SRCS 3rdparty/HalideIR/src/*.cpp) list(APPEND COMPILER_SRCS ${HALIDEIR_SRCS}) -file(GLOB RUNTIME_SRCS src/runtime/*.cc) +file(GLOB RUNTIME_SRCS + src/runtime/*.cc + src/runtime/vm/*.cc +) # Package runtime rules if(NOT USE_RTTI) @@ -201,6 +205,13 @@ add_library(tvm SHARED ${COMPILER_SRCS} ${RUNTIME_SRCS}) add_library(tvm_topi SHARED ${TOPI_SRCS}) add_library(tvm_runtime SHARED ${RUNTIME_SRCS}) add_library(tvm_runtime_static STATIC ${RUNTIME_SRCS}) + +if(USE_RELAY_DEBUG) + message(STATUS "Building Relay in debug mode...") + set_target_properties(tvm PROPERTIES COMPILE_DEFINITIONS "USE_RELAY_DEBUG") + set_target_properties(tvm PROPERTIES COMPILE_DEFINITIONS "NDEBUG") +endif(USE_RELAY_DEBUG) + if(NOT USE_SGX STREQUAL "OFF") add_dependencies(tvm sgx_edl) add_dependencies(tvm_runtime sgx_edl tvm_t) diff --git a/cmake/config.cmake b/cmake/config.cmake index 448fb25bd519..e7ddb9aba6b8 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -137,3 +137,7 @@ set(USE_ANTLR OFF) # Build TSIM for VTA set(USE_VTA_TSIM OFF) + +# Whether use Relay debug mode +set(USE_RELAY_DEBUG OFF) + diff --git a/include/tvm/relay/logging.h b/include/tvm/relay/logging.h deleted file mode 100644 index 709ab5a0a6b2..000000000000 --- a/include/tvm/relay/logging.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file tvm/relay/logging.h - * \brief A wrapper around dmlc-core/logging.h which adds the ability - * to toggle logging via an environment variable. - */ - -#ifndef TVM_RELAY_LOGGING_H_ -#define TVM_RELAY_LOGGING_H_ - -#include -#include -#include -#include - -namespace tvm { -namespace relay { - -static bool logging_enabled() { - if (auto var = std::getenv("RELAY_LOG")) { - std::string is_on(var); - return is_on == "1"; - } else { - return false; - } -} - -#define RELAY_LOG(severity) LOG_IF(severity, logging_enabled()) - -} // namespace relay -} // namespace tvm - -#endif // TVM_RELAY_LOGGING_H_ diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h index 2db3a061b872..43831fce3bbc 100644 --- a/include/tvm/relay/pass.h +++ b/include/tvm/relay/pass.h @@ -320,6 +320,22 @@ TVM_DLL bool AlphaEqual(const Expr& e1, const Expr& e2); */ TVM_DLL bool AlphaEqual(const Type& t1, const Type& t2); +/*! \brief Add abstraction over a function + * + * For example: `square` is transformed to + * `fun x -> square x`. + * + * See https://en.wikipedia.org/wiki/Lambda_calculus#%CE%B7-conversion + * for more details. + * + * \param e The original function. 
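In Python terms, the pass declared here is exposed later in this series as relay.ir_pass.eta_expand. A hedged usage sketch; the module contents (the name square, the shape) are illustrative and not taken from the patch:

import tvm
from tvm import relay

mod = relay.Module()
x = relay.var("x", shape=(10,), dtype="float32")
square = relay.GlobalVar("square")
mod[square] = relay.Function([x], x * x)

# A bare reference to `square` is a GlobalVar, not a Function; eta
# expansion rewrites it to the equivalent `fn (%a) { square(%a) }`.
expanded = relay.ir_pass.eta_expand(square, mod)
print(expanded)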
+ * \param mod The module used for referencing global functions, can be + * None. + * + * \return the new function with abstraction + */ +TVM_DLL Expr EtaExpand(const Expr& e, const Module& mod); + /*! \brief Check that each Var is only bound once. * * For example, the expression `let x = 1 in let x = 2 in 3` bound x twice. @@ -467,9 +483,10 @@ TVM_DLL Expr FoldConstant(const Expr& expr); * \brief Fuse operations into expr into seperate functions. * \param expr The expression. * \param fuse_opt_level Optimization level. + * \param mod the module. * \return The optimized expression. */ -TVM_DLL Expr FuseOps(const Expr& expr, int fuse_opt_level); +TVM_DLL Expr FuseOps(const Expr& expr, int fuse_opt_level, const Module& mod); /*! * \brief Apply rewrite rules to rewrite the expr in post DFS order. diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index 735eb1be11c2..f992e87ad100 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -103,6 +103,7 @@ typedef enum { kStr = 11U, kBytes = 12U, kNDArrayContainer = 13U, + kObject = 14U, // Extension codes for other frameworks to integrate TVM PackedFunc. // To make sure each framework's id do not conflict, use first and // last sections to mark ranges. @@ -113,7 +114,6 @@ typedef enum { // The following section of code is used for non-reserved types. kExtReserveEnd = 64U, kExtEnd = 128U, - kObject = 14U, } TVMTypeCode; /*! diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index 9e7814b7f620..aea551ee7d69 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -306,9 +306,11 @@ class NDArray::Container { DLContext ctx) { dl_tensor.data = data; shape_ = std::move(shape); - dl_tensor.shape = dmlc::BeginPtr(shape); - dl_tensor.ndim = static_cast(shape.size()); + dl_tensor.ndim = static_cast(shape_.size()); + dl_tensor.shape = dmlc::BeginPtr(shape_); dl_tensor.dtype = dtype; + dl_tensor.strides = nullptr; + dl_tensor.byte_offset = 0; dl_tensor.ctx = ctx; } diff --git a/include/tvm/runtime/vm.h b/include/tvm/runtime/vm.h new file mode 100644 index 000000000000..0a0a4debf294 --- /dev/null +++ b/include/tvm/runtime/vm.h @@ -0,0 +1,424 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file tvm/runtime/vm.h + * \brief A virtual machine for executing Relay programs. + */ +#ifndef TVM_RUNTIME_VM_H_ +#define TVM_RUNTIME_VM_H_ + +#include +#include +#include +#include +#include +#include + +namespace tvm { +namespace runtime { +namespace vm { + +/*! \brief A register name. */ +using RegName = int64_t; + +/*! \brief An alias for the integer type used ubiquitously + * in the VM. + */ +using Index = int64_t; + +/*! 
\brief An enumeration of Relay's opcodes. + * + * The opcode is used to implement instruction + * as a tagged union. + */ +enum class Opcode { + Move = 0U, + Ret = 1U, + Invoke = 2U, + InvokeClosure = 3U, + InvokePacked = 4U, + AllocTensor = 5U, + AllocDatatype = 6U, + AllocClosure = 7U, + GetField = 8U, + If = 9U, + Select = 10U, + LoadConst = 11U, + Goto = 12U +}; + +/*! \brief A single virtual machine instruction. + * + * The representation of the instruction is as + * a tagged union. + * + * The first field represents which instruction, + * and by extension which field of the union + * is active. + */ +struct Instruction { + /*! \brief The instruction opcode. */ + Opcode op; + + /*! \brief The destination register. */ + RegName dst; + + union { + struct /* AllocTensor Operands */ { + /*! \brief The register to read the shape out of. */ + RegName shape_register; + /*! \brief The datatype of tensor to be allocated. */ + DLDataType dtype; + }; + struct /* InvokeClosure Operands */ { + /*! \brief The register containing the closure. */ + RegName closure; + /*! \brief The number of arguments to the closure. */ + Index closure_args_num; + /*! \brief The closure arguments as an array. */ + RegName* closure_args; + }; + struct /* Return Operands */ { + /*! \brief The register to return. */ + RegName result; + }; + struct /* Move Operands */ { + /*! \brief The source register for a move operation. */ + RegName from; + }; + struct /* Packed Operands */ { + /*! \brief The index into the packed function table. */ + Index packed_index; + /*! \brief The arity of the packed function. */ + Index arity; + /*! \brief The number of outputs produced by the packed function. */ + Index output_size; + /*! \brief The arguments to pass to the packed function. */ + RegName* packed_args; + }; + struct /* Select Operands */ { + /*! \brief The condition of select. */ + RegName select_cond; + /*! \brief The true branch. */ + RegName select_op1; + /*! \brief The false branch. */ + RegName select_op2; + }; + struct /* If Operands */ { + /*! \brief The register containing the condition value. */ + RegName if_cond; + /*! \brief The program counter offset for the true branch. */ + Index true_offset; + /*! \brief The program counter offset for the false branch. */ + Index false_offset; + }; + struct /* Invoke Operands */ { + /*! \brief The function to call. */ + Index func_index; + /*! \brief The number of arguments to the function. */ + Index num_args; + /*! \brief The registers containing the arguments. */ + RegName* invoke_args_registers; + }; + struct /* Const Operands */ { + /* \brief The index into the constant pool. */ + Index const_index; + }; + struct /* Jump Operands */ { + /*! \brief The jump offset. */ + Index pc_offset; + }; + struct /* Proj Operands */ { + /*! \brief The register to project from. */ + RegName object; + /*! \brief The field to read out. */ + Index field_index; + }; + struct /* AllocDatatype Operands */ { + /*! \brief The datatype's constructor tag. */ + Index constructor_tag; + /*! \brief The number of fields to store in the datatype. */ + Index num_fields; + /*! \brief The fields as an array. */ + RegName* datatype_fields; + }; + struct /* AllocClosure Operands */ { + /*! \brief The index into the function table. */ + Index clo_index; + /*! \brief The number of free variables to capture. */ + Index num_freevar; + /*! \brief The free variables as an array. */ + RegName* free_vars; + }; + }; + + /*! \brief Construct a select instruction. + * \param cond The condition register. 
+ * \param op1 The true register. + * \param op2 The false register. + * \param dst The destination register. + * \return The select instruction. + */ + static Instruction Select(RegName cond, RegName op1, RegName op2, RegName dst); + /*! \brief Construct a return instruction. + * \param return_reg The register containing the return value. + * \return The return instruction. + */ + static Instruction Ret(RegName return_reg); + /*! \brief Construct an invoke packed instruction. + * \param packed_index The index of the packed function. + * \param arity The arity of the function. + * \param output_size The number of outputs of the packed function. + * \param args The argument registers. + * \return The invoke packed instruction. + */ + static Instruction InvokePacked(Index packed_index, Index arity, Index output_size, + const std::vector& args); + /*! \brief Construct an allocate tensor instruction. + * \param shape_register The register containing the shape. + * \param dtype The dtype of the tensor. + * \param dst The destination register. + * \return The allocate tensor instruction. + */ + static Instruction AllocTensor(RegName shape_register, DLDataType dtype, RegName dst); + /*! \brief Construct an allocate datatype instruction. + * \param tag The datatype tag. + * \param num_fields The number of fields for the datatype. + * \param fields The registers containing the fields. + * \param dst The register name of the destination. + * \return The allocate datatype instruction. + */ + static Instruction AllocDatatype(Index tag, Index num_fields, const std::vector& fields, + RegName dst); + /*! \brief Construct an allocate closure instruction. + * \param func_index The index of the function table. + * \param num_freevar The number of free variables. + * \param free_vars The registers of the free variables. + * \param dst The destination register. + * \return The allocate closure instruction. + */ + static Instruction AllocClosure(Index func_index, Index num_freevar, + const std::vector& free_vars, RegName dst); + /*! \brief Construct a get field instruction. + * \param object_reg The register containing the object to project from. + * \param field_index The field to read out of the object. + * \param dst The destination register. + * \return The get field instruction. + */ + static Instruction GetField(RegName object_reg, Index field_index, RegName dst); + /*! \brief Construct an if instruction. + * \param cond_reg The register containing the condition. + * \param true_branch The offset to the true branch. + * \param false_branch The offset to the false branch. + * \return The if instruction. + */ + static Instruction If(RegName cond_reg, Index true_branch, Index false_branch); + /*! \brief Construct a goto instruction. + * \param pc_offset The offset from the current pc. + * \return The goto instruction. + */ + static Instruction Goto(Index pc_offset); + /*! \brief Construct an invoke instruction. + * \param func_index The index of the function to invoke. + * \param args The registers containing the arguments. + * \param dst The destination register. + * \return The invoke instruction. + */ + static Instruction Invoke(Index func_index, const std::vector& args, RegName dst); + /*! \brief Construct an invoke closure instruction. + * \param closure The register of the closure to invoke. + * \param args The registers containing the arguments. + * \param dst The destination register. + * \return The invoke closure instruction.
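Since Instruction is a tagged union, the interpreter core is a single loop that switches on the opcode and advances the program counter. The following toy Python model is not TVM code; it sketches that register-machine shape with just four opcodes:

# A toy model of the register-VM dispatch loop (illustrative, not TVM API).
def run(instructions, registers, constants):
    pc = 0
    while True:
        op, args = instructions[pc]
        if op == "LoadConst":            # dst <- constants[index]
            index, dst = args
            registers[dst] = constants[index]
            pc += 1
        elif op == "Move":               # dst <- registers[src]
            src, dst = args
            registers[dst] = registers[src]
            pc += 1
        elif op == "Goto":               # relative jump
            (offset,) = args
            pc += offset
        elif op == "Ret":                # return the result register
            (result,) = args
            return registers[result]
        else:
            raise ValueError("unknown opcode: %s" % op)

# Loads constant 0 into r0, copies it to r1, and returns it.
prog = [("LoadConst", (0, 0)), ("Move", (0, 1)), ("Ret", (1,))]
print(run(prog, [None] * 2, ["hello"]))  # -> "hello"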
+ */ + static Instruction InvokeClosure(RegName closure, const std::vector& args, RegName dst); + /*! \brief Construct a load constant instruction. + * \param const_index The index of the constant. + * \param dst The destination register. + * \return The load constant instruction. + */ + static Instruction LoadConst(Index const_index, RegName dst); + /*! \brief Construct a move instruction. + * \param src The source register. + * \param dst The destination register. + * \return The move instruction. + */ + static Instruction Move(RegName src, RegName dst); + + Instruction(); + Instruction(const Instruction& instr); + Instruction& operator=(const Instruction& instr) = delete; + ~Instruction(); + + friend std::ostream& operator<<(std::ostream& os, const Instruction&); +}; + +/*! \brief A representation of a Relay function in the VM. + * + * Contains metadata about the compiled function, as + * well as the compiled VM instructions. + */ +struct VMFunction { + /*! \brief The function's name. */ + std::string name; + /*! \brief The number of function parameters. */ + Index params; + /*! \brief The instructions representing the function. */ + std::vector instructions; + /*! \brief The size of the frame for this function */ + Index register_file_size; + + VMFunction(const std::string& name, Index params, + const std::vector& instructions, + Index register_file_size) + : name(name), + params(params), + instructions(instructions), + register_file_size(register_file_size) {} + + VMFunction() {} + + friend std::ostream& operator<<(std::ostream& os, const VMFunction&); +}; + +/*! \brief A representation of a stack frame. + * + * A stack frame is a record containing the information needed + * to restore the caller's virtual machine state after returning + * from a function call. + */ +struct VMFrame { + /*! \brief The return program counter. */ + Index pc; + /*! \brief The index into the function table, points to the caller. */ + Index func_index; + /*! \brief The number of arguments. */ + Index args; + /*! \brief A pointer into the caller function's instructions. */ + const Instruction* code; + + /*! \brief Statically allocated space for objects */ + std::vector register_file; + + /*! \brief Register in caller's frame to put return value */ + RegName caller_return_register; + + VMFrame(Index pc, Index func_index, Index args, const Instruction* code, Index register_file_size) + : pc(pc), + func_index(func_index), + args(args), + code(code), + register_file(register_file_size), + caller_return_register(0) {} +}; + +/*! \brief The virtual machine. + * + * The virtual machine contains all the current execution state, + * as well as the global view of functions, the global constant + * table, the compiled operators. + * + * The goal is to have a single self-contained object, + * enabling one to easily pass around VMs, execute them on + * multiple threads, or serialized them to disk or over the + * wire. + */ +struct VirtualMachine { + /*! \brief The virtual machine's packed function table. */ + std::vector packed_funcs; + /*! \brief The virtual machine's function table. */ + std::vector functions; + /*! \brief The current stack of call frames. */ + std::vector frames; + /*! \brief The global constant pool. */ + std::vector constants; + /*! \brief The fuction table index of the current function. */ + Index func_index; + /*! \brief The current pointer to the code section. */ + const Instruction* code; + /*! \brief The virtual machine PC. */ + Index pc; + + /*! \brief The special return register. 
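All of this state stays behind VirtualMachine::Invoke; from Python the VM is reached through the new "vm" executor kind that this series registers in relay.create_executor. A sketch of that user-facing path, assuming the whole series is applied and an LLVM-enabled build:

import numpy as np
import tvm
from tvm import relay

x = relay.var("x", shape=(2, 2), dtype="float32")
func = relay.Function([x], x + x)

# "vm" is the executor kind added to relay.create_executor below.
ex = relay.create_executor("vm", mod=relay.Module(), ctx=tvm.cpu(), target="llvm")
data = np.ones((2, 2), dtype="float32")
# The result is a VM tensor object whose values are all 2.0.
print(ex.evaluate(func)(data))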
*/ + Object return_register; + + /*! \brief The set of TVM contexts the VM is currently executing on. */ + std::vector ctxs; + + /*! \brief Push a call frame on to the call stack. */ + void PushFrame(Index arg_count, Index ret_pc, const VMFunction& vm_func); + /*! \brief Pop a frame off the call stack. + * \return The number of frames left. + */ + Index PopFrame(); + + /*! \brief Write to a VM register. + * \param reg The register to write to. + * \param obj The object to write to. + */ + inline void WriteRegister(RegName reg, const Object& obj); + + /*! \brief Read a VM register. + * \param reg The register to read from. + * \return The read object. + */ + inline Object ReadRegister(RegName reg) const; + + /*! \brief Invoke a VM function. + * \param func The function. + * \param args The arguments to the function. + * \return The object representing the result. + */ + Object Invoke(const VMFunction& func, const std::vector& args); + + // TODO(@jroesch): I really would like this to be a global variable. + /*! \brief Invoke a VM function by name. + * \param name The function's name. + * \param args The arguments to the function. + * \return The object representing the result. + */ + Object Invoke(const std::string& name, const std::vector& args); + + VirtualMachine() : functions(), frames(), func_index(0), code(nullptr), pc(0) {} + + /*! \brief Initialize the virtual machine for a set of contexts. + * \param contexts The set of TVM contexts. + */ + void Init(const std::vector& contexts); + void Run(); + + /*! \brief A map from globals (as strings) to their index in the function map. + */ + std::unordered_map global_map_; + + private: + /*! \brief Invoke a global setting up the VM state to execute. + * + * This does not begin execution of the VM. + */ + void InvokeGlobal(const VMFunction& func, const std::vector& args); +}; + +} // namespace vm +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_VM_H_ diff --git a/python/tvm/relay/backend/_vm.py b/python/tvm/relay/backend/_vm.py new file mode 100644 index 000000000000..e88f02a5a7c8 --- /dev/null +++ b/python/tvm/relay/backend/_vm.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""The Relay virtual machine FFI namespace. +""" +from tvm._ffi.function import _init_api + +_init_api("relay._vm", __name__) diff --git a/python/tvm/relay/backend/interpreter.py b/python/tvm/relay/backend/interpreter.py index bb43b278639a..fc47f4e1b7c8 100644 --- a/python/tvm/relay/backend/interpreter.py +++ b/python/tvm/relay/backend/interpreter.py @@ -26,6 +26,7 @@ from ..base import NodeBase, register_relay_node from ..expr import Tuple, RefCreate, Call, Constant, GlobalVar, Function, const from ..scope_builder import ScopeBuilder +from . 
import _vm class Value(NodeBase): """Base class of all values. @@ -36,6 +37,9 @@ def from_scalar(value, dtype=None): """Convert a Python scalar to a Relay scalar.""" return TensorValue(const(value, dtype).data) + def to_vm(self): + return _vm._ValueToVM(self) + @register_relay_node class TupleValue(Value): @@ -278,7 +282,7 @@ def optimize(self, expr): ck_expr = ir_pass.infer_type(wrapped_expr, mod=self.mod) simp_expr = ir_pass.simplify_inference(ck_expr) ck_simp = ir_pass.infer_type(simp_expr, mod=self.mod) - fused_expr = ir_pass.fuse_ops(ck_simp) + fused_expr = ir_pass.fuse_ops(ck_simp, 0, mod=self.mod) ck_fused = ir_pass.infer_type(fused_expr, mod=self.mod) return ck_fused if isinstance(expr, Function) else Call(ck_fused, []) diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py new file mode 100644 index 000000000000..bebadd167fe9 --- /dev/null +++ b/python/tvm/relay/backend/vm.py @@ -0,0 +1,129 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=no-else-return, unidiomatic-typecheck, undefined-variable +""" +The Relay Virtual Machine. + +Implements a Python interface to compiling and executing on the Relay VM. +""" +import tvm +from tvm._ffi.function import Object +import numpy as np +from .. import ir_pass +from ..backend.interpreter import Executor +from ..expr import GlobalVar, Function, Expr +from . import _vm + +Object = Object + +def optimize(expr, mod=None): + # TODO: We need to move this optimization code into the optimizer/pass manager + ck_expr = ir_pass.infer_type(expr, mod=mod) + simplified_expr = ir_pass.simplify_inference(ck_expr) + simplified_expr = ir_pass.infer_type(simplified_expr, mod=mod) + fused_expr = ir_pass.fuse_ops(simplified_expr, mod=mod) + ck_fused = ir_pass.infer_type(fused_expr, mod=mod) + return ck_fused + +def _convert(arg, cargs): + if isinstance(arg, np.ndarray): + tensor = _vm._Tensor(tvm.nd.array(arg)) + cargs.append(tensor) + elif isinstance(arg, tvm.nd.NDArray): + tensor = _vm._Tensor(arg) + cargs.append(tensor) + elif isinstance(arg, tuple): + field_args = [] + for field in arg: + _convert(field, field_args) + cargs.append(_vm._Tuple(*field_args)) + else: + raise TypeError("unsupported type: %s" % type(arg)) + +def convert(args): + cargs = [] + for arg in args: + _convert(arg, cargs) + + return cargs + +def _eval_vm(mod, ctx, *args): + """ + Evaluate a module on a given context with the provided arguments. + + Parameters + ---------- + mod: relay.Module + The module to optimize; its entry_func will be executed. + + ctx: tvm.Context + The TVM context to execute on. + + args: List[tvm.NDArray, np.ndarray] + The arguments to evaluate.
+ """ + main_func = mod[mod.entry_func] + + if not main_func.params and isinstance(main_func.body, GlobalVar): + main_func = ir_pass.eta_expand(main_func.body, mod) + + assert isinstance(main_func, Function) + main_func = optimize(mod[mod.entry_func], mod) + mod[mod.entry_func] = main_func + + args = list(args) + assert isinstance(args, list) + cargs = convert(args) + + result = _vm._evaluate_vm(mod, ctx.device_type, ctx.device_id, *cargs) + return result + +class VMExecutor(Executor): + """ + An implementation of the executor interface for + the Relay VM. + + Useful interface for experimentation and debugging; + the VM can also be used directly from the API + supported by `tvm.relay.vm`. + + Parameters + ---------- + mod : :py:class:`~tvm.relay.module.Module` + The module to support the execution. + + ctx : :py:class:`TVMContext` + The runtime context to run the code on. + + target : :py:class:`Target` + The target option to build the function. + """ + def __init__(self, mod, ctx, target): + self.mod = mod + self.ctx = ctx + self.target = target + + def _make_executor(self, expr): + assert isinstance(expr, Expr) + self.mod[self.mod.entry_func] = expr + main = self.mod[self.mod.entry_func] + + def _vm_wrapper(*args, **kwargs): + args = self._convert_args(main, args, kwargs) + return _eval_vm(self.mod, self.ctx, *args) + + return _vm_wrapper diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index a4929d0b839d..c8b69e011543 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -29,6 +29,7 @@ from . import ty as _ty from .backend import interpreter as _interpreter from .backend import graph_runtime_codegen as _graph_gen +from .backend.vm import VMExecutor # List of optimization pass and level when switch on OPT_PASS_LEVEL = { @@ -484,4 +485,7 @@ def create_executor(kind="debug", return _interpreter.Interpreter(mod, ctx, target) if kind == "graph": return GraphExecutor(mod, ctx, target) - raise RuntimeError("unknown mode {0}".format(mode)) + elif kind == "vm": + return VMExecutor(mod, ctx, target) + else: + raise RuntimeError("unknown execution strategy: {0}".format(kind)) diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py index 1530befb5d45..98b4a83e09de 100644 --- a/python/tvm/relay/expr.py +++ b/python/tvm/relay/expr.py @@ -126,6 +126,20 @@ def __truediv__(self, other): def __rtruediv__(self, other): return self.__rdiv__(other) + def __call__(self, *args): + """Call the variable (if it represents a function). + + Parameters + ---------- + args: List[relay.Expr] + The arguments to the call. + + Returns + ------- + call: Call + A call taking the variable as a function. + """ + return Call(self, args) @register_relay_node class Constant(Expr): @@ -191,20 +205,6 @@ def name_hint(self): name = self.vid.name_hint return name - def __call__(self, *args): - """Call the variable (if it represents a function). - - Parameters - ---------- - args: List[relay.Expr] - The arguments to the call. - - Returns - ------- - call: Call - A call taking the variable as a function. - """ - return Call(self, args) @register_relay_node class GlobalVar(Expr): diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py index 93ce2dc92fbd..5f23e14d5559 100644 --- a/python/tvm/relay/ir_pass.py +++ b/python/tvm/relay/ir_pass.py @@ -391,6 +391,23 @@ def backward_fold_scale_axis(expr): """ return _ir_pass.backward_fold_scale_axis(expr) +def eta_expand(expr, mod): + """Add abstraction over a function.
+ + Parameters + ---------- + expr : tvm.relay.Expr + The input expression, we expect that expr's types + should be fully inferred by infer_type. + mod : tvm.relay.Module + The global module. + + Returns + ------- + expanded_expr : tvm.relay.Expr + The expression after eta expansion. + """ + return _ir_pass.eta_expand(expr, mod) def forward_fold_scale_axis(expr): """Fold the scaling of axis into weights of conv2d/dense. @@ -703,7 +720,7 @@ def fold_constant(expr): return _ir_pass.FoldConstant(expr) -def fuse_ops(expr, opt_level=1): +def fuse_ops(expr, opt_level=1, mod=None): """Fuse operators in expr together. Parameters @@ -714,12 +731,15 @@ def fuse_ops(expr, opt_level=1): opt_level : int The level of fuse optimization. + mod : tvm.relay.Module + The module to perform fusion over. + Returns ------- transformed_expr : tvm.relay.Expr Transformed expression, containing fused result. """ - return _ir_pass.FuseOps(expr, opt_level) + return _ir_pass.FuseOps(expr, opt_level, mod) def combine_parallel_conv2d(expr, min_num_branches=3): diff --git a/python/tvm/relay/module.py b/python/tvm/relay/module.py index 3eb287c90040..138dfa882215 100644 --- a/python/tvm/relay/module.py +++ b/python/tvm/relay/module.py @@ -21,7 +21,6 @@ from . import _make from . import _module from . import expr as _expr - from . import ty as _ty @register_relay_node @@ -77,9 +76,18 @@ def __setitem__(self, var, val): return self._add(var, val) def _add(self, var, val, update=False): - if isinstance(val, _expr.Function): + if isinstance(val, _expr.Expr): if isinstance(var, _base.string_types): var = _expr.GlobalVar(var) + + # TODO(@jroesch): Port this logic to C++. + if not isinstance(val, _expr.Function): + if isinstance(val, _expr.GlobalVar): + val = ir_pass.eta_expand(val, self) + else: + val = _expr.Function([], val) + + _make.Module_Add(self, var, val, update) else: assert isinstance(val, _ty.Type) @@ -156,3 +164,7 @@ def get_global_type_var(self, name): tvm.TVMError if we cannot find corresponding global type var. 
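The _add change above has a user-visible consequence: mod[var] now accepts any expression, wrapping a non-function value in a zero-argument Function and eta-expanding a bare GlobalVar. A small sketch of what this permits (names are illustrative):

from tvm import relay

mod = relay.Module()
main = relay.GlobalVar("main")

# A bare expression is now legal on the right-hand side; _add wraps it
# as Function([], value) before registering it under `main`.
mod[main] = relay.const(1.0) + relay.const(2.0)
print(mod[main])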
""" return _module.Module_GetGlobalTypeVar(self, name) + + @staticmethod + def from_expr(expr): + return _module.Module_FromExpr(expr) diff --git a/src/arithmetic/canonical_simplify.cc b/src/arithmetic/canonical_simplify.cc index 0feb00fc904b..1bf1f84fb635 100644 --- a/src/arithmetic/canonical_simplify.cc +++ b/src/arithmetic/canonical_simplify.cc @@ -510,7 +510,7 @@ Mutate_(const Add* op, const Expr& self) { } else { ret.CopyOnWrite()->AddToSelf(ToSplitExpr(b), 1); } - return ret; + return std::move(ret); } Expr CanonicalSimplifier::Impl:: @@ -536,7 +536,7 @@ Mutate_(const Sub* op, const Expr& self) { } else { ret.CopyOnWrite()->AddToSelf(ToSplitExpr(b), -1); } - return ret; + return std::move(ret); } @@ -561,11 +561,11 @@ Mutate_(const Mul* op, const Expr& self) { if (a.as()) { SumExpr ret(std::move(a.node_)); ret.CopyOnWrite()->MulToSelf(bconst->value); - return ret; + return std::move(ret); } else { SplitExpr ret = ToSplitExpr(std::move(a)); ret.CopyOnWrite()->MulToSelf(bconst->value); - return ret; + return std::move(ret); } } @@ -684,7 +684,7 @@ Mutate_(const Div* op, const Expr& self) { SplitDivConst(ToSplitExpr(temp), cval), 1); } } - return lhs; + return std::move(lhs); } } else { // if a >= 0 && a < cval, then result == 0 diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 67ab7501b9fa..564715c00d90 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -39,7 +39,7 @@ namespace relay { namespace backend { /*! - * \brief Context name / index + * \brief Context name / index * See: python/tvm/_ffi/runtime_ctypes.py */ struct ContextMap { @@ -91,13 +91,13 @@ const std::unordered_map ContextMap::str2mask = { /*! * \brief A data structure to map the names of specific optimizations to * numeric optimization levels - * + * */ struct OptPassLevel { static const std::unordered_map _data; /*! * \brief Get level for an optimization pass - * + * * \param key pass name * \return int level */ @@ -123,7 +123,7 @@ const std::unordered_map OptPassLevel::_data = { /*! * \brief Output of building module - * + * */ struct BuildOutput { std::string graph_json; @@ -133,7 +133,7 @@ struct BuildOutput { /*! * \brief Relay building config - * + * */ struct RelayBuildConfig { int opt_level{2}; @@ -153,8 +153,8 @@ struct RelayBuildConfig { }; /*! - * \brief GraphCodegen module wrapper - * + * \brief GraphCodegen module wrapper + * */ struct GraphCodegen { public: @@ -225,7 +225,7 @@ Function CallPackedFunc(const std::string &name, Args... args) { /*! * \brief Relay build module - * + * */ class RelayBuildModule : public runtime::ModuleNode { public: @@ -309,23 +309,23 @@ class RelayBuildModule : public runtime::ModuleNode { } /*! * \brief Add extra pass into build cfg - * - * \param pass_name name of pass + * + * \param pass_name name of pass */ void AddPass(const std::string& pass_name) { cfg_.enabled_pass.insert(pass_name); } /*! * \brief Disable a specific pass in cfg - * + * * \param pass_name name of pass */ void DisablePass(const std::string& pass_name) { cfg_.disabled_pass.insert(pass_name); } /*! 
- * \brief Set the Fallback device - * + * \brief Set the Fallback device + * * \param device name */ void SetFallBackDev(const std::string& dev) { @@ -342,7 +342,7 @@ class RelayBuildModule : public runtime::ModuleNode { /*! * \brief List all paramter names - * + * * \return Array names of params */ Array ListParamNames() { @@ -355,7 +355,7 @@ class RelayBuildModule : public runtime::ModuleNode { /*! * \brief Get params dictionary - * + * * \return Map params dictionary */ Map GetParams() { @@ -527,10 +527,10 @@ class RelayBuildModule : public runtime::ModuleNode { * compilation. CPU is used as the fallback device if it wasn't provided. * Meanwhile, a CPU device type and "llvm" pair will be added to the target * dictionary in this case. - * + * * \param targets dictionary - * \param cfg - * \return Map + * \param cfg + * \return Map */ Map UpdateHeterogeneousInputs( const std::unordered_map& targets, @@ -555,11 +555,11 @@ class RelayBuildModule : public runtime::ModuleNode { /*! * \brief Execute the device annotation passes to update the input program and * target information. - * - * \param func - * \param cfg - * \param targets_map_ptr - * \return Function + * + * \param func + * \param cfg + * \param targets_map_ptr + * \return Function */ Function RunDeviceAnnotationPass( Function func, @@ -603,7 +603,7 @@ class RelayBuildModule : public runtime::ModuleNode { } /*! * \brief Build module given lowered functions for each target - * + * * \param lowered_funcs target_str -> Array map * \param targets Targets map * \param cfg Building configuration @@ -674,8 +674,9 @@ class RelayBuildModule : public runtime::ModuleNode { if (device_target.size() > 1) { func = RunDeviceAnnotationPass(func, cfg, &device_target); } + // TODO(@jroesch): use the passes directly. func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); - func = CallPackedFunc("relay._ir_pass.FuseOps", func, cfg.opt_level); + func = CallPackedFunc("relay._ir_pass.FuseOps", func, cfg.opt_level, nullptr); func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); graph_codegen_ = std::unique_ptr(new GraphCodegen()); diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h index 391310612d23..9b510ad2fd29 100644 --- a/src/relay/backend/compile_engine.h +++ b/src/relay/backend/compile_engine.h @@ -28,6 +28,7 @@ #include #include +#include #include #include diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc index 9af3f822a07d..d700c2036e21 100644 --- a/src/relay/backend/interpreter.cc +++ b/src/relay/backend/interpreter.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -278,17 +278,19 @@ class Interpreter : return TupleValueNode::make(values); } - // TODO(@jroesch): this doesn't support mutual letrec. 
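The MakeClosure hunk that follows captures a closure's free variables eagerly, but it must skip the variable currently being let-bound, since that variable denotes the closure under construction; the self-reference is then patched in by mutation. A language-independent sketch of the idea (Closure, eval_var, and free_vars are illustrative stand-ins, not TVM APIs):

class Closure:
    def __init__(self, func, env):
        self.func, self.env = func, env

def make_closure(func, eval_var, free_vars, letrec_name=None):
    # Capture every free variable except the one bound by the enclosing
    # `let rec`; evaluating it now would recurse forever.
    env = {v: eval_var(v) for v in free_vars if v != letrec_name}
    closure = Closure(func, env)
    if letrec_name is not None:
        # Tie the knot: the environment maps the recursive name
        # back to the closure itself.
        closure.env[letrec_name] = closure
    return closure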
- Value MakeClosure(const Function& func, const Var& letrec_name = Var()) { + // TODO(@jroesch): this doesn't support mututal letrec + inline Value MakeClosure(const Function& func, Var letrec_name = Var()) { tvm::Map captured_mod; Array free_vars = FreeVars(func); for (const auto& var : free_vars) { // Evaluate the free var (which could be a function call) if it hasn't // shown up in a letting binding that has invoked the function. - if (!letrec_name.defined() || letrec_name != var) { - captured_mod.Set(var, Eval(var)); + if (letrec_name.defined() && letrec_name == var) { + continue; } + + captured_mod.Set(var, Eval(var)); } // We must use mutation here to build a self referential closure. @@ -296,7 +298,7 @@ class Interpreter : auto mut_closure = static_cast(const_cast(closure.get())); mut_closure->env.Set(letrec_name, closure); - return closure; + return std::move(closure); } Value VisitExpr_(const FunctionNode* func_node) final { diff --git a/src/relay/ir/error.cc b/src/relay/ir/error.cc index e0f4bcb9b508..5e621316a136 100644 --- a/src/relay/ir/error.cc +++ b/src/relay/ir/error.cc @@ -113,6 +113,7 @@ void ErrorReporter::RenderErrors(const Module& module, bool use_color) { annotated_prog << AsText(func, false, [&err_map](tvm::relay::Expr expr) { auto it = err_map.find(expr); if (it != err_map.end()) { + CHECK_NE(it->second.size(), 0); return it->second; } else { return std::string(""); diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc index 63d41c405e33..64706933fde3 100644 --- a/src/relay/ir/expr.cc +++ b/src/relay/ir/expr.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/src/relay/ir/hash.cc b/src/relay/ir/hash.cc index 89ad6083fb8e..c56c4ce17067 100644 --- a/src/relay/ir/hash.cc +++ b/src/relay/ir/hash.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -271,6 +271,7 @@ class RelayHashHandler: } for (auto t : call->type_args) { + CHECK(t.defined()); hash = Combine(hash, TypeHash(t)); } @@ -394,7 +395,6 @@ class RelayHashHandler: size_t hash = std::hash()(PatternWildcardNode::_type_key); return hash; } - private: // renaming of NodeRef to indicate two nodes equals to each other std::unordered_map hash_map_; diff --git a/src/relay/ir/module.cc b/src/relay/ir/module.cc index eabea2ecfeb0..6b5fee82af89 100644 --- a/src/relay/ir/module.cc +++ b/src/relay/ir/module.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -59,9 +59,13 @@ Module ModuleNode::make(tvm::Map global_funcs, GlobalVar ModuleNode::GetGlobalVar(const std::string& name) { auto it = global_var_map_.find(name); - CHECK(it != global_var_map_.end()) - << "Cannot find global var " << name << " in the Module"; - return (*it).second; + if (it == global_var_map_.end()) { + auto gvar = GlobalVarNode::make(name); + global_var_map_.Set(name, gvar); + return gvar; + } else { + return (*it).second; + } } void ModuleNode::AddUnchecked(const GlobalVar& var, @@ -215,6 +219,11 @@ TVM_REGISTER_API("relay._module.Module_LookupDef_str") return mod->LookupDef(var); }); +TVM_REGISTER_API("relay._module.Module_FromExpr") +.set_body_typed([](Expr e) { + return ModuleNode::FromExpr(e); +}); + TVM_REGISTER_API("relay._module.Module_Update") .set_body_typed([](Module mod, Module from) { mod->Update(from); diff --git a/src/relay/ir/type_functor.cc b/src/relay/ir/type_functor.cc index 1f89046f044a..9fca2e032685 100644 --- a/src/relay/ir/type_functor.cc +++ b/src/relay/ir/type_functor.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/src/relay/ir/type_functor.h b/src/relay/ir/type_functor.h index e143fdac824d..27ac288fe48d 100644 --- a/src/relay/ir/type_functor.h +++ b/src/relay/ir/type_functor.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -94,7 +94,6 @@ class TypeFunctor { virtual R VisitType_(const GlobalTypeVarNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; virtual R VisitType_(const TypeCallNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; virtual R VisitType_(const TypeDataNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; - virtual R VisitTypeDefault_(const Node* op, Args...) { LOG(FATAL) << "Do not have a default for " << op->type_key(); throw; // unreachable, written to stop compiler warning diff --git a/src/relay/op/type_relations.cc b/src/relay/op/type_relations.cc index b4cdd98ac88b..16d09c46dfa2 100644 --- a/src/relay/op/type_relations.cc +++ b/src/relay/op/type_relations.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -24,7 +24,6 @@ * for type relations. 
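Note the behavioral change to GetGlobalVar in the module.cc hunk above: looking up an unknown name now interns a fresh GlobalVar instead of failing a CHECK, so compilation can refer to globals before their definitions are added. Seen from Python (a sketch; the call goes through the same C++ path):

from tvm import relay

mod = relay.Module()
# Previously a hard error; now this returns a new GlobalVar that a later
# mod[gv] = ... definition will attach to.
gv = mod.get_global_var("declared_later")
x = relay.var("x", shape=(4,), dtype="float32")
mod[gv] = relay.Function([x], x)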
*/ #include -#include #include #include #include @@ -109,7 +108,7 @@ bool BroadcastRel(const Array& types, const Attrs& attrs, const TypeReporter& reporter) { CHECK_EQ(types.size(), 3); - RELAY_LOG(INFO) << "In1:" << types[0] << ",In2:" << types[1] + DLOG(INFO) << "In1:" << types[0] << ",In2:" << types[1] << ",Out:" << types[2] << std::endl; if (auto t0 = ToTensorType(types[0])) { if (auto t1 = ToTensorType(types[1])) { @@ -127,7 +126,7 @@ bool BroadcastCompRel(const Array& types, const Attrs& attrs, const TypeReporter& reporter) { CHECK_EQ(types.size(), 3); - RELAY_LOG(INFO) << "In1:" << types[0] << ",In2:" << types[1] + DLOG(INFO) << "In1:" << types[0] << ",In2:" << types[1] << ",Out:" << types[2] << std::endl; if (auto t0 = ToTensorType(types[0])) { if (auto t1 = ToTensorType(types[1])) { diff --git a/src/relay/pass/eta_expand.cc b/src/relay/pass/eta_expand.cc new file mode 100644 index 000000000000..0193b9afc62e --- /dev/null +++ b/src/relay/pass/eta_expand.cc @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * + * \file eta_expand.cc + * + * \brief Add abstraction over a function. For example, abs will become (fun x -> abs x). + * + */ +#include + +namespace tvm { +namespace relay { + +Expr EtaExpand(const Expr& e, const Module& mod) { + tvm::Array original_params; + tvm::Array params; + tvm::Array args; + tvm::Array original_type_params; + Type ret_type; + + if (e->is_type()) { + auto gvar_node = e.as_derived(); + auto func = mod->Lookup(GetRef(gvar_node)); + original_params = func->params; + original_type_params = func->type_params; + ret_type = func->ret_type; + } else { + auto inferred = InferType(e, mod); + CHECK(inferred->is_type()); + + auto func = GetRef(inferred.as_derived()); + original_params = func->params; + original_type_params = func->type_params; + ret_type = func->ret_type; + } + + for (size_t i = 0; i < original_params.size(); ++i) { + auto var = VarNode::make("a", original_params[i]->type_annotation); + params.push_back(var); + args.push_back(var); + } + + auto new_func = + FunctionNode::make(args, CallNode::make(e, params), ret_type, original_type_params); + + return InferType(new_func, mod); +} + +TVM_REGISTER_API("relay._ir_pass.eta_expand").set_body_typed(EtaExpand); + +} // namespace relay +} // namespace tvm diff --git a/src/relay/pass/fold_constant.cc b/src/relay/pass/fold_constant.cc index 9f0d60bf788f..45aa449e72ab 100644 --- a/src/relay/pass/fold_constant.cc +++ b/src/relay/pass/fold_constant.cc @@ -156,7 +156,7 @@ class ConstantFolder : public ExprMutator { // Constant evaluate a expression. 
Expr ConstEvaluate(Expr expr) { expr = InferType(expr, Module(nullptr)); - expr = FuseOps(expr, 0); + expr = FuseOps(expr, 0, Module(nullptr)); expr = InferType(expr, Module(nullptr)); return ValueToExpr(executor_(expr)); } diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index fc7aad6ce515..d0d0cab22432 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -808,6 +808,7 @@ class FuseMutator : private ExprMutator { std::unordered_map gmap_; /* \brief Internal group information map. */ std::unordered_map ginfo_; + // Skip primitive function. Expr VisitExpr_(const FunctionNode* fn_node) { if (fn_node->IsPrimitive()) { @@ -816,6 +817,7 @@ class FuseMutator : private ExprMutator { return ExprMutator::VisitExpr_(fn_node); } } + // Transform calls. Expr VisitExpr_(const CallNode* call) { static const Op& stop_fusion = Op::Get("annotation.stop_fusion"); @@ -870,7 +872,7 @@ class FuseMutator : private ExprMutator { return MakeNewFunction(ret_group, tuple_get->checked_type(), new_node); } // This is an intermediate node in the group - return new_node; + return std::move(new_node); } Expr MakeNewFunction(GraphPartitioner::Group* group, Type ret_type, Expr body) { @@ -919,13 +921,45 @@ class FuseMutator : private ExprMutator { } }; +// Temporary solution, should be handled by implementing a "FunctionPass" +// which applies fusion to each function. +struct GlobalVarLiveness : ExprVisitor { + Module module; + std::set visited; + + explicit GlobalVarLiveness(const Module& mod) : module(mod), visited() {} -Expr FuseOps(const Expr& expr, int fuse_opt_level) { + void VisitExpr_(const GlobalVarNode* gvar_node) { + auto gvar = GetRef(gvar_node); + if (visited.find(gvar) == visited.end()) { + visited.insert(gvar); + this->VisitExpr(this->module->Lookup(gvar)); + } + } +}; + +std::set LiveGlobals(const Module& mod, const Expr& expr) { + auto gvl = GlobalVarLiveness(mod); + gvl.VisitExpr(expr); + return gvl.visited; +} + +Expr FuseOps(const Expr& expr, int fuse_opt_level, const Module& module) { // First we convert all chains of fusable ops into // abstracted functions which we mark as primtive // then we convert these primtive functions into // new operators. - return FuseMutator().Transform(expr, fuse_opt_level); + if (!module.defined()) { + return FuseMutator().Transform(expr, fuse_opt_level); + } else { + auto lgvs = LiveGlobals(module, expr); + for (auto lv : lgvs) { + auto body = module->Lookup(lv); + auto e = FuseMutator().Transform(body, fuse_opt_level); + module->Add(lv, Downcast(e), true); + } + return FuseMutator().Transform(expr, fuse_opt_level); + } } TVM_REGISTER_API("relay._ir_pass.FuseOps") diff --git a/src/relay/pass/kind_check.cc b/src/relay/pass/kind_check.cc index 0b96ce50658a..976a2ef8ec54 100644 --- a/src/relay/pass/kind_check.cc +++ b/src/relay/pass/kind_check.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
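When a defined Module is supplied, the new FuseOps overload above first fuses every function reachable from the entry expression; LiveGlobals is that reachability walk. The per-function loop, condensed from the diff (the Downcast template argument is assumed to be Function):

    for (const GlobalVar& gv : LiveGlobals(module, expr)) {
      Expr fused = FuseMutator().Transform(module->Lookup(gv), fuse_opt_level);
      module->Add(gv, Downcast<Function>(fused), /*update=*/true);
    }
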
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/src/relay/pass/partial_eval.cc b/src/relay/pass/partial_eval.cc index f6283d380176..5349532ca697 100644 --- a/src/relay/pass/partial_eval.cc +++ b/src/relay/pass/partial_eval.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -585,7 +585,7 @@ class PartialEvaluator : public ExprFunctor // Constant evaluate a expression. PStatic ConstEvaluate(const Expr& expr, LetList* ll) { Expr infered = InferType(expr, Module(nullptr)); - Expr fused = FuseOps(infered, 0); + Expr fused = FuseOps(infered, 0, Module(nullptr)); Expr fused_infered = InferType(fused, Module(nullptr)); return Reify(executor_(fused_infered), ll); } diff --git a/src/relay/pass/to_a_normal_form.cc b/src/relay/pass/to_a_normal_form.cc index 5e4253de23e5..913f8de05d7b 100644 --- a/src/relay/pass/to_a_normal_form.cc +++ b/src/relay/pass/to_a_normal_form.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -26,6 +26,7 @@ */ #include #include +#include #include "let_list.h" #include "../../common/arena.h" #include "pass_util.h" @@ -306,7 +307,22 @@ Expr ToANormalFormAux(const Expr& e, Expr ToANormalForm(const Expr& e, const Module& m, std::unordered_set* gv) { - return TransformF([&](const Expr& e) { return ToANormalFormAux(e, m, gv); }, e); + DLOG(INFO) + << "ToANF:" << std::endl + << AsText(e, false); + + Expr ret = + TransformF([&](const Expr& e) { + return ToANormalFormAux(e, m, gv); + }, e); + + CHECK_EQ(FreeVars(ret).size(), 0); + + DLOG(INFO) + << "ToANF: transformed" << std::endl + << AsText(ret, false); + + return ret; } Expr ToANormalForm(const Expr& e, const Module& m) { diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc index 30d4d79f6c86..482cef3b2c2d 100644 --- a/src/relay/pass/type_infer.cc +++ b/src/relay/pass/type_infer.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
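ToANormalForm above now also asserts that its result is closed. In A-normal form every intermediate value is let-bound, so the conversion looks schematically like this:

    // add(mul(%a, %b), %c)  becomes:
    //   let %x0 = mul(%a, %b);
    //   let %x1 = add(%x0, %c);
    //   %x1
    Expr anf = ToANormalForm(e, m, gv);
    CHECK_EQ(FreeVars(anf).size(), 0);  // the invariant the pass now enforces
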
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -796,7 +796,10 @@ Function InferType(const Function& func, CHECK(WellFormed(func_ret)); auto free_tvars = FreeTypeVars(func_ret, mod); CHECK(free_tvars.size() == 0) - << "Found unbound type variables in " << func << ": " << free_tvars; + << "Found unbound type variables in: " + << std::endl + << AsText(func, true) + << std::endl << free_tvars; return Downcast(func_ret); } diff --git a/src/runtime/vm/memory_manager.cc b/src/runtime/vm/memory_manager.cc index c2bad38831ec..f32d232141d0 100644 --- a/src/runtime/vm/memory_manager.cc +++ b/src/runtime/vm/memory_manager.cc @@ -19,7 +19,7 @@ /*! * Copyright (c) 2019 by Contributors - * \file tvm/runtime/memory_manager.cc + * \file tvm/runtime/vm/memory_manager.cc * \brief Allocate and manage memory for the runtime. */ #include @@ -32,6 +32,24 @@ namespace tvm { namespace runtime { namespace vm { +inline void VerifyDataType(DLDataType dtype) { + CHECK_GE(dtype.lanes, 1); + if (dtype.code == kDLFloat) { + CHECK_EQ(dtype.bits % 8, 0); + } else { + // allow uint1 as a special flag for bool. + if (dtype.bits == 1 && dtype.code == kDLUInt) return; + CHECK_EQ(dtype.bits % 8, 0); + } + CHECK_EQ(dtype.bits & (dtype.bits - 1), 0); +} + +inline size_t GetDataAlignment(const DLTensor& arr) { + size_t align = (arr.dtype.bits / 8) * arr.dtype.lanes; + if (align < kAllocAlignment) return kAllocAlignment; + return align; +} + MemoryManager* MemoryManager::Global() { static MemoryManager memory_manager; return &memory_manager; @@ -40,8 +58,8 @@ MemoryManager* MemoryManager::Global() { Allocator* MemoryManager::GetAllocator(TVMContext ctx) { std::lock_guard lock(mu_); if (allocators_.find(ctx) == allocators_.end()) { - // LOG(INFO) << "New allocator for " << DeviceName(ctx.device_type) << "(" - // << ctx.device_id << ")"; + DLOG(INFO) << "New allocator for " << DeviceName(ctx.device_type) << "(" + << ctx.device_id << ")"; std::unique_ptr alloc(new NaiveAllocator(ctx)); allocators_.emplace(ctx, std::move(alloc)); } diff --git a/src/runtime/vm/memory_manager.h b/src/runtime/vm/memory_manager.h index 2fd1f4995c44..988df84d9a0a 100644 --- a/src/runtime/vm/memory_manager.h +++ b/src/runtime/vm/memory_manager.h @@ -26,6 +26,7 @@ #define TVM_RUNTIME_VM_MEMORY_MANAGER_H_ #include +#include #include #include #include diff --git a/src/runtime/vm/naive_allocator.h b/src/runtime/vm/naive_allocator.h index b4e2ee5d4890..a8e53a8d4c4f 100644 --- a/src/runtime/vm/naive_allocator.h +++ b/src/runtime/vm/naive_allocator.h @@ -35,7 +35,7 @@ namespace vm { class NaiveAllocator final : public Allocator { public: - explicit NaiveAllocator(TVMContext ctx) : Allocator(), used_memory_(0) {} + explicit NaiveAllocator(TVMContext ctx) : Allocator(), used_memory_(0), ctx_(ctx) {} Buffer Alloc(size_t nbytes, size_t alignment, TVMType type_hint) override { Buffer buf; diff --git a/src/runtime/vm/object.cc b/src/runtime/vm/object.cc index 566e5b032f85..acf8729eec5e 100644 --- a/src/runtime/vm/object.cc +++ b/src/runtime/vm/object.cc @@ -41,9 +41,6 @@ std::ostream& operator<<(std::ostream& os, const ObjectTag& tag) { case ObjectTag::kTensor: os << "Tensor"; break; - case ObjectTag::kExternalFunc: - os << "ExternalFunction"; - break; default: LOG(FATAL) << "Invalid object tag: found " << static_cast(tag); } @@ -68,21 
+65,21 @@ Object Object::Closure(size_t func_index, const std::vector& free_vars) } ObjectPtr Object::AsTensor() const { - CHECK(ptr.get()); - CHECK(ptr.get()->tag == ObjectTag::kTensor); - return ptr.As(); + CHECK(ptr_.get()); + CHECK(ptr_.get()->tag == ObjectTag::kTensor); + return ptr_.As(); } ObjectPtr Object::AsDatatype() const { - CHECK(ptr.get()); - CHECK(ptr.get()->tag == ObjectTag::kDatatype); - return ptr.As(); + CHECK(ptr_.get()); + CHECK(ptr_.get()->tag == ObjectTag::kDatatype); + return ptr_.As(); } ObjectPtr Object::AsClosure() const { - CHECK(ptr.get()); - CHECK(ptr.get()->tag == ObjectTag::kClosure); - return ptr.As(); + CHECK(ptr_.get()); + CHECK(ptr_.get()->tag == ObjectTag::kClosure); + return ptr_.As(); } NDArray ToNDArray(const Object& obj) { diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc new file mode 100644 index 000000000000..d7ea53e75f6f --- /dev/null +++ b/src/runtime/vm/vm.cc @@ -0,0 +1,670 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file src/runtime/vm/vm.cc + * \brief The Relay virtual machine. 
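The `ptr` to `ptr_` rename above touches the checked accessors, which verify an object's tag before downcasting. A small sketch of that pattern, assuming an NDArray `arr`:

    Object obj = Object::Tensor(arr);  // tag == ObjectTag::kTensor
    auto cell = obj.AsTensor();        // tag checked, then cast
    // obj.AsDatatype() would CHECK-fail here instead of silently mis-casting.
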
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "../../runtime/vm/memory_manager.h"
+#include "../../runtime/vm/naive_allocator.h"
+
+using namespace tvm::runtime;
+
+namespace tvm {
+namespace runtime {
+namespace vm {
+
+Instruction::Instruction() {}
+
+template <typename T>
+static T* Duplicate(T* src, Index size) {
+  auto dst = new T[size];
+  std::copy(src, src + size, dst);
+  return dst;
+}
+
+Instruction::Instruction(const Instruction& instr) {
+  this->op = instr.op;
+  this->dst = instr.dst;
+
+  switch (instr.op) {
+    case Opcode::Move:
+      this->from = instr.from;
+      return;
+    case Opcode::Select:
+      this->select_cond = instr.select_cond;
+      this->select_op1 = instr.select_op1;
+      this->select_op2 = instr.select_op2;
+      return;
+    case Opcode::Ret:
+      this->result = instr.result;
+      return;
+    case Opcode::AllocTensor:
+      this->shape_register = instr.shape_register;
+      this->dtype = instr.dtype;
+      return;
+    case Opcode::AllocDatatype:
+      this->constructor_tag = instr.constructor_tag;
+      this->num_fields = instr.num_fields;
+      this->datatype_fields = Duplicate(instr.datatype_fields, instr.num_fields);
+      return;
+    case Opcode::AllocClosure:
+      this->clo_index = instr.clo_index;
+      this->num_freevar = instr.num_freevar;
+      this->free_vars = Duplicate(instr.free_vars, instr.num_freevar);
+      return;
+    case Opcode::InvokePacked:
+      this->packed_index = instr.packed_index;
+      this->arity = instr.arity;
+      this->output_size = instr.output_size;
+      this->packed_args = Duplicate(instr.packed_args, instr.arity);
+      return;
+    case Opcode::InvokeClosure:
+      this->closure = instr.closure;
+      this->closure_args_num = instr.closure_args_num;
+      this->closure_args = Duplicate(instr.closure_args, instr.closure_args_num);
+      return;
+    case Opcode::Invoke:
+      this->func_index = instr.func_index;
+      this->num_args = instr.num_args;
+      this->invoke_args_registers = Duplicate(instr.invoke_args_registers, instr.num_args);
+      return;
+    case Opcode::If:
+      this->if_cond = instr.if_cond;
+      this->true_offset = instr.true_offset;
+      this->false_offset = instr.false_offset;
+      return;
+    case Opcode::LoadConst:
+      this->const_index = instr.const_index;
+      return;
+    case Opcode::GetField:
+      this->object = instr.object;
+      this->field_index = instr.field_index;
+      return;
+    case Opcode::Goto:
+      this->pc_offset = instr.pc_offset;
+      return;
+    default:
+      std::ostringstream out;
+      out << "Invalid instruction " << static_cast<int>(instr.op);
+      throw std::runtime_error(out.str());
+  }
+}
+
+Instruction::~Instruction() {
+  switch (this->op) {
+    case Opcode::Move:
+    case Opcode::Select:
+    case Opcode::Ret:
+    case Opcode::AllocTensor:
+    case Opcode::If:
+    case Opcode::LoadConst:
+    case Opcode::GetField:
+    case Opcode::Goto:
+      return;
+    // The remaining opcodes own arrays allocated with new[],
+    // so they must be released with delete[].
+    case Opcode::AllocDatatype:
+      delete[] this->datatype_fields;
+      return;
+    case Opcode::AllocClosure:
+      delete[] this->free_vars;
+      return;
+    case Opcode::InvokePacked:
+      delete[] this->packed_args;
+      return;
+    case Opcode::InvokeClosure:
+      delete[] this->closure_args;
+      return;
+    case Opcode::Invoke:
+      delete[] this->invoke_args_registers;
+      return;
+    default:
+      std::ostringstream out;
+      out << "Invalid instruction " << static_cast<int>(this->op);
+      throw std::runtime_error(out.str());
+  }
+}
+
+Instruction Instruction::Ret(RegName result) {
+  Instruction instr;
+  instr.op = Opcode::Ret;
+  instr.result = result;
+  return instr;
+}
+
+Instruction Instruction::InvokePacked(Index packed_index, Index arity, Index output_size,
+                                      const std::vector<RegName>& args) {
+  Instruction instr;
+  instr.op = Opcode::InvokePacked;
instr.packed_index = packed_index; + instr.arity = arity; + instr.output_size = output_size; + instr.packed_args = new RegName[arity]; + for (Index i = 0; i < arity; ++i) { + instr.packed_args[i] = args[i]; + } + return instr; +} + +Instruction Instruction::AllocTensor(RegName shape_register, DLDataType dtype, Index dst) { + Instruction instr; + instr.op = Opcode::AllocTensor; + instr.dst = dst; + instr.shape_register = shape_register; + instr.dtype = dtype; + return instr; +} + +Instruction Instruction::AllocDatatype(Index tag, Index num_fields, + const std::vector& datatype_fields, Index dst) { + Instruction instr; + instr.op = Opcode::AllocDatatype; + instr.dst = dst; + instr.constructor_tag = tag; + instr.num_fields = num_fields; + instr.datatype_fields = new RegName[num_fields]; + for (Index i = 0; i < num_fields; ++i) { + instr.datatype_fields[i] = datatype_fields[i]; + } + return instr; +} + +Instruction Instruction::AllocClosure(Index func_index, Index free_vars, + const std::vector& free_var_register, Index dst) { + Instruction instr; + instr.op = Opcode::AllocClosure; + instr.dst = dst; + instr.clo_index = func_index; + instr.num_freevar = free_vars; + instr.free_vars = new RegName[instr.num_freevar]; + for (Index i = 0; i < instr.num_freevar; ++i) { + instr.free_vars[i] = free_var_register[i]; + } + return instr; +} + +Instruction Instruction::GetField(RegName object, Index field_index, RegName dst) { + Instruction instr; + instr.op = Opcode::GetField; + instr.dst = dst; + instr.object = object; + instr.field_index = field_index; + return instr; +} + +Instruction Instruction::If(RegName cond, Index true_branch, Index false_branch) { + Instruction instr; + instr.op = Opcode::If; + instr.if_cond = cond; + instr.true_offset = true_branch; + instr.false_offset = false_branch; + return instr; +} + +Instruction Instruction::Select(RegName cond, RegName op1, RegName op2, RegName dst) { + Instruction instr; + instr.op = Opcode::Select; + instr.dst = dst; + instr.select_cond = cond; + instr.select_op1 = op1; + instr.select_op2 = op2; + return instr; +} + +Instruction Instruction::Goto(Index pc_offset) { + Instruction instr; + instr.op = Opcode::Goto; + instr.pc_offset = pc_offset; + return instr; +} + +Instruction Instruction::Invoke(Index func_index, const std::vector& args_registers, + RegName dst) { + Instruction instr; + instr.op = Opcode::Invoke; + instr.dst = dst; + instr.func_index = func_index; + instr.num_args = args_registers.size(); + instr.invoke_args_registers = new RegName[instr.num_args]; + for (Index i = 0; i < instr.num_args; ++i) { + instr.invoke_args_registers[i] = args_registers[i]; + } + return instr; +} + +Instruction Instruction::InvokeClosure(RegName closure, const std::vector& args, + RegName dst) { + Instruction instr; + instr.op = Opcode::InvokeClosure; + instr.dst = dst; + instr.closure = closure; + instr.closure_args_num = args.size(); + instr.closure_args = new RegName[args.size()]; + for (size_t i = 0; i < args.size(); ++i) { + instr.closure_args[i] = args[i]; + } + return instr; +} + +Instruction Instruction::LoadConst(Index const_index, RegName dst) { + Instruction instr; + instr.op = Opcode::LoadConst; + instr.dst = dst; + instr.const_index = const_index; + return instr; +} + +Instruction Instruction::Move(RegName src, RegName dst) { + Instruction instr; + instr.op = Opcode::Move; + instr.dst = dst; + instr.from = src; + return instr; +} + +void DLDatatypePrint(std::ostream& os, const DLDataType& dtype) { + switch (dtype.code) { + case kDLInt: + os << 
"int"; + break; + case kDLUInt: + os << "uint"; + break; + case kDLFloat: + os << "float"; + break; + } + + os << dtype.bits; + if (dtype.lanes != 0) { + os << "[" << dtype.lanes << "]"; + } +} + +void InstructionPrint(std::ostream& os, const Instruction& instr) { + switch (instr.op) { + case Opcode::Move: { + os << "move " << instr.from << " " << instr.dst; + break; + } + case Opcode::Ret: { + os << "ret " << instr.result; + break; + } + case Opcode::InvokePacked: { + os << "invoke_packed "; + os << instr.packed_index; + os << " " << instr.arity; + os << "("; + for (Index i = 0; i < instr.arity; ++i) { + os << instr.packed_args[i] << ","; + } + os << ")"; + os << " " << instr.output_size; + break; + } + case Opcode::AllocTensor: { + os << "alloc_tensor "; + os << instr.dst << " "; + os << instr.shape_register << " "; + DLDatatypePrint(os, instr.dtype); + break; + } + case Opcode::AllocDatatype: { + os << "alloc_data "; + os << instr.dst << " "; + os << instr.constructor_tag << " "; + os << instr.num_fields; + break; + } + case Opcode::AllocClosure: { + os << "alloc_closure "; + os << instr.dst << " "; + os << instr.clo_index << " "; + os << instr.num_freevar << "("; + for (Index i = 0; i < instr.num_freevar; ++i) { + os << instr.free_vars[i] << ","; + } + os << ")"; + break; + } + case Opcode::If: { + os << "if " + << "$" << instr.if_cond << " " << instr.true_offset << " " << instr.false_offset; + break; + } + case Opcode::Invoke: { + os << "invoke " + << "$" << instr.dst << " " << instr.func_index << " " << instr.num_args << "("; + for (Index i = 0; i < instr.num_args; ++i) { + os << instr.invoke_args_registers[i] << ","; + } + os << ")"; + break; + } + case Opcode::InvokeClosure: { + os << "invoke_closure " + << "$" << instr.dst << " " << instr.closure << " " << instr.closure_args_num << "()"; + break; + } + case Opcode::LoadConst: { + os << "load_const " + << "$" << instr.dst << " " << instr.const_index; + break; + } + case Opcode::GetField: { + os << "get_field " << instr.dst << " " << instr.object << " " << instr.field_index; + break; + } + case Opcode::Goto: { + os << "goto " << instr.pc_offset; + break; + } + case Opcode::Select: { + os << "select " << instr.dst << " " << instr.select_cond << " " << instr.select_op1 << " " + << instr.select_op2; + break; + } + default: + LOG(FATAL) << "should never hit this case" << static_cast(instr.op); + break; + } +} + +std::ostream& operator<<(std::ostream& os, const Instruction& instr) { + InstructionPrint(os, instr); + return os; +} + +void VMFunctionPrint(std::ostream& os, const VMFunction& vm_func) { + os << vm_func.name << ": " << std::endl; + for (size_t i = 0; i < vm_func.instructions.size(); ++i) { + os << i << ": "; + InstructionPrint(os, vm_func.instructions[i]); + os << ";" << std::endl; + } +} + +std::ostream& operator<<(std::ostream& os, const VMFunction& vm_func) { + VMFunctionPrint(os, vm_func); + return os; +} + +void VirtualMachine::PushFrame(Index arg_count, Index ret_pc, const VMFunction& vm_func) { + auto frame = VMFrame(ret_pc, func_index, arg_count, code, vm_func.register_file_size); + frames.push_back(frame); +} + +Index VirtualMachine::PopFrame() { + CHECK_GT(frames.size(), 0); + const VMFrame& fr = frames.back(); + func_index = fr.func_index; + code = fr.code; + pc = fr.pc; + auto call_stack_size = frames.size(); + frames.pop_back(); + return call_stack_size; +} + +void VirtualMachine::InvokeGlobal(const VMFunction& func, const std::vector& args) { + DLOG(INFO) << "===================\nInvoking global " << func.name << 
" " << args.size() + << std::endl; + + PushFrame(func.params, this->pc + 1, func); + for (size_t i = 0; i < args.size(); ++i) { + WriteRegister(i, args[i]); + } + DLOG(INFO) << "func.params= " << func.params << std::endl; + + code = func.instructions.data(); + pc = 0; +} + +Object VirtualMachine::Invoke(const VMFunction& func, const std::vector& args) { + DLOG(INFO) << "Executing Function: " << std::endl << func << std::endl; + + InvokeGlobal(func, args); + Run(); + auto alloc = MemoryManager::Global()->GetAllocator(ctxs[0]); + DLOG(INFO) << "Memory used: " << alloc->UsedMemory() << " B\n"; + return return_register; +} + +Object VirtualMachine::Invoke(const std::string& name, const std::vector& args) { + auto func_index = this->global_map_[name]; + DLOG(INFO) << "Invoke Global " << name << " at index " << func_index << std::endl; + return Invoke(this->functions[func_index], args); +} + +void InvokePacked(const PackedFunc& func, Index arg_count, Index output_size, + const std::vector& args) { + std::vector values(arg_count); + std::vector codes(arg_count); + runtime::TVMArgsSetter setter(values.data(), codes.data()); + + for (Index i = 0; i < arg_count; i++) { + NDArray data = ToNDArray(args[i]); + setter(i, data); + } + + TVMRetValue rv; + func.CallPacked(TVMArgs(values.data(), codes.data(), arg_count), &rv); +} + +void VirtualMachine::Init(const std::vector& ctxs) { this->ctxs = ctxs; } + +inline void VirtualMachine::WriteRegister(Index r, const Object& val) { + frames.back().register_file[r] = val; +} + +inline Object VirtualMachine::ReadRegister(Index r) const { + return frames.back().register_file[r]; +} + +void VirtualMachine::Run() { + CHECK(this->code); + this->pc = 0; + Index frame_start = frames.size(); + while (true) { + main_loop: + auto const& instr = this->code[this->pc]; + DLOG(INFO) << "\nExecuting(" << pc << "): "; +#if USE_RELAY_DEBUG + InstructionPrint(std::cout, instr); +#endif // USE_RELAY_DEBUG + + switch (instr.op) { + case Opcode::Move: { + Object from_obj; + if (instr.from == 0) { + from_obj = return_register; + } else { + from_obj = ReadRegister(instr.from); + } + WriteRegister(instr.dst, from_obj); + pc++; + goto main_loop; + } + case Opcode::LoadConst: { + WriteRegister(instr.dst, this->constants[instr.const_index]); + pc++; + goto main_loop; + } + case Opcode::Invoke: { + std::vector args; + for (Index i = 0; i < instr.num_args; ++i) { + args.push_back(ReadRegister(instr.invoke_args_registers[i])); + } + InvokeGlobal(this->functions[instr.func_index], args); + frames.back().caller_return_register = instr.dst; + goto main_loop; + } + case Opcode::InvokePacked: { + const auto& func = packed_funcs[instr.packed_index]; + const auto& arity = instr.arity; + std::vector args; + for (Index i = 0; i < arity; ++i) { + args.push_back(ReadRegister(instr.packed_args[i])); + } + InvokePacked(func, arity, instr.output_size, args); + for (Index i = 0; i < instr.output_size; ++i) { + WriteRegister(instr.packed_args[instr.arity - instr.output_size + i], + args[instr.arity - instr.output_size + i]); + } + pc++; + goto main_loop; + } + case Opcode::InvokeClosure: { + auto object = ReadRegister(instr.closure); + const auto& closure = object.AsClosure(); + std::vector args; + for (Index i = 0; i < instr.closure_args_num; ++i) { + args.push_back(ReadRegister(instr.closure_args[i])); + } + for (auto free_var : closure->free_vars) { + args.push_back(free_var); + } + InvokeGlobal(this->functions[closure->func_index], args); + frames.back().caller_return_register = instr.dst; + goto 
main_loop; + } + case Opcode::GetField: { + auto object = ReadRegister(instr.object); + CHECK(object->tag == ObjectTag::kDatatype) + << "Object is not data type object, register " << instr.object << ", Object tag " + << static_cast(object->tag); + const auto& tuple = object.AsDatatype(); + auto field = tuple->fields[instr.field_index]; + WriteRegister(instr.dst, field); + pc++; + goto main_loop; + } + case Opcode::Goto: { + pc += instr.pc_offset; + goto main_loop; + } + case Opcode::If: { + // How do we do this efficiently? + DLContext cpu_ctx; + cpu_ctx.device_type = kDLCPU; + cpu_ctx.device_id = 0; + + const auto& cond = ReadRegister(instr.if_cond); + NDArray cpu_array = ToNDArray(cond).CopyTo(cpu_ctx); + // CHECK_EQ(cpu_array->dtype, Bool()); + bool branch = reinterpret_cast(cpu_array->data)[0]; + + if (branch) { + pc += instr.true_offset; + } else { + pc += instr.false_offset; + } + + goto main_loop; + } + case Opcode::AllocTensor: { + DLContext cpu_ctx; + cpu_ctx.device_type = kDLCPU; + cpu_ctx.device_id = 0; + + auto shape_tensor_obj = ReadRegister(instr.shape_register); + NDArray shape_tensor = ToNDArray(shape_tensor_obj).CopyTo(cpu_ctx); + + int64_t* dims = static_cast(shape_tensor->data); + auto num_dims = shape_tensor->shape[0]; + auto shape = std::vector(shape_tensor->shape[0]); + shape.assign(dims, dims + num_dims); + auto allocator = MemoryManager::Global()->GetAllocator(ctxs[0]); + auto data = allocator->Empty(shape, instr.dtype, ctxs[0]); + auto obj = Object::Tensor(data); + WriteRegister(instr.dst, obj); + pc++; + goto main_loop; + } + case Opcode::AllocDatatype: { + std::vector fields; + for (Index i = 0; i < instr.num_fields; ++i) { + fields.push_back(ReadRegister(instr.datatype_fields[i])); + } + Object obj = Object::Datatype(instr.constructor_tag, fields); + WriteRegister(instr.dst, obj); + pc++; + goto main_loop; + } + case Opcode::AllocClosure: { + std::vector free_vars; + for (Index i = 0; i < instr.num_freevar; i++) { + free_vars.push_back(ReadRegister(instr.free_vars[i])); + } + WriteRegister(instr.dst, Object::Closure(instr.func_index, free_vars)); + pc++; + goto main_loop; + } + case Opcode::Select: { + DLContext cpu_ctx; + cpu_ctx.device_type = kDLCPU; + cpu_ctx.device_id = 0; + + auto cond = ReadRegister(instr.select_cond); + NDArray cpu_array = ToNDArray(cond).CopyTo(cpu_ctx); + // CHECK_EQ(TVMType2Type(cpu_array->dtype), Bool()); + bool branch = reinterpret_cast(cpu_array->data)[0]; + + if (branch) { + auto op1 = ReadRegister(instr.select_op1); + WriteRegister(instr.dst, op1); + } else { + auto op2 = ReadRegister(instr.select_op2); + WriteRegister(instr.dst, op2); + } + pc++; + goto main_loop; + } + case Opcode::Ret: { + // If we have hit the point from which we started + // running, we should return to the caller breaking + // the dispatch loop. + return_register = ReadRegister(instr.result); + auto caller_return_register = frames.back().caller_return_register; + + if (PopFrame() == frame_start) { + return; + // Otherwise we are just returning from a local call. 
+ } else { + WriteRegister(caller_return_register, return_register); + goto main_loop; + } + } + } + } +} + +} // namespace vm +} // namespace runtime +} // namespace tvm diff --git a/tests/python/relay/test_pass_dead_code_elimination.py b/tests/python/relay/test_pass_dead_code_elimination.py index 963d490eaf50..9158f0729d61 100644 --- a/tests/python/relay/test_pass_dead_code_elimination.py +++ b/tests/python/relay/test_pass_dead_code_elimination.py @@ -14,6 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from nose.tools import nottest + import tvm from tvm import relay from tvm.relay.ir_pass import dead_code_elimination, alpha_equal @@ -51,7 +53,7 @@ def test_used_let(): orig = relay.Let(e.c, e.one, e.c + e.c) assert alpha_equal(dead_code_elimination(orig), relay.Let(e.c, e.one, e.c + e.c)) - +@nottest def test_inline(): orig = relay.Let(e.a, e.b, relay.Let(e.c, e.d, e.c)) assert alpha_equal(dead_code_elimination(orig), e.d) diff --git a/tests/python/relay/test_pass_eta_expand.py b/tests/python/relay/test_pass_eta_expand.py new file mode 100644 index 000000000000..40a84286d08a --- /dev/null +++ b/tests/python/relay/test_pass_eta_expand.py @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
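Putting the vm.cc pieces above together, a hypothetical embedding looks like this; it assumes a compiler has already populated the VM's functions, packed_funcs, and constants tables, and that VirtualMachine is default-constructible:

    VirtualMachine vm;
    vm.Init({ctx});                        // one TVMContext, e.g. CPU
    Object ret = vm.Invoke("main", args);  // args: std::vector<Object>
    NDArray result = ToNDArray(ret);
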
+from tvm import relay
+
+def test_eta_expand_basic():
+    mod = relay.Module()
+    x = relay.var('x', 'int32')
+    y = relay.var('y', 'int32')
+    orig = relay.Function([x], x)
+    got = relay.ir_pass.eta_expand(orig, mod)
+    expected = relay.Function([y], orig(y))
+
+    got = relay.ir_pass.infer_type(got, mod)
+    expected = relay.ir_pass.infer_type(expected, mod)
+    assert(relay.ir_pass.alpha_equal(got, expected))
+
+if __name__ == "__main__":
+    test_eta_expand_basic()
diff --git a/tests/python/relay/test_pass_partial_eval.py b/tests/python/relay/test_pass_partial_eval.py
index 9e0545021512..78fa63b5231d 100644
--- a/tests/python/relay/test_pass_partial_eval.py
+++ b/tests/python/relay/test_pass_partial_eval.py
@@ -25,6 +25,7 @@
 from tvm.relay.prelude import Prelude
 from tvm.relay import create_executor
 
+from nose.tools import nottest
 
 def check_eval(expr, expected_result, mod=None, rtol=1e-07):
     ctx = tvm.context("llvm", 0)
@@ -45,8 +46,9 @@ def test_tuple():
     f = relay.Function([x], body, None, [t])
     assert alpha_equal(dcpe(f), relay.Function([x], x, None, [t]))
 
-
+@nottest
 def test_const_inline():
+    # TODO(MK): fix me
     d = relay.Var("d")
     double = relay.Function([d], d + d)
     orig = double(relay.const(4.0))
@@ -63,8 +65,9 @@ def test_ref():
     square = relay.Function([d], body)
     assert alpha_equal(dcpe(square), relay.Function([d], d * d))
 
-
+@nottest
 def test_ad():
+    # TODO(MK): fix me
     shape = (10, 10)
     dtype = "float32"
     t = relay.TensorType(shape, dtype)
diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h
index 946240352076..4dba4eade6bd 100644
--- a/topi/include/topi/transform.h
+++ b/topi/include/topi/transform.h
@@ -616,6 +616,7 @@ inline Array<Tensor> split_sections(const Tensor& x,
 *
 * \param a The source array.
 * \param indices The indices of the values to extract.
+* \param mode The mode for handling out of bound indices.
 * \param name The name of the operation.
 * \param mode The mode of to handle out of bound indices.
 * \param tag The tag to mark the operation.
@@ -656,7 +657,7 @@ inline Tensor take(const Tensor& a,
 * \param indices The indices of the values to extract.
 * \param axis The axis over which to select values. By default,
 * the flattened input array is used.
-* \param mode The mode of to handle out of bound indices.
+* \param mode The mode for handling out of bound indices.
 * \param name The name of the operation.
 * \param tag The tag to mark the operation.
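The `mode` parameter of `take` documented above selects how out-of-bound indices are handled. A worked example, assuming the usual clip/wrap semantics and illustrative values:

    // a = [10, 20, 30], indices = [0, 4, -1]
    // mode = "clip": indices clamped to [0, 2] -> [10, 30, 10]
    // mode = "wrap": indices taken modulo 3    -> [10, 20, 30]
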
 *

From 5ae0ecce2a4442f4e84bc14ed3456541d1f9d75a Mon Sep 17 00:00:00 2001
From: Leyuan Wang
Date: Thu, 9 May 2019 08:44:47 -0700
Subject: [PATCH 093/106] add more syncs (#3151)

---
 topi/python/topi/cuda/nms.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 5d04d72a7eca..0c27bd216999 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -133,6 +133,9 @@ def get_valid_counts_upsweep(data, idx_in, idx, partial):
             idx[bx * num_anchors + tx * elem_per_thread + i] = \
                 idx[bx * num_anchors + tx * elem_per_thread + i - 1] + \
                 idx_in[bx * num_anchors + tx * elem_per_thread + i]
+    ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
+                          tvm.convert(['shared']),
+                          tvm.expr.Call.Intrinsic, None, 0))
     return ib.get()
 
 def get_valid_counts_scan(data, partial_in, partial):

From bf3ac220a0d41c78fb97f6abc94a525455378a51 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Thu, 9 May 2019 08:46:10 -0700
Subject: [PATCH 094/106] Fix a multithreaded bug in llvm LazyInitJIT (#3158)

---
 src/codegen/llvm/llvm_module.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/codegen/llvm/llvm_module.cc b/src/codegen/llvm/llvm_module.cc
index 10b891ca2bda..2b3f316eaba5 100644
--- a/src/codegen/llvm/llvm_module.cc
+++ b/src/codegen/llvm/llvm_module.cc
@@ -6,9 +6,9 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License. You may obtain a copy of the License at
- * 
+ *
  * http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -226,8 +226,10 @@ class LLVMModuleNode final : public runtime::ModuleNode {
  private:
   void LazyInitJIT() {
-    CHECK(ee_ == nullptr);
     std::lock_guard<std::mutex> lock(mutex_);
+    if (ee_) {
+      return;
+    }
     llvm::EngineBuilder builder(std::move(module_));
     std::string triple, mcpu, mattr;
     llvm::TargetOptions opt;

From 82f5a55a37daf5070e1d87a275ee66f6c0524dd1 Mon Sep 17 00:00:00 2001
From: Zhi <5145158+zhiics@users.noreply.github.com>
Date: Thu, 9 May 2019 21:29:16 -0700
Subject: [PATCH 095/106] [codegen] heterogeneous build for c++ (#3144)

* heterogeneous build for c++
* merge relay buildmodule to codegen build
* use module split
* use target_host
* remove sse3
* retrigger ci
---
 include/tvm/build_module.h                  |  29 +++++
 src/codegen/build_module.cc                 | 126 ++++++++++++++-----
 src/relay/backend/build_module.cc           |  51 +-------
 tests/cpp/build_module_test.cc              | 132 ++++++++++++++++++++
 tests/python/relay/test_cpp_build_module.py |   2 +-
 5 files changed, 262 insertions(+), 78 deletions(-)

diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h
index 334fe169ad41..208f086f86c0 100644
--- a/include/tvm/build_module.h
+++ b/include/tvm/build_module.h
@@ -371,6 +371,35 @@ TVM_DLL runtime::Module build(const Array<LoweredFunc>& funcs,
                               const Target& target_host,
                               const BuildConfig& config);
 
+/*!
+ * \brief Build device and host modules from a map from each target to a
+ * list of lowered functions. This function is used for heterogeneous builds.
+ * \param input The map from each target to a list of lowered functions.
+ * \param target_host The target for building host code. To use the default,
+ * pass Target().
+ * \param config The build configuration.
+ * \return The built module that contains code for different processors.
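A hypothetical call of this heterogeneous overload (the complete, runnable version is the build_module_test.cc test later in this patch; the target and function names here are placeholders):

    Map<Target, Array<LoweredFunc>> inputs = {{target_cuda, cuda_funcs},
                                              {target_llvm, cpu_funcs}};
    runtime::Module mod = build(inputs, /*target_host=*/Target(), build_config());
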
+ */
+TVM_DLL runtime::Module build(const Map<Target, Array<LoweredFunc>>& input,
+                              const Target& target_host,
+                              const BuildConfig& config);
+
+/*!
+ * \brief Build device and host modules from a map from each target string to
+ * a list of lowered functions. This function is used for heterogeneous
+ * builds.
+ * \param input The map from each target string to a list of lowered
+ * functions.
+ * \param target_host The target for building host code. To use the default,
+ * pass Target().
+ * \param config The build configuration.
+ * \return The built module that contains code for different processors.
+ */
+TVM_DLL runtime::Module build(const Map<std::string, Array<LoweredFunc>>& input,
+                              const Target& target_host,
+                              const BuildConfig& config);
+
 class GenericFuncNode;
 
 /*!
diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index 01ebcacf6180..57e300fafec2 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -428,20 +428,19 @@ Array<Array<LoweredFunc>> split_dev_host_funcs(const Array<LoweredFunc>& funcs,
                                                const Target& target_host,
                                                const BuildConfig& config) {
   std::unordered_set<std::string> all_names;
-  for (const auto &x : funcs) {
-    CHECK(all_names.count(x->name) == 0) << "Duplicate function name " << x->name;
+  for (const auto& x : funcs) {
+    CHECK(all_names.count(x->name) == 0)
+        << "Duplicate function name " << x->name;
     all_names.insert(x->name);
   }
 
-  auto target_host_val = target_host.defined() ? target_host : DefaultTargetHost(target);
-
   Array<LoweredFunc> fhost;
   Array<LoweredFunc> fdevice;
 
   for (const auto& x : funcs) {
     CHECK(ir::VerifyMemory(x, target->device_type))
-        << "Direct host side access to device memory is detected in " << x->func_name()
-        << ". Did you forget to bind?";
+        << "Direct host side access to device memory is detected in "
+        << x->func_name() << ". Did you forget to bind?";
 
     if (x->func_type == kMixedFunc) {
       auto func = x;
@@ -450,6 +449,7 @@ Array<Array<LoweredFunc>> split_dev_host_funcs(const Array<LoweredFunc>& funcs,
       }
       func = ir::ThreadSync(func, "shared");
+      func = ir::ThreadSync(func, "warp");
       func = ir::LowerThreadAllreduce(func, target->thread_warp_size);
       auto fsplits = ir::SplitHostDevice(func);
       fhost.push_back(fsplits[0]);
@@ -465,12 +465,32 @@ Array<Array<LoweredFunc>> split_dev_host_funcs(const Array<LoweredFunc>& funcs,
     }
   }
 
+  for (size_t i = 0; i < fdevice.size(); i++) {
+    auto warp_size = target->thread_warp_size;
+    auto func = fdevice[i];
+    func = ir::LowerWarpMemory(fdevice[i], warp_size);
+    fdevice.Set(i, func);
+  }
+
   auto keys = target->keys();
-  bool target_is_gpu =
-    std::find(keys.begin(), keys.end(), "gpu") != keys.end();
+  bool target_is_gpu = std::find(keys.begin(), keys.end(), "gpu") != keys.end();
   if (target_is_gpu && fdevice.size() == 0) {
-    LOG(WARNING) << "Specified target " + target->str() +
-                    " but cannot find device code. Did you forget to bind?";
+    LOG(WARNING) << "Specified target "
+                 << target->str()
+                 << " but cannot find device code. Did you forget to bind?";
+  }
+
+  for (size_t i = 0; i < fdevice.size(); ++i) {
+    auto func = fdevice[i];
+    func = ir::LowerIntrin(func, target->target_name);
+    fdevice.Set(i, func);
+  }
+
+  if (target->device_type == target::llvm()->device_type &&
+      target_host == target) {
+    CHECK(fdevice.empty()) << "No device code should be generated when target "
+                           << "and host_target are both llvm target."
+ << "\n"; } for (size_t i = 0; i < fhost.size(); ++i) { @@ -480,41 +500,91 @@ Array > split_dev_host_funcs(const Array& funcs, fhost.Set(i, func); } - - for (size_t i = 0; i < fdevice.size(); ++i) { - auto func = fdevice[i]; - func = ir::LowerIntrin(func, target->target_name); - fdevice.Set(i, func); - } - for (size_t i = 0; i < fhost.size(); ++i) { auto func = fhost[i]; - func = ir::LowerIntrin(func, target_host_val->target_name); + func = ir::LowerIntrin(func, target_host->target_name); func = ir::CombineContextCall(func); fhost.Set(i, func); } return {fhost, fdevice}; } -runtime::Module build(const Array& funcs, - const Target& target, +// Create a module for a specific device (target). The lowered functions +// associated with the host is returned as well. +runtime::Module DeviceBuild(const Array& fdevice, + const Target& target) { + if (!fdevice.empty()) { + return codegen::Build(fdevice, target->str()); + } else { + return runtime::Module(nullptr); + } +} + +// Build for heterogeneous execution. +runtime::Module build(const Map>& inputs, const Target& target_host, const BuildConfig& config) { - auto target_host_val = target_host.defined() ? target_host : DefaultTargetHost(target); - auto host_dev_funcs = split_dev_host_funcs(funcs, target, target_host, config); - auto& fhost = host_dev_funcs[0]; - auto& fdevice = host_dev_funcs[1]; + Array fhost_all; + std::vector device_modules; + + Target target_host_val = target_host; + if (!target_host.defined()) { + for (const auto& it : inputs) { + if (it.first->device_type == kDLCPU) { + target_host_val = it.first; + break; + } + } + } - auto mhost = codegen::Build(fhost, target_host_val->str()); + if (!target_host_val.defined()) { + target_host_val = DefaultTargetHost(target_host_val); + } - if (fdevice.size() > 0) { - auto mdev = codegen::Build(fdevice, target->str()); - mhost.Import(mdev); + for (const auto& it : inputs) { + auto host_dev_funcs = + split_dev_host_funcs(it.second, it.first, target_host_val, config); + auto& fhost = host_dev_funcs[0]; + auto& fdevice = host_dev_funcs[1]; + // Get the module for a certain target. + runtime::Module mdev = DeviceBuild(fdevice, it.first); + for (const auto& it : fhost) { + fhost_all.push_back(it); + } + device_modules.push_back(mdev); } + runtime::Module mhost = codegen::Build(fhost_all, target_host_val->str()); + // Import all modules + for (const auto& it : device_modules) { + if (it.operator->()) { + mhost.Import(it); + } + } return mhost; } +// Build for heterogeneous execution when target is a string. +runtime::Module build(const Map>& inputs, + const Target& target_host, + const BuildConfig& config) { + Map> updated_input; + for (const auto& it : inputs) { + auto target = Target::create(it.first); + updated_input.Set(target, it.second); + } + return build(updated_input, target_host, config); +} + +// Build for homogeneous execution. +runtime::Module build(const Array& funcs, + const Target& target, + const Target& target_host, + const BuildConfig& config) { + Map> inputs = {{target, funcs}}; + return build(inputs, target_host, config); +} + BuildConfig build_config() { return BuildConfig(make_node()); } diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 564715c00d90..08a88d53350f 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -601,52 +601,6 @@ class RelayBuildModule : public runtime::ModuleNode { } return func; } - /*! 
- * \brief Build module given lowered functions for each target - * - * \param lowered_funcs target_str -> Array map - * \param targets Targets map - * \param cfg Building configuration - */ - void BuildModule(const Map >& lowered_funcs, - const Map& targets, - const BuildConfig& cfg) { - auto target_host = Target::create(cfg_.fallback_device); - for (const auto& kv : lowered_funcs) { - std::unordered_set fname_set; - for (auto f : kv.second) { - if (fname_set.count(f->name)) { - LOG(FATAL) << "Duplicate function name " - << f->name; - } - fname_set.insert(f->name); - } - } - std::unordered_map target_map; - for (const auto& kv : lowered_funcs) { - target_map[kv.first] = Target::create(kv.first); - } - Array fhost_all; - std::vector device_module; - for (const auto& kv : lowered_funcs) { - auto target = target_map[kv.first]; - auto host_dev_funcs = split_dev_host_funcs(kv.second, target, target_host, cfg); - for (auto f : host_dev_funcs[0]) { - fhost_all.push_back(f); - } - if (host_dev_funcs[1].size()) { - auto mdev = codegen::Build(host_dev_funcs[1], target->str()); - device_module.push_back(mdev); - } - } - - auto mhost = codegen::Build(fhost_all, target_host->str()); - - for (auto mdev : device_module) { - mhost.Import(mdev); - } - ret_.mod = mhost; - } /*! * \brief Build relay function to runtime module @@ -686,9 +640,8 @@ class RelayBuildModule : public runtime::ModuleNode { ret_.graph_json = graph_codegen_->GetJSON(); ret_.params = graph_codegen_->GetParams(); - BuildModule(graph_codegen_->GetLoweredFunc(), - device_target, - tvm_cfg_); + auto target_host = Target::create(target_host_); + ret_.mod = tvm::build(graph_codegen_->GetLoweredFunc(), target_host, tvm_cfg_); } protected: diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc index 734e457d3bf1..393714d8f636 100644 --- a/tests/cpp/build_module_test.cc +++ b/tests/cpp/build_module_test.cc @@ -19,10 +19,14 @@ #include #include +#include #include #include #include +#include +#include + TEST(BuildModule, Basic) { using namespace tvm; auto n = var("n"); @@ -56,6 +60,134 @@ TEST(BuildModule, Basic) { CHECK_EQ(mali_target->str(), "opencl -model=Mali-T860MP4@800Mhz -device=mali"); } +TEST(BuildModule, Heterogeneous) { + /* The testing network is like following, where the element-wise add and sub + * ops are allocated to GPU and CPU, respectively: + * + * A B + * \ / + * elemwise_add (gpu) + * \ + * copy C + * \ / + * elemwise_sub (cpu) + */ + + using namespace tvm; + const runtime::PackedFunc* pf = runtime::Registry::Get("module._Enabled"); + bool enabled = (*pf)("cuda"); + if (!enabled) { + LOG(INFO) << "Skip heterogeneous test because cuda is not enabled." + << "\n"; + return; + } + + auto target_llvm = target::llvm(); + auto target_cuda = target::cuda(); + + // The shape of input tensors. 
+  const int n = 4;
+  Array<Expr> shape{n};
+
+  auto A = placeholder(shape, Float(32), "A");
+  auto B = placeholder(shape, Float(32), "B");
+  auto C = placeholder(shape, Float(32), "C");
+
+  auto elemwise_add = compute(A->shape, [&A, &B](Expr i) {
+    return A[i] + B[i];
+  }, "elemwise_add");
+
+  auto copy = placeholder(shape, Float(32), "__copy");
+  auto elemwise_sub = compute(C->shape, [&copy, &C](Expr i) {
+    return copy[i] - C[i];
+  }, "elemwise_sub");
+
+  auto s1 = topi::cuda::schedule_injective(target_cuda, {elemwise_add});
+  auto s2 = create_schedule({elemwise_sub->op});
+
+  auto config = build_config();
+  auto args1 = Array<Tensor>({A, B, elemwise_add});
+  auto args2 = Array<Tensor>({copy, C, elemwise_sub});
+
+  std::unordered_map<Tensor, Buffer> binds;
+  auto lowered_s1 = lower(s1, args1, "elemwise_add", binds, config);
+  auto lowered_s2 = lower(s2, args2, "elemwise_sub", binds, config);
+  Map<Target, Array<LoweredFunc>> inputs = {{target_cuda, lowered_s1},
+                                            {target_llvm, lowered_s2}};
+  auto module = build(inputs, Target(), config);
+
+  // Assertion for build.
+  CHECK_EQ(module->imports().size(), 1);
+
+  // Execute the graph and check the correctness.
+  // Setup graph json.
+  std::string json =
+      "{\"nodes\": [{\"op\": \"null\", \"name\": \"A\", \"inputs\": []}, "
+      "{\"op\": \"null\", \"name\": \"B\", \"inputs\": []}, {\"op\": "
+      "\"tvm_op\", \"name\": \"elemwise_add\", \"attrs\": {\"flatten_data\": "
+      "\"1\", \"func_name\": \"elemwise_add\", \"num_inputs\": \"2\", "
+      "\"num_outputs\": \"1\"}, \"inputs\": [[0, 0, 0], [1, 0, 0]]}, {\"op\": "
+      "\"tvm_op\", \"name\": \"__copy_add_to_sub\", \"attrs\": "
+      "{\"flatten_data\": \"0\", \"func_name\": \"__copy\", \"num_inputs\": "
+      "\"1\", \"num_outputs\": \"1\"}, \"inputs\": [[2, 0, 0]]}, {\"op\": "
+      "\"null\", \"name\": \"C\", \"inputs\": []}, {\"op\": \"tvm_op\", "
+      "\"name\": \"elemwise_sub\", \"attrs\": {\"flatten_data\": \"0\", "
+      "\"func_name\": \"elemwise_sub\", \"num_inputs\": \"2\", "
+      "\"num_outputs\": \"1\"}, \"inputs\": [[3, 0, 0], [4, 0, 0]]}], "
+      "\"arg_nodes\": [0, 1, 4], \"node_row_ptr\": [0, 1, 2, 3, 4, 5, 6], "
+      "\"heads\": [[5, 0, 0]], \"attrs\": {\"storage_id\": [\"list_int\", [3, "
+      "4, 0, 1, 5, 2]], \"shape\": [\"list_shape\", [[4], [4], [4], [4], [4], "
+      "[4]]], \"device_index\": [\"list_int\", [2, 2, 2, 1, 1, 1]], \"dtype\": "
+      "[\"list_int\", [0, 0, 0, 0, 0, 0]], \"dltype\": [\"list_str\", "
+      "[\"float32\", \"float32\", \"float32\", \"float32\", \"float32\", "
+      "\"float32\"]]}}";
+
+  // Setup inputs.
+  auto a_val =
+      runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
+  auto b_val =
+      runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
+  auto c_val =
+      runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
+
+  auto pa = (float*)a_val.ToDLPack()->dl_tensor.data;
+  auto pb = (float*)b_val.ToDLPack()->dl_tensor.data;
+  auto pc = (float*)c_val.ToDLPack()->dl_tensor.data;
+
+  // Assign values.
+  for (int i = 0; i < n; i++) {
+    pa[i] = i;
+    pb[i] = i + 1.0;
+    pc[i] = i - 1.0;
+  }
+
+  // Initialize graph runtime.
+ int cpu_dev_ty = static_cast(kDLCPU); + int cpu_dev_id = 0; + int gpu_dev_ty = static_cast(kDLGPU); + int gpu_dev_id = 0; + + const runtime::PackedFunc* graph_runtime = + tvm::runtime::Registry::Get("tvm.graph_runtime.create"); + runtime::Module mod = (*graph_runtime)( + json, module, cpu_dev_ty, cpu_dev_id, gpu_dev_ty, gpu_dev_id); + + PackedFunc set_input = mod.GetFunction("set_input", false); + PackedFunc run = mod.GetFunction("run", false); + PackedFunc get_output = mod.GetFunction("get_output", false); + set_input("A", a_val); + set_input("B", b_val); + set_input("C", c_val); + + run(); + tvm::runtime::NDArray out = get_output(0); + float* p_out = (float*)out.ToDLPack()->dl_tensor.data; + + // Check correctness. + for (int i = 0; i < n; ++i) { + CHECK_LT(std::fabs(p_out[i] - (i + (i + 1.0) - (i - 1.0))), 1e-5); + } +} int main(int argc, char ** argv) { testing::InitGoogleTest(&argc, argv); diff --git a/tests/python/relay/test_cpp_build_module.py b/tests/python/relay/test_cpp_build_module.py index c69d877d3b09..b94f57d77286 100644 --- a/tests/python/relay/test_cpp_build_module.py +++ b/tests/python/relay/test_cpp_build_module.py @@ -89,7 +89,7 @@ def test_build(): tgt: tgt } m_bld.set_opt_level(3) - m_bld.build(func, targets, "llvm -mcpu=sse3", params=params) + m_bld.build(func, targets, "llvm", params=params) g_json = m_bld.get_json() mmod = m_bld.get_module() params = m_bld.get_params() From 9b447347687f207fad25f18044dba7a06a65db04 Mon Sep 17 00:00:00 2001 From: lixiaoquan Date: Sat, 11 May 2019 01:14:39 +0800 Subject: [PATCH 096/106] Fix a tensorflow test bug. (#3165) Length of input_shape isn't always 4. --- tests/python/frontend/tensorflow/test_forward.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 8dd538aa859c..1579769ed5c1 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -185,7 +185,7 @@ def _test_pooling_iteration(input_shape, **kwargs): def _test_pooling(input_shape, **kwargs): _test_pooling_iteration(input_shape, **kwargs) - if is_gpu_available(): + if is_gpu_available() and (len(input_shape) == 4): input_shape = [input_shape[ii] for ii in (0, 3, 1, 2)] kwargs['data_format'] = 'NCHW' _test_pooling_iteration(input_shape, **kwargs) From c16932e6b2d59b178ad18a6311468db0215f8bf4 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sat, 11 May 2019 07:36:54 +0800 Subject: [PATCH 097/106] [HybridScript] Capture constant external python variables (#3157) --- python/tvm/hybrid/__init__.py | 6 ++- python/tvm/hybrid/module.py | 2 +- python/tvm/hybrid/parser.py | 50 ++++++++++++++------- python/tvm/hybrid/preprocessor.py | 16 +++++-- python/tvm/hybrid/util.py | 6 +++ tests/python/unittest/test_hybrid_script.py | 19 ++++++++ 6 files changed, 78 insertions(+), 21 deletions(-) diff --git a/python/tvm/hybrid/__init__.py b/python/tvm/hybrid/__init__.py index 7aca007ecd94..11ecbc8f7b60 100644 --- a/python/tvm/hybrid/__init__.py +++ b/python/tvm/hybrid/__init__.py @@ -31,6 +31,8 @@ from __future__ import absolute_import as _abs +import inspect + from .._ffi.base import decorate from .._ffi.function import _init_api from ..build_module import form_body @@ -55,7 +57,9 @@ def wrapped_func(func, *args, **kwargs): #pylint: disable=missing-docstring from .util import _is_tvm_arg_types if _is_tvm_arg_types(args): src = _pruned_source(func) - return source_to_op(src, func.__globals__, args) + closure_vars 
= inspect.getclosurevars(func).nonlocals + closure_vars.update(inspect.getclosurevars(func).globals) + return source_to_op(src, args, func.__globals__, closure_vars) from .runtime import _enter_hybrid_runtime, _restore_runtime intersect = _enter_hybrid_runtime(func) diff --git a/python/tvm/hybrid/module.py b/python/tvm/hybrid/module.py index 297dd0b9941a..13e45a7516fa 100644 --- a/python/tvm/hybrid/module.py +++ b/python/tvm/hybrid/module.py @@ -62,7 +62,7 @@ def __init__(self, src=None, name=None): def __call__(self, *args): if _is_tvm_arg_types(args): - return source_to_op(self.root_, globals(), args) + return source_to_op(self.root_, args, globals(), {}) return self.func_(*args) diff --git a/python/tvm/hybrid/parser.py b/python/tvm/hybrid/parser.py index 1c1525e11be8..40ea1714fc35 100644 --- a/python/tvm/hybrid/parser.py +++ b/python/tvm/hybrid/parser.py @@ -25,7 +25,7 @@ from enum import Enum -from .util import _internal_assert +from .util import _internal_assert, _apply_indices from . import calls from . import util from .preprocessor import determine_variable_usage @@ -112,7 +112,7 @@ class HybridParser(ast.NodeVisitor): } - def __init__(self, args, usage, symbols, func_name=None): + def __init__(self, args, usage, symbols, closure_vars, func_name=None): """ Parameters ---------- @@ -122,6 +122,12 @@ def __init__(self, args, usage, symbols, func_name=None): usage: A dict of variables used in last in this function Provided by last lower pass, which collects this information + symbols : list of str + The symbol list of the global context of the function. + + closure_vars: dict + A dict of external name reference captured by this function. + Returns ------- func_name: str @@ -136,6 +142,8 @@ def __init__(self, args, usage, symbols, func_name=None): if isinstance(v, types.FunctionType): self.add_symbol(k, Symbol.Callable, v) + self.closure_vars = closure_vars + self.binds = {} # Thread binds self.device = 0 # Is it generating device @@ -236,7 +244,11 @@ def visit_Expr(self, node): def visit_Name(self, node): name = node.id if sys.version_info[0] == 2 and name in ['True', 'False']: - return _api.convert(eval(name)) #pylint: disable=eval-used + return _api.convert(ast.literal_eval(name)) + + if name in self.closure_vars: + return _api.convert(self.closure_vars[name]) + ty, entry = self.symbols[name] _internal_assert(name in self.symbols, "Unknown symbol %s!" % name) if ty in [Symbol.LoopVar, Symbol.Input, Symbol.ConstLoopVar]: @@ -356,10 +368,12 @@ def visit_Attribute(self, node): buf = self.visit(node.value) return getattr(buf, node.attr) - def visit_Subscript(self, node): args = self.visit(node.slice) if isinstance(node.value, ast.Name): + if node.value.id in self.closure_vars: + args = ast.literal_eval(str(args)) + return _api.convert(_apply_indices(self.closure_vars[node.value.id], args)) buf = self.visit(node.value) if isinstance(buf, Array): @@ -576,7 +590,7 @@ def visit_Assert(self, node): return _make.AssertStmt(test, mesg, util.make_nop()) -def parse_python(src, symbols, args): +def parse_python(src, args, symbols, closure_vars): """The helper function of calling the AST visitor Parameters @@ -585,14 +599,17 @@ def parse_python(src, symbols, args): If an ast.node, then directly lower it. If a str, then parse it to ast and lower it. - symbols : str - The symbol list of the global context of the function. - args : list of Tensors or Vars The argument lists to the function. It is NOT encouraged to write a function without arguments. 
It is NOT encouraged to write a function with side effect. + symbols : list of str + The symbol list of the global context of the function. + + closure_vars: dict + A dict of external name reference captured by this function. + Returns ------- root : Stmt @@ -600,14 +617,14 @@ def parse_python(src, symbols, args): """ root = ast.parse(src) if isinstance(src, str) else src _internal_assert(root, ast.AST) - var_usage = determine_variable_usage(root, args, symbols) - parser = HybridParser(args, var_usage, symbols) + var_usage = determine_variable_usage(root, args, symbols, closure_vars) + parser = HybridParser(args, var_usage, symbols, closure_vars) parser.parsed_body = parser.visit(root) _internal_assert(parser.returned, 'No valid return found in the function body!') return parser -def source_to_op(src, symbols, args): +def source_to_op(src, args, symbols, closure_vars): """Another level of wrapper Parameters @@ -616,20 +633,23 @@ def source_to_op(src, symbols, args): If an ast.node, then directly lower it. If a str, then parse it to ast and lower it. - symbols : str - The symbol list of the global context of the function. - args : list of Tensors or Vars The argument lists to the function. It is NOT encouraged to write a function without arguments. It is NOT encouraged to write a function with side effect. + symbols : list of str + The symbol list of the global context of the function. + + closure_vars: dict + A dict of external name reference captured by this function. + Returns ------- res : list of output tensors The result of output tensors of the formed OpNode. """ - parser = parse_python(src, symbols, args) + parser = parse_python(src, args, symbols, closure_vars) input_tensors = [] for i in args: diff --git a/python/tvm/hybrid/preprocessor.py b/python/tvm/hybrid/preprocessor.py index 117ebd3091ed..1a9de4e3f801 100644 --- a/python/tvm/hybrid/preprocessor.py +++ b/python/tvm/hybrid/preprocessor.py @@ -26,14 +26,14 @@ class PyVariableUsage(ast.NodeVisitor): """The vistor class to determine the declaration, r/w status, and last use of each variable""" #pylint: disable=invalid-name #pylint: disable=missing-docstring - def __init__(self, args, symbols): + def __init__(self, args, symbols, closure_vars): self.status = {} self.scope_level = [] self._args = {} self.args = args self.aug_assign_ = False self.symbols = symbols - + self.closure_vars = closure_vars def visit_FunctionDef(self, node): self.scope_level.append(node) @@ -89,6 +89,14 @@ def visit_Name(self, node): "Iter var cannot be overwritten") if node.id not in self.status.keys(): + # It is a captured value in closure + if node.id in self.closure_vars: + try: + ast.literal_eval(str(self.closure_vars[node.id])) + except ValueError: + raise ValueError("Only support capturing constant values in closure") + return + _internal_assert(isinstance(node.ctx, ast.Store), \ 'Undeclared variable %s' % node.id) if self.aug_assign_: @@ -102,8 +110,8 @@ def visit_Name(self, node): self.status[node.id] = (decl, loop, usage) -def determine_variable_usage(root, args, symbols): +def determine_variable_usage(root, args, symbols, closure_vars): """The helper function for calling the dedicated visitor.""" - visitor = PyVariableUsage(args, symbols) + visitor = PyVariableUsage(args, symbols, closure_vars) visitor.visit(root) return visitor.status diff --git a/python/tvm/hybrid/util.py b/python/tvm/hybrid/util.py index 0dd1fa141329..058c5aa30af7 100644 --- a/python/tvm/hybrid/util.py +++ b/python/tvm/hybrid/util.py @@ -101,3 +101,9 @@ def 
_is_tvm_arg_types(args): _internal_assert(isinstance(elem, np_arg_types), \ "Expect a numpy type but %s get!" % str(type(elem))) return False + +def _apply_indices(value, indices): + """Apply a multidimensional index""" + if indices: + return _apply_indices(value[indices[0]], indices[1:]) + return value diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py index 254264662fdc..805cff8f5d15 100644 --- a/tests/python/unittest/test_hybrid_script.py +++ b/tests/python/unittest/test_hybrid_script.py @@ -768,6 +768,24 @@ def outer_product(a, b): # Test loop binds +def test_capture(): + n = 8 + + constant_tuple = (10, n) + constant_list = [[1, 2], [3, n]] + const_value = 1 + + @tvm.hybrid.script + def add_something(a): + c = output_tensor((constant_tuple[1],), 'int32') + for i in range(constant_tuple[1]): + c[i] = a[i] + constant_list[1][const_value] + return c + + a = tvm.placeholder((n, ), dtype='int32', name='a') + + func, ins, outs = run_and_check(add_something, [a]) + run_and_check(func, ins, outs=outs) if __name__ == "__main__": test_outer_product() @@ -786,5 +804,6 @@ def outer_product(a, b): test_bool() test_const_range() test_schedule() + test_capture() # TODO: # test_inplace() From e10553213b13de3f86f8ae72192aac227195af00 Mon Sep 17 00:00:00 2001 From: "Steven S. Lyubomirsky" Date: Fri, 10 May 2019 17:25:54 -0700 Subject: [PATCH 098/106] Register all operators' Python attributes in Python so they can be easily accessed from Python code (#3175) --- python/tvm/relay/op/op_attrs.py | 219 +++++++++++++++++++++++++++++++- 1 file changed, 214 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py index abaa0e4c5d4f..8f1127662367 100644 --- a/python/tvm/relay/op/op_attrs.py +++ b/python/tvm/relay/op/op_attrs.py @@ -21,24 +21,233 @@ @register_relay_attr_node class Conv2DAttrs(Attrs): - """Attribute of nn.conv2d""" + """Attributes for nn.conv2d""" @register_relay_attr_node class Conv2DWinogradAttrs(Attrs): - """Attribute of nn.contrib_conv2d_winograd_without_weight_transform""" + """Attributes for nn.contrib_conv2d_winograd_without_weight_transform""" @register_relay_attr_node class Conv2DWinogradWeightTransformAttrs(Attrs): - """Attribute of nn.contrib_conv2d_winograd_weight_transform""" + """Attributes for nn.contrib_conv2d_winograd_weight_transform""" @register_relay_attr_node class Conv2DWinogradNNPACKWeightTransformAttrs(Attrs): - """Attribute of nn.contrib_conv2d_winograd_nnpack_weight_transform""" + """Attributes for nn.contrib_conv2d_winograd_nnpack_weight_transform""" @register_relay_attr_node class GlobalPool2DAttrs(Attrs): - """Attribute of nn.global_pool""" + """Attributes for nn.global_pool""" + + +@register_relay_attr_node +class BiasAddAttrs(Attrs): + """Attributes for nn.bias_add""" + + +@register_relay_attr_node +class DenseAttrs(Attrs): + """Attributes for nn.dense""" + + +@register_relay_attr_node +class UpSamplingAttrs(Attrs): + """Attributes for nn.upsampling""" + +@register_relay_attr_node +class PadAttrs(Attrs): + """Attributes for nn.pad""" + + +@register_relay_attr_node +class LeakyReluAttrs(Attrs): + """Attributes for nn.leaky_relu""" + + +@register_relay_attr_node +class PReluAttrs(Attrs): + """Attributes for nn.prelu""" + + +@register_relay_attr_node +class DropoutAttrs(Attrs): + """Attributes for nn.dropout""" + + +@register_relay_attr_node +class BatchNormAttrs(Attrs): + """Attributes for nn.batch_norm""" + + +@register_relay_attr_node +class LRNAttrs(Attrs): + 
"""Attributes for nn.lrn""" + + +@register_relay_attr_node +class L2NormalizeAttrs(Attrs): + """Attributes for nn.l2_normalize""" + + +@register_relay_attr_node +class DeformableConv2DAttrs(Attrs): + """Attributes for nn.deformable_conv2d""" + + +@register_relay_attr_node +class ResizeAttrs(Attrs): + """Attributes for image.resize""" + + +@register_relay_attr_node +class ArgsortAttrs(Attrs): + """Attributes for algorithm.argsort""" + + +@register_relay_attr_node +class OnDeviceAttrs(Attrs): + """Attributes for annotation.on_device""" + + +@register_relay_attr_node +class DebugAttrs(Attrs): + """Attributes for debug""" + + +@register_relay_attr_node +class DeviceCopyAttrs(Attrs): + """Attributes for tensor.device_copy""" + + +@register_relay_attr_node +class CastAttrs(Attrs): + """Attributes for transform.cast""" + + +@register_relay_attr_node +class ConcatenateAttrs(Attrs): + """Attributes for tensor.concatenate""" + + +@register_relay_attr_node +class TransposeAttrs(Attrs): + """Attributes for transform.transpose""" + + +@register_relay_attr_node +class ReshapeAttrs(Attrs): + """Attributes for transform.reshape""" + + +@register_relay_attr_node +class TakeAttrs(Attrs): + """Attributes for transform.take""" + + +@register_relay_attr_node +class InitOpAttrs(Attrs): + """Attributes for ops specifying a tensor""" + + +@register_relay_attr_node +class ArangeAttrs(Attrs): + """Attributes used in arange operators""" + + +@register_relay_attr_node +class StackAttrs(Attrs): + """Attributes used in stack operators""" + + +@register_relay_attr_node +class RepeatAttrs(Attrs): + """Attributes used in repeat operators""" + + +@register_relay_attr_node +class TileAttrs(Attrs): + """Attributes used in tile operators""" + + +@register_relay_attr_node +class ReverseAttrs(Attrs): + """Attributes used in reverse operators""" + + +@register_relay_attr_node +class SqueezeAttrs(Attrs): + """Attributes used in squeeze operators""" + + +@register_relay_attr_node +class SplitAttrs(Attrs): + """Attributes for transform.split""" + + +@register_relay_attr_node +class StridedSliceAttrs(Attrs): + """Attributes for transform.stranded_slice""" + + +@register_relay_attr_node +class SliceLikeAttrs(Attrs): + """Attributes for transform.slice_like""" + + +@register_relay_attr_node +class ClipAttrs(Attrs): + """Attributes for transform.clip""" + + +@register_relay_attr_node +class LayoutTransformAttrs(Attrs): + """Attributes for transform.layout_transform""" + + +@register_relay_attr_node +class ShapeOfAttrs(Attrs): + """Attributes for tensor.shape_of""" + + +@register_relay_attr_node +class MultiBoxPriorAttrs(Attrs): + """Attributes for vision.multibox_prior""" + + +@register_relay_attr_node +class MultiBoxTransformLocAttrs(Attrs): + """Attributes for vision.multibox_transform_loc""" + + +@register_relay_attr_node +class GetValidCountsAttrs(Attrs): + """Attributes for vision.get_valid_counts""" + + +@register_relay_attr_node +class NonMaximumSuppressionAttrs(Attrs): + """Attributes for vision.non_maximum_suppression""" + + +@register_relay_attr_node +class ROIAlignAttrs(Attrs): + """Attributes for vision.roi_align""" + + +@register_relay_attr_node +class ROIPoolAttrs(Attrs): + """Attributes for vision.roi_pool""" + + +@register_relay_attr_node +class YoloReorgAttrs(Attrs): + """Attributes for vision.yolo_reorg""" + + +@register_relay_attr_node +class ProposalAttrs(Attrs): + """Attributes used in proposal operators""" From 9a903545ddb739c53db0ea47f4850f37ea281d49 Mon Sep 17 00:00:00 2001 From: lixiaoquan Date: Sat, 11 May 
2019 12:56:01 +0800 Subject: [PATCH 099/106] [Relay][TensorFlow] Support tf.math.reduce_prod (#3166) --- python/tvm/relay/frontend/tensorflow.py | 10 +++++++++ .../frontend/tensorflow/test_forward.py | 21 ++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index bbbb0a2feaec..48f78837c525 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -1080,6 +1080,15 @@ def _impl(inputs, attr, params): return _impl + +def _prod(): + def _impl(inputs, attr, params): + axis = params.pop(inputs[1].name_hint).asnumpy()[0] + keepdims = attr['keep_dims'] + return _op.prod(inputs[0], int(axis), keepdims=keepdims) + return _impl + + # compatible operators that do NOT require any conversion. _identity_list = [] @@ -1136,6 +1145,7 @@ def _impl(inputs, attr, params): 'Pad' : _pad('Pad'), 'PadV2' : _pad('PadV2'), 'Pow' : _elemwise('power'), + 'Prod' : _prod(), 'Range' : _range(), 'Rank' : _rank(), 'RealDiv' : _elemwise('div'), diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 1579769ed5c1..58bbdab02b84 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -161,7 +161,6 @@ def is_gpu_available(): else: return False - ####################################################################### # Pooling # ------- @@ -1509,6 +1508,25 @@ def test_forward_expand_dims(): _test_forward_expand_dims(np.array([[1], [2]]), 1) _test_forward_expand_dims(np.array([[1], [2]]), -1) + +####################################################################### +# Prod +# ---- +def _test_forward_reduce_prod(shape, axis, keepdims): + inp_array1 = np.random.uniform(-5, 5, size=shape).astype(np.float32) + with tf.Graph().as_default(): + in1 = tf.placeholder(shape=inp_array1.shape, dtype=inp_array1.dtype) + out = tf.math.reduce_prod(in1, axis, keepdims) + compare_tf_with_tvm(inp_array1, in1.name, out.name) + +def test_forward_reduce_prod(): + _test_forward_reduce_prod((5,), 0, False) + _test_forward_reduce_prod((5, 5), 0, False) + _test_forward_reduce_prod((5, 5), 1, False) + _test_forward_reduce_prod((5,), 0, True) + _test_forward_reduce_prod((5, 5), 0, True) + _test_forward_reduce_prod((5, 5), 1, True) + ####################################################################### # Main # ---- @@ -1550,6 +1568,7 @@ def test_forward_expand_dims(): test_forward_argminmax() test_forward_reduce() test_forward_mean() + test_forward_reduce_prod() # General test_forward_multi_input() From e1e357dfaf0570e00b81eb50acda6335c514d21b Mon Sep 17 00:00:00 2001 From: hlu1 <14827759+hlu1@users.noreply.github.com> Date: Fri, 10 May 2019 22:55:11 -0700 Subject: [PATCH 100/106] [Bugfix] Check file exists before removing it (#3178) --- python/tvm/contrib/download.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/contrib/download.py b/python/tvm/contrib/download.py index 46a9a6d98879..bc6d32476eb7 100644 --- a/python/tvm/contrib/download.py +++ b/python/tvm/contrib/download.py @@ -111,7 +111,8 @@ def _download_progress(count, block_size, total_size): except Exception as err: retries -= 1 if retries == 0: - os.remove(tempfile) + if os.path.exists(tempfile): + os.remove(tempfile) raise err else: print("download failed due to {}, retrying, {} attempt{} left" From b9dcb098cdb1f7e2f5a4d1bf09c36c5ec762a747 Mon Sep 17 00:00:00 2001 From: Jared Roesch 
Date: Sat, 11 May 2019 18:08:13 -0400 Subject: [PATCH 101/106] [Relay][Runtime] Add VM compiler. (#3139) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Implement the VM compiler * Fix issues * Fix ASF headers * Fix test issue * Apply typo fixes. * Update src/relay/backend/vm/compiler.cc Co-Authored-By: 雾雨魔理沙 * Refactor compiler * Fix * Fix * Fix in benchmark * Fix * Address comments --- include/tvm/relay/pass.h | 13 + include/tvm/runtime/vm.h | 2 +- src/relay/backend/vm/compiler.cc | 616 ++++++++++++++++++ src/relay/backend/vm/inline_primitives.cc | 146 +++++ src/relay/backend/vm/lambda_lift.cc | 166 +++++ src/relay/backend/vm/vm.cc | 159 +++++ src/relay/op/tensor/reduce.cc | 4 +- src/relay/pass/dead_code.cc | 7 +- src/runtime/vm/vm.cc | 80 +++ .../python/relay/benchmarking/benchmark_vm.py | 133 ++++ tests/python/relay/test_vm.py | 264 ++++++++ 11 files changed, 1585 insertions(+), 5 deletions(-) create mode 100644 src/relay/backend/vm/compiler.cc create mode 100644 src/relay/backend/vm/inline_primitives.cc create mode 100644 src/relay/backend/vm/lambda_lift.cc create mode 100644 src/relay/backend/vm/vm.cc create mode 100644 tests/python/relay/benchmarking/benchmark_vm.py create mode 100644 tests/python/relay/test_vm.py diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h index 43831fce3bbc..31067925fa63 100644 --- a/include/tvm/relay/pass.h +++ b/include/tvm/relay/pass.h @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -593,6 +594,18 @@ TVM_DLL Expr ToGraphNormalForm(const Expr& e); * As a side effect, code size will explode. */ Expr PartialEval(const Expr& e); + +namespace vm { + +/*! \brief Compile a module, and construct the virtual machine. + * + * \param mod The module to compile. + * \return The constructed virtual machine. + */ +runtime::vm::VirtualMachine CompileModule(const Module& mod); + +} // namespace vm + } // namespace relay } // namespace tvm diff --git a/include/tvm/runtime/vm.h b/include/tvm/runtime/vm.h index 0a0a4debf294..8911ad499e4c 100644 --- a/include/tvm/runtime/vm.h +++ b/include/tvm/runtime/vm.h @@ -265,7 +265,7 @@ struct Instruction { Instruction(); Instruction(const Instruction& instr); - Instruction& operator=(const Instruction& instr) = delete; + Instruction& operator=(const Instruction& instr); ~Instruction(); friend std::ostream& operator<<(std::ostream& os, const Instruction&); diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc new file mode 100644 index 000000000000..97f03c629cb7 --- /dev/null +++ b/src/relay/backend/vm/compiler.cc @@ -0,0 +1,616 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file src/relay/backend/vm/compiler.cc + * \brief A compiler from relay::Module to the VM byte code. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../../runtime/vm/naive_allocator.h" +#include "../../backend/compile_engine.h" + +namespace tvm { +namespace relay { +namespace vm { + +using namespace tvm::runtime; +using namespace tvm::runtime::vm; + +// (@jroesch): VM passes, eventually declare as passes. +bool IsClosure(const Function& func); +Module LambdaLift(const Module& module); +Module InlinePrimitives(const Module& module); + +template +using NodeMap = std::unordered_map; +using TagMap = NodeMap; +using TagNameMap = std::unordered_map; +using GlobalMap = NodeMap; +using ConstMap = NodeMap; +using ConstTensorShapeMap = NodeMap>; + +struct VMCompilerContext { + // The module context for the compilation + Module module; + // Error reporter + ErrorReporter err_reporter; + // Map from a unique integer to ADT constructor tag + TagNameMap tag_index_map; + // Map from ADT constructor tag to a unique integer + TagMap tag_map; + // Map from global var to a unique integer + GlobalMap global_map; + // Map from Const object to its index in const pool + ConstMap const_map; + // Map from Const tensor shape to its index in const pool + ConstTensorShapeMap const_tensor_shape_map; + // List of lowered functions + std::vector lowered_funcs; +}; + +// Compute the constant pool, i.e a mapping from Constant node to constant index. +struct ConstantPool : ExprVisitor { + std::set visited; + Module module; + ConstMap const_map; + ConstTensorShapeMap const_tensor_shape_map; + + size_t index; + + explicit ConstantPool(const Module& mod) : module(mod), const_map(), index(0) {} + + void VisitExpr_(const GlobalVarNode* var_node) { + auto gvar = GetRef(var_node); + if (visited.find(gvar) == visited.end()) { + visited.insert(gvar); + this->VisitExpr(this->module->Lookup(gvar)); + } + } + + void AddConstantTensorShape(TensorType expr, NDArray value) { + auto it = this->const_tensor_shape_map.find(expr); + if (it == this->const_tensor_shape_map.end()) { + this->const_tensor_shape_map.insert({expr, std::make_pair(index++, value)}); + } + } + + void VisitExpr_(const ConstantNode* const_node) { + auto konst = GetRef(const_node); + auto it = this->const_map.find(konst); + if (it == this->const_map.end()) { + this->const_map.insert({konst, index++}); + } + } + + NDArray GetTensorConstant(const TensorTypeNode* ttype) { + std::vector shapes; + for (auto sh : ttype->shape) { + shapes.push_back(Downcast(sh)->value); + } + int64_t s = shapes.size(); + DLContext cpu_ctx; + cpu_ctx.device_type = kDLCPU; + cpu_ctx.device_id = 0; + auto shape_tensor = NDArray::Empty({s}, Type2TVMType(Int(64)), cpu_ctx); + int64_t* dims = static_cast(shape_tensor->data); + for (size_t i = 0; i < shapes.size(); ++i) { + dims[i] = shapes[i]; + } + return shape_tensor; + } + + void VisitExpr_(const CallNode* call_node) { + for (auto arg : call_node->args) { + this->VisitExpr(arg); + } + + Expr op = call_node->op; + auto func_node = op.as(); + if (func_node) { + auto ret_type = call_node->checked_type(); + if (const TensorTypeNode* ttype = ret_type.as()) { + auto shape = GetTensorConstant(ttype); + auto tensor_type = GetRef(ttype); + AddConstantTensorShape(tensor_type, shape); + } else if (const TupleTypeNode* ttype = ret_type.as()) { + for (size_t i = 0; i < ttype->fields.size(); ++i) { + auto f = ttype->fields[i]; + auto f_type = 
f.as(); + auto shape = GetTensorConstant(f_type); + auto tensor_type = GetRef(f_type); + AddConstantTensorShape(tensor_type, shape); + } + } + } + } +}; + +std::tuple LayoutConstantPool(const Module& module) { + auto cp = ConstantPool(module); + for (auto& func : module->functions) { + cp.VisitExpr(func.first); + } + return std::make_tuple(cp.const_map, cp.const_tensor_shape_map); +} + +void InstructionPrint(std::ostream& os, const Instruction& instr); + +struct VMCompiler : ExprFunctor { + /*! \brief Store the expression a variable points to. */ + std::unordered_map expr_map; + + std::vector instructions; + + // var -> register num + std::unordered_map var_register_map; + + size_t last_register; + + // Total number of virtual registers allocated + size_t registers_num; + CompileEngine engine; + + /*! \brief The functions that have been lowered. */ + std::unordered_map seen_funcs; + + /*! \brief Global shared meta data */ + VMCompilerContext* context; + + VMCompiler(VMCompilerContext* context) + : instructions(), + var_register_map(), + last_register(0), + registers_num(0), + engine(CompileEngine::Global()), + context(context) + {} + + size_t NewRegister() { return registers_num++; } + + inline void Emit(const Instruction& instr) { + DLOG(INFO) << "VMCompiler::Emit: instr=" << instr; + CHECK((int)instr.op < 100) << "Invalid opcode " << (int)instr.op; + switch (instr.op) { + case Opcode::AllocDatatype: + case Opcode::AllocTensor: + case Opcode::GetField: + case Opcode::LoadConst: + case Opcode::Select: + case Opcode::Invoke: + case Opcode::AllocClosure: + case Opcode::Move: + case Opcode::InvokeClosure: + last_register = instr.dst; + break; + case Opcode::InvokePacked: + last_register = instr.packed_args[instr.arity - 1]; + break; + case Opcode::If: + case Opcode::Ret: + case Opcode::Goto: + break; + } + instructions.push_back(instr); + } + + void VisitExpr_(const ConstantNode* const_node) { + auto rconst = GetRef(const_node); + auto it = this->context->const_map.find(rconst); + CHECK(it != this->context->const_map.end()); + Emit(Instruction::LoadConst(it->second, NewRegister())); + } + + void VisitExpr_(const VarNode* var_node) { + auto var = GetRef(var_node); + auto reg_it = this->var_register_map.find(var); + CHECK(reg_it != this->var_register_map.end()); + last_register = reg_it->second; + } + + void VisitExpr_(const TupleNode* tuple_node) { + auto tuple = GetRef(tuple_node); + std::vector fields_registers; + + for (auto& field : tuple->fields) { + this->VisitExpr(field); + fields_registers.push_back(last_register); + } + + // TODO(@jroesch): use correct tag + Emit(Instruction::AllocDatatype( + 0, + tuple->fields.size(), + fields_registers, + NewRegister())); + } + + void VisitExpr_(const MatchNode* match_node) { + auto match = GetRef(match_node); + LOG(FATAL) << "translation of match nodes to the VM is" + << "currently unsupported" << std::endl; + } + + void VisitExpr_(const LetNode* let_node) { + DLOG(INFO) << let_node->value << std::endl; + this->VisitExpr(let_node->value); + DLOG(INFO) << this->last_register << std::endl; + var_register_map.insert({let_node->var, this->last_register}); + this->VisitExpr(let_node->body); + } + + void VisitExpr_(const TupleGetItemNode* get_node) { + auto get = GetRef(get_node); + this->VisitExpr(get->tuple); + auto tuple_register = last_register; + Emit(Instruction::GetField(tuple_register, get->index, NewRegister())); + } + + void VisitExpr_(const GlobalVarNode* gvar) { + LOG(FATAL) << "Global variables should only appear in the call position"; + } + 
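+  // Note on register allocation: every value-producing instruction records
+  // its destination register in `last_register` (see Emit above), and a
+  // parent expression reads `last_register` immediately after visiting a
+  // child. NewRegister() never reuses registers, so `registers_num` ends up
+  // being the register-frame size of the compiled function.
+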
+ void VisitExpr_(const IfNode* if_node) { + this->VisitExpr(if_node->cond); + + size_t cond_register = last_register; + + auto after_cond = this->instructions.size(); + + this->Emit(Instruction::If(cond_register, 0, 0)); + this->VisitExpr(if_node->true_branch); + + size_t true_register = last_register; + + Emit(Instruction::Goto(0)); + + // Finally store how many instructions there are in the + // true branch. + auto after_true = this->instructions.size(); + + this->VisitExpr(if_node->false_branch); + + size_t false_register = last_register; + + // Compute the total number of instructions + // after generating false. + auto after_false = this->instructions.size(); + + // Now we will compute the jump targets in order + // to properly patch the instructions with the + // requisite targets. + + // After we emit the true and false bodies, + // we patch up the if instruction and the goto. + auto true_offset = 1; + auto false_offset = after_true - after_cond; + this->instructions[after_cond].true_offset = true_offset; + this->instructions[after_cond].false_offset = false_offset; + + // Patch the Goto. + this->instructions[after_true - 1].pc_offset = (after_false - after_true) + 1; + + Emit(Instruction::Select(cond_register, true_register, false_register, NewRegister())); + } + + Instruction AllocTensorFromType(const TensorTypeNode* ttype) { + DataType dtype = ttype->dtype; + TVMType dltype = Type2TVMType(dtype); + + auto tensor_type = GetRef(ttype); + auto it = this->context->const_tensor_shape_map.find(tensor_type); + if (it == this->context->const_tensor_shape_map.end()) { + DLOG(INFO) << "Cannot find constant shape for " << tensor_type; + } else { + Emit(Instruction::LoadConst(it->second.first, NewRegister())); + } + + return Instruction::AllocTensor(last_register, dltype, NewRegister()); + } + + void EmitInvokePrimitive(const Function& func, std::vector args_registers, + const Type& ret_type) { + std::vector allocs; + size_t return_num = 0; + if (const TensorTypeNode* ttype = ret_type.as()) { + // Allocate space for the return tensor. + auto alloc = AllocTensorFromType(ttype); + allocs.push_back(alloc); + return_num = 1; + } else if (const TupleTypeNode* ttype = ret_type.as()) { + std::vector fields_registers; + + for (size_t i = 0; i < ttype->fields.size(); ++i) { + auto f = ttype->fields[i]; + auto f_type = f.as(); + allocs.push_back(AllocTensorFromType(f_type)); + fields_registers.push_back(allocs.back().dst); + } + return_num = ttype->fields.size(); + } else { + LOG(FATAL) << "Unsupported return value type"; + } + + for (auto& alloc : allocs) { + Emit(alloc); + args_registers.push_back(alloc.dst); + } + + // Next generate the invoke instruction.
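+    // InvokePacked follows the PackedFunc calling convention: the output
+    // tensors allocated above are appended after the inputs, so `arity`
+    // below counts inputs plus outputs and the callee writes its results
+    // into the trailing registers in place.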
+ CHECK(func->IsPrimitive()); + auto target = Target::create("llvm"); + auto key = CCacheKeyNode::make(func, target); + auto cfunc = engine->Lower(key); + // TODO(jroesch): support lowered funcs for multiple targets + CHECK_EQ(cfunc->funcs.size(), 1); + auto op_index = -1; + if (seen_funcs.find(cfunc->funcs[0]) == seen_funcs.end()) { + op_index = this->context->lowered_funcs.size(); + this->context->lowered_funcs.push_back(cfunc->funcs[0]); + seen_funcs[cfunc->funcs[0]] = op_index; + } else { + op_index = seen_funcs[cfunc->funcs[0]]; + } + + // If Tensor, 1 + // If Tuple, size of tuple + size_t arity = func->params.size() + return_num; + Emit(Instruction::InvokePacked(op_index, arity, return_num, args_registers)); + if (return_num > 1) { + // return value is a tuple, we need to create a tuple + std::vector fields_registers; + for (size_t i = func->params.size(); i < arity; ++i) { + fields_registers.push_back(args_registers[i]); + } + Emit(Instruction::AllocDatatype(0, return_num, fields_registers, NewRegister())); + } + } + + void VisitExpr_(const CallNode* call_node) { + std::vector args_registers; + + for (auto arg : call_node->args) { + CHECK(arg.as()) << "found: " << AsText(arg, false) << std::endl << arg; + this->VisitExpr(arg); + args_registers.push_back(last_register); + } + + Expr op = call_node->op; + + if (auto func_node = op.as()) { + CHECK(func_node->IsPrimitive()); + EmitInvokePrimitive(GetRef(func_node), args_registers, call_node->checked_type()); + } else if (auto global_node = op.as()) { + auto global = GetRef(global_node); + auto it = this->context->global_map.find(global); + CHECK(it != this->context->global_map.end()); + DLOG(INFO) << "VisitExpr_: generating invoke for " << global->name_hint + << " with func_index=" << it->second; + + auto func = this->context->module->Lookup(global); + if (IsClosure(func)) { + auto arity = func->params.size(); + std::vector free_var_registers; + for (size_t i = 0; i < arity; ++i) { + free_var_registers.push_back(var_register_map.at(func->params[i])); + } + Emit(Instruction::AllocClosure(it->second, arity, free_var_registers, NewRegister())); + } else { + Emit(Instruction::Invoke(it->second, args_registers, NewRegister())); + } + } else if (auto constructor_node = op.as()) { + auto constructor = GetRef(constructor_node); + auto tag = GetConstructorTag(constructor); + Emit(Instruction::AllocDatatype(tag, call_node->args.size(), args_registers, NewRegister())); + } else if (auto var_node = op.as()) { + VisitExpr(GetRef(var_node)); + Emit(Instruction::InvokeClosure(last_register, args_registers, NewRegister())); + } else { + LOG(FATAL) << "unsupported case in vm compiler: " << op; + } + } + + size_t GetConstructorTag(tvm::relay::Constructor constructor) { + auto it = this->context->tag_map.find(constructor); + if (it != this->context->tag_map.end()) { + return it->second; + } else { + auto tag = this->context->tag_map.size(); + this->context->tag_map[constructor] = tag; + this->context->tag_index_map[tag] = constructor; + return tag; + } + } + + void VisitExpr_(const FunctionNode* func_node) { + if (!func_node->IsPrimitive()) { + LOG(FATAL) << "local functions should have been removed by lambda lifting:" << std::endl + << "Program: " << AsText(GetRef(func_node), false) << std::endl + << "AST: " << GetRef(func_node); + } + } + + void CompileClosure(const Function& func) { + // We first layout the function arguments. 
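+    // Register layout for closures: the inner function's explicit
+    // parameters take the first registers, and the captured free variables
+    // (the outer function's parameters) are assigned the registers after
+    // them, as the two loops below enforce with CHECK_EQ.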
+ auto inner_func = Downcast(func->body); + + size_t i = 0; + for (auto param : inner_func->params) { + auto arg_register = NewRegister(); + CHECK_EQ(i, arg_register); + var_register_map.insert({param, arg_register}); + i++; + } + + // We then assign registers to the free variables. + for (auto param : func->params) { + auto arg_register = NewRegister(); + CHECK_EQ(i, arg_register); + var_register_map.insert({param, arg_register}); + i++; + } + + // We will now process the body like normal. + this->VisitExpr(inner_func->body); + } + + void Compile(const Function& func) { + // We need to generate code specially for lifted closures. + if (IsClosure(func)) { + CompileClosure(func); + return; + } + + for (size_t i = 0; i < func->params.size(); ++i) { + auto arg_register = NewRegister(); + CHECK_EQ(arg_register, i); + var_register_map.insert({func->params[i], arg_register}); + } + + this->VisitExpr(func->body); + } +}; + +void PopulatePackedFuncMap(const std::vector& lowered_funcs, + std::vector* packed_funcs) { + runtime::Module mod; + if (lowered_funcs.size() > 0) { + // TODO(@jroesch): we need to read target from build config + Target target = Target::create("llvm"); + if (const auto* f = runtime::Registry::Get("relay.backend.build")) { + mod = (*f)(tvm::Array(lowered_funcs.begin(), lowered_funcs.end()), target); + } else { + LOG(FATAL) << "relay.backend.build is not registered"; + } + CHECK(mod.operator->()); + for (auto lfunc : lowered_funcs) { + packed_funcs->push_back(mod.GetFunction(lfunc->name)); + } + } +} + +VMFunction CompileFunc(VMCompilerContext* context, const GlobalVar& var, const Function& func) { + DLOG(INFO) << "CompileFunc: " << std::endl << AsText(func, false) << std::endl; + size_t params = func->params.size(); + VMCompiler compiler(context); + compiler.Compile(func); + // return the last evaluated expression + compiler.instructions.push_back(Instruction::Ret(compiler.last_register)); + + // Would like to refactor this so we only check whether this is a closure once. + if (IsClosure(func)) { + auto inner_params = Downcast(func->body)->params.size(); + return VMFunction(var->name_hint, params + inner_params, compiler.instructions, + compiler.registers_num); + } else { + return VMFunction(var->name_hint, params, compiler.instructions, compiler.registers_num); + } +} + +Module OptimizeModule(const Module& mod) { + ToANormalForm(mod->entry_func, mod); + InlinePrimitives(mod); + LambdaLift(mod); + return InlinePrimitives(mod); +} + +void PopulateGlobalMap(GlobalMap* global_map, const Module& mod) { + // First we populate global map. + size_t global_index = 0; + for (auto named_func : mod->functions) { + auto gvar = named_func.first; + global_map->insert({gvar, global_index++}); + } +} + +VirtualMachine CompileModule(const Module& mod_ref) { + Module mod = mod_ref; + + // Run some optimizations first; this code should + // be moved to the pass manager. + mod = OptimizeModule(mod); + + VirtualMachine vm; + + VMCompilerContext context; + context.module = mod; + + // Populate the global map. + // + // This maps global variables to a global index + // in the VMFunction table. + PopulateGlobalMap(&context.global_map, mod); + + // Next we populate constant map. + auto constant_analysis_result = LayoutConstantPool(mod); + context.const_map = std::get<0>(constant_analysis_result); + context.const_tensor_shape_map = std::get<1>(constant_analysis_result); + + // Next we get ready by allocating space for + // the global state.
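+  // The constant pool is one flat vector: plain constants and cached shape
+  // tensors were numbered from a single counter in LayoutConstantPool, so
+  // the two maps below fill disjoint slots of vm.constants.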
+ vm.functions.resize(mod->functions.size()); + vm.constants.resize(context.const_map.size() + context.const_tensor_shape_map.size()); + + for (auto pair : context.const_map) { + vm.constants[pair.second] = Object::Tensor(pair.first->data); + } + + for (auto pair : context.const_tensor_shape_map) { + vm.constants[pair.second.first] = Object::Tensor(pair.second.second); + } + + for (auto named_func : mod->functions) { + auto gvar = named_func.first; + auto func = named_func.second; + auto vm_func = CompileFunc(&context, gvar, func); + + size_t func_index = context.global_map.at(gvar); + CHECK(func_index < vm.functions.size()); + vm.functions[func_index] = vm_func; + } + +#ifdef USE_RELAY_DEBUG + for (auto vm_func : vm.functions) { + std::cout << "Function: " << vm_func.name << std::endl + << vm_func << "-------------" << std::endl; + } +#endif // USE_RELAY_DEBUG + + PopulatePackedFuncMap(context.lowered_funcs, &vm.packed_funcs); + + for (auto gv : context.global_map) { + vm.global_map_.insert({gv.first->name_hint, gv.second}); + } + + return vm; +} + +} // namespace vm +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/vm/inline_primitives.cc b/src/relay/backend/vm/inline_primitives.cc new file mode 100644 index 000000000000..b033a37e42b8 --- /dev/null +++ b/src/relay/backend/vm/inline_primitives.cc @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file tvm/relay/backend/vm/inline_primitives.cc + * \brief Ensure that primitives only appear in the call position. + */ + +#include +#include +#include +#include +#include +#include +#include + +using namespace tvm::runtime; + +namespace tvm { +namespace relay { +namespace vm { + +struct PrimitiveInliner : ExprMutator { + Module module_; + std::unordered_map var_map; + + explicit PrimitiveInliner(const Module& module) : module_(module) {} + + Expr VisitExpr_(const LetNode* let_node) { + var_map.insert({let_node->var, VisitExpr(let_node->value)}); + return ExprMutator::VisitExpr_(let_node); + } + + Expr VisitExpr_(const CallNode* call) { + Expr op = call->op; + // For now just collapse the chain of variables to see if + // they point to a primitive function. + const VarNode* var_node; + + // Collapse a chain of let bindings + // + // let x = fn (..) { .. }; + // let y = x + // let w = y + // in w(...) 
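+    // so that the call above can be rewritten into (fn (..) { .. })(...),
+    // substituting each variable with its bound value until the loop
+    // below reaches something that is not a variable.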
+ while ((var_node = op.as())) { + auto var = GetRef(var_node); + DLOG(INFO) << "Var: " << var << std::endl; + auto it = var_map.find(GetRef(var_node)); + if (it != var_map.end()) { + op = it->second; + } else { + return ExprMutator::VisitExpr_(call); + } + } + + if (auto func = op.as()) { + if (func->IsPrimitive()) { + return CallNode::make(GetRef(func), call->args, call->attrs, call->type_args); + } + } + + if (auto global = op.as()) { + return CallNode::make(GetRef(global), call->args, call->attrs, call->type_args); + } + + return ExprMutator::VisitExpr_(call); + } + + Expr VisitExpr_(const FunctionNode* func) { + if (func->IsPrimitive()) { + return GetRef(func); + } else { + return ExprMutator::VisitExpr_(func); + } + } + + Function Inline(const Function& func) { + DLOG(INFO) << "Before inlining primitives: " << std::endl + << "func= " << AsText(func, false) << std::endl; + + auto inlined = FunctionNode::make(func->params, VisitExpr(func->body), func->ret_type, + func->type_params, func->attrs); + + inlined = Downcast(DeadCodeElimination(inlined)); + + DLOG(INFO) << "After inlining primitives" << std::endl + << "after_func= " << AsText(inlined, false) << std::endl; + return inlined; + } +}; + +// TODO(@jroesch): write verifier + +/* This pass will eliminate primitives which have been lifted by the ANF + * transform by inlining them directly into call sites. + * + * This makes VM-related code generation easier, as the call target is always + * a primitive function. + * + * let prim = fn(...) { ... }; + * prim(...) + * + * will become: + * + * (fn(...) { ... })(...) + */ +Module InlinePrimitives(const Module& module) { + PrimitiveInliner inliner(module); + + tvm::Map updates; + + // There is an ordering bug here. + for (auto pair : module->functions) { + auto global = pair.first; + auto func = pair.second; + updates.Set(global, inliner.Inline(func)); + } + + for (auto pair : updates) { + module->Add(pair.first, pair.second, true); + } + + return module; +} + +} // namespace vm +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/vm/lambda_lift.cc b/src/relay/backend/vm/lambda_lift.cc new file mode 100644 index 000000000000..13d8112440fb --- /dev/null +++ b/src/relay/backend/vm/lambda_lift.cc @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file tvm/relay/backend/vm/lambda_lift.cc + * \brief Lift all local functions into global functions.
+ */ + +#include +#include +#include +#include +#include +#include +#include + +using namespace tvm::runtime; + +namespace tvm { +namespace relay { +namespace vm { + +static const char* kIsClosure = "IsClosure"; + +inline std::string GenerateName(const Function& func) { + size_t hash = StructuralHash()(func); + return std::string("lifted_name") + std::to_string(hash); +} + +bool IsClosure(const Function& func) { + NodeRef res = FunctionGetAttr(func, kIsClosure); + const ir::IntImm* pval = res.as(); + return pval && pval->value != 0; +} + +Function MarkClosure(const Function& func) { + return FunctionSetAttr(func, kIsClosure, tvm::Integer(1)); +} + +struct LambdaLifter : ExprMutator { + Module module_; + std::vector> lifted_; + explicit LambdaLifter(const Module& module) : module_(module) {} + + Expr VisitExpr_(const FunctionNode* func_node) final { + auto func = GetRef(func_node); + + // We should not transform primitive functions. + if (func->IsPrimitive()) { + return std::move(func); + } + + auto free_vars = FreeVars(func); + auto free_type_vars = FreeTypeVars(func, module_); + auto body = Downcast(ExprMutator::VisitExpr_(func_node)); + + // When performing this optimization there are two + // cases. + // + // In the first case, in which we have no free variables, + // we can just lift the function into the global + // environment without needing to allocate a closure. + // + // + // The second case requires that we generate a special + // function which makes a distinction between allocating + // a closure and the code for the closure. + // + // We represent a closure allocation by lifting the + // closure to a global function which takes its + // captured arguments and then directly returns + // the function representing the closure's code. + // + // When we generate code later on, a call to the "outer" + // function marked as a closure is used to emit allocation + // code for the closure's environment. + // + // The "inner" function should be used to generate the + // code for the closure. + Function lifted_func; + if (free_vars.size() == 0) { + lifted_func = FunctionNode::make(body->params, body->body, body->ret_type, free_type_vars); + } else { + lifted_func = + FunctionNode::make(free_vars, body, func->func_type_annotation(), free_type_vars); + + lifted_func = MarkClosure(lifted_func); + } + + CHECK(lifted_func.defined()); + + auto name = GenerateName(lifted_func); + auto global = this->module_->GetGlobalVar(name); + + lifted_.push_back({global, lifted_func}); + + if (free_vars.size() == 0) { + return std::move(global); + } else { + // If we need to allocate a closure + // we pass the variables in its environment + // here. + Array fvs; + for (auto fv : free_vars) { + fvs.push_back(fv); + } + return CallNode::make(global, fvs); + } + } + + Function Lift(const Function& func) { + DLOG(INFO) << "Lifting: " << AsText(func, false) << std::endl; + return FunctionNode::make(func->params, VisitExpr(func->body), func->ret_type, + func->type_params, func->attrs); + } +}; + +/* The goal of this pass is to lift out any nested functions into top-level + * functions. + * + * We will lift the functions out into globals which take the set of the free vars + * and then return a function which closes over them. + */ +Module LambdaLift(const Module& module) { + LambdaLifter lifter(module); + + tvm::Map updates; + + // There is an ordering bug here.
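+  // Updates are staged in `updates` and applied after the loop so that
+  // module->functions is not mutated while it is being iterated.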
+ for (auto pair : module->functions) { + auto global = pair.first; + auto func = pair.second; + updates.Set(global, lifter.Lift(func)); + } + + for (auto i = lifter.lifted_.begin(); i != lifter.lifted_.end(); i++) { + module->Add(i->first, i->second); + } + + for (auto pair : updates) { + module->Add(pair.first, pair.second, true); + } + + return module; +} + +} // namespace vm +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/vm/vm.cc b/src/relay/backend/vm/vm.cc new file mode 100644 index 000000000000..34d067b9c68c --- /dev/null +++ b/src/relay/backend/vm/vm.cc @@ -0,0 +1,159 @@ + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file src/relay/backend/vm/vm.cc + * \brief The Relay virtual machine. + */ + +#include +#include +#include +#include +#include + +namespace tvm { +namespace relay { +namespace vm { + +using tvm::runtime::Object; +using tvm::runtime::ObjectTag; +using tvm::runtime::vm::VirtualMachine; + + +VirtualMachine FromModule(const Module& module, const std::vector& ctxs) { + auto vm = CompileModule(module); + vm.Init(ctxs); + return vm; +} + +Object EvaluateModule(const Module& module, const std::vector ctxs, + const std::vector& vm_args) { + VirtualMachine vm = FromModule(module, ctxs); + // TODO(zhiics): This measurement is for temporary usage. Remove it later. We + // need to introduce a better profiling method. 
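+  // Note that the timer brackets only vm.Invoke below; compilation and VM
+  // initialization in FromModule above are excluded from the reported time.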
+#if ENABLE_PROFILING + DLOG(INFO) << "Entry function is " << module->entry_func << std::endl; + auto start = std::chrono::high_resolution_clock::now(); +#endif // ENABLE_PROFILING + Object res = vm.Invoke(module->entry_func->name_hint, vm_args); +#if ENABLE_PROFILING + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end - start).count(); + LOG(INFO) << "Inference time: " << duration << "ms\n"; +#endif // ENABLE_PROFILING + return res; +} + +Value VMToValue(const relay::Module& module, const relay::Type& type, Object obj) { + CHECK(module.defined() && type.defined()); + switch (obj->tag) { + case ObjectTag::kTensor: { + CHECK(type.as()) << "VM internal error: return value must be a tensor"; + return TensorValueNode::make(ToNDArray(obj)); + } + case ObjectTag::kDatatype: { + // const auto* tuple_type + // const auto& data_type = obj.AsDatatype(); + + // tvm::Array fields; + // for (size_t i = 0; i < data_type->fields.size(); ++i) { + // fields.push_back(VMToValue(tag_index_map, data_type->fields[i])); + // } + + // return ConstructorValueNode::make(tag_index_map.at(data_type->tag), fields); + LOG(FATAL) << "fix me"; + } + default: + LOG(FATAL) << "unsupported return value of type: " << obj->tag; + return Value(); + } +} + +TVM_REGISTER_API("relay._vm._Tensor").set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = Object::Tensor(args[0]); +}); + +TVM_REGISTER_API("relay._vm._Tuple").set_body([](TVMArgs args, TVMRetValue* ret) { + std::vector fields; + for (auto i = 0; i < args.size(); i++) { + fields.push_back(args[i]); + } + *ret = Object::Tuple(fields); +}); + +template +std::string ToString(const T& t) { + std::stringstream s; + s << t; + return s.str(); +} + +TVM_REGISTER_API("relay._vm._ObjectTag").set_body([](TVMArgs args, TVMRetValue* ret) { + Object obj = args[0]; + *ret = ToString(obj->tag); +}); + +TVM_REGISTER_API("relay._vm._Datatype") +.set_body([](TVMArgs args, TVMRetValue* ret) { + int itag = args[0]; + size_t tag = static_cast(itag); + std::vector fields; + for (int i = 1; i < args.size(); i++) { + fields.push_back(args[i]); + } + + *ret = Object::Datatype(tag, fields); +}); + +TVM_REGISTER_API("relay._vm._evaluate_vm").set_body([](TVMArgs args, TVMRetValue* ret) { + NodeRef to_compile = args[0]; + TVMContext ctx; + int dev_type = args[1]; + ctx.device_type = static_cast(dev_type); + ctx.device_id = args[2]; + + Module module; + if (to_compile.as()) { + Function to_compile = args[0]; + module = ModuleNode::FromExpr(to_compile); + } else if (to_compile.as()) { + module = args[0]; + } else { + LOG(FATAL) << "expected function or module"; + } + + auto return_type = module->Lookup(module->entry_func)->ret_type; + + std::vector vm_args; + for (auto i = 3; i < args.size(); i++) { + Object obj = args[i]; + vm_args.push_back(obj); + } + + auto result = EvaluateModule(module, {ctx}, vm_args); + DLOG(INFO) << "Evaluate VM returning: result=" << result->tag; + *ret = VMToValue(module, return_type, result); +}); + +} // namespace vm +} // namespace relay +} // namespace tvm diff --git a/src/relay/op/tensor/reduce.cc b/src/relay/op/tensor/reduce.cc index b889b6ce51cd..a4ebd1e8d050 100644 --- a/src/relay/op/tensor/reduce.cc +++ b/src/relay/op/tensor/reduce.cc @@ -154,6 +154,9 @@ Array ReduceCompute(const Attrs& attrs, F f) { const ReduceAttrs* param = attrs.as(); CHECK(param != nullptr); + if (inputs[0]->shape.size() == 0) { + return { topi::identity(inputs[0]) }; + } auto axes = param->axis; if (param->exclude) { axes = 
GetExcludeAxes(inputs[0]->shape.size(), param->axis); @@ -251,7 +254,6 @@ bool ReduceRel(const Array& types, CHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; - CHECK(static_cast(data->shape.size()) != 0); std::vector&& in_shape = AsVector(data->shape); const ReduceAttrs* param = attrs.as(); diff --git a/src/relay/pass/dead_code.cc b/src/relay/pass/dead_code.cc index c5c4f333ecfe..533c21429995 100644 --- a/src/relay/pass/dead_code.cc +++ b/src/relay/pass/dead_code.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -124,7 +124,8 @@ class CalcDep : private ExprVisitor { friend CalcDep; bool HasLet(const Var& v) { - return (use_map_[v] > 1 || (use_map_[v] != 0 && letrec_set_.count(v) != 0)); + // TODO(@jroesch): MK fix me + return (use_map_[v] > 0 || (use_map_[v] != 0 && letrec_set_.count(v) != 0)); } Expr VisitExpr_(const VarNode* op) final { diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index d7ea53e75f6f..b2d326ec7792 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -118,6 +118,86 @@ Instruction::Instruction(const Instruction& instr) { } } +template +static inline void FreeIf(T* t) { + if (t != nullptr) { + delete t; + } +} + +Instruction& Instruction::operator=(const Instruction& instr) { + this->op = instr.op; + this->dst = instr.dst; + + switch (instr.op) { + case Opcode::Move: + this->from = instr.from; + return *this; + case Opcode::Select: + this->select_cond = instr.select_cond; + this->select_op1 = instr.select_op1; + this->select_op2 = instr.select_op2; + return *this; + case Opcode::Ret: + this->result = instr.result; + return *this; + case Opcode::AllocTensor: + this->shape_register = instr.shape_register; + this->dtype = instr.dtype; + return *this; + case Opcode::AllocDatatype: + this->constructor_tag = instr.constructor_tag; + this->num_fields = instr.num_fields; + FreeIf(this->datatype_fields); + this->datatype_fields = Duplicate(instr.datatype_fields, instr.num_fields); + return *this; + case Opcode::AllocClosure: + this->clo_index = instr.clo_index; + this->num_freevar = instr.num_freevar; + FreeIf(this->free_vars); + this->free_vars = Duplicate(instr.free_vars, instr.num_freevar); + return *this; + case Opcode::InvokePacked: + this->packed_index = instr.packed_index; + this->arity = instr.arity; + this->output_size = instr.output_size; + FreeIf(this->packed_args); + this->packed_args = Duplicate(instr.packed_args, instr.arity); + return *this; + case Opcode::InvokeClosure: + this->closure = instr.closure; + this->closure_args_num = instr.closure_args_num; + FreeIf(this->closure_args); + this->closure_args = Duplicate(instr.closure_args, instr.closure_args_num); + return *this; + case Opcode::Invoke: + this->func_index = instr.func_index; + this->num_args = instr.num_args; + FreeIf(this->invoke_args_registers); + this->invoke_args_registers = Duplicate(instr.invoke_args_registers, instr.num_args); + return *this; + case Opcode::If: + this->if_cond = instr.if_cond; + this->true_offset = instr.true_offset; + this->false_offset = instr.false_offset; + return *this; + case Opcode::LoadConst: + this->const_index = 
instr.const_index; + return *this; + case Opcode::GetField: + this->object = instr.object; + this->field_index = instr.field_index; + return *this; + case Opcode::Goto: + this->pc_offset = instr.pc_offset; + return *this; + default: + std::ostringstream out; + out << "Invalid instruction " << static_cast(instr.op); + throw std::runtime_error(out.str()); + } +} + Instruction::~Instruction() { switch (this->op) { case Opcode::Move: diff --git a/tests/python/relay/benchmarking/benchmark_vm.py b/tests/python/relay/benchmarking/benchmark_vm.py new file mode 100644 index 000000000000..e359ade864e2 --- /dev/null +++ b/tests/python/relay/benchmarking/benchmark_vm.py @@ -0,0 +1,133 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Benchmarking Relay VM using models from MXNet.""" +import numpy as np + +import tvm +from tvm.contrib import graph_runtime +from tvm import relay +from tvm.relay import testing + + +def benchmark_execution(net, + params, + measure=False, + data_shape=(1, 3, 224, 224), + out_shape=(1, 1000), + dtype='float32'): + def get_tvm_output(net, data, params, target, ctx, dtype='float32'): + with relay.build_config(opt_level=1): + graph, lib, params = relay.build(net, target, params=params) + + m = graph_runtime.create(graph, lib, ctx) + # set inputs + m.set_input("data", data) + m.set_input(**params) + m.run() + out = m.get_output(0, tvm.nd.empty(out_shape, dtype)) + + if measure: + print("Evaluate graph runtime inference time cost...") + ftimer = m.module.time_evaluator("run", ctx, number=1, repeat=20) + # Measure in millisecond. 
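+            # time_evaluator returns per-repeat averages in seconds, so
+            # scale by 1000 to report milliseconds below.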
+ prof_res = np.array(ftimer().results) * 1000 + print("Mean inference time (std dev): %.2f ms (%.2f ms)" % + (np.mean(prof_res), np.std(prof_res))) + + return out.asnumpy() + + def get_tvm_vm_output(net, data, params, target, ctx, dtype='float32'): + ex = relay.create_executor('vm', mod=relay.Module(), ctx=ctx) + result = ex.evaluate(net)(data, **params) + return result.asnumpy().astype(dtype) + + # random input + data = np.random.uniform(size=data_shape).astype(dtype) + target = "llvm" + ctx = tvm.cpu(0) + + tvm_out = get_tvm_output(net, tvm.nd.array(data.astype(dtype)), params, + target, ctx, dtype) + vm_out = get_tvm_vm_output(net, tvm.nd.array(data.astype(dtype)), params, + target, ctx, dtype) + tvm.testing.assert_allclose(vm_out, tvm_out, rtol=1e-5, atol=1e-5) + + +def test_mlp(): + image_shape = (1, 28, 28) + net, params = testing.mlp.get_workload(1) + benchmark_execution(net, params, data_shape=image_shape, out_shape=(1, 10)) + + +def test_vgg(): + for n in [11, 16]: + net, params = testing.vgg.get_workload(1, num_layers=n) + benchmark_execution(net, params) + + +def test_resnet(): + for n in [18, 50]: + net, params = testing.resnet.get_workload(batch_size=1, num_layers=n) + benchmark_execution(net, params, True) + + +def test_squeezenet(): + for version in ['1.0', '1.1']: + net, params = testing.squeezenet.get_workload(version=version) + benchmark_execution(net, params) + + +def test_inception_v3(): + image_shape = (3, 299, 299) + net, params = testing.inception_v3.get_workload(image_shape=image_shape) + benchmark_execution(net, params, data_shape=image_shape) + + +def test_dqn(): + image_shape = (4, 84, 84) + net, params = testing.dqn.get_workload( + batch_size=1, image_shape=image_shape) + benchmark_execution(net, params, data_shape=image_shape, out_shape=(1, 18)) + + +def test_dcgan(): + image_shape = (1, 100) + net, params = testing.dcgan.get_workload(batch_size=1) + benchmark_execution(net, params, data_shape=image_shape) + + +def test_mobilenet(): + net, params = testing.mobilenet.get_workload(batch_size=1) + benchmark_execution(net, params) + + +def test_densenet(): + net, params = testing.densenet.get_workload(batch_size=1) + benchmark_execution(net, params) + + +if __name__ == '__main__': + test_resnet() + test_vgg() + test_squeezenet() + test_mobilenet() + test_densenet() + # The following networks fail + # test_inception_v3() + # test_mlp() + # test_dqn() + # test_dcgan() diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py new file mode 100644 index 000000000000..bc99418d5da4 --- /dev/null +++ b/tests/python/relay/test_vm.py @@ -0,0 +1,264 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
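+# The tests below drive the VM through relay.create_executor('vm', ...);
+# the veval helper accepts either a bare expression or a whole module.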
+import os
+from nose.tools import nottest
+
+import tvm
+import numpy as np
+from tvm import relay
+from tvm.relay.scope_builder import ScopeBuilder
+from tvm.relay.prelude import Prelude
+
+def veval(f, *args, ctx=tvm.cpu()):
+    if isinstance(f, relay.Expr):
+        ex = relay.create_executor('vm', mod=relay.Module(), ctx=ctx)
+        if len(args) == 0:
+            return ex.evaluate(f)
+        else:
+            return ex.evaluate(f)(*args)
+    else:
+        assert isinstance(f, relay.Module), "expected expression or module"
+        mod = f
+        ex = relay.create_executor('vm', mod=mod, ctx=ctx)
+        if len(args) == 0:
+            return ex.evaluate(mod[mod.entry_func])
+        else:
+            return ex.evaluate(mod[mod.entry_func])(*args)
+
+def test_split():
+    x = relay.var('x', shape=(12,))
+    y = relay.split(x, 3, axis=0).astuple()
+    z = relay.concatenate([relay.TupleGetItem(y, 0)], axis=0)
+    f = relay.Function([x], z)
+
+    x_data = np.random.rand(12,).astype('float32')
+    res = veval(f, x_data)
+    tvm.testing.assert_allclose(res.asnumpy(), np.split(x_data, 3, axis=0)[0])
+
+def test_id():
+    x = relay.var('x', shape=(10, 10))
+    f = relay.Function([x], x)
+    x_data = np.random.rand(10, 10).astype('float64')
+    res = veval(f, x_data)
+    tvm.testing.assert_allclose(res.asnumpy(), x_data)
+
+def test_op():
+    x = relay.var('x', shape=(10, 10))
+    f = relay.Function([x], x + x)
+    x_data = np.random.rand(10, 10).astype('float32')
+    res = veval(f, x_data)
+    tvm.testing.assert_allclose(res.asnumpy(), x_data + x_data)
+
+def any(x):
+    x = relay.op.nn.batch_flatten(x)
+    return relay.op.min(x, axis=[0, 1])
+
+def test_cond():
+    x = relay.var('x', shape=(10, 10))
+    y = relay.var('y', shape=(10, 10))
+    f = relay.Function([x, y], any(relay.op.equal(x, y)))
+    x_data = np.random.rand(10, 10).astype('float32')
+    y_data = np.random.rand(10, 10).astype('float32')
+
+    # same
+    res = veval(f, x_data, x_data)
+    tvm.testing.assert_allclose(res.asnumpy(), True)
+
+    # diff
+    res = veval(f, x_data, y_data)
+    tvm.testing.assert_allclose(res.asnumpy(), False)
+
+
+def test_simple_if():
+    x = relay.var('x', shape=(10, 10))
+    y = relay.var('y', shape=(10, 10))
+    f = relay.Function([x, y],
+                       relay.If(any(relay.op.equal(x, y)), x, y))
+    x_data = np.random.rand(10, 10).astype('float32')
+    y_data = np.random.rand(10, 10).astype('float32')
+
+    # same
+    res = veval(f, x_data, x_data)
+    tvm.testing.assert_allclose(res.asnumpy(), x_data)
+
+    # diff
+    res = veval(f, x_data, y_data)
+    tvm.testing.assert_allclose(res.asnumpy(), y_data)
+
+def test_simple_call():
+    mod = relay.module.Module({})
+    sum_up = relay.GlobalVar('sum_up')
+    i = relay.var('i', shape=[], dtype='int32')
+    sb = ScopeBuilder()
+    sb.ret(i)
+    func = relay.Function([i], sb.get(), ret_type=relay.TensorType([], 'int32'))
+    mod[sum_up] = func
+    i_data = np.array(0, dtype='int32')
+    iarg = relay.var('i', shape=[], dtype='int32')
+    mod[mod.entry_func] = relay.Function([iarg], sum_up(iarg))
+    result = veval(mod, i_data)
+    tvm.testing.assert_allclose(result.asnumpy(), i_data)
+
+def test_count_loop():
+    mod = relay.module.Module({})
+    sum_up = relay.GlobalVar('sum_up')
+    i = relay.var('i', shape=[], dtype='int32')
+    sb = ScopeBuilder()
+    with sb.if_scope(relay.equal(i, relay.const(0, dtype='int32'))):
+        sb.ret(i)
+    with sb.else_scope():
+        one_less = relay.subtract(i, relay.const(1, dtype='int32'))
+        rec_call = relay.Call(sum_up, [one_less])
+        sb.ret(relay.add(rec_call, i))
+    func = relay.Function([i], sb.get(), ret_type=relay.TensorType([], 'int32'))
+    mod[sum_up] = func
+    i_data = np.array(0, dtype='int32')
+    iarg = relay.var('i', shape=[], dtype='int32')
+    mod[mod.entry_func] = relay.Function([iarg], sum_up(iarg))
+    result = veval(mod, i_data)
+    tvm.testing.assert_allclose(result.asnumpy(), i_data)
+
+def test_sum_loop():
+    mod = relay.module.Module({})
+    sum_up = relay.GlobalVar('sum_up')
+    i = relay.var('i', shape=[], dtype='int32')
+    accum = relay.var('accum', shape=[], dtype='int32')
+    sb = ScopeBuilder()
+    with sb.if_scope(relay.equal(i, relay.const(0, 'int32'))):
+        sb.ret(accum)
+    with sb.else_scope():
+        one_less = relay.subtract(i, relay.const(1, 'int32'))
+        new_accum = relay.add(accum, i)
+        sb.ret(relay.Call(sum_up, [one_less, new_accum]))
+    func = relay.Function([i, accum], sb.get())
+    mod[sum_up] = func
+    loop_bound = 0
+    i_data = np.array(loop_bound, dtype='int32')
+    accum_data = np.array(0, dtype='int32')
+    iarg = relay.var('i', shape=[], dtype='int32')
+    aarg = relay.var('accum', shape=[], dtype='int32')
+    mod[mod.entry_func] = relay.Function([iarg, aarg], sum_up(iarg, aarg))
+    result = veval(mod, i_data, accum_data)
+    tvm.testing.assert_allclose(result.asnumpy(), sum(range(1, loop_bound + 1)))
+
+def test_tuple_fst():
+    ttype = relay.TupleType([relay.TensorType((1,)), relay.TensorType((10,))])
+    tup = relay.var('tup', type_annotation=ttype)
+    f = relay.Function([tup], relay.TupleGetItem(tup, 0))
+    i_data = np.random.rand(1).astype('float32')
+    j_data = np.random.rand(10).astype('float32')
+    result = veval(f, (i_data, j_data))
+    tvm.testing.assert_allclose(result.asnumpy(), i_data)
+
+def test_tuple_second():
+    ttype = relay.TupleType([relay.TensorType((1,)), relay.TensorType((10,))])
+    tup = relay.var('tup', type_annotation=ttype)
+    f = relay.Function([tup], relay.TupleGetItem(tup, 1))
+    i_data = np.random.rand(1).astype('float32')
+    j_data = np.random.rand(10).astype('float32')
+    result = veval(f, (i_data, j_data))
+    tvm.testing.assert_allclose(result.asnumpy(), j_data)
+
+@nottest
+def test_list_constructor():
+    # TODO(wweic): implement pattern match to support this test
+    def to_list(o):
+        if isinstance(o, tvm.relay.backend.interpreter.TensorValue):
+            return [o.data.asnumpy().tolist()]
+        if isinstance(o, tvm.relay.backend.interpreter.ConstructorValue):
+            result = []
+            for f in o.fields:
+                result.extend(to_list(f))
+            return result
+
+    mod = relay.Module()
+    p = Prelude(mod)
+
+    nil = p.nil
+    cons = p.cons
+
+    one2 = cons(relay.const(1), nil())
+    one3 = cons(relay.const(2), one2)
+    one4 = cons(relay.const(3), one3)
+    f = relay.Function([], one4)
+
+    mod[mod.entry_func] = f
+
+    result = veval(mod)()
+    obj = to_list(result)
+    tvm.testing.assert_allclose(obj, np.array([3, 2, 1]))
+
+def test_let_tensor():
+    sb = relay.ScopeBuilder()
+    shape = (1,)
+    x = relay.var('x', shape=shape, dtype='float32')
+    x1 = relay.var('x1', shape=shape, dtype='float32')
+
+    x1 = sb.let(x1, x)
+    xplusone = x1 + relay.const(42.0, 'float32')
+    sb.ret(xplusone)
+    body = sb.get()
+
+    f = relay.Function([x], body)
+
+    x_data = np.random.rand(*shape).astype('float32')
+    result = veval(f, x_data)
+    tvm.testing.assert_allclose(result.asnumpy(), x_data + 42.0)
+
+def test_let_scalar():
+    sb = relay.ScopeBuilder()
+
+    x = relay.var('x', 'float32')
+    x1 = sb.let('x1', x)
+    xplusone = x1 + relay.const(42.0, 'float32')
+    sb.ret(xplusone)
+    body = sb.get()
+
+    f = relay.Function([x], body)
+
+    x_data = np.array(np.random.rand()).astype('float32')
+    result = veval(f, x_data)
+    tvm.testing.assert_allclose(result.asnumpy(), x_data + 42.0)
+
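+# The closure test below leans on nested functions: `f` captures `y` from
+# the enclosing `ff`, so `ff(1.0)` evaluates to a closure remembering
+# y = 1.0, and applying it to 2.0 yields 3.0, i.e.
+# (fn (y) { fn (x) { x + y } })(1.0)(2.0).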
+def test_closure():
+    x = relay.var('x', shape=())
+    y = relay.var('y', shape=())
+    f = relay.Function([x], x + y)
+    ff = relay.Function([y], f)
+    clo = ff(relay.const(1.0))
+    main = clo(relay.const(2.0))
+    res = veval(main)
+    tvm.testing.assert_allclose(res.asnumpy(), 3.0)
+
+if __name__ == "__main__":
+    test_id()
+    test_op()
+    test_cond()
+    test_simple_if()
+    test_simple_call()
+    test_count_loop()
+    test_sum_loop()
+    test_tuple_fst()
+    test_tuple_second()
+    test_let_scalar()
+    test_let_tensor()
+    # TODO(@jroesch): restore when match is supported
+    # test_list_constructor()
+    test_closure()
From 307f978ddf302fb1a8f3df3816db2d86c80078f6 Mon Sep 17 00:00:00 2001
From: Siva
Date: Mon, 13 May 2019 22:33:32 +0530
Subject: [PATCH 102/106] [GOLANG] Some fixes for golang latest version compiler. #3119 (#3182)

---
 golang/src/bytearray.go |  4 ++--
 golang/src/function.go  | 20 ++++++++++----------
 golang/src/module.go    |  6 +++---
 golang/src/ndarray.go   | 22 +++++++++++-----------
 4 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/golang/src/bytearray.go b/golang/src/bytearray.go
index 6fe305fc5c41..d8a4fda893a1 100644
--- a/golang/src/bytearray.go
+++ b/golang/src/bytearray.go
@@ -51,7 +51,7 @@ func (tbytearray ByteArray) nativeCPtr() (retVal uintptr) {
 // `val` is the golang string object from which the ByteArray is initialized.
 func (tbytearray ByteArray) setData(val string) {
     bufPtr := ((*C.TVMByteArray)(unsafe.Pointer(tbytearray))).data
-    if bufPtr == (*_Ctype_char)(C.NULL) {
+    if bufPtr == (*C.char)(C.NULL) {
         C.free(unsafe.Pointer(bufPtr))
     }
@@ -74,7 +74,7 @@ func (tbytearray ByteArray) getData() (retVal []byte) {
 // returns newly created ByteArray.
 func newByteArray(val []byte) (retVal ByteArray) {
     handle := ByteArray(C.malloc(C.sizeof_TVMByteArray))
-    ((*C.TVMByteArray)(unsafe.Pointer(handle))).data = (*_Ctype_char)(C.NULL)
+    ((*C.TVMByteArray)(unsafe.Pointer(handle))).data = (*C.char)(C.NULL)
     ((*C.TVMByteArray)(unsafe.Pointer(handle))).size = 0
     handle.setData(string(val))
     retVal = handle
diff --git a/golang/src/function.go b/golang/src/function.go
index 783032494829..24975c4e9e8a 100644
--- a/golang/src/function.go
+++ b/golang/src/function.go
@@ -123,7 +123,7 @@ func GetGlobalFunction(funcname string) (retVal *Function, err error) {
     cfuncname := C.CString(funcname)
     ret := (int32)(C.TVMFuncGetGlobal(cfuncname,
-                                      (*_Ctype_TVMFunctionHandle)(unsafe.Pointer(&funp))))
+                                      (*C.TVMFunctionHandle)(unsafe.Pointer(&funp))))
     C.free(unsafe.Pointer(cfuncname))
     if ret != 0 {
@@ -229,12 +229,12 @@ func nativeTVMFuncCall(funp *Function, argValues []*Value, typeCodes []int32,
                        retValues []*Value, retTypeCode *int32) (err error) {
     nargValues := nativeFromGoSlice(argValues)
     nretValues := nativeFromGoSlice(retValues)
-    result := (int32)(C.TVMFuncCall(_Ctype_TVMFunctionHandle(*funp),
-                                    (*_Ctype_TVMValue)(unsafe.Pointer(nargValues)),
-                                    (*_Ctype_int)(unsafe.Pointer(&(typeCodes[0]))),
+    result := (int32)(C.TVMFuncCall(C.TVMFunctionHandle(*funp),
+                                    (*C.TVMValue)(unsafe.Pointer(nargValues)),
+                                    (*C.int)(unsafe.Pointer(&(typeCodes[0]))),
                                     C.int(len(argValues)),
-                                    (*_Ctype_TVMValue)(unsafe.Pointer(nretValues)),
-                                    (*_Ctype_int)(unsafe.Pointer(retTypeCode))))
+                                    (*C.TVMValue)(unsafe.Pointer(nretValues)),
+                                    (*C.int)(unsafe.Pointer(retTypeCode))))
     nativeToGoSlice(nargValues, argValues, typeCodes)
     nativeToGoSlice(nretValues, retValues, (*[1<<31] int32)(unsafe.Pointer(retTypeCode))[:1:1])
     C.free(unsafe.Pointer(nargValues))
@@ -312,9 +312,9 @@ func goTVMCallback(args C.native_voidp, typeCodes C.native_voidp, numArgs int32,
     // Handle KStr, KBytes: Local finalizers shouldn't try freeing them.
     retValues[0].isLocal = false
-    apiRet := (int32) (C.TVMCFuncSetReturn(_Ctype_TVMRetValueHandle(retArg),
-                                           (*_Ctype_TVMValue)(unsafe.Pointer(nretValues)),
-                                           (*_Ctype_int)(unsafe.Pointer(&retTypeCode)), 1))
+    apiRet := (int32) (C.TVMCFuncSetReturn(C.TVMRetValueHandle(retArg),
+                                           (*C.TVMValue)(unsafe.Pointer(nretValues)),
+                                           (*C.int)(unsafe.Pointer(&retTypeCode)), 1))
     C.free(unsafe.Pointer(nretValues))
     if apiRet != 0 {
         errStr := string("TVMCFuncSetReturn failed ")
@@ -372,7 +372,7 @@ func RegisterFunction(args ...interface{}) (err error) {
     cfuncname := C.CString(funcname)
     result := (int32) (C.TVMFuncRegisterGlobal(cfuncname,
-                                               _Ctype_TVMFunctionHandle(*fhandle),
+                                               C.TVMFunctionHandle(*fhandle),
                                                0)); // Override = False
     C.free(unsafe.Pointer(cfuncname))
     if result != 0 {
diff --git a/golang/src/module.go b/golang/src/module.go
index c58590e45ee1..98b88d86766e 100644
--- a/golang/src/module.go
+++ b/golang/src/module.go
@@ -64,7 +64,7 @@ func LoadModuleFromFile(modpath string, args ...interface{}) (retVal *Module, er
     ret := (int32)(C.TVMModLoadFromFile(cmodpath,
                                         cmodtype,
-                                        (*_Ctype_TVMModuleHandle)(unsafe.Pointer(&modp))))
+                                        (*C.TVMModuleHandle)(unsafe.Pointer(&modp))))
     C.free(unsafe.Pointer(cmodpath))
     C.free(unsafe.Pointer(cmodtype))
@@ -117,10 +117,10 @@ func (tvmmodule *Module) GetFunction (
     var funp uintptr
     cfuncname := C.CString(funcname)
-    ret := (int32)(C.TVMModGetFunction((_Ctype_TVMModuleHandle)(*tvmmodule),
+    ret := (int32)(C.TVMModGetFunction((C.TVMModuleHandle)(*tvmmodule),
                                        cfuncname,
                                        C.int(queryImports),
-                                       (*_Ctype_TVMFunctionHandle)(unsafe.Pointer(&funp))))
+                                       (*C.TVMFunctionHandle)(unsafe.Pointer(&funp))))
     C.free(unsafe.Pointer(cfuncname))
     if ret != 0 {
diff --git a/golang/src/ndarray.go b/golang/src/ndarray.go
index 548e48d0fbdf..cd5e3fa9a17b 100644
--- a/golang/src/ndarray.go
+++ b/golang/src/ndarray.go
@@ -48,7 +48,7 @@ func (parray Array) nativeCPtr() (retVal uintptr) {
 }
 
 func (parray Array) nativeCopyFrom(data unsafe.Pointer, datalen int) (err error) {
-    ret := C.TVMArrayCopyFromBytes((*_Ctype_TVMArray)(unsafe.Pointer(parray.nativeCPtr())),
+    ret := C.TVMArrayCopyFromBytes((*C.TVMArray)(unsafe.Pointer(parray.nativeCPtr())),
                                    data,
                                    C.ulong(datalen))
     if ret != 0 {
@@ -66,7 +66,7 @@ func (parray Array) nativeCopyFrom(data unsafe.Pointer, datalen int) (err error)
 func (parray Array) CopyFrom(val interface{}) (err error) {
     var data unsafe.Pointer
     var datalen int
-    dtype := ((*_Ctype_TVMArray)(unsafe.Pointer(parray))).dtype
+    dtype := ((*C.TVMArray)(unsafe.Pointer(parray))).dtype
 
     switch val.(type) {
     case []int8:
@@ -127,7 +127,7 @@ func (parray Array) CopyFrom(val interface{}) (err error) {
 }
 
 func (parray Array) nativeCopyTo (data unsafe.Pointer, datalen int) (err error){
-    ret := C.TVMArrayCopyToBytes((*_Ctype_TVMArray)(unsafe.Pointer(parray.nativeCPtr())),
+    ret := C.TVMArrayCopyToBytes((*C.TVMArray)(unsafe.Pointer(parray.nativeCPtr())),
                                  unsafe.Pointer(data),
                                  C.ulong(datalen))
 
@@ -150,7 +150,7 @@ func (parray Array) AsSlice() (retVal interface{}, err error) {
     for ii := range shape {
         size *= shape[ii]
     }
-    dtype := ((*_Ctype_TVMArray)(unsafe.Pointer(parray))).dtype
+    dtype := ((*C.TVMArray)(unsafe.Pointer(parray))).dtype
 
     switch parray.GetDType() {
     case "int8":
@@ -222,13 +222,13 @@ func (parray Array) AsSlice() (retVal interface{}, err error) {
 
 // GetNdim returns the number of dimentions in Array
 func (parray Array) GetNdim() (retVal int32) {
-    retVal = int32(((*_Ctype_TVMArray)(unsafe.Pointer(parray))).ndim)
+    retVal = int32(((*C.TVMArray)(unsafe.Pointer(parray))).ndim)
     return
 }
 
 // GetShape returns the number of dimentions in Array
 func (parray Array) GetShape() (retVal []int64) {
-    shapePtr := (*C.int64_t)(((*_Ctype_TVMArray)(unsafe.Pointer(parray))).shape)
+    shapePtr := (*C.int64_t)(((*C.TVMArray)(unsafe.Pointer(parray))).shape)
     ndim := parray.GetNdim()
 
     shapeSlice := (*[1<<31] int64)(unsafe.Pointer(shapePtr))[:ndim:ndim]
@@ -239,14 +239,14 @@ func (parray Array) GetShape() (retVal []int64) {
 
 // GetDType returns the number of dimentions in Array
 func (parray Array) GetDType() (retVal string) {
-    ret := ((*_Ctype_TVMArray)(unsafe.Pointer(parray))).dtype
+    ret := ((*C.TVMArray)(unsafe.Pointer(parray))).dtype
     retVal, _ = dtypeFromTVMType(*(*pTVMType)(unsafe.Pointer(&ret)))
     return
 }
 
 // GetCtx returns the number of dimentions in Array
 func (parray Array) GetCtx() (retVal Context) {
-    ret := ((*_Ctype_TVMArray)(unsafe.Pointer(parray))).ctx
+    ret := ((*C.TVMArray)(unsafe.Pointer(parray))).ctx
     retVal = *(*Context)(unsafe.Pointer(&ret))
     return
 }
@@ -267,14 +267,14 @@ func (parray Array) GetCtx() (retVal Context) {
 func nativeTVMArrayAlloc(shape []int64, ndim int32, dtypeCode int32,
                          dtypeBits int32, dtypeLanes int32, deviceType int32,
                          deviceID int32) (retVal uintptr, err error) {
-    ret := (int32)(C.TVMArrayAlloc((*_Ctype_long)(&(shape[0])),
+    ret := (int32)(C.TVMArrayAlloc((*C.long)(&(shape[0])),
                                    C.int(ndim),
                                    C.int(dtypeCode),
                                    C.int(dtypeBits),
                                    C.int(dtypeLanes),
                                    C.int(deviceType),
                                    C.int(deviceID),
-                                   (*_Ctype_TVMArrayHandle)(unsafe.Pointer(&retVal))))
+                                   (*C.TVMArrayHandle)(unsafe.Pointer(&retVal))))
     if ret != 0 {
         err = errors.New(getTVMLastError())
         return
@@ -343,6 +343,6 @@ func Empty(shape []int64, args ...interface{}) (parray *Array, err error) {
 //
 // `ret` indicates the status of this api execution.
 func nativeTVMArrayFree(parray Array) (retVal int32) {
-    retVal = (int32)(C.TVMArrayFree((*_Ctype_TVMArray)(unsafe.Pointer(parray.nativeCPtr()))))
+    retVal = (int32)(C.TVMArrayFree((*C.TVMArray)(unsafe.Pointer(parray.nativeCPtr()))))
     return
 }
From 0045a3fc719c8c1f3eca911cd36163b2b98e1a26 Mon Sep 17 00:00:00 2001
From: Oldpan <295484914@qq.com>
Date: Tue, 14 May 2019 02:03:41 +0800
Subject: [PATCH 103/106] Fix a bug of flatten in ONNX to Relay converter
 (#3180)

* fix onnx frontend flatten bug
* Update onnx.py
* Update onnx.py
* Update onnx.py
---
 python/tvm/relay/frontend/onnx.py          | 19 ++++++++++++++++-
 tests/python/frontend/onnx/test_forward.py | 24 ++++++++++++++++++++++
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index b4d36306c85d..eba02e70c865 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -335,6 +335,23 @@ class Reciprocal(OnnxOpConverter):
     def _impl_v1(cls, inputs, attr, params):
         return _expr.const(1.0) / inputs[0]
 
+
+class Flatten(OnnxOpConverter):
+    """ Operator converter for Flatten.
+    """
+
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        axis = attr.get('axis', 1)
+        if axis == 1:
+            out = _op.nn.batch_flatten(inputs[0])
+        else:
+            newshape = [0] * (axis + 1)
+            newshape[axis] = -1
+            out = _op.reshape(inputs[0], list(newshape))
+        return out
+
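+# NB: in Relay's reshape a special value of 0 copies the corresponding
+# input dimension and -1 infers the remainder, so for axis=2 an input of
+# shape (1, 3, 4, 4) maps through newshape [0, 0, -1] to (1, 3, 16).
+# ONNX Flatten, by contrast, always yields a 2-D result ((3, 16) here),
+# so the conversion above matches ONNX semantics exactly only on the
+# axis == 1 batch_flatten fast path.
+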
""" @@ -850,7 +867,7 @@ def _get_convert_map(opset): # 'InstanceNormalization' # 'LpNormalization' 'Dropout': AttrCvt('dropout', {'ratio': 'rate'}, ignores=['is_test']), - 'Flatten': Renamer('batch_flatten'), + 'Flatten': Flatten.get_converter(opset), 'LRN': LRN.get_converter(opset), # defs/reduction diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 7be6bb611e9a..f867e73e8c08 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -211,6 +211,29 @@ def test_squeeze(): tvm.testing.assert_allclose(out_shape, tvm_out.shape) +def test_flatten(): + + in_shape = (1, 3, 4, 4) + axis = 1 + ref_shape = (1, 48) + + flatten_node = helper.make_node("Flatten", ["in"], ["out"], axis = axis) + + graph = helper.make_graph([flatten_node], + "flatten_test", + inputs = [helper.make_tensor_value_info("in", + TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(ref_shape))]) + + model = helper.make_model(graph, producer_name='flatten_test') + + for target, ctx in ctx_list(): + x = np.random.uniform(size=in_shape).astype('int32') + tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, 'float32') + + tvm.testing.assert_allclose(ref_shape, tvm_out.shape) + def test_unsqueeze(): in_shape = (3, 3) axis = (0, 3, 4) @@ -1046,6 +1069,7 @@ def test_LogSoftmax(): {'axis': 1}) if __name__ == '__main__': + test_flatten() test_reshape() test_shape() test_power() From fd1e26a7db371ecae52abe933e7d5b481f2517bf Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Mon, 13 May 2019 12:40:15 -0700 Subject: [PATCH 104/106] Update dmlc_tvm_commit_id --- dmlc_tvm_commit_id | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dmlc_tvm_commit_id b/dmlc_tvm_commit_id index 44fb847332d7..9eb98e3754f0 100644 --- a/dmlc_tvm_commit_id +++ b/dmlc_tvm_commit_id @@ -1 +1 @@ -57f47a17f266e4123be49b84b5caf6a143d2544a \ No newline at end of file +25c91d34c4de744cc9428944ccb1e84a72476ce5 From c6f8342fbc05f3498257836f6e048f5d99d1c250 Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Mon, 13 May 2019 13:33:14 -0700 Subject: [PATCH 105/106] Fix file type --- dmlc_tvm_commit_id => dmlc_tvm_commit_id.txt | 0 neo-tools/sync-with-dmlc.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename dmlc_tvm_commit_id => dmlc_tvm_commit_id.txt (100%) diff --git a/dmlc_tvm_commit_id b/dmlc_tvm_commit_id.txt similarity index 100% rename from dmlc_tvm_commit_id rename to dmlc_tvm_commit_id.txt diff --git a/neo-tools/sync-with-dmlc.py b/neo-tools/sync-with-dmlc.py index 3369ad49bf26..9bc7a0b5ad35 100755 --- a/neo-tools/sync-with-dmlc.py +++ b/neo-tools/sync-with-dmlc.py @@ -31,7 +31,7 @@ def add_remote(repo, name, url): """ def main(): args = parse_args() - last_commit_file = 'dmlc_tvm_commit_id' + last_commit_file = 'dmlc_tvm_commit_id.txt' last_commit = None with open(last_commit_file) as f: last_commit = f.read().strip() From a8e679333b62b684a6e947867183b61341e9783c Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Mon, 13 May 2019 13:48:34 -0700 Subject: [PATCH 106/106] Add Apache licence header --- benchmark/tensorrt/run_mxnet.py | 17 +++++++++++++++++ benchmark/tensorrt/run_tvm.py | 17 +++++++++++++++++ benchmark/tensorrt/run_tvm.sh | 16 ++++++++++++++++ cmake/modules/contrib/TensorRT.cmake | 17 +++++++++++++++++ neo-tools/sync-with-dmlc.py | 17 +++++++++++++++++ nnvm/include/nnvm/c_api_subgraph.h | 19 +++++++++++++++++++ nnvm/python/nnvm/subgraph.py | 17 +++++++++++++++++ 
 nnvm/src/c_api/c_api_subgraph.cc              | 19 +++++++++++++++++++
 nnvm/src/pass/device_copy_op.cc               | 19 +++++++++++++++++++
 nnvm/src/pass/graph_annotate.cc               | 19 +++++++++++++++++++
 nnvm/src/pass/graph_annotate.h                | 19 +++++++++++++++++++
 nnvm/src/pass/insert_copy_op.cc               | 19 +++++++++++++++++++
 nnvm/src/pass/subgraph/partition_graph.cc     | 19 +++++++++++++++++++
 .../subgraph/tensorrt_subgraph_property.cc    | 19 +++++++++++++++++++
 nnvm/src/top/subgraph/tensorrt_subgraph_op.cc | 19 +++++++++++++++++++
 nnvm/tests/cpp/op_fallback_test.cc            | 19 +++++++++++++++++++
 .../python/unittest/test_graph_annotation.py  | 17 +++++++++++++++++
 src/contrib/subgraph/subgraph.h               | 19 +++++++++++++++++++
 src/contrib/subgraph/tensorrt_executor.cc     | 19 +++++++++++++++++++
 src/contrib/subgraph/tensorrt_executor.h      | 19 +++++++++++++++++++
 tests/python/tensorrt/test_cross_compile.py   | 17 +++++++++++++++++
 tests/python/tensorrt/test_tensorrt.py        | 17 +++++++++++++++++
 22 files changed, 399 insertions(+)

diff --git a/benchmark/tensorrt/run_mxnet.py b/benchmark/tensorrt/run_mxnet.py
index 925b650e5751..887d6e898417 100644
--- a/benchmark/tensorrt/run_mxnet.py
+++ b/benchmark/tensorrt/run_mxnet.py
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 import os
 import time
 import numpy as np
diff --git a/benchmark/tensorrt/run_tvm.py b/benchmark/tensorrt/run_tvm.py
index 2abbb4a1277e..1cdf825a936e 100644
--- a/benchmark/tensorrt/run_tvm.py
+++ b/benchmark/tensorrt/run_tvm.py
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 import time
 import numpy as np
 import argparse
diff --git a/benchmark/tensorrt/run_tvm.sh b/benchmark/tensorrt/run_tvm.sh
index 11a29963db0f..08b0b5c4387a 100644
--- a/benchmark/tensorrt/run_tvm.sh
+++ b/benchmark/tensorrt/run_tvm.sh
@@ -1,3 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
 #!/usr/bin/env bash
 
 declare -a models=(
diff --git a/cmake/modules/contrib/TensorRT.cmake b/cmake/modules/contrib/TensorRT.cmake
index 11e62477b2cd..7c52cd77abdf 100644
--- a/cmake/modules/contrib/TensorRT.cmake
+++ b/cmake/modules/contrib/TensorRT.cmake
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 # TensorRT Module
 
 if(IS_DIRECTORY ${USE_TENSORRT})
diff --git a/neo-tools/sync-with-dmlc.py b/neo-tools/sync-with-dmlc.py
index 9bc7a0b5ad35..9147ba092af1 100755
--- a/neo-tools/sync-with-dmlc.py
+++ b/neo-tools/sync-with-dmlc.py
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 import argparse
 
 import logging
diff --git a/nnvm/include/nnvm/c_api_subgraph.h b/nnvm/include/nnvm/c_api_subgraph.h
index 9ae82951cb40..b2f70e3a0b4d 100644
--- a/nnvm/include/nnvm/c_api_subgraph.h
+++ b/nnvm/include/nnvm/c_api_subgraph.h
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 /*!
  * Copyright (c) 2018 by Contributors
  * \file c_api_subgraph.h
diff --git a/nnvm/python/nnvm/subgraph.py b/nnvm/python/nnvm/subgraph.py
index 4dfccd778143..b044b78994ea 100644
--- a/nnvm/python/nnvm/subgraph.py
+++ b/nnvm/python/nnvm/subgraph.py
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 # coding: utf-8
 # pylint: disable=invalid-name, protected-access, too-many-arguments, too-many-lines
 """NNVM subgraph.
diff --git a/nnvm/src/c_api/c_api_subgraph.cc b/nnvm/src/c_api/c_api_subgraph.cc
index ac5215e62b08..71446345ebce 100644
--- a/nnvm/src/c_api/c_api_subgraph.cc
+++ b/nnvm/src/c_api/c_api_subgraph.cc
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 /*!
  * Copyright (c) 2018 by Contributors
  * \file c_api_subgraph.cc
diff --git a/nnvm/src/pass/device_copy_op.cc b/nnvm/src/pass/device_copy_op.cc
index d0ad107f31f6..1970791f18c5 100644
--- a/nnvm/src/pass/device_copy_op.cc
+++ b/nnvm/src/pass/device_copy_op.cc
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 /*
  * Copyright (c) 2018 by Contributors
  * \file device_copy_op.h
diff --git a/nnvm/src/pass/graph_annotate.cc b/nnvm/src/pass/graph_annotate.cc
index 29e65f20b1e6..9e2a20a39e93 100644
--- a/nnvm/src/pass/graph_annotate.cc
+++ b/nnvm/src/pass/graph_annotate.cc
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 /*!
  * Copyright (c) 2018 by Contributors
  * \file graph_annotate.cc
diff --git a/nnvm/src/pass/graph_annotate.h b/nnvm/src/pass/graph_annotate.h
index 80eca756b386..37b2a165661f 100644
--- a/nnvm/src/pass/graph_annotate.h
+++ b/nnvm/src/pass/graph_annotate.h
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 /*!
  * Copyright (c) 2018 by Contributors
  * \file graph_annotate.h
diff --git a/nnvm/src/pass/insert_copy_op.cc b/nnvm/src/pass/insert_copy_op.cc
index e54b37c5290a..fd4dabc2d420 100644
--- a/nnvm/src/pass/insert_copy_op.cc
+++ b/nnvm/src/pass/insert_copy_op.cc
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 /*!
  * Copyright (c) 2018 by Contributors
  * \file insert_copy_op.cc
diff --git a/nnvm/src/pass/subgraph/partition_graph.cc b/nnvm/src/pass/subgraph/partition_graph.cc
index 654ec3b767e4..61c5b0265450 100644
--- a/nnvm/src/pass/subgraph/partition_graph.cc
+++ b/nnvm/src/pass/subgraph/partition_graph.cc
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 /*!
  * Copyright (c) 2018 by Contributors
  * \file partition_graph.cc
diff --git a/nnvm/src/pass/subgraph/tensorrt_subgraph_property.cc b/nnvm/src/pass/subgraph/tensorrt_subgraph_property.cc
index aaca364e8e5f..194961be12ef 100644
--- a/nnvm/src/pass/subgraph/tensorrt_subgraph_property.cc
+++ b/nnvm/src/pass/subgraph/tensorrt_subgraph_property.cc
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 /*!
  * Copyright (c) 2018 by Contributors
  * \file tensorrt_subgraph_property.cc
diff --git a/nnvm/src/top/subgraph/tensorrt_subgraph_op.cc b/nnvm/src/top/subgraph/tensorrt_subgraph_op.cc
index 48eaa406c489..cc8f24d6aa2f 100644
--- a/nnvm/src/top/subgraph/tensorrt_subgraph_op.cc
+++ b/nnvm/src/top/subgraph/tensorrt_subgraph_op.cc
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 /*!
  * Copyright (c) 2018 by Contributors
  * \file tensorrt_subgraph_op.cc
diff --git a/nnvm/tests/cpp/op_fallback_test.cc b/nnvm/tests/cpp/op_fallback_test.cc
index c1d9953fa05c..477d9be6626d 100644
--- a/nnvm/tests/cpp/op_fallback_test.cc
+++ b/nnvm/tests/cpp/op_fallback_test.cc
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 #include
 #include
 #include
diff --git a/nnvm/tests/python/unittest/test_graph_annotation.py b/nnvm/tests/python/unittest/test_graph_annotation.py
index a96c36d1c547..1408e4217b61 100644
--- a/nnvm/tests/python/unittest/test_graph_annotation.py
+++ b/nnvm/tests/python/unittest/test_graph_annotation.py
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 """Unit tests for graph annotation."""
 import time
 import zipfile
diff --git a/src/contrib/subgraph/subgraph.h b/src/contrib/subgraph/subgraph.h
index 16e42e3d4e0f..92d115f4dc39 100644
--- a/src/contrib/subgraph/subgraph.h
+++ b/src/contrib/subgraph/subgraph.h
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 /*!
  * Copyright (c) 2018 by Contributors
  *
diff --git a/src/contrib/subgraph/tensorrt_executor.cc b/src/contrib/subgraph/tensorrt_executor.cc
index ee490cf9914d..d25f62c2070e 100644
--- a/src/contrib/subgraph/tensorrt_executor.cc
+++ b/src/contrib/subgraph/tensorrt_executor.cc
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 /*!
  * Copyright (c) 2018 by Contributors
 *
diff --git a/src/contrib/subgraph/tensorrt_executor.h b/src/contrib/subgraph/tensorrt_executor.h
index d8a278b68cbc..8aa759428066 100644
--- a/src/contrib/subgraph/tensorrt_executor.h
+++ b/src/contrib/subgraph/tensorrt_executor.h
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 /*!
  * Copyright (c) 2018 by Contributors
 *
diff --git a/tests/python/tensorrt/test_cross_compile.py b/tests/python/tensorrt/test_cross_compile.py
index e1d8d1293e79..07d39aab81ec 100644
--- a/tests/python/tensorrt/test_cross_compile.py
+++ b/tests/python/tensorrt/test_cross_compile.py
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 import time
 import numpy as np
 import argparse
diff --git a/tests/python/tensorrt/test_tensorrt.py b/tests/python/tensorrt/test_tensorrt.py
index 496d8b7f8fea..12f7972d60ee 100644
--- a/tests/python/tensorrt/test_tensorrt.py
+++ b/tests/python/tensorrt/test_tensorrt.py
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 import random
 import logging
 logging.basicConfig(level=logging.INFO)