diff --git a/.gitignore b/.gitignore index e44aa6e21464..042b96841ce5 100644 --- a/.gitignore +++ b/.gitignore @@ -216,3 +216,14 @@ patched.txt # Python type checking .mypy_cache/ .pyre/ + +# pipenv file +Pipfile +Pipfile.lock + +# conda package artifacts +conda/Dockerfile.cuda* +conda/pkg + +.envrc +*.nix \ No newline at end of file diff --git a/3rdparty/HalideIR b/3rdparty/HalideIR index a768f2f06279..32057b53eee8 160000 --- a/3rdparty/HalideIR +++ b/3rdparty/HalideIR @@ -1 +1 @@ -Subproject commit a768f2f0627917659a4d7167eee3190469b9d164 +Subproject commit 32057b53eee870d73c6c21dc820d6546b4d9a13f diff --git a/3rdparty/bfloat16/bfloat16.cc b/3rdparty/bfloat16/bfloat16.cc new file mode 100644 index 000000000000..1f25be17f72d --- /dev/null +++ b/3rdparty/bfloat16/bfloat16.cc @@ -0,0 +1,80 @@ +/* + Copyright (c) 2019 by Contributors + \file tvm/src/codegen/custom_datatypes/mybfloat16.cc + \brief Small bfloat16 library for use in unittests + + Code originally from TensorFlow; taken and simplified. Original license: + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ ==============================================================================*/ + +#include <cmath> +#include <cstddef> +#include <cstdint> + +void FloatToBFloat16(const float* src, uint16_t* dst, size_t size) { + const uint16_t* p = reinterpret_cast<const uint16_t*>(src); + uint16_t* q = reinterpret_cast<uint16_t*>(dst); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + for (; size != 0; p += 2, q++, size--) { + *q = p[0]; + } +#else + for (; size != 0; p += 2, q++, size--) { + *q = p[1]; + } +#endif +} + +void BFloat16ToFloat(const uint16_t* src, float* dst, size_t size) { + const uint16_t* p = reinterpret_cast<const uint16_t*>(src); + uint16_t* q = reinterpret_cast<uint16_t*>(dst); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + for (; size != 0; p++, q += 2, size--) { + q[0] = *p; + q[1] = 0; + } +#else + for (; size != 0; p++, q += 2, size--) { + q[0] = 0; + q[1] = *p; + } +#endif +} + +void BFloat16Add(const uint16_t* a, const uint16_t* b, uint16_t* dst, + size_t size) { + float a_f, b_f; + BFloat16ToFloat(a, &a_f, 1); + BFloat16ToFloat(b, &b_f, 1); + float out_f = a_f + b_f; + FloatToBFloat16(&out_f, dst, 1); +} + +extern "C" { +TVM_DLL uint16_t FloatToBFloat16_wrapper(float in) { + uint16_t out; + FloatToBFloat16(&in, &out, 1); + return out; +} + +TVM_DLL float BFloat16ToFloat_wrapper(uint16_t in) { + float out; + BFloat16ToFloat(&in, &out, 1); + return out; +} + +TVM_DLL uint16_t BFloat16Add_wrapper(uint16_t a, uint16_t b) { + uint16_t out; + BFloat16Add(&a, &b, &out, 1); + return out; +} +} diff --git a/3rdparty/dlpack b/3rdparty/dlpack index 5c792cef3aee..0acb731e0e43 160000 --- a/3rdparty/dlpack +++ b/3rdparty/dlpack @@ -1 +1 @@ -Subproject commit 5c792cef3aee54ad8b7000111c9dc1797f327b59 +Subproject commit 0acb731e0e43d15deee27b66f10e4c5b4e667913 diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 82bf4c2e2af3..3943914eed66 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 82bf4c2e2af312b3d52513aa727483803a2f8734 +Subproject commit 3943914eed66470bd010df581e29e4dca4f7df6f diff --git 
a/CMakeLists.txt b/CMakeLists.txt index dceb9f46568e..80b121477631 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,13 @@ tvm_option(USE_SGX "Build with SGX" OFF) tvm_option(USE_RTTI "Build with RTTI" ON) tvm_option(USE_MSVC_MT "Build with MT" OFF) tvm_option(INSTALL_DEV "Install compiler infrastructure" OFF) +tvm_option(HIDE_PRIVATE_SYMBOLS "Compile with -fvisibility=hidden." OFF) + +# 3rdparty libraries +tvm_option(DLPACK_PATH "Path to DLPACK" "3rdparty/dlpack/include") +tvm_option(DMLC_PATH "Path to DMLC" "3rdparty/dmlc-core/include") +tvm_option(RANG_PATH "Path to RANG" "3rdparty/rang/include") +tvm_option(COMPILER_RT_PATH "Path to COMPILER-RT" "3rdparty/compiler-rt") # Contrib library options tvm_option(USE_BLAS "The blas library to be linked" none) @@ -52,11 +59,12 @@ tvm_option(USE_TENSORRT "Build with TensorRT, must have CUDA and CUDNN enabled" tvm_option(USE_ANTLR "Build with ANTLR for Relay parsing" OFF) # include directories +include_directories(${CMAKE_INCLUDE_PATH}) include_directories("include") -include_directories("3rdparty/dlpack/include") -include_directories("3rdparty/dmlc-core/include") -include_directories("3rdparty/rang/include") -include_directories("3rdparty/compiler-rt") +include_directories(${DLPACK_PATH}) +include_directories(${DMLC_PATH}) +include_directories(${RANG_PATH}) +include_directories(${COMPILER_RT_PATH}) # initial variables set(TVM_LINKER_LIBS "") @@ -90,8 +98,13 @@ else(MSVC) set(CMAKE_C_FLAGS "-O0 -g -Wall -fPIC ${CMAKE_C_FLAGS} -rdynamic") set(CMAKE_CXX_FLAGS "-O0 -g -Wall -fPIC -std=c++11 ${CMAKE_CXX_FLAGS} -rdynamic") else() - set(CMAKE_C_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden ${CMAKE_C_FLAGS}") - set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden -std=c++11 ${CMAKE_CXX_FLAGS}") + set(CMAKE_C_FLAGS "-O2 -Wall -fPIC ${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC -std=c++11 ${CMAKE_CXX_FLAGS}") + if (HIDE_PRIVATE_SYMBOLS) + message("Hide private symbols...") + set(CMAKE_C_FLAGS 
"-fvisibility=hidden ${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "-fvisibility=hidden ${CMAKE_CXX_FLAGS}") + endif(HIDE_PRIVATE_SYMBOLS) endif () if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) @@ -123,6 +136,8 @@ file(GLOB_RECURSE RELAY_SRCS ) list(APPEND COMPILER_SRCS ${RELAY_SRCS}) +file(GLOB DATATYPE_SRCS src/codegen/datatype/*.cc) +list(APPEND COMPILER_SRCS ${DATATYPE_SRCS}) if(NOT MSVC) file(GLOB COMPILER_VERILOG_SRCS src/codegen/verilog/*.cc) @@ -152,6 +167,8 @@ if(NOT USE_RTTI) add_definitions(-DDMLC_ENABLE_RTTI=0) endif() +list(APPEND RUNTIME_SRCS 3rdparty/bfloat16/bfloat16.cc) + if(USE_RPC) message(STATUS "Build with RPC support...") file(GLOB RUNTIME_RPC_SRCS src/runtime/rpc/*.cc) @@ -209,6 +226,7 @@ add_library(tvm_runtime_static STATIC ${RUNTIME_SRCS}) if(USE_RELAY_DEBUG) message(STATUS "Building Relay in debug mode...") set_target_properties(tvm PROPERTIES COMPILE_DEFINITIONS "USE_RELAY_DEBUG") +else() set_target_properties(tvm PROPERTIES COMPILE_DEFINITIONS "NDEBUG") endif(USE_RELAY_DEBUG) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 4d4515e09410..2f4e0b65190b 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -55,7 +55,8 @@ We do encourage everyone to work anything they are interested in. - [Zhixun Tan](https://github.com/phisiart): @phisiart - opengl, web - [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi - [Yao Wang](https://github.com/kevinthesun): @kevinthesun: - topi, vision -- [Eddie Yan](https://github.com/eqy): @eqy - runtime, autotvm, rpc, topi +- [Jian Weng](https://github.com/were): @were: - hybrid script +- [Eddie Yan](https://github.com/eqy) (PMC): @eqy - runtime, autotvm, rpc, topi - [Lianmin Zheng](https://github.com/merrymercy) (PMC): @merrymercy - autotvm, topi, relay ## Reviewers @@ -82,6 +83,7 @@ We do encourage everyone to work anything they are interested in. 
- [Kazutaka Morita](https://github.com/kazum): @kazum - [Tatsuya Nishiyama](https://github.com/nishi-t): @nishi-t - [Pariksheet Pinjari](https://github.com/PariksheetPinjari909): @PariksheetPinjari909 +- [Josh Pollock](https://github.com/joshpoll): @joshpoll - [Jared Roesch](https://github.com/jroesch): @jroesch - [Siva](https://github.com/srkreddy1238): @srkreddy1238 - [Siju Samuel](https://github.com/siju-samuel): @siju-samuel @@ -94,6 +96,7 @@ We do encourage everyone to work anything they are interested in. - [Leyuan Wang](https://github.com/Laurawly): @Laurawly - [Jian Weng](https://github.com/were): @were - [Zhao Wu](https://github.com/FrozenGene): @FrozenGene +- [Bing Xu](https://github.com/antinucleon): @antinucleon - [Eddie Yan](https://github.com/eqy): @eqy - [Joshua Z. Zhang](https://github.com/zhreshold): @zhreshold - [Lianmin Zheng](https://github.com/merrymercy): @merrymercy diff --git a/Jenkinsfile b/Jenkinsfile index af5a2ce3eb42..3b6fc032a389 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -39,7 +39,7 @@ // - Periodically cleanup the old versions on local workers // ci_lint = "tvmai/ci-lint:v0.51" -ci_gpu = "tvmai/ci-gpu:v0.51" +ci_gpu = "tvmai/ci-gpu:v0.52" ci_cpu = "tvmai/ci-cpu:v0.50" ci_i386 = "tvmai/ci-i386:v0.50" @@ -135,6 +135,7 @@ stage('Build') { echo set\\(USE_CUDNN ON\\) >> config.cmake echo set\\(USE_CUDA ON\\) >> config.cmake echo set\\(USE_OPENGL ON\\) >> config.cmake + echo set\\(USE_MICRO ON\\) >> config.cmake echo set\\(USE_LLVM llvm-config-6.0\\) >> config.cmake echo set\\(USE_NNPACK ON\\) >> config.cmake echo set\\(NNPACK_PATH /NNPACK/build/\\) >> config.cmake @@ -157,6 +158,7 @@ stage('Build') { echo set\\(USE_OPENCL ON\\) >> config.cmake echo set\\(USE_ROCM ON\\) >> config.cmake echo set\\(USE_VULKAN ON\\) >> config.cmake + echo set\\(USE_MICRO ON\\) >> config.cmake echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake echo set\\(CMAKE_CXX_COMPILER clang-6.0\\) >> config.cmake echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> 
config.cmake @@ -174,12 +176,14 @@ stage('Build') { cd build cp ../cmake/config.cmake . echo set\\(USE_SORT ON\\) >> config.cmake + echo set\\(USE_MICRO ON\\) >> config.cmake echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake echo set\\(USE_LLVM llvm-config-4.0\\) >> config.cmake echo set\\(USE_NNPACK ON\\) >> config.cmake echo set\\(NNPACK_PATH /NNPACK/build/\\) >> config.cmake echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake + echo set\\(HIDE_PRIVATE_SYMBOLS ON\\) >> config.cmake """ make(ci_cpu, 'build', '-j4') pack_lib('cpu', tvm_lib) @@ -202,6 +206,7 @@ stage('Build') { cd build cp ../cmake/config.cmake . echo set\\(USE_SORT ON\\) >> config.cmake + echo set\\(USE_MICRO ON\\) >> config.cmake echo set\\(USE_RPC ON\\) >> config.cmake echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake echo set\\(USE_LLVM llvm-config-5.0\\) >> config.cmake @@ -277,6 +282,17 @@ stage('Integration Test') { } } }, + 'legacy: GPU': { + node('GPU') { + ws('workspace/tvm/legacy-python-gpu') { + init_git() + unpack_lib('gpu', tvm_multilib) + timeout(time: max_time, unit: 'MINUTES') { + sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_legacy.sh" + } + } + } + }, 'docs: GPU': { node('GPU') { ws('workspace/tvm/docs-python-gpu') { diff --git a/apps/android_rpc/README.md b/apps/android_rpc/README.md index 36ace85405c0..38725917f424 100644 --- a/apps/android_rpc/README.md +++ b/apps/android_rpc/README.md @@ -141,7 +141,7 @@ export TVM_NDK_CC=/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++ python android_rpc_test.py ``` -This will compile TVM IR to shared libraries (CPU, OpenCL and Vulkan) and run vector addition on your Android device. 
To verify compiled TVM IR shared libraries on OpenCL target set [`'test_opencl = True'`](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py#L25) and on Vulkan target set [`'test_vulkan = False'`](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py#L27) in [tests/android_rpc_test.py](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py), by default on CPU target will execute. +This will compile TVM IR to shared libraries (CPU, OpenCL and Vulkan) and run vector addition on your Android device. To verify compiled TVM IR shared libraries on OpenCL target set `'test_opencl = True'` and on Vulkan target set `'test_vulkan = True'` in [tests/android_rpc_test.py](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py), by default on CPU target will execute. On my test device, it gives following results. ```bash diff --git a/cmake/config.cmake b/cmake/config.cmake index e7ddb9aba6b8..679de8d7e752 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -135,9 +135,6 @@ set(USE_TENSORRT OFF) # Build ANTLR parser for Relay text format set(USE_ANTLR OFF) -# Build TSIM for VTA -set(USE_VTA_TSIM OFF) - # Whether use Relay debug mode set(USE_RELAY_DEBUG OFF) diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index 1adb0aaf387a..6d5ea000edc2 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -29,8 +29,7 @@ elseif(PYTHON) --use-cfg=${CMAKE_CURRENT_BINARY_DIR}/vta_config.json) endif() - execute_process(COMMAND ${VTA_CONFIG} --target OUTPUT_VARIABLE __vta_target) - string(STRIP ${__vta_target} VTA_TARGET) + execute_process(COMMAND ${VTA_CONFIG} --target OUTPUT_VARIABLE VTA_TARGET OUTPUT_STRIP_TRAILING_WHITESPACE) message(STATUS "Build VTA runtime with target: " ${VTA_TARGET}) @@ -44,6 +43,13 @@ elseif(PYTHON) add_library(vta SHARED ${VTA_RUNTIME_SRCS}) + if(${VTA_TARGET} STREQUAL "tsim") + target_compile_definitions(vta PUBLIC 
USE_TSIM) + include_directories("vta/include") + file(GLOB RUNTIME_DPI_SRCS vta/src/dpi/module.cc) + list(APPEND RUNTIME_SRCS ${RUNTIME_DPI_SRCS}) + endif() + target_include_directories(vta PUBLIC vta/include) foreach(__def ${VTA_DEFINITIONS}) @@ -55,18 +61,12 @@ elseif(PYTHON) set_target_properties(vta PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") endif(APPLE) - # PYNQ rules for Pynq v2.3 + # PYNQ rules for Pynq v2.4 if(${VTA_TARGET} STREQUAL "pynq") find_library(__cma_lib NAMES cma PATH /usr/lib) target_link_libraries(vta ${__cma_lib}) endif() - if(NOT USE_VTA_TSIM STREQUAL "OFF") - include_directories("vta/include") - file(GLOB RUNTIME_DPI_SRCS vta/src/dpi/module.cc) - list(APPEND RUNTIME_SRCS ${RUNTIME_DPI_SRCS}) - endif() - else() message(STATUS "Cannot found python in env, VTA build is skipped..") endif() diff --git a/cmake/modules/contrib/BLAS.cmake b/cmake/modules/contrib/BLAS.cmake index e1e151d6a9f8..a47f83771d37 100644 --- a/cmake/modules/contrib/BLAS.cmake +++ b/cmake/modules/contrib/BLAS.cmake @@ -27,7 +27,11 @@ elseif(USE_BLAS STREQUAL "mkl") if(NOT IS_DIRECTORY ${USE_MKL_PATH}) set(USE_MKL_PATH /opt/intel/mkl) endif() - find_library(BLAS_LIBRARY NAMES mkl_rt mklml_gnu HINTS ${USE_MKL_PATH}/lib/ ${USE_MKL_PATH}/lib/intel64) + if(APPLE) + find_library(BLAS_LIBRARY NAMES mklml HINTS ${USE_MKL_PATH}/lib/ ${USE_MKL_PATH}/lib/intel64) + elseif(UNIX) + find_library(BLAS_LIBRARY NAMES mkl_rt mklml_gnu HINTS ${USE_MKL_PATH}/lib/ ${USE_MKL_PATH}/lib/intel64) + endif() include_directories(${USE_MKL_PATH}/include) list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY}) list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC}) diff --git a/conda/Dockerfile.cuda100 b/conda/Dockerfile.template similarity index 72% rename from conda/Dockerfile.cuda100 rename to conda/Dockerfile.template index def8c9ac5d6a..59b9ac96814e 100644 --- a/conda/Dockerfile.cuda100 +++ b/conda/Dockerfile.template @@ -15,7 +15,13 @@ # specific language governing permissions and limitations # under the 
License. -FROM nvidia/cuda:10.0-devel-centos6 +FROM nvidia/cuda:{{ cuda_version }}-devel-centos6 + +RUN curl -fsSL http://developer.download.nvidia.com/compute/redist/cudnn/v{{ cudnn_short_version }}/cudnn-{{ cuda_version }}-linux-x64-v{{ cudnn_version }}.tgz -O && \ + tar --no-same-owner -xzf cudnn-{{ cuda_version }}-linux-x64-v{{ cudnn_version }}.tgz -C /usr/local && \ + rm cudnn-{{ cuda_version }}-linux-x64-v{{ cudnn_version }}.tgz && \ + ldconfig + RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ chmod +x ~/miniconda.sh && \ @@ -30,4 +36,4 @@ ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 WORKDIR /workspace RUN chmod -R a+w /workspace -CMD conda build --output-folder /workspace/conda/pkg --variants '{cuda: True, cuda_version: 10.0}' /workspace/conda/tvm-libs +CMD conda build --output-folder /workspace/conda/pkg --variants '{cuda: True, cuda_version: {{ cuda_version }}}' /workspace/conda/tvm-libs diff --git a/conda/build_cuda.sh b/conda/Makefile old mode 100755 new mode 100644 similarity index 66% rename from conda/build_cuda.sh rename to conda/Makefile index 2f3207e22987..cda546ac73ce --- a/conda/build_cuda.sh +++ b/conda/Makefile @@ -5,22 +5,18 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-#/bin/sh -condadir=`dirname $0` -condadir=`readlink -f $condadir` -srcdir=`dirname $condadir` -docker build -t tvm-cuda100-forge $condadir -f $condadir/Dockerfile.cuda100 -docker run --rm -v $srcdir:/workspace tvm-cuda100-forge -docker build -t tvm-cuda92-forge $condadir -f $condadir/Dockerfile.cuda92 -docker run --rm -v $srcdir:/workspace tvm-cuda92-forge -sudo chown -R `whoami` $condadir/pkg +packages: + conda build tvm-libs + conda build tvm + conda build topi + conda build nnvm diff --git a/conda/build_cuda.py b/conda/build_cuda.py new file mode 100644 index 000000000000..47af6ce4564e --- /dev/null +++ b/conda/build_cuda.py @@ -0,0 +1,76 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import sys +import subprocess + +from jinja2 import Template + +CUDA_VERSIONS = ['10.0', '9.0'] + + +# Make sure that the cudnn version you set here is available +# for all the cuda versions that you want both from nvidia +# and from conda. 
+ +# These two must be in sync +CUDNN_FULL_VERSION = '7.3.1.20' +CUDNN_VERSION = '7.3.1' + + +condadir = os.path.dirname(sys.argv[0]) +condadir = os.path.abspath(condadir) +srcdir = os.path.dirname(condadir) + + +with open(os.path.join(condadir, 'Dockerfile.template')) as f: + docker_template = Template(f.read()) + + +def render_dockerfile(version): + txt = docker_template.render(cuda_version=version, + cudnn_short_version=CUDNN_VERSION, + cudnn_version=CUDNN_FULL_VERSION) + fname = os.path.join(condadir, + 'Dockerfile.cuda' + version.replace('.', '')) + with open(fname, 'w') as f: + f.write(txt) + return fname + + +def build_docker(version): + vv = version.replace('.', '') + fname = render_dockerfile(version) + tagname = f'tvm-cuda{ vv }-forge' + subprocess.run(['docker', 'build', '-t', tagname, + condadir, '-f', fname], check=True) + return tagname + + +def build_pkg(version): + tagname = build_docker(version) + subprocess.run(['docker', 'run', '--rm', '-v', f'{ srcdir }:/workspace', + tagname], check=True) + + +if __name__ == '__main__': + build_versions = CUDA_VERSIONS + if len(sys.argv) > 1: + build_versions = sys.argv[1:] + for version in build_versions: + build_pkg(version) diff --git a/conda/nnvm/meta.yaml b/conda/nnvm/meta.yaml index 883655f335cb..d948484a61e5 100644 --- a/conda/nnvm/meta.yaml +++ b/conda/nnvm/meta.yaml @@ -25,7 +25,7 @@ source: path: ../.. build: - number: 0 + number: 1 skip: True # [win] requirements: diff --git a/conda/topi/meta.yaml b/conda/topi/meta.yaml index bbba452a6422..f4bc8950d4c4 100644 --- a/conda/topi/meta.yaml +++ b/conda/topi/meta.yaml @@ -25,7 +25,7 @@ source: path: ../.. build: - number: 0 + number: 1 requirements: host: diff --git a/conda/tvm-libs/build.sh b/conda/tvm-libs/build.sh index d4cf2578b570..e0b85910475e 100644 --- a/conda/tvm-libs/build.sh +++ b/conda/tvm-libs/build.sh @@ -16,42 +16,25 @@ # specific language governing permissions and limitations # under the License. 
-# Fix for OSX build to hide the clang LLVM -rm -f ${BUILD_PREFIX}/bin/llvm-config -rm -rf ${BUILD_PREFIX}/lib/cmake - set -e -if [ -z "$PREFIX" ]; then - PREFIX="$CONDA_PREFIX" -fi - -if [ -z "$cuda" ] || [ "$cuda" == "False" ]; then - CUDA_OPT="" +if [ "$cuda" == "True" ]; then + CUDA_OPT="-DUSE_CUDA=ON -DUSE_CUBLAS=ON -DUSE_CUDNN=ON" else - CUDA_OPT="-DUSE_CUDA=ON -DUSE_CUBLAS=ON" + CUDA_OPT="" fi if [ "$target_platform" == "osx-64" ]; then # macOS 64 bits METAL_OPT="" # Conda can only target 10.9 for now - TOOLCHAIN_OPT="" else METAL_OPT="" - if [ "$target_platform" == "linux-64" ]; then - # Linux 64 bits - TOOLCHAIN_OPT="-DCMAKE_TOOLCHAIN_FILE=${RECIPE_DIR}/../cross-linux.cmake" - else - # Windows (or 32 bits, which we don't support) - METAL_OPT="" - TOOLCHAIN_OPT="" - fi fi rm -rf build || true mkdir -p build cd build -cmake $METAL_OPT $CUDA_OPT -DUSE_LLVM=ON -DINSTALL_DEV=ON -DCMAKE_INSTALL_PREFIX="$PREFIX" $TOOLCHAIN_OPT .. +cmake $METAL_OPT $CUDA_OPT -DUSE_LLVM=$PREFIX/bin/llvm-config -DINSTALL_DEV=ON -DCMAKE_INSTALL_PREFIX="$PREFIX" .. make -j${CPU_COUNT} VERBOSE=1 make install cd .. diff --git a/conda/tvm-libs/meta.yaml b/conda/tvm-libs/meta.yaml index 5126f5b30359..aad8f251c2a6 100644 --- a/conda/tvm-libs/meta.yaml +++ b/conda/tvm-libs/meta.yaml @@ -25,7 +25,7 @@ source: path: ../.. build: - number: 0 + number: 1 string: cuda{{ cuda_version }}_{{ PKG_BUILDNUM }} # [cuda] requirements: @@ -39,6 +39,7 @@ requirements: - zlib # [linux] run: - {{ pin_compatible('cudatoolkit', lower_bound=cuda_version, max_pin='x.x') }} # [cuda] + - {{ pin_compatible('cudnn', lower_bound='7.3.1', max_pin='x') }} # [cuda] about: home: https://github.com/dmlc/tvm diff --git a/conda/tvm/meta.yaml b/conda/tvm/meta.yaml index 693237ce07c0..221dc7950f75 100644 --- a/conda/tvm/meta.yaml +++ b/conda/tvm/meta.yaml @@ -25,7 +25,7 @@ source: path: ../.. 
build: - number: 0 + number: 1 requirements: build: diff --git a/dmlc_tvm_commit_id.txt b/dmlc_tvm_commit_id.txt index 9eb98e3754f0..ceb215ac275d 100644 --- a/dmlc_tvm_commit_id.txt +++ b/dmlc_tvm_commit_id.txt @@ -1 +1 @@ -25c91d34c4de744cc9428944ccb1e84a72476ce5 +cbec5b94b87455f07918f7f4488c9a82a2d26708 diff --git a/vta/apps/tsim_example/cmake/modules/driver.cmake b/docker/Dockerfile.ci_jekyll similarity index 76% rename from vta/apps/tsim_example/cmake/modules/driver.cmake rename to docker/Dockerfile.ci_jekyll index c4c80637918f..5d3cf86dd6f5 100644 --- a/vta/apps/tsim_example/cmake/modules/driver.cmake +++ b/docker/Dockerfile.ci_jekyll @@ -15,10 +15,10 @@ # specific language governing permissions and limitations # under the License. -file(GLOB TSIM_SW_SRC src/driver.cc) -add_library(driver SHARED ${TSIM_SW_SRC}) -target_include_directories(driver PRIVATE ${VTA_DIR}/include) +# CI docker Jekyll env for building website +# tag: v0.50 +FROM ubuntu:16.04 -if(APPLE) - set_target_properties(driver PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") -endif(APPLE) +RUN apt-get update && apt-get install -y sudo wget +RUN apt-get update && apt-get install -y ruby-full build-essential zlib1g-dev +RUN gem install jekyll bundler diff --git a/docker/install/ubuntu_install_nodejs.sh b/docker/install/ubuntu_install_nodejs.sh index 681f1b4a4641..8da9e2485797 100755 --- a/docker/install/ubuntu_install_nodejs.sh +++ b/docker/install/ubuntu_install_nodejs.sh @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -25,7 +25,7 @@ apt-get install -y curl # The node install script fetched and executed here will update the # apt source list, hence the second apt-get update is necessary. -curl -s -S -L https://deb.nodesource.com/setup_6.x | bash - +curl -s -S -L https://deb.nodesource.com/setup_8.x | bash - apt-get update apt-get install -y nodejs diff --git a/docker/install/ubuntu_install_onnx.sh b/docker/install/ubuntu_install_onnx.sh index ec5b7f3b4964..a073389472b2 100755 --- a/docker/install/ubuntu_install_onnx.sh +++ b/docker/install/ubuntu_install_onnx.sh @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -21,7 +21,7 @@ set -u set -o pipefail # fix to certain version for now -pip3 install onnx>=1.1.0 +pip3 install onnx>=1.4.1 pip3 install https://download.pytorch.org/whl/cu80/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl pip3 install torchvision diff --git a/docker/install/ubuntu_install_tflite.sh b/docker/install/ubuntu_install_tflite.sh index d70f9890053d..802fb3b87d8c 100755 --- a/docker/install/ubuntu_install_tflite.sh +++ b/docker/install/ubuntu_install_tflite.sh @@ -35,7 +35,7 @@ pip2 install flatbuffers # Setup tflite from schema mkdir tflite cd tflite -wget -q https://raw.githubusercontent.com/tensorflow/tensorflow/r1.12/tensorflow/contrib/lite/schema/schema.fbs +wget -q 
https://raw.githubusercontent.com/tensorflow/tensorflow/r1.13/tensorflow/lite/schema/schema.fbs flatc --python schema.fbs cat <setup.py diff --git a/docs/_static/img/tvm-logo-square.png b/docs/_static/img/tvm-logo-square.png new file mode 100644 index 000000000000..37822d1a2d22 Binary files /dev/null and b/docs/_static/img/tvm-logo-square.png differ diff --git a/docs/api/python/dev.rst b/docs/api/python/dev.rst index e4b207bf4cbc..7bb938ca7517 100644 --- a/docs/api/python/dev.rst +++ b/docs/api/python/dev.rst @@ -61,6 +61,7 @@ tvm.ir_pass tvm.ir_pass.CanonicalSimplify tvm.ir_pass.StorageFlatten tvm.ir_pass.VectorizeLoop + tvm.ir_pass.SkipVectorize tvm.ir_pass.UnrollLoop tvm.ir_pass.ThreadSync tvm.ir_pass.StorageRewrite diff --git a/docs/api/python/relay/build_module.rst b/docs/api/python/relay/build_module.rst index 28dadea21e78..26164bf1ade9 100644 --- a/docs/api/python/relay/build_module.rst +++ b/docs/api/python/relay/build_module.rst @@ -22,17 +22,9 @@ tvm.relay.build_module .. autofunction:: tvm.relay.build_module.build -.. autofunction:: tvm.relay.build_module.build_config - .. autofunction:: tvm.relay.build_module.optimize .. autofunction:: tvm.relay.build_module.create_executor -.. autoclass:: tvm.relay.build_module.BuildConfig - :members: - -.. autofunction:: tvm.relay.build_module.build_config - :members: - .. autoclass:: tvm.relay.build_module.GraphExecutor :members: diff --git a/docs/api/python/relay/transform.rst b/docs/api/python/relay/transform.rst new file mode 100644 index 000000000000..4eb7f9d8fea7 --- /dev/null +++ b/docs/api/python/relay/transform.rst @@ -0,0 +1,45 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +tvm.relay.transform +---------------------- + +.. automodule:: tvm.relay.transform + +.. autofunction:: tvm.relay.transform.build_config + +.. autofunction:: tvm.relay.transform.module_pass + +.. autofunction:: tvm.relay.transform.function_pass + +.. autoclass:: tvm.relay.transform.Pass + :members: + +.. autoclass:: tvm.relay.transform.PassInfo + :members: + +.. autoclass:: tvm.relay.transform.PassContext + :members: + +.. autoclass:: tvm.relay.transform.ModulePass + :members: + +.. autoclass:: tvm.relay.transform.FunctionPass + :members: + +.. autoclass:: tvm.relay.transform.Sequential + :members: diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst index eaa5dacd678e..ade0f1a5b390 100644 --- a/docs/api/python/topi.rst +++ b/docs/api/python/topi.rst @@ -88,6 +88,7 @@ List of operators topi.not_equal topi.greater_equal topi.less_equal + topi.all topi.logical_and topi.logical_or topi.logical_not @@ -98,6 +99,8 @@ List of operators topi.shape topi.layout_transform topi.image.resize + topi.argsort + topi.topk List of schedules @@ -140,6 +143,7 @@ topi .. autofunction:: topi.gather_nd .. autofunction:: topi.full .. autofunction:: topi.full_like +.. autofunction:: topi.all .. autofunction:: topi.max .. autofunction:: topi.sum .. autofunction:: topi.min @@ -161,6 +165,8 @@ topi .. autofunction:: topi.tile .. autofunction:: topi.shape .. autofunction:: topi.layout_transform +.. autofunction:: topi.argsort +.. 
autofunction:: topi.topk topi.nn ~~~~~~~ diff --git a/docs/conf.py b/docs/conf.py index e458b5549207..a1b66325a527 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -165,6 +165,8 @@ html_logo = "_static/img/tvm-logo-small.png" +html_favicon = "_static/img/tvm-logo-square.png" + # Output file base name for HTML help builder. htmlhelp_basename = project + 'doc' diff --git a/docs/dev/codebase_walkthrough.rst b/docs/dev/codebase_walkthrough.rst index 788f1f8b50a3..e6df9becdc9e 100644 --- a/docs/dev/codebase_walkthrough.rst +++ b/docs/dev/codebase_walkthrough.rst @@ -33,9 +33,9 @@ At the root of the TVM repository, we have following subdirectories that togethe - ``topi`` - Compute definitions and backend schedules for standard neural network operators. - ``nnvm`` - C++ code and Python frontend for graph optimization and compilation. After the introduction of Relay, it remains in the codebase for backward compatibility. -Using standard Deep Learning terminologies, ``src/relay`` is the component that manages a computational graph, and nodes in a graph are compiled and executed using infrastructures implemented in the rest of ``src``. ``python`` provides python bindings for the C++ API and driver code that users can use to execute compilation. Operators corresponding to each node are registered in ``src/relay/op``. Implementations for operators are in ``topi``, and they are coded in either C++ or Python. +Using standard Deep Learning terminology, ``src/relay`` is the component that manages a computational graph, and nodes in a graph are compiled and executed using infrastructure implemented in the rest of ``src``. ``python`` provides python bindings for the C++ API and driver code that users can use to execute compilation. Operators corresponding to each node are registered in ``src/relay/op``. Implementations of operators are in ``topi``, and they are coded in either C++ or Python. -Relay is the new IR for deep networks that is intended to replace NNVM. 
If you have used NNVM, Relay provides equivalent or better functionalities. In fact, Relay goes beyond a traditional way of thinking deep networks in terms of computational graphs. But for the purpose of this document, we can think of Relay as a traditional computational graph framework. You can read more about Relay `here `_. +Relay is the new IR for deep networks that is intended to replace NNVM. If you have used NNVM, Relay provides equivalent or better functionality. In fact, Relay goes beyond a traditional way of thinking deep networks in terms of computational graphs. But for the purpose of this document, we can think of Relay as a traditional computational graph framework. You can read more about Relay `here `_. When a user invokes graph compilation by ``relay.build(...)`` (or ``nnvm.compiler.build(...)`` for the older API), the following sequence of actions happens for each node in the graph: @@ -43,7 +43,7 @@ When a user invokes graph compilation by ``relay.build(...)`` (or ``nnvm.compile - Generate a compute expression and a schedule for the operator - Compile the operator into object code -One of the interesting aspects of TVM codebase is that interoperability between C++ and Python is not unidirectional. Typically, all code that do heavy liftings are implemented in C++, and Python bindings are provided for user interface. This is also true in TVM, but in TVM codebase, C++ code also call into functions defined in a Python module. For example, the convolution operator is implemented in Python, and its implementation is invoked from C++ code in Relay. +One of the interesting aspects of the TVM codebase is that interoperability between C++ and Python is not unidirectional. Typically, all code that performs heavy lifting is implemented in C++, and Python bindings are provided for the user interface. This is also true in TVM, but in the TVM codebase, C++ code can also call into functions defined in a Python module. 
For example, the convolution operator is implemented in Python, and its implementation is invoked from C++ code in Relay. ******************************************* Vector Add Example @@ -84,7 +84,7 @@ The Node system is the basis of exposing C++ types to frontend languages, includ args[4]); }); -We use ``TVM_REGISTER_*`` macro to expose C++ functions to frontend languages, in the form of `PackedFunc `_. ``PackedFunc`` is another mechanism by which TVM implements interoperability between C++ and Python. In particular, this is what makes calling Python functions from the C++ codebase very easy. +We use the ``TVM_REGISTER_*`` macro to expose C++ functions to frontend languages, in the form of a `PackedFunc `_. A ``PackedFunc`` is another mechanism by which TVM implements interoperability between C++ and Python. In particular, this is what makes calling Python functions from the C++ codebase very easy. A ``Tensor`` object has an ``Operation`` object associated with it, defined in ``python/tvm/tensor.py``, ``include/tvm/operation.h``, and ``src/tvm/op`` subdirectory. A ``Tensor`` is an output of its ``Operation`` object. Each ``Operation`` object has in turn ``input_tensors()`` method, which returns a list of input ``Tensor`` to it. This way we can keep track of dependencies between ``Operation``. @@ -141,7 +141,7 @@ Bound inference is the process where all loop bounds and sizes of intermediate b .. _InferBound Pass: http://docs.tvm.ai/dev/inferbound.html -``stmt``, which is the output of ``ScheduleOps()``, represents an initial loop nest structure. If you have applied ``reorder`` or ``split`` primitives to your schedule, then the initial loop nest already reflects that changes. ``ScheduleOps()`` is defined in ``src/schedule/schedule_ops.cc``. +``stmt``, which is the output of ``ScheduleOps()``, represents an initial loop nest structure. If you have applied ``reorder`` or ``split`` primitives to your schedule, then the initial loop nest already reflects those changes. 
``ScheduleOps()`` is defined in ``src/schedule/schedule_ops.cc``. Next, we apply a number of lowering passes to ``stmt``. These passes are implemented in ``src/pass`` subdirectory. For example, if you have applied ``vectorize`` or ``unroll`` primitives to your schedule, they are applied in loop vectorization and unrolling passes below. @@ -173,7 +173,7 @@ Code generation is done by ``build_module()`` function, defined in ``python/tvm/ } -``Build()`` function looks up the code generator for the given target in the ``PackedFunc`` registry, and invokes the function found. For example, ``codegen.build_cuda`` function is registered in ``src/codegen/build_cuda_on.cc``, like this: +The ``Build()`` function looks up the code generator for the given target in the ``PackedFunc`` registry, and invokes the function found. For example, ``codegen.build_cuda`` function is registered in ``src/codegen/build_cuda_on.cc``, like this: :: @@ -182,9 +182,9 @@ Code generation is done by ``build_module()`` function, defined in ``python/tvm/ *rv = BuildCUDA(args[0]); }); -``BuildCUDA()`` above generates CUDA kernel source from the lowered IR using ``CodeGenCUDA`` class defined in ``src/codegen/codegen_cuda.cc``, and compile the kernel using NVRTC. If you target a backend that uses LLVM, which includes x86, ARM, NVPTX and AMDGPU, code generation is done primarily by ``CodeGenLLVM`` class defined in ``src/codegen/llvm/codegen_llvm.cc``. ``CodeGenLLVM`` translates TVM IR into LLVM IR, runs a number of LLVM optimization passes, and generates target machine code. +The ``BuildCUDA()`` above generates CUDA kernel source from the lowered IR using ``CodeGenCUDA`` class defined in ``src/codegen/codegen_cuda.cc``, and compile the kernel using NVRTC. If you target a backend that uses LLVM, which includes x86, ARM, NVPTX and AMDGPU, code generation is done primarily by ``CodeGenLLVM`` class defined in ``src/codegen/llvm/codegen_llvm.cc``. 
``CodeGenLLVM`` translates TVM IR into LLVM IR, runs a number of LLVM optimization passes, and generates target machine code. -``Build()`` function in ``src/codegen/codegen.cc`` returns a ``runtime::Module`` object, defined in ``include/tvm/runtime/module.h`` and ``src/runtime/module.cc``. A ``Module`` object is a container for the underlying target specific ``ModuleNode`` object. Each backend implements a subclass of ``ModuleNode`` to add target specific runtime API calls. For example, the CUDA backend implements ``CUDAModuleNode`` class in ``src/runtime/cuda/cuda_module.cc``, which manages CUDA driver API. ``BuildCUDA()`` function above wraps ``CUDAModuleNode`` with ``runtime::Module`` and return it to the Python side. The LLVM backend implements ``LLVMModuleNode`` in ``src/codegen/llvm/llvm_module.cc``, which handles JIT execution of compiled code. Other subclasses of ``ModuleNode`` can be found under subdirectories of ``src/runtime`` corresponding to each backend. +The ``Build()`` function in ``src/codegen/codegen.cc`` returns a ``runtime::Module`` object, defined in ``include/tvm/runtime/module.h`` and ``src/runtime/module.cc``. A ``Module`` object is a container for the underlying target specific ``ModuleNode`` object. Each backend implements a subclass of ``ModuleNode`` to add target specific runtime API calls. For example, the CUDA backend implements ``CUDAModuleNode`` class in ``src/runtime/cuda/cuda_module.cc``, which manages the CUDA driver API. The ``BuildCUDA()`` function above wraps ``CUDAModuleNode`` with ``runtime::Module`` and return it to the Python side. The LLVM backend implements ``LLVMModuleNode`` in ``src/codegen/llvm/llvm_module.cc``, which handles JIT execution of compiled code. Other subclasses of ``ModuleNode`` can be found under subdirectories of ``src/runtime`` corresponding to each backend. The returned module, which can be thought of as a combination of a compiled function and a device API, can be invoked on TVM's NDArray objects. 
@@ -243,4 +243,4 @@ The ``PackedFunc``'s overloaded ``operator()`` will be called, which in turn cal } }; -This concludes an overview of how TVM compiles and executes a function. Although we did not detail TOPI or Relay, at the end all neural network operators go through the same compilation process as above. You are encouraged to dive into the details of the rest of the codebase. +This concludes an overview of how TVM compiles and executes a function. Although we did not detail TOPI or Relay, in the end, all neural network operators go through the same compilation process as above. You are encouraged to dive into the details of the rest of the codebase. diff --git a/docs/dev/virtual_machine.rst b/docs/dev/virtual_machine.rst new file mode 100644 index 000000000000..a59620a0a861 --- /dev/null +++ b/docs/dev/virtual_machine.rst @@ -0,0 +1,314 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +Putting the VM in TVM: The Relay Virtual Machine +================================================ + +Relay, a new program representation, has enabled the representation and optimization of +a great breadth of machine learning programs. 
+Unfortunately, by supporting a more expressive set of programs, we have +introduced several new execution challenges. + +Relay's interpreter can execute the full language but has notable limitations +that make it unsuited for production deployments. It is structured as an inefficient +interpreter that performs AST traversal to execute the program. This approach is conceptually +simple but inefficient, as the AST traversal heavily relies on indirection. + +There are further challenges in compiling dynamic code, such as dynamic scheduling and allocation, +fully dynamic tensor shapes, and control flow. The interpreter offers simple solutions +for these, but none is sufficiently compelling or optimized. + +The second execution mechanism is the existing graph runtime. In order to target Relay +programs to this, we compile a small subset of them to the old graph format and execute +them on the runtime. Graph runtime provides a fast execution experience but only for a very limited +subset of Relay programs. + +An alternative but not-standard approach is Relay's ahead-of-time compiler, +which compiles a Relay program into a shared library containing an ahead- +of-time implementation. The ahead-of-time compiler provides compelling performance +but is difficult to extend and instrument, which can only be done by modifying the +code generation and optimization mechanisms. + +The Relay virtual machine is intended to be a framework that balances these competing +approaches, providing a dynamic execution environment which can be extended, instrumented, +and integrated with other approaches like ahead-of-time compilation via a flexible extension +mechanism. + +The virtual machine is designed to strike a balance between performance and flexibility +when deploying and executing Relay programs, without giving up the benefits of TVM. 
+ +Virtual machine (VM) design is a well-studied area in programming languages and systems, +and there have been various virtual machine designs for both full-fledged +and embedded programing languages. +Previous language VM designs have been heavily tailored to the execution profile of traditional programs. +Traditional programs manipulate small scalar values and consist of a large number of low-level instructions. +The sheer quantity of instructions requires instruction execution and dispatch to be extremely efficient. +In the context of machine learning we manipulate primarily tensor values, using a (relatively) +low number of high level instructions. ML programs' cost centers are expensive operator invocations, +such as GEMM or convolution, over a large input. Due to the execution profile exhibited by ML programs, +micro-optimizations present in scalar VMs are dramatically less important. + +TVM has provided strong support for vision models, +but we want to grow to support a wider variety of models. +The graph runtime is able to utilize the fully static nature of the input graphs to perform +aggressive optimization such as fully static allocation, and optimal memory reuse. +When we introduce models which make use of control flow, recursion, dynamic shapes, and dynamic +allocation, we must change how execution works. A virtual machine for Relay is a natural choice. + +The rest of this document provides a high-level overview of the Relay +virtual machine design and its instruction set. + +Design +------ + +The VM's design is focused on simplicity without sacrificing performance. +In order to accomplish this we have focused on designing a tensor VM rather than a scalar VM. + +In the tensor VM setting, we optimize for cheap “allocation” of objects (by trying to avoid real allocation), +reuse of static fragments, and the ability to do dynamic shape (i.e jagged tensors). 
+ +Instruction Set +~~~~~~~~~~~~~~~ + +The choices of an instruction set and instruction representation are the most critical design decisions for a VM. +The current representation of the instructions is a tagged union containing the op-code and the data payload. An important design decision is the level of abstraction of the instructions (RISC vs. CISC) and how they take their data (fixed-width instruction encoding vs. variable-length encoding). The current version is closer to CISC, with complex instructions like AllocTensor, and is variable-length due to the inclusion of the shape as part of the instruction. The current instruction set is very high-level and corresponds roughly to high-level operations in Relay. + +Ret +^^^ +**Arguments**: +:: + RegName dst + RegName result + +Returns the object in register `result` to caller's register `dst`. + +InvokePacked +^^^^^^^^^^^^ +**Arguments**: +:: + size_t packed_index + size_t arity + size_t output_size + RegName* packed_args + +Invoke the packed function denoted by `packed_index`. The `arity` +and `output_size` are used to inform the VM how many inputs and +outputs to expect. `packed_args` stores the list of argument registers. + +AllocTensor +^^^^^^^^^^^ +**Arguments**: +:: + RegName dst + RegName shape_register + size_t ndim + DLDataType dtype + +Allocate a tensor value of the appropriate shape (stored in `shape_register`) and `dtype`. The result +is saved to register `dst`. + +AllocDatatype +^^^^^^^^^^^^^ +**Arguments**: +:: + RegName dst + size_t tag + size_t num_fields + RegName* datatype_fields + +Allocate a data type with the tag `tag` using the `num_fields` entries +from registers `datatype_fields`. The result is saved to register `dst`. + +AllocClosure +^^^^^^^^^^^^ +**Arguments**: +:: + RegName dst + size_t clo_index + size_t num_freevar + RegName* free_vars; + +Allocate a closure with the VMFunction at `clo_index` as +its code, and the `num_freevar` entries from registers in +`free_vars`. 
The result is saved to register `dst`. + +GetField +^^^^^^^^ +**Arguments**: +:: + RegName dst + RegName object + size_t field_index + +Get the field value with index `field_index` from `object`. And saves the result to register `dst`. + +If +^^ +**Arguments**: +:: + RegName if_cond + size_t true_offset + size_t false_offset + +Check if the object at register `if_cond` is `true` or `false`. +If `true`, relative jump by `true_offset`, else relative +jump by `false_offset`. + +Goto +^^^^ +**Arguments**: +:: + size_t pc_offset + +Relative unconditional jump by `pc_offset`. + +Invoke +^^^^^^ +**Arguments**: +:: + size_t func_index + +Invoke function at `func_index`, consumes the number of arguments contained in the VMFunction's +arity field. + +InvokeClosure +^^^^^^^^^^^^^ +**Arguments**: +:: + RegName closure + size_t closure_args_num + RegName* closure_args + +Invokes `closure`, consuming the number of arguments declared in the closure's VMFunction. + +LoadConst +^^^^^^^^^ +**Arguments**: +:: + RegName dst + size_t const_index + +Load the constant at `const_index` from the constant pool. The result is saved to register `dst`. + +Object Representation +~~~~~~~~~~~~~~~~~~~~~ +We use a simple object representation that uses shared pointers and tagging. +There is a huge space of possible object representations trade-offs, but we +believe micro-optimizing this code has little to no effect on the end-to-end performance. + +:: + + struct ObjectCell { + ObjectTag tag; + ... + }; + + struct Object { + std::shared_ptr ptr; + ... + } + +See `include/tvm/runtime/vm.h` for more details. + +Currently, we support 3 types of objects: tensors, data types, and closures. 
+ +:: + + VMObject VMTensor(const tvm::runtime::NDArray& data); + VMObject VMDatatype(size_t tag, const std::vector& fields); + VMObject VMClosure(size_t func_index, std::vector free_vars); + + +Stack and State +~~~~~~~~~~~~~~~ + +The Relay VM maintains a stack frame, which contains information about how to resume the +previous call. Registers are allocated in a continuous space (virtual register file) for each function. + +We keep track of a set of Relay functions we have called, a pointer into its bytecode, an offset into the byte code (known as the program counter). + +:: + + struct VirtualMachine { + ... + std::vector frames; + ... + // Current function. + size_t func_index; + // Pointer into the current function's instructions. + const Instruction* code; + // Current program counter relative to the code pointer. + size_t pc; + ... + }; + + +Dispatch Loop +~~~~~~~~~~~~~ +A critical piece of a VM is the dispatch loop. The dispatch loop usually dominates the execution time of a +virtual machine, but we have experimentally found this not to be the case for Relay. We have just implemented +a simple `switch`/`goto` dispatch loop which dispatches based on instruction op code. + +This loop is implemented by `VirtualMachine::Run()`. + +VM Compiler +~~~~~~~~~~~ + +An important part of this infrastructure is a compiler from Relay's full IR into a sequence of bytecode. +The VM compiler transforms a `tvm::relay::Module` into a `tvm::relay::vm::VirtualMachine`. The virtual +machine contains a set of compiled functions, the compiled functions are contained in `tvm::relay::vm::Function`. The functions contain metadata about the the function as well as its compiled bytecode. For full definitions of the data structures see `vm.h`. + +Optimizations +~~~~~~~~~~~~~ + +There are quite a few optimizations required by the VM compiler. + +We have implemented them in the old pass style, but plan to port them to +the new pass manager (#2546) before merging. 
+ +Optimizations marked with `TODO` are not implemented yet. + +- A-Normal Form +- Lambda Lift (see `src/relay/vm/lambda_lift.cc`) +- Inline Primitives (see `src/relay/vm/inline_primitives.cc`) +- Inliner (see `src/relay/pass/inliner.cc`) +- Constant Pool Layout (see `src/relay/backend/vm/compiler.cc`) +- ADT Tag Allocation (see `src/relay/backend/vm/compiler.cc`) +- Tail Call Optimization (TODO) +- Liveness Analysis (TODO) + +Serialization +~~~~~~~~~~~~~ + +A final and yet-to-be-implemented part of the VM design is serialization. The accompanying PR will introduce both the bytecode and its serialization, as well as VM-level serialization. The design premise is that a VM can be efficiently stored to disk and resumed at a later time. This would also allow us to efficiently schedule many models on to a single machine in order to obtain good utilization. + +Unresolved Questions +~~~~~~~~~~~~~~~~~~~~ + +How do we handle dynamic shapes? +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +TODO + +How can we modify the VM to support JIT compilation of certain code paths? +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In the code generation space there are still many tradeoffs to be analyzed and the VM is designed +to be very flexible so we can modify it for future experiments. + +How do we support heterogenous execution? +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Heterogenous execution should work out of the box assuming we have annotated the appropriate device copies. +In order to do this properly we need to run the device annotation and copying passes. diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst index 3a769dee2dce..1ea8f3478341 100644 --- a/docs/install/from_source.rst +++ b/docs/install/from_source.rst @@ -192,6 +192,12 @@ Python dependencies .. code:: bash pip install --user tornado psutil xgboost + + * If you want to parse Relay text format progams, you must use Python 3 and run the following + + .. 
code:: bash + + pip install --user mypy orderedset antlr4-python3-runtime Install Contrib Libraries diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index cd5677293571..28ee99e77981 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -135,6 +135,7 @@ This level enables additional math and transform operators. tvm.relay.greater_equal tvm.relay.less tvm.relay.less_equal + tvm.relay.all tvm.relay.logical_and tvm.relay.logical_or tvm.relay.logical_not @@ -171,6 +172,7 @@ This level enables additional math and transform operators. :nosignatures: tvm.relay.argsort + tvm.relay.topk **Level 10: Temporary Operators** @@ -277,6 +279,7 @@ Level 4 Definitions .. autofunction:: tvm.relay.greater_equal .. autofunction:: tvm.relay.less .. autofunction:: tvm.relay.less_equal +.. autofunction:: tvm.relay.all .. autofunction:: tvm.relay.logical_and .. autofunction:: tvm.relay.logical_or .. autofunction:: tvm.relay.logical_not @@ -307,6 +310,7 @@ Level 5 Definitions Level 6 Definitions ------------------- .. autofunction:: tvm.relay.argsort +.. autofunction:: tvm.relay.topk Level 10 Definitions diff --git a/docs/vta/install.md b/docs/vta/install.md index 233bb5ca0260..6c87b4edd288 100644 --- a/docs/vta/install.md +++ b/docs/vta/install.md @@ -84,7 +84,7 @@ This guide covers the following themes: Setup your Pynq board based on the [Pynq board getting started tutorial](http://pynq.readthedocs.io/en/latest/getting_started.html). You should follow the instructions up to and including the *Turning On the PYNQ-Z1* step (no need to pursue the tutorial beyond this point). -* Make sure that you've downloaded the latest Pynq image, [PYNQ-Z1 v2.3](http://www.pynq.io/board.html) (released October 3rd 2018), and have imaged your SD card with it (we recommend the free [Etcher](https://etcher.io/) program). 
+* Make sure that you've downloaded the latest Pynq image, [PYNQ-Z1 v2.4](http://www.pynq.io/board.html)(released February 22rd 2019), and have imaged your SD card with it (we recommend the free [Etcher](https://etcher.io/) program). * For this test setup, follow the ["Connect to a Computer"](http://pynq.readthedocs.io/en/latest/getting_started.html#connect-to-a-computer) Ethernet setup instructions. To be able to talk to the board, make sure to [assign your computer a static IP address](http://pynq.readthedocs.io/en/latest/appendix.html#assign-your-computer-a-static-ip) Once the board is powered on and connected to your development machine, try connecting to it to make sure you've properly set up your Pynq board: @@ -208,7 +208,7 @@ chmod u+x Xilinx_Vivado_SDK_Web_2018.2_0614_1954_Lin64.bin #### Xilinx Vivado GUI Installer Steps -At this point you've launched the Vivado 2017.1 Installer GUI program. +At this point you've launched the Vivado 2018.2 Installer GUI program. 1. Click “Next” on the *Welcome* screen. 2. On the *Select Install Type* screen, enter your Xilinx user credentials under the “User Authentication” box and select the “Download and Install Now” option before clicking “Next” . diff --git a/include/tvm/arithmetic.h b/include/tvm/arithmetic.h index 9a8d9d372956..c506268cb14b 100644 --- a/include/tvm/arithmetic.h +++ b/include/tvm/arithmetic.h @@ -48,11 +48,7 @@ namespace arith { // Forward declare Analyzer class Analyzer; -/*! - * \brief reference class to ConstIntBoundNode - * \sa ConstIntBoundNode - */ -class ConstIntBound; + /*! * \brief Constant integer up and lower bound(inclusive). * Useful for value bound analysis. @@ -69,8 +65,6 @@ class ConstIntBoundNode : public Node { v->Visit("max_value", &max_value); } - TVM_DLL static ConstIntBound make(int64_t min_value, int64_t max_value); - /*! \brief Number to represent +inf */ static const constexpr int64_t kPosInf = std::numeric_limits::max(); /*! 
@@ -83,7 +77,23 @@ class ConstIntBoundNode : public Node { TVM_DECLARE_NODE_TYPE_INFO(ConstIntBoundNode, Node); }; -TVM_DEFINE_NODE_REF(ConstIntBound, ConstIntBoundNode); +/*! + * \brief reference class to ConstIntBoundNode + * \sa ConstIntBoundNode + */ +class ConstIntBound : public NodeRef { + public: + /*! + * \brief constructor by fields. + * \param min_value The mininum value. + * \param max_value The maximum value. + */ + TVM_DLL ConstIntBound(int64_t min_value, int64_t max_value); + + static const constexpr int64_t kPosInf = ConstIntBoundNode::kPosInf; + static const constexpr int64_t kNegInf = ConstIntBoundNode::kNegInf; + TVM_DEFINE_NODE_REF_METHODS(ConstIntBound, NodeRef, ConstIntBoundNode); +}; /*! * \brief Analyzer to get constant integer bound over expression. @@ -133,11 +143,6 @@ class ConstIntBoundAnalyzer { Impl* impl_; }; -/*! - * \brief reference of ModularSetNode - * \sa ModularSetNode - */ -class ModularSet; /*! * \brief Range of a linear integer function. * Use to do specify the possible index values. @@ -162,13 +167,20 @@ class ModularSetNode : public Node { v->Visit("base", &base); } - TVM_DLL static ModularSet make(int64_t coeff, int64_t base); - static constexpr const char* _type_key = "arith.ModularSet"; TVM_DECLARE_NODE_TYPE_INFO(ModularSetNode, Node); }; -TVM_DEFINE_NODE_REF(ModularSet, ModularSetNode); +/*! + * \brief reference of ModularSetNode + * \sa ModularSetNode + */ +class ModularSet : public NodeRef { + public: + TVM_DLL ModularSet(int64_t coeff, int64_t base); + + TVM_DEFINE_NODE_REF_METHODS(ModularSet, NodeRef, ModularSetNode); +}; /*! * \brief Analyzer to get modular information over expression. @@ -278,14 +290,14 @@ class CanonicalSimplifier { }; /*! - * \brief A RAII constraint context. + * \brief Constraint context. 
* * \code * * Var("x"); * arith::Analyzer analyzer; * { - * arith::ConstraintContext cctx(&analyzer, x % 3 == 0); + * With scope(&analyzer, x % 3 == 0); * CHECK_EQ(analyzer.modular_set(x)->coeff, 3); * } * // constraint no longer in effect. @@ -294,88 +306,36 @@ class CanonicalSimplifier { * \endcode */ class ConstraintContext { - public: + private: + // declare friend to enable with. + friend class With; /*! * \brief Construct a constraint context. * \param analyzer The analyzer. * \param constraint The constraint to be applied. */ - ConstraintContext(Analyzer* analyzer, const Expr& constraint) DMLC_THROW_EXCEPTION; - /*! \brief destructor */ - ~ConstraintContext() DMLC_THROW_EXCEPTION { - exit_(); - } - - private: + ConstraintContext(Analyzer* analyzer, Expr constraint) + : analyzer_(analyzer), constraint_(constraint) {} + // enter the scope. + void EnterWithScope(); + // exit the scope. + void ExitWithScope(); + /*! \brief The analyzer */ + Analyzer* analyzer_; + /*! \brief The constraint */ + Expr constraint_; /*! \brief function to be called in recovery */ std::function exit_; }; -/*! - * \brief Analyzer that contains bunch of sub-analyzers. - * - * Each sub-analyzer can make use of another sub-analyzer - * by weak reference of this. - * - * NOTE for sub-analyzer developers: - * If the analyzer uses memoization, we need to clear the internal - * cache when information about a Var has been overrideen. - */ -class Analyzer { - public: - /*! \brief sub-analyzer: const integer bound */ - ConstIntBoundAnalyzer const_int_bound; - /*! \brief sub-analyzer: modular set */ - ModularSetAnalyzer modular_set; - /*! \brief sub-analyzer rewrite simplify */ - RewriteSimplifier rewrite_simplify; - /*! \brief sub-analyzer canonical simplify */ - CanonicalSimplifier canonical_simplify; - /*! \brief constructor */ - Analyzer(); - /*! - * \brief Notify all the sub-analyzers that var - * is created and binded to expr. - * - * Each var can only be binded once. 
- * - * \param var The variable. - * \param expr The expression we bind to. - */ - void Bind(const VarExpr& var, const Expr& expr); - /*! - * \brief Notify all the sub-analyzers that var - * is created and binded to a range. - * - * Each var can only be binded once. - * - * \param var The variable. - * \param range The range we bind to. - */ - void Bind(const VarExpr& var, const Range& range); - /*! - * \brief Whether can we proof expr >= val. - - * Non-negative proof is very useful in integer analysis - * to lower divisions and mods given difference in trunc and ceil mode. - * - * \param expr The expression. - * \param lower_bound The lower bound. - * \return Whether we can proof it. - * - * \note Analyzer will call into sub-analyzers to get the result. - */ - bool CanProveGreaterEqual(const Expr& expr, int64_t lower_bound); -}; - //----------------------------------------------- -// Integer set abstraction API. +// Integer set data structure. // // This is a API build on top of the base // integer analysis API to provide set analysis. //------------------------------------------------ /*! - * \brief Sign of an expression or set. + * \brief Sign type of an integer expression. */ enum SignType { kPositive, @@ -384,8 +344,13 @@ enum SignType { kUnknown }; -// internal node container of int set. -struct IntSetNode; +/*! + * \brief Base class of all IntSet containers. + */ +struct IntSetNode : public Node { + static constexpr const char* _type_key = "IntSet"; + TVM_DECLARE_BASE_NODE_INFO(IntSetNode, Node); +}; /*! * \brief Integer set class, represent a set of integers in one dimension. @@ -407,11 +372,6 @@ class IntSet : public NodeRef { * \return The covering range. */ Range cover_range(Range max_range) const; - /*! - * \brief find an interval that covers the set. - * \return The covering interval set. - */ - IntSet cover_interval() const; /*! \return Lower bound of the set */ Expr min() const; /*! 
\return upper bound of the set */ @@ -476,33 +436,91 @@ class IntSet : public NodeRef { }; /*! - * \brief Base class of all IntSet containers. + * \brief Integer set analyzer. */ -struct IntSetNode : public Node { - static constexpr const char* _type_key = "IntSet"; - TVM_DECLARE_BASE_NODE_INFO(IntSetNode, Node); +class IntSetAnalyzer { + public: + /*! + * \brief Find a symbolic integer set that contains all possible values of + * expr given the domain of each variables. + * + * \param expr The expression of interest. + * \param dom_map The domain map to indicate which variable to relax. + * \return the result of the analysis. + */ + IntSet operator()(const Expr& expr, const Map& dom_map); + + private: + friend class Analyzer; + explicit IntSetAnalyzer(Analyzer* parent); + ~IntSetAnalyzer(); + class Impl; + /*! \brief Internal impl */ + Impl* impl_; }; /*! - * \brief Detect if e can be rewritten as e = sum_{i=0}^{n-1} var[i] * coeff[i] + coeff[n] - * Where coeff[i] and base are invariant of var[j] for all i and j. + * \brief Analyzer that contains bunch of sub-analyzers. * - * \param e The expression to be detected. - * \param vars List of variables to be used in detection. - * \return [coeff[i]] if it is possible, empty array if it is not. - */ -Array DetectLinearEquation(const Expr& e, const Array& vars); - -/*! - * \brief Detect if expression corresponds to clip bound of the vars + * Each sub-analyzer can make use of another sub-analyzer + * by weak reference of this. * - * \param e The expression to be detected. - * \param vars List of variables to be used in detection. - * \return concat([min_value[i], max_value[i]]), None is returned if there is no min or max value - * return empty if the e does not match the pattern. + * NOTE for sub-analyzer developers: + * If the analyzer uses memoization, we need to clear the internal + * cache when information about a Var has been overridden. 
*/ -Array DetectClipBound(const Expr& e, const Array& vars); +class Analyzer { + public: + /*! \brief sub-analyzer: const integer bound */ + ConstIntBoundAnalyzer const_int_bound; + /*! \brief sub-analyzer: modular set */ + ModularSetAnalyzer modular_set; + /*! \brief sub-analyzer rewrite simplify */ + RewriteSimplifier rewrite_simplify; + /*! \brief sub-analyzer canonical simplify */ + CanonicalSimplifier canonical_simplify; + /*! \brief sub-analyzer: int set */ + IntSetAnalyzer int_set; + /*! \brief constructor */ + Analyzer(); + /*! + * \brief Notify all the sub-analyzers that var + * is created and binded to expr. + * + * Each var can only be binded once. + * + * \param var The variable. + * \param expr The expression we bind to. + */ + void Bind(const VarExpr& var, const Expr& expr); + /*! + * \brief Notify all the sub-analyzers that var + * is created and binded to a range. + * + * Each var can only be binded once. + * + * \param var The variable. + * \param range The range we bind to. + */ + void Bind(const VarExpr& var, const Range& range); + /*! + * \brief Whether can we prove expr >= val. + + * Non-negative proof is very useful in integer analysis + * to lower divisions and mods given difference in trunc and ceil mode. + * + * \param expr The expression. + * \param lower_bound The lower bound. + * \return Whether we can prove it. + * + * \note Analyzer will call into sub-analyzers to get the result. + */ + bool CanProveGreaterEqual(const Expr& expr, int64_t lower_bound); +}; +//----------------------------------------------- +// Integer set legacy API. +//------------------------------------------------ /*! * \brief Find an symbolic integer set that contains all possible values of * e given the domain of each iteration variables. @@ -621,6 +639,29 @@ IntSet DeduceBound(Expr v, Expr cond, */ Domain DomainTouched(Stmt body, const Tensor &tensor, bool consider_calls, bool consider_provides); +// Expression pattern detector. +/*! 
+ * \brief Detect if e can be rewritten as e = sum_{i=0}^{n-1} var[i] * coeff[i] + coeff[n] + * Where coeff[i] and base are invariant of var[j] for all i and j. + * + * \param e The expression to be detected. + * \param vars List of variables to be used in detection. + * \return [coeff[i]] if it is possible, empty array if it is not. + */ +Array DetectLinearEquation(const Expr& e, + const Array& vars); + +/*! + * \brief Detect if expression corresponds to clip bound of the vars + * + * \param e The expression to be detected. + * \param vars List of variables to be used in detection. + * \return concat([min_value[i], max_value[i]]), None is returned if there is no min or max value + * return empty if the e does not match the pattern. + */ +Array DetectClipBound(const Expr& e, + const Array& vars); + // implementation inline const IntSetNode* IntSet::operator->() const { return static_cast(node_.get()); diff --git a/include/tvm/base.h b/include/tvm/base.h index ae2d91ff8523..f358f7f5d447 100644 --- a/include/tvm/base.h +++ b/include/tvm/base.h @@ -39,21 +39,24 @@ using ::tvm::Node; using ::tvm::NodeRef; using ::tvm::AttrVisitor; -/*! \brief Macro to make it easy to define node ref type given node */ -#define TVM_DEFINE_NODE_REF(TypeName, NodeName) \ - class TypeName : public ::tvm::NodeRef { \ - public: \ - TypeName() {} \ - explicit TypeName(::tvm::NodePtr<::tvm::Node> n) : NodeRef(n) {} \ - const NodeName* operator->() const { \ - return static_cast(node_.get()); \ - } \ - using ContainerType = NodeName; \ - }; \ +/*! + * \brief Macro to define common node ref methods. + * \param TypeName The name of the NodeRef. + * \param BaseTypeName The Base type. + * \param NodeName The node container type. 
+ */ +#define TVM_DEFINE_NODE_REF_METHODS(TypeName, BaseTypeName, NodeName) \ + TypeName() {} \ + explicit TypeName(::tvm::NodePtr<::tvm::Node> n) : BaseTypeName(n) {} \ + const NodeName* operator->() const { \ + return static_cast(node_.get()); \ + } \ + operator bool() const { return this->defined(); } \ + using ContainerType = NodeName; /*! - * \brief Macro to make it easy to define node ref type that - * has a CopyOnWrite member function. + * \brief Macro to define CopyOnWrite function in a NodeRef. + * \param NodeName The Type of the Node. * * CopyOnWrite will generate a unique copy of the internal node. * The node will be copied if it is referenced by multiple places. @@ -70,25 +73,77 @@ using ::tvm::AttrVisitor; * * \endcode */ -#define TVM_DEFINE_COW_NODE_REF(TypeName, BaseType, NodeName) \ - class TypeName : public BaseType { \ - public: \ - TypeName() {} \ - explicit TypeName(::tvm::NodePtr<::tvm::Node> n) : BaseType(n) {} \ - const NodeName* operator->() const { \ - return static_cast(node_.get()); \ - } \ - inline NodeName* CopyOnWrite() { \ +#define TVM_DEFINE_NODE_REF_COW(NodeName) \ + NodeName* CopyOnWrite() { \ CHECK(node_ != nullptr); \ if (!node_.unique()) { \ NodePtr n = make_node(*(operator->())); \ NodePtr(std::move(n)).swap(node_); \ } \ return static_cast(node_.get()); \ - } \ - using ContainerType = NodeName; \ + } + +/*! \brief Macro to make it easy to define node ref type given node */ +#define TVM_DEFINE_NODE_REF(TypeName, NodeName) \ + class TypeName : public ::tvm::NodeRef { \ + public: \ + TVM_DEFINE_NODE_REF_METHODS(TypeName, ::tvm::NodeRef, NodeName); \ + }; \ + +/*! + * \brief Macro to make it easy to define node ref type that + * has a CopyOnWrite member function. + */ +#define TVM_DEFINE_COW_NODE_REF(TypeName, BaseType, NodeName) \ + class TypeName : public BaseType { \ + public: \ + TVM_DEFINE_NODE_REF_METHODS(TypeName, BaseType, NodeName); \ + TVM_DEFINE_NODE_REF_COW(NodeName); \ }; +/*! 
+ * \brief RAII wrapper function to enter and exit a context object + * similar to Python's with syntax. + * + * \code + * // context class + * class MyContext { + * private: + * friend class With; + * MyContext(arguments); + * void EnterWithScope(); + * void ExitWithScope(); + * }; + * + * { + * With scope(arguments); + * // effect takes place. + * } + * \endcode + * + * \tparam ContextType Type of the context object. + */ +template +class With { + public: + /*! + * \brief constructor. + * Enter the scope of the context. + */ + template + explicit With(Args&& ...args) + : ctx_(std::forward(args)...) { + ctx_.EnterWithScope(); + } + /*! \brief destructor, leaves the scope of the context. */ + ~With() DMLC_THROW_EXCEPTION { + ctx_.ExitWithScope(); + } + + private: + /*! \brief internal context type. */ + ContextType ctx_; +}; /*! * \brief save the node as well as all the node it depends on as json. diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h index 208f086f86c0..187a74552241 100644 --- a/include/tvm/build_module.h +++ b/include/tvm/build_module.h @@ -37,7 +37,7 @@ namespace tvm { /*! * \brief Container for target device information. -* Use target::llvm, target::cuda etc functions instead of constructing directly. +* Use target::llvm, target::cuda etc functions instead of constructing directly. */ class TargetNode : public Node { public: @@ -89,65 +89,47 @@ class TargetNode : public Node { mutable std::string str_repr_; }; +/*! \brief reference class to the target. */ class Target : public NodeRef { public: Target() {} explicit Target(NodePtr n) : NodeRef(n) {} - /*! * \brief Create a Target given a string * \param target_str the string to parse */ - TVM_DLL static Target create(const std::string& target_str); - - /*! - * \brief Push a new target context onto the thread local stack. The Target on top of - * the stack is used to determine which specialization to use when invoking a GenericFunc. 
- * \param target The target to set as the current context. - */ - TVM_DLL static void EnterTargetScope(const tvm::Target& target); - - /*! - * \brief Pop a target off the thread local context stack, restoring the previous target - * as the current context. - */ - TVM_DLL static void ExitTargetScope(); - + TVM_DLL static Target Create(const std::string& target_str); /*! - * \brief Get the current target context from thread local storage. - * \param allow_not_defined If the context stack is empty and this is set to true, an - * undefined Target will be returned. Otherwise, an empty context stack will cause a - * runtime error. - * \return The target that is the current context. The target may not be defined if - * allow_not_defined is true. - */ - TVM_DLL static tvm::Target current_target(bool allow_not_defined = true); + * \brief Get the current target context from thread local storage. + * \param allow_not_defined If the context stack is empty and this is set to true, an + * undefined Target will be returned. Otherwise, an empty context stack will cause a + * runtime error. + * \return The target that is the current context. The target may not be defined if + * allow_not_defined is true. + */ + TVM_DLL static tvm::Target Current(bool allow_not_defined = true); - inline const TargetNode* operator->() const { + const TargetNode* operator->() const { return static_cast(node_.get()); } using ContainerType = TargetNode; -}; - -/*! - * \brief RAII container to provide a scoped target context. Pushes a target onto the - * context stack when constructed, and pops it when destructed. - */ -struct TargetContext { + class Internal; + private: + // enable with syntax. + friend class Internal; + friend class With; /*! - * \brief Enter a new target context. The given target becomes the new current context. - * When the TargetContext is destructed, the previous context is restored. - * \param target The target to set as the new current context. 
+ * \brief Push a new target context onto the thread local stack. + * The Target on top of the stack is used to determine which + * specialization to use when invoking a GenericFunc. */ - explicit TargetContext(const tvm::Target& target) { - Target::EnterTargetScope(target); - } - - /*! \brief Destructor. Pops the context off the thread local stack. */ - ~TargetContext() { - Target::ExitTargetScope(); - } + TVM_DLL void EnterWithScope(); + /*! + * \brief Pop a target off the thread local context stack, + * restoring the previous target as the current context. + */ + TVM_DLL void ExitWithScope(); }; /*! \brief This namespace provides functions to construct Target instances */ @@ -190,11 +172,9 @@ TVM_DLL Target stackvm(const std::vector& options = } // namespace target -class BuildConfig; - /*! -* \brief Container for build configuration options -*/ + * \brief Container for build configuration options + */ class BuildConfigNode : public Node { public: /*! @@ -246,6 +226,9 @@ class BuildConfigNode : public Node { /*! \brief Whether to disable select rewriting. */ bool disable_select_rewriting = false; + /*! \brief Whether to disable loop vectorization. */ + bool disable_vectorize = false; + void VisitAttrs(AttrVisitor* v) final { v->Visit("data_alignment", &data_alignment); v->Visit("offset_factor", &offset_factor); @@ -260,6 +243,7 @@ class BuildConfigNode : public Node { v->Visit("dump_pass_ir", &dump_pass_ir); v->Visit("instrument_bound_checkers", &instrument_bound_checkers); v->Visit("disable_select_rewriting", &disable_select_rewriting); + v->Visit("disable_vectorize", &disable_vectorize); } static constexpr const char* _type_key = "BuildConfig"; @@ -267,69 +251,48 @@ class BuildConfigNode : public Node { }; /*! -* \brief Container for build configuration options -*/ + * \brief Build configuration for compilations. 
+ */ class BuildConfig : public ::tvm::NodeRef { public: BuildConfig() {} explicit BuildConfig(NodePtr<::tvm::Node> n) : NodeRef(n) {} - const BuildConfigNode* operator->() const { return static_cast(node_.get()); } - BuildConfigNode* operator->() { return static_cast(node_.get()); } - /*! - * \brief Push a new BuildConfig context onto the thread local stack. - * \param build_config The configuration to set as the current context. - */ - TVM_DLL static void EnterBuildConfigScope(const tvm::BuildConfig& build_config); - - /*! - * \brief Pop a build config off the thread local context stack, restoring the previous - * configuration as the current context. + * \brief Construct a BuildConfig containing an empty build config node. + * \return The new BuildConfig */ - TVM_DLL static void ExitBuildConfigScope(); - + TVM_DLL static BuildConfig Create(); /*! * \brief Get the current BuildConfig context from thread local storage, or a default * configuration if a BuildConfig scope has not been entered. * \return The configuration that is the current context. */ - TVM_DLL static tvm::BuildConfig Current(); + TVM_DLL static BuildConfig Current(); using ContainerType = BuildConfigNode; -}; + class Internal; -/*! - * \brief RAII container to provide a scoped BuildConfig context. Pushes a configuration onto the - * context stack when constructed, and pops it when destructed. - */ -struct BuildConfigContext { + private: + // Enable with syntax. + friend class With; /*! - * \brief Enter a new BuildConfig context. The given BuildConfig becomes the new current - * context. When the BuildConfigContext is destructed, the previous context is restored. - * \param build_config The BuildConfig to set as the new current context. + * \brief Push a new BuildConfig context onto the thread local stack. */ - explicit BuildConfigContext(const tvm::BuildConfig& build_config) { - BuildConfig::EnterBuildConfigScope(build_config); - } + TVM_DLL void EnterWithScope(); - /*! \brief Destructor. 
Pops the context off the thread local stack. */ - ~BuildConfigContext() { - BuildConfig::ExitBuildConfigScope(); - } + /*! + * \brief Pop a build config off the thread local context stack, + * restoring the previous configuration as the current context. + */ + TVM_DLL void ExitWithScope(); }; -/*! -* \brief Construct a BuildConfig containing a new BuildConfigNode -* \return The new BuildConfig -*/ -TVM_DLL BuildConfig build_config(); - /*! * \brief Build a LoweredFunc given a schedule, args and binds * \param sch The schedule to lower. diff --git a/include/tvm/expr_operator.h b/include/tvm/expr_operator.h index 4ef3effaf251..f289bdd810d5 100644 --- a/include/tvm/expr_operator.h +++ b/include/tvm/expr_operator.h @@ -33,6 +33,7 @@ #include "ir.h" namespace tvm { + /*! * \brief Make a const value with certain data type. * \param t The target type. @@ -427,6 +428,13 @@ TVM_DLL Expr abs(Expr x); */ TVM_DLL Expr sum(Expr source, Array axis); +/*! + * \brief logical And of of source expression over axis + * \param source The source expression. + * \param axis List of iteration variables that will be used for reduction. + */ +TVM_DLL Expr all(Expr source, Array axis); + /*! * \brief max of of source expression over axis * \param source The source expression. @@ -551,6 +559,12 @@ inline Expr MakeConstScalar(Type t, ValueType value) { if (t.is_int()) return ir::IntImm::make(t, static_cast(value)); if (t.is_uint()) return ir::UIntImm::make(t, static_cast(value)); if (t.is_float()) return ir::FloatImm::make(t, static_cast(value)); + // For now, we store const scalar values of custom datatypes within doubles; later, during the + // datatypes lowering pass, we will lower the value to its true representation in the format + // specified by the datatype. + // TODO(gus) when do we need to start worrying about doubles not being precise enough? 
+ if (static_cast(t.code()) >= static_cast(kCustomBegin)) + return ir::FloatImm::make(t, static_cast(value)); LOG(FATAL) << "cannot make const for type " << t; return Expr(); } diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h index 20b56e0676eb..e1c92e50e6ad 100644 --- a/include/tvm/ir_pass.h +++ b/include/tvm/ir_pass.h @@ -250,35 +250,42 @@ Stmt UnrollLoop(Stmt stmt, /*! * \brief vectorize the constant loops - * \param stmt The statment to be vectorized. + * \param stmt The statement to be vectorized. * \return Transformed stmt. */ Stmt VectorizeLoop(Stmt stmt); +/*! + * \brief convert vectorized loops into serialized loops + * \param stmt The statement to skip vectorization on. + * \return Transformed stmt. + */ +Stmt SkipVectorize(Stmt stmt); + /*! * \brief instruments bound checkers. -* \param stmt The statment to be instrumented. -* \return Instrumented Stmt. +* \param stmt The statement to be instrumented. +* \return Instrumented stmt. */ Stmt InstrumentBoundCheckers(Stmt stmt); /*! * \brief Inject virtual thread loops into stmt. - * \param stmt The statment to be transformed. + * \param stmt The statement to be transformed. * \return Transformed stmt. */ Stmt InjectVirtualThread(Stmt stmt); /*! * \brief Inject prefetch instructions into stmt. - * \param stmt The statment to be transformed. + * \param stmt The statement to be transformed. * \return Transformed stmt. */ Stmt InjectPrefetch(Stmt stmt); /*! * \brief Inject double buffer into stmt. - * \param stmt The statment to be transformed. + * \param stmt The statement to be transformed. * \param split_loop Loop splitting factor. * \return Transformed stmt. */ @@ -287,7 +294,7 @@ Stmt InjectDoubleBuffer(Stmt stmt, int split_loop); /*! * \brief Inject copy intrinsics with optional pad. * - * \param stmt The statment to be transformed. + * \param stmt The statement to be transformed. * \param pragma_key The pragma key for hint of copy. 
* \param fintrin The function with signature * @@ -308,7 +315,7 @@ Stmt InjectCopyIntrin(Stmt stmt, * Trying to share space between allocations to make * a static allocation plan when possible. * - * \param stmt The stmt to be trasnformed + * \param stmt The stmt to be transformed * \return Transformed stmt. */ Stmt StorageRewrite(Stmt stmt); @@ -324,7 +331,7 @@ Stmt LoopPartition(Stmt stmt, bool split_const_loop); /*! * \brief Detect and insert sync points to co-processor. * - * \param stmt The stmt to be trasnformed + * \param stmt The stmt to be transformed * \return Transformed stmt. */ Stmt CoProcSync(Stmt stmt); @@ -332,7 +339,7 @@ Stmt CoProcSync(Stmt stmt); /*! * \brief Lift common attrs with attr_key to outer scope. * - * \param stmt The stmt to be trasnformed + * \param stmt The stmt to be transformed * \param attr_key The attribute key to be checked. * \return Transformed stmt. */ @@ -340,7 +347,7 @@ Stmt LiftAttrScope(Stmt stmt, std::string attr_key); /*! * \brief Detect and rewrite unsafe select that contains memory access. - * \param stmt The statment to be rewritten. + * \param stmt The statement to be rewritten. * \return Transformed stmt. */ Stmt RewriteUnsafeSelect(Stmt stmt); @@ -349,7 +356,7 @@ Stmt RewriteUnsafeSelect(Stmt stmt); * \brief Lower attached storage access information. * Do this pass after all storage access analysis finish. * - * \param stmt The stmt to be trasnformed + * \param stmt The stmt to be transformed * \return Transformed stmt. */ Stmt LowerStorageAccessInfo(Stmt stmt); @@ -358,7 +365,7 @@ Stmt LowerStorageAccessInfo(Stmt stmt); * \brief Decorate the stmt with a device scope, this is helpful for * hardware accelerator without thread blocks. * - * \param stmt The stmt to be trasnformed + * \param stmt The stmt to be transformed * \return Transformed stmt. */ Stmt DecorateDeviceScope(Stmt stmt); @@ -381,7 +388,7 @@ Stmt DecorateDeviceScope(Stmt stmt); * \return a LoweredFunc with the specified signiture. 
* * \note - * The function signiture have two cases + * The function signature have two cases * * let num_packed_args = len(api_args) - num_unpacked_args; * @@ -500,6 +507,17 @@ LoweredFunc PointerValueTypeRewrite(LoweredFunc f); */ LoweredFunc LowerIntrin(LoweredFunc f, const std::string& target); +/*! + * \brief Lower custom datatypes. + * + * See tvm::datatypes::Registry for more information on adding custom datatypes. + * + * \param f The device function to be lowered. + * \param target The target device. + * \return Transformed function. + */ +LoweredFunc LowerCustomDatatypes(LoweredFunc f, const std::string& target); + /*! * \brief Verify if memory accesses are legal for a specific target device type. * diff --git a/include/tvm/operation.h b/include/tvm/operation.h index 15a8c1215177..38dc39bbe7a7 100644 --- a/include/tvm/operation.h +++ b/include/tvm/operation.h @@ -286,6 +286,8 @@ class TensorComputeOpNode : public BaseComputeOpNode { Array inputs; /*! \brief region of input tensors */ Array input_regions; + /*! \brief scalar expression inputs */ + Array scalar_inputs; /*! 
\brief constructor */ TensorComputeOpNode() {} // override functions @@ -314,6 +316,7 @@ class TensorComputeOpNode : public BaseComputeOpNode { v->Visit("intrin", &intrin); v->Visit("inputs", &inputs); v->Visit("input_regions", &input_regions); + v->Visit("scalar_inputs", &scalar_inputs); } static Operation make(std::string name, std::string tag, @@ -322,7 +325,8 @@ class TensorComputeOpNode : public BaseComputeOpNode { int schedulable_ndim, TensorIntrin intrin, Array tensors, - Array regions); + Array regions, + Array scalar_inputs); static constexpr const char* _type_key = "TensorComputeOp"; TVM_DECLARE_NODE_TYPE_INFO(TensorComputeOpNode, BaseComputeOpNode); diff --git a/include/tvm/relay/attrs/algorithm.h b/include/tvm/relay/attrs/algorithm.h index 20f135c11bba..ce14a6a2d535 100644 --- a/include/tvm/relay/attrs/algorithm.h +++ b/include/tvm/relay/attrs/algorithm.h @@ -25,6 +25,7 @@ #define TVM_RELAY_ATTRS_ALGORITHM_H_ #include +#include #include namespace tvm { @@ -48,6 +49,31 @@ struct ArgsortAttrs : public tvm::AttrsNode { } }; +struct TopKAttrs : public tvm::AttrsNode { + int k; + int axis; + bool is_ascend; + std::string ret_type; + DataType dtype; + + TVM_DECLARE_ATTRS(TopKAttrs, "relay.attrs.TopkAttrs") { + TVM_ATTR_FIELD(k).set_default(1) + .describe("Number of top elements to select"); + TVM_ATTR_FIELD(axis).set_default(-1) + .describe("Axis along which to sort the input tensor."); + TVM_ATTR_FIELD(ret_type).set_default("both") + .describe("The return type [both, values, indices]." + "both - return both top k data and indices." + "values - return top k data only." + "indices - return top k indices only."); + TVM_ATTR_FIELD(is_ascend).set_default(false) + .describe("Whether to sort in ascending or descending order." 
+ "By default, sort in descending order"); + TVM_ATTR_FIELD(dtype).set_default(NullValue()) + .describe("Data type of the output indices."); + } +}; + } // namespace relay } // namespace tvm #endif // TVM_RELAY_ATTRS_ALGORITHM_H_ diff --git a/include/tvm/relay/attrs/image.h b/include/tvm/relay/attrs/image.h index f3caf213575d..e74c0f53ad65 100644 --- a/include/tvm/relay/attrs/image.h +++ b/include/tvm/relay/attrs/image.h @@ -25,6 +25,7 @@ #define TVM_RELAY_ATTRS_IMAGE_H_ #include +#include #include namespace tvm { diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index 8a1aca0a4b4a..0fbf984817b9 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -25,6 +25,7 @@ #define TVM_RELAY_ATTRS_NN_H_ #include +#include #include namespace tvm { diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index 1b82412d0482..5e315185ea7f 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -25,6 +25,7 @@ #define TVM_RELAY_ATTRS_TRANSFORM_H_ #include +#include #include namespace tvm { @@ -101,7 +102,8 @@ struct TakeAttrs : public tvm::AttrsNode { TVM_ATTR_FIELD(mode).set_default("clip") .describe("Specify how out-of-bound indices will behave." "clip - clip to the range (default)" - "wrap - wrap around the indices"); + "wrap - wrap around the indices" + "fast - no clip or wrap around (user must make sure indices are in-bound)"); } }; diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h index 11b4ebfcfaad..b98bbfc5988e 100644 --- a/include/tvm/relay/attrs/vision.h +++ b/include/tvm/relay/attrs/vision.h @@ -25,6 +25,7 @@ #define TVM_RELAY_ATTRS_VISION_H_ #include +#include #include namespace tvm { @@ -79,10 +80,16 @@ struct MultiBoxTransformLocAttrs /*! 
\brief Attributes used in get_valid_counts operator */ struct GetValidCountsAttrs : public tvm::AttrsNode { double score_threshold; + int id_index; + int score_index; TVM_DECLARE_ATTRS(GetValidCountsAttrs, "relay.attrs.GetValidCountsAttrs") { TVM_ATTR_FIELD(score_threshold).set_default(0.0) .describe("Lower limit of score for valid bounding boxes."); + TVM_ATTR_FIELD(id_index).set_default(0) + .describe("Axis index of id."); + TVM_ATTR_FIELD(score_index).set_default(1) + .describe("Index of the scores/confidence of boxes."); } }; diff --git a/include/tvm/relay/error.h b/include/tvm/relay/error.h index 6b9a1fa7b7c6..5189fd982d37 100644 --- a/include/tvm/relay/error.h +++ b/include/tvm/relay/error.h @@ -83,7 +83,7 @@ struct Error : public dmlc::Error { * * The final mode represents the old mode, if we report an error that has no span or * expression, we will default to throwing an exception with a textual representation - * of the error and no indication of where it occured in the original program. + * of the error and no indication of where it occurred in the original program. * * The latter mode is not ideal, and the goal of the new error reporting machinery is * to avoid ever reporting errors in this style. diff --git a/include/tvm/relay/interpreter.h b/include/tvm/relay/interpreter.h index 15c96bb12822..68b7ccab99c7 100644 --- a/include/tvm/relay/interpreter.h +++ b/include/tvm/relay/interpreter.h @@ -182,17 +182,22 @@ RELAY_DEFINE_NODE_REF(RefValue, RefValueNode, Value); class ConstructorValue; struct ConstructorValueNode : ValueNode { - Constructor constructor; + int tag; tvm::Array fields; + /*! \brief Optional field tracking ADT constructor. 
*/ + Constructor constructor; + void VisitAttrs(tvm::AttrVisitor* v) final { - v->Visit("constructor", &constructor); + v->Visit("tag", &tag); v->Visit("fields", &fields); + v->Visit("constructor", &constructor); } - TVM_DLL static ConstructorValue make(Constructor constructor, - tvm::Array fields); + TVM_DLL static ConstructorValue make(int tag, + tvm::Array fields, + Constructor construtor = {}); static constexpr const char* _type_key = "relay.ConstructorValue"; TVM_DECLARE_NODE_TYPE_INFO(ConstructorValueNode, ValueNode); diff --git a/include/tvm/relay/module.h b/include/tvm/relay/module.h index 6441fb3f5b9c..638f75968fd3 100644 --- a/include/tvm/relay/module.h +++ b/include/tvm/relay/module.h @@ -87,14 +87,14 @@ class ModuleNode : public RelayNode { * \param update Controls whether you can replace a definition in the * environment. */ - void Add(const GlobalVar& var, const Function& func, bool update = false); + TVM_DLL void Add(const GlobalVar& var, const Function& func, bool update = false); /*! * \brief Add a type-level definition to the global environment. * \param var The var of the global type definition. * \param type The type definition. */ - void AddDef(const GlobalTypeVar& var, const TypeData& type); + TVM_DLL void AddDef(const GlobalTypeVar& var, const TypeData& type); /*! * \brief Add a function to the global environment. @@ -103,69 +103,69 @@ class ModuleNode : public RelayNode { * * It does not do type inference as Add does. */ - void AddUnchecked(const GlobalVar& var, const Function& func); + TVM_DLL void AddUnchecked(const GlobalVar& var, const Function& func); /*! * \brief Update a function in the global environment. * \param var The name of the global function to update. * \param func The new function. */ - void Update(const GlobalVar& var, const Function& func); + TVM_DLL void Update(const GlobalVar& var, const Function& func); /*! * \brief Remove a function from the global environment. * \param var The name of the global function to update. 
*/ - void Remove(const GlobalVar& var); + TVM_DLL void Remove(const GlobalVar& var); /*! * \brief Lookup a global function by its variable. * \param str The unique string specifying the global variable. * \returns The global variable. */ - GlobalVar GetGlobalVar(const std::string& str); + TVM_DLL GlobalVar GetGlobalVar(const std::string& str) const; /*! * \brief Look up a global function by its name. * \param str The unique string specifying the global variable. * \returns The global variable. */ - GlobalTypeVar GetGlobalTypeVar(const std::string& str); + TVM_DLL GlobalTypeVar GetGlobalTypeVar(const std::string& str) const; /*! * \brief Lookup a global function by its variable. * \param var The global var to lookup. * \returns The function named by the variable argument. */ - Function Lookup(const GlobalVar& var); + TVM_DLL Function Lookup(const GlobalVar& var) const; /*! * \brief Lookup a global function by its string name * \param name The name of the function. * \returns The function named by the argument. */ - Function Lookup(const std::string& name); + TVM_DLL Function Lookup(const std::string& name) const; /*! * \brief Lookup a global type definition by its variable. * \param var The var of the global type definition. * \return The type definition. */ - TypeData LookupDef(const GlobalTypeVar& var); + TVM_DLL TypeData LookupDef(const GlobalTypeVar& var) const; /*! * \brief Lookup a global type definition by its name. * \param var The name of the global type definition. * \return The type definition. */ - TypeData LookupDef(const std::string& var); + TVM_DLL TypeData LookupDef(const std::string& var) const; /*! * \brief Update the functions inside this environment by * functions in another environment. * \param other The other environment. */ - void Update(const Module& other); + TVM_DLL void Update(const Module& other); /*! \brief Construct a module from a standalone expression. 
* @@ -177,7 +177,7 @@ class ModuleNode : public RelayNode { * * \returns A module with expr set as the entry point. */ - static Module FromExpr( + TVM_DLL static Module FromExpr( const Expr& expr, const tvm::Map& global_funcs = {}); diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h index 31067925fa63..fff630f55eb7 100644 --- a/include/tvm/relay/pass.h +++ b/include/tvm/relay/pass.h @@ -20,51 +20,18 @@ /*! * \file tvm/relay/pass.h * \brief The set of Relay passes written in C++. - * - * This file also implements a pass manager. The pass manager manages a sequence - * of Relay-to-Relay transformation passes over a particlar unit of AST. The - * design is largely inspired from LLVM's pass manager and modern deep learning - * frameworks that perform tensor->tensor transformations. - * - * The responsibilities of a traditional compiler pass manager usually involves: - * - Organizing the execution order of optimization passes though not - * necessarily in the optimal sequence. - * - Collecting required analysis information and keep them up-to-date. - * - Reducing the effort required to implement new passes for compiler - * developers, etc. - * - * Similar to LLVM's pass manager, we designed the Relay pass manager to work - * different granularity, i.e. module level, function level, and even sequential - * passe that contains a host of passes. - * - * However, we also extend the functionality of the traditional pass manager - * with the consideration of requirements/convention from deep learning - * frameworks, such as Pytorch and Gluon, etc. Each pass in the Relay pass - * manager performs the Relay.Module -> Relay.Module transformation. All - * different types of passes, including the sequential-level pass object, are - * essentially pass objects. This design, therefore, effectively provides users - * a consistent and convenient interface, i.e. Pass, to play with. It offers a - * means to ease the development and testing of Relay passes. 
For example, with - * the pass manager, external users will be able to have custom passes correctly - * scheduled without having to modify a single handcrafted pass order. - * - * In the future we need to describe constraints between passes. For example, - * we may want to preserve dependencies between different passes and validate - * them on the completion of a certain pass. - * - * We also need to store side information and import the error reporting system. - */ + */ #ifndef TVM_RELAY_PASS_H_ #define TVM_RELAY_PASS_H_ #include #include -#include #include #include #include #include #include +#include #include #include #include @@ -72,174 +39,6 @@ namespace tvm { namespace relay { -namespace pass { - -/* - * \brief The context of pass. - */ -class PassContext; - -/*! - * \brief PassContextNode contains the information that a pass can rely on, such as - * analysis results. - */ -class PassContextNode : public RelayNode { - public: - /*! - * \brief The error reporter used to notify users why an optimization fails. - */ - ErrorReporter err_reporter; - - PassContextNode() = default; - - void VisitAttrs(tvm::AttrVisitor* v) final { - } - - TVM_DLL static PassContext make(); - - static constexpr const char* _type_key = "relay.PassContext"; - TVM_DECLARE_NODE_TYPE_INFO(PassContextNode, RelayNode); -}; - -TVM_DEFINE_NODE_REF(PassContext, PassContextNode) - -/* - * \brief The meta data of a pass. - * - * PassInfo can be extended conveniently in the future if more meta information - * is needed. - */ -class PassInfo; - -/*! - * \brief PassInfoNode contains meta data that will be used to help optimization - * and analysis. - */ -class PassInfoNode : public RelayNode { - public: - /*! \brief The minimal optimization level that this pass will be enabled. */ - int opt_level; - - /*! \brief The name of an optimization/analysis pass. */ - std::string name; - - /*! \brief The passes that are required to perform the current pass. 
*/ - tvm::Array required; - - PassInfoNode() = default; - - void VisitAttrs(tvm::AttrVisitor* v) final { - v->Visit("opt_level", &opt_level); - v->Visit("name", &name); - v->Visit("required", &required); - } - - TVM_DLL static PassInfo make(int opt_level, std::string name, - tvm::Array required); - - static constexpr const char* _type_key = "relay.PassInfo"; - TVM_DECLARE_NODE_TYPE_INFO(PassInfoNode, RelayNode); -}; - -TVM_DEFINE_NODE_REF(PassInfo, PassInfoNode) - -class Pass; - -/*! - * \brief PassNode is the base type of differnt types of optimization passes. - * It is designed as a pure class and implemented by different pass subclasses - * at different granularity of Relay nodes. - */ -class PassNode : public RelayNode { - public: - /* - * \brief Get the pass information/meta data. */ - virtual PassInfo Info() const = 0; - - /*! - * \brief Set the context information for a pass. - * - * \param pass_ctx The context information for a certain pass. - */ - virtual void SetContext(const PassContext& pass_ctx) = 0; - - /*! - * \brief Execute the optimization pass using a functor. - * - * \param mod The module that an optimization pass runs on. - * - * \return The updated module. - */ - virtual Module operator()(const Module& mod) const = 0; - - void VisitAttrs(tvm::AttrVisitor* v) override {} - - static constexpr const char* _type_key = "relay.Pass"; - TVM_DECLARE_BASE_NODE_INFO(PassNode, RelayNode); -}; - -class Pass : public NodeRef { - public: - Pass() = default; - explicit Pass(NodePtr p) : NodeRef(p) {} - - PassNode* operator->() const { - return static_cast(this->node_.get()); - } - - using ContainerType = PassNode; -}; - -/* - * \brief Create a module pass. - * - * \param pass_func The packed function that contains the optimization. - * \param opt_level The optimization level of the module pass. - * \param name The name of the module pass. - * \param required The list of the passes that the module pass is dependent on. - * - * \return The created module pass. 
- */ -Pass CreateModulePass( - const runtime::TypedPackedFunc& pass_func, - int opt_level, - const std::string& name, - const tvm::Array& required); - -/* - * \brief Create a function pass. - * - * \param pass_func The packed function that contains the optimization. - * \param opt_level The optimization level of the function pass. - * \param name The name of the function pass. - * \param required The list of the passes that the function pass is dependent on. - * - * \return The created function pass. - */ -Pass CreateFunctionPass( - const runtime::TypedPackedFunc& pass_func, - int opt_level, - const std::string& name, - const tvm::Array& required); -/* - * \brief Create a sequential pass. - * - * \param passes The optimization passes will be performed. - * \param opt_level The optimization level of the sequential pass. - * \param name The name of the sequential pass. - * \param required The list of the passes that the sequential pass is dependent on. - * \param disabled The disabled passes. - * - * \return The created sequential pass. - */ -Pass CreateSequentialPass(const tvm::Array& passes, - int opt_level, - const std::string& name, - const tvm::Array& required, - const tvm::Array& disabled); - -} // namespace pass - /*! * \brief Infer the type of an expression. * @@ -286,7 +85,8 @@ TVM_DLL Function InferType(const Function& f, const Module& mod, */ TVM_DLL Kind KindCheck(const Type& t, const Module& mod); -/*! \brief Compare two expressions for structural equivalence. +/*! + * \brief Compare two expressions for structural equivalence. * * This comparison operator respects scoping and compares * expressions without regard to variable choice. @@ -303,7 +103,8 @@ TVM_DLL Kind KindCheck(const Type& t, const Module& mod); */ TVM_DLL bool AlphaEqual(const Expr& e1, const Expr& e2); -/*! \brief Compare two types for structural equivalence. +/*! + * \brief Compare two types for structural equivalence. 
* * This comparison operator respects scoping and compares * expressions without regard to variable choice. @@ -321,7 +122,26 @@ TVM_DLL bool AlphaEqual(const Expr& e1, const Expr& e2); */ TVM_DLL bool AlphaEqual(const Type& t1, const Type& t2); -/*! \brief Add abstraction over a function +/*! + * \brief Compare two patterns for structural equivalence. + * + * This comparison operator respects scoping and compares + * patterns without regard to variable choice. + * + * For example: `A(x, _, y)` is equal to `A(z, _, a)`. + * + * See https://en.wikipedia.org/wiki/Lambda_calculus#Alpha_equivalence + * for more details. + * + * \param t1 The left hand pattern. + * \param t2 The right hand pattern. + * + * \return true if equal, otherwise false + */ +TVM_DLL bool AlphaEqual(const Pattern& t1, const Pattern& t2); + +/*! + * \brief Add abstraction over a function * * For example: `square` is transformed to * `fun x -> square x`. @@ -337,7 +157,8 @@ TVM_DLL bool AlphaEqual(const Type& t1, const Type& t2); */ TVM_DLL Expr EtaExpand(const Expr& e, const Module& mod); -/*! \brief Check that each Var is only bound once. +/*! + * \brief Check that each Var is only bound once. * * For example, the expression `let x = 1 in let x = 2 in 3` bound x twice. * @@ -350,7 +171,8 @@ TVM_DLL Expr EtaExpand(const Expr& e, const Module& mod); */ TVM_DLL bool WellFormed(const Expr& expr); -/*! \brief Get all bound variables from expression expr. +/*! + * \brief Get all bound variables from expression expr. * * Bound variables are all variables that are declared in the expr. * They only have meaning inside that expr, and can only be used in it. @@ -361,7 +183,8 @@ TVM_DLL bool WellFormed(const Expr& expr); */ TVM_DLL tvm::Array BoundVars(const Expr& expr); -/*! \brief Get all bound variables from pattern pat. +/*! + * \brief Get all bound variables from pattern pat. * * Bound variables are all variables that got bound by the pat. 
* They only have meaning inside that expr, and can only be used in it. @@ -372,7 +195,8 @@ TVM_DLL tvm::Array BoundVars(const Expr& expr); */ TVM_DLL tvm::Array BoundVars(const Pattern& pat); -/*! \brief Get free type parameters from expression expr. +/*! + * \brief Get free type parameters from expression expr. * * Free variables are variables that are not bound by a * let or a function parameter in the context. @@ -383,7 +207,8 @@ TVM_DLL tvm::Array BoundVars(const Pattern& pat); */ TVM_DLL tvm::Array FreeVars(const Expr& expr); -/*! \brief Get all variables from expression expr. +/*! + * \brief Get all variables from expression expr. * * \param expr the expression. * @@ -391,7 +216,8 @@ TVM_DLL tvm::Array FreeVars(const Expr& expr); */ TVM_DLL tvm::Array AllVars(const Expr& expr); -/*! \brief Get free TypeVars from expression expr. +/*! + * \brief Get free TypeVars from expression expr. * * Free type parameters are type parameters that are not bound by a function * type in the context. @@ -403,7 +229,8 @@ TVM_DLL tvm::Array AllVars(const Expr& expr); */ TVM_DLL tvm::Array FreeTypeVars(const Expr& expr, const Module& mod); -/*! \brief Get free TypeVars from type t. +/*! + * \brief Get free TypeVars from type t. * * Free type parameters are type parameters that are not bound by a function * type in the context. @@ -415,7 +242,8 @@ TVM_DLL tvm::Array FreeTypeVars(const Expr& expr, const Module& mod); */ TVM_DLL tvm::Array FreeTypeVars(const Type& t, const Module& mod); -/*! \brief Get all bound type variables from expression expr. +/*! + * \brief Get all bound type variables from expression expr. * * Bound variables are all type variables that are declared in the expr. * They only have meaning inside that expr, and can only be used in it. @@ -427,7 +255,8 @@ TVM_DLL tvm::Array FreeTypeVars(const Type& t, const Module& mod); */ TVM_DLL tvm::Array BoundTypeVars(const Expr& expr, const Module& mod); -/*! \brief Get all bound type variables from type t. +/*! 
+ * \brief Get all bound type variables from type t. * * Bound variables are all type variables that are declared in the type. * They only have meaning inside that type, and can only be used in it. @@ -439,7 +268,8 @@ TVM_DLL tvm::Array BoundTypeVars(const Expr& expr, const Module& mod); */ TVM_DLL tvm::Array BoundTypeVars(const Type& t, const Module& mod); -/*! \brief Get all type variables in expression expr. +/*! + * \brief Get all type variables in expression expr. * * \param expr the expression. * \param mod the module. @@ -448,7 +278,8 @@ TVM_DLL tvm::Array BoundTypeVars(const Type& t, const Module& mod); */ TVM_DLL tvm::Array AllTypeVars(const Expr& expr, const Module& mod); -/*! \brief Get all type variables in type t. +/*! + * \brief Get all type variables in type t. * * \param t the type. * \param mod the module. @@ -465,32 +296,39 @@ TVM_DLL tvm::Array AllTypeVars(const Type& t, const Module& mod); * For example, this pass should turn `let a = 1 in 2` into `2`, * as the value of the expression does not depend on a. * - * As another example, `let a = 1 in a` will be optimized into 1. + * As another example, `let a = 1 in a` will be optimized into 1, + * if the flag is turned on. * * \param e the expression to optimize. + * \param inline_once whether or not to inline binding used one. * * \return the optimized expression. */ -TVM_DLL Expr DeadCodeElimination(const Expr& e); +TVM_DLL Expr DeadCodeElimination(const Expr& e, bool inline_once = false); /*! * \brief Fold constant expressions. + * * \param expr the expression to be optimized. + * * \return The optimized expression. */ TVM_DLL Expr FoldConstant(const Expr& expr); /*! * \brief Fuse operations into expr into seperate functions. + * * \param expr The expression. * \param fuse_opt_level Optimization level. * \param mod the module. + * * \return The optimized expression. */ TVM_DLL Expr FuseOps(const Expr& expr, int fuse_opt_level, const Module& mod); /*! 
* \brief Apply rewrite rules to rewrite the expr in post DFS order. + * * \param expr The expression. * \param rewrite_map_attr_name The Op's attr name which corresponds to the rewrite * rule function. @@ -500,84 +338,77 @@ TVM_DLL Expr FuseOps(const Expr& expr, int fuse_opt_level, const Module& mod); * \return The rewritten expression. */ TVM_DLL Expr ForwardRewrite(const Expr& expr, - const std::string& rewrite_map_attr_name, - std::function fcontext = nullptr, - std::function fmulti_ref_trigger = nullptr); + const std::string& rewrite_map_attr_name, + std::function fcontext = nullptr, + std::function fmulti_ref_trigger = nullptr); /*! * \brief Apply rewrite rules to rewrite the expr in post DFS order. + * * \param expr The expression. * \param rewrite_func The rewrite func that will apply to all operators. * \param fcontext Additional callback to provide context argument for each call node. * \param fmulti_ref_trigger Transformation function to be called when * an Expr consumed by multiple callers. + * * \return The rewritten expression. */ TVM_DLL Expr ForwardRewrite(const Expr& expr, - const FForwardRewrite& rewrite_func, - std::function fcontext = nullptr, - std::function fmulti_ref_trigger = nullptr); + const FForwardRewrite& rewrite_func, + std::function fcontext = nullptr, + std::function fmulti_ref_trigger = nullptr); /*! * \brief Rewrite the annotated program. + * * \param expr The expression. * \param fallback_device The fallback device which is the default device for * operators without annotation. + * * \return The updated program. */ TVM_DLL Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device); /*! * \brief Collect the device mapping information of each expression. + * * \param expr The expression. + * * \return The device mapping. */ TVM_DLL Map CollectDeviceInfo(const Expr& expr); -/*! \brief A hashing structure in the style of std::hash. */ -struct StructuralHash { - /*! \brief Hash a Relay type. 
- * - * Implements structural hashing of a Relay type. - * - * \param type the type to hash. - * - * \return the hash value. - */ - size_t operator()(const Type& type) const; - - /*! \brief Hash a Relay expression. - * - * Implements structural hashing of a Relay expression. - * - * \param expr the expression to hash. - * - * \return the hash value. - */ - size_t operator()(const Expr& expr) const; -}; +/*! + * \brief Collect the device anntation operators. + * + * \param expr The expression. + * + * \return The annotated expression to device type mapping for annotation ops. + */ +TVM_DLL Map CollectDeviceAnnotationOps(const Expr& expr); -/*! \brief turn a dataflow graph into Administrative Normal Form, or A-Normal Form (ANF). +/*! + * \brief turn a dataflow graph into Administrative Normal Form, or A-Normal Form (ANF). * * It will turn an expression that is in a graph form (with sharing implicit), * to an expression with explicit sharing (A-Normal Form). * * The scope of the root expression is the global scope. - + * * The scope of any non root expression is the least common ancestor of all it's scope. * * Values are ordered by post-DFS order in each scope. * - * \param e the expression to observably share - * + * \param e the expression to observably share. * \param mod The module used for referencing global functions, can be * None. * - * \return expression in A-Normal Form + * \return expression in A-Normal Form. */ TVM_DLL Expr ToANormalForm(const Expr& e, const Module& mod); -/*! \brief Remove let binding and directly share via pointer instead. +/*! + * \brief Remove let binding and directly share via pointer instead. * * It will remove all let binding, * and turn all of the variable bound by let into direct pointer reference. @@ -588,18 +419,72 @@ TVM_DLL Expr ToANormalForm(const Expr& e, const Module& mod); */ TVM_DLL Expr ToGraphNormalForm(const Expr& e); -/*! \brief Aggressive constant propagation/constant folding/inlining. +/*! 
+ * \brief Finds cases that the given match expression does not catch, if any. + * + * \param match the match expression to test + * + * \param mod The module used for accessing global type var definitions, can be None. + * + * \return Returns a list of cases (as patterns) that are not handled by the match + * expression. + */ +TVM_DLL Array UnmatchedCases(const Match& match, const Module& mod); + +/*! + * \brief Aggressive constant propagation/constant folding/inlining. * It will do as much computation in compile time as possible. * It has two benefit: remove runtime overhead, and allow more optimization (typically fusion). * As a side effect, code size will explode. + * + * \param e the expression + * \param mod the module + * + * \return the optimized expression. + */ +TVM_DLL Expr PartialEval(const Expr& e, const Module& mod); + +/*! + * \brief Bind the free variables to a Relay expression. + * + * \param expr The expression. + * \param bind_map The variable to expression map that will be used to help the + * binding. + * + * \return The updated expression. */ -Expr PartialEval(const Expr& e); +TVM_DLL Expr Bind(const Expr& expr, const tvm::Map& bind_map); + +/*! \brief A hashing structure in the style of std::hash. */ +struct StructuralHash { + /*! \brief Hash a Relay type. + * + * Implements structural hashing of a Relay type. + * + * \param type the type to hash. + * + * \return the hash value. + */ + size_t operator()(const Type& type) const; + + /*! \brief Hash a Relay expression. + * + * Implements structural hashing of a Relay expression. + * + * \param expr the expression to hash. + * + * \return the hash value. + */ + size_t operator()(const Expr& expr) const; +}; namespace vm { -/*! \brief Compile a module, and construct the virtual machine. +/*! + * \brief Compile a module, and construct the virtual machine. * * \param mod The module to compile. + * * \return The constructed virtual machine. 
*/ runtime::vm::VirtualMachine CompileModule(const Module& mod); diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h new file mode 100644 index 000000000000..04b4e64dc9c3 --- /dev/null +++ b/include/tvm/relay/transform.h @@ -0,0 +1,548 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/relay/transform.h + * + * This file implements a pass manager. The pass manager manages a sequence + * of Relay-to-Relay transformation passes over a particlar unit of AST. The + * design is largely inspired from LLVM's pass manager and modern deep learning + * frameworks that perform tensor->tensor transformations. + * + * The responsibilities of a traditional compiler pass manager usually involves: + * - Organizing the execution order of optimization passes though not + * necessarily in the optimal sequence. + * - Collecting required analysis information and keep them up-to-date. + * - Reducing the effort required to implement new passes for compiler + * developers, etc. + * + * Similar to LLVM's pass manager, we designed the Relay pass manager to work + * different granularity, i.e. module level, function level, and even sequential + * passe that contains a host of passes. 
+ * + * However, we also extend the functionality of the traditional pass manager + * with the consideration of requirements/convention from deep learning + * frameworks, such as Pytorch and Gluon, etc. Each pass in the Relay pass + * manager performs the Relay.Module -> Relay.Module transformation. All + * different types of passes, including the sequential-level pass object, are + * essentially pass objects. This design, therefore, effectively provides users + * a consistent and convenient interface, i.e. Pass, to play with. It offers a + * means to ease the development and testing of Relay passes. For example, with + * the pass manager, external users will be able to have custom passes correctly + * scheduled without having to modify a single handcrafted pass order. + * + * In the future we need to describe constraints between passes. For example, + * we may want to preserve dependencies between different passes and validate + * them on the completion of a certain pass. + * + * We also need to store side information and import the error reporting system. + */ +#ifndef TVM_RELAY_TRANSFORM_H_ +#define TVM_RELAY_TRANSFORM_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace tvm { +namespace relay { +namespace transform { + +/* + * \brief The context of pass. + */ +class PassContext; + +/*! + * \brief PassContextNode contains the information that a pass can rely on, + * such as analysis results. + */ +class PassContextNode : public RelayNode { + public: + /*! + * \brief The error reporter used to notify users why an optimization fails. + */ + ErrorReporter err_reporter; + + /*! \brief The default optimization level. */ + int opt_level{2}; + + /*! \brief CPU is the default fallback device for heterogeneous execution. */ + int fallback_device{static_cast(kDLCPU)}; + + /*! \brief The list of required passes. */ + tvm::Array required_pass; + /*! \brief The list of disabled passes. 
*/ + tvm::Array disabled_pass; + + PassContextNode() = default; + + void VisitAttrs(tvm::AttrVisitor* v) final { + v->Visit("opt_level", &opt_level); + v->Visit("fallback_device", &fallback_device); + v->Visit("required_pass", &required_pass); + v->Visit("disabled_pass", &disabled_pass); + } + + static constexpr const char* _type_key = "relay.PassContext"; + TVM_DECLARE_NODE_TYPE_INFO(PassContextNode, RelayNode); +}; + +/*! + * \brief PassContext that is used to configure the pass behavior. + * + * \code + * + * auto new_ctx = PassContext::Create(); + * ctx->opt_level = 2; + * ctx->fallback_device = kDLCPU; + * With scope(ctx); + * // pass context in effect. + * + * \endcode + */ +class PassContext : public NodeRef { + public: + PassContext() {} + explicit PassContext(NodePtr<::tvm::Node> n) : NodeRef(n) {} + /*! + * \brief const accessor. + * \return const access pointer. + */ + const PassContextNode* operator->() const { + CHECK(node_.get() != nullptr); + return static_cast(node_.get()); + } + /*! + * \brief mutable accessor. + * \return mutable access pointer. + */ + PassContextNode* operator->() { + CHECK(node_.get() != nullptr); + return static_cast(node_.get()); + } + /*! + * \brief Construct a PassContext containing the default configurations. + * \return The new PassContext. + */ + TVM_DLL static PassContext Create(); + /*! + * \brief Get the default pass context in the current scope. + * \return The pass context. + */ + TVM_DLL static PassContext Current(); + + // accessor. + using ContainerType = PassContextNode; + class Internal; + + private: + // The entry of a pass context scope. + TVM_DLL void EnterWithScope(); + // The exit of a pass context scope. + TVM_DLL void ExitWithScope(); + + // Classes to get the Python `with` like syntax. + friend class Internal; + friend class tvm::With; +}; + +/* + * \brief The meta data of a pass. + * + * PassInfo can be extended conveniently in the future if more meta information + * is needed. 
+ */ +class PassInfo; + +/*! + * \brief PassInfoNode contains meta data that will be used to help optimization + * and analysis. + */ +class PassInfoNode : public RelayNode { + public: + /*! \brief The minimal optimization level that this pass will be enabled. */ + int opt_level; + + /*! \brief The name of an optimization/analysis pass. */ + std::string name; + + /*! \brief The passes that are required to perform the current pass. */ + tvm::Array required; + + PassInfoNode() = default; + + void VisitAttrs(tvm::AttrVisitor* v) final { + v->Visit("opt_level", &opt_level); + v->Visit("name", &name); + v->Visit("required", &required); + } + + TVM_DLL static PassInfo make(int opt_level, + std::string name, + tvm::Array required); + + static constexpr const char* _type_key = "relay.PassInfo"; + TVM_DECLARE_NODE_TYPE_INFO(PassInfoNode, RelayNode); +}; + +TVM_DEFINE_NODE_REF(PassInfo, PassInfoNode) + +class Pass; + +/*! + * \brief PassNode is the base type of differnt types of optimization passes. + * It is designed as a pure class and implemented by different pass subclasses + * at different granularity of Relay nodes. + */ +class PassNode : public RelayNode { + public: + /*! + * \brief Get the pass information/meta data. */ + virtual PassInfo Info() const = 0; + + /*! + * \brief Transform mod using the default PassContext in the current scope. + * + * \param mod The module that an optimization pass runs on. + * + * \return The transformed module. + */ + Module operator()(const Module& mod) const { + return this->operator()(mod, PassContext::Current()); + } + + /*! + * \brief Transform mod using a functor under a given pass context. + * + * \param mod The module that an optimization pass runs on. + * \param pass_ctx The pass context that can provide information for the optimization. + * + * \return The transformed module. 
+ */ + virtual Module operator()(const Module& mod, + const PassContext& pass_ctx) const = 0; + + void VisitAttrs(tvm::AttrVisitor* v) override {} + + static constexpr const char* _type_key = "relay.Pass"; + TVM_DECLARE_BASE_NODE_INFO(PassNode, RelayNode); +}; + +class Pass : public NodeRef { + public: + /*! + * \brief Transform mod using the default PassContext in the current scope. + * + * \param mod The module that an optimization pass runs on. + * + * \return The transformed module. + */ + Module operator()(const Module& mod) const { + const PassNode* node = operator->(); + CHECK(node != nullptr); + return node->operator()(mod); + } + /*! + * \brief Transform mod using a functor under a given pass context. + * + * \param mod The module that an optimization pass runs on. + * \param pass_ctx The pass context that can provide information for the optimization. + * + * \return The transformed module. + */ + Module operator()(const Module& mod, + const PassContext& pass_ctx) const { + const PassNode* node = operator->(); + CHECK(node != nullptr); + return node->operator()(mod, pass_ctx); + } + + TVM_DEFINE_NODE_REF_METHODS(Pass, NodeRef, PassNode); +}; + +class SequentialNode; + +class Sequential : public Pass { + public: + /*! + * \brief The constructor of `Sequential`. + * + * \param passes The passes to apply. + * \param pass_info The pass metadata. + */ + TVM_DLL Sequential(tvm::Array passes, PassInfo pass_info); + + /*! + * \brief The constructor of `Sequential`. + * + * \param passes The passes to apply. + * \param name The name of a sequential pass. It's defaulted to "sequential". + * This allows users to only provide a list of passes and execute them + * under a given context. 
+ */ + TVM_DLL Sequential(tvm::Array passes, std::string name = "sequential"); + + Sequential() = default; + explicit Sequential(tvm::NodePtr<::tvm::Node> n) : Pass(n) {} + + const SequentialNode* operator->() const; + using ContainerType = Sequential; +}; + +/* + * \brief Create a module pass. + * + * \param pass_func The packed function that contains the optimization. + * \param opt_level The optimization level of the module pass. + * \param name The name of the module pass. + * \param required The list of the passes that the module pass is dependent on. + * + * \return The created module pass. + */ +Pass CreateModulePass( + const runtime::TypedPackedFunc& pass_func, + int opt_level, + const std::string& name, + const tvm::Array& required); + +/* + * \brief Create a function pass. + * + * \param pass_func The packed function that contains the optimization. + * \param opt_level The optimization level of the function pass. + * \param name The name of the function pass. + * \param required The list of the passes that the function pass is dependent on. + * + * \return The created function pass. + */ +TVM_DLL Pass CreateFunctionPass(const runtime::TypedPackedFunc< + Function(Function, Module, PassContext)>& pass_func, + int opt_level, + const std::string& name, + const tvm::Array& required); + +/*! \brief Remove expressions which does not effect the program result. + * + * It will remove let bindings which are not referenced, + * and inline let bindings that are only used once. + * + * For example, this pass should turn `let a = 1 in 2` into `2`, + * as the value of the expression does not depend on a. + * + * As another example, `let a = 1 in a` will be optimized into 1. + * + * \param inline_once whether or not to inline binding used one. + * + * \return the pass. + */ +TVM_DLL Pass DeadCodeElimination(bool inline_once = false); + +/*! + * \brief Fold constant expressions. + * + * \return The pass. + */ +TVM_DLL Pass FoldConstant(); + +/*! 
+ * \brief Fuse operations into expr into seperate functions. + * + * \param fuse_opt_level Optimization level. If it is -1 it will be inferred from pass context. + * + * \return The pass. + */ +TVM_DLL Pass FuseOps(int fuse_opt_level = -1); + +/*! + * \brief Apply rewrite rules to rewrite the expr in post DFS order. + * + * \param rewrite_map_attr_name The Op's attr name which corresponds to the rewrite + * rule function. + * \param fcontext Additional callback to provide context argument for each call node. + * \param fmulti_ref_trigger Transformation function to be called when + * an Expr consumed by multiple callers. + * + * \return The pass. + */ +TVM_DLL Pass ForwardRewrite(const std::string& rewrite_map_attr_name, + std::function fcontext = nullptr, + std::function + fmulti_ref_trigger = nullptr); + +/*! + * \brief Apply rewrite rules to rewrite the expr in post DFS order. + * + * \param rewrite_func The rewrite func that will apply to all operators. + * \param fcontext Additional callback to provide context argument for each call node. + * \param fmulti_ref_trigger Transformation function to be called when + * an Expr consumed by multiple callers. + * + * \return The pass. + */ +TVM_DLL Pass ForwardRewrite(const FForwardRewrite& rewrite_func, + std::function fcontext = nullptr, + std::function fmulti_ref_trigger = nullptr); + +/*! + * \brief Rewrite the annotated program. + * + * \param fallback_device The fallback device which is the default device for + * operators without annotation. + * + * \return The pass. + */ +TVM_DLL Pass RewriteAnnotatedOps(int fallback_device); + +/*! + * \brief turn a dataflow graph into Administrative Normal Form, or A-Normal Form (ANF). + * + * It will turn an expression that is in a graph form (with sharing implicit), + * to an expression with explicit sharing (A-Normal Form). + * + * The scope of the root expression is the global scope. 
+ * + * The scope of any non root expression is the least common ancestor of all it's scope. + * + * Values are ordered by post-DFS order in each scope. + * + * \return The pass. + */ +TVM_DLL Pass ToANormalForm(); + +/*! + * \brief Remove let binding and directly share via pointer instead. + * + * It will remove all let binding, + * and turn all of the variable bound by let into direct pointer reference. + * + * \return the expression in graph normal form. + */ +TVM_DLL Pass ToGraphNormalForm(); + +/*! + * \brief Aggressive constant propagation/constant folding/inlining. + * + * It will do as much computation in compile time as possible. + * It has two benefit: remove runtime overhead, and allow more optimization (typically fusion). + * As a side effect, code size will explode. + * + * \return the optimized expression. + */ +TVM_DLL Pass PartialEval(); + +/*! + * \brief Simplify certain operators during inference. For example, batch norm + * will be unpacked into a number of simplified operators. + * + * \return The Pass. + */ +TVM_DLL Pass SimplifyInference(); + +/*! + * \brief Infer the type of an expression. + * + * The result of type checking is a new expression with unambigous + * type information filled in, as well as it's checked type field + * populated with the result type. + * + * \return The pass. + */ +TVM_DLL Pass InferType(); + +/*! + * \brief Search and eliminate common subexpression. For example, if there are + * two expressions evaluated to an identical value, a single variable is created + * and these two expressions are replaced by this variable. + * + * \param fskip The callback argument that allows to skip certain expressions. + * + * \return The pass. + */ +TVM_DLL Pass EliminateCommonSubexpr(PackedFunc fskip = nullptr); + +/*! + * \brief Combine parallel 2d convolutions into a single convolution if the + * number of branches of this conv2d operator is not less than + * `min_num_branch`. 
+ * + * \param min_num_branches The minimun number of branches. + * + * \return The pass. + */ +TVM_DLL Pass CombineParallelConv2D(uint64_t min_num_branches = 3); + +/*! + * \brief Backward fold axis scaling into weights of conv/dense operators. + * + * \return The pass. + */ +TVM_DLL Pass BackwardFoldScaleAxis(); + +/*! + * \brief Forward fold axis scaling into weights of conv/dense operators. + * + * \return The pass. + */ +TVM_DLL Pass ForwardFoldScaleAxis(); + +/*! + * \brief A sequential pass that executes ForwardFoldScaleAxis and + * BackwardFoldScaleAxis passes. + * + * \return The pass. + */ +TVM_DLL Pass FoldScaleAxis(); + +/*! + * \brief Canonicalize some operators to the simplified operators. For example, + * bias_add can be canonicalized to expand_dims and broadcast_add. + * + * \return The pass. + */ +TVM_DLL Pass CanonicalizeOps(); + +/*! + * \brief Alternate the layouts of operators or replace primitive operators + * with other expressions. + * + * \return The pass. + */ +TVM_DLL Pass AlterOpLayout(); + +/*! + * \brief Canonicalize cast expressions to make operator fusion more efficient. + * + * \return The pass. + */ +TVM_DLL Pass CanonicalizeCast(); + +} // namespace transform +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_TRANSFORM_H_ diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index f992e87ad100..ba2c0d2291b6 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -114,6 +114,8 @@ typedef enum { // The following section of code is used for non-reserved types. kExtReserveEnd = 64U, kExtEnd = 128U, + // The rest of the space is used for custom, user-supplied datatypes + kCustomBegin = 129U, } TVMTypeCode; /*! @@ -185,7 +187,7 @@ TVM_DLL void TVMAPISetLastError(const char* msg); /*! 
* \brief return str message of the last error * all function in this file will return 0 when success - * and -1 when an error occured, + * and -1 when an error occurred, * TVMGetLastError can be called to retrieve the error * * this function is threadsafe and can be called by different thread diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 9fcefcbbe4b1..82b3dd469541 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -60,6 +60,29 @@ namespace tvm { class Integer; namespace runtime { + +/*! + * \brief Runtime utility for getting custom type name from code + * \param type_code Custom type code + * \return Custom type name + */ +TVM_DLL std::string GetCustomTypeName(uint8_t type_code); + +/*! + * \brief Runtime utility for checking whether custom type is registered + * \param type_code Custom type code + * \return Bool representing whether type is registered + */ +TVM_DLL bool GetCustomTypeRegistered(uint8_t type_code); + +/*! 
+ * \brief Runtime utility for parsing string of the form "custom[]" + * \param s String to parse + * \param scan pointer to parsing pointer, which is scanning across s + * \return type code of custom type parsed + */ +TVM_DLL uint8_t ParseCustomDatatype(const std::string& s, const char** scan); + // forward declarations class TVMArgs; class TVMArgValue; @@ -939,7 +962,11 @@ inline std::ostream& operator<<(std::ostream& os, TVMType t) { // NOLINT(*) if (t.bits == 1 && t.lanes == 1 && t.code == kDLUInt) { os << "bool"; return os; } - os << TypeCode2Str(t.code); + if (GetCustomTypeRegistered(t.code)) { + os << "custom[" << GetCustomTypeName(t.code) << "]"; + } else { + os << TypeCode2Str(t.code); + } if (t.code == kHandle) return os; os << static_cast(t.bits); if (t.lanes != 1) { @@ -960,7 +987,11 @@ inline std::string TVMType2String(TVMType t) { if (t.bits == 1 && t.lanes == 1 && t.code == kDLUInt) { return "bool"; } - repr += TypeCode2Str(t.code); + if (GetCustomTypeRegistered(t.code)) { + repr += "custom[" + GetCustomTypeName(t.code) + "]"; + } else { + repr += TypeCode2Str(t.code); + } if (t.code == kHandle) return repr; repr += std::to_string(static_cast(t.bits)); if (t.lanes != 1) { @@ -994,6 +1025,8 @@ inline TVMType String2TVMType(std::string s) { t.bits = 1; t.lanes = 1; return t; + } else if (s.substr(0, 6) == "custom") { + t.code = ParseCustomDatatype(s, &scan); } else { scan = s.c_str(); LOG(FATAL) << "unknown type " << s; diff --git a/include/tvm/runtime/vm.h b/include/tvm/runtime/vm.h index 8911ad499e4c..028a5ff9d1ad 100644 --- a/include/tvm/runtime/vm.h +++ b/include/tvm/runtime/vm.h @@ -56,13 +56,14 @@ enum class Opcode { InvokeClosure = 3U, InvokePacked = 4U, AllocTensor = 5U, - AllocDatatype = 6U, - AllocClosure = 7U, - GetField = 8U, - If = 9U, - Select = 10U, - LoadConst = 11U, - Goto = 12U + AllocTensorReg = 6U, + AllocDatatype = 7U, + AllocClosure = 8U, + GetField = 9U, + If = 10U, + Select = 11U, + LoadConst = 12U, + Goto = 13U }; /*! 
\brief A single virtual machine instruction. @@ -83,11 +84,19 @@ struct Instruction { union { struct /* AllocTensor Operands */ { + /*! \brief The number of dimensions. */ + uint32_t ndim; + /*! \brief The shape of tensor. */ + int64_t* shape; + /*! \brief The datatype of tensor to be allocated. */ + DLDataType dtype; + } alloc_tensor; + struct /* AllocTensorReg Operands */ { /*! \brief The register to read the shape out of. */ RegName shape_register; /*! \brief The datatype of tensor to be allocated. */ DLDataType dtype; - }; + } alloc_tensor_reg; struct /* InvokeClosure Operands */ { /*! \brief The register containing the closure. */ RegName closure; @@ -192,13 +201,20 @@ struct Instruction { */ static Instruction InvokePacked(Index packed_index, Index arity, Index output_size, const std::vector& args); - /*! \brief Construct an allocate tensor instruction. + /*! \brief Construct an allocate tensor instruction with constant shape. + * \param shape The shape of the tensor. + * \param dtype The dtype of the tensor. + * \param dst The destination register. + * \return The allocate tensor instruction. + */ + static Instruction AllocTensor(std::vector shape, DLDataType dtype, RegName dst); + /*! \brief Construct an allocate tensor instruction with register. * \param shape_register The register containing the shape. * \param dtype The dtype of the tensor. * \param dst The destination register. * \return The allocate tensor instruction. */ - static Instruction AllocTensor(RegName shape_register, DLDataType dtype, RegName dst); + static Instruction AllocTensorReg(RegName shape_register, DLDataType dtype, RegName dst); /*! \brief Construct an allocate datatype instruction. * \param tag The datatype tag. * \param num_fields The number of fields for the datatype. 
diff --git a/include/tvm/schedule.h b/include/tvm/schedule.h index 774d7cd9a40a..659b42aa1afa 100644 --- a/include/tvm/schedule.h +++ b/include/tvm/schedule.h @@ -102,8 +102,8 @@ class Stage : public NodeRef { */ EXPORT Stage& bind(IterVar ivar, IterVar thread_ivar); /*! - * \brief Set predicate under which store to the array can be performed. - * Use this when there are duplicated threads doing the same store and we only + * \brief Set the predicate to determine whether a store to the array should be performed. + * Use this when there are multiple threads performing the same store and we only * need one of them to do the store. * * \note This is a dangerous scheduling primitive that can change behavior of program. diff --git a/include/tvm/tensor_intrin.h b/include/tvm/tensor_intrin.h index e61ce6634bd3..b5ca6eb4358b 100644 --- a/include/tvm/tensor_intrin.h +++ b/include/tvm/tensor_intrin.h @@ -67,6 +67,11 @@ class TensorIntrinNode : public Node { * When it is a constant, it means we can only take data in that shape. */ Array buffers; + /*! \brief List of scalar variables, used in body. These placeholders + * will be bound to expressions passed in when the TensorIntrin is called + * from a TensorComputeOp. + */ + Array scalar_params; /*! \brief The normal statement to execute the intrinsic */ Stmt body; /*! @@ -87,6 +92,7 @@ class TensorIntrinNode : public Node { v->Visit("op", &op); v->Visit("inputs", &inputs); v->Visit("buffers", &buffers); + v->Visit("scalar_params", &scalar_params); v->Visit("body", &body); v->Visit("reduce_init", &reduce_init); v->Visit("reduce_update", &reduce_update); @@ -96,6 +102,7 @@ class TensorIntrinNode : public Node { Operation op, Array inputs, Array buffers, + Array scalar_params, Stmt body, Stmt reduce_init, Stmt reduce_update); @@ -134,22 +141,29 @@ class TensorIntrinCallNode : public Node { Array tensors; /*! \brief regions of input tensors */ Array regions; + + /*! 
* \brief IterVar on each reduction axis, if the * intrin will use the reduce axis */ Array reduce_axis; + /*! \brief scalar expression inputs */ + Array scalar_inputs; + void VisitAttrs(AttrVisitor* v) final { v->Visit("intrin", &intrin); v->Visit("tensors", &tensors); v->Visit("regions", ®ions); v->Visit("reduce_axis", &reduce_axis); + v->Visit("scalar_inputs", &scalar_inputs); } static TensorIntrinCall make(TensorIntrin intrin, Array tensors, Array regions, - Array reduce_axis); + Array reduce_axis, + Array scalar_inputs); static constexpr const char* _type_key = "TensorIntrinCall"; TVM_DECLARE_NODE_TYPE_INFO(TensorIntrinCallNode, Node); diff --git a/jvm/README.md b/jvm/README.md index 626a87c84cf8..c5996313447b 100644 --- a/jvm/README.md +++ b/jvm/README.md @@ -30,6 +30,7 @@ This folder contains the Java interface for TVM runtime. It brings TVM runtime t - JDK 1.6+. Oracle JDK and OpenJDK are well tested. - Maven 3 for build. +- LLVM (TVM4J need LLVM support. Please refer to [build-the-shared-library](https://docs.tvm.ai/install/from_source.html#build-the-shared-library) for how to enable LLVM support.) ### Modules diff --git a/nnvm/include/nnvm/c_api.h b/nnvm/include/nnvm/c_api.h index 75054e892d8e..773bc63b7dad 100644 --- a/nnvm/include/nnvm/c_api.h +++ b/nnvm/include/nnvm/c_api.h @@ -60,7 +60,7 @@ NNVM_DLL void NNAPISetLastError(const char* msg); /*! * \brief return str message of the last error * all function in this file will return 0 when success - * and -1 when an error occured, + * and -1 when an error occurred, * NNGetLastError can be called to retrieve the error * * this function is threadsafe and can be called by different thread diff --git a/nnvm/include/nnvm/op_attr_types.h b/nnvm/include/nnvm/op_attr_types.h index 976ad929f496..ad328c30312a 100644 --- a/nnvm/include/nnvm/op_attr_types.h +++ b/nnvm/include/nnvm/op_attr_types.h @@ -136,6 +136,17 @@ using FInferType = FInferNodeEntryAttr; */ using TIsBackward = bool; +/*! 
+ * \brief Whether this op is a ghost node. + * If TIsGhost is true: + * - The node with this op will not be visible in the indexed graph. + * + * \note Register under "TIsGhost" + * This enables shape/type inference for backward nodes when + * fusion is present. + */ +using TIsGhost = bool; + /*! * \brief Get possible inplace options. * This function enables optimization to reuse memory of inputs in output. diff --git a/nnvm/python/nnvm/compiler/build_module.py b/nnvm/python/nnvm/compiler/build_module.py index 0c1bc5047870..a49e9741a901 100644 --- a/nnvm/python/nnvm/compiler/build_module.py +++ b/nnvm/python/nnvm/compiler/build_module.py @@ -171,7 +171,7 @@ def _update_shape_dtype(shape, dtype, params): shape.update({k : v.shape for k, v in params.items()}) if isinstance(dtype, str): for k, v in params.items(): - if v.dtype != dtype: + if v.dtype != dtype and v.shape: raise ValueError( "%s: dtype not expected %s vs %s" % (k, dtype, v.dtype)) else: diff --git a/nnvm/python/nnvm/frontend/caffe2.py b/nnvm/python/nnvm/frontend/caffe2.py index 2b3ff5a27e01..f951db66b5a6 100644 --- a/nnvm/python/nnvm/frontend/caffe2.py +++ b/nnvm/python/nnvm/frontend/caffe2.py @@ -411,7 +411,7 @@ def _convert_operator(self, identity_list=None, convert_map=None): """Convert from Caffe2 operator to nnvm operator. - The converter must specify conversions explicity for incompatible name, and + The converter must specify conversions explicitly for incompatible name, and apply handlers to operator attributes. Parameters diff --git a/nnvm/python/nnvm/frontend/common.py b/nnvm/python/nnvm/frontend/common.py index 610546d1973b..0e09a2c43323 100644 --- a/nnvm/python/nnvm/frontend/common.py +++ b/nnvm/python/nnvm/frontend/common.py @@ -58,7 +58,7 @@ def __call__(self, inputs, attrs, *args): class AttrConverter(object): - """Common attribute conveter. An AttrConverter instance is a callable: + """Common attribute converter. 
An AttrConverter instance is a callable: ``` attr_converter = AttrConverter(op_name, transforms={'a':'b', 'c':('d', 1)}) new_op_name, new_attr = attr_converter(attrs) @@ -72,12 +72,12 @@ class AttrConverter(object): `op_name = func(attr)` transforms : dict of `new_name, or (new_name, default_value, transform function)` If only a new_name is provided, it's like renaming the attribute name. - If default_value if provded, then the attribute is considered as optional. + If default_value if provided, then the attribute is considered as optional. If transform function is provided, the original attribute value is handled by transform function. excludes : list A list of excluded attributes that should `NOT` appear. - Raise NotImplementedError if occured. + Raise NotImplementedError if occurred. disables : list A list of attributes that is disabled in nnvm. Log warnings. ignores : list diff --git a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py index 7af8cf8833dd..f647a644bd2b 100644 --- a/nnvm/python/nnvm/frontend/keras.py +++ b/nnvm/python/nnvm/frontend/keras.py @@ -180,7 +180,6 @@ def _convert_convolution(insym, keras_layer, symtab): else: kernel_h, kernel_w, in_channels, n_filters = weightList[0].shape weight = weightList[0].transpose([3, 2, 0, 1]) - dilation = [1, 1] if isinstance(keras_layer.dilation_rate, (list, tuple)): dilation = [keras_layer.dilation_rate[0], keras_layer.dilation_rate[1]] else: diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py index 77671225aa3e..6f6bfc87ea8a 100644 --- a/nnvm/python/nnvm/frontend/mxnet.py +++ b/nnvm/python/nnvm/frontend/mxnet.py @@ -269,7 +269,7 @@ def _crop_like(inputs, attrs): raise tvm.error.OpAttributeUnimplemented( 'Center crop is not supported in operator crop_like.') if len(inputs) < 2: - raise RuntimeError("Only support crop_like pattern.") + raise tvm.error.OpAttributeUnimplemented("Only support crop_like pattern.") new_attrs["axis"] = [2, 3] return 
get_nnvm_op('slice_like')(inputs[0], inputs[1], **new_attrs) diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py index eb78b7845c23..b5e294b97fb1 100644 --- a/nnvm/python/nnvm/frontend/onnx.py +++ b/nnvm/python/nnvm/frontend/onnx.py @@ -27,6 +27,13 @@ __all__ = ['from_onnx'] +def onnx_storage_order2layout(storage_order): + if storage_order not in (0, 1): + raise tvm.error.OpAttributeInvalid('Mode of storage_order must be either 0 or 1') + + return 'NCHW' if storage_order == 0 else 'NHWC' + + class OnnxOpConverter(object): """ A helper class for holding onnx op converters. """ @@ -207,8 +214,38 @@ def _impl_v1(cls, inputs, attr, params): class MaxPool(Pool): + """ Operator converter for MaxPool + """ name = 'max_pool' + @classmethod + def _impl_v8(cls, inputs, attr, params): + return AttrCvt( + op_name=dimension_picker(cls.name), + transforms={ + 'kernel_shape': 'pool_size', + 'pads': ('padding', (0, 0), revert_caffe2_pad), + 'storage_order': ('layout', 'NCHW', onnx_storage_order2layout), + }, + # very weird attributes here in onnx, force check + ignores=['dilations', 'auto_pad'], + # TODO(higumachan): make sure ceil_mode in onnx, and layout?
+ extras={'ceil_mode': False}, + custom_check=dimension_constraint())(inputs, attr, params) + + @classmethod + def _impl_v10(cls, inputs, attr, params): + return AttrCvt( + op_name=dimension_picker(cls.name), + transforms={ + 'kernel_shape': 'pool_size', + 'pads': ('padding', (0, 0), revert_caffe2_pad), + 'storage_order': ('layout', 'NCHW', onnx_storage_order2layout), + 'ceil_mode': 'ceil_mode' + }, + # very weird attributes here in onnx, force check + ignores=['dilations', 'auto_pad'], + custom_check=dimension_constraint())(inputs, attr, params) class Mul(Elemwise): name = 'mul' @@ -830,6 +867,19 @@ def from_onnx(self, graph, opset): else: self._num_input += 1 self._nodes[i_name] = _sym.Variable(name=i_name) + # get list of unsupported ops + convert_map = _get_convert_map(opset) + unsupported_ops = set() + for node in graph.node: + op_name = node.op_type + if op_name not in convert_map and \ + op_name != 'Constant' and \ + op_name not in _identity_list: + unsupported_ops.add(op_name) + if unsupported_ops: + msg = 'The following operators are not supported for frontend ONNX: ' + msg += ', '.join(unsupported_ops) + raise tvm.error.OpNotImplemented(msg) # construct nodes, nodes are stored as directed acyclic graph for node in graph.node: op_name = node.op_type @@ -913,7 +963,7 @@ def _convert_operator(self, identity_list=None, convert_map=None): """Convert from onnx operator to nnvm operator. - The converter must specify conversions explicity for incompatible name, and + The converter must specify conversions explicitly for incompatible name, and apply handlers to operator attributes. 
Parameters diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py index 2f91cad8143a..7b4147155d93 100644 --- a/nnvm/python/nnvm/frontend/tensorflow.py +++ b/nnvm/python/nnvm/frontend/tensorflow.py @@ -1188,7 +1188,7 @@ def __init__(self): self._input_shapes = {} def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): - """Construct nnvm nodes from tensorflow graph definition - GraphDef. + """Construct nnvm nodes from tensorflow graph definition - GraphDef. Follow the tensorflow graph definition to parse and convert it to NNVM. Some of the assumptions listed below. @@ -1197,7 +1197,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): -> All Const nodes are params. -> Last node is assumed as graph output. -> _output_shapes : Graph should be frozen with add_shapes=True. - Or user can pass input shape dictionaly optionally. + Or user can pass input shape dictionary optionally. -> DecodeJpeg, ResizeBilinear: These are dummy operators. Hence user should handle preprocessing outside. -> CheckNumerics: No implementation as of now for this. @@ -1214,6 +1214,9 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): shape : Dictionary of input dimensions (Optional) Graph level input shape dictionary. + outputs : List of output tensor names (Optional) + if not specified then the last node is assumed as graph output. + Returns ------- sym : nnvm.sym.Symbol @@ -1547,7 +1550,7 @@ def _convert_rnn_operator(self, op_name, inputs, def _convert_operator(self, op_name, inputs, attrs, graph, identity_list=None, convert_map=None): """Convert from Tensorflow operator to nnvm operator. - The converter must specify conversions explicity for incompatible name, and + The converter must specify conversions explicitly for incompatible name, and apply handlers to operator attributes. 
Parameters @@ -1599,7 +1602,7 @@ def _fix_extranodes(self, op_name, attr, inputs): return inputs def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None): - """ Load tensorflow graph which is a python tensorflow graph object into nnvm graph. + """Load tensorflow graph which is a python tensorflow graph object into nnvm graph. The companion parameters will be handled automatically. Parameters @@ -1607,6 +1610,15 @@ def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None): graph : GraphDef object Tensorflow GraphDef + layout : target layout to be used (Optional) + NCHW only supported now to enable NHWC models on GPU. + + shape : Dictionary of input dimensions (Optional) + Graph level input shape dictionary. + + outputs : List of output tensor names (Optional) + if not specified then the last node is assumed as graph output. + Returns ------- sym : nnvm.Symbol diff --git a/nnvm/python/nnvm/testing/__init__.py b/nnvm/python/nnvm/testing/__init__.py index 44b8529821d0..41bcf83eb511 100644 --- a/nnvm/python/nnvm/testing/__init__.py +++ b/nnvm/python/nnvm/testing/__init__.py @@ -13,5 +13,4 @@ from . import inception_v3 from . import dcgan from . import dqn -from . import yolo_detection from . 
import check_computation diff --git a/nnvm/src/core/graph.cc b/nnvm/src/core/graph.cc index 92ff98618ec8..e2d6d36020f1 100644 --- a/nnvm/src/core/graph.cc +++ b/nnvm/src/core/graph.cc @@ -76,6 +76,8 @@ IndexedGraph::IndexedGraph(const Graph &g) { DFSVisit(g.outputs, [this, &inputs_rptr, &control_rptr, &subgraphs] (const NodePtr& n) { + const auto& is_ghost = Op::GetAttr("TIsGhost"); + if (!n->is_variable() && is_ghost.get(n->op(), false)) return; CHECK_LT(nodes_.size(), std::numeric_limits::max()); uint32_t nid = static_cast(nodes_.size()); CHECK(n); @@ -103,8 +105,9 @@ IndexedGraph::IndexedGraph(const Graph &g) { inputs_rptr.push_back(input_entries_.size()); // control deps for (const auto& nptr : n->control_deps) { + if (!nptr->is_variable() && is_ghost.get(nptr->op(), false)) continue; auto it = node2index_.find(nptr.get()); - CHECK(it != node2index_.end() && it->first == nptr.get()); + CHECK(it != node2index_.end()) << "control dep not found in graph"; control_deps_.push_back(it->second); } control_rptr.push_back(control_deps_.size()); diff --git a/nnvm/tests/python/compiler/test_top_level2.py b/nnvm/tests/python/compiler/test_top_level2.py index b25feb74793f..3c5651578b64 100644 --- a/nnvm/tests/python/compiler/test_top_level2.py +++ b/nnvm/tests/python/compiler/test_top_level2.py @@ -305,7 +305,7 @@ def test_upsampling_nearest_neighbor(): data = tvm.nd.array(a_np) m.run(x=data) out = m.get_output(0, tvm.nd.empty(oshape, dtype)) - b_np = topi.testing.upsampling_python(a_np, scale, "NCHW") + b_np = topi.testing.upsampling_python(a_np, (scale, scale), "NCHW") tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5) def test_upsampling_bilinear(): diff --git a/nnvm/tests/python/frontend/coreml/test_forward.py b/nnvm/tests/python/frontend/coreml/test_forward.py index 679afe4e86bc..7a9f294f4359 100644 --- a/nnvm/tests/python/frontend/coreml/test_forward.py +++ b/nnvm/tests/python/frontend/coreml/test_forward.py @@ -195,7 +195,7 @@ def 
verify_UpsampleLayerParams(input_dim, scale, mode): a_np = np.full(input_dim, 1, dtype=dtype) if mode == 'NN': - b_np = topi.testing.upsampling_python(a_np, scale) + b_np = topi.testing.upsampling_python(a_np, (scale, scale)) else: new_h = input_dim[2] * scale new_w = input_dim[3] * scale diff --git a/nnvm/tests/python/frontend/darknet/test_forward.py b/nnvm/tests/python/frontend/darknet/test_forward.py index 7f45a6149efc..4e62ff2e1f33 100644 --- a/nnvm/tests/python/frontend/darknet/test_forward.py +++ b/nnvm/tests/python/frontend/darknet/test_forward.py @@ -27,8 +27,8 @@ from tvm.contrib.download import download_testdata download_testdata.__test__ = False from nnvm import frontend -from nnvm.testing.darknet import LAYERTYPE -from nnvm.testing.darknet import __darknetffi__ +from tvm.relay.testing.darknet import LAYERTYPE +from tvm.relay.testing.darknet import __darknetffi__ import nnvm.compiler DARKNET_LIB = 'libdarknet2.0.so' diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py index db5534daee1a..446ebebbfc5a 100644 --- a/nnvm/tests/python/frontend/mxnet/test_forward.py +++ b/nnvm/tests/python/frontend/mxnet/test_forward.py @@ -137,7 +137,7 @@ def test_forward_fc_flatten(): def test_forward_clip(): data = mx.sym.var('data') - data = mx.sym.concat(data, -data, dim=1) # negative part explicity + data = mx.sym.concat(data, -data, dim=1) # negative part explicitly mx_sym = mx.sym.clip(data, a_min=0, a_max=1) verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100)) diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py index 941a275a8045..3365b0f25fb1 100644 --- a/nnvm/tests/python/frontend/onnx/test_forward.py +++ b/nnvm/tests/python/frontend/onnx/test_forward.py @@ -405,7 +405,7 @@ def _test_upsample_nearest(): y = helper.make_node("Upsample", ['in'], ['out'], mode='nearest', scales=[1.0, 1.0, 2.0, 2.0]) in_array = 
np.random.uniform(size=in_shape).astype(np.float32) - out_array = topi.testing.upsampling_python(in_array, scale, "NCHW") + out_array = topi.testing.upsampling_python(in_array, (scale, scale), "NCHW") graph = helper.make_graph([y], 'upsample_nearest_test', diff --git a/nnvm/tutorials/from_darknet.py b/nnvm/tutorials/from_darknet.py index 857ef46015cd..d2ab647da1b3 100644 --- a/nnvm/tutorials/from_darknet.py +++ b/nnvm/tutorials/from_darknet.py @@ -33,8 +33,8 @@ import nnvm import nnvm.frontend.darknet -import nnvm.testing.yolo_detection -import nnvm.testing.darknet +import tvm.relay.testing.yolo_detection +import tvm.relay.testing.darknet import matplotlib.pyplot as plt import numpy as np import tvm @@ -42,7 +42,7 @@ from ctypes import * from tvm.contrib.download import download_testdata -from nnvm.testing.darknet import __darknetffi__ +from tvm.relay.testing.darknet import __darknetffi__ # Model name MODEL_NAME = 'yolov3' @@ -104,7 +104,7 @@ test_image + '?raw=true' img_path = download_testdata(img_url, test_image, "data") -data = nnvm.testing.darknet.load_image(img_path, netw, neth) +data = tvm.relay.testing.darknet.load_image(img_path, netw, neth) ###################################################################### # Execute on TVM Runtime # ---------------------- @@ -153,12 +153,12 @@ # do the detection and bring up the bounding boxes thresh = 0.5 nms_thresh = 0.45 -img = nnvm.testing.darknet.load_image_color(img_path) +img = tvm.relay.testing.darknet.load_image_color(img_path) _, im_h, im_w = img.shape -dets = nnvm.testing.yolo_detection.fill_network_boxes((netw, neth), (im_w, im_h), thresh, +dets = tvm.relay.testing.yolo_detection.fill_network_boxes((netw, neth), (im_w, im_h), thresh, 1, tvm_out) last_layer = net.layers[net.n - 1] -nnvm.testing.yolo_detection.do_nms_sort(dets, last_layer.classes, nms_thresh) +tvm.relay.testing.yolo_detection.do_nms_sort(dets, last_layer.classes, nms_thresh) coco_name = 'coco.names' coco_url = 
'https://github.com/siju-samuel/darknet/blob/master/data/' + coco_name + '?raw=true' @@ -172,6 +172,6 @@ names = [x.strip() for x in content] -nnvm.testing.yolo_detection.draw_detections(font_path, img, dets, thresh, names, last_layer.classes) +tvm.relay.testing.yolo_detection.draw_detections(font_path, img, dets, thresh, names, last_layer.classes) plt.imshow(img.transpose(1, 2, 0)) plt.show() diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index ce6f0602a572..5765eed0ad8b 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -38,12 +38,13 @@ from . import hybrid from . import testing from . import error +from . import datatype from . import ndarray as nd from .ndarray import context, cpu, gpu, opencl, cl, vulkan, metal, mtl from .ndarray import vpi, rocm, opengl, ext_dev -from ._ffi.runtime_ctypes import TypeCode +from ._ffi.runtime_ctypes import TypeCode, TVMType from ._ffi.ndarray import TVMContext from ._ffi.function import Function from ._ffi.base import TVMError, __version__ diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py index 4ede33a63936..72cff1a10ead 100644 --- a/python/tvm/_ffi/runtime_ctypes.py +++ b/python/tvm/_ffi/runtime_ctypes.py @@ -91,6 +91,13 @@ def __init__(self, type_str): self.type_code = 4 bits = 64 head = "" + elif head.startswith("custom"): + low, high = head.find('['), head.find(']') + if not low or not high or low >= high: + raise ValueError("Badly formatted custom type string %s" % type_str) + type_name = head[low + 1:high] + self.type_code = _api_internal._datatype_get_type_code(type_name) + head = head[high+1:] else: raise ValueError("Do not know how to handle type %s" % type_str) bits = int(head) if head else bits @@ -100,7 +107,12 @@ def __init__(self, type_str): def __repr__(self): if self.bits == 1 and self.lanes == 1: return "bool" - x = "%s%d" % (TVMType.CODE2STR[self.type_code], self.bits) + if self.type_code in TVMType.CODE2STR: + type_name = 
TVMType.CODE2STR[self.type_code] + else: + type_name = "custom[%s]" % \ + _api_internal._datatype_get_type_name(self.type_code) + x = "%s%d" % (type_name, self.bits) if self.lanes != 1: x += "x%d" % self.lanes return x diff --git a/python/tvm/api.py b/python/tvm/api.py index 66fa4fa30e90..d88f06170543 100644 --- a/python/tvm/api.py +++ b/python/tvm/api.py @@ -319,7 +319,8 @@ def compute(shape, fcompute, name="compute", tag="", attrs=None): out_ndim, body.intrin, body.tensors, - body.regions) + body.regions, + body.scalar_inputs) else: if not isinstance(body, (list, tuple)): body = [body] diff --git a/python/tvm/arith.py b/python/tvm/arith.py index eda5cb825326..4c3c05f75796 100644 --- a/python/tvm/arith.py +++ b/python/tvm/arith.py @@ -32,21 +32,21 @@ def is_everything(self): return _api_internal._IntSetIsEverything(self) -@register_node +@register_node("arith.IntervalSet") class IntervalSet(IntSet): - """Represent set of continuous interval""" - def min(self): - """get the minimum value""" - return _api_internal._IntervalSetGetMin(self) - - def max(self): - """get the maximum value""" - return _api_internal._IntervalSetGetMax(self) + """Represent set of continuous interval [min_value, max_value] + Parameters + ---------- + min_value : Expr + The minimum value in the interval. -@register_node -class StrideSet(IntSet): - """Represent set of strided integers""" + max_value : Expr + The maximum value in the interval. 
+ """ + def __init__(self, min_value, max_value): + self.__init_handle_by_constructor__( + _make_IntervalSet, min_value, max_value) @register_node("arith.ModularSet") @@ -114,6 +114,7 @@ def __init__(self): self._modular_set = _mod("modular_set") self._rewrite_simplify = _mod("rewrite_simplify") self._canonical_simplify = _mod("canonical_simplify") + self._int_set = _mod("int_set") self._enter_constraint_context = _mod("enter_constraint_context") def const_int_bound(self, expr): @@ -176,6 +177,24 @@ def canonical_simplify(self, expr): """ return self._canonical_simplify(expr) + def int_set(self, expr, dom_map): + """Compute a symbolic IntSet that covers expr for all values in dom_map. + + Parameters + ---------- + expr : tvm.Expr + The expression. + + dom_map : Dict[Var, tvm.arith.IntSet] + The domain for variables to be relaxed. + + Returns + ------- + result : IntSet + The result. + """ + return self._int_set(expr, dom_map) + def bind(self, var, expr): """Bind a variable to the expression. diff --git a/python/tvm/autotvm/graph_tuner/__init__.py b/python/tvm/autotvm/graph_tuner/__init__.py new file mode 100644 index 000000000000..d590db0e7c48 --- /dev/null +++ b/python/tvm/autotvm/graph_tuner/__init__.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +"""Autotvm graph tuner API.""" +from __future__ import absolute_import as _abs + +from . import _base +from . import base_graph_tuner + +from .base_graph_tuner import BaseGraphTuner +from .dynamic_programming_tuner import DPTuner +from .pbqp_tuner import PBQPTuner diff --git a/python/tvm/autotvm/graph_tuner/_base.py b/python/tvm/autotvm/graph_tuner/_base.py new file mode 100644 index 000000000000..83b9e06ba564 --- /dev/null +++ b/python/tvm/autotvm/graph_tuner/_base.py @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Helper functions and global data""" + + +RULE_OUT_NODE_NAMES = ["Tuple", "TupleGetItem", "batch_flatten", "transpose", "reshape", + "multibox_prior", "multibox_transform_loc", "where", + "non_max_suppression", "strided_slice"] + +# We set a large time to represent an invalid layout-transformation. +# This number is set to be 10e9 seconds to align with autotvm. 
+INVALID_LAYOUT_TIME = 10e9 diff --git a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py new file mode 100644 index 000000000000..0fbfc27310cb --- /dev/null +++ b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py @@ -0,0 +1,522 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=too-many-arguments,too-many-locals,too-many-statements,too-many-instance-attributes,too-many-branches,too-many-nested-blocks,invalid-name,unused-argument,unused-variable,no-member,no-value-for-parameter +"""Base class for graph tuner.""" +import logging +from abc import abstractmethod + +import numpy as np +import topi + +import tvm +from tvm import autotvm, relay +from tvm.autotvm.task import get_config +from tvm.autotvm.task.topi_integration import deserialize_args, serialize_args +from tvm.autotvm.record import encode, load_from_file +from tvm.autotvm.measure import MeasureResult, MeasureInput + +from ... import target as _target +from .utils import is_input_node, get_in_nodes, get_out_nodes, has_multiple_inputs, \ + bind_inputs, expr2graph +from ._base import INVALID_LAYOUT_TIME + + +# Setup topi_op_name -> layout function +# NOTE: To add more ops, change the following dictionary. 
OP2LAYOUT = {
    "topi_nn_conv2d": topi.nn.conv2d_infer_layout,
    "topi_nn_depthwise_conv2d_nchw": topi.nn.depthwise_conv2d_infer_layout,
}


@autotvm.template
def layout_transform(*args):
    """Autotvm layout transform template.

    Deserializes the task arguments, builds a ``topi.layout_transform`` op
    and returns an injective schedule plus its input/output tensors.
    """
    args = deserialize_args(args)
    cfg = get_config()
    # Layout transform does no arithmetic work; -1 is used as the flop
    # placeholder — presumably the autotvm convention for "no flops"; confirm.
    cfg.add_flop(-1)
    data = args[0]
    out = topi.layout_transform(*args)
    sch = topi.generic.schedule_injective([out])
    return sch, [data, out]


class BaseGraphTuner(object):
    """Class to search schedules considering both kernel execution time and
    layout transformation time.

    Before creating a Graph Executor instance, schedule candidates for all kernels in
    graph should be provided through tensor-level tuning.
    """
    def __init__(self, graph, input_shapes, records, target_ops,
                 target, max_sch_num=20, dtype="float32", verbose=True,
                 log_file="graph_tuner.log", log_level=logging.DEBUG,
                 name="graph_tuner"):
        """Create a GlobalTuner instance. Local schedule searching for all nodes with
        target_op in the input graph and layout transformation benchmark need to be
        executed before initialization.

        Parameters
        ----------
        graph : tvm.relay.Expr.Function
            Input graph

        input_shapes : dict of str to tuple.
            Input shapes of graph

        records : str or iterator of (MeasureInput, MeasureResult)
            Collection of kernel level tuning records.
            If it is str, then it should be the filename of a records log file.
            Each row of this file is an encoded record pair.
            Otherwise, it is an iterator.

        target_ops : List of str
            Target tuning operators.

        target : str or tvm.target
            Compilation target.

        max_sch_num : int, optional
            Maximum number of schedule candidates for each workload.

        dtype : str, optional
            Data type.

        verbose : bool, optional
            Whether to also log to the console.

        log_file : str, optional
            graph tuner log file name

        log_level : int, optional
            Logging level passed to the ``logging`` module.

        name : str, optional
            Name of global tuner.
        """
        self._node_list = []
        self._layout_transform_perf_records = {}
        self._layout_transform_interlayer_cost = {}
        self._input_shapes = input_shapes
        # target_ops arrive as callables; only their names are stored.
        self._target_ops = [op.__name__ for op in target_ops]

        self._name = name
        self._max_sch_num = max_sch_num
        self._optimal_sch_dict = {}
        self._records = records
        self._dtype = dtype
        if isinstance(target, str):
            target = _target.create(target)
        self._target = target
        self._optimal_record_dict = {}

        # Set up logger. Existing handlers on a same-named logger are reused
        # so repeated construction does not duplicate log output.
        self._verbose = verbose
        self._logger = logging.getLogger(name + "_logger")
        need_file_handler = need_console_handler = True
        for handler in self._logger.handlers:
            if handler.__class__.__name__ == 'FileHandler':
                need_file_handler = False
            if handler.__class__.__name__ == 'StreamHandler':
                need_console_handler = False
        self._log_level = log_level
        self._log_file = log_file
        self._formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        self._logger.setLevel(log_level)
        if need_file_handler:
            file_handler = logging.FileHandler(log_file)
            file_handler.setFormatter(self._formatter)
            self._logger.addHandler(file_handler)
        if self._verbose and need_console_handler:
            console_handler = logging.StreamHandler()
            console_handler.setFormatter(self._formatter)
            self._logger.addHandler(console_handler)
        self._logger.setLevel(log_level)
        self._logger.propagate = False

        # Generate workload and schedule dictionaries.
        if isinstance(graph, relay.expr.Function):
            node_dict = {}
            graph = bind_inputs(graph, input_shapes, dtype)
            expr2graph(graph, self._target_ops, node_dict, self._node_list)
        else:
            raise RuntimeError("Unsupported graph type: %s" % str(type(graph)))

        self._graph = graph
        self._in_nodes_dict = get_in_nodes(self._node_list, self._target_ops, input_shapes.keys())
        self._out_nodes_dict = get_out_nodes(self._in_nodes_dict)
        self._fetch_cfg()

        # Setup infer_layout for elemwise-like nodes
        # Note: graph tuner currently only supports tuning of single input and single output
        # op as target op, such as conv2d, dense and conv2d_transpose. In this case, we can
        # reuse infer_layout function from target ops for elemwise-like nodes. The behavior
        # is to modify the first tensor shape of input workload to the output shape of
        # elemwise-like node, and use infer_layout function from input op to generate layouts.
        input_names = self._input_shapes.keys()
        for idx in sorted(self._in_nodes_dict.keys()):
            if has_multiple_inputs(self._node_list, idx, input_names):
                node_entry = self._node_list[idx]
                node_entry["topi_op"] = []
                node_entry["workloads"] = []
                for input_idx in self._in_nodes_dict[idx]:
                    input_node = self._node_list[input_idx]
                    if not is_input_node(input_node, input_names):
                        input_topi_op = input_node["topi_op"][0]
                        node_entry["topi_op"].append(input_topi_op)
                        # Only replace the first input tensor
                        input_workload = input_node["workloads"][0]
                        first_tensor = input_workload[1]
                        dtype = first_tensor[-1]
                        new_shape = tuple([val.value for val in node_entry["types"][0].shape])
                        actual_workload = (input_workload[0],) + \
                                          ((new_shape + (dtype,)),) + input_workload[2:]
                        node_entry["workloads"].append(actual_workload)
                        if "record_candidates" not in node_entry:
                            node_entry["record_candidates"] = input_node["record_candidates"]
                    else:
                        # Input/parameter nodes get placeholders so positional
                        # indexing into topi_op/workloads stays aligned.
                        node_entry["topi_op"].append(None)
                        node_entry["workloads"].append(None)


    def _fetch_cfg(self):
"""Read and pre-process input schedules.""" + if isinstance(self._records, str): + records = load_from_file(self._records) + else: + records = self._records + cfg_dict = {} + for record in records: + in_measure, _ = record + workload = in_measure.task.workload + if workload not in cfg_dict: + cfg_dict[workload] = [] + cfg_dict[workload].append(record) + + cache_dict = {} + for key in self._in_nodes_dict: + node_entry = self._node_list[key] + if node_entry["op"] not in self._target_ops: + continue + workload = node_entry["workloads"][0] + if workload in cache_dict: + node_entry["record_candidates"] = cache_dict[workload] + continue + record_candidates = [] + infer_layout_func = OP2LAYOUT[node_entry["topi_op"][0]] + layout_tracking_dict = {} + for record in cfg_dict[workload]: + in_measure, out_measure = record + workload = in_measure.task.workload + cfg = in_measure.config + # For multiple cfgs which produces the same in/out layouts, + # only the most efficient one is preserved. + with self._target: + layouts = infer_layout_func(workload, cfg) + if layouts in layout_tracking_dict: + cost = out_measure.costs[0] + current_best_cost = layout_tracking_dict[layouts][1].costs[0] + if cost < current_best_cost: + layout_tracking_dict[layouts] = record + else: + layout_tracking_dict[layouts] = record + sorted_records = sorted(layout_tracking_dict.values(), + key=lambda item: item[1].costs[0]) + for i in range(min(self._max_sch_num, len(sorted_records))): + record_candidates.append(sorted_records[i]) + node_entry["record_candidates"] = record_candidates + cache_dict[workload] = record_candidates + + def _iterate_layout_transform(self, callback): + """Iterate all possible layout transformations and execute callback for each + iteration. callback function accepts 6 arguments: from_node_idx, to_node_idx, + from_sch_idx, to_sch_idx, args which represent the argument list of layout + transformation and is_valid showing whether this is a valid layout transformation. 
+ """ + input_names = self._input_shapes.keys() + for key, val in self._in_nodes_dict.items(): + node_entry = self._node_list[key] + target_input_idx = -1 + target_input_pos = -1 + if has_multiple_inputs(self._node_list, key, input_names): + for i, item in enumerate(val): + if not is_input_node(self._node_list[item], input_names): + target_input_idx = item + target_input_pos = i + break + + for i, item in enumerate(val): + i_idx = item + in_node_entry = self._node_list[i_idx] + if is_input_node(in_node_entry, input_names): + continue + + if node_entry["op"] in self._target_ops: + o_idx = key + o_infer_layout_func = OP2LAYOUT[node_entry["topi_op"][0]] + o_wkl = node_entry["workloads"][0] + i_topi_op = in_node_entry["topi_op"][0] + i_wkl = in_node_entry["workloads"][0] + pivot = 0 + while not i_wkl: + pivot += 1 + i_topi_op = in_node_entry["topi_op"][pivot] + i_wkl = in_node_entry["workloads"][pivot] + i_infer_layout_func = OP2LAYOUT[i_topi_op] + else: + o_idx = target_input_idx + if i <= target_input_pos: + continue + o_infer_layout_func = OP2LAYOUT[node_entry["topi_op"][0]] + o_wkl = node_entry["workloads"][target_input_pos] + i_infer_layout_func = OP2LAYOUT[node_entry["topi_op"][i]] + i_wkl = node_entry["workloads"][i] + + + for m, i_record in enumerate(in_node_entry["record_candidates"]): + for n, o_record in enumerate(node_entry["record_candidates"]): + i_cfg, o_cfg = i_record[0].config, o_record[0].config + with self._target: + i_input_info, i_output_info = i_infer_layout_func(i_wkl, i_cfg) + o_input_info, o_output_info = o_infer_layout_func(o_wkl, o_cfg) + if len(i_input_info) > 1 or len(i_output_info) > 1 or \ + len(o_input_info) > 1 or len(o_output_info) > 1: + raise RuntimeError("Graph tuner only supports target operator " + "with single input and single output. 
" + "Please check target_ops argument.") + + in_shape, in_layout = i_output_info[0] + if node_entry["op"] in self._target_ops: + _, out_layout = o_input_info[0] + else: + _, out_layout = o_output_info[0] + data_placeholder = tvm.placeholder(in_shape, name="data", + dtype=self._dtype) + args = [data_placeholder, in_layout, out_layout] + callback(i_idx, o_idx, m, n, args) + + + def _create_matrix_callback(self, from_node_idx, to_node_idx, from_sch_idx, + to_sch_idx, args): + """Create dictionary containing matrix format of layout transformation + between nodes.""" + sargs = serialize_args(args) + in_layout, out_layout = args[1], args[2] + ltf_workload = ('layout_transform',) + autotvm.task.args_to_workload(sargs) + idx_pair_key = (from_node_idx, to_node_idx) + + if in_layout == out_layout: + layout_transform_time = 0 + else: + layout_transform_time = \ + self._layout_transform_perf_records[ltf_workload][1].costs[0] + + if idx_pair_key not in self._layout_transform_interlayer_cost: + self._layout_transform_interlayer_cost[idx_pair_key] = [] + if len(self._layout_transform_interlayer_cost[idx_pair_key]) <= from_sch_idx: + self._layout_transform_interlayer_cost[idx_pair_key].append([]) + self._layout_transform_interlayer_cost[idx_pair_key][from_sch_idx]\ + .append(layout_transform_time) + + def benchmark_layout_transform(self, min_exec_num=100, timeout=10, + use_rpc=False, device_key=None, host="localhost", + port=9190, n_parallel=1, build_func='default', + layout_records=None, target_host=None, infer_layout=False): + """Benchmark all possible layout transformation in the graph, + given a set of schedule candidates for each workload of target operator. + + Parameters + ---------- + min_exec_num : int, optional + Minimum number of execution. Final execution time is the average of + all execution time. + + timeout : int, optional + Time out for each execution. + + use_rpc : boolean, optional + Whether to use rpc mode for benchmarking. 
+ + device_key : str, optional + Remote device key which can be queried by + python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190 + + host : str, optional + IP address used to create RPC tracker on host machine. + + port : int, optional + Port number used to create RPC tracker on host machine. + + n_parallel: int, optional + The number of measurement task that can run in parallel. + Set this according to the number of cpu cores (for compilation) and + the number of devices you have (for measuring generate code). + + build_func: str or callable, optional + 'default': call default builder. This works for normal target (llvm, cuda) + + 'ndk': use Android NDK to create shared library. Use this for android target. + + callable: customized build function for other backends (e.g. VTA). + See autotvm/measure/measure_methods.py::default_build_func for example. + + layout_records : str or iterator of (MeasureInput, MeasureResult). optional + Collection of layout_transform benchmarking records. + If is str, then it should be the filename of a records log file. + Each row of this file is an encoded record pair. + Otherwise, it is an iterator. + + If this argument is set, graph tuner will first check whether layout_transform + workload already exists in records and skip benchmarking if possible. + + target_host : str, optional + str or :any:`tvm.target.Target` optional + Host compilation target, if target is device. + When TVM compiles device specific program such as CUDA, + we also need host(CPU) side code to interact with the driver + setup the dimensions and parameters correctly. + target_host is used to specify the host side codegen target. + By default, llvm is used if it is enabled, + otherwise a stackvm intepreter is used. + + infer_layout : bool, optional + Whether to infer layout transformation time if it doesn't exist in records, instead + of benchmarking on target device. 
+ + This might bring performance loss comparing to benchmarking layout transformation. + """ + self._logger.info("Start to benchmark layout transformation...") + if layout_records is None and infer_layout: + raise RuntimeError("Requires some records to infer layout transformation time.") + + if isinstance(layout_records, str): + layout_records = load_from_file(layout_records) + if not layout_records and infer_layout: + raise RuntimeError("Records must be non-empty to infer layout transformation time.") + + if isinstance(layout_records, str): + layout_records = load_from_file(layout_records) + num_flops, total_time = 0, 0 + if layout_records is not None: + for record in layout_records: + ltf_wkl = record[0].task.workload + self._layout_transform_perf_records[ltf_wkl] = record + input_shape = ltf_wkl[1][1] + flops = np.prod(input_shape) + num_flops += flops + total_time += record[1].costs[0] + avg_time = total_time / num_flops if num_flops > 0 else 0 + + args_list = [] + def _fetch_args_callback(from_node_idx, to_node_idx, from_sch_idx, + to_sch_idx, args): + """Callback function to fetch layout transform args""" + _, in_layout, out_layout = args + if in_layout != out_layout: + args_list.append(args) + + self._iterate_layout_transform(_fetch_args_callback) + + def _log_to_list(record_list): + """Callback to log result to a list.""" + def _callback(_, inputs, results): + """Callback implementation""" + record_list.append((inputs[0], results[0])) + return _callback + + builder = autotvm.LocalBuilder(n_parallel=n_parallel, build_func=build_func) + runner = autotvm.LocalRunner(number=min_exec_num, repeat=1, timeout=timeout) + if use_rpc: + if device_key is None: + raise RuntimeError("device_key need to be set to use rpc tracker mode.") + runner = autotvm.measure.RPCRunner(device_key, host, port, n_parallel=n_parallel, + number=min_exec_num, repeat=1, + timeout=timeout) + measure_option = autotvm.measure_option(builder=builder, runner=runner) + for args in args_list: + 
args = serialize_args(args) + ltf_workload = ('layout_transform',) + autotvm.task.args_to_workload(args) + if ltf_workload in self._layout_transform_perf_records: + continue + + if infer_layout: + input_shape = ltf_workload[1][1] + flops = 1 + for i in input_shape: + flops *= i + inferred_time = flops * avg_time + record_input = MeasureInput(target=self._target, task=None, config=None) + record_output = MeasureResult(costs=(inferred_time,), error_no=0, + all_cost=-1, timestamp=-1) + self._layout_transform_perf_records[ltf_workload] = (record_input, record_output) + continue + + records = [] + task = autotvm.task.create(layout_transform, args=args, target=self._target, + target_host=target_host) + task.workload = ltf_workload + tuner = autotvm.tuner.GridSearchTuner(task) + tuner.tune(n_trial=1, measure_option=measure_option, + callbacks=[_log_to_list(records)]) + if not isinstance(records[0][1].costs[0], float): + records[0] = (records[0][0], records[0][1]._replace(costs=(INVALID_LAYOUT_TIME,))) + self._layout_transform_perf_records[ltf_workload] = records[0] + + self._iterate_layout_transform(self._create_matrix_callback) + self._logger.info("Benchmarking layout transformation successful.") + + @property + def layout_transform_perf_records(self): + """Get layout transformation dictionary for input graph. + + Returns + ------- + layout_transform_perf_records : dict of tuple to (MeasureInput, MeasureResult) + Layout transformation dictionary for input graph. + """ + return self._layout_transform_perf_records + + + def get_optimal_records(self): + """Convert optimal record dictionary to a list of records + with ascending order of node index in graph. + + Returns + ------- + sch_list : list of tuple + List of records with ascending order of node index in graph. 
+ """ + ordered_index_list = sorted(self._optimal_record_dict.keys()) + ret = [] + for index in ordered_index_list: + node_entry = self._node_list[index] + if node_entry["op"] not in self._target_ops: + continue + ret.append(node_entry["record_candidates"][self._optimal_record_dict[index]]) + return ret + + def write_opt_sch2record_file(self, record_file="graph_opt_schedule.log"): + """Write graph level optimal schedules into file. + + Parameters + ---------- + record_file : str, optional + Output schedule file. + """ + with open(record_file, "a") as out_file: + records = self.get_optimal_records() + for record in records: + out_file.write(encode(record[0], record[1]) + "\n") + msg = "Writing optimal schedules to %s successfully." % record_file + self._logger.info(msg) + + @abstractmethod + def run(self, **kwargs): + """Run graph tuning.""" + pass diff --git a/python/tvm/autotvm/graph_tuner/dynamic_programming_stage.py b/python/tvm/autotvm/graph_tuner/dynamic_programming_stage.py new file mode 100644 index 000000000000..4a512c224a1d --- /dev/null +++ b/python/tvm/autotvm/graph_tuner/dynamic_programming_stage.py @@ -0,0 +1,358 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
# pylint: disable=too-many-instance-attributes,too-many-branches,too-many-statements,too-many-arguments,too-many-locals,invalid-name
"""Stage class for dynamic programming tuner"""
import numpy as np

from .utils import is_input_node


class DPStage(object):
    """Class to represent node in Markov decision process. A stage has states
    to represent different schedules of the current node. Since in this problem
    the action is the schedule selected for current node, action can be fully
    represented by states. No extra attribute needs for action.

    In most cases, instance of this class should be created through DPTuner.
    """
    def __init__(self, idx, input_shapes, node_list,
                 counted_nodes_set, layout_transform_interlayer_cost,
                 stage_dict, in_nodes_dict, out_nodes_dict,
                 dep_dict, target_ops, dtype="float32"):
        """Initialize a stage and create all states.

        Parameters
        ----------
        idx : int
            Index for current node.

        input_shapes : dict of string to tuple of int
            Input shapes for current graph.

        node_list : list of dict
            List of all nodes for current graph.

        counted_nodes_set : set of int
            Global set recording whether the execution time of a node has been counted.

        layout_transform_interlayer_cost : dict of tuple to list
            Dictionary maps node index pair to layout transformation time between them.

        stage_dict : dict of int to Stage
            Global dictionary for all stages mapping node index to stage.

        in_nodes_dict : dict of int to list of int
            Dictionary maps node index to corresponding input node index.

        out_nodes_dict : dict of int to list of int
            Dictionary maps node index to corresponding output node index.

        dep_dict : dict of int to set of int
            Dictionary maps node index to dependent node index.

        target_ops : list of str
            Target operators

        dtype : str, optional
            Data type.
        """
        # "_global_*" members are shared, mutable structures owned by the
        # tuner; this stage both reads and updates them.
        self._global_input_shapes = input_shapes
        self._global_input_names = input_shapes.keys()
        self._global_node_list = node_list
        self._global_counted_nodes_set = counted_nodes_set
        self._global_layout_transform_interlayer_cost = layout_transform_interlayer_cost
        self._global_stage_dict = stage_dict
        self._global_in_nodes_dict = in_nodes_dict
        self._global_out_nodes_dict = out_nodes_dict
        self._global_dep_dict = dep_dict

        self._idx = idx
        self._node_entry = self._global_node_list[idx]
        self._target_ops = target_ops
        self._wkl = self._node_entry["workloads"][0]
        self._record_list = self._node_entry["record_candidates"]
        self._dep = []
        self._dtype = dtype
        self._states = None
        self._full_states = None
        self._full_states_idx = None
        self._create_states()

    def _create_states(self):
        """Create states."""
        node = self._global_node_list[self._idx]
        if node["op"] in self._target_ops:
            self._create_op_states()
        else:
            self._create_multi_inputs_states()

    def _create_op_states(self):
        """State creation routine for nodes with target_op."""
        # Find the first non-input producer of this node, if any.
        input_idx = -1
        for index in self._global_in_nodes_dict[self._idx]:
            input_idx = index
            if not is_input_node(self._global_node_list[input_idx],
                                 self._global_input_names):
                break

        if is_input_node(self._global_node_list[input_idx],
                         self._global_input_names):
            # Fed directly by a graph input: states are just the candidate
            # kernel costs — no layout transform cost to add.
            self._full_states = np.array([record[1].costs[0]
                                          for record in self._record_list])
            self._states = self._full_states
        else:
            input_node_entry = self._global_node_list[input_idx]
            input_stage = self._global_stage_dict[input_idx]
            input_dep = input_stage.dep
            input_states = input_stage.states
            input_flatten_states = input_states.flatten()
            input_record_list = input_node_entry["record_candidates"]
            num_schedules = len(self._record_list)
            num_input_schedules = len(input_record_list)
            num_input_states = input_flatten_states.shape[0]

            # States tensor shape: (this node's schedules, input's schedules,
            # then one axis per dependency of the input stage).
            full_states_shape = tuple([num_schedules, num_input_schedules] +
                                      [len(self._global_node_list[dep_idx]["record_candidates"])
                                       for dep_idx in input_dep])
            self._full_states = np.zeros(full_states_shape).flatten().astype("float32")
            self._full_states_idx = [self._idx, input_idx] + input_dep
            dep_multiplier = 1
            for i in range(2, len(full_states_shape)):
                dep_multiplier *= full_states_shape[i]
            input_node_time_counted = input_idx in self._global_counted_nodes_set

            for i in range(num_schedules):
                current_sch_time = float(self._record_list[i][1].costs[0])
                for j in range(num_input_states):
                    input_sch_idx = j // dep_multiplier
                    layout_transform_time = \
                        self._global_layout_transform_interlayer_cost \
                            [(input_idx, self._idx)][input_sch_idx][i]

                    # Add the input's own time only once across all stages.
                    if input_node_time_counted:
                        total_time = current_sch_time + layout_transform_time
                    else:
                        total_time = \
                            current_sch_time + layout_transform_time + input_flatten_states[j]
                    current_state_idx = i * num_input_states + j
                    self._full_states[current_state_idx] = total_time

            if not input_node_time_counted:
                self._global_counted_nodes_set.add(input_idx)
            self._full_states = self._full_states.reshape(full_states_shape)

            # If out degree of input node is 1, we can remove the dimension of input node,
            # since the states of input node will not be needed any more. Otherwise, input
            # node should become a dependency.
            if len(self._global_out_nodes_dict[input_idx]) == 1:
                self._states = np.amin(self._full_states, axis=1)
                self._dep = list(input_dep)
            else:
                self._states = self._full_states
                self._dep = [input_idx,] + input_dep

        # Update global dependency dictionary.
        # This is to monitor the dependency states to decide
        # when a dependency can be eliminated, so that total
        # number of states can be largely reduced.
        for dep_idx in self._dep:
            self._global_dep_dict[dep_idx].remove(self._idx)
            for child in self._global_out_nodes_dict[self._idx]:
                self._global_dep_dict[dep_idx].add(child)
        if len(self._global_out_nodes_dict[self._idx]) > 1:
            self._global_dep_dict[self._idx] = set()
            for child in self._global_out_nodes_dict[self._idx]:
                self._global_dep_dict[self._idx].add(child)

    def _create_multi_inputs_states(self):
        """State creation routine for multi_input operator

        In tvm, layout transformation for an elemwise-like follow the rule which
        all input operators transform their layouts to the leftmost input operator
        layout. For example:
                            elemwise-sum
                            |    |    |
                            |    |    |
                           op0  op1  op2
        In this block, the possible layout transformations are: op1 -> op0 and op2 -> op0.
        In graph tuning, a 3-D array with shape (k0, k1, k2) can represent the layout
        transformations between these three nodes. It is also possible some earlier states
        belong to other nodes(We name them as dependency) are required for dynamic programming.
        The final states array for this elemwise-sum can be with shape (e0, k0, k1, e1, k2).
        To iterate through all states, we first align the shape of op0, op1 and op2 to be
        (e0, k0, k1, e1, k2) by broadcasting the original states. We also record the axis of
        each input node in the states array, together with the multiplier. For example,
        the axis index for op0 is 1, and multiplier is k1 * e1 * k2. If current iterating index
        in the flatten array is i, the index of op0 can be computed as:
        i % (k0 * k1 * e1 * k2) // (k1 * e1 * k2).
+ """ + full_input_node_list = list(self._global_in_nodes_dict[self._idx]) + input_index_list = [] + # Remove input and parameter nodes + for input_idx in full_input_node_list: + if not is_input_node(self._global_node_list[input_idx], + self._global_input_names): + input_index_list.append(input_idx) + + # Generate new states + states_list, aligned_node_list = DPStage.align_states(input_index_list, + self._global_stage_dict, + self._global_node_list) + target_node_idx, target_major_axis, target_multiplier, target_states = states_list[0] + aligned_shape = target_states.shape + self._full_states = np.zeros(aligned_shape).astype("float32").flatten() + self._full_states_idx = list(aligned_node_list) + num_states = self._full_states.shape[0] + node_time_counted = [item[0] in self._global_counted_nodes_set for item in states_list] + target_states = target_states.flatten() + src_states_list = [states_list[i][3].flatten() for i in range(1, len(states_list))] + + for i in range(num_states): + target_sch_idx = (i % (target_multiplier * + aligned_shape[target_major_axis])) // target_multiplier + if node_time_counted[0]: + new_state = 0 + else: + new_state = target_states[i] + + for j in range(1, len(states_list)): + src_states = src_states_list[j - 1] + src_node_idx, src_major_axis, src_multiplier, _ = states_list[j] + src_sch_idx = (i % (src_multiplier * + aligned_shape[src_major_axis])) // src_multiplier + layout_transform_time = \ + self._global_layout_transform_interlayer_cost\ + [(src_node_idx, target_node_idx)][src_sch_idx][target_sch_idx] + + if node_time_counted[j]: + new_state += layout_transform_time + else: + new_state += layout_transform_time + src_states[i] + self._full_states[i] = new_state + + for i, node_counted in enumerate(node_time_counted): + if not node_counted: + self._global_counted_nodes_set.add(states_list[i][0]) + self._full_states = self._full_states.reshape(aligned_shape) + + # Remove dependency to reduce states + reduced_states = 
np.array(self._full_states) + reduced_states_transpose = [states_list[0][1]] + reduced_states_dep_list = [] + self._dep = [] + for i in range(len(reduced_states.shape)): + if i != states_list[0][1]: + reduced_states_transpose.append(i) + reduced_states_dep_list.append(aligned_node_list[i]) + reduced_states = np.transpose(reduced_states, reduced_states_transpose) + shift = 0 + for i, dep in enumerate(reduced_states_dep_list): + if dep not in self._global_dep_dict or len(self._global_dep_dict[dep]) == 1: + self._global_dep_dict.pop(dep, None) + reduced_states = np.amin(reduced_states, axis=i+1-shift) + shift += 1 + else: + self._dep.append(dep) + self._states = reduced_states + + # Update dependency + for dep in self._dep: + self._global_dep_dict[dep].remove(self._idx) + for child in self._global_out_nodes_dict[self._idx]: + self._global_dep_dict[dep].add(child) + if len(self._global_out_nodes_dict[self._idx]) > 1: + self._global_dep_dict[self._idx] = set() + for child in self._global_out_nodes_dict[self._idx]: + self._global_dep_dict[self._idx].add(child) + + @property + def dep(self): + """Get dependency list.""" + return self._dep + + @property + def states(self): + """Get states.""" + return self._states + + @property + def full_states(self): + """Get complete states.""" + return self._full_states + + @property + def full_states_idx(self): + """Get node index of complete states.""" + return self._full_states_idx + + @staticmethod + def align_states(input_index_list, stage_dict, node_list): + """Align all input node states shapes to be the same and transpose/reshape properly. + + This is used in creating multi_input operator states. + + Parameters + ---------- + input_index_list : list of int + List of input node index. + + stage_dict : dict of int to Stage + Global dictionary of node index to stage. + + node_list : list of dict + List of all nodes for current graph. + + Returns + ------- + states_list : list of tuple + List of aligned states. 

        aligned_node_list : list of int
            List of node index for aligned states.
        """
        aligned_node_list = list(input_index_list)
        states_list = []
        # Collect every dependency of every input stage so all states can be
        # broadcast into one common shape.
        for input_idx in input_index_list:
            input_node_stage = stage_dict[input_idx]
            for dep_idx in input_node_stage.dep:
                if dep_idx not in aligned_node_list:
                    aligned_node_list.append(dep_idx)
        aligned_shape = tuple([len(node_list[idx]["record_candidates"])
                               for idx in aligned_node_list])
        for input_idx in input_index_list:
            input_node_stage = stage_dict[input_idx]
            input_node_shape_idx_list = [input_idx] + input_node_stage.dep
            transpose_idx_list = []
            reshape_list = []
            major_axis = -1
            for i, idx in enumerate(aligned_node_list):
                if input_idx == idx:
                    major_axis = i
                if idx in input_node_shape_idx_list:
                    transpose_idx_list.append(idx)
                    reshape_list.append(aligned_shape[i])
                else:
                    # Axis not owned by this input: size-1 placeholder so
                    # broadcast_to can expand it below.
                    reshape_list.append(1)
            transpose_list = [input_node_shape_idx_list.index(idx) for idx in transpose_idx_list]
            input_node_states = np.transpose(input_node_stage.states, tuple(transpose_list))
            input_node_states = np.reshape(input_node_states, tuple(reshape_list))
            input_node_states = np.broadcast_to(input_node_states, aligned_shape)
            # Multiplier converts a flat index into this input's schedule index.
            multiplier = 1
            for i in range(major_axis + 1, len(aligned_shape)):
                multiplier *= aligned_shape[i]
            states_list.append((input_idx, major_axis, multiplier, input_node_states))
        return states_list, aligned_node_list
diff --git a/python/tvm/autotvm/graph_tuner/dynamic_programming_tuner.py b/python/tvm/autotvm/graph_tuner/dynamic_programming_tuner.py
new file mode 100644
index 000000000000..11571f2bdef9
--- /dev/null
+++ b/python/tvm/autotvm/graph_tuner/dynamic_programming_tuner.py
@@ -0,0 +1,189 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=import-error,too-many-locals,too-many-statements,too-many-branches,unused-variable
"""Dynamic programming tuner."""
import sys
import numpy as np

from .base_graph_tuner import BaseGraphTuner
from .dynamic_programming_stage import DPStage
from .utils import has_multiple_inputs, is_input_node

# Python 2/3 compatible queue import.
if sys.version_info[0] == 3:
    import queue
else:
    import Queue as queue

class DPTuner(BaseGraphTuner):
    """Tuner which uses dynamic programming to solve MDP problem.

    Note: currently dynamic programming is used to solve this MDP problem. However,
    this problem is intrinsically non-polynomial. DP can't apply for more complicated
    models, such as networks with many element-wise sum operators. In this case, switch
    to heuristic algorithm such as PBQP tuner.
    """
    def __init__(self, *args, **kwargs):
        """Create a dynamic programming tuner.
+ """ + super(DPTuner, self).__init__(*args, **kwargs) + self._num_states = self._max_num_states = None + self._stage_dict = {} + self._dep_dict = {} + self._counted_nodes_set = set() + + self._global_data_dict = { + "dtype": self._dtype, + "counted_nodes_set": self._counted_nodes_set, + "stage_dict": self._stage_dict, + "in_nodes_dict": self._in_nodes_dict, + "out_nodes_dict": self._out_nodes_dict, + "dep_dict": self._dep_dict, + "node_list": self._node_list, + "input_shapes": self._input_shapes, + "layout_transform_interlayer_cost": self._layout_transform_interlayer_cost + } + + def _check_num_states(self, num_states): + """Track the number of states.""" + self._num_states += num_states + if self._max_num_states is not None: + if self._num_states > self._max_num_states: + raise RuntimeError("Too many states detected while running dynamic " + "programming: got %d states but upper limit is %d." % + (self._num_states, self._max_num_states)) + + def _forward(self): + """Forward pass in DP to generate states for all stages. + """ + self._logger.info("Start forward pass...") + for node_idx in sorted(self._in_nodes_dict.keys()): + stage = DPStage(idx=node_idx, target_ops=self._target_ops, + **self._global_data_dict) + self._check_num_states(stage.full_states.size) + self._stage_dict[node_idx] = stage + self._logger.info("Finished forward pass.") + + def _backward(self): + """Backward pass in DP to generate optimal solution. 
+ """ + self._logger.info("Start backward pass...") + input_names = self._input_shapes.keys() + optimal_record_dict = {} + # Pick optimal schedule for output nodes + output_idx_list = [] + for key, val in self._out_nodes_dict.items(): + if not val: + output_idx_list.append(key) + states_list, aligned_node_list = DPStage.align_states(output_idx_list, self._stage_dict, + self._node_list) + num_states = states_list[0][3].size + self._check_num_states(num_states * len(output_idx_list)) + aligned_node_shape = states_list[0][3].shape + min_time = 0 + min_pos = -1 + for states in states_list: + min_time += np.amax(states[3]) + flatten_states_list = [current_states[3].flatten() for current_states in states_list] + for i in range(num_states): + current_time = 0 + for j, current_states in enumerate(states_list): + current_time += flatten_states_list[j][i] + if min_time > current_time: + min_time = current_time + min_pos = i + for i, states in enumerate(states_list): + current_major_axis = states[1] + current_sch_idx = (min_pos % (states[2] * + aligned_node_shape[current_major_axis])) // states[2] + optimal_record_dict[aligned_node_list[i]] = current_sch_idx + # Pick optimal schedule for dependencies of output nodes + for i in range(len(states_list), len(aligned_node_list)): + multiplier = 1 + for j in range(i + 1, len(aligned_node_list)): + multiplier *= aligned_node_shape[j] + optimal_record_dict[aligned_node_list[i]] = \ + min_pos // multiplier % aligned_node_shape[i] + + # Backward pass to get optimal schedules for other nodes + bfs_q = queue.Queue() + visited = set() + for out_idx in output_idx_list: + bfs_q.put(out_idx) + while not bfs_q.empty(): + node_idx = bfs_q.get() + visited.add(node_idx) + if is_input_node(self._node_list[node_idx], input_names): + continue + optimal_sch_idx = optimal_record_dict[node_idx] + full_states = self._stage_dict[node_idx].full_states + if not has_multiple_inputs(self._node_list, node_idx, input_names): + input_idx = 
self._in_nodes_dict[node_idx][0] + if is_input_node(self._node_list[input_idx], input_names): + continue + if input_idx not in visited: + bfs_q.put(input_idx) + if input_idx not in optimal_record_dict: + dep_list = self._stage_dict[node_idx].dep + dep_idx = tuple([optimal_record_dict[item] for item in dep_list]) + tmp = np.argmin(full_states, axis=1) + optimal_input_sch_idx = tmp[(optimal_sch_idx,) + dep_idx] + optimal_record_dict[input_idx] = optimal_input_sch_idx + else: + input_idx_list = self._in_nodes_dict[node_idx] + optimal_record_dict[input_idx_list[0]] = optimal_sch_idx + full_states_idx = self._stage_dict[node_idx].full_states_idx + tmp = full_states[optimal_sch_idx] + new_states_idx, new_states_pos = [], [] + visited_states_idx, visited_states_pos = [], [] + for i in range(1, len(full_states_idx)): + if full_states_idx[i] in optimal_record_dict: + visited_states_idx.append(full_states_idx[i]) + visited_states_pos.append(i - 1) + else: + new_states_idx.append(full_states_idx[i]) + new_states_pos.append(i - 1) + if visited_states_idx: + tmp = np.transpose(tmp, tuple(visited_states_pos + new_states_pos)) + tmp = tmp[tuple([optimal_record_dict[idx] for idx in visited_states_idx])] + min_pos = np.argmin(tmp) + multiplier = 1 + for i in range(len(new_states_idx)): + multiplier *= full_states.shape[new_states_pos[i] + 1] + for pos, idx in zip(new_states_pos, new_states_idx): + multiplier //= full_states.shape[pos + 1] + optimal_record_dict[idx] = min_pos // multiplier + min_pos %= multiplier + for input_idx in input_idx_list: + if input_idx not in visited: + bfs_q.put(input_idx) + + self._optimal_record_dict = optimal_record_dict + for node_idx, _ in self._in_nodes_dict.items(): + if self._node_list[node_idx]["op"] not in self._target_ops: + continue + self._logger.info("Finished backward pass...") + + def run(self, **kwargs): + """Run dynamic programming solver. 
+ """ + max_num_states = None if "max_num_states" not in kwargs else kwargs["max_num_states"] + self._num_states = 0 + self._max_num_states = max_num_states + self._logger.info("Start to run dynamic programming algorithm...") + self._forward() + self._backward() + self._logger.info("Finished DPExecutor run.") diff --git a/python/tvm/autotvm/graph_tuner/pbqp_tuner.py b/python/tvm/autotvm/graph_tuner/pbqp_tuner.py new file mode 100644 index 000000000000..1d7089ef248b --- /dev/null +++ b/python/tvm/autotvm/graph_tuner/pbqp_tuner.py @@ -0,0 +1,288 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,too-many-locals +"""Partitioned Boolean Quadratic Programming Tuner""" +from ._base import INVALID_LAYOUT_TIME +from .base_graph_tuner import BaseGraphTuner +from .utils import is_input_node, has_multiple_inputs + + +class PBQPTuner(BaseGraphTuner): + """An approximation method to deal with intractably + large size of graph tuning problem. + + This graph coloring algorithm mainly comes from: + + Lang Hames and Bernhard Scholz. + Nearly optimal register allocation with pbqp.JMLC 2006. + LNCS, vol.4228,pp. 
class PBQPTuner(BaseGraphTuner):
    """An approximation method to deal with intractably
    large size of graph tuning problem.

    This graph coloring algorithm mainly comes from:

    Lang Hames and Bernhard Scholz.
    Nearly optimal register allocation with pbqp. JMLC 2006.
    LNCS, vol.4228, pp. 346-361, 2006
    """
    def __init__(self, *args, **kwargs):
        """Create a partitioned boolean quadratic programming tuner.
        """
        super(PBQPTuner, self).__init__(*args, **kwargs)

        # Remove input nodes from the in-edge lists: they carry no
        # schedule choice of their own.
        input_names = self._input_shapes.keys()
        for node_idx in self._out_nodes_dict:
            if is_input_node(self._node_list[node_idx], input_names):
                for out_node_idx in self._out_nodes_dict[node_idx]:
                    self._in_nodes_dict[out_node_idx].remove(node_idx)

        # Undirected adjacency: in-neighbors followed by out-neighbors.
        self._adj_dict = {}
        for node_idx in self._in_nodes_dict:
            self._adj_dict[node_idx] = list(self._in_nodes_dict[node_idx]) + \
                                       list(self._out_nodes_dict[node_idx])

        # Per-node cost vector: first measured cost of each record candidate.
        self._record_cost_dict = {}
        for key in self._in_nodes_dict:
            self._record_cost_dict[key] = []
            for record in self._node_list[key]["record_candidates"]:
                self._record_cost_dict[key].append(record[1].costs[0])

        self._max_degree = -1
        self._node_degree_dict = {}
        for node_idx in self._in_nodes_dict:
            node_degree = self._get_degree(node_idx)
            self._node_degree_dict[node_idx] = node_degree
            self._max_degree = max(self._max_degree, node_degree)

        # Bucket i holds nodes of current degree i; sized +2 so degree can
        # grow by one during RN reductions without reallocation.
        self._stack = []
        self._buckets = [[] for _ in range(self._max_degree + 2)]
        for node_idx in sorted(self._in_nodes_dict):
            node_degree = self._get_degree(node_idx)
            self._buckets[node_degree].append(node_idx)

        # Cleared to False the first time an RN (heuristic) reduction runs.
        self._is_optimal = True

    def _get_degree(self, node_idx):
        """Get node degree.
        """
        return len(self._adj_dict[node_idx])

    def _reorder_adj_nodes(self, node_idx):
        """Update buckets list with current adjacency list.
        """
        for adj_node in self._adj_dict[node_idx]:
            current_degree = self._get_degree(adj_node)
            prev_degree = self._node_degree_dict[adj_node]
            if prev_degree != current_degree:
                self._buckets[prev_degree].remove(adj_node)
                self._buckets[current_degree].insert(0, adj_node)
                self._node_degree_dict[adj_node] = current_degree

    def _remove_node(self, node_idx):
        """Remove node from graph. Update adjacency list accordingly.
        """
        node_degree = self._get_degree(node_idx)
        self._buckets[node_degree].remove(node_idx)
        for adj_node in self._adj_dict[node_idx]:
            self._adj_dict[adj_node].remove(node_idx)

    def _insert_edge(self, node_x, node_y, adj_cost_matrix):
        """Insert an edge between two nodes.
        """
        self._layout_transform_interlayer_cost[(node_x, node_y)] = adj_cost_matrix
        # Store the transpose for the reverse direction.
        self._layout_transform_interlayer_cost[(node_y, node_x)] = []
        for i in range(len(adj_cost_matrix[0])):
            self._layout_transform_interlayer_cost[(node_y, node_x)].append([])
            for cost_vec in adj_cost_matrix:
                self._layout_transform_interlayer_cost[(node_y, node_x)][i] \
                    .append(cost_vec[i])

        self._adj_dict[node_x].append(node_y)
        self._adj_dict[node_y].append(node_x)

    def _backward_insert_node(self, node_idx):
        """Reinsert node in backward pass.
        """
        for adj_node in self._adj_dict[node_idx]:
            self._adj_dict[adj_node].append(node_idx)

    def _RI_reduction(self, node_idx):
        """Reduce nodes with degree 1.

        The node's cost vector is folded into its single neighbor, then the
        node is removed and pushed on the stack for the backward pass.
        """
        adj_node = self._adj_dict[node_idx][0]
        ltf_matrix = self._layout_transform_interlayer_cost[(adj_node, node_idx)]
        for i, cost_vec in enumerate(ltf_matrix):
            min_cost = INVALID_LAYOUT_TIME
            for j, cost in enumerate(cost_vec):
                min_cost = min(min_cost, cost + self._record_cost_dict[node_idx][j])
            self._record_cost_dict[adj_node][i] += min_cost
        self._remove_node(node_idx)
        self._reorder_adj_nodes(node_idx)
        self._stack.append(node_idx)

    def _RII_reduction(self, node_idx):
        """Reduce nodes with degree 2.

        Folds the node into a delta matrix on the edge between its two
        neighbors, creating that edge if it does not exist yet.
        """
        adj_node_x, adj_node_y = self._adj_dict[node_idx]
        ltf_matrix_x = self._layout_transform_interlayer_cost[(adj_node_x, node_idx)]
        ltf_matrix_y = self._layout_transform_interlayer_cost[(adj_node_y, node_idx)]
        delta_matrix = [[] for _ in range(len(ltf_matrix_x))]
        for i, cost_vec_x in enumerate(ltf_matrix_x):
            for j, cost_vec_y in enumerate(ltf_matrix_y):
                min_cost = INVALID_LAYOUT_TIME
                for k in range(len(self._record_cost_dict[node_idx])):
                    min_cost = min(min_cost, cost_vec_x[k] + cost_vec_y[k]
                                   + self._record_cost_dict[node_idx][k])
                delta_matrix[i].append(min_cost)

        if adj_node_x == adj_node_y:
            # Both edges go to the same neighbor: only the diagonal matters.
            for i, delta_row in enumerate(delta_matrix):
                self._record_cost_dict[adj_node_x][i] += delta_row[i]
        elif adj_node_x in self._adj_dict[adj_node_y]:
            # Edge already exists: accumulate delta in both directions.
            for i, _ in enumerate(delta_matrix):
                for j, delta in enumerate(delta_matrix[i]):
                    self._layout_transform_interlayer_cost[(adj_node_x, adj_node_y)][i][j] \
                        += delta
                    self._layout_transform_interlayer_cost[(adj_node_y, adj_node_x)][j][i] \
                        += delta
        else:
            self._insert_edge(adj_node_x, adj_node_y, delta_matrix)

        self._remove_node(node_idx)
        self._reorder_adj_nodes(node_idx)
        self._stack.append(node_idx)

    def _RN_reduction(self, node_idx):
        """Reduce nodes with degree greater than 2.

        Heuristic step: greedily commits this node to its locally-best
        record, so the overall solution is no longer guaranteed optimal.
        """
        min_cost = INVALID_LAYOUT_TIME
        record_idx = -1

        for i, record_cost in enumerate(self._record_cost_dict[node_idx]):
            current_cost = record_cost
            for adj_node in self._adj_dict[node_idx]:
                ltf_matrix = self._layout_transform_interlayer_cost[(node_idx, adj_node)]
                adj_record_cost = list(self._record_cost_dict[adj_node])
                for j, ltf_cost in enumerate(ltf_matrix[i]):
                    adj_record_cost[j] += ltf_cost
                current_cost += min(adj_record_cost)
            if current_cost < min_cost:
                min_cost = current_cost
                record_idx = i

        if record_idx < 0:
            # FIX: original message misspelled "solution" as "soltuion".
            raise RuntimeError("Can't find a solution for node %d when "
                               "applying RN reduction" % node_idx)
        self._optimal_record_dict[node_idx] = record_idx
        self._is_optimal = False

        for adj_node in self._adj_dict[node_idx]:
            ltf_matrix = self._layout_transform_interlayer_cost[(node_idx, adj_node)]
            for i, ltf_cost in enumerate(ltf_matrix[record_idx]):
                self._record_cost_dict[adj_node][i] += ltf_cost

        self._remove_node(node_idx)
        self._reorder_adj_nodes(node_idx)
        self._stack.append(node_idx)

    def _forward(self):
        """Forward pass in PBQP to reduce nodes.
        """
        while True:
            if self._buckets[1]:
                node_idx = self._buckets[1][0]
                self._RI_reduction(node_idx)
            elif self._max_degree >= 2 and self._buckets[2]:
                node_idx = self._buckets[2][0]
                self._RII_reduction(node_idx)
            elif self._max_degree >= 3:
                max_degree_node = -1
                for i in range(self._max_degree, 2, -1):
                    if self._buckets[i]:
                        max_degree_node = self._buckets[i][0]
                        self._RN_reduction(max_degree_node)
                        break
                if max_degree_node < 0:
                    break
            else:
                break

    def _backward(self):
        """Backward pass in PBQP to generate optimal solution.
        """
        # Solve nodes left in the forward graph (degree 0).
        for node_idx in self._buckets[0]:
            record_costs = self._record_cost_dict[node_idx]
            min_cost = min(record_costs)
            self._optimal_record_dict[node_idx] = record_costs.index(min_cost)

        # Solve reduced nodes in reverse reduction order.
        for node_idx in reversed(self._stack):
            self._backward_insert_node(node_idx)
            if node_idx not in self._optimal_record_dict:
                record_costs = list(self._record_cost_dict[node_idx])
                for adj_node in self._adj_dict[node_idx]:
                    adj_optimal_idx = self._optimal_record_dict[adj_node]
                    for i, _ in enumerate(record_costs):
                        record_costs[i] += \
                            self._layout_transform_interlayer_cost \
                                [(node_idx, adj_node)][i][adj_optimal_idx]
                min_cost = min(record_costs)
                self._optimal_record_dict[node_idx] = record_costs.index(min_cost)

    def run(self, **kwargs):
        """Run partitioned boolean quadratic programming tuner.
        """
        self._logger.info("Start to run PBQP algorithm...")
        # Define virtual record lists and layout transformation matrices
        # for multi-input nodes.
        input_names = self._input_shapes.keys()
        temp = {}
        for key, val in self._in_nodes_dict.items():
            target_input_idx = -1
            target_input_pos = -1
            if has_multiple_inputs(self._node_list, key, input_names):
                # First non-input ancestor acts as the "target" whose layout
                # the remaining inputs must agree with.
                for i, item in enumerate(val):
                    if not is_input_node(self._node_list[item], input_names):
                        target_input_idx = item
                        target_input_pos = i
                        break
                # Identity matrix: same record index costs 0, others invalid.
                temp[(target_input_idx, key)] = []
                record_candidates = self._node_list[target_input_idx]["record_candidates"]
                for j in range(len(record_candidates)):
                    temp[(target_input_idx, key)].append([])
                    for k in range(len(record_candidates)):
                        temp[(target_input_idx, key)][j].append(0 if j == k
                                                                else INVALID_LAYOUT_TIME)

                for j in range(target_input_pos + 1, len(val)):
                    input_idx = val[j]
                    if is_input_node(self._node_list[input_idx], input_names):
                        continue
                    temp[(input_idx, key)] = \
                        self._layout_transform_interlayer_cost[(input_idx, target_input_idx)]
        self._layout_transform_interlayer_cost.update(temp)

        # Create reverse layout transformation matrices (transposes).
        temp = {}
        for idx_pair, ltf_matrix in self._layout_transform_interlayer_cost.items():
            reverse_key = (idx_pair[1], idx_pair[0])
            reverse_matrix = [[] for _ in range(len(ltf_matrix[0]))]
            for i, _ in enumerate(ltf_matrix):
                for j, ltf in enumerate(ltf_matrix[i]):
                    reverse_matrix[j].append(ltf)
            temp[reverse_key] = reverse_matrix
        self._layout_transform_interlayer_cost.update(temp)

        self._forward()
        self._backward()
        is_optimal = "optimal" if self._is_optimal else "sub-optimal"
        msg = "Finished PBQPExecutor run. Got %s solution." % is_optimal
        self._logger.info(msg)
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=wildcard-import +"""Graph tuner utility functions""" +from __future__ import absolute_import + +from . import traverse_graph +from . import utils + +from .traverse_graph import expr2graph, get_direct_ancestor, get_in_nodes, \ + get_out_nodes +from .utils import has_multiple_inputs, is_input_node, bind_inputs diff --git a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py new file mode 100644 index 000000000000..08f1017e7fb8 --- /dev/null +++ b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py @@ -0,0 +1,312 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
# pylint: disable=too-many-locals,too-many-statements,too-many-branches,protected-access
"""API for graph traversing."""
import threading

import topi

from tvm import relay, autotvm
from tvm.relay.expr import Call, Function, TupleGetItem, Var, Constant, Tuple
from tvm.relay.ty import TupleType, TensorType
from tvm.autotvm.task import TaskExtractEnv

from .._base import RULE_OUT_NODE_NAMES
from .utils import has_multiple_inputs, is_input_node


# Setup relay op base name -> topi compute functions
# NOTE: To add more ops, change the following dictionary.
OP2COMPUTE = {
    "conv2d": [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw],
}


def expr2graph(expr, target_ops, node_dict, node_list):
    """Convert relay expr to graph data structure
    and fetch workloads of target operators.

    Parameters
    ----------
    expr : tvm.relay.Expr.Function
        Input relay function expression.

    target_ops: List of str
        List of target relay base op name

    node_dict : dictionary from tvm.relay.Expr to int
        Dictionary to record node index

    node_list : list of dictionary
        List of nodes which contains all expr in the input relay function.
        Each node will be stored as a dictionary in the format of
        {"op": str, "node": tvm.relay.expr, "inputs": [int], "types": [tvm.relay.Type],
         "name": str, "workloads": [tuple], "topi_op": [function]}
    """
    env = TaskExtractEnv.get(allow_duplicate=True)
    topi_funcs = []
    for op_name in target_ops:
        if op_name not in OP2COMPUTE:
            raise RuntimeError("Not supported relay op in graph tuner: %s"
                               % op_name)
        topi_funcs += OP2COMPUTE[op_name]
    env.reset(topi_funcs)
    _expr2graph_impl(expr, target_ops, node_dict, node_list)
    # Tasks were collected by the tracing build in topo-order, so walk the
    # node list once and attach workload/topi_op info positionally.
    task_pos = 0
    for node_entry in node_list:
        if node_entry["op"] in target_ops:
            task_name, args = env.task_collection[task_pos]
            task = autotvm.task.create(task_name, args,
                                       target="llvm",
                                       target_host=None,
                                       template_key='direct')
            node_entry["workloads"] = [task.workload]
            node_entry["topi_op"] = [task_name]
            task_pos += 1


def _expr2graph_impl(expr, target_ops, node_dict, node_list):
    """Implementation to convert relay expr to graph data structure
    """
    def _traverse_expr(node):
        if node in node_dict:
            return
        node_index = len(node_list)
        node_entry = {"node": node, "inputs": [], "types": [],
                      "op": "null", "name": None}

        if isinstance(node, Call):
            op_name = node.op.name.split(".")[-1]
            node_entry["op"] = op_name
            for arg in node.args:
                in_node_idx = node_dict[arg]
                # Tuple-ish arguments are flattened into their members.
                if isinstance(arg, (Tuple, TupleGetItem)):
                    node_entry["inputs"] += node_list[in_node_idx]["inputs"]
                else:
                    node_entry["inputs"].append([in_node_idx, 0, 0])
            infer_out = relay.ir_pass.infer_type(node)
            out_type = infer_out._checked_type_
            if isinstance(out_type, TensorType):
                node_entry["types"].append(out_type)
            elif isinstance(out_type, TupleType):
                for tuple_type in out_type.fields:
                    node_entry["types"].append(tuple_type)
            else:
                raise RuntimeError("Unsupported output type %s in operator %s"
                                   % (type(out_type), op_name))

            # Utilize tracing target to fetch workload with topo-order.
            # Since we only need workload, dummy target can be used to
            # create task.
            if op_name in target_ops:
                params = []
                for i, input_idx in enumerate(node_entry["inputs"]):
                    input_node_entry = node_list[input_idx[0]]
                    input_type = input_node_entry["types"][input_idx[1]]
                    if not isinstance(input_node_entry["node"], (Var, Call)):
                        raise RuntimeError("Graph tuner can only tune target "
                                           "operators with input node of type "
                                           "relay.expr.Var or relay.expr.Call. Now "
                                           "find a target op %s with input type %s"
                                           % (op_name, str(type(input_node_entry["node"]))))
                    free_var = relay.Var("var_%d" % i, input_type)
                    params.append(free_var)
                call = relay.Call(node.op, params, node.attrs)
                func = relay.Function(params, call)
                relay.backend.compile_engine.get().clear()
                # Build in a separate thread so per-thread tracing state does
                # not leak into the caller.
                build_thread = threading.Thread(target=relay.build,
                                                args=(func,
                                                      "llvm -device=tracing",
                                                      None,
                                                      None))
                build_thread.start()
                build_thread.join()
        elif isinstance(node, Var):
            node_entry["name"] = node.name_hint
            node_entry["types"] = [node.type_annotation]
        elif isinstance(node, Function):
            # Ignore root node since it equals to input function expression
            if node != expr:
                _expr2graph_impl(node, target_ops, node_dict, node_list)
            return
        elif isinstance(node, TupleGetItem):
            node_entry["op"] = "TupleGetItem"
            in_node_idx = node_dict[node.tuple_value]
            node_entry["inputs"].append([in_node_idx, node.index, 0])
        elif isinstance(node, Tuple):
            node_entry["op"] = "Tuple"
            for tuple_item in node:
                in_node_idx = node_dict[tuple_item]
                if isinstance(tuple_item, TupleGetItem):
                    node_entry["inputs"] += node_list[in_node_idx]["inputs"]
                elif isinstance(tuple_item, Tuple):
                    raise RuntimeError("Graph tuner doesn't support nested tuple.")
                else:
                    node_entry["inputs"].append([in_node_idx, 0, 0])
        elif isinstance(node, Constant):
            pass
        elif isinstance(node, relay.op.op.Op):
            return
        else:
            raise RuntimeError("Not supported relay node type in graph tuning: %s"
                               % str(type(node)))
        node_dict[node] = node_index
        node_list.append(node_entry)

    relay.ir_pass.post_order_visit(expr, _traverse_expr)


def get_direct_ancestor(node_list, visited_dict, target_ops, node_idx, input_names):
    """Given a node_list in relay function and a node index, return the
    closest ancestor which has op_name as operator name or is multi_input operator.

    If node has multiple inputs, multiple ancestor nodes will be returned.

    Parameters
    ----------
    node_list : list of dict of str to object
        List of all nodes in a graph.

    visited_dict : dict of int to int
        Nodes and corresponding ancestors which have been visited.

    target_ops: List of str
        List of target relay base op name

    node_idx : int
        Input node index.

    input_names : list of str
        Names of graph input nodes.

    Returns
    -------
    out : list of int
        List of ancestor node index.
    """
    if node_idx in visited_dict:
        return visited_dict[node_idx]
    if is_input_node(node_list[node_idx], input_names):
        return [node_idx]
    node = node_list[node_idx]
    # Rule out injective operators: a node fed by any ruled-out op has
    # no usable ancestors.
    is_rule_out = False
    for item_idx in node["inputs"]:
        item = node_list[item_idx[0]]
        if item["op"] in RULE_OUT_NODE_NAMES:
            is_rule_out = True
            break
    if is_rule_out:
        visited_dict[node_idx] = []
        return []

    node_direct_ancestor = []
    for item_idx in node["inputs"]:
        item = node_list[item_idx[0]]
        is_multiple_inputs = has_multiple_inputs(node_list, item_idx[0], input_names)
        if item["op"] in target_ops or is_multiple_inputs:
            node_direct_ancestor.append(item_idx[0])
        else:
            tmp = get_direct_ancestor(node_list, visited_dict, target_ops,
                                      item_idx[0], input_names)
            for tmp_item in tmp:
                node_direct_ancestor.append(tmp_item)
    # Single-input nodes keep only their first ancestor.
    if not has_multiple_inputs(node_list, node_idx, input_names) and node_direct_ancestor:
        node_direct_ancestor = [node_direct_ancestor[0]]
    visited_dict[node_idx] = node_direct_ancestor
    return node_direct_ancestor
def get_in_nodes(node_list, target_ops, input_names):
    """Create a dictionary mapping from op_name nodes or multi_input
    nodes to closest input ancestors.

    Parameters
    ----------
    node_list : list of dict of str to object
        List of all nodes in a graph.

    target_ops: List of str
        List of target relay op

    input_names : list of str
        Names of graph input nodes.

    Returns
    -------
    out : dict of int to list of int
        Dictionary maps node index to closest input ancestors.
    """
    visited_dict = {}
    in_node_dict = {}
    # Populate visited_dict with the closest ancestors of every node.
    for i, node in enumerate(node_list):
        if node["op"] in RULE_OUT_NODE_NAMES:
            continue
        get_direct_ancestor(node_list, visited_dict, target_ops, i, input_names)
    # Keep only target-op nodes and multi-input nodes.
    for key, val in visited_dict.items():
        node = node_list[key]
        if node["op"] in target_ops or has_multiple_inputs(node_list, key, input_names):
            in_node_dict[key] = val

    # Iteratively drop nodes whose ancestor list became empty, detaching
    # them from their consumers until a fixed point is reached.
    out_node_dict = get_out_nodes(in_node_dict)
    while True:
        empty_nodes = [key for key, val in in_node_dict.items() if not val]
        if not empty_nodes:
            break
        for node in empty_nodes:
            del in_node_dict[node]
            if node in out_node_dict:
                for out_node in out_node_dict[node]:
                    in_node_dict[out_node].remove(node)

    return in_node_dict


def get_out_nodes(in_node_dict):
    """Create output dictionary from input dictionary.

    Parameters
    ----------
    in_node_dict : dict of int to list of int
        Dictionary maps node index to closest input ancestors.
        It can be created with get_in_nodes.

    Returns
    -------
    out : dict of int to list of int
        Dictionary maps node index to closest output nodes.
    """
    out_node_dict = {key: [] for key in in_node_dict}
    for key, ancestors in in_node_dict.items():
        for anc in ancestors:
            out_node_dict.setdefault(anc, []).append(key)
    return out_node_dict
def has_multiple_inputs(node_list, node_idx, input_names):
    """Check whether a node has multiple input nodes
    except variable nodes.

    Parameters
    ----------
    node_list : list of dict of str to object
        List of all nodes in a graph.

    node_idx : int
        Node index to be checked.

    input_names : list of str
        List of input names of graph.

    Returns
    -------
    out : bool
        Whether the specified node has multiple input nodes
    """
    node = node_list[node_idx]
    # Count inputs that are real ops or graph-level inputs; parameter
    # nodes (op == "null" and not a graph input) are excluded.
    num_inputs = sum(
        1 for entry in node["inputs"]
        if node_list[entry[0]]["op"] != "null"
        or is_input_node(node_list[entry[0]], input_names)
    )
    return num_inputs > 1


def is_input_node(node_entry, input_names):
    """Whether a node is an input node.

    Parameters
    ----------
    node_entry : dict
        Node entry.

    input_names : list of str
        List of input names of graph.

    Returns
    -------
    out : bool
        whether node is a input node.
    """
    if "name" not in node_entry:
        return False
    return node_entry["name"] in input_names


def bind_inputs(expr, input_shapes=None, input_dtypes="float32"):
    """Bind input variables of a relay function expression
    to new shapes and/or dtypes.

    Parameters
    ----------
    expr : tvm.relay.Expr.Function
        Input relay function expression.

    input_shapes : dict of str to tuple of int, optional
        Input shapes.

    input_dtypes : str or dict of str to str, optional
        Input dtypes.

    Returns
    -------
    out : tvm.relay.Expr.Function
        Bind relay function expression.
    """
    if input_shapes is None:
        return expr
    # A single dtype string applies uniformly to every input.
    if isinstance(input_dtypes, str):
        input_dtypes = {name: input_dtypes for name in input_shapes}

    updated_input_dict = {
        name: relay.var(name, shape=shape, dtype=input_dtypes[name])
        for name, shape in input_shapes.items()
    }

    rebind_dict = {
        param: updated_input_dict[param.name_hint]
        for param in expr.params
        if param.name_hint in updated_input_dict
    }
    updated_expr = relay.expr.bind(expr, rebind_dict)

    return relay.ir_pass.infer_type(updated_expr)
Parameters ---------- @@ -246,7 +250,12 @@ def pick_best(in_file, out_file): out_file: str or file The filename of output """ - best_context = ApplyHistoryBest(load_from_file(in_file)) + context = load_from_file(in_file) + if os.path.isfile(out_file): + out_context = load_from_file(out_file) + context = itertools.chain(context, out_context) + context, context_clone = itertools.tee(context) + best_context = ApplyHistoryBest(context) best_set = set() for v in best_context.best_by_model.values(): @@ -258,7 +267,7 @@ def pick_best(in_file, out_file): logger.info("Extract %d best records from the %s", len(best_set), in_file) fout = open(out_file, 'w') if isinstance(out_file, str) else out_file - for inp, res in load_from_file(in_file): + for inp, res in context_clone: if measure_str_key(inp) in best_set: fout.write(encode(inp, res) + "\n") best_set.remove(measure_str_key(inp)) diff --git a/python/tvm/autotvm/task/__init__.py b/python/tvm/autotvm/task/__init__.py index ff50a4ebc81d..0a0e6e1e8ac7 100644 --- a/python/tvm/autotvm/task/__init__.py +++ b/python/tvm/autotvm/task/__init__.py @@ -28,6 +28,7 @@ from .dispatcher import dispatcher, DispatchContext, ApplyConfig, ApplyHistoryBest, \ FallbackContext, clear_fallback_cache, ApplyGraphBest -from .topi_integration import register_topi_compute, register_topi_schedule +from .topi_integration import register_topi_compute, register_topi_schedule, \ + TaskExtractEnv from .nnvm_integration import extract_from_graph, extract_from_multiple_graph from .relay_integration import extract_from_program, extract_from_multiple_program diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 3c983768ab3e..c48d4f58edce 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -74,7 +74,7 @@ class TaskExtractEnv: """Global environment for extracting tuning tasks from nnvm graph""" current = None - def __init__(self): + def __init__(self, 
allow_duplicate=False): import topi # topi compute -> autotvm task name @@ -106,6 +106,7 @@ def __init__(self): topi.nn.deformable_conv2d_nchw: [topi.generic.schedule_deformable_conv2d_nchw], } + self.allow_duplicate = allow_duplicate self._register_tracing() self._register_topi_task() self.task_collection = [] @@ -123,10 +124,9 @@ def _tracing_topi_compute(*args, **kwargs): assert not kwargs, "Do not support extracting tuning tasks when" \ "kwargs is used in TOPI function call." \ "Please modify it to use only positional args." - if compute_func in self.wanted_topi_funcs: # record this call key = (self.topi_to_task[compute_func], serialize_args(args)) - if key not in self.task_collection: + if self.allow_duplicate or key not in self.task_collection: self.task_collection.append(key) return compute_func.fdefault(*args) _local_scope(topi_compute) @@ -262,20 +262,29 @@ def get_tasks(self): return self.task_collection @staticmethod - def get(): + def get(allow_duplicate=False): """Get the single instance of TaskExtractEnv + Parameters + ---------- + allow_duplicate : boolean + Whether to fetch all workloads in the network, + even though some of them are the same. This is + useful for graph tuning. + Returns ------- env: TaskExtractEnv The single instance of TaskExtractEnv """ if not TaskExtractEnv.current: - TaskExtractEnv.current = TaskExtractEnv() + TaskExtractEnv.current = TaskExtractEnv(allow_duplicate) + else: + TaskExtractEnv.current.allow_duplicate = allow_duplicate return TaskExtractEnv.current -def register_topi_compute(topi_compute, target_keys, template_keys, func=None): +def register_topi_compute(topi_compute, target_keys, template_keys, func=None, override=False): """Register a tunable template for a topi compute function. After the registration, this topi compute will become a configuration dispatcher. 
It uses @@ -324,7 +333,7 @@ def config_dispatcher(*args, **kwargs): config_dispatcher = _REGISTERED_DISPATCHER[target_key][topi_compute] - @config_dispatcher.register(template_keys) + @config_dispatcher.register(template_keys, override=override) def template_call(cfg, *args, **kwargs): """call the topi func and attach workload to compute node""" assert not kwargs, "Do not support kwargs in template function call" @@ -363,7 +372,7 @@ def template_call(cfg, *args, **kwargs): return _decorator -def register_topi_schedule(topi_schedule, target_keys, template_keys, func=None): +def register_topi_schedule(topi_schedule, target_keys, template_keys, func=None, override=False): """Register a tunable template for a topi schedule function. After the registration. This topi schedule will become a configuration dispatcher. It dispatches @@ -429,7 +438,7 @@ def traverse(tensors): config_dispatcher = _REGISTERED_DISPATCHER[target_key][topi_schedule] - @config_dispatcher.register(template_keys) + @config_dispatcher.register(template_keys, override=override) def template_call(cfg, outs, *args, **kwargs): """call the schedule func""" if f == topi_schedule.fdefault: diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index 120bf629a959..76170a844db1 100644 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -143,7 +143,8 @@ class BuildConfig(NodeBase): "double_buffer_split_loop": 1, "dump_pass_ir": False, "instrument_bound_checkers": False, - "disable_select_rewriting": False + "disable_select_rewriting": False, + "disable_vectorize": False } _dump_ir = DumpIR() @@ -186,7 +187,7 @@ def __enter__(self): def __exit__(self, ptype, value, trace): if self.dump_pass_ir: BuildConfig._dump_ir.exit() - _api_internal._ExitBuildConfigScope() + _api_internal._ExitBuildConfigScope(self) def __setattr__(self, name, value): if name in BuildConfig._node_defaults: @@ -384,7 +385,10 @@ def lower(sch, # Phase 2 if not simple_mode: stmt = ir_pass.LoopPartition(stmt, 
cfg.partition_const_loop) - stmt = ir_pass.VectorizeLoop(stmt) + if cfg.disable_vectorize: + stmt = ir_pass.SkipVectorize(stmt) + else: + stmt = ir_pass.VectorizeLoop(stmt) stmt = ir_pass.InjectVirtualThread(stmt) stmt = ir_pass.InjectDoubleBuffer(stmt, cfg.double_buffer_split_loop) stmt = ir_pass.StorageRewrite(stmt) diff --git a/python/tvm/contrib/cblas.py b/python/tvm/contrib/cblas.py index c656fcc2b966..7c024b792867 100644 --- a/python/tvm/contrib/cblas.py +++ b/python/tvm/contrib/cblas.py @@ -17,10 +17,10 @@ """External function interface to BLAS libraries.""" from __future__ import absolute_import as _abs -from .. import api as _api -from .. import intrin as _intrin +from .. import api as _api, intrin as _intrin -def matmul(lhs, rhs, transa=False, transb=False): + +def matmul(lhs, rhs, transa=False, transb=False, **kwargs): """Create an extern op that compute matrix mult of A and rhs with CrhsLAS This function serves as an example on how to call external libraries. @@ -44,7 +44,50 @@ def matmul(lhs, rhs, transa=False, transb=False): n = lhs.shape[1] if transa else lhs.shape[0] m = rhs.shape[0] if transb else rhs.shape[1] return _api.extern( - (n, m), [lhs, rhs], + (n, m), + [lhs, rhs], + lambda ins, outs: _intrin.call_packed( + "tvm.contrib.cblas.matmul", ins[0], ins[1], outs[0], transa, transb + ), + name="C", + **kwargs + ) + + +def batch_matmul(lhs, rhs, transa=False, transb=False, iterative=False, **kwargs): + """Create an extern op that compute batched matrix mult of A and rhs with CBLAS + This function serves as an example on how to call external libraries. + Parameters + ---------- + lhs : Tensor + The left matrix operand + rhs : Tensor + The right matrix operand + transa : bool + Whether transpose lhs + transb : bool + Whether transpose rhs + Returns + ------- + C : Tensor + The result tensor. 
+ """ + b = lhs.shape[0] + n = lhs.shape[2] if transa else lhs.shape[1] + m = rhs.shape[1] if transb else rhs.shape[2] + return _api.extern( + (b, n, m), + [lhs, rhs], lambda ins, outs: _intrin.call_packed( - "tvm.contrib.cblas.matmul", - ins[0], ins[1], outs[0], transa, transb), name="C") + "tvm.contrib.cblas.batch_matmul" + if not iterative + else "tvm.contrib.cblas.batch_matmul_iterative", + ins[0], + ins[1], + outs[0], + transa, + transb, + ), + name="C", + **kwargs + ) diff --git a/python/tvm/contrib/debugger/debug_result.py b/python/tvm/contrib/debugger/debug_result.py index c53a2c287339..882364dd3971 100644 --- a/python/tvm/contrib/debugger/debug_result.py +++ b/python/tvm/contrib/debugger/debug_result.py @@ -207,10 +207,8 @@ def dump_graph_json(self, graph): def display_debug_result(self): """Displays the debugger result" """ - header = ["Node Name", "Ops", "Time(us)", "Time(%)", "Start Time", \ - "End Time", "Shape", "Inputs", "Outputs"] - lines = ["---------", "---", "--------", "-------", "----------", \ - "--------", "-----", "------", "-------"] + header = ["Node Name", "Ops", "Time(us)", "Time(%)", "Shape", "Inputs", "Outputs"] + lines = ["---------", "---", "--------", "-------", "-----", "------", "-------"] eid = 0 data = [] total_time = sum(time[0] for time in self._time_list) @@ -223,12 +221,11 @@ def display_debug_result(self): continue name = node['name'] shape = str(self._output_tensor_list[eid].shape) - time_us = round(time[0] * 1000000, 2) - time_percent = round(((time[0] / total_time) * 100), 2) + time_us = round(time[0] * 1000000, 3) + time_percent = round(((time[0] / total_time) * 100), 3) inputs = str(node['attrs']['num_inputs']) outputs = str(node['attrs']['num_outputs']) - node_data = [name, op, time_us, time_percent, str(time[1]), str(time[2]), \ - shape, inputs, outputs] + node_data = [name, op, time_us, time_percent, shape, inputs, outputs] data.append(node_data) eid += 1 fmt = "" diff --git 
a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py index 01cda35769a5..f77a927eeabf 100644 --- a/python/tvm/contrib/debugger/debug_runtime.py +++ b/python/tvm/contrib/debugger/debug_runtime.py @@ -19,7 +19,6 @@ import os import tempfile import shutil -from datetime import datetime from tvm._ffi.base import string_types from tvm._ffi.function import get_global_func from tvm.contrib import graph_runtime @@ -30,6 +29,7 @@ _DUMP_ROOT_PREFIX = "tvmdbg_" _DUMP_PATH_PREFIX = "_tvmdbg_" + def create(graph_json_str, libmod, ctx, dump_root=None): """Create a runtime executor module given a graph and module. @@ -62,17 +62,23 @@ def create(graph_json_str, libmod, ctx, dump_root=None): try: fcreate = get_global_func("tvm.graph_runtime_debug.create") except ValueError: - raise ValueError("Please set '(USE_GRAPH_RUNTIME_DEBUG ON)' in " \ - "config.cmake and rebuild TVM to enable debug mode") + raise ValueError( + "Please set '(USE_GRAPH_RUNTIME_DEBUG ON)' in " + "config.cmake and rebuild TVM to enable debug mode" + ) ctx, num_rpc_ctx, device_type_id = graph_runtime.get_device_ctx(libmod, ctx) if num_rpc_ctx == len(ctx): libmod = rpc_base._ModuleHandle(libmod) try: - fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime_debug.remote_create") + fcreate = ctx[0]._rpc_sess.get_function( + "tvm.graph_runtime_debug.remote_create" + ) except ValueError: - raise ValueError("Please set '(USE_GRAPH_RUNTIME_DEBUG ON)' in " \ - "config.cmake and rebuild TVM to enable debug mode") + raise ValueError( + "Please set '(USE_GRAPH_RUNTIME_DEBUG ON)' in " + "config.cmake and rebuild TVM to enable debug mode" + ) func_obj = fcreate(graph_json_str, libmod, *device_type_id) return GraphModuleDebug(func_obj, ctx, graph_json_str, dump_root) @@ -100,10 +106,10 @@ class GraphModuleDebug(graph_runtime.GraphModule): To select which folder the outputs should be kept. 
None will make a temp folder in /tmp/tvmdbg and does the dumping """ + def __init__(self, module, ctx, graph_json_str, dump_root): self._dump_root = dump_root self._dump_path = None - self._debug_run = module["debug_run"] self._get_output_by_layer = module["get_output_by_layer"] self._run_individual = module["run_individual"] graph_runtime.GraphModule.__init__(self, module) @@ -181,13 +187,10 @@ def _run_debug(self): Time consumed for each execution will be set as debug output. """ - self.debug_datum._time_list = [] - + self.debug_datum._time_list = [ + [float(t) * 1e-6] for t in self.run_individual(10, 1, 1) + ] for i, node in enumerate(self.debug_datum.get_graph_nodes()): - start_time = datetime.now().time() - time_stamp = self._debug_run(i) - end_time = datetime.now().time() - self.debug_datum._time_list.append([time_stamp, start_time, end_time]) num_outputs = self.debug_datum.get_graph_node_output_num(node) for j in range(num_outputs): out_tensor = self._get_output_by_layer(i, j) @@ -212,8 +215,13 @@ def debug_get_output(self, node, out): ret = output_tensors[node] except: node_list = output_tensors.keys() - raise RuntimeError("Node " + node + " not found, available nodes are: " - + str(node_list) + ".") + raise RuntimeError( + "Node " + + node + + " not found, available nodes are: " + + str(node_list) + + "." 
+ ) elif isinstance(node, int): output_tensors = self.debug_datum._output_tensor_list ret = output_tensors[node] @@ -242,7 +250,9 @@ def run(self, **input_dict): self.debug_datum.display_debug_result() def run_individual(self, number, repeat=1, min_repeat_ms=0): - self._run_individual(number, repeat, min_repeat_ms) + ret = self._run_individual(number, repeat, min_repeat_ms) + return ret.strip(",").split(",") if ret else [] + def exit(self): """Exits the dump folder and all its contents""" diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index 4d0698a40db7..0c9ce404c48e 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -129,6 +129,7 @@ def __init__(self, module): self._get_input = module["get_input"] self._get_num_outputs = module["get_num_outputs"] self._load_params = module["load_params"] + self._share_params = module["share_params"] def set_input(self, key=None, value=None, **params): """Set inputs to the module via kwargs @@ -234,6 +235,19 @@ def load_params(self, params_bytes): """ self._load_params(bytearray(params_bytes)) + def share_params(self, other, params_bytes): + """Share parameters from pre-existing GraphRuntime instance. + + Parameters + ---------- + other: GraphRuntime + The parent GraphRuntime from which this instance should share + it's parameters. + params_bytes : bytearray + The serialized parameter dict (used only for the parameter names). + """ + self._share_params(other.module, bytearray(params_bytes)) + def __getitem__(self, key): """Get internal module function diff --git a/python/tvm/datatype.py b/python/tvm/datatype.py new file mode 100644 index 000000000000..df3e3a62a510 --- /dev/null +++ b/python/tvm/datatype.py @@ -0,0 +1,146 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Custom datatype functionality""" +from __future__ import absolute_import as _abs + +from ._ffi.function import register_func as _register_func +from . import make as _make +from .api import convert +from .expr import Call as _Call, Cast as _Cast, FloatImm as _FloatImm +from ._ffi.runtime_ctypes import TVMType as _TVMType +from . import _api_internal + + +def register(type_name, type_code): + """Register a custom datatype with the given type name and type code + Currently, the type code is manually allocated by the user, and the + user must ensure that no two custom types share the same code. + Generally, this should be straightforward, as the user will be + manually registering all of their custom types. 
+ + Parameters + ---------- + type_name : str + The name of the custom datatype + + type_code : int + The type's code, which should be >= kCustomBegin + """ + _api_internal._datatype_register(type_name, type_code) + + +def get_type_name(type_code): + """Get the type name from the type code + + Parameters + ---------- + type_code : int + The type code + """ + return _api_internal._datatype_get_type_name(type_code) + + +def get_type_code(type_name): + """Get the type code from the type name + + Parameters + ---------- + type_name : str + The type name + """ + return _api_internal._datatype_get_type_code(type_name) + + +def get_type_registered(type_code): + """Get a boolean representing whether the type is registered + + Parameters + ---------- + type_code: int + The type code + """ + return _api_internal._datatype_get_type_registered(type_code) + + +def register_op(lower_func, op_name, target, type_name, src_type_name=None): + """Register an external function which computes the given op. + + Currently, this will only work with Casts and binary expressions + whose arguments are named `a` and `b`. + TODO(gus) figure out what other special cases must be handled by + looking through expr.py. + + Parameters + ---------- + lower_func : function + The lowering function to call. See create_lower_func. + + op_name : str + The name of the operation which the function computes, given by its + Halide::Internal class name (e.g. Add, LE, Cast). + + target : str + The name of codegen target. + + type_name : str + The name of the custom datatype, e.g. posit (but not custom[posit]8). + + src_type_name : str + If op_name is "Cast", then this should be set to the source datatype of + the argument to the Cast. If op_name is not "Cast", this is unused. + """ + + if op_name == "Cast": + assert src_type_name is not None + lower_func_name = "tvm.datatype.lower." + target + "." + op_name + "." \ + + type_name + "." + src_type_name + else: + lower_func_name = "tvm.datatype.lower." 
+ target + "." + op_name + "." \ + + type_name + _register_func(lower_func_name, lower_func) + + +def create_lower_func(extern_func_name): + """Returns a function which lowers an operation to a function call. + + Parameters + ---------- + extern_func_name : str + The name of the extern "C" function to lower to + """ + + def lower(op): + """ + Takes an op---either a Cast or a binary op (e.g. an Add) and returns a + call to the specified external function, passing the op's argument + (Cast) or arguments (a binary op). The return type of the call depends + on the type of the op: if it is a custom type, then a uint of the same + width as the custom type is returned. Otherwise, the type is + unchanged.""" + dtype = op.dtype + t = _TVMType(dtype) + if get_type_registered(t.type_code): + dtype = "uint" + str(t.bits) + if t.lanes > 1: + dtype += "x" + str(t.lanes) + if isinstance(op, (_Cast, _FloatImm)): + return _make.Call(dtype, extern_func_name, convert([op.value]), + _Call.Extern, None, 0) + return _make.Call(dtype, extern_func_name, convert([op.a, op.b]), + _Call.Extern, None, 0) + + return lower diff --git a/python/tvm/expr.py b/python/tvm/expr.py index a234ac4da53b..9c8a9ab89d3b 100644 --- a/python/tvm/expr.py +++ b/python/tvm/expr.py @@ -222,7 +222,7 @@ def asnode(self): class Expr(ExprOp, NodeBase): """Base class of all tvm Expressions""" - # In Python3, We have to explicity tell interpreter to retain __hash__ if we overide __eq__ + # In Python3, We have to explicitly tell interpreter to retain __hash__ if we overide __eq__ # https://docs.python.org/3.1/reference/datamodel.html#object.__hash__ __hash__ = NodeBase.__hash__ @@ -349,6 +349,16 @@ def __init__(self, value): self.__init_handle_by_constructor__( _make.StringImm, value) + def __eq__(self, other): + if isinstance(other, ConstExpr): + return self.value == other.value + return self.value == other + + def __ne__(self, other): + if isinstance(other, ConstExpr): + return self.value != other.value + return 
self.value != other + @register_node class Cast(Expr): diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py index 6201681e0294..5536e503e6b6 100644 --- a/python/tvm/relay/__init__.py +++ b/python/tvm/relay/__init__.py @@ -25,7 +25,9 @@ from . import module from . import adt from . import ir_pass -from .build_module import build, build_config, create_executor, optimize +from . import transform +from .build_module import build, create_executor +from .transform import build_config from . import prelude from . import parser from . import debug @@ -97,9 +99,9 @@ var = expr.var const = expr.const bind = expr.bind -module_pass = ir_pass.module_pass -function_pass = ir_pass.function_pass -sequential_pass = ir_pass.sequential_pass +module_pass = transform.module_pass +function_pass = transform.function_pass +alpha_equal = ir_pass.alpha_equal # ExprFunctor ExprFunctor = expr_functor.ExprFunctor @@ -114,9 +116,9 @@ load_param_dict = param_dict.load_param_dict # Pass manager -PassInfo = ir_pass.PassInfo -PassContext = ir_pass.PassContext -Pass = ir_pass.Pass -ModulePass = ir_pass.ModulePass -FunctionPass = ir_pass.FunctionPass -SequentialPass = ir_pass.SequentialPass +PassInfo = transform.PassInfo +PassContext = transform.PassContext +Pass = transform.Pass +ModulePass = transform.ModulePass +FunctionPass = transform.FunctionPass +Sequential = transform.Sequential diff --git a/python/tvm/relay/_build_module.py b/python/tvm/relay/_build_module.py new file mode 100644 index 000000000000..bdbcbefff523 --- /dev/null +++ b/python/tvm/relay/_build_module.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=no-else-return, unidiomatic-typecheck, undefined-variable +"""The interface for building Relay functions exposed from C++.""" +from tvm._ffi.function import _init_api + +_init_api("relay.build_module", __name__) diff --git a/python/tvm/relay/_ir_pass.pyi b/python/tvm/relay/_ir_pass.pyi index 6aedb5248657..13035bb36f71 100644 --- a/python/tvm/relay/_ir_pass.pyi +++ b/python/tvm/relay/_ir_pass.pyi @@ -17,62 +17,8 @@ import tvm from . import ir -from .base import NodeBase from .env import Module - -class PassContext(NodeBase): - def __init__(self): - ... - -class PassInfo(NodeBase): - name = ... # type: str - opt_level = ... # type: int - required = ... # type: list - - def __init__(self, name, opt_level, required) - # type: (str, int, list) -> None - - -class Pass(NodeBase): - def __init__(self): - ... - - -class ModulePass(Pass): - name = ... # type: str - opt_level = ... # type: int - pass_func = ... # type: Callable - required = ... # type: list - - def __init__(self, name, opt_level, pass_func, required): - # type: (str, int, Callable, list) -> None - ... - - -class FunctionPass(Pass): - name = ... # type: str - opt_level = ... # type: int - pass_func = ... # type: Callable - required = ... # type: list - - def __init__(self, name, opt_level, pass_func, required): - # type: (str, int, Callable, list) -> None - ... - - -class SequentialPass(Pass): - name = ... # type: str - opt_level = ... # type: int - passes = ... # type: list - required = ... # type: list - disabled = ... 
# type: list - - def __init__(self, name, opt_level, passes, required, disabled): - # type: (str, int, list, list, list) -> None - ... - - def check_expr(env: Module, expr: ir.Expr) -> ir.Type: ... def generalize(env: Module, expr: ir.Expr) -> ir.Expr: ... def _get_checked_type(expr: ir.Expr) -> ir.Type: ... diff --git a/python/tvm/relay/_parser.py b/python/tvm/relay/_parser.py index 62f0ffe15cba..303f694896a5 100644 --- a/python/tvm/relay/_parser.py +++ b/python/tvm/relay/_parser.py @@ -242,10 +242,12 @@ def visitProg(self, ctx): self.visit_list(ctx.defn()) return self.module - return self.visit(ctx.expr()) + if ctx.expr(): + return self.visit(ctx.expr()) - # Exprs + return self.module + # Exprs def visitOpIdent(self, ctx): # type: (RelayParser.OpIdentContext) -> op.Op return op.get(ctx.CNAME().getText()) @@ -368,14 +370,25 @@ def mk_func(self, ctx): self.enter_var_scope() # Capture type params in params. self.enter_type_param_scope() + type_params = ctx.typeParamSeq() + + if type_params is not None: + type_params = type_params.ident() + assert type_params + for ty_param in type_params: + name = ty_param.getText() + self.mk_typ(name, ty.Kind.Type) + var_list, attr_list = self.visit(ctx.argList()) ret_type = self.getType_(ctx.type_()) + body = self.visit(ctx.body()) + # NB(@jroesch): you must stay in the type parameter scope until + # after you exit the body, you can reference the type parameters + # of your parent scopes. 
type_params = list(self.exit_type_param_scope()) if type_params: _, type_params = zip(*type_params) - - body = self.visit(ctx.body()) self.exit_var_scope() attrs = tvm.make.node("DictAttrs", **attr_list) if attr_list is not None else None @@ -453,16 +466,23 @@ def visitIncompleteType(self, ctx): # type (RelayParser.IncompleteTypeContext) -> None: return None - def visitIdentType(self, ctx): - # type: (RelayParser.IdentTypeContext) -> Union[ty.TensorType, str] - ident_type = ctx.CNAME().getText() + def visitTypeIdent(self, ctx): + # type: (RelayParser.TypeIdentContext) -> Union[ty.TensorType, str] + ''' + Handle type identifier. + ''' + type_ident = ctx.CNAME().getText() - # look through all type prefixes for a match + # Look through all type prefixes for a match for type_prefix in TYPE_PREFIXES: - if ident_type.startswith(type_prefix): - return ty.scalar_type(ident_type) + if type_ident.startswith(type_prefix): + return ty.scalar_type(type_ident) + + type_param = lookup(self.type_param_scopes, type_ident) + if type_param is not None: + return type_param - raise ParseError("Unknown builtin type: {}".format(ident_type)) + raise ParseError("Unknown builtin type: {}".format(type_ident)) # def visitCallType(self, ctx): # # type: (RelayParser.CallTypeContext) -> Union[expr.Expr, ty.TensorType] diff --git a/python/tvm/relay/_transform.py b/python/tvm/relay/_transform.py new file mode 100644 index 000000000000..273d97e0962a --- /dev/null +++ b/python/tvm/relay/_transform.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""FFI exposing the Relay type inference and checking.""" + +from tvm._ffi.function import _init_api + +_init_api("relay._transform", __name__) diff --git a/python/tvm/relay/backend/_backend.py b/python/tvm/relay/backend/_backend.py index 50e9694b40df..860788a4e5d0 100644 --- a/python/tvm/relay/backend/_backend.py +++ b/python/tvm/relay/backend/_backend.py @@ -17,7 +17,6 @@ """The interface of expr function exposed from C++.""" from __future__ import absolute_import -import logging from ... import build_module as _build from ... import container as _container from ..._ffi.function import _init_api, register_func @@ -50,8 +49,8 @@ def lower(sch, inputs, func_name, source_func): # pylint: disable=broad-except try: f = _build.lower(sch, inputs, name=func_name) - logging.debug("lower function %s", func_name) - logging.debug("%s", _build.lower(sch, inputs, simple_mode=True)) + # logging.debug("lower function %s", func_name) + # logging.debug("%s", _build.lower(sch, inputs, simple_mode=True)) except Exception: msg = traceback.format_exc() msg += "Error during compile function\n" diff --git a/python/tvm/relay/backend/graph_runtime_codegen.py b/python/tvm/relay/backend/graph_runtime_codegen.py index ea1846b93beb..cf31e9cff833 100644 --- a/python/tvm/relay/backend/graph_runtime_codegen.py +++ b/python/tvm/relay/backend/graph_runtime_codegen.py @@ -36,12 +36,9 @@ from __future__ import absolute_import from tvm.ndarray import empty -from tvm._ffi.function import _init_api - from tvm.relay import build_module from tvm import target as _target - 
-_init_api("tvm.relay.build_module") +from tvm import expr as _expr class GraphRuntimeCodegen(object): """The compiler from Relay to the TVM runtime system.""" @@ -57,17 +54,14 @@ def __init__(self, mod, target): self._setup(mod, target) def _setup(self, mod, target): - tgts = [] + tgts = {} if isinstance(target, dict): - for kv in target.items(): - tgts.append(kv[0]) - if isinstance(kv[1], (str, _target.Target)): - tgts.append(str(kv[1])) - else: + for dev, tgt in target.items(): + if not isinstance(tgt, (str, _target.Target)): raise Exception("Unknown target type") + tgts[dev] = _target.create(tgt) elif isinstance(target, (str, _target.Target)): - tgts.append("0") - tgts.append(str(target)) + tgts[_expr.IntImm("int32", 0)] = _target.create(target) self._init(mod, tgts) def codegen(self, func): diff --git a/python/tvm/relay/backend/interpreter.py b/python/tvm/relay/backend/interpreter.py index fc47f4e1b7c8..c54a65b78fb2 100644 --- a/python/tvm/relay/backend/interpreter.py +++ b/python/tvm/relay/backend/interpreter.py @@ -21,7 +21,8 @@ import numpy as np from . import _backend -from .. import _make, ir_pass +from .. import _make, ir_pass, transform +from .. import module from ... 
import register_func, nd from ..base import NodeBase, register_relay_node from ..expr import Tuple, RefCreate, Call, Constant, GlobalVar, Function, const @@ -73,9 +74,9 @@ class Closure(Value): @register_relay_node class ConstructorValue(Value): - def __init__(self, constructor, fields, types): + def __init__(self, tag, fields, constructor, types): self.__init_handle_by_constructor__( - _make.ConstructorValue, constructor, fields, types) + _make.ConstructorValue, tag, fields, constructor, types) @register_relay_node @@ -118,6 +119,8 @@ def _arg_to_ast(arg): return Constant(arg.data.copyto(nd.cpu(0))) elif isinstance(arg, TupleValue): return Tuple([_arg_to_ast(field) for field in arg.fields]) + elif isinstance(arg, tuple): + return Tuple([_arg_to_ast(field) for field in arg]) elif isinstance(arg, RefValue): return RefCreate(_arg_to_ast(arg.value)) elif isinstance(arg, ConstructorValue): @@ -189,14 +192,14 @@ def _convert_args(self, expr, args, kwargs): return tuple(cargs) - def _make_executor(self, _): + def _make_executor(self, expr=None): """ Construct a Python function that implements the evaluation of expression. Parameters ---------- - expr: relay.Expr + expr: Optional[relay.Expr] The Relay expression to execute. Returns @@ -206,16 +209,16 @@ def _make_executor(self, _): """ raise NotImplementedError() - def evaluate(self, expr, binds=None): + def evaluate(self, expr=None, binds=None): """ Evaluate a Relay expression on the executor. Parameters ---------- - expr: tvm.relay.Expr + expr: Optional[tvm.relay.Expr] The expression to evaluate. - binds: Map[tvm.relay.Var, tvm.relay.Expr] + binds: Optional[Map[tvm.relay.Var, tvm.relay.Expr]] Additional binding of free variable. 
Returns @@ -230,6 +233,9 @@ def evaluate(self, expr, binds=None): scope_builder.ret(expr) expr = scope_builder.get() + if not expr: + return self._make_executor() + if isinstance(expr, Function): assert not ir_pass.free_vars(expr) @@ -262,46 +268,47 @@ def __init__(self, mod, ctx, target): self.target = target self._intrp = _backend.CreateInterpreter(mod, ctx, target) - def optimize(self, expr): - """Optimize an expr. - - Parameters - ---------- - expr : Expr - The expression to be optimized. + def optimize(self): + """Optimize functions in a module. Returns ------- - opt_expr : Expr - The optimized expression. + opt_mod : tvm.relay.Module + The optimized module. """ - # TODO: We need to move this optimization code into the optimizer/pass manager - wrapped_expr = expr if isinstance(expr, Function) else Function([], expr) - if self.mod: - self.mod[self.mod.entry_func] = wrapped_expr - ck_expr = ir_pass.infer_type(wrapped_expr, mod=self.mod) - simp_expr = ir_pass.simplify_inference(ck_expr) - ck_simp = ir_pass.infer_type(simp_expr, mod=self.mod) - fused_expr = ir_pass.fuse_ops(ck_simp, 0, mod=self.mod) - ck_fused = ir_pass.infer_type(fused_expr, mod=self.mod) - return ck_fused if isinstance(expr, Function) else Call(ck_fused, []) - - def _make_executor(self, expr): + seq = transform.Sequential([transform.SimplifyInference(), + transform.FuseOps(0), + transform.InferType()]) + return seq(self.mod) + + def _make_executor(self, expr=None): + if expr is None or isinstance(expr, GlobalVar): + assert self.mod is not None def _interp_wrapper(*args, **kwargs): - args = self._convert_args(expr, args, kwargs) + if expr is None: + args = self._convert_args(self.mod[self.mod.entry_func], args, kwargs) + else: + args = self._convert_args(expr, args, kwargs) relay_args = [] for arg in args: relay_args.append(_arg_to_ast(arg)) - if isinstance(expr, GlobalVar): - func = self.mod[expr] - func = self.optimize(func) - self.mod._add(expr, func, True) - opt_expr = Call(expr, relay_args) 
- return self._intrp(opt_expr) + # Set the entry function for the module. + if expr is None: + pass + elif isinstance(expr, GlobalVar): + self.mod[self.mod.entry_func] = self.mod[expr] else: - call = Call(expr, relay_args) - opt_expr = self.optimize(call) - return self._intrp(opt_expr) + assert isinstance(expr, Function) + func = Function([], Call(expr, relay_args)) + relay_args = [] + if self.mod: + self.mod[self.mod.entry_func] = func + else: + self.mod = module.Module.from_expr(func) + + mod = self.optimize() + opt_expr = Call(mod[self.mod.entry_func.name_hint], relay_args) + return self._intrp(opt_expr) return _interp_wrapper diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index bebadd167fe9..ceb403fe7717 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -20,24 +20,45 @@ Implements a Python interface to compiling and executing on the Relay VM. """ +import numpy as np + import tvm from tvm._ffi.function import Object -import numpy as np -from .. import ir_pass +from .. import transform from ..backend.interpreter import Executor -from ..expr import GlobalVar, Function, Expr +from ..expr import GlobalVar, Expr from . import _vm Object = Object -def optimize(expr, mod=None): - # TODO: We need to move this optimization code into the optimizer/pass manager - ck_expr = ir_pass.infer_type(expr, mod=mod) - simplified_expr = ir_pass.simplify_inference(ck_expr) - simplified_expr = ir_pass.infer_type(simplified_expr, mod=mod) - fused_expr = ir_pass.fuse_ops(simplified_expr, mod=mod) - ck_fused = ir_pass.infer_type(fused_expr, mod=mod) - return ck_fused +def optimize(mod): + """Perform several optimizations on a module before executing it in the + Relay virtual machine. + + Parameters + ---------- + mod : tvm.relay.Module + The module to optimize. + + Returns + ------- + ret : tvm.relay.Module + The optimized module. 
+ """ + main_func = mod[mod.entry_func] + + opt_passes = [] + if not main_func.params and isinstance(main_func.body, GlobalVar): + opt_passes.append(transform.EtaExpand()) + + opt_passes = opt_passes + [ + transform.SimplifyInference(), + transform.FuseOps(), + transform.InferType() + ] + + seq = transform.Sequential(opt_passes) + return seq(mod) def _convert(arg, cargs): if isinstance(arg, np.ndarray): @@ -76,15 +97,7 @@ def _eval_vm(mod, ctx, *args): args: List[tvm.NDArray, np.ndarray] The arguments to evaluate. """ - main_func = mod[mod.entry_func] - - if not main_func.params and isinstance(main_func.body, GlobalVar): - main_func = ir_pass.eta_expand(main_func.body, mod) - - assert isinstance(main_func, Function) - main_func = optimize(mod[mod.entry_func], mod) - mod[mod.entry_func] = main_func - + mod = optimize(mod) args = list(args) assert isinstance(args, list) cargs = convert(args) @@ -117,9 +130,11 @@ def __init__(self, mod, ctx, target): self.ctx = ctx self.target = target - def _make_executor(self, expr): - assert isinstance(expr, Expr) - self.mod[self.mod.entry_func] = expr + def _make_executor(self, expr=None): + expr = expr if expr else self.mod + assert expr, "either expr or self.mod should be not null." + if isinstance(expr, Expr): + self.mod[self.mod.entry_func] = expr main = self.mod[self.mod.entry_func] def _vm_wrapper(*args, **kwargs): diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index c8b69e011543..1aa4d5ae57c4 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -18,219 +18,128 @@ Construct the necessary state for the TVM graph runtime from a Relay expression. """ -import warnings +import numpy as np -from tvm._ffi.runtime_ctypes import TVMContext -from ..build_module import build as _tvm_build_module +from tvm import expr as tvm_expr from .. import nd as _nd, target as _target, autotvm from ..contrib import graph_runtime as _graph_rt +from . import _build_module from . 
import ir_pass -from . import expr as _expr from . import ty as _ty +from . import expr as _expr from .backend import interpreter as _interpreter -from .backend import graph_runtime_codegen as _graph_gen from .backend.vm import VMExecutor -# List of optimization pass and level when switch on -OPT_PASS_LEVEL = { - "SimplifyInference": 0, - "OpFusion": 1, - "FoldConstant": 2, - "CombineParallelConv2D": 3, - "FoldScaleAxis": 3, - "AlterOpLayout": 3, - "CanonicalizeOps": 3, - "EliminateCommonSubexpr": 3, -} +def _update_target(target): + target = target if target else _target.current_target() + if target is None: + raise ValueError("Target is not set in env or passed as argument.") + tgts = {} + if isinstance(target, (str, _target.Target)): + dev_type = tvm_expr.IntImm("int32", _nd.context(str(target)).device_type) + tgts[dev_type] = _target.create(target) + elif isinstance(target, dict): + for dev, tgt in target.items(): + dev_type = tvm_expr.IntImm("int32", _nd.context(dev).device_type) + tgts[dev_type] = _target.create(tgt) + else: + raise TypeError("target is expected to be str or " + + "tvm.target.Target, but received " + + "{}".format(type(target))) + return tgts -class BuildConfig(object): - """Configuration scope to set a build config option. - Parameters - ---------- - kwargs - Keyword arguments of configurations to set. +class BuildModule(object): + """Build a Relay function to run on TVM graph runtime. This class is used + to expose the `RelayBuildModule` APIs implemented in C++. 
""" - current = None - defaults = { - "opt_level": 2, - "add_pass": None, - "fallback_device": None, - } - - def __init__(self, **kwargs): - self._old_scope = None - for k, _ in kwargs.items(): - if k not in BuildConfig.defaults: - raise ValueError("invalid argument %s, candidates are %s" % - (k, BuildConfig.defaults.keys())) - self._attr = kwargs - - def __getattr__(self, name): - if name not in self._attr: - return BuildConfig.defaults[name] - return self._attr[name] - - def __enter__(self): - # pylint: disable=protected-access - self._old_scope = BuildConfig.current - attr = BuildConfig.current._attr.copy() - attr.update(self._attr) - self._attr = attr - BuildConfig.current = self - return self - - def __exit__(self, ptype, value, trace): - assert self._old_scope - BuildConfig.current = self._old_scope - - def pass_enabled(self, pass_name): - """Get whether pass is enabled. - + def __init__(self): + self.mod = _build_module._BuildModule() + self._get_graph_json = self.mod["get_graph_json"] + self._get_module = self.mod["get_module"] + self._build = self.mod["build"] + self._set_params_func = self.mod["set_params"] + self._get_params_func = self.mod["get_params"] + + def build(self, func, target=None, target_host=None, params=None): + """ Parameters ---------- - pass_name : str - The optimization pass name + func: relay.Function + The function to build. + + target : str, :any:`tvm.target.Target`, or dict of str(i.e. + device/context name) to str/tvm.target.Target, optional + For heterogeneous compilation, it is a dictionary indicating context + to target mapping. For homogeneous compilation, it is a build target. + + target_host : str or :any:`tvm.target.Target`, optional + Host compilation target, if target is device. + When TVM compiles device specific program such as CUDA, + we also need host(CPU) side code to interact with the driver + to setup the dimensions and parameters correctly. + target_host is used to specify the host side codegen target. 
+ By default, llvm is used if it is enabled, + otherwise a stackvm intepreter is used. + + params : dict of str to NDArray + Input parameters to the graph that do not change + during inference time. Used for constant folding. Returns ------- - enabled : bool - Whether pass is enabled. - """ - if self.add_pass and pass_name in self.add_pass: - return True - return self.opt_level >= OPT_PASS_LEVEL[pass_name] - - -BuildConfig.current = BuildConfig() - - -def build_config(**kwargs): - """Configure the build behavior by setting config variables. - - Parameters - ---------- - opt_level: int, default=2 - Optimization level. See OPT_PASS_LEVEL for level of each pass. - - add_pass: set of str - Optimization pass to be added regardless of optimization level. - - fallback_device : str or tvm.TVMContext - The fallback device. It is also used as the default device for - operators without specified device during heterogeneous execution. - - Returns - ------- - config: BuildConfig - The build configuration - """ - return BuildConfig(**kwargs) - - -def _bind_params_by_name(func, params): - """Bind parameters of function by its name.""" - name_dict = {} - for arg in func.params: - name = arg.name_hint - if name in name_dict: - name_dict[name] = None - else: - name_dict[name] = arg - bind_dict = {} - for k, v in params.items(): - if k not in name_dict: - continue - arg = name_dict[k] - if arg is None: - raise ValueError("Multiple args in the function have name %s" % k) - bind_dict[arg] = _expr.const(v) - return _expr.bind(func, bind_dict) - - -def optimize(func, target=None, params=None): - """Perform target invariant optimizations. + graph_json : str + The json string that can be accepted by graph runtime. - Parameters - ---------- - func : tvm.relay.Function - The input to optimization. + mod : tvm.Module + The module containing necessary libraries. - target : Optional[:any:`tvm.target.Target`, Dict[int, tvm.target.Target]] - The optimization target. 
For heterogeneous compilation, it is a - dictionary mapping device type to compilation target. For homogeneous - compilation, it is a build target. - - params : Optional[Dict[str, tvm.nd.NDArray]] - Input parameters to the graph that do not change - during inference time. used for constant folding. + params : dict + The parameters of the final graph. + """ + target = _update_target(target) - Returns - ------- - opt_func : tvm.relay.Function - The optimized version of the function. - """ - cfg = BuildConfig.current - - # bind expressions - if params: - func = _bind_params_by_name(func, params) - - if cfg.pass_enabled("SimplifyInference"): - func = ir_pass.infer_type(func) - func = ir_pass.simplify_inference(func) - - if cfg.pass_enabled("EliminateCommonSubexpr"): - def fskip(expr): - if isinstance(expr, _expr.Call) and expr.op.name == 'cast' and \ - expr.attrs.dtype == 'int32': - return True - return False - - func = ir_pass.infer_type(func) - func = ir_pass.eliminate_common_subexpr(func, fskip) - - if cfg.pass_enabled("CombineParallelConv2D"): - func = ir_pass.infer_type(func) - func = ir_pass.combine_parallel_conv2d(func) - - # The constant folding pass is necessary because FoldScaleAxis pass needs - # to check the constantness and positiveness of scales. - if cfg.pass_enabled("FoldConstant"): - func = ir_pass.fold_constant(func) - - if cfg.pass_enabled("FoldScaleAxis"): - func = ir_pass.infer_type(func) - func = ir_pass.backward_fold_scale_axis(func) - func = ir_pass.infer_type(func) - func = ir_pass.forward_fold_scale_axis(func) - func = ir_pass.fold_constant(func) - - if cfg.pass_enabled("CanonicalizeOps"): - func = ir_pass.infer_type(func) - func = ir_pass.canonicalize_ops(func) - - # FIXME(zhiics) Skip AlterOpLayout pass for heterogeneous compilation for - # now. We probably need to pass target to this pass as well. Fix it in - # a followup PR. 
- if cfg.pass_enabled("AlterOpLayout"): - if isinstance(target, _target.Target): - func = ir_pass.infer_type(func) - with target: - func = ir_pass.alter_op_layout(func) - elif isinstance(target, dict): - warnings.warn("AlterOpLayout pass is not enabled for heterogeneous" - " execution yet.") - - if cfg.pass_enabled("FoldConstant"): - func = ir_pass.fold_constant(func) - - return func + # Setup the params. + if params: + self._set_params(params) + # Build the function + self._build(func, target, target_host) + # Get artifacts + graph_json = self.get_json() + mod = self.get_module() + params = self.get_params() + + return graph_json, mod, params + + def _set_params(self, params): + inputs = {} + for name, param in params.items(): + if isinstance(param, np.ndarray): + param = _nd.array(param) + inputs[name] = _expr.const(param) + self._set_params_func(inputs) + + def get_json(self): + """Return the json file of the built program.""" + return self._get_graph_json() + + def get_module(self): + """Return the built module.""" + return self._get_module() + + def get_params(self): + """Return the updated weights.""" + params = self._get_params_func() + ret = {} + for key, value in params.items(): + ret[key] = value.data + return ret def build(func, target=None, target_host=None, params=None): - """Build a function to run on TVM graph runtime. + """Helper function that builds a Relay function to run on TVM graph + runtime. Parameters ---------- @@ -266,146 +175,28 @@ def build(func, target=None, target_host=None, params=None): params : dict The parameters of the final graph. 
""" - target = target if target else _target.current_target() - if target is None: - raise ValueError("Target is not set in env or passed as argument.") + target = _update_target(target) - if isinstance(target, dict): - target, fallback_device = _update_heterogeneous_inputs(target) - elif isinstance(target, (str, _target.Target)): - target = _target.create(target) - else: - raise ValueError("target must be the type of str, tvm.target.Target," + - "or dict of device name to target") + if isinstance(target_host, (str, _target.Target)): + target_host = _target.create(target_host) + elif target_host: + raise ValueError("target host must be the type of str, " + + "tvm.target.Target, or None") # If current dispatch context is fallback context (the default root context), # then load pre-tuned parameters from TopHub if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext): - if isinstance(target, dict): - tophub_context = autotvm.tophub.context(list(target.values())) - else: - tophub_context = autotvm.tophub.context(target) + tophub_context = autotvm.tophub.context(list(target.values())) else: tophub_context = autotvm.util.EmptyContext() - cfg = BuildConfig.current - with tophub_context: - func = optimize(func, target, params) - # Annotate the ops for heterogeneous execution. 
- if isinstance(target, dict): - func, target = _run_device_annotation_passes(func, target, - fallback_device) - # Fuse ops before running code gen - func = ir_pass.infer_type(func) - func = ir_pass.fuse_ops(func, cfg.opt_level) - # Graph code generation - func = ir_pass.infer_type(func) - graph_gen = _graph_gen.GraphRuntimeCodegen(mod=None, target=target) - graph_json, lowered_funcs, params = graph_gen.codegen(func) - mod = _tvm_build_module( - lowered_funcs, target=target, target_host=target_host) + bld_mod = BuildModule() + graph_json, mod, params = bld_mod.build(func, target, target_host, + params) return graph_json, mod, params -def _update_heterogeneous_inputs(target): - """Update the target and fallback device required for heterogeneous - compilation. CPU is used as the fallback device if it wasn't provided. - Meanwhile, a CPU device type and "llvm" pair will be added to the target - dictionary in this case. - - Parameters - ---------- - target : dict of str(i.e. device/context name) to str/tvm.target.Target. - A dict contains context to target pairs. - - Returns - ------- - device_target : dict of int to tvm.target.Target. - The updated device type to target dict. - - fallback_device : int - The updated fallback device type. - """ - if not isinstance(target, dict): - raise ValueError("target must be dict of device name to target for " + - "heterogeneous execution, but received %s." - % type(target)) - - fallback_device = BuildConfig.current.fallback_device - if fallback_device is None: - # cpu is used as the default fallback device when heterogeneous - # execution is needed, but no fallback device is provided. 
- fallback_device = _nd.cpu(0).device_type - target[fallback_device] = str(_target.create("llvm")) - elif isinstance(fallback_device, str): - fallback_device = _nd.context(fallback_device).device_type - elif isinstance(fallback_device, TVMContext): - fallback_device = fallback_device.device_type - else: - raise ValueError("fallback_device expects the type of str or " + - "TVMContext, but received %s." % type(fallback_device)) - - device_target = {} - for dev, tgt in target.items(): - device_target[_nd.context(dev).device_type] = _target.create(tgt) - - if fallback_device not in device_target: - raise ValueError("%s is used as the default device, but the target" + - "is not provided." - % _nd.context(fallback_device).device_name) - return device_target, fallback_device - - -def _run_device_annotation_passes(func, target, fallback_device): - """Execute the device annotation passes to update the input program and - target information. - - Parameters - ---------- - func: tvm.relay.Function - The function where annotation passes will be execute at. - - target : Dict[int, tvm.target.Target] - A dict contains device type to target pairs. - - fallback_device : int - The fallback device type. - - Returns - ------- - target : Dict[int, tvm.target.Target] - The updated device type to target dict. - - func : tvm.relay.Function - The updated func. - """ - func = ir_pass.infer_type(func) - func = ir_pass.rewrite_annotated_ops(func, fallback_device) - device_map = ir_pass.collect_device_info(func) - # The expression to device type map will be empty if all or none of - # the expressions in the `func` are annotated because this map is - # obtained by propagating the device information in the device copy - # operator. None of the above cases needs device copy operator. - if not device_map: - annotation_map = ir_pass.collect_device_annotation_ops(func) - # No annotation. 
- if not annotation_map: - target = {0: target[fallback_device]} - else: - dev_type = next(iter(annotation_map.values())) - # All annotated with the same device type. - if all(val == dev_type for val in annotation_map.values()): - target = {0: target[dev_type]} - else: - raise RuntimeError("Expressions in the function are " - "annotated with various device types," - "but not device copy operators " - "found. Please check the " - "RewriteAnnotation pass.") - return func, target - - class GraphExecutor(_interpreter.Executor): """Wrapper around Executor interface. @@ -428,16 +219,19 @@ def __init__(self, mod, ctx, target): self.ctx = ctx self.target = target - def _make_executor(self, func): - ret_type = ir_pass.infer_type(func).ret_type + def _make_executor(self, expr=None): + if not expr: + assert self.mod, "either expr or self.mod should be not null." + expr = self.mod[self.mod.entry_func] + ret_type = ir_pass.infer_type(expr).ret_type num_outputs = len(ret_type.fields) if isinstance(ret_type, _ty.TupleType) else 1 - graph_json, mod, params = build(func, target=self.target) + graph_json, mod, params = build(expr, target=self.target) gmodule = _graph_rt.create(graph_json, mod, self.ctx) if params: gmodule.set_input(**params) def _graph_wrapper(*args, **kwargs): - args = self._convert_args(func, args, kwargs) + args = self._convert_args(expr, args, kwargs) # Create map of inputs. 
for i, arg in enumerate(args): gmodule.set_input(i, arg) diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py index 98b4a83e09de..8e7f95c4dc26 100644 --- a/python/tvm/relay/expr.py +++ b/python/tvm/relay/expr.py @@ -70,6 +70,38 @@ def astype(self, dtype): def __neg__(self): return _op_make.negative(self) + def __lt__(self, other): + if isinstance(other, Expr): + return _op_make.less(self, other) + elif isinstance(other, _Number): + raise TypeError('convert "%s" with `const` first' % str(other)) + else: + raise TypeError("type %s not supported" % str(type(other))) + + def __gt__(self, other): + if isinstance(other, Expr): + return _op_make.greater(self, other) + elif isinstance(other, _Number): + raise TypeError('convert "%s" with `const` first' % str(other)) + else: + raise TypeError("type %s not supported" % str(type(other))) + + def __ge__(self, other): + if isinstance(other, Expr): + return _op_make.greater_equal(self, other) + elif isinstance(other, _Number): + raise TypeError('convert "%s" with `const` first' % str(other)) + else: + raise TypeError("type %s not supported" % str(type(other))) + + def __le__(self, other): + if isinstance(other, Expr): + return _op_make.less_equal(self, other) + elif isinstance(other, _Number): + raise TypeError('convert "%s" with `const` first' % str(other)) + else: + raise TypeError("type %s not supported" % str(type(other))) + def __add__(self, other): if isinstance(other, Expr): return _op_make.add(self, other) diff --git a/python/tvm/relay/frontend/__init__.py b/python/tvm/relay/frontend/__init__.py index 8d308c7e8833..76761fd78325 100644 --- a/python/tvm/relay/frontend/__init__.py +++ b/python/tvm/relay/frontend/__init__.py @@ -30,3 +30,4 @@ from .coreml import from_coreml from .caffe2 import from_caffe2 from .tensorflow import from_tensorflow +from .darknet import from_darknet diff --git a/python/tvm/relay/frontend/caffe2.py b/python/tvm/relay/frontend/caffe2.py index e92a6226072f..18489b380ee7 100644 --- 
a/python/tvm/relay/frontend/caffe2.py +++ b/python/tvm/relay/frontend/caffe2.py @@ -20,6 +20,7 @@ import tvm from .. import ir_pass from .. import expr as _expr +from .. import module as _module from .. import op as _op from ... import nd as _nd from .common import AttrCvt, Renamer @@ -382,6 +383,7 @@ def __init__(self, shape, dtype): self._ops = {} self._shape = shape self._dtype = dtype + self._mod = _module.Module({}) def from_caffe2(self, init_net, predict_net): """Construct Relay expression from caffe2 graph. @@ -393,8 +395,9 @@ def from_caffe2(self, init_net, predict_net): Returns ------- - func : tvm.relay.expr.Function - Compatible relay function + mod : tvm.relay.Module + The module that optimizations will be performed on. + params : dict A dict of name: tvm.nd.array pairs, used as pretrained weights """ @@ -448,8 +451,9 @@ def from_caffe2(self, init_net, predict_net): outputs = out[0] func = _expr.Function(ir_pass.free_vars(outputs), outputs) + self._mod[self._mod.entry_func] = func - return func, self._params + return self._mod, self._params def _get_node(self, blob): """Get the Symbol of blob and detect cyclic dependency in the graph.""" @@ -505,7 +509,7 @@ def _convert_operator(self, identity_list=None, convert_map=None): """Convert from Caffe2 operator to Relay operator. - The converter must specify conversions explicity for incompatible name, and + The converter must specify conversions explicitly for incompatible name, and apply handlers to operator attributes. Parameters @@ -560,8 +564,8 @@ def from_caffe2(init_net, predict_net, shape=None, dtype="float32"): Returns ------- - sym : tvm.relay.expr.Function - Compatible relay function + mod : tvm.relay.Module + The module that optimizations will be performed on. 
params : dict of str to tvm.ndarray Dict of converted parameters stored in tvm.ndarray format diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py index 9b89936de015..efd198803c2b 100644 --- a/python/tvm/relay/frontend/common.py +++ b/python/tvm/relay/frontend/common.py @@ -241,7 +241,7 @@ def get_relay_op(op_name): op = None else: # try search op in various modules - for candidate in (_op, _op.nn, _op.image): + for candidate in (_op, _op.nn, _op.image, _op.vision): op = getattr(candidate, op_name, None) if op is not None: break @@ -286,7 +286,7 @@ def clear_padding(self): class AttrCvt(object): - """Common attribute conveter. An AttrConverter instance is a callable: + """Common attribute converter. An AttrConverter instance is a callable: ``` attr_converter = AttrConverter(op_name, transforms={'a':'b', 'c':('d', 1)}) new_op_name, new_attr = attr_converter(attrs) @@ -300,12 +300,12 @@ class AttrCvt(object): `op_name = func(attr)` transforms : dict of `new_name, or (new_name, default_value, transform function)` If only a new_name is provided, it's like renaming the attribute name. - If default_value if provded, then the attribute is considered as optional. + If default_value if provided, then the attribute is considered as optional. If transform function is provided, the original attribute value is handled by transform function. excludes : list A list of excluded attributes that should `NOT` appear. - Raise NotImplementedError if occured. + Raise NotImplementedError if occurred. disables : list A list of attributes that is disabled in relay. Log warnings. ignores : list diff --git a/python/tvm/relay/frontend/coreml.py b/python/tvm/relay/frontend/coreml.py index 653df92b71fc..1cac547d07c9 100644 --- a/python/tvm/relay/frontend/coreml.py +++ b/python/tvm/relay/frontend/coreml.py @@ -21,6 +21,7 @@ import tvm from .. import ir_pass from .. import expr as _expr +from .. import module as _module from .. import op as _op from ... 
import nd as _nd from ..._ffi import base as _base @@ -416,8 +417,8 @@ def from_coreml(model, shape=None): Returns ------- - func : tvm.relay.Function - Compatible relay Function. + mod : tvm.relay.Module + The relay module for compilation. params : dict of str to tvm.NDArray The parameter dict to be used by Relay. @@ -463,4 +464,4 @@ def from_coreml(model, shape=None): outexpr = outexpr[0] func = _expr.Function(ir_pass.free_vars(outexpr), outexpr) params = {k:_nd.array(np.array(v, dtype=np.float32)) for k, v in etab.params.items()} - return func, params + return _module.Module.from_expr(func), params diff --git a/python/tvm/relay/frontend/darknet.py b/python/tvm/relay/frontend/darknet.py new file mode 100644 index 000000000000..7b26ed5692df --- /dev/null +++ b/python/tvm/relay/frontend/darknet.py @@ -0,0 +1,849 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=unused-argument +""" +DarkNet symbol frontend for Relay. +""" + +from __future__ import absolute_import as _abs +from enum import Enum +import numpy as np +import tvm +from .. import ir_pass +from .. import expr as _expr +from .. 
import module as _module +from .common import get_relay_op, new_var + +__all__ = ['from_darknet'] + +def _darknet_not_support(attr, op='relay'): + """Raise error if any operation is not supported.""" + err = "{} is not supported in {}.".format(attr, op) + raise NotImplementedError(err) + +def _get_params_prefix(opname, layer_num): + """Makes the params prefix name from opname and layer number.""" + return str(opname) + str(layer_num) + +def _get_params_name(prefix, item): + """Makes the params name for the k,v pair.""" + return prefix + '_'+ item + +def _get_param_var(params, prefix, item): + name = _get_params_name(prefix, item) + if name not in params: + raise AttributeError("{} not found in params dict.".format(name)) + return new_var(name, shape=params[name].shape, dtype=params[name].dtype) + +def _darknet_maxpooling(inputs, params, attrs, prefix): + """Process the max pool 2d operation.""" + new_attrs = {} + kernel = attrs.get('kernel') + strides = attrs.get('stride', 1) + pads = attrs.get('pad', 1) + new_attrs['pool_size'] = (kernel, kernel) + new_attrs['strides'] = (strides, strides) + new_attrs['padding'] = (pads, pads) + extra_pad_size = attrs.get('extra_pad_size', 0) + if extra_pad_size: + pad_width = ((0, 0), (0, 0), (0, extra_pad_size), (0, extra_pad_size)) + inputs = [get_relay_op('pad')(*inputs, + pad_width=pad_width, + pad_value=np.finfo(np.float32).min)] + return get_relay_op('max_pool2d')(*inputs, **new_attrs) + +def _darknet_avgpooling(inputs, params, attrs, prefix): + """Process the average pool 2d operation.""" + new_attrs = {} + kernel = attrs.get('kernel') + strides = attrs.get('stride', 1) + pads = attrs.get('pad', 0) + + new_attrs['pool_size'] = (kernel, kernel) + new_attrs['strides'] = (strides, strides) + new_attrs['padding'] = (pads, pads) + return get_relay_op('avg_pool2d')(*inputs, **new_attrs) + +def _darknet_conv2d(inputs, params, attrs, prefix): + """Process the convolution 2d operation.""" + new_attrs = {} + kernel = 
attrs.get('kernel') + strides = attrs.get('stride', 1) + pads = attrs.get('pad', 0) + + new_attrs['channels'] = attrs.get('num_filter') + new_attrs['kernel_size'] = (kernel, kernel) + new_attrs['strides'] = (strides, strides) + new_attrs['padding'] = (pads, pads) + new_attrs['dilation'] = attrs.get('dilate', (1, 1)) + new_attrs['groups'] = attrs.get('num_group', 1) + + weight = _get_param_var(params, prefix, 'weight') + out = get_relay_op('conv2d')(*inputs, weight=weight, **new_attrs) + + use_bias = not attrs.get('use_batchNorm', False) + if use_bias: + new_attrs = {} + new_attrs['axis'] = 1 + bias = _get_param_var(params, prefix, 'bias') + out = get_relay_op('bias_add')(out, bias=bias, **new_attrs) + else: + new_attrs = {} + new_attrs['epsilon'] = 0.000001 + gamma = _get_param_var(params, prefix, 'gamma') + beta = _get_param_var(params, prefix, 'beta') + moving_mean = _get_param_var(params, prefix, 'moving_mean') + moving_var = _get_param_var(params, prefix, 'moving_var') + out = get_relay_op('batch_norm')(out, gamma, beta, moving_mean, moving_var, **new_attrs) + + if 'activation' in attrs: + new_attrs = {} + new_attrs['activation'] = attrs['activation'] + new_attrs['slope'] = 0.1 + out = _darknet_activations(out, None, new_attrs) + return out + +def _darknet_shortcut(inputs, params, attrs, prefix): + """Process the shortcut operation.""" + input_0 = inputs[0] + input_1 = inputs[1] + + input_0_channel = int(attrs['out_channel']) + input_1_channel = int(attrs['add_out_channel']) + input_0_size = int(attrs['out_size']) + input_1_size = int(attrs['add_out_size']) + + if input_0_size > input_1_size: + scale = int(input_0_size/input_1_size) + input_1 = get_relay_op('upsampling')(input_1, scale=scale) + + elif input_0_size < input_1_size: + stride = int(input_1_size/input_0_size) + input_1 = get_relay_op('avg_pool2d')(input_1, + pool_size=(1, 1), + strides=(stride, stride), + padding=(0, 0)) + + if input_0_channel != input_1_channel: + pad_channel = input_0_channel - 
input_1_channel + input_1 = get_relay_op('pad')(input_1, + pad_width=((0, 0), (0, pad_channel), (0, 0), (0, 0)), + pad_value=0.) + sym = input_0 + input_1 + if 'activation' in attrs: + new_attrs = {} + new_attrs['activation'] = attrs['activation'] + sym = _darknet_activations(sym, None, new_attrs) + return sym + +def _darknet_dense(inputs, params, attrs, prefix): + """Process the dense operation.""" + new_attrs = {} + new_attrs['units'] = attrs.get('num_hidden') + data = inputs[0] + + if attrs.get('use_flatten', False) is True: + data = get_relay_op('batch_flatten')(data) + + weight = _get_param_var(params, prefix, 'weight') + data = get_relay_op('dense')(data, weight, **new_attrs) + + use_bias = attrs.get('use_bias', False) + if use_bias: + bias = _get_param_var(params, prefix, 'bias') + data = get_relay_op('bias_add')(data, bias, axis=1) + + if 'use_batchNorm' in attrs: + new_attrs = {} + new_attrs['epsilon'] = 0.000001 + gamma = _get_param_var(params, prefix, 'gamma') + beta = _get_param_var(params, prefix, 'beta') + moving_mean = _get_param_var(params, prefix, 'moving_mean') + moving_var = _get_param_var(params, prefix, 'moving_var') + data = get_relay_op('batch_norm')(data, gamma, beta, moving_mean, moving_var, **new_attrs) + if 'activation' in attrs: + new_attrs = {} + new_attrs['activation'] = attrs['activation'] + data = _darknet_activations(data, None, new_attrs) + return data + +def _darknet_dropout(inputs, params, attrs, prefix): + """Process the dropout operation, its a blank operation.""" + new_attrs = {} + new_attrs['rate'] = attrs.get('p', 0.5) + return get_relay_op('dropout')(*inputs, **new_attrs) + +def _darknet_reshape(inputs, params, attrs, prefix): + """Process the reshape operation.""" + new_attrs = {} + new_attrs['shape'] = attrs.get('shape') + return get_relay_op('reshape')(*inputs, **new_attrs) + +def _darknet_upsampling(inputs, params, attrs, prefix): + """Process the upsampling operation.""" + new_attrs = {} + new_attrs['scale'] = 
attrs.get('scale', 1) + return get_relay_op('upsampling')(*inputs, **new_attrs) + +def _darknet_l2normalize(inputs, params, attrs, prefix): + """Process the l2 normalization operation.""" + new_attrs = {} + new_attrs['eps'] = attrs.get('eps', 0.0) + new_attrs['axis'] = [attrs.get('axis', 1)] + return get_relay_op('l2_normalize')(*inputs, **new_attrs) + +def _darknet_softmax_output(inputs, params, attrs, prefix): + """Process the softmax operation.""" + temperature = attrs.get('temperature', 1) + data = inputs[0] + if temperature != 1: + data = data / _expr.const(float(temperature)) + + if attrs.get('use_flatten', False) is True: + data = get_relay_op('batch_flatten')(data) + + new_attrs = {} + if attrs.get('multi_output', False): + new_attrs['axis'] = 1 + return get_relay_op('softmax')(data, **new_attrs) + +def _darknet_route(inputs, params, attrs, prefix): + """Process the route operation, which is equivalent to concat.""" + new_attrs = {'axis': attrs.get('dim', 1)} + return get_relay_op('concatenate')((inputs[0], inputs[1]), **new_attrs) + +def _darknet_reorg(inputs, params, attrs, prefix): + """Process the reorg operation.""" + new_attrs = {} + if 'stride' in attrs: + new_attrs = {'stride': attrs.get('stride', 1)} + return get_relay_op('yolo_reorg')(*inputs, **new_attrs) + +def _darknet_region(inputs, params, attrs, prefix): + """Process the region operation.""" + num = attrs.get('n', 1) + classes = attrs.get('classes', 1) + coords = attrs.get('coords', 0) + background = attrs.get('background', 0) + softmax = attrs.get('softmax', True) + input_shape = attrs.get('shape') + + split_size = classes + coords + 1 + intermediate_shape = (input_shape[0], num, split_size, input_shape[2], input_shape[3]) + data_block = get_relay_op('reshape')(inputs[0], newshape=intermediate_shape) + split_indices = (2, 4, 5) + split_res = get_relay_op('split')(data_block, indices_or_sections=split_indices, axis=2) + split_res0 = get_relay_op('sigmoid')(split_res[0]) + split_res2 = 
split_res[2] if background else get_relay_op('sigmoid')(split_res[2]) + split_res3 = get_relay_op('softmax')(split_res[3], axis=2) if softmax else split_res[3] + out = get_relay_op('concatenate')((split_res0, split_res[1], split_res2, split_res3), axis=2) + return get_relay_op('reshape')(out, newshape=input_shape) + +def _darknet_yolo(inputs, params, attrs, prefix): + """Process the yolo operation.""" + num = attrs.get('n', 1) + classes = attrs.get('classes', 1) + input_shape = attrs.get('shape') + split_size = classes + 5 + intermediate_shape = (input_shape[0], num, split_size, input_shape[2], input_shape[3]) + data_block = get_relay_op('reshape')(inputs[0], newshape=intermediate_shape) + split_indices = (2, 4) + split_res = get_relay_op('split')(data_block, indices_or_sections=split_indices, axis=2) + split_res0 = get_relay_op('sigmoid')(split_res[0]) + split_res2 = get_relay_op('sigmoid')(split_res[2]) + out = get_relay_op('concatenate')((split_res0, split_res[1], split_res2), axis=2) + return get_relay_op('reshape')(out, newshape=input_shape) + +class ACTIVATION(object): + """Darknet ACTIVATION Class constant.""" + LOGISTIC = 0 + RELU = 1 + RELIE = 2 + LINEAR = 3 + RAMP = 4 + TANH = 5 + PLSE = 6 + LEAKY = 7 + ELU = 8 + LOGGY = 9 + STAIR = 10 + HARDTAN = 11 + LHTAN = 12 + +def _darknet_activations(inputs, params, attrs): + """Process the activation function.""" + act = attrs.get('activation') + data = inputs[0] if isinstance(inputs, _expr.TupleWrapper) else inputs + + def _const(val): + return _expr.const(val) + + def _relu(data): + return get_relay_op('relu')(data) + + def _exp(data): + return get_relay_op('exp')(data) + + def _tanh(data): + return get_relay_op('tanh')(data) + + def _sigmoid(data): + return get_relay_op('sigmoid')(data) + + def _elu(data): + alpha = _const(-1.0) + return alpha * _relu(_const(1.0) - _exp(data)) + _relu(data) + + def _leaky_relu(data, slope): + new_attrs = {} + new_attrs['alpha'] = slope + return get_relay_op('leaky_relu')(data, 
**new_attrs) + + if ACTIVATION.LOGISTIC == act: + data = _sigmoid(data) + elif ACTIVATION.RELU == act: + data = _relu(data) + elif ACTIVATION.TANH == act: + data = _tanh(data) + elif ACTIVATION.LINEAR == act: + return data + elif ACTIVATION.LEAKY == act: + data = _leaky_relu(data, attrs.get('slope', 0.1)) + elif ACTIVATION.ELU == act: + data = _elu(data) + else: + _darknet_not_support('act: ' + attrs) + return data + +class LAYERTYPE(Enum): + """Darknet LAYERTYPE Class constant.""" + CONVOLUTIONAL = 0 + DECONVOLUTIONAL = 1 + CONNECTED = 2 + MAXPOOL = 3 + SOFTMAX = 4 + DETECTION = 5 + DROPOUT = 6 + CROP = 7 + ROUTE = 8 + COST = 9 + NORMALIZATION = 10 + AVGPOOL = 11 + LOCAL = 12 + SHORTCUT = 13 + ACTIVE = 14 + RNN = 15 + GRU = 16 + LSTM = 17 + CRNN = 18 + BATCHNORM = 19 + NETWORK = 20 + XNOR = 21 + REGION = 22 + YOLO = 23 + REORG = 24 + UPSAMPLE = 25 + LOGXENT = 26 + L2NORM = 27 + BLANK = 28 + +_DARKNET_CONVERT_MAP = { + LAYERTYPE.CONVOLUTIONAL : _darknet_conv2d, + LAYERTYPE.CONNECTED : _darknet_dense, + LAYERTYPE.MAXPOOL : _darknet_maxpooling, + LAYERTYPE.SOFTMAX : _darknet_softmax_output, + LAYERTYPE.DROPOUT : _darknet_dropout, + LAYERTYPE.AVGPOOL : _darknet_avgpooling, + LAYERTYPE.ROUTE : _darknet_route, + LAYERTYPE.REORG : _darknet_reorg, + LAYERTYPE.REGION : _darknet_region, + LAYERTYPE.SHORTCUT : _darknet_shortcut, + LAYERTYPE.UPSAMPLE : _darknet_upsampling, + LAYERTYPE.L2NORM : _darknet_l2normalize, + LAYERTYPE.YOLO : _darknet_yolo, + LAYERTYPE.DECONVOLUTIONAL : _darknet_not_support, + LAYERTYPE.BATCHNORM : _darknet_not_support, + LAYERTYPE.DETECTION : _darknet_not_support, + LAYERTYPE.CROP : _darknet_not_support, + LAYERTYPE.COST : _darknet_not_support, + LAYERTYPE.NORMALIZATION : _darknet_not_support, + LAYERTYPE.LOCAL : _darknet_not_support, + LAYERTYPE.ACTIVE : _darknet_not_support, + LAYERTYPE.RNN : _darknet_not_support, + LAYERTYPE.GRU : _darknet_not_support, + LAYERTYPE.LSTM : _darknet_not_support, + LAYERTYPE.CRNN : _darknet_not_support, + 
LAYERTYPE.NETWORK : _darknet_not_support, + LAYERTYPE.XNOR : _darknet_not_support, + LAYERTYPE.BLANK : _darknet_not_support, +} + +def _darknet_convert_symbol(op_name, inputs, params, attrs, params_prefix): + """Convert from darknet op to relay op. + Parameters + ---------- + op_name : str + Operator name, such as Convolution, Connected, etc + inputs : list of relay.Function + List of input symbols. + attrs : dict + Dict of operator attributes + params_prefix: str + Params name for this operation + + Returns + ------- + out_name : converted out name of operation + sym : tvm.relay.Function + Converted relay function + """ + + if op_name in _DARKNET_CONVERT_MAP: + sym = _DARKNET_CONVERT_MAP[op_name](inputs, params, attrs, params_prefix) + else: + _darknet_not_support('Operator type ' + str(op_name)) + return sym + +def _as_list(arr): + """Force being a list, ignore if already is.""" + if isinstance(arr, list): + return arr + return [arr] + +class GraphProto(object): + """A helper class for handling relay functions from darknet model. 
+ """ + + def __init__(self, net, shape, dtype='float32'): + self._net = net + self._shape = shape + self._dtype = dtype + self._sym_array = {} + self._tvmparams = {} + self._outs = [] + self._state_ctr = {} + self._state_ctr['rnn'] = 0 + self._state_ctr['crnn'] = 0 + self._state_ctr['lstm'] = 0 + self._state_ctr['cell_state'] = 0 + self._state_ctr['gru'] = 0 + + def _read_memory_buffer(self, shape, data, dtype=None): + if dtype is None: + dtype = self._dtype + length = 1 + for x in shape: + length *= x + data_np = np.zeros(length, dtype=dtype) + for i in range(length): + data_np[i] = data[i] + return data_np.reshape(shape) + + def _get_convolution_weights(self, layer, opname): + """Get the convolution layer weights and biases.""" + if layer.nweights == 0: + return None + + if (layer.n * layer.c * layer.size * layer.size) != layer.nweights: + raise RuntimeError("layer weights size not matching with n c h w") + + params = {} + shape = (layer.n, layer.c, layer.size, layer.size) + weights = self._read_memory_buffer(shape, layer.weights) + + biases = self._read_memory_buffer((layer.n, ), layer.biases) + + k = _get_params_name(opname, 'weight') + params[k] = tvm.nd.array(weights) + + if layer.batch_normalize == 1 and layer.dontloadscales != 1: + params.update(self._get_batchnorm_weights(layer, opname, layer.n)) + k = _get_params_name(opname, 'beta') + params[k] = tvm.nd.array(biases) + else: + k = _get_params_name(opname, 'bias') + params[k] = tvm.nd.array(biases) + return params + + def _get_connected_weights(self, layer, opname): + """Parse the weights and biases for fully connected or dense layer.""" + size = layer.outputs * layer.inputs + if size == 0: + return None + + weights = self._read_memory_buffer((layer.outputs, layer.inputs), layer.weights) + biases = self._read_memory_buffer((layer.outputs, ), layer.biases) + + params = {} + k = _get_params_name(opname, 'weight') + params[k] = tvm.nd.array(weights) + + if layer.batch_normalize == 1 and layer.dontloadscales 
!= 1: + params.update(self._get_batchnorm_weights(layer, opname, layer.outputs)) + k = _get_params_name(opname, 'beta') + params[k] = tvm.nd.array(biases) + else: + k = _get_params_name(opname, 'bias') + params[k] = tvm.nd.array(biases) + return params + + def _get_region_weights(self, layer, opname): + """Parse the biases for region layer.""" + biases = self._read_memory_buffer((layer.n*2, ), layer.biases) + attributes = np.array([layer.n, layer.out_c, layer.out_h, layer.out_w, + layer.classes, layer.coords, layer.background], + dtype=np.int32) + params = {} + k = _get_params_name(opname, 'bias') + params[k] = tvm.nd.array(biases) + k = _get_params_name(opname, 'attr') + params[k] = tvm.nd.array(attributes) + return params + + def _get_yolo_weights(self, layer, opname): + """Parse the biases and mask for yolo layer.""" + biases = self._read_memory_buffer((layer.total*2, ), layer.biases) + mask = self._read_memory_buffer((layer.n, ), layer.mask, dtype='int32') + attributes = np.array([layer.n, layer.out_c, layer.out_h, layer.out_w, + layer.classes, layer.total], + dtype=np.int32) + params = {} + k = _get_params_name(opname, 'bias') + params[k] = tvm.nd.array(biases) + k = _get_params_name(opname, 'mask') + params[k] = tvm.nd.array(mask) + k = _get_params_name(opname, 'attr') + params[k] = tvm.nd.array(attributes) + return params + + def _get_batchnorm_weights(self, layer, opname, size): + """Parse the weights for batchnorm, which includes, scales, moving mean + and moving variances.""" + scales = self._read_memory_buffer((size, ), layer.scales) + rolling_mean = self._read_memory_buffer((size, ), layer.rolling_mean) + rolling_variance = self._read_memory_buffer((size, ), layer.rolling_variance) + + params = {} + k = _get_params_name(opname, 'moving_mean') + params[k] = tvm.nd.array(rolling_mean) + k = _get_params_name(opname, 'moving_var') + params[k] = tvm.nd.array(rolling_variance) + k = _get_params_name(opname, 'gamma') + params[k] = tvm.nd.array(scales) + return 
params + + def _get_darknet_attrs(self, layer, layer_num): + """Parse attributes of each layer and return.""" + attr = {} + use_flatten = True + layer_type = LAYERTYPE(layer.type) + if LAYERTYPE.CONVOLUTIONAL == layer_type: + attr.update({'pad' : layer.pad}) + attr.update({'num_group' : layer.groups}) + attr.update({'num_filter' : layer.n}) + attr.update({'stride' : layer.stride}) + attr.update({'kernel' : layer.size}) + attr.update({'activation' : (layer.activation)}) + + if layer.nbiases == 0: + attr.update({'use_bias' : False}) + else: + attr.update({'use_bias' : True}) + + if layer.batch_normalize == 1 and layer.dontloadscales != 1: + attr.update({'use_batchNorm' : True}) + attr.update({'use_scales' : True}) + + elif LAYERTYPE.CONNECTED == layer_type: + attr.update({'num_hidden' : layer.outputs}) + attr.update({'activation' : (layer.activation)}) + if layer_num != 0: + layer_prev = self._net.layers[layer_num - 1] + if (layer_prev.out_h == layer.h and + layer_prev.out_w == layer.w and + layer_prev.out_c == layer.c): + use_flatten = False + attr.update({'use_flatten' : use_flatten}) + attr.update({'use_bias' : True}) + if layer.batch_normalize == 1 and layer.dontloadscales != 1: + attr.update({'use_batchNorm' : True}) + attr.update({'use_scales' : True}) + attr.update({'use_bias' : False}) + + elif LAYERTYPE.MAXPOOL == layer_type: + attr.update({'pad' : layer.pad}) + attr.update({'stride' : layer.stride}) + attr.update({'kernel' : layer.size}) + max_output = (layer.w - layer.size + 2 * layer.pad)/float(layer.stride) + 1 + if max_output < layer.out_w: + extra_pad = (layer.out_w - max_output)*layer.stride + attr.update({'extra_pad_size' : int(extra_pad)}) + elif LAYERTYPE.AVGPOOL == layer_type: + attr.update({'pad' : layer.pad}) + if layer.stride == 0: + attr.update({'stride' : 1}) + else: + attr.update({'stride' : layer.stride}) + if layer.size == 0 and layer.h == layer.w: + attr.update({'kernel' : layer.h}) + else: + attr.update({'kernel' : layer.size}) + + elif 
LAYERTYPE.DROPOUT == layer_type: + attr.update({'p' : layer.probability}) + + elif LAYERTYPE.SOFTMAX == layer_type: + attr.update({'axis' : 1}) + attr.update({'use_flatten' : True}) + if layer.temperature: + attr.update({'temperature' : str(layer.temperature)}) + + elif LAYERTYPE.SHORTCUT == layer_type: + add_layer = self._net.layers[layer.index] + attr.update({'activation' : layer.activation}) + attr.update({'out_channel' : layer.out_c}) + attr.update({'out_size' : layer.out_h}) + attr.update({'add_out_channel' : add_layer.out_c}) + attr.update({'add_out_size' : add_layer.out_h}) + + elif LAYERTYPE.ROUTE == layer_type: + pass + + elif LAYERTYPE.COST == layer_type: + pass + + elif LAYERTYPE.REORG == layer_type: + attr.update({'stride' : layer.stride}) + + elif LAYERTYPE.REGION == layer_type: + attr.update({'n' : layer.n}) + attr.update({'classes' : layer.classes}) + attr.update({'coords' : layer.coords}) + attr.update({'background' : layer.background}) + attr.update({'softmax' : layer.softmax}) + attr.update({'shape' : (1, layer.c, layer.h, layer.w)}) + + elif LAYERTYPE.YOLO == layer_type: + attr.update({'n' : layer.n}) + attr.update({'classes' : layer.classes}) + attr.update({'shape' : (1, layer.c, layer.h, layer.w)}) + + elif LAYERTYPE.UPSAMPLE == layer_type: + attr.update({'scale' : layer.stride}) + + elif LAYERTYPE.L2NORM == layer_type: + pass + + else: + err = "Darknet layer type {} is not supported in relay.".format(layer_type) + raise NotImplementedError(err) + + return attr + + def _get_darknet_params(self, layer, opname): + """To parse and get the darknet params.""" + layer_type = LAYERTYPE(layer.type) + params = None + if LAYERTYPE.CONVOLUTIONAL == layer_type: + params = self._get_convolution_weights(layer, opname) + elif LAYERTYPE.CONNECTED == layer_type: + params = self._get_connected_weights(layer, opname) + elif LAYERTYPE.REGION == layer_type: + params = self._get_region_weights(layer, opname) + elif LAYERTYPE.YOLO == layer_type: + params = 
self._get_yolo_weights(layer, opname) + return params + + def _preproc_layer(self, layer, layer_num): + """To preprocess each darknet layer, some layer doesnt need processing.""" + if layer_num == 0: + name = 'data' + sym = new_var(name, shape=self._shape, dtype=self._dtype) + else: + sym = self._sym_array[layer_num - 1] + skip_layer = False + layer_type = LAYERTYPE(layer.type) + if LAYERTYPE.ROUTE == layer_type: + sym = [] + for j in range(layer.n): + sym.append(self._sym_array[layer.input_layers[j]]) + if layer.n == 1: + skip_layer = True + + elif LAYERTYPE.COST == layer_type: + skip_layer = True + + elif LAYERTYPE.SHORTCUT == layer_type: + sym = [sym, self._sym_array[layer.index]] + + elif LAYERTYPE.BLANK == layer_type: + skip_layer = True + + if skip_layer is True: + self._sym_array[layer_num] = sym + + return skip_layer, sym + + def _get_opname(self, layer): + """Returs the layer name.""" + return LAYERTYPE(layer.type) + + def _new_rnn_state_var(self, state=None, name='rnn'): + """Returs a symbol for state""" + sym_name = name + "%d_state" % self._state_ctr[name] + self._state_ctr[name] += 1 + return new_var(sym_name, shape=state.shape, dtype=str(state.dtype)) + + def _get_rnn_state_buffer(self, layer, name): + """Get the state buffer for rnn.""" + buffer = np.zeros((1, layer.outputs), self._dtype) + return self._new_rnn_state_var(buffer, name) + + def _get_darknet_rnn_attrs(self, layer, name, sym): + """Get the rnn converted symbol from attributes.""" + attr = self._get_darknet_attrs(layer, 0) + op_name = self._get_opname(layer) + prefix = _get_params_prefix(op_name, name) + params = self._get_darknet_params(layer, prefix) + sym = _darknet_convert_symbol(op_name, _as_list(sym), params, attr, prefix) + if params: + self._tvmparams.update(params) + return sym + + def _handle_darknet_rnn_layers(self, layer_num, sym): + """Parse attributes and handle the rnn layers.""" + attr = {} + layer = self._net.layers[layer_num] + processed = False + + layer_type = 
LAYERTYPE(layer.type) + if LAYERTYPE.RNN == layer_type: + attr.update({'n' : layer.n}) + attr.update({'batch' : layer.batch}) + attr.update({'num_hidden' : str(layer.outputs)}) + state = self._get_rnn_state_buffer(layer, 'rnn') + for _ in range(layer.steps): + input_layer = layer.input_layer + prefix = "_input_" + str(layer_num) + sym = self._get_darknet_rnn_attrs(input_layer, prefix, sym) + + self_layer = layer.self_layer + prefix = "_self_" + str(layer_num) + state = self._get_darknet_rnn_attrs(self_layer, prefix, state) + + state = sym + state + self._outs.append(state) + + output_layer = layer.output_layer + prefix = "_output_" + str(layer_num) + sym = self._get_darknet_rnn_attrs(output_layer, prefix, state) + + self._sym_array[layer_num] = sym + processed = True + return processed, sym + + def _make_outlist(self, sym, op_name, layer, layer_num): + layer_type = LAYERTYPE(layer.type) + if layer_type == LAYERTYPE.REGION: + #Add attributes + k = _get_params_name(op_name, 'attr') + dshape = self._tvmparams[k].shape + dtype = self._tvmparams[k].dtype + self._outs.insert(0, new_var(k, shape=dshape, dtype=dtype)) + + #Add bias + k = _get_params_name(op_name, 'bias') + dshape = self._tvmparams[k].shape + dtype = self._tvmparams[k].dtype + self._outs.insert(0, new_var(k, shape=dshape, dtype=dtype)) + if layer_num != self._net.n-1: + self._outs.insert(0, sym) + + elif layer_type == LAYERTYPE.YOLO: + #Add attributes + k = _get_params_name(op_name, 'attr') + dshape = self._tvmparams[k].shape + dtype = self._tvmparams[k].dtype + self._outs.insert(0, new_var(k, shape=dshape, dtype=dtype)) + + #Add bias + k = _get_params_name(op_name, 'bias') + dshape = self._tvmparams[k].shape + dtype = self._tvmparams[k].dtype + self._outs.insert(0, new_var(k, shape=dshape, dtype=dtype)) + + #Add mask + k = _get_params_name(op_name, 'mask') + dshape = self._tvmparams[k].shape + dtype = self._tvmparams[k].dtype + self._outs.insert(0, new_var(k, shape=dshape, dtype=dtype)) + + if layer_num != 
self._net.n-1: + self._outs.insert(0, sym) + + def from_darknet(self): + """To convert the darknet symbol to relay functions.""" + for i in range(self._net.n): + layer = self._net.layers[i] + need_skip, sym = self._preproc_layer(layer, i) + if need_skip: + continue + + processed, sym = self._handle_darknet_rnn_layers(i, sym) + if processed: + continue + + attr = self._get_darknet_attrs(layer, i) + op_name = self._get_opname(layer) + prefix = _get_params_prefix(op_name, i) + params = self._get_darknet_params(self._net.layers[i], prefix) + sym = _darknet_convert_symbol(op_name, _as_list(sym), params, attr, prefix) + + if params: + self._tvmparams.update(params) + self._sym_array[i] = sym + self._make_outlist(sym, prefix, layer, i) + + outputs = _as_list(sym) + self._outs + outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs) + sym = _expr.Function(ir_pass.free_vars(outputs), outputs) + return _module.Module.from_expr(sym), self._tvmparams + +def from_darknet(net, + shape=None, + dtype="float32"): + """Convert from Darknet's model into compatible relay Function. + + Parameters + ---------- + net : Darknet net parameter + Darknet net structure. + shape : dict of str to tuple, optional + The input shape to the graph + dtype : str or dict of str to str + The input types to the graph + + Returns + ------- + mod : tvm.relay.Module + The relay module for compilation. + + params : dict of str to tvm.NDArray + The parameter dict to be used by relay + """ + + return GraphProto(net, shape, dtype).from_darknet() diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py index 2648a5a6637b..ad033f9bf326 100644 --- a/python/tvm/relay/frontend/keras.py +++ b/python/tvm/relay/frontend/keras.py @@ -22,6 +22,7 @@ import tvm from .. import ir_pass from .. import expr as _expr +from .. import module as _module from .. import op as _op from ... 
import nd as _nd from .common import ExprTable, new_var @@ -203,7 +204,6 @@ def _convert_convolution(inexpr, keras_layer, etab): else: kernel_h, kernel_w, in_channels, n_filters = weightList[0].shape weight = weightList[0].transpose([3, 2, 0, 1]) - dilation = [1, 1] if isinstance(keras_layer.dilation_rate, (list, tuple)): dilation = [keras_layer.dilation_rate[0], keras_layer.dilation_rate[1]] else: @@ -680,8 +680,8 @@ def from_keras(model, shape=None): Returns ------- - func : tvm.relay.Function - Compatible relay Function. + mod : tvm.relay.Module + The relay module for compilation. params : dict of str to tvm.NDArray The parameter dict to be used by Relay. @@ -745,4 +745,4 @@ def _convert_input_layer(keras_layer): outexpr = outexpr[0] if len(outexpr) == 1 else _expr.Tuple(outexpr) func = _expr.Function(ir_pass.free_vars(outexpr), outexpr) params = {k:_nd.array(np.array(v, dtype=np.float32)) for k, v in etab.params.items()} - return func, params + return _module.Module.from_expr(func), params diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 1a4d52f5b679..2f36355abf23 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -23,6 +23,7 @@ from .. import ir_pass from .. import expr as _expr from .. import op as _op +from .. import module as _module from ... 
import nd as _nd from .common import StrAttrsDict @@ -93,6 +94,15 @@ def impl(inputs, attrs): return impl +def _mx_zeros(inputs, attrs): + assert len(inputs) == 0 + shape = attrs.get_int_tuple("shape") + dtype = attrs.get_str("dtype", "float32") + if 0 in shape: + return None + return _op.zeros(shape=shape, dtype=dtype) + + def _mx_conv2d(inputs, attrs): kernel_size = attrs.get_int_tuple("kernel") if len(kernel_size) != 2: @@ -149,7 +159,7 @@ def _mx_conv2d_transpose(inputs, attrs): new_attrs["groups"] = attrs.get_int("num_group", 1) new_attrs["data_layout"] = data_layout new_attrs["kernel_layout"] = kernel_layout - use_bias = not attrs.get_bool("no_bias", False) + use_bias = not attrs.get_bool("no_bias", True) res = _op.nn.conv2d_transpose(inputs[0], inputs[1], **new_attrs) if use_bias: @@ -277,6 +287,28 @@ def _mx_slice_axis(inputs, attrs): return _op.strided_slice(inputs[0], begin, end) +def _mx_crop_like(inputs, attrs): + if len(inputs) < 2: + raise tvm.error.OpAttributeUnimplemented( + "Only support crop_like pattern for operator Crop.") + if attrs.get_bool("center_crop", False): + raise tvm.error.OpAttributeUnimplemented( + "Center crop is not supported in operator Crop.") + if attrs.get_int_tuple("h_w", (0, 0)) != (0, 0): + raise tvm.error.OpAttributeUnimplemented( + "Doesn't support h_w in operator Crop.") + offset = attrs.get_int_tuple("offset", (0, 0)) + new_attrs = {} + if offset == (0, 0): + new_attrs["axes"] = (2, 3) + return _op.slice_like(*inputs, **new_attrs) + like_shape = ir_pass.infer_type(inputs[1]).checked_type.shape + new_attrs['begin'] = [0, 0, offset[0], offset[1]] + new_attrs['end'] = [like_shape[0], like_shape[1], offset[0]+like_shape[2], + offset[1]+like_shape[3]] + return _op.strided_slice(inputs[0], **new_attrs) + + def _mx_split(inputs, attrs): axis = attrs.get_int("axis", 1) new_attrs = {} @@ -300,6 +332,10 @@ def _mx_softmax_output(inputs, attrs): return _op.nn.softmax(inputs[0]) +def _mx_linear_regression_output(inputs, _): + return 
inputs[0] + + def _mx_concat(inputs, attrs): axis = attrs.get_int("dim", 1) return _op.concatenate(tuple(inputs), axis=axis) @@ -543,7 +579,8 @@ def _mx_box_nms(inputs, attrs): raise tvm.error.OpAttributeInvalid( 'Value of attribute "out_format" must equal "corner" for operator box_nms.') - ret = _op.vision.get_valid_counts(inputs[0], score_threshold=valid_thresh) + ret = _op.vision.get_valid_counts(inputs[0], score_threshold=valid_thresh, + id_index=id_index, score_index=score_index) nms_out = _op.vision.non_max_suppression(ret[1], ret[0], iou_threshold=iou_thresh, @@ -657,6 +694,21 @@ def _mx_argsort(inputs, attrs): return _op.argsort(inputs[0], **new_attrs) +def _mx_topk(inputs, attrs): + assert len(inputs) == 1 + new_attrs = {} + new_attrs["k"] = attrs.get_int("k", 1) + new_attrs["axis"] = attrs.get_int("axis", -1) + new_attrs["is_ascend"] = attrs.get_bool("is_ascend", True) + ret_type = attrs.get_str("ret_typ", "indices") + if ret_type == "mask": + raise tvm.error.OpAttributeUnimplemented( + "Attribute ret_type=mask is not supported in topk operator") + new_attrs["ret_type"] = "values" if ret_type == "value" else ret_type + new_attrs["dtype"] = attrs.get_str("dtype", "float32") + return _op.topk(inputs[0], **new_attrs) + + def _mx_rnn_param_concat(inputs, _): # We don't need to concatenate RNN params because we will unravel the RNN op return [inputs] @@ -696,13 +748,12 @@ def _lstm_cell(data, states, i2h_weight, h2h_weight, i2h_bias, h2h_bias): num_layers = attrs.get_int("num_layers", 1) mode = attrs.get_str("mode") + output_states = attrs.get_bool("state_outputs", False) if mode.startswith("rnn"): mode, activation = mode.split('_') assert mode in ["rnn", "gru", "lstm"] bidirectional = attrs.get_bool("bidirectional", False) - if bidirectional: - raise tvm.error.OpAttributeUnimplemented( - "Bidirectional RNN op is not supported yet") + direct = 2 if bidirectional else 1 layout = attrs.get_str("layout", "TNC") if layout != "TNC": raise 
tvm.error.OpAttributeUnimplemented( @@ -712,44 +763,98 @@ def _lstm_cell(data, states, i2h_weight, h2h_weight, i2h_bias, h2h_bias): seq_data = inputs[0] concat_weight = inputs[1] - concat_states = inputs[2:] - seq_len = int(ir_pass.infer_type(seq_data).checked_type.shape[0]) - assert len(concat_weight) == num_layers * 4 + init_states = inputs[2:] + data_shape = ir_pass.infer_type(seq_data).checked_type.shape + seq_len = int(data_shape[0]) + assert len(concat_weight) == num_layers * 4 * direct + + for idx, state in enumerate(init_states[:]): + if isinstance(state, dict): + node = state + attrs = StrAttrsDict(node.get("attrs", {})) + op_name = node["op"] + # by default, RNN layer uses zeros to initialize states + assert op_name == "_zeros" + shape = attrs.get_int_tuple("shape") + dtype = attrs.get_str("dtype", "float32") + init_layout = attrs.get_str("__layout__") + new_shape = list(shape) + for i, dim in enumerate(shape): + if dim == 0: + axis = layout.find(init_layout[i]) + assert axis >= 0 + new_shape[i] = int(data_shape[axis]) + init_states[idx] = _op.zeros(new_shape, dtype) weights = [] bias = [] states = [] + back_weights = [] + back_bias = [] + back_states = [] for i in range(num_layers): - w = [] - b = [] + weights.append([concat_weight[i*2*direct].args[0], + concat_weight[i*2*direct + 1].args[0]]) + bias.append([concat_weight[(num_layers+i)*2*direct].args[0], + concat_weight[(num_layers+i)*2*direct + 1].args[0]]) s = [] - for j in range(2): - w.append(concat_weight[i*2 + j].args[0]) - b.append(concat_weight[num_layers*2 + i*2 + j].args[0]) - for state in concat_states: - s.append(_op.take(state, _expr.const(i, "int32"), axis=0)) - weights.append(w) - bias.append(b) + for state in init_states: + s.append(_op.take(state, _expr.const(i*direct, "int32"), axis=0)) states.append(s) - - seq_output = [] - for t in range(seq_len): - data = _op.take(seq_data, _expr.const(t, "int32"), axis=0) - for l in range(num_layers): + if bidirectional: + 
back_weights.append([concat_weight[i*2*direct + 2].args[0], + concat_weight[i*2*direct + 3].args[0]]) + back_bias.append([concat_weight[(num_layers+i)*2*direct + 2].args[0], + concat_weight[(num_layers+i)*2*direct + 3].args[0]]) + s = [] + for state in init_states: + s.append(_op.take(state, _expr.const(i*direct+1, "int32"), axis=0)) + back_states.append(s) + + xs = [_op.take(seq_data, _expr.const(t, "int32"), axis=0) for t in range(seq_len)] + for l in range(num_layers): + outputs = [] + back_outputs = [] + for x in xs: if mode == "rnn": - out, new_states = _rnn_cell(data, states[l], *weights[l], *bias[l], activation) + out, new_states = _rnn_cell(x, states[l], *weights[l], *bias[l], activation) elif mode == "gru": - out, new_states = _gru_cell(data, states[l], *weights[l], *bias[l]) + out, new_states = _gru_cell(x, states[l], *weights[l], *bias[l]) else: # mode == "lstm" - out, new_states = _lstm_cell(data, states[l], *weights[l], *bias[l]) + out, new_states = _lstm_cell(x, states[l], *weights[l], *bias[l]) states[l] = new_states - data = out - seq_output.append(out) - - outputs = [_op.stack(seq_output, axis=0)] - for i in range(num_states): - outputs.append(_op.stack([s[i] for s in states], axis=0)) - return outputs + outputs.append(out) + if bidirectional: + for x in reversed(xs): + if mode == "rnn": + out, new_states = _rnn_cell( + x, back_states[l], *back_weights[l], *back_bias[l], activation) + elif mode == "gru": + out, new_states = _gru_cell( + x, back_states[l], *back_weights[l], *back_bias[l]) + else: # mode == "lstm" + out, new_states = _lstm_cell( + x, back_states[l], *back_weights[l], *back_bias[l]) + back_states[l] = new_states + back_outputs.append(out) + back_outputs.reverse() + concat_outputs = [] + for t, out in enumerate(outputs): + new_out = _op.concatenate([out, back_outputs[t]], axis=-1) + concat_outputs.append(new_out) + outputs = concat_outputs + xs = outputs + + ret = [_op.stack(outputs, axis=0)] + if output_states: + for i in 
range(num_states): + inputs = [] + for l, s in enumerate(states): + inputs.append(s[i]) + if bidirectional: + inputs.append(back_states[l][i]) + ret.append(_op.stack(inputs, axis=0)) + return ret # Note: due to attribute conversion constraint @@ -839,7 +944,6 @@ def _lstm_cell(data, states, i2h_weight, h2h_weight, i2h_bias, h2h_bias): "argmin" : _arg_reduce(_op.argmin), # init ops "_ones" : _init_op(_op.ones), - "_zeros" : _init_op(_op.zeros), # softmax "softmax" : _softmax_op(_op.nn.softmax), "log_softmax" : _softmax_op(_op.nn.log_softmax), @@ -853,6 +957,7 @@ def _lstm_cell(data, states, i2h_weight, h2h_weight, i2h_bias, h2h_bias): "UpSampling" : _upsampling, "add_n" : _elemwise_sum, # MXNet specific implementations + "_zeros" : _mx_zeros, "FullyConnected": _mx_fully_connected, "Activation" : _mx_activations, "Convolution" : _mx_conv2d, @@ -888,8 +993,10 @@ def _lstm_cell(data, states, i2h_weight, h2h_weight, i2h_bias, h2h_bias): "shape_array" : _mx_shape_array, "Embedding" : _mx_embedding, "argsort" : _mx_argsort, + "topk" : _mx_topk, "SoftmaxOutput" : _mx_softmax_output, "SoftmaxActivation" : _mx_softmax_activation, + "LinearRegressionOutput" : _mx_linear_regression_output, "smooth_l1" : _mx_smooth_l1, # vision "_contrib_BilinearResize2D" : _mx_resize, @@ -905,18 +1012,20 @@ def _lstm_cell(data, states, i2h_weight, h2h_weight, i2h_bias, h2h_bias): # NLP "RNN" : _mx_rnn_layer, "_rnn_param_concat" : _mx_rnn_param_concat, + # Deprecated: + "Crop" : _mx_crop_like, # List of missing operators that are present in NNVMv1 # TODO(tvm-tvm): support all operators. # # "broadcast_to", - # "Crop" : _crop_like, } # set identity list _convert_map.update({k : _rename(k) for k in _identity_list}) -def _from_mxnet_impl(symbol, shape_dict, dtype_info): +def _from_mxnet_impl(symbol, shape_dict, dtype_info, mod=None): + #pylint: disable=unused-argument """Convert mxnet symbol to compatible relay Function. Reconstruct a relay Function by traversing the mxnet symbol. 
@@ -933,6 +1042,10 @@ def _from_mxnet_impl(symbol, shape_dict, dtype_info): dtype_info : dict or str. Known parameter dtypes + mod : tvm.relay.Module + The module that contains global information. It will be used for + converting ops that need global information, e.g. control-flow ops. + Returns: ------- func : tvm.relay.Function @@ -957,7 +1070,10 @@ def _from_mxnet_impl(symbol, shape_dict, dtype_info): node_map[nid] = [_expr.var(node_name, shape=shape, dtype=dtype)] elif op_name in _convert_map: res = _convert_map[op_name](children, attrs) - if isinstance(res, (_expr.TupleWrapper, tuple, list)): + if res is None: + # defer conversion, used in RNN state initialization + res = [node] + elif isinstance(res, (_expr.TupleWrapper, tuple, list)): pass elif isinstance(res, _expr.Expr): res = [res] @@ -1018,8 +1134,8 @@ def from_mxnet(symbol, Returns ------- - sym : tvm.relay.Function - Compatible relay Function + mod : tvm.relay.Module + The relay module for compilation params : dict of str to tvm.NDArray The parameter dict to be used by nnvm @@ -1029,6 +1145,7 @@ def from_mxnet(symbol, except ImportError as e: raise ImportError("{}. 
MXNet is required to parse symbols.".format(e)) + mod = _module.Module() if isinstance(symbol, mx.sym.Symbol): params = {} arg_params = arg_params if arg_params else {} @@ -1038,7 +1155,7 @@ def from_mxnet(symbol, for k, v in aux_params.items(): params[k] = _nd.array(v.asnumpy()) shape, dtype = _update_shape_dtype(shape, dtype, params) - sym = _from_mxnet_impl(symbol, shape, dtype) + func = _from_mxnet_impl(symbol, shape, dtype, mod) elif isinstance(symbol, mx.gluon.HybridBlock): if arg_params is not None or aux_params is not None: raise ValueError("arg_params and aux_params ae not used when importing HybridBlock") @@ -1050,10 +1167,11 @@ def from_mxnet(symbol, if isinstance(sym, (list, tuple)): sym = mx.sym.Group(sym) shape, dtype = _update_shape_dtype(shape, dtype, params) - sym = _from_mxnet_impl(sym, shape, dtype) + func = _from_mxnet_impl(sym, shape, dtype, mod) elif isinstance(symbol, mx.gluon.Block): raise NotImplementedError("Only Hybrid Blocks are supported now.") else: msg = "mxnet.Symbol or gluon.HybridBlock expected, got {}".format(type(symbol)) raise ValueError(msg) - return sym, params + mod[mod.entry_func] = func + return mod, params diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index eba02e70c865..bb968ec0bea8 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -23,7 +23,9 @@ import tvm from ... import nd as _nd from .. import ir_pass +from .. import transform as _transform from .. import expr as _expr +from .. import module as _module from .. 
import op as _op from .common import AttrCvt, Renamer from .common import get_relay_op, new_var, infer_shape, infer_channels, get_name @@ -52,6 +54,15 @@ def revert_caffe2_pad(pads): 'Number of pads must be either 2 or 4.') return pads + +def onnx_storage_order2layout(storage_order): + """converter of onnx storage order parameter to tvm storage order format""" + if storage_order not in (0, 1): + raise tvm.error.OpAttributeInvalid('Mode of storage_order must be either 0 or 1') + + return 'NCHW' if storage_order == 0 else 'NHWC' + + def dimension_constraint(): def _dim_check(attrs): if len(attrs['kernel_shape']) == 2: @@ -60,6 +71,7 @@ def _dim_check(attrs): return _dim_check, "Only 2d kernel supported." + class OnnxOpConverter(object): """ A helper class for holding onnx op converters. """ @@ -108,6 +120,7 @@ def _impl_v1(cls, inputs, attr, params): inputs[1] = _op.expand_dims(inputs[1], axis=axis, num_newaxis=2) return get_relay_op(op_name)(*inputs) + class Pool(OnnxOpConverter): """ A helper class for pool op converters. """ @@ -247,6 +260,7 @@ def _impl_v1(cls, inputs, attr, params): inputs[1], units=channels) return _op.nn.bias_add(out, _expr.const(beta) * inputs[2]) + class MatMul(OnnxOpConverter): """ Operator converter for MatMul. """ @@ -257,9 +271,40 @@ def _impl_v1(cls, inputs, attr, params): input_1_t = _op.transpose(inputs[1], axes=(1, 0)) return _op.nn.dense(inputs[0], input_1_t) + class MaxPool(Pool): + """ Operator converter for MaxPool + """ name = 'max_pool' + @classmethod + def _impl_v8(cls, inputs, attr, params): + return AttrCvt( + op_name=dimension_picker(cls.name), + transforms={ + 'kernel_shape': 'pool_size', + 'pads': ('padding', (0, 0), revert_caffe2_pad), + 'storage_order': ('layout', 'NCHW', onnx_storage_order2layout), + }, + # very weird attributes here in onnx, force check + ignores=['dilations', 'auto_pad'], + # TODO(higumachan): make sure ceil_mode in onnx, and layout? 
+ extras={'ceil_mode': False}, + custom_check=dimension_constraint())(inputs, attr, params) + + @classmethod + def _impl_v10(cls, inputs, attr, params): + return AttrCvt( + op_name=dimension_picker(cls.name), + transforms={ + 'kernel_shape': 'pool_size', + 'pads': ('padding', (0, 0), revert_caffe2_pad), + 'storage_order': ('layout', 'NCHW', onnx_storage_order2layout), + 'ceil_mode': 'ceil_mode' + }, + # very weird attributes here in onnx, force check + ignores=['dilations', 'auto_pad'], + custom_check=dimension_constraint())(inputs, attr, params) class Mul(Elemwise): name = 'multiply' @@ -365,21 +410,27 @@ def _impl_v1(cls, inputs, attr, params): shape = tuple(params[inputs[1].name_hint].asnumpy()) out = _op.reshape(inputs[0], shape) else: - # Try to infer shape by precompute prune if possible. - # TODO: good to check inputs to be in params. - # to be enhanced when relay support list_input_names API of NNVM - logging.warning("Infering Reshape argument by precompute") - func = _expr.Function(ir_pass.free_vars(inputs[1]), inputs[1]) + data, shape = inputs + logging.warning("Constant evaluating Reshape's shape argument, may reduce performance") + shape_params = ir_pass.free_vars(shape) + func = _expr.Function(shape_params, shape) + mod = _module.Module.from_expr(func) + seq = _transform.Sequential([_transform.InferType(), + _transform.FoldConstant(), + _transform.FuseOps(0), + _transform.InferType()]) + with tvm.relay.PassContext(opt_level=2): + mod = seq(mod) with tvm.relay.build_config(opt_level=0): - graph, lib, params = tvm.relay.build(func, target="llvm", params=params) - ctx = tvm.context("llvm", 0) - from tvm.contrib import graph_runtime - m = graph_runtime.create(graph, lib, ctx) - m.set_input(**params) - m.run() - params_new = m.get_output(0) - inputs.pop(1) - out = _op.reshape(inputs[0], tuple(params_new.asnumpy().astype('int32').flatten())) + ex = tvm.relay.create_executor("debug", mod=mod) + inputs = [] + for sp in shape_params: + if not sp.name_hint in 
params: + sh = [int(i) for i in sp.type_annotation.shape] + inputs.append( + tvm.nd.array(np.random.rand(*sh).astype('float32'))) + static_shape = ex.evaluate()(*inputs, **params) + out = _op.reshape(data, newshape=tuple(static_shape.asnumpy())) return out @@ -524,6 +575,7 @@ class Shape(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): + # TODO(@jroesch): use shape_of once it has been fixed) return _op.shape_of(inputs[0]) class Cast(OnnxOpConverter): @@ -622,6 +674,23 @@ def _impl_v1(cls, inputs, attr, params): extras={'axis':axis})(inputs, {}) #return _op.take(inputs[0], inputs[1], axis) + +class Greater(OnnxOpConverter): + """ Operator logical greater. + """ + @classmethod + def _impl_v7(cls, inputs, attr, params): + return _op.greater(inputs[0], inputs[1]) + + +class Less(OnnxOpConverter): + """ Operator logical less than. + """ + @classmethod + def _impl_v7(cls, inputs, attr, params): + return _op.less(inputs[0], inputs[1]) + + class LRN(OnnxOpConverter): """ Operator converter for Local Response Normalization. 
""" @@ -836,6 +905,8 @@ def _get_convert_map(opset): 'Selu': Selu.get_converter(opset), 'Elu': Elu.get_converter(opset), 'Exp': Renamer('exp'), + 'Greater': Greater.get_converter(opset), + 'Less': Less.get_converter(opset), 'Log': Renamer('log'), 'Tanh': Renamer('tanh'), 'Pow': Renamer('power'), @@ -915,7 +986,7 @@ def __init__(self, shape, dtype): self._renames = {} self._num_input = 0 self._num_param = 0 - self._shape = shape + self._shape = shape if shape else {} self._dtype = dtype def from_onnx(self, graph, opset): @@ -937,8 +1008,9 @@ def from_onnx(self, graph, opset): Returns ------- - sym : tvm.relay.expr.Function - The returned relay function + mod : tvm.relay.Module + The returned relay module + params : dict A dict of name: tvm.nd.array pairs, used as pretrained weights """ @@ -947,6 +1019,9 @@ def from_onnx(self, graph, opset): if not init_tensor.name.strip(): raise ValueError("Tensor's name is required.") self._params[init_tensor.name] = self._parse_array(init_tensor) + self._nodes[init_tensor.name] = new_var(init_tensor.name, + shape=self._params[init_tensor.name].shape, + dtype=self._params[init_tensor.name].dtype) for i in graph.input: # from onnx v0.2, GraphProto.input has type ValueInfoProto, # and the name is 'i.name' @@ -970,6 +1045,19 @@ def from_onnx(self, graph, opset): else: dtype = d_type self._nodes[i_name] = new_var(i_name, shape=tshape, dtype=dtype) + # get list of unsupported ops + convert_map = _get_convert_map(opset) + unsupported_ops = set() + for node in graph.node: + op_name = node.op_type + if op_name not in convert_map and \ + op_name != 'Constant' and \ + op_name not in _identity_list: + unsupported_ops.add(op_name) + if unsupported_ops: + msg = 'The following operators are not supported for frontend ONNX: ' + msg += ', '.join(unsupported_ops) + raise tvm.error.OpNotImplemented(msg) # construct nodes, nodes are stored as directed acyclic graph for node in graph.node: op_name = node.op_type @@ -978,8 +1066,15 @@ def 
from_onnx(self, graph, opset): if op_name == "Constant": t_proto = self._parse_attr(node.attribute)["value"] self._num_param += 1 - self._params[node.output[0]] = self._parse_array(t_proto) - self._nodes[node.output[0]] = new_var(node.output[0], shape=list(t_proto.dims)) + # We should convert scalar integers to int32, to normalize. + array = self._parse_array(t_proto) + if len(array.shape) == 0 and array.dtype == 'int64': + array = _nd.array(array.asnumpy().astype('int32')) + self._params[node.output[0]] = array + self._nodes[node.output[0]] = new_var( + node.output[0], + shape=list(t_proto.dims), + dtype=array.dtype) else: if op_name == "ConstantFill": fill_value = attr.get('value', 0.0) @@ -1012,7 +1107,7 @@ def from_onnx(self, graph, opset): outputs = [self._nodes[self._parse_value_proto(i)] for i in graph.output] outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs) func = _expr.Function(ir_pass.free_vars(outputs), outputs) - return func, self._params + return _module.Module.from_expr(func), self._params def _parse_value_proto(self, value_proto): """Parse ValueProto or raw str.""" @@ -1076,7 +1171,7 @@ def _convert_operator(self, attrs, opset): """Convert ONNX operator into a Relay operator. - The converter must specify conversions explicity for incompatible name, and + The converter must specify conversions explicitly for incompatible name, and apply handlers to operator attributes. 
Parameters @@ -1141,17 +1236,29 @@ def from_onnx(model, Returns ------- - sym : tvm.relay.expr.Function - Compatible relay function + mod : tvm.relay.Module + The relay module for compilation params : dict of str to tvm.NDArray The parameter dict to be used by relay """ + try: + import onnx + if hasattr(onnx.checker, 'check_model'): + # try use onnx's own model checker before converting any model + try: + onnx.checker.check_model(model) + except onnx.onnx_cpp2py_export.checker.ValidationError as e: + import warnings + # the checker is a bit violent about errors, so simply print warnings here + warnings.warn(str(e)) + except ImportError: + pass g = GraphProto(shape, dtype) graph = model.graph try: opset = model.opset_import[0].version if model.opset_import else 1 except AttributeError: opset = 1 - sym, params = g.from_onnx(graph, opset) - return sym, params + mod, params = g.from_onnx(graph, opset) + return mod, params diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 48f78837c525..1b5573121e20 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -31,9 +31,24 @@ from .. import expr as _expr from .. import op as _op from ..expr_functor import ExprMutator +from .. import module as _module __all__ = ['from_tensorflow'] +def _infer_value(input_val, params): + from tvm.contrib import graph_runtime + # Check that all free variables have associated parameters. + assert all(var.name_hint in params.keys() for var in ir_pass.free_vars( + input_val)), "All inputs to infer must be available in params." 
+ func = _expr.Function(ir_pass.free_vars(input_val), input_val) + with tvm.relay.build_config(opt_level=0): + graph, lib, params = tvm.relay.build(func, target="llvm", params=params) + ctx = tvm.context("llvm", 0) + m = graph_runtime.create(graph, lib, ctx) + m.set_input(**params) + m.run() + return m.get_output(0) + def _get_relay_op(op_name): try: op = getattr(_op, op_name) @@ -49,7 +64,7 @@ def _get_relay_op(op_name): return op class AttrCvt(object): - """Common attribute conveter. An AttrConverter instance is a callable: + """Common attribute converter. An AttrConverter instance is a callable: ``` attr_converter = AttrConverter(op_name, transforms={'a':'b', 'c':('d', 1)}) new_op_name, new_attr = attr_converter(attrs) @@ -63,12 +78,12 @@ class AttrCvt(object): `op_name = func(attr)` transforms : dict of `new_name, or (new_name, default_value, transform function)` If only a new_name is provided, it's like renaming the attribute name. - If default_value if provded, then the attribute is considered as optional. + If default_value if provided, then the attribute is considered as optional. If transform function is provided, the original attribute value is handled by transform function. excludes : list A list of excluded attributes that should `NOT` appear. - Raise NotImplementedError if occured. + Raise NotImplementedError if occurred. disables : list A list of attributes that is disabled in relay. Log warnings. ignores : list @@ -208,17 +223,37 @@ def _dim_check(attrs): return False return _dim_check, "Only 2d kernel supported." -def _infer_channels(inputs, params, transpose=False): - """A hack for getting 'channles' or 'units' since tensorflow don't provide +def _infer_channels(node, params, transpose=False): + """A hack for getting 'channels' or 'units' since tensorflow don't provide these attributes. We check the shape of weights provided to get the number. 
""" - out_type = ir_pass.infer_type(inputs) - out_shapes = [get_const_tuple(out_type.checked_type.shape)] - channels = out_shapes[0][0] if not transpose else out_shapes[0][1] + out_shape = _infer_shape(node, params) + channels = out_shape[0] if not transpose else out_shape[1] return channels +def _infer_out_shapes(inputs, params): + """A method to get the output shape of intermediate nodes in the relay graph.""" + return [_infer_shape(inputs, params)] + +def _infer_shape(node, params=None): + """A method to get the output shape of an intermediate node in the relay graph.""" + out_type = ir_pass.infer_type(node) + return get_const_tuple(out_type.checked_type.shape) + +def _get_param(params, input_node): + return params.pop(input_node.name_hint).asnumpy() + +def _get_num_param(params, input_node): + return _get_param(params, input_node)[0] + +def _get_list_param(params, input_node): + return _get_param(params, input_node).tolist() + +def _get_tuple_param(params, input_node): + return tuple(_get_param(params, input_node)) + def _rsqrt(): - def _impl(inputs, attr, *args): + def _impl(inputs, attr, params): inputs.append(tvm.relay.const(-0.5, attr['T'].name)) return AttrCvt(op_name="power")(inputs, attr) return _impl @@ -229,16 +264,15 @@ def _impl(inputs, attr, params): try: # In Tensorflow, `axis` argument is a Tensor, not attribute. We # support the case where it inputs from a scalar constant. 
- axis_input_name = inputs[1].name_hint - axis_input_vlaue = [params[axis_input_name].asnumpy()[0]] + axis_input_value = [_get_num_param(params, inputs[1])] except (IndexError, KeyError): raise TypeError( \ "Unsupported argument for `{}` : `axis` should be a constant".format(func_name)) - return func(inputs[0], axis=axis_input_vlaue, keepdims=False) + return func(inputs[0], axis=axis_input_value, keepdims=False) return _impl def _elemwise(name): - def _impl(inputs, attr, *args): + def _impl(inputs, attr, params): assert len(inputs) == 2, "{} take 2 inputs, {} given".format(name, len(inputs)) return _get_relay_op(name)(*inputs) return _impl @@ -450,6 +484,54 @@ def _impl(inputs, attr, params): return inputs[0] return _impl +def _crop_and_resize(): + def _impl(inputs, attr, params): + # input image is a 4-D tensor of shape [batch, image_height, image_width, depth] + # boxes is a 2-D tensor of shape [num_boxes, 4], 4 is for [y1, x1, y2, x2] + try: + boxes = params.pop(inputs[1].name_hint).asnumpy().tolist() + box_ind = params.pop(inputs[2].name_hint).asnumpy().tolist() + crop_size = params.pop(inputs[3].name_hint).asnumpy().tolist() + except (IndexError, KeyError): + boxes = _infer_value(inputs[1], params).asnumpy().tolist() + box_ind = _infer_value(inputs[2], params).asnumpy().tolist() + crop_size = _infer_value(inputs[3], params).asnumpy().tolist() + + data_shape = attr['_input_shapes'][inputs[0]] + data_dim = len(data_shape) + method = attr['method'].decode() + + attrs = {} + attrs['size'] = crop_size + attrs['layout'] = 'NHWC' + if method.lower() == 'nearest': + raise tvm.error.OpAttributeUnimplemented( + 'Attribute method=nearest is not supported') + else: + attrs['align_corners'] = True + attrs['method'] = 'BILINEAR' + + out = None + begin = [0] * data_dim + size = data_shape[:] + for idx in box_ind: + # 1) Crop + # y is mapped to the image coordinate at y * (image_height - 1) + # x is mapped to the image coordinate at x * (image_width - 1) + begin[0] = idx + 
begin[1] = int(round(boxes[idx][0] * (data_shape[1] - 1))) + begin[2] = int(round(boxes[idx][1] * (data_shape[2] - 1))) + size[0] = idx + 1 + size[1] = int(round((data_shape[1] - 1) * boxes[idx][2])) + 1 + size[2] = int(round((data_shape[2] - 1) * boxes[idx][3])) + 1 + res_crop = _op.strided_slice(inputs[0], begin=begin, end=size) + + # 2) Resize + res_resize = _get_relay_op('resize')(res_crop, **attrs) + out = _op.concatenate([out, res_resize], axis=0) if out else res_resize + return out + return _impl + def _cast(): def _impl(inputs, attr, params): return inputs[0].astype(attr['DstT'].name) @@ -458,14 +540,19 @@ def _impl(inputs, attr, params): def _expand_dims(): def _impl(inputs, attr, params): dim_input = inputs.pop(1) - axis = params.pop(_get_name_hint(dim_input)).asnumpy()[0] + axis = _get_num_param(params, dim_input) return AttrCvt(op_name="expand_dims", ignores=['Tdim', 'N'], extras={'axis': int(axis), 'num_newaxis': 1})(inputs, attr) return _impl def _resize_bilinear(): def _impl(inputs, attr, params): - attr['size'] = attr['_output_shapes'][0][1:3] + size = attr['_output_shapes'][0][1:3] + # Important that the size is defined. If an axis is not, we need to infer what + # the shape should be. 
+ if -1 in size: + size = _infer_value(inputs[1], params).asnumpy().reshape([-1]).tolist() + attr['size'] = size inputs.pop(1) # NHWC attr['layout'] = 'NHWC' @@ -475,6 +562,21 @@ def _impl(inputs, attr, params): extras={'method': "BILINEAR"})(inputs, attr) return _impl +def _resize_nearest_neighbor(): + def _impl(inputs, attr, params): + size = attr['_output_shapes'][0][1:3] + if -1 in size: + size = _infer_value(inputs[1], params).asnumpy().reshape([-1]).tolist() + attr['size'] = size + inputs.pop(1) + # NHWC + attr['layout'] = 'NHWC' + + return AttrCvt(op_name="resize", + ignores=['Tdim'], + extras={'method': "NEAREST_NEIGHBOR"})(inputs, attr) + return _impl + def _check_numerics(): def _impl(inputs, attr, params): # Making a copy node assuming no need to verify @@ -508,21 +610,19 @@ def _impl(inputs, attr, params): def _concatV2(): def _impl(inputs, attr, params): pop_node = inputs.pop(len(inputs)-1) - axis = params[pop_node.name_hint] - params.pop(pop_node.name_hint) + axis = int(_get_num_param(params, pop_node)) return AttrCvt( op_name="concatenate", ignores=['T', 'N', 'Tidx'], - extras={'axis': int(axis.asnumpy()[0])})([inputs], attr) + extras={'axis': axis})([inputs], attr) return _impl def _concat(): def _impl(inputs, attr, params): pop_node = inputs.pop(0) - axis = params[pop_node.name_hint] - params.pop(pop_node.name_hint) + axis = int(_get_num_param(params, pop_node)) return AttrCvt( op_name="concatenate", ignores=['N'], - extras={'axis': int(axis.asnumpy()[0])})([inputs], attr) + extras={'axis': axis})([inputs], attr) return _impl def _pack(): @@ -546,8 +646,8 @@ def _impl(inputs, attr, params): def _slice(): def _impl(inputs, attr, params): - begin = params.pop(_get_name_hint(inputs[1])).asnumpy().tolist() - size = params.pop(_get_name_hint(inputs[2])).asnumpy().tolist() + begin = _get_list_param(params, inputs[1]) + size = _get_list_param(params, inputs[2]) data_shape = attr['_input_shapes'][inputs[0]] data_dim = len(data_shape) end = size @@ -556,43 
+656,83 @@ def _impl(inputs, attr, params): end[i] = data_shape[i] - begin[i] else: end[i] += begin[i] - return _op.strided_slice(inputs[0], begin=begin, end=size) + return _op.strided_slice(inputs[0], begin=begin, end=end) return _impl def _reshape(): def _impl(inputs, attr, params): + pop_node = inputs.pop(1) try: - pop_node = inputs[1] - shape_arg = params.pop(pop_node.name_hint) - inputs.pop(1) - - return AttrCvt( - op_name="reshape", - extras={'newshape':tuple(shape_arg.asnumpy())}, - ignores=['Tshape'])(inputs, attr) + shape_arg = _get_tuple_param(params, pop_node) except AttributeError: # Shape operator is already pruned, hence # try to infer shape by precompute prune if possible. - func = _expr.Function(ir_pass.free_vars(inputs[1]), inputs[1]) - with tvm.relay.build_config(opt_level=0): - graph, lib, params = tvm.relay.build(func, target="llvm", params=params) - ctx = tvm.context("llvm", 0) - from tvm.contrib import graph_runtime - m = graph_runtime.create(graph, lib, ctx) - m.set_input(**params) - m.run() - params_new = m.get_output(0) - inputs.pop(1) - return AttrCvt( - op_name="reshape", - extras={'newshape':tuple(params_new.asnumpy().astype('int64').flatten())}, - ignores=['Tshape'])(inputs, attr) + params_new = _infer_value(pop_node, params) + shape_arg = tuple(params_new.asnumpy().astype('int64').flatten()) + return AttrCvt( + op_name="reshape", + extras={'newshape': shape_arg}, + ignores=['Tshape'])(inputs, attr) + return _impl + + +def _depth_to_space(): + def _impl(inputs, attr, params): + # Need to handle data layouts differently. + input_shape = attr['_input_shapes'][inputs[0]] + block_size = int(attr['block_size']) + if attr['data_format'].decode("utf-8") == 'NHWC': + in_n, in_h, in_w, in_c = input_shape + new_c = int(in_c / (block_size * block_size)) + + # First expand input to larger dimension. + expanded = _op.reshape( + inputs[0], newshape=(in_n, in_h, in_w, block_size, block_size, new_c)) + # Now reorder to expand spatial blocks. 
+ transposed = _op.transpose(expanded, axes=(0, 1, 3, 2, 4, 5)) + # Finally reshape to proper output. + new_h = in_h * block_size + new_w = in_w * block_size + newshape = (in_n, new_h, new_w, new_c) + + else: # Handle NCHW layout + in_n, in_c, in_h, in_w = input_shape + new_c = int(in_c / (block_size * block_size)) + + expanded = _op.reshape( + inputs[0], newshape=(in_n, block_size, block_size, new_c, in_h, in_w)) + transposed = _op.transpose(expanded, axes=(0, 3, 4, 1, 5, 2)) + new_h = in_h * block_size + new_w = in_w * block_size + newshape = (in_n, new_c, new_h, new_w) + + return AttrCvt( + op_name="reshape", + extras={'newshape': newshape}, + ignores=['data_format', 'block_size'])([transposed], attr) + return _impl + def _bias_add(): def _impl(inputs, attr, params): - return _op.add(inputs[0], inputs[1]) + # Must expand for proper broadcasting in NCHW. + if attr['data_format'].decode("utf-8") == 'NCHW': + bias = _op.reshape(inputs[1], newshape=(1, -1, 1, 1)) + else: + bias = inputs[1] + return _op.add(inputs[0], bias) + return _impl + +def _broadcast_to(): + def _impl(inputs, attr, params): + if isinstance(inputs[1], _expr.Var): + shape = params[inputs[1].name_hint] + else: + shape = _infer_value(inputs[1], params) + shape = list(shape.asnumpy().reshape([-1])) + return _op.broadcast_to(inputs[0], shape) return _impl def _squeeze(): @@ -666,9 +806,16 @@ def _impl(inputs, attr, params): def _fill(): def _impl(inputs, attr, params): - fill_arg = params.pop(inputs.pop(1).name_hint) - return _op.full(tvm.relay.const(fill_arg.asnumpy()[0], attr['T'].name), - attr['_output_shapes'][0], attr['T'].name) + output_shape = attr['_output_shapes'][0] + # Output shape must be defined to avoid errors. If any axis is not, we must + # try to compute its shape. 
+ if -1 in output_shape: + output_shape = _infer_value(inputs[0], params).asnumpy().reshape([-1]).tolist() + + fill_arg = _get_num_param(params, inputs.pop(1)) + dtype = attr['T'].name + return _op.full(tvm.relay.const(fill_arg, dtype), + output_shape, dtype) return _impl def _lrn(): @@ -685,12 +832,21 @@ def _impl(inputs, attr, params): return _impl def _sum(): + def _impl(inputs, attr, params): + axis = _get_tuple_param(params, inputs[1]) + return AttrCvt( + op_name='sum', + extras={'axis': axis}, + transforms={'keep_dims':'keepdims'}, + ignores=['name', 'Tidx'])([inputs[0]], attr) + return _impl + +def _reduce(op): def _impl(inputs, attr, params): axis = params.pop(inputs[1].name_hint).asnumpy() - # convert to tuple for preventing invalid parameter format error axis = tuple(axis) return AttrCvt( - op_name='sum', + op_name=op, extras={'axis': axis}, transforms={'keep_dims':'keepdims'}, ignores=['name', 'Tidx'])([inputs[0]], attr) @@ -704,24 +860,24 @@ def _impl(inputs, attr, params): def _gather(): "GatherV2, Gather" def _impl(inputs, attr, params): - - axis = 0 if len(inputs) > 2: - axis = params[inputs.pop(2).name_hint].asnumpy()[0] - new_input = [] - new_input.append(inputs.pop(0)) - new_input.append(inputs.pop(0)) + axis = _get_num_param(params, inputs.pop(2)) + else: + axis = 0 + new_input = inputs[0:2] return AttrCvt(op_name="take", extras={'axis': tvm.const(axis, 'int32')}, - ignores=['Tindices', 'Tparams', 'validate_indices', \ + ignores=['Tindices', 'Tparams', 'validate_indices', 'Taxis', '_class'])(new_input, attr) return _impl -def _infer_out_shapes(inputs, params): - """A method to get the output shape of an intermediate node in the relay graph.""" - out_type = ir_pass.infer_type(inputs) - out_shapes = [get_const_tuple(out_type.checked_type.shape)] - return out_shapes +def _gather_nd(): + """GatherNd""" + def _impl(inputs, attr, params): + return AttrCvt(op_name="gather_nd", + ignores=['Tindices', 'Tparams',\ + 'Taxis', '_class'])(inputs, attr) + 
return _impl def _stridedSlice(): def _impl(inputs, attr, params): @@ -730,9 +886,9 @@ def _impl(inputs, attr, params): Tensorflow mask validation: https://github.com/tensorflow/tensorflow/blob/master/ tensorflow/core/util/strided_slice_op.cc#L147-L368 """ - begin = params.pop(inputs[1].name_hint).asnumpy().tolist() - end = params.pop(inputs[2].name_hint).asnumpy().tolist() - stride = params.pop(inputs[3].name_hint).asnumpy().tolist() + begin = _get_list_param(params, inputs[1]) + end = _get_list_param(params, inputs[2]) + stride = _get_list_param(params, inputs[3]) begin_mask = int(attr.get('begin_mask', 0)) end_mask = int(attr.get('end_mask', 0)) ellipsis_mask = int(attr.get('ellipsis_mask', 0)) @@ -807,7 +963,7 @@ def _transform_mask(stride_dim, ellipsis_mask): if begin_mask or end_mask or ellipsis_mask or new_axis_mask or shrink_axis_mask: begin, end, stride, fshape_indices = _transform_mask(stride_dim, ellipsis_mask) out = _op.strided_slice(inputs[0], begin=begin, end=end, strides=stride) - out_shape = _infer_out_shapes(out, params)[0] + out_shape = _infer_shape(out, params) if not fshape_indices: fshape_indices = range(len(out_shape)) @@ -828,19 +984,14 @@ def _transform_mask(stride_dim, ellipsis_mask): def _pad(name): def _impl(inputs, attr, params): - padlist_key = inputs[1].name_hint - if padlist_key in params: - padlist = params.pop(padlist_key).asnumpy() - else: - raise tvm.error.OpAttributeRequired( - 'Attribute {} not found in operator Pad.'.format(padlist_key)) - paddings = tuple([tuple(l) for l in padlist]) + padlist = _get_param(params, inputs[1]) + paddings = tuple(tuple(l) for l in padlist) attr['pad_width'] = paddings attr['pad_value'] = 0 new_inputs = [inputs[0]] if name == 'PadV2': - constant_values = params.pop(inputs[2].name_hint).asnumpy() - attr['pad_value'] = constant_values[0] + constant_values = _get_num_param(params, inputs[2]) + attr['pad_value'] = constant_values return AttrCvt( op_name='pad', ignores=['Tpaddings'],)(new_inputs, attr) 
@@ -850,10 +1001,9 @@ def _transpose(): def _impl(inputs, attr, params): # If perm is not specified, axes is left empty, # otherwise its value is get from params - param_name = _get_name_hint(inputs[1]) - if param_name in params: - axes = tuple(params.get(param_name).asnumpy()) - else: + try: + axes = _get_list_param(params, inputs[1]) + except (IndexError, KeyError): axes = None return _op.transpose(inputs[0], axes=axes) return _impl @@ -863,9 +1013,16 @@ def _impl(inputs, attr, params): return AttrCvt(op_name="where")(inputs, attr) return _impl +def _clip_by_value(): + def _impl(inputs, attr, params): + a_min = params.pop(inputs[1].name_hint).asnumpy()[0] + a_max = params.pop(inputs[2].name_hint).asnumpy()[0] + return _op.clip(inputs[0], a_min=a_min, a_max=a_max) + return _impl + def _reverse_v2(): def _impl(inputs, attr, params): - axis = params.pop(inputs[1].name_hint).asnumpy()[0] + axis = _get_num_param(params, inputs[1]) return AttrCvt( op_name="reverse", ignores=['Tidx'], @@ -887,37 +1044,42 @@ def _impl(inputs, attr, params): def _range(): def _impl(inputs, attr, params): start = params.pop(inputs[0].name_hint).asnumpy()[0] - limit = params.pop(inputs[1].name_hint).asnumpy()[0] + limit = params.pop(inputs[1].name_hint).asnumpy()[0] \ + if hasattr(inputs[1], "name_hint") else params.pop('Rank').asnumpy()[0] delta = params.pop(inputs[2].name_hint).asnumpy()[0] - - name = attr["_node_name"] - params[name] = tvm.nd.array([start, limit, delta]) - return [_expr.var(name, - shape=params[name].shape, - dtype='int32')] + dtype = attr['dtype'].name if 'dtype' in attr else "int32" + return AttrCvt( + op_name="arange", + ignores=['Tidx'], + extras={'start': start, + "stop": limit, + 'step': delta, + 'dtype': dtype})([], attr) return _impl def _elu(): def _impl(inputs, attr, params): - alpha = tvm.relay.const(-1.0, attr['T'].name) - return alpha * _op.nn.relu(tvm.relay.const(1, attr['T'].name) \ + dtype = attr['T'].name + alpha = tvm.relay.const(-1.0, dtype) + return 
alpha * _op.nn.relu(tvm.relay.const(1, dtype) \ - _op.exp(inputs[0])) + _op.nn.relu(inputs[0]) return _impl def _selu(): def _impl(inputs, attr, params): - alpha = tvm.relay.const(-1.6732632423543772848170429916717, attr['T'].name) - gamma = tvm.relay.const(1.0507009873554804934193349852946, attr['T'].name) - return gamma * (alpha * _op.nn.relu(tvm.relay.const(1, attr['T'].name) \ + dtype = attr['T'].name + alpha = tvm.relay.const(-1.6732632423543772848170429916717, dtype) + gamma = tvm.relay.const(1.0507009873554804934193349852946, dtype) + return gamma * (alpha * _op.nn.relu(tvm.relay.const(1, dtype) \ - _op.exp(inputs[0])) + _op.nn.relu(inputs[0])) return _impl def _mean(): def _impl(inputs, attr, params): - axis = params.pop(inputs[1].name_hint) + axis = _get_tuple_param(params, inputs[1]) return AttrCvt(op_name="mean", ignores=['Tdim', 'Tidx'], transforms={'keep_dims': 'keepdims'}, - extras={'axis': tuple(axis.asnumpy())})([inputs[0]], attr) + extras={'axis': axis})([inputs[0]], attr) return _impl def _broadcast(name): @@ -943,8 +1105,7 @@ def _impl(inputs, attr, params): if has_size_vector: input_node_index = 0 input_axis_index = 2 - size_splits_input_name = _get_name_hint(inputs[1]) - size_splits = params[size_splits_input_name].asnumpy() + size_splits = _get_param(params, inputs[1]) section_beginnings = np.cumsum(size_splits)[:-1] indices_or_sections = tuple(section_beginnings) else: @@ -952,8 +1113,7 @@ def _impl(inputs, attr, params): input_axis_index = 0 indices_or_sections = attr['num_split'] input_node = inputs[input_node_index] - axis_input_name = _get_name_hint(inputs[input_axis_index]) - axis_input_value = params[axis_input_name].asnumpy()[0] + axis_input_value = _get_num_param(params, inputs[input_axis_index]) except (IndexError, KeyError): raise TypeError( \ "Unsupported argument for split: `axis` and `num_or_size_splits` " \ @@ -990,6 +1150,37 @@ def _impl(inputs, attr, params): transforms={'axis': ('axis', 1)})([inputs[0]], attr) return _impl 
+def _softplus(): + # op description: https://www.tensorflow.org/api_docs/python/tf/math/softplus + def _impl(inputs, attr, params): + exp_out = AttrCvt('exp')(inputs, attr) + inputs.append(tvm.relay.const(1, attr['T'].name)) + rh = tvm.relay.const(1, attr['T'].name) + add_out = _get_relay_op('add')(exp_out, rh) + return _get_relay_op('log')(add_out) + return _impl + +def _topk(): + def _impl(inputs, attr, params): + k = int(params.pop(inputs.pop(1).name_hint).asnumpy()) + if k < 1: + raise tvm.error.OpAttributeInvalid( + 'Attribute k must be positive in operator TopKV2') + if attr['sorted'] is False: + raise tvm.error.OpAttributeUnimplemented( + 'Attribute sorted=False is not supported in operator TopKV2') + return AttrCvt(op_name='topk', + ignores=['sorted'], + extras={'k': k, 'is_ascend': False, 'dtype': 'int32'})(inputs, attr) + return _impl + +def _floordiv(): + def _impl(inputs, attr, params): + assert len(inputs) == 2 + div = AttrCvt('divide')(inputs, attr) + return _get_relay_op('floor')(div) + return _impl + def _logical(name): def _impl(inputs, attr, params): return AttrCvt(op_name=name)(inputs, attr) @@ -999,8 +1190,8 @@ def _space_to_batch_nd(): def _impl(inputs, attr, params): input_node = inputs[0] input_shape = attr['_input_shapes'][input_node] - block_shape = params.pop(inputs[1].name_hint).asnumpy().tolist() - paddings = params.pop(inputs[2].name_hint).asnumpy().tolist() + block_shape = _get_list_param(params, inputs[1]) + paddings = _get_list_param(params, inputs[2]) N = len(input_shape) M = len(block_shape) batch = input_shape[0] @@ -1021,7 +1212,7 @@ def _impl(inputs, attr, params): axes = [2 * i + 2 for i in range(M)] + [0] + [2 * i + 1 for i in range(M)] + \ list(range(1 + 2 * M, 1 + 2 * M + remaining_shape_length)) permuted_reshaped_padded = tvm.relay.transpose(reshaped_padded, axes=axes) - permuted_reshaped_padded_shape = _infer_out_shapes(permuted_reshaped_padded, params)[0] + permuted_reshaped_padded_shape = 
_infer_shape(permuted_reshaped_padded, params) # Reshape permuted_reshaped_padded to flatten block_shape into the batch dimension, # producing an output tensor of shape: # [batch * prod(block_shape)] + [padded_shape[1] / block_shape[0], ..., @@ -1038,8 +1229,8 @@ def _batch_to_space_nd(): def _impl(inputs, attr, params): input_node = inputs[0] input_shape = attr['_input_shapes'][input_node] - block_shape = params.pop(inputs[1].name_hint).asnumpy().tolist() - crops = params.pop(inputs[2].name_hint).asnumpy().tolist() + block_shape = _get_list_param(params, inputs[1]) + crops = _get_list_param(params, inputs[2]) M = len(block_shape) batch = input_shape[0] # From https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/batch-to-space-n-d: @@ -1064,7 +1255,7 @@ def _impl(inputs, attr, params): # [batch / prod(block_shape), input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1], # ..., input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1], # input_shape[M+1], ..., input_shape[N-1]] - reshaped_permuted_shape = _infer_out_shapes(reshaped_permuted, params)[0] + reshaped_permuted_shape = _infer_shape(reshaped_permuted, params) cropped = reshaped_permuted for axis in range(1, M+1): crop = crops[axis - 1] @@ -1098,48 +1289,62 @@ def _impl(inputs, attr, params): # for 1 to N mapping(composed), use custom callable functions # for N to 1 mapping, currently not supported(?) 
_convert_map = { + 'Abs' : AttrCvt('abs'), 'Add' : _elemwise('add'), + 'All' : _reduce('all'), 'ArgMax' : _argx(_op.argmax, 'argmax'), 'ArgMin' : _argx(_op.argmin, 'argmin'), 'AvgPool' : _pooling('avg_pool'), 'BatchNormWithGlobalNormalization' : _batch_norm(), 'BatchToSpaceND' : _batch_to_space_nd(), 'BiasAdd' : _bias_add(), + 'BroadcastTo' : _broadcast_to(), 'Cast' : _cast(), 'Ceil' : AttrCvt('ceil'), 'CheckNumerics' : _check_numerics(), + 'ClipByValue' : _clip_by_value(), 'Concat' : _concat(), 'ConcatV2' : _concatV2(), 'Conv2D' : _conv('conv'), + 'CropAndResize' : _crop_and_resize(), 'DecodeJpeg' : _decode_image(), 'DepthwiseConv2dNative' : _conv('depthwise'), + 'DepthToSpace' : _depth_to_space(), 'Equal' : _broadcast('equal'), 'Elu' : _elu(), 'Exp' : AttrCvt('exp'), 'ExpandDims' : _expand_dims(), 'Fill' : _fill(), 'Floor' : AttrCvt('floor'), + 'FloorDiv' : _floordiv(), 'FusedBatchNorm' : _fused_batch_norm(), 'FusedBatchNormV2' : _fused_batch_norm(), 'Gather' : _gather(), + 'GatherNd' : _gather_nd(), 'GatherV2' : _gather(), 'Greater' : _broadcast('greater'), 'GreaterEqual' : _broadcast('greater_equal'), 'Identity' : _identity(), 'LeakyRelu' : AttrCvt('leaky_relu'), + 'LeftShift' : AttrCvt('left_shift'), 'Less' : _broadcast('less'), 'LessEqual' : _broadcast('less_equal'), 'Log' : AttrCvt('log'), 'LogicalAnd' : _logical('logical_and'), 'LogicalOr' : _logical('logical_or'), 'LogicalNot' : _logical('logical_not'), + 'LogSoftmax' : AttrCvt('log_softmax'), 'LRN' : _lrn(), 'MatMul' : _matmul(), + 'Max' : _reduce('max'), 'MaxPool' : _pooling('max_pool'), 'Maximum' : _elemwise('maximum'), 'Mean' : _mean(), + 'Min' : _reduce('min'), 'Minimum' : _elemwise('minimum'), + 'Mod' : _elemwise('mod'), 'Mul' : _elemwise('multiply'), + 'Neg' : AttrCvt('negative'), 'NotEqual' : _broadcast('not_equal'), 'Pack' : _pack(), 'Pad' : _pad('Pad'), @@ -1148,12 +1353,15 @@ def _impl(inputs, attr, params): 'Prod' : _prod(), 'Range' : _range(), 'Rank' : _rank(), - 'RealDiv' : _elemwise('div'), 
+ 'RealDiv' : _elemwise('divide'), 'Relu' : AttrCvt('relu'), 'Relu6' : _relu6(), 'Reshape' : _reshape(), 'ResizeBilinear' : _resize_bilinear(), + 'ResizeBicubic' : _resize_bilinear(), + 'ResizeNearestNeighbor' : _resize_nearest_neighbor(), 'ReverseV2' : _reverse_v2(), + 'RightShift' : AttrCvt('right_shift'), 'Round' : AttrCvt('round'), 'Rsqrt' : _rsqrt(), 'Select' : _where(), @@ -1163,9 +1371,11 @@ def _impl(inputs, attr, params): 'Sign' : AttrCvt('sign'), 'Slice' : _slice(), 'Softmax' : _softmax(), + 'Softplus' : _softplus(), 'SpaceToBatchND' : _space_to_batch_nd(), 'Split' : _split(False), 'SplitV' : _split(True), + 'Sqrt' : AttrCvt('sqrt'), 'Square' : _square(), 'Squeeze' : _squeeze(), 'StridedSlice' : _stridedSlice(), @@ -1173,8 +1383,11 @@ def _impl(inputs, attr, params): 'Sum' : _sum(), 'Tanh' : AttrCvt('tanh'), 'Tile' : _tile(), + 'TopKV2' : _topk(), 'Transpose' : _transpose(), + 'TruncateMod' : _elemwise('mod'), 'Unpack' : _unpack(), + 'ZerosLike' : AttrCvt('zeros_like'), } @@ -1458,7 +1671,7 @@ def _in_while_loop(control_flow_node_map, op_name): Parameters ---------- control_flow_node_map : Dict[str, Set[str]] - A dictionay contains the unqiue control flow execution frame name to + A dictionay contains the unique control flow execution frame name to a set of primitive operators mapping. op_name : str @@ -1510,7 +1723,7 @@ def f2(): return tf.add(4, 23) r = tf.cond(tf.less(i, j), f1, f2) - This condition statement should be coverted into Relay in the following + This condition statement should be converted into Relay in the following form: .. code-block:: python @@ -1618,7 +1831,7 @@ def __init__(self): self._loop = None def _while_loop(self): - """An internal API to create a Relay recurisve call for a matched TF + """An internal API to create a Relay recursive call for a matched TF `while_loop` construct. 
""" wl = tvm.relay.var('while_loop') @@ -1676,9 +1889,10 @@ def __init__(self): self._input_shapes = {} self._loops = {} self._branches = {} + self._mod = _module.Module({}) def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): - """Construct relay nodes from tensorflow graph definition - GraphDef. + """Construct relay nodes from tensorflow graph definition - GraphDef. Follow the tensorflow graph definition to parse and convert it to Relay. Some of the assumptions listed below. @@ -1687,7 +1901,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): -> All Const nodes are params. -> Last node is assumed as graph output. -> _output_shapes : Graph should be frozen with add_shapes=True. - Or user can pass input shape dictionaly optionally. + Or user can pass input shape dictionary optionally. -> DecodeJpeg, ResizeBilinear: These are dummy operators. Hence user should handle preprocessing outside. -> CheckNumerics: No implementation as of now for this. @@ -1704,10 +1918,14 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): shape : Dictionary of input dimensions (Optional) Graph level input shape dictionary. + outputs : List of output tensor names (Optional) + if not specified then the last node is assumed as graph output. + Returns ------- - sym : relay.op - The returned relay operator + mod : tvm.relay.Module + The module that optimizations will be performed on. + params : dict A dict of name: tvm.nd.array pairs, used as pretrained weights """ @@ -1728,7 +1946,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): for node in graph.node: node_name_prefix = node.name.rsplit('/', 1)[0] control_flow_node_map[node_name_prefix].add(node.op) - if node.op == 'Placeholder': + if node.op == 'Placeholder' or node.op == 'PlaceholderWithDefault': # Give priority to user argument. 
if shape and node.name in shape: self._input_shapes[node.name] = list(shape[node.name]) @@ -1788,7 +2006,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): attr = self._parse_attr(node.attr) - elif node.op != "Placeholder": + elif node.op != "Placeholder" and node.op != 'PlaceholderWithDefault': # Pass the parsed shapes instead attr["_output_shapes"] = output_shapes = self._output_shapes[node.name] @@ -1858,23 +2076,17 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): # Infer shapes even without specifying "add_shapes=True" if output_shapes == [None]: - out_shapes = [] - for node_item in self._nodes[node.name]: - out_type = ir_pass.infer_type(node_item) - out_shapes.append(get_const_tuple(out_type.checked_type.shape)) + out_shapes = [_infer_shape(node_item) for node_item in self._nodes[node.name]] self._output_shapes[node.name] = out_shapes if self._output_shapes[node.name] and shape and node.name in shape: assert self._output_shapes[node.name] == list(shape[node.name]) - # Infer shapes if passed explicitely + # Infer shapes if passed explicitly node_output = self._nodes[node.name] if shape and (not self._output_shapes[node.name][0] or -1 in self._output_shapes[node.name][0]): - out_shapes = [] - for node_item in node_output: - out_type = ir_pass.infer_type(node_item) - out_shapes.append(get_const_tuple(out_type.checked_type.shape)) + out_shapes = [_infer_shape(node_item) for node_item in node_output] self._output_shapes[node.name] = out_shapes out = [] @@ -1902,8 +2114,8 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): out = out[0] if len(out) == 1 else _expr.Tuple(out) func = _expr.Function(ir_pass.free_vars(out), out) - - return func, self._params + self._mod[self._mod.entry_func] = func + return self._mod, self._params def _parse_import_prerequisites(self, graph): """ Calculate the named preconditions from TensorFlow `graph`. 
@@ -1913,7 +2125,7 @@ def _parse_import_prerequisites(self, graph): """ missing_operators = set() for node in graph.node: - if node.op == "Placeholder": + if node.op == "Placeholder" or node.op == 'PlaceholderWithDefault': pass elif node.op == "Const": pass @@ -2133,7 +2345,7 @@ def _convert_control_flow_operator(self, node, inputs, attrs, control_flow_node_ def _convert_operator(self, op_name, inputs, attrs, graph, identity_list=None, convert_map=None): """Convert from Tensorflow operator to relay operator. - The converter must specify conversions explicity for incompatible name, and + The converter must specify conversions explicitly for incompatible name, and apply handlers to operator attributes. Parameters @@ -2173,7 +2385,7 @@ def _convert_operator(self, op_name, inputs, attrs, def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None): - """ Load tensorflow graph which is a python tensorflow graph object into relay. + """Load tensorflow graph which is a python tensorflow graph object into relay. The companion parameters will be handled automatically. Parameters @@ -2181,14 +2393,23 @@ def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None): graph : GraphDef object Tensorflow GraphDef + layout : target layout to be used (Optional) + NCHW only supported now to enable NHWC models on GPU. + + shape : Dictionary of input dimensions (Optional) + Graph level input shape dictionary. + + outputs : List of output tensor names (Optional) + if not specified then the last node is assumed as graph output. + Returns ------- - sym : relay.op - Compatible relay operator + mod : tvm.relay.Module + The module that optimizations will be performed on. 
params : dict of str to tvm.ndarray Dict of converted parameters stored in tvm.ndarray format """ g = GraphProto() - sym, params = g.from_tensorflow(graph, layout, shape, outputs) - return sym, params + mod, params = g.from_tensorflow(graph, layout, shape, outputs) + return mod, params diff --git a/python/tvm/relay/frontend/tensorflow_parser.py b/python/tvm/relay/frontend/tensorflow_parser.py index 9cb7eabf0ea5..8105ef03aeca 100644 --- a/python/tvm/relay/frontend/tensorflow_parser.py +++ b/python/tvm/relay/frontend/tensorflow_parser.py @@ -18,7 +18,6 @@ from __future__ import absolute_import as _abs from __future__ import print_function import os -from tensorflow.core.framework import graph_pb2 from tvm.contrib import util @@ -35,12 +34,12 @@ class TFParser(object): -------- .. code-block:: python - parser = TfParser(model_dir) - graph = parser.parse() - # graph is related graphdef of the model + parser = TFParser(model_dir) + graphdef = parser.parse() """ def __init__(self, model_dir): + from tensorflow.core.framework import graph_pb2 self._tmp_dir = util.tempdir() self._model_dir = model_dir self._graph = graph_pb2.GraphDef() @@ -96,6 +95,7 @@ def _load_saved_model(self): from tensorflow.python.tools import freeze_graph from tensorflow.python.framework import ops from tensorflow.python.framework import graph_util + from tensorflow.core.framework import graph_pb2 except ImportError: raise ImportError( "InputConfiguration: Unable to import tensorflow which is " diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index ff62d89412e9..fe163871fa60 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -22,6 +22,7 @@ import tvm from .. import ir_pass from .. import expr as _expr +from .. import module as _module from .. import op as _op from ... 
import nd as _nd from .common import ExprTable @@ -59,12 +60,26 @@ def __init__(self, model, subgraph, exp_tab): 'DEPTHWISE_CONV_2D': self.convert_depthwise_conv2d, 'AVERAGE_POOL_2D': self.convert_average_pool2d, 'RESHAPE': self.convert_reshape, + 'RESIZE_BILINEAR': self.convert_resize_bilinear, + 'RESIZE_NEAREST_NEIGHBOR': self.convert_resize_nearest_neighbor, 'SOFTMAX': self.convert_softmax, 'SQUEEZE': self.convert_squeeze, 'MAX_POOL_2D': self.convert_max_pool2d, 'CONCATENATION': self.convert_concatenation, 'ADD': self.convert_add, + 'SUB': self.convert_sub, + 'MUL': self.convert_mul, + 'DIV': self.convert_div, + 'POW': self.convert_pow, + 'MAXIMUM': self.convert_maximum, + 'MINIMUM': self.convert_minimum, + 'REDUCE_MIN': self._convert_reduce_min, + 'REDUCE_MAX': self._convert_reduce_max, + 'MEAN': self._convert_reduce_mean, + 'REDUCE_PROD': self._convert_reduce_prod, 'FULLY_CONNECTED': self.convert_fully_connected, + 'PAD': self.convert_pad, + 'LOGISTIC': self.convert_logistic, } def check_unsupported_ops(self): @@ -112,7 +127,7 @@ def get_op_code_str(self, op): op_code_str = self.builtin_op_code[op_code_id] if op_code_id == BuiltinOperator.CUSTOM: # Custom operator - raise NotImplementedError("Not Support Custom Operator Now") + raise NotImplementedError("Custom operators are currently not supported") return op_code_str def get_input_tensors(self, op): @@ -155,7 +170,7 @@ def get_tensor_value(self, tensor_wrapper): if tensor_wrapper.tensor.Type() == TensorType.INT32: return np.frombuffer(tensor_wrapper.buffer.DataAsNumpy(), dtype=np.int32).reshape( tensor_wrapper.tensor.ShapeAsNumpy()) - raise NotImplementedError("Not support tensor type {}" + raise NotImplementedError("Tensor type {} is currently not supported" .format(str(tensor_wrapper.tensor.Type()))) def get_tensor_type_str(self, tensor_type): @@ -171,7 +186,8 @@ def get_tensor_type_str(self, tensor_type): return "float32" if tensor_type == TensorType.INT32: return "int32" - raise NotImplementedError("Not 
support tensor type {}".format(str(tensor_type))) + raise NotImplementedError("Tensor type {} is currently not supported" + .format(str(tensor_type))) def convert_conv2d(self, op): """Convert TFLite conv2d""" @@ -209,44 +225,79 @@ def convert_reshape(self, op): reshape_options = ReshapeOptions() reshape_options.Init(op_options.Bytes, op_options.Pos) target_shape = reshape_options.NewShapeAsNumpy() - input_shape_length = len(input_tensor.tensor.ShapeAsNumpy()) in_expr = self.get_expr(input_tensor_idx) + out = _op.reshape(in_expr, newshape=tuple(target_shape)) - if input_shape_length in (1, 2): - # The rule is channel first (after N but before H, W). - # length of 1 means N*H*W*C, do nothing. - # length of 2 means N*H*W, C, do nothing. - pass - elif input_shape_length == 3: - # convert N C H*W to N H*W C - in_expr = _op.transpose(in_expr, axes=(0, 2, 1)) - elif input_shape_length == 4: - # convert input to N H W C, then reshape to target shape, - # finally convert back if necessary - in_expr = _op.transpose(in_expr, axes=(0, 2, 3, 1)) - else: - msg = 'Input shape length {} for operator Reshape is not valid.' - raise tvm.error.OpAttributeInvalid(msg.format(input_shape_length)) + return out - out = _op.reshape(in_expr, newshape=tuple(target_shape)) + def _convert_resize(self, method, op): + """Generic method to Convert TFLite RESIZE operators""" + try: + from tflite.BuiltinOptions import BuiltinOptions + from tflite.Operator import Operator + from tflite.ResizeBilinearOptions import ResizeBilinearOptions + # ResizeNearestNeighborOptions was added in tflite v1.13 + tflite_ver = 1120 + if 'ResizeNearestNeighborOptions' in dir(BuiltinOptions): + from tflite.ResizeNearestNeighborOptions import ResizeNearestNeighborOptions + tflite_ver = 1130 + except ImportError: + raise ImportError("The tflite package must be installed") - # The rule is channel first. 
- # 1: N*H*W*C - # 2: N*H*W, C - # 3: N H W C, reshape to N H*W C, transpose to N C H*W - # 4: N H W C, transpose to N C H W - # add more if we need target shapes in future - if len(target_shape) == 1 or len(target_shape) == 2: - pass - elif len(target_shape) == 3: - out = _op.transpose(out, axes=(0, 2, 1)) - elif len(target_shape) == 4: - out = _op.transpose(out, axes=(0, 3, 1, 2)) - else: - raise tvm.error.OpAttributeInvalid( - 'Length of target shape must be between 1 and 5 for operator Reshape.') + assert isinstance(op, Operator) + input_tensors = self.get_input_tensors(op) + assert len(input_tensors) == 2, "input tensors length should be 2" + + # images, 4-D Tensor with shape NHWC. + input_tensor = input_tensors[0] + in_expr = self.get_expr(input_tensor.tensor_idx) + + # size - 1-D int32 Tensor of 2 elements: new_height, new_width + target_size = tuple(self.get_tensor_value(input_tensors[1])) + + # Options - align_corners (bool) + resize_options = None + align_corners = False + if method == "BILINEAR": + assert op.BuiltinOptionsType() == BuiltinOptions.ResizeBilinearOptions + resize_options = ResizeBilinearOptions() + elif tflite_ver >= 1130: + assert op.BuiltinOptionsType() == BuiltinOptions.ResizeNearestNeighborOptions + resize_options = ResizeNearestNeighborOptions() + + if resize_options is not None: + op_options = op.BuiltinOptions() + resize_options.Init(op_options.Bytes, op_options.Pos) + align_corners = resize_options.AlignCorners() + + # Use layout NHWC + out = _op.image.resize(in_expr, target_size, "NHWC", method, align_corners) + return out + + def convert_resize_bilinear(self, op): + """Convert TFLite RESIZE_BILINEAR""" + return self._convert_resize("BILINEAR", op) + + def convert_resize_nearest_neighbor(self, op): + """Convert TFLite RESIZE_NEAREST_NEIGHBOR""" + return self._convert_resize("NEAREST_NEIGHBOR", op) + + def convert_logistic(self, op): + """Convert TFLite LOGISTIC""" + try: + from tflite.Operator import Operator + except ImportError: 
+ raise ImportError("The tflite package must be installed") + + assert isinstance(op, Operator) + input_tensors = self.get_input_tensors(op) + assert len(input_tensors) == 1, "input tensors length should be 1" + + input_tensor = input_tensors[0] + in_expr = self.get_expr(input_tensor.tensor_idx) + out = _op.sigmoid(in_expr) return out def convert_softmax(self, op): @@ -269,7 +320,7 @@ def convert_softmax(self, op): return out def convert_concatenation(self, op): - """ convert TFLite concatenation""" + """Convert TFLite concatenation""" try: from tflite.Operator import Operator from tflite.ConcatenationOptions import ConcatenationOptions @@ -292,15 +343,6 @@ def convert_concatenation(self, op): concatenation_options.Init(op_options.Bytes, op_options.Pos) concatenation_axis = concatenation_options.Axis() fused_activation_fn = concatenation_options.FusedActivationFunction() - input_shape_length = len(input_tensors[0].tensor.ShapeAsNumpy()) - - # TFLite is N H W C, our layout is N C H W - if input_shape_length <= 4: - axis_convert_map = [0] + list(range(2, input_shape_length)) + [1] - concatenation_axis = axis_convert_map[concatenation_axis] - else: - raise NotImplementedError("Not support input shape length {} of concatenatio : " - .format(str(input_shape_length))) # with axis in N H W C out = _op.concatenate(in_exprs, axis=concatenation_axis) @@ -310,10 +352,16 @@ def convert_concatenation(self, op): out = self.convert_fused_activation_function(out, fused_activation_fn) return out - def convert_add(self, op): - """Convert TFLite add""" + def _convert_elemwise(self, relay_op, op): + """Generic method to Convert TFLite elemwise""" try: from tflite.Operator import Operator + from tflite.AddOptions import AddOptions + from tflite.SubOptions import SubOptions + from tflite.MulOptions import MulOptions + from tflite.DivOptions import DivOptions + from tflite.BuiltinOptions import BuiltinOptions + from tflite.ActivationFunctionType import ActivationFunctionType except 
ImportError: raise ImportError("The tflite package must be installed") @@ -326,33 +374,105 @@ def convert_add(self, op): rhs_tensor = input_tensors[1] if self.has_expr(rhs_tensor.tensor_idx): - # In most cases, we can assume that TOCO fuses ADD operators + # In most cases, we can assume that TOCO fuses elemwise operators # with constants - it means both will be tensors. rhs_expr = self.get_expr(rhs_tensor.tensor_idx) else: - # However, in some corner cases, the ADD operator is not fused, + # However, in some corner cases, the elemwise operator is not fused, # we can receive as constant. rhs_type_str = self.get_tensor_type_str(rhs_tensor.tensor.Type()) rhs_expr = self.exp_tab.new_const(self.get_tensor_value(rhs_tensor), dtype=rhs_type_str) + out = relay_op(lhs_expr, rhs_expr) + + # Options (fused_activation_function) + options = None + if op.BuiltinOptionsType() == BuiltinOptions.AddOptions: + options = AddOptions() + elif op.BuiltinOptionsType() == BuiltinOptions.SubOptions: + options = SubOptions() + elif op.BuiltinOptionsType() == BuiltinOptions.MulOptions: + options = MulOptions() + elif op.BuiltinOptionsType() == BuiltinOptions.DivOptions: + options = DivOptions() + + if options is not None: + op_options = op.BuiltinOptions() + options.Init(op_options.Bytes, op_options.Pos) + fused_activation_fn = options.FusedActivationFunction() + # if we have activation fn + if fused_activation_fn != ActivationFunctionType.NONE: + out = self.convert_fused_activation_function(out, fused_activation_fn) - # In this case, we have to be careful about formatting. - input_shape_length = len(rhs_tensor.tensor.ShapeAsNumpy()) - if input_shape_length in (1, 2): - pass - elif input_shape_length == 3: - # N H*W C to N C H*W - rhs_expr = _op.transpose(rhs_expr, axes=(0, 2, 1)) - elif input_shape_length == 4: - # N H W C to N C H W - rhs_expr = _op.transpose(rhs_expr, axes=(0, 3, 1, 2)) - else: - msg = 'Input shape length {} for operator ADD is not valid.' 
- raise tvm.error.OpAttributeInvalid(msg.format(input_shape_length)) + return out + + def convert_add(self, op): + """Convert TFLite ADD""" + return self._convert_elemwise(_op.add, op) + + def convert_sub(self, op): + """Convert TFLite SUB""" + return self._convert_elemwise(_op.subtract, op) + + def convert_mul(self, op): + """Convert TFLite MUL""" + return self._convert_elemwise(_op.multiply, op) + + def convert_div(self, op): + """Convert TFLite DIV""" + return self._convert_elemwise(_op.divide, op) - out = _op.add(lhs_expr, rhs_expr) + def convert_pow(self, op): + return self._convert_elemwise(_op.power, op) + + def convert_maximum(self, op): + return self._convert_elemwise(_op.maximum, op) + + def convert_minimum(self, op): + return self._convert_elemwise(_op.minimum, op) + + def _convert_reduce(self, relay_op, op): + """Generic method to Convert TFLite MEAN operators""" + try: + from tflite.BuiltinOptions import BuiltinOptions + from tflite.Operator import Operator + from tflite.ReducerOptions import ReducerOptions + except ImportError: + raise ImportError("The tflite package must be installed") + + assert isinstance(op, Operator) + input_tensors = self.get_input_tensors(op) + assert len(input_tensors) == 2, "input tensors length should be 2" + + # input_tensor + input_tensor = input_tensors[0] + in_expr = self.get_expr(input_tensor.tensor_idx) + + # axis + axis = tuple(self.get_tensor_value(input_tensors[1])) + + # Options - keep_dims (bool) + assert op.BuiltinOptionsType() == BuiltinOptions.ReducerOptions + reduce_options = ReducerOptions() + op_options = op.BuiltinOptions() + reduce_options.Init(op_options.Bytes, op_options.Pos) + keep_dims = reduce_options.KeepDims() + + out = relay_op(in_expr, axis, keep_dims) return out + def _convert_reduce_min(self, op): + return self._convert_reduce(_op.reduce.min, op) + + def _convert_reduce_max(self, op): + return self._convert_reduce(_op.reduce.max, op) + + def _convert_reduce_mean(self, op): + return 
self._convert_reduce(_op.reduce.mean, op) + + def _convert_reduce_prod(self, op): + return self._convert_reduce(_op.reduce.prod, op) + def convert_fully_connected(self, op): """Convert TFLite fully connected""" try: @@ -440,46 +560,10 @@ def convert_squeeze(self, op): squeeze_options = SqueezeOptions() squeeze_options.Init(op_options.Bytes, op_options.Pos) squeeze_axis = squeeze_options.SqueezeDimsAsNumpy() - input_shape_length = len(input_tensor.tensor.ShapeAsNumpy()) - output_shape_length = len(output_tensors[0].tensor.ShapeAsNumpy()) in_expr = self.get_expr(input_tensor_idx) - - # TFLite is N H W C, our layout is N C H W - if input_shape_length in (1, 2): - # The rule is channel first (after N but before H, W). - # length of 1 means N*H*W*C, do nothing. - # length of 2 means N*H*W, C, do nothing. - pass - elif input_shape_length == 3: - # convert N C H*W to N H*W C - in_expr = _op.transpose(in_expr, axes=(0, 2, 1)) - elif input_shape_length == 4: - # convert input to N H W C, then reshape to target shape, - # finally convert back if necessary - in_expr = _op.transpose(in_expr, axes=(0, 2, 3, 1)) - else: - msg = 'Input shape length {} for operator Squeeze is not valid.' - raise tvm.error.OpAttributeInvalid(msg.format(input_shape_length)) - out = _op.squeeze(in_expr, axis=tuple(squeeze_axis)) - # The rule is channel first. - # 1: N*H*W*C - # 2: N*H*W, C - # 3: N H W C, reshape to N H*W C, transpose to N C H*W - # 4: N H W C, transpose to N C H W - # add more if we need target shapes in future - if output_shape_length in (1, 2): - pass - elif output_shape_length == 3: - out = _op.transpose(out, axes=(0, 2, 1)) - elif output_shape_length == 4: - out = _op.transpose(out, axes=(0, 3, 1, 2)) - else: - msg = 'Output shape length {} for operator Squeeze is not valid.' 
- raise tvm.error.OpAttributeInvalid(msg.format(output_shape_length)) - return out def convert_fused_activation_function(self, in_expr, fused_activation_fn): @@ -535,8 +619,8 @@ def convert_conv(self, op, conv_type): conv_options = DepthwiseConv2DOptions() conv_options.Init(op_options.Bytes, op_options.Pos) depth_multiplier = conv_options.DepthMultiplier() - assert depth_multiplier == 1, "TF frontend have transformed it be 1 " \ - "no matter original value be set by 0.25, 0.5 or any else" + assert depth_multiplier == 1, "TF frontend transforms it to be 1 regardless of what " \ + "original value is set to 0.25, 0.5 or anything else" else: raise tvm.error.OpNotImplemented( 'Operator {} is not supported for frontend TFLite.'.format(conv_type)) @@ -562,13 +646,16 @@ def convert_conv(self, op, conv_type): params = {'kernel_size': [kernel_h, kernel_w], 'strides': [stride_h, stride_w], 'dilation': [dilation_h, dilation_w], - 'padding': [0, 0]} + 'padding': [0, 0], + 'data_layout': 'NHWC'} if is_depthwise_conv: params['channels'] = int(in_channels * multiplier) params['groups'] = int(in_channels) + params['kernel_layout'] = 'HWOI' else: params['channels'] = int(output_channels) + params['kernel_layout'] = 'HWIO' # weight tensor type should be UINT8 (quantization) or FLOAT32 weight_tensor_type = weight_tensor.tensor.Type() @@ -578,12 +665,9 @@ def convert_conv(self, op, conv_type): in_expr = self.get_expr(input_tensor_idx) weight_value = self.get_tensor_value(weight_tensor) - if is_depthwise_conv: - # TFLite is M KH KW IC, we require IC M KH KW - weight_value = weight_value.transpose((3, 0, 1, 2)) - else: - # TFLite is OC KH KW IC, we require OC IC KH kW - weight_value = weight_value.transpose((0, 3, 1, 2)) + # TFLite is OC/M KH KW IC, we require KH KW IC OC/M + # M means multiplier in depthwise convolution + weight_value = weight_value.transpose((1, 2, 3, 0)) weight_expr = self.exp_tab.new_const(weight_value, dtype=weight_tensor_type_str) @@ -592,9 +676,10 @@ def 
convert_conv(self, op, conv_type): elif padding == Padding.SAME: pad_top, pad_bottom = get_pad_value(input_h, dilated_kernel_h, stride_h) pad_left, pad_right = get_pad_value(input_w, dilated_kernel_w, stride_w) - in_expr = _op.nn.pad(data=in_expr, pad_width=((0, 0), (0, 0), + in_expr = _op.nn.pad(data=in_expr, pad_width=((0, 0), (pad_top, pad_bottom), - (pad_left, pad_right))) + (pad_left, pad_right), + (0, 0))) else: raise tvm.error.OpAttributeUnimplemented( 'Padding format {} is not supported for operator Conv.'.format(padding)) @@ -610,7 +695,8 @@ def convert_conv(self, op, conv_type): bias_tensor_type_str = self.get_tensor_type_str(bias_tensor_type) bias_expr = self.exp_tab.new_const(self.get_tensor_value(bias_tensor), dtype=bias_tensor_type_str) - out = _op.nn.bias_add(out, bias_expr) + channel_axis = 3 + out = _op.nn.bias_add(out, bias_expr, axis=channel_axis) # If we have fused activations if fused_activation_fn != ActivationFunctionType.NONE: @@ -648,7 +734,8 @@ def convert_pool2d(self, op, pool_type): params = {'pool_size': (filter_h, filter_w), 'strides': (stride_h, stride_w), - 'padding': [0, 0]} + 'padding': [0, 0], + 'layout': 'NHWC'} in_expr = self.get_expr(input_tensor_idx) @@ -677,6 +764,31 @@ def convert_pool2d(self, op, pool_type): return out + def convert_pad(self, op): + """Convert TFLite PAD""" + try: + from tflite.Operator import Operator + except ImportError: + raise ImportError("The tflite package must be installed") + + assert isinstance(op, Operator) + input_tensors = self.get_input_tensors(op) + assert len(input_tensors) == 2, "input tensors length should be 2" + + # TFLite only support CONSTANT mode and does not support constant_values parameter. 
+ # tensor + input_tensor = input_tensors[0] + in_expr = self.get_expr(input_tensor.tensor_idx) + + # paddings + pad_list = self.get_tensor_value(input_tensors[1]) + # convert list of lists to tuple of tuples + paddings = tuple(tuple(l) for l in pad_list) + + # Use default pad_value 0 because TFLite does not support constant_values parameter + out = _op.nn.pad(in_expr, paddings) + return out + def get_expr(self, input_tensor_idx): return self.exp_tab.get_expr(get_tensor_name(self.subgraph, input_tensor_idx)) @@ -764,8 +876,8 @@ def from_tflite(model, shape_dict, dtype_dict): Returns ------- - func : tvm.relay.Function - Compatible relay Function + mod : tvm.relay.Module + The relay module for compilation. params : dict of str to tvm.NDArray The parameter dict to be used by relay @@ -803,4 +915,4 @@ def from_tflite(model, shape_dict, dtype_dict): outputs = [exp_tab.get_expr(get_tensor_name(subgraph, i)) for i in model_outputs] outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs) func = _expr.Function(ir_pass.free_vars(outputs), outputs) - return func, params + return _module.Module.from_expr(func), params diff --git a/python/tvm/relay/grammar/Relay.g4 b/python/tvm/relay/grammar/Relay.g4 index 58546439e1ce..97b4ea24a8b2 100644 --- a/python/tvm/relay/grammar/Relay.g4 +++ b/python/tvm/relay/grammar/Relay.g4 @@ -19,7 +19,7 @@ grammar Relay; -SEMVER: 'v0.0.1' ; +SEMVER: 'v0.0.2' ; // Lexing // comments @@ -111,8 +111,8 @@ expr // | 'debug' # debug ; -func: 'fn' '(' argList ')' ('->' type_)? body ; -defn: 'def' ident '(' argList ')' ('->' type_)? body ; +func: 'fn' typeParamSeq? '(' argList ')' ('->' type_)? body ; +defn: 'def' ident typeParamSeq? '(' argList ')' ('->' type_)? body ; argList : varList @@ -132,15 +132,20 @@ attr: CNAME '=' expr ; // relations: 'where' relation (',' relation)* ; // relation: ident '(' (type_ (',' type_)*)? 
')' ; +typeParamSeq + : '[' ']' + | '[' ident (',' ident)* ']' + ; + type_ : '(' ')' # tupleType | '(' type_ ',' ')' # tupleType | '(' type_ (',' type_)+ ')' # tupleType - | identType # identTypeType + | typeIdent # typeIdentType | 'Tensor' '[' shapeSeq ',' type_ ']' # tensorType // currently unused - // | identType '[' (type_ (',' type_)*)? ']' # callType - | 'fn' '(' (type_ (',' type_)*)? ')' '->' type_ # funcType + // | typeIdent '[' (type_ (',' type_)*)? ']' # callType + | 'fn' typeParamSeq? '(' (type_ (',' type_)*)? ')' '->' type_ # funcType | '_' # incompleteType | NAT # intType ; @@ -158,7 +163,7 @@ shape | NAT # intShape ; -identType: CNAME ; +typeIdent : CNAME ; // int8, int16, int32, int64 // uint8, uint16, uint32, uint64 // float16, float32, float64 diff --git a/python/tvm/relay/grammar/py2/.gitattributes b/python/tvm/relay/grammar/py2/.gitattributes new file mode 100644 index 000000000000..4adf65fa2f3c --- /dev/null +++ b/python/tvm/relay/grammar/py2/.gitattributes @@ -0,0 +1,3 @@ +Relay* binary +Relay* linguist-generated=true +Relay* linguist-detectable=false \ No newline at end of file diff --git a/python/tvm/relay/grammar/py2/.gitignore b/python/tvm/relay/grammar/py2/.gitignore deleted file mode 100644 index d677ff551940..000000000000 --- a/python/tvm/relay/grammar/py2/.gitignore +++ /dev/null @@ -1 +0,0 @@ -Relay* diff --git a/python/tvm/relay/grammar/py2/Relay.interp b/python/tvm/relay/grammar/py2/Relay.interp new file mode 100644 index 000000000000..c6893d096168 --- /dev/null +++ b/python/tvm/relay/grammar/py2/Relay.interp @@ -0,0 +1,109 @@ +token literal names: +null +'(' +')' +',' +'[' +']' +'if' +'else' +'let' +'=' +';' +'{' +'}' +'fn' +'->' +'def' +':' +'Tensor' +'_' +'v0.0.2' +null +null +null +'*' +'/' +'+' +'-' +'<' +'>' +'<=' +'>=' +'==' +'!=' +null +null +null +'mut' +null +null +null +null + +token symbolic names: +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +SEMVER +WS 
+LINE_COMMENT +COMMENT +MUL +DIV +ADD +SUB +LT +GT +LE +GE +EQ +NE +GLOBAL_VAR +LOCAL_VAR +GRAPH_VAR +MUT +BOOL_LIT +FLOAT +NAT +CNAME + +rule names: +opIdent +prog +expr +func +defn +argList +varList +var +attrList +attr +typeParamSeq +type_ +shapeSeq +shape +typeIdent +body +scalar +ident + + +atn: +[3, 24715, 42794, 33075, 47597, 16764, 15335, 30598, 22884, 3, 42, 332, 4, 2, 9, 2, 4, 3, 9, 3, 4, 4, 9, 4, 4, 5, 9, 5, 4, 6, 9, 6, 4, 7, 9, 7, 4, 8, 9, 8, 4, 9, 9, 9, 4, 10, 9, 10, 4, 11, 9, 11, 4, 12, 9, 12, 4, 13, 9, 13, 4, 14, 9, 14, 4, 15, 9, 15, 4, 16, 9, 16, 4, 17, 9, 17, 4, 18, 9, 18, 4, 19, 9, 19, 3, 2, 3, 2, 3, 3, 3, 3, 7, 3, 43, 10, 3, 12, 3, 14, 3, 46, 11, 3, 3, 3, 5, 3, 49, 10, 3, 3, 3, 3, 3, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 6, 4, 72, 10, 4, 13, 4, 14, 4, 73, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 7, 4, 82, 10, 4, 12, 4, 14, 4, 85, 11, 4, 5, 4, 87, 10, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 5, 4, 100, 10, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 5, 4, 110, 10, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 5, 4, 128, 10, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 7, 4, 150, 10, 4, 12, 4, 14, 4, 153, 11, 4, 5, 4, 155, 10, 4, 3, 4, 7, 4, 158, 10, 4, 12, 4, 14, 4, 161, 11, 4, 3, 5, 3, 5, 5, 5, 165, 10, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 5, 5, 172, 10, 5, 3, 5, 3, 5, 3, 6, 3, 6, 3, 6, 5, 6, 179, 10, 6, 3, 6, 3, 6, 3, 6, 3, 6, 3, 6, 5, 6, 186, 10, 6, 3, 6, 3, 6, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 5, 7, 196, 10, 7, 3, 8, 3, 8, 3, 8, 7, 8, 201, 10, 8, 12, 8, 14, 8, 204, 11, 8, 5, 8, 206, 10, 8, 3, 9, 3, 9, 3, 9, 5, 9, 211, 10, 9, 3, 10, 3, 10, 3, 10, 7, 10, 216, 10, 10, 12, 10, 14, 10, 219, 11, 10, 5, 10, 221, 10, 10, 3, 11, 3, 11, 3, 11, 3, 11, 3, 12, 3, 12, 3, 12, 3, 12, 3, 12, 3, 12, 7, 12, 233, 10, 12, 12, 12, 14, 
12, 236, 11, 12, 3, 12, 3, 12, 5, 12, 240, 10, 12, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 6, 13, 253, 10, 13, 13, 13, 14, 13, 254, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 5, 13, 269, 10, 13, 3, 13, 3, 13, 3, 13, 3, 13, 7, 13, 275, 10, 13, 12, 13, 14, 13, 278, 11, 13, 5, 13, 280, 10, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 5, 13, 287, 10, 13, 3, 14, 3, 14, 3, 14, 3, 14, 3, 14, 3, 14, 3, 14, 3, 14, 3, 14, 3, 14, 3, 14, 6, 14, 300, 10, 14, 13, 14, 14, 14, 301, 3, 14, 3, 14, 5, 14, 306, 10, 14, 3, 15, 3, 15, 3, 15, 3, 15, 3, 15, 5, 15, 313, 10, 15, 3, 16, 3, 16, 3, 17, 3, 17, 3, 17, 3, 17, 3, 18, 3, 18, 3, 18, 5, 18, 324, 10, 18, 3, 19, 3, 19, 3, 19, 3, 19, 5, 19, 330, 10, 19, 3, 19, 2, 3, 6, 20, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 2, 6, 3, 2, 25, 26, 3, 2, 27, 28, 3, 2, 29, 32, 3, 2, 33, 34, 2, 373, 2, 38, 3, 2, 2, 2, 4, 40, 3, 2, 2, 2, 6, 127, 3, 2, 2, 2, 8, 162, 3, 2, 2, 2, 10, 175, 3, 2, 2, 2, 12, 195, 3, 2, 2, 2, 14, 205, 3, 2, 2, 2, 16, 207, 3, 2, 2, 2, 18, 220, 3, 2, 2, 2, 20, 222, 3, 2, 2, 2, 22, 239, 3, 2, 2, 2, 24, 286, 3, 2, 2, 2, 26, 305, 3, 2, 2, 2, 28, 312, 3, 2, 2, 2, 30, 314, 3, 2, 2, 2, 32, 316, 3, 2, 2, 2, 34, 323, 3, 2, 2, 2, 36, 329, 3, 2, 2, 2, 38, 39, 7, 42, 2, 2, 39, 3, 3, 2, 2, 2, 40, 48, 7, 21, 2, 2, 41, 43, 5, 10, 6, 2, 42, 41, 3, 2, 2, 2, 43, 46, 3, 2, 2, 2, 44, 42, 3, 2, 2, 2, 44, 45, 3, 2, 2, 2, 45, 49, 3, 2, 2, 2, 46, 44, 3, 2, 2, 2, 47, 49, 5, 6, 4, 2, 48, 44, 3, 2, 2, 2, 48, 47, 3, 2, 2, 2, 49, 50, 3, 2, 2, 2, 50, 51, 7, 2, 2, 3, 51, 5, 3, 2, 2, 2, 52, 53, 8, 4, 1, 2, 53, 54, 7, 3, 2, 2, 54, 55, 5, 6, 4, 2, 55, 56, 7, 4, 2, 2, 56, 128, 3, 2, 2, 2, 57, 58, 7, 28, 2, 2, 58, 128, 5, 6, 4, 19, 59, 128, 5, 8, 5, 2, 60, 61, 7, 3, 2, 2, 61, 128, 7, 4, 2, 2, 62, 63, 7, 3, 2, 2, 63, 64, 5, 6, 4, 2, 64, 65, 7, 5, 2, 2, 65, 66, 7, 4, 2, 2, 66, 128, 3, 2, 2, 2, 67, 68, 7, 3, 2, 2, 68, 71, 5, 6, 4, 2, 69, 70, 7, 5, 2, 2, 70, 72, 5, 6, 4, 
2, 71, 69, 3, 2, 2, 2, 72, 73, 3, 2, 2, 2, 73, 71, 3, 2, 2, 2, 73, 74, 3, 2, 2, 2, 74, 75, 3, 2, 2, 2, 75, 76, 7, 4, 2, 2, 76, 128, 3, 2, 2, 2, 77, 86, 7, 6, 2, 2, 78, 83, 5, 6, 4, 2, 79, 80, 7, 5, 2, 2, 80, 82, 5, 6, 4, 2, 81, 79, 3, 2, 2, 2, 82, 85, 3, 2, 2, 2, 83, 81, 3, 2, 2, 2, 83, 84, 3, 2, 2, 2, 84, 87, 3, 2, 2, 2, 85, 83, 3, 2, 2, 2, 86, 78, 3, 2, 2, 2, 86, 87, 3, 2, 2, 2, 87, 88, 3, 2, 2, 2, 88, 128, 7, 7, 2, 2, 89, 90, 7, 8, 2, 2, 90, 91, 7, 3, 2, 2, 91, 92, 5, 6, 4, 2, 92, 93, 7, 4, 2, 2, 93, 94, 5, 32, 17, 2, 94, 95, 7, 9, 2, 2, 95, 96, 5, 32, 17, 2, 96, 128, 3, 2, 2, 2, 97, 99, 7, 10, 2, 2, 98, 100, 7, 38, 2, 2, 99, 98, 3, 2, 2, 2, 99, 100, 3, 2, 2, 2, 100, 101, 3, 2, 2, 2, 101, 102, 5, 16, 9, 2, 102, 103, 7, 11, 2, 2, 103, 104, 5, 6, 4, 2, 104, 105, 7, 12, 2, 2, 105, 106, 5, 6, 4, 8, 106, 128, 3, 2, 2, 2, 107, 109, 7, 10, 2, 2, 108, 110, 7, 38, 2, 2, 109, 108, 3, 2, 2, 2, 109, 110, 3, 2, 2, 2, 110, 111, 3, 2, 2, 2, 111, 112, 5, 16, 9, 2, 112, 113, 7, 11, 2, 2, 113, 114, 7, 13, 2, 2, 114, 115, 5, 6, 4, 2, 115, 116, 7, 14, 2, 2, 116, 117, 7, 12, 2, 2, 117, 118, 5, 6, 4, 7, 118, 128, 3, 2, 2, 2, 119, 120, 5, 36, 19, 2, 120, 121, 7, 11, 2, 2, 121, 122, 5, 6, 4, 2, 122, 123, 7, 12, 2, 2, 123, 124, 5, 6, 4, 5, 124, 128, 3, 2, 2, 2, 125, 128, 5, 36, 19, 2, 126, 128, 5, 34, 18, 2, 127, 52, 3, 2, 2, 2, 127, 57, 3, 2, 2, 2, 127, 59, 3, 2, 2, 2, 127, 60, 3, 2, 2, 2, 127, 62, 3, 2, 2, 2, 127, 67, 3, 2, 2, 2, 127, 77, 3, 2, 2, 2, 127, 89, 3, 2, 2, 2, 127, 97, 3, 2, 2, 2, 127, 107, 3, 2, 2, 2, 127, 119, 3, 2, 2, 2, 127, 125, 3, 2, 2, 2, 127, 126, 3, 2, 2, 2, 128, 159, 3, 2, 2, 2, 129, 130, 12, 18, 2, 2, 130, 131, 9, 2, 2, 2, 131, 158, 5, 6, 4, 19, 132, 133, 12, 17, 2, 2, 133, 134, 9, 3, 2, 2, 134, 158, 5, 6, 4, 18, 135, 136, 12, 16, 2, 2, 136, 137, 9, 4, 2, 2, 137, 158, 5, 6, 4, 17, 138, 139, 12, 15, 2, 2, 139, 140, 9, 5, 2, 2, 140, 158, 5, 6, 4, 16, 141, 142, 12, 6, 2, 2, 142, 143, 7, 12, 2, 2, 143, 158, 5, 6, 4, 7, 144, 145, 12, 20, 2, 2, 145, 154, 7, 3, 2, 2, 
146, 151, 5, 6, 4, 2, 147, 148, 7, 5, 2, 2, 148, 150, 5, 6, 4, 2, 149, 147, 3, 2, 2, 2, 150, 153, 3, 2, 2, 2, 151, 149, 3, 2, 2, 2, 151, 152, 3, 2, 2, 2, 152, 155, 3, 2, 2, 2, 153, 151, 3, 2, 2, 2, 154, 146, 3, 2, 2, 2, 154, 155, 3, 2, 2, 2, 155, 156, 3, 2, 2, 2, 156, 158, 7, 4, 2, 2, 157, 129, 3, 2, 2, 2, 157, 132, 3, 2, 2, 2, 157, 135, 3, 2, 2, 2, 157, 138, 3, 2, 2, 2, 157, 141, 3, 2, 2, 2, 157, 144, 3, 2, 2, 2, 158, 161, 3, 2, 2, 2, 159, 157, 3, 2, 2, 2, 159, 160, 3, 2, 2, 2, 160, 7, 3, 2, 2, 2, 161, 159, 3, 2, 2, 2, 162, 164, 7, 15, 2, 2, 163, 165, 5, 22, 12, 2, 164, 163, 3, 2, 2, 2, 164, 165, 3, 2, 2, 2, 165, 166, 3, 2, 2, 2, 166, 167, 7, 3, 2, 2, 167, 168, 5, 12, 7, 2, 168, 171, 7, 4, 2, 2, 169, 170, 7, 16, 2, 2, 170, 172, 5, 24, 13, 2, 171, 169, 3, 2, 2, 2, 171, 172, 3, 2, 2, 2, 172, 173, 3, 2, 2, 2, 173, 174, 5, 32, 17, 2, 174, 9, 3, 2, 2, 2, 175, 176, 7, 17, 2, 2, 176, 178, 5, 36, 19, 2, 177, 179, 5, 22, 12, 2, 178, 177, 3, 2, 2, 2, 178, 179, 3, 2, 2, 2, 179, 180, 3, 2, 2, 2, 180, 181, 7, 3, 2, 2, 181, 182, 5, 12, 7, 2, 182, 185, 7, 4, 2, 2, 183, 184, 7, 16, 2, 2, 184, 186, 5, 24, 13, 2, 185, 183, 3, 2, 2, 2, 185, 186, 3, 2, 2, 2, 186, 187, 3, 2, 2, 2, 187, 188, 5, 32, 17, 2, 188, 11, 3, 2, 2, 2, 189, 196, 5, 14, 8, 2, 190, 196, 5, 18, 10, 2, 191, 192, 5, 14, 8, 2, 192, 193, 7, 5, 2, 2, 193, 194, 5, 18, 10, 2, 194, 196, 3, 2, 2, 2, 195, 189, 3, 2, 2, 2, 195, 190, 3, 2, 2, 2, 195, 191, 3, 2, 2, 2, 196, 13, 3, 2, 2, 2, 197, 202, 5, 16, 9, 2, 198, 199, 7, 5, 2, 2, 199, 201, 5, 16, 9, 2, 200, 198, 3, 2, 2, 2, 201, 204, 3, 2, 2, 2, 202, 200, 3, 2, 2, 2, 202, 203, 3, 2, 2, 2, 203, 206, 3, 2, 2, 2, 204, 202, 3, 2, 2, 2, 205, 197, 3, 2, 2, 2, 205, 206, 3, 2, 2, 2, 206, 15, 3, 2, 2, 2, 207, 210, 5, 36, 19, 2, 208, 209, 7, 18, 2, 2, 209, 211, 5, 24, 13, 2, 210, 208, 3, 2, 2, 2, 210, 211, 3, 2, 2, 2, 211, 17, 3, 2, 2, 2, 212, 217, 5, 20, 11, 2, 213, 214, 7, 5, 2, 2, 214, 216, 5, 20, 11, 2, 215, 213, 3, 2, 2, 2, 216, 219, 3, 2, 2, 2, 217, 215, 3, 2, 2, 2, 217, 218, 3, 
2, 2, 2, 218, 221, 3, 2, 2, 2, 219, 217, 3, 2, 2, 2, 220, 212, 3, 2, 2, 2, 220, 221, 3, 2, 2, 2, 221, 19, 3, 2, 2, 2, 222, 223, 7, 42, 2, 2, 223, 224, 7, 11, 2, 2, 224, 225, 5, 6, 4, 2, 225, 21, 3, 2, 2, 2, 226, 227, 7, 6, 2, 2, 227, 240, 7, 7, 2, 2, 228, 229, 7, 6, 2, 2, 229, 234, 5, 36, 19, 2, 230, 231, 7, 5, 2, 2, 231, 233, 5, 36, 19, 2, 232, 230, 3, 2, 2, 2, 233, 236, 3, 2, 2, 2, 234, 232, 3, 2, 2, 2, 234, 235, 3, 2, 2, 2, 235, 237, 3, 2, 2, 2, 236, 234, 3, 2, 2, 2, 237, 238, 7, 7, 2, 2, 238, 240, 3, 2, 2, 2, 239, 226, 3, 2, 2, 2, 239, 228, 3, 2, 2, 2, 240, 23, 3, 2, 2, 2, 241, 242, 7, 3, 2, 2, 242, 287, 7, 4, 2, 2, 243, 244, 7, 3, 2, 2, 244, 245, 5, 24, 13, 2, 245, 246, 7, 5, 2, 2, 246, 247, 7, 4, 2, 2, 247, 287, 3, 2, 2, 2, 248, 249, 7, 3, 2, 2, 249, 252, 5, 24, 13, 2, 250, 251, 7, 5, 2, 2, 251, 253, 5, 24, 13, 2, 252, 250, 3, 2, 2, 2, 253, 254, 3, 2, 2, 2, 254, 252, 3, 2, 2, 2, 254, 255, 3, 2, 2, 2, 255, 256, 3, 2, 2, 2, 256, 257, 7, 4, 2, 2, 257, 287, 3, 2, 2, 2, 258, 287, 5, 30, 16, 2, 259, 260, 7, 19, 2, 2, 260, 261, 7, 6, 2, 2, 261, 262, 5, 26, 14, 2, 262, 263, 7, 5, 2, 2, 263, 264, 5, 24, 13, 2, 264, 265, 7, 7, 2, 2, 265, 287, 3, 2, 2, 2, 266, 268, 7, 15, 2, 2, 267, 269, 5, 22, 12, 2, 268, 267, 3, 2, 2, 2, 268, 269, 3, 2, 2, 2, 269, 270, 3, 2, 2, 2, 270, 279, 7, 3, 2, 2, 271, 276, 5, 24, 13, 2, 272, 273, 7, 5, 2, 2, 273, 275, 5, 24, 13, 2, 274, 272, 3, 2, 2, 2, 275, 278, 3, 2, 2, 2, 276, 274, 3, 2, 2, 2, 276, 277, 3, 2, 2, 2, 277, 280, 3, 2, 2, 2, 278, 276, 3, 2, 2, 2, 279, 271, 3, 2, 2, 2, 279, 280, 3, 2, 2, 2, 280, 281, 3, 2, 2, 2, 281, 282, 7, 4, 2, 2, 282, 283, 7, 16, 2, 2, 283, 287, 5, 24, 13, 2, 284, 287, 7, 20, 2, 2, 285, 287, 7, 41, 2, 2, 286, 241, 3, 2, 2, 2, 286, 243, 3, 2, 2, 2, 286, 248, 3, 2, 2, 2, 286, 258, 3, 2, 2, 2, 286, 259, 3, 2, 2, 2, 286, 266, 3, 2, 2, 2, 286, 284, 3, 2, 2, 2, 286, 285, 3, 2, 2, 2, 287, 25, 3, 2, 2, 2, 288, 289, 7, 3, 2, 2, 289, 306, 7, 4, 2, 2, 290, 291, 7, 3, 2, 2, 291, 292, 5, 28, 15, 2, 292, 293, 7, 5, 2, 2, 
293, 294, 7, 4, 2, 2, 294, 306, 3, 2, 2, 2, 295, 296, 7, 3, 2, 2, 296, 299, 5, 28, 15, 2, 297, 298, 7, 5, 2, 2, 298, 300, 5, 28, 15, 2, 299, 297, 3, 2, 2, 2, 300, 301, 3, 2, 2, 2, 301, 299, 3, 2, 2, 2, 301, 302, 3, 2, 2, 2, 302, 303, 3, 2, 2, 2, 303, 304, 7, 4, 2, 2, 304, 306, 3, 2, 2, 2, 305, 288, 3, 2, 2, 2, 305, 290, 3, 2, 2, 2, 305, 295, 3, 2, 2, 2, 306, 27, 3, 2, 2, 2, 307, 308, 7, 3, 2, 2, 308, 309, 5, 28, 15, 2, 309, 310, 7, 4, 2, 2, 310, 313, 3, 2, 2, 2, 311, 313, 7, 41, 2, 2, 312, 307, 3, 2, 2, 2, 312, 311, 3, 2, 2, 2, 313, 29, 3, 2, 2, 2, 314, 315, 7, 42, 2, 2, 315, 31, 3, 2, 2, 2, 316, 317, 7, 13, 2, 2, 317, 318, 5, 6, 4, 2, 318, 319, 7, 14, 2, 2, 319, 33, 3, 2, 2, 2, 320, 324, 7, 40, 2, 2, 321, 324, 7, 41, 2, 2, 322, 324, 7, 39, 2, 2, 323, 320, 3, 2, 2, 2, 323, 321, 3, 2, 2, 2, 323, 322, 3, 2, 2, 2, 324, 35, 3, 2, 2, 2, 325, 330, 5, 2, 2, 2, 326, 330, 7, 35, 2, 2, 327, 330, 7, 36, 2, 2, 328, 330, 7, 37, 2, 2, 329, 325, 3, 2, 2, 2, 329, 326, 3, 2, 2, 2, 329, 327, 3, 2, 2, 2, 329, 328, 3, 2, 2, 2, 330, 37, 3, 2, 2, 2, 36, 44, 48, 73, 83, 86, 99, 109, 127, 151, 154, 157, 159, 164, 171, 178, 185, 195, 202, 205, 210, 217, 220, 234, 239, 254, 268, 276, 279, 286, 301, 305, 312, 323, 329] \ No newline at end of file diff --git a/python/tvm/relay/grammar/py2/Relay.tokens b/python/tvm/relay/grammar/py2/Relay.tokens new file mode 100644 index 000000000000..41f3ee62a86c --- /dev/null +++ b/python/tvm/relay/grammar/py2/Relay.tokens @@ -0,0 +1,70 @@ +T__0=1 +T__1=2 +T__2=3 +T__3=4 +T__4=5 +T__5=6 +T__6=7 +T__7=8 +T__8=9 +T__9=10 +T__10=11 +T__11=12 +T__12=13 +T__13=14 +T__14=15 +T__15=16 +T__16=17 +T__17=18 +SEMVER=19 +WS=20 +LINE_COMMENT=21 +COMMENT=22 +MUL=23 +DIV=24 +ADD=25 +SUB=26 +LT=27 +GT=28 +LE=29 +GE=30 +EQ=31 +NE=32 +GLOBAL_VAR=33 +LOCAL_VAR=34 +GRAPH_VAR=35 +MUT=36 +BOOL_LIT=37 +FLOAT=38 +NAT=39 +CNAME=40 +'('=1 +')'=2 +','=3 +'['=4 +']'=5 +'if'=6 +'else'=7 +'let'=8 +'='=9 +';'=10 +'{'=11 +'}'=12 +'fn'=13 +'->'=14 +'def'=15 +':'=16 +'Tensor'=17 +'_'=18 
+'v0.0.2'=19 +'*'=23 +'/'=24 +'+'=25 +'-'=26 +'<'=27 +'>'=28 +'<='=29 +'>='=30 +'=='=31 +'!='=32 +'mut'=36 diff --git a/python/tvm/relay/grammar/py2/RelayLexer.interp b/python/tvm/relay/grammar/py2/RelayLexer.interp new file mode 100644 index 000000000000..092b3589ab70 --- /dev/null +++ b/python/tvm/relay/grammar/py2/RelayLexer.interp @@ -0,0 +1,140 @@ +token literal names: +null +'(' +')' +',' +'[' +']' +'if' +'else' +'let' +'=' +';' +'{' +'}' +'fn' +'->' +'def' +':' +'Tensor' +'_' +'v0.0.2' +null +null +null +'*' +'/' +'+' +'-' +'<' +'>' +'<=' +'>=' +'==' +'!=' +null +null +null +'mut' +null +null +null +null + +token symbolic names: +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +SEMVER +WS +LINE_COMMENT +COMMENT +MUL +DIV +ADD +SUB +LT +GT +LE +GE +EQ +NE +GLOBAL_VAR +LOCAL_VAR +GRAPH_VAR +MUT +BOOL_LIT +FLOAT +NAT +CNAME + +rule names: +T__0 +T__1 +T__2 +T__3 +T__4 +T__5 +T__6 +T__7 +T__8 +T__9 +T__10 +T__11 +T__12 +T__13 +T__14 +T__15 +T__16 +T__17 +SEMVER +WS +LINE_COMMENT +COMMENT +MUL +DIV +ADD +SUB +LT +GT +LE +GE +EQ +NE +GLOBAL_VAR +LOCAL_VAR +GRAPH_VAR +MUT +BOOL_LIT +FLOAT +NAT +EXP +CNAME +LETTER +DIGIT + +channel names: +DEFAULT_TOKEN_CHANNEL +HIDDEN + +mode names: +DEFAULT_MODE + +atn: +[3, 24715, 42794, 33075, 47597, 16764, 15335, 30598, 22884, 2, 42, 267, 8, 1, 4, 2, 9, 2, 4, 3, 9, 3, 4, 4, 9, 4, 4, 5, 9, 5, 4, 6, 9, 6, 4, 7, 9, 7, 4, 8, 9, 8, 4, 9, 9, 9, 4, 10, 9, 10, 4, 11, 9, 11, 4, 12, 9, 12, 4, 13, 9, 13, 4, 14, 9, 14, 4, 15, 9, 15, 4, 16, 9, 16, 4, 17, 9, 17, 4, 18, 9, 18, 4, 19, 9, 19, 4, 20, 9, 20, 4, 21, 9, 21, 4, 22, 9, 22, 4, 23, 9, 23, 4, 24, 9, 24, 4, 25, 9, 25, 4, 26, 9, 26, 4, 27, 9, 27, 4, 28, 9, 28, 4, 29, 9, 29, 4, 30, 9, 30, 4, 31, 9, 31, 4, 32, 9, 32, 4, 33, 9, 33, 4, 34, 9, 34, 4, 35, 9, 35, 4, 36, 9, 36, 4, 37, 9, 37, 4, 38, 9, 38, 4, 39, 9, 39, 4, 40, 9, 40, 4, 41, 9, 41, 4, 42, 9, 42, 4, 43, 9, 43, 4, 44, 9, 44, 3, 2, 3, 2, 3, 3, 3, 3, 3, 4, 3, 4, 3, 5, 3, 
5, 3, 6, 3, 6, 3, 7, 3, 7, 3, 7, 3, 8, 3, 8, 3, 8, 3, 8, 3, 8, 3, 9, 3, 9, 3, 9, 3, 9, 3, 10, 3, 10, 3, 11, 3, 11, 3, 12, 3, 12, 3, 13, 3, 13, 3, 14, 3, 14, 3, 14, 3, 15, 3, 15, 3, 15, 3, 16, 3, 16, 3, 16, 3, 16, 3, 17, 3, 17, 3, 18, 3, 18, 3, 18, 3, 18, 3, 18, 3, 18, 3, 18, 3, 19, 3, 19, 3, 20, 3, 20, 3, 20, 3, 20, 3, 20, 3, 20, 3, 20, 3, 21, 6, 21, 149, 10, 21, 13, 21, 14, 21, 150, 3, 21, 3, 21, 3, 22, 3, 22, 3, 22, 3, 22, 7, 22, 159, 10, 22, 12, 22, 14, 22, 162, 11, 22, 3, 22, 3, 22, 3, 22, 3, 22, 3, 23, 3, 23, 3, 23, 3, 23, 7, 23, 172, 10, 23, 12, 23, 14, 23, 175, 11, 23, 3, 23, 3, 23, 3, 23, 3, 23, 3, 23, 3, 24, 3, 24, 3, 25, 3, 25, 3, 26, 3, 26, 3, 27, 3, 27, 3, 28, 3, 28, 3, 29, 3, 29, 3, 30, 3, 30, 3, 30, 3, 31, 3, 31, 3, 31, 3, 32, 3, 32, 3, 32, 3, 33, 3, 33, 3, 33, 3, 34, 3, 34, 3, 34, 3, 35, 3, 35, 3, 35, 3, 36, 3, 36, 3, 36, 3, 37, 3, 37, 3, 37, 3, 37, 3, 38, 3, 38, 3, 38, 3, 38, 3, 38, 3, 38, 3, 38, 3, 38, 3, 38, 5, 38, 228, 10, 38, 3, 39, 3, 39, 3, 39, 3, 39, 5, 39, 234, 10, 39, 3, 39, 3, 39, 3, 39, 5, 39, 239, 10, 39, 3, 40, 6, 40, 242, 10, 40, 13, 40, 14, 40, 243, 3, 41, 3, 41, 5, 41, 248, 10, 41, 3, 41, 3, 41, 3, 42, 3, 42, 5, 42, 254, 10, 42, 3, 42, 3, 42, 3, 42, 7, 42, 259, 10, 42, 12, 42, 14, 42, 262, 11, 42, 3, 43, 3, 43, 3, 44, 3, 44, 4, 160, 173, 2, 45, 3, 3, 5, 4, 7, 5, 9, 6, 11, 7, 13, 8, 15, 9, 17, 10, 19, 11, 21, 12, 23, 13, 25, 14, 27, 15, 29, 16, 31, 17, 33, 18, 35, 19, 37, 20, 39, 21, 41, 22, 43, 23, 45, 24, 47, 25, 49, 26, 51, 27, 53, 28, 55, 29, 57, 30, 59, 31, 61, 32, 63, 33, 65, 34, 67, 35, 69, 36, 71, 37, 73, 38, 75, 39, 77, 40, 79, 41, 81, 2, 83, 42, 85, 2, 87, 2, 3, 2, 7, 5, 2, 11, 12, 15, 15, 34, 34, 4, 2, 71, 71, 103, 103, 4, 2, 45, 45, 47, 47, 4, 2, 67, 92, 99, 124, 3, 2, 50, 59, 2, 275, 2, 3, 3, 2, 2, 2, 2, 5, 3, 2, 2, 2, 2, 7, 3, 2, 2, 2, 2, 9, 3, 2, 2, 2, 2, 11, 3, 2, 2, 2, 2, 13, 3, 2, 2, 2, 2, 15, 3, 2, 2, 2, 2, 17, 3, 2, 2, 2, 2, 19, 3, 2, 2, 2, 2, 21, 3, 2, 2, 2, 2, 23, 3, 2, 2, 2, 2, 25, 3, 2, 2, 2, 2, 27, 3, 2, 2, 2, 
2, 29, 3, 2, 2, 2, 2, 31, 3, 2, 2, 2, 2, 33, 3, 2, 2, 2, 2, 35, 3, 2, 2, 2, 2, 37, 3, 2, 2, 2, 2, 39, 3, 2, 2, 2, 2, 41, 3, 2, 2, 2, 2, 43, 3, 2, 2, 2, 2, 45, 3, 2, 2, 2, 2, 47, 3, 2, 2, 2, 2, 49, 3, 2, 2, 2, 2, 51, 3, 2, 2, 2, 2, 53, 3, 2, 2, 2, 2, 55, 3, 2, 2, 2, 2, 57, 3, 2, 2, 2, 2, 59, 3, 2, 2, 2, 2, 61, 3, 2, 2, 2, 2, 63, 3, 2, 2, 2, 2, 65, 3, 2, 2, 2, 2, 67, 3, 2, 2, 2, 2, 69, 3, 2, 2, 2, 2, 71, 3, 2, 2, 2, 2, 73, 3, 2, 2, 2, 2, 75, 3, 2, 2, 2, 2, 77, 3, 2, 2, 2, 2, 79, 3, 2, 2, 2, 2, 83, 3, 2, 2, 2, 3, 89, 3, 2, 2, 2, 5, 91, 3, 2, 2, 2, 7, 93, 3, 2, 2, 2, 9, 95, 3, 2, 2, 2, 11, 97, 3, 2, 2, 2, 13, 99, 3, 2, 2, 2, 15, 102, 3, 2, 2, 2, 17, 107, 3, 2, 2, 2, 19, 111, 3, 2, 2, 2, 21, 113, 3, 2, 2, 2, 23, 115, 3, 2, 2, 2, 25, 117, 3, 2, 2, 2, 27, 119, 3, 2, 2, 2, 29, 122, 3, 2, 2, 2, 31, 125, 3, 2, 2, 2, 33, 129, 3, 2, 2, 2, 35, 131, 3, 2, 2, 2, 37, 138, 3, 2, 2, 2, 39, 140, 3, 2, 2, 2, 41, 148, 3, 2, 2, 2, 43, 154, 3, 2, 2, 2, 45, 167, 3, 2, 2, 2, 47, 181, 3, 2, 2, 2, 49, 183, 3, 2, 2, 2, 51, 185, 3, 2, 2, 2, 53, 187, 3, 2, 2, 2, 55, 189, 3, 2, 2, 2, 57, 191, 3, 2, 2, 2, 59, 193, 3, 2, 2, 2, 61, 196, 3, 2, 2, 2, 63, 199, 3, 2, 2, 2, 65, 202, 3, 2, 2, 2, 67, 205, 3, 2, 2, 2, 69, 208, 3, 2, 2, 2, 71, 211, 3, 2, 2, 2, 73, 214, 3, 2, 2, 2, 75, 227, 3, 2, 2, 2, 77, 238, 3, 2, 2, 2, 79, 241, 3, 2, 2, 2, 81, 245, 3, 2, 2, 2, 83, 253, 3, 2, 2, 2, 85, 263, 3, 2, 2, 2, 87, 265, 3, 2, 2, 2, 89, 90, 7, 42, 2, 2, 90, 4, 3, 2, 2, 2, 91, 92, 7, 43, 2, 2, 92, 6, 3, 2, 2, 2, 93, 94, 7, 46, 2, 2, 94, 8, 3, 2, 2, 2, 95, 96, 7, 93, 2, 2, 96, 10, 3, 2, 2, 2, 97, 98, 7, 95, 2, 2, 98, 12, 3, 2, 2, 2, 99, 100, 7, 107, 2, 2, 100, 101, 7, 104, 2, 2, 101, 14, 3, 2, 2, 2, 102, 103, 7, 103, 2, 2, 103, 104, 7, 110, 2, 2, 104, 105, 7, 117, 2, 2, 105, 106, 7, 103, 2, 2, 106, 16, 3, 2, 2, 2, 107, 108, 7, 110, 2, 2, 108, 109, 7, 103, 2, 2, 109, 110, 7, 118, 2, 2, 110, 18, 3, 2, 2, 2, 111, 112, 7, 63, 2, 2, 112, 20, 3, 2, 2, 2, 113, 114, 7, 61, 2, 2, 114, 22, 3, 2, 2, 2, 115, 116, 7, 125, 2, 2, 
116, 24, 3, 2, 2, 2, 117, 118, 7, 127, 2, 2, 118, 26, 3, 2, 2, 2, 119, 120, 7, 104, 2, 2, 120, 121, 7, 112, 2, 2, 121, 28, 3, 2, 2, 2, 122, 123, 7, 47, 2, 2, 123, 124, 7, 64, 2, 2, 124, 30, 3, 2, 2, 2, 125, 126, 7, 102, 2, 2, 126, 127, 7, 103, 2, 2, 127, 128, 7, 104, 2, 2, 128, 32, 3, 2, 2, 2, 129, 130, 7, 60, 2, 2, 130, 34, 3, 2, 2, 2, 131, 132, 7, 86, 2, 2, 132, 133, 7, 103, 2, 2, 133, 134, 7, 112, 2, 2, 134, 135, 7, 117, 2, 2, 135, 136, 7, 113, 2, 2, 136, 137, 7, 116, 2, 2, 137, 36, 3, 2, 2, 2, 138, 139, 7, 97, 2, 2, 139, 38, 3, 2, 2, 2, 140, 141, 7, 120, 2, 2, 141, 142, 7, 50, 2, 2, 142, 143, 7, 48, 2, 2, 143, 144, 7, 50, 2, 2, 144, 145, 7, 48, 2, 2, 145, 146, 7, 52, 2, 2, 146, 40, 3, 2, 2, 2, 147, 149, 9, 2, 2, 2, 148, 147, 3, 2, 2, 2, 149, 150, 3, 2, 2, 2, 150, 148, 3, 2, 2, 2, 150, 151, 3, 2, 2, 2, 151, 152, 3, 2, 2, 2, 152, 153, 8, 21, 2, 2, 153, 42, 3, 2, 2, 2, 154, 155, 7, 49, 2, 2, 155, 156, 7, 49, 2, 2, 156, 160, 3, 2, 2, 2, 157, 159, 11, 2, 2, 2, 158, 157, 3, 2, 2, 2, 159, 162, 3, 2, 2, 2, 160, 161, 3, 2, 2, 2, 160, 158, 3, 2, 2, 2, 161, 163, 3, 2, 2, 2, 162, 160, 3, 2, 2, 2, 163, 164, 7, 12, 2, 2, 164, 165, 3, 2, 2, 2, 165, 166, 8, 22, 2, 2, 166, 44, 3, 2, 2, 2, 167, 168, 7, 49, 2, 2, 168, 169, 7, 44, 2, 2, 169, 173, 3, 2, 2, 2, 170, 172, 11, 2, 2, 2, 171, 170, 3, 2, 2, 2, 172, 175, 3, 2, 2, 2, 173, 174, 3, 2, 2, 2, 173, 171, 3, 2, 2, 2, 174, 176, 3, 2, 2, 2, 175, 173, 3, 2, 2, 2, 176, 177, 7, 44, 2, 2, 177, 178, 7, 49, 2, 2, 178, 179, 3, 2, 2, 2, 179, 180, 8, 23, 2, 2, 180, 46, 3, 2, 2, 2, 181, 182, 7, 44, 2, 2, 182, 48, 3, 2, 2, 2, 183, 184, 7, 49, 2, 2, 184, 50, 3, 2, 2, 2, 185, 186, 7, 45, 2, 2, 186, 52, 3, 2, 2, 2, 187, 188, 7, 47, 2, 2, 188, 54, 3, 2, 2, 2, 189, 190, 7, 62, 2, 2, 190, 56, 3, 2, 2, 2, 191, 192, 7, 64, 2, 2, 192, 58, 3, 2, 2, 2, 193, 194, 7, 62, 2, 2, 194, 195, 7, 63, 2, 2, 195, 60, 3, 2, 2, 2, 196, 197, 7, 64, 2, 2, 197, 198, 7, 63, 2, 2, 198, 62, 3, 2, 2, 2, 199, 200, 7, 63, 2, 2, 200, 201, 7, 63, 2, 2, 201, 64, 3, 2, 2, 2, 202, 
203, 7, 35, 2, 2, 203, 204, 7, 63, 2, 2, 204, 66, 3, 2, 2, 2, 205, 206, 7, 66, 2, 2, 206, 207, 5, 83, 42, 2, 207, 68, 3, 2, 2, 2, 208, 209, 7, 39, 2, 2, 209, 210, 5, 83, 42, 2, 210, 70, 3, 2, 2, 2, 211, 212, 7, 39, 2, 2, 212, 213, 5, 79, 40, 2, 213, 72, 3, 2, 2, 2, 214, 215, 7, 111, 2, 2, 215, 216, 7, 119, 2, 2, 216, 217, 7, 118, 2, 2, 217, 74, 3, 2, 2, 2, 218, 219, 7, 86, 2, 2, 219, 220, 7, 116, 2, 2, 220, 221, 7, 119, 2, 2, 221, 228, 7, 103, 2, 2, 222, 223, 7, 72, 2, 2, 223, 224, 7, 99, 2, 2, 224, 225, 7, 110, 2, 2, 225, 226, 7, 117, 2, 2, 226, 228, 7, 103, 2, 2, 227, 218, 3, 2, 2, 2, 227, 222, 3, 2, 2, 2, 228, 76, 3, 2, 2, 2, 229, 230, 5, 79, 40, 2, 230, 231, 7, 48, 2, 2, 231, 233, 5, 79, 40, 2, 232, 234, 5, 81, 41, 2, 233, 232, 3, 2, 2, 2, 233, 234, 3, 2, 2, 2, 234, 239, 3, 2, 2, 2, 235, 236, 5, 79, 40, 2, 236, 237, 5, 81, 41, 2, 237, 239, 3, 2, 2, 2, 238, 229, 3, 2, 2, 2, 238, 235, 3, 2, 2, 2, 239, 78, 3, 2, 2, 2, 240, 242, 5, 87, 44, 2, 241, 240, 3, 2, 2, 2, 242, 243, 3, 2, 2, 2, 243, 241, 3, 2, 2, 2, 243, 244, 3, 2, 2, 2, 244, 80, 3, 2, 2, 2, 245, 247, 9, 3, 2, 2, 246, 248, 9, 4, 2, 2, 247, 246, 3, 2, 2, 2, 247, 248, 3, 2, 2, 2, 248, 249, 3, 2, 2, 2, 249, 250, 5, 79, 40, 2, 250, 82, 3, 2, 2, 2, 251, 254, 7, 97, 2, 2, 252, 254, 5, 85, 43, 2, 253, 251, 3, 2, 2, 2, 253, 252, 3, 2, 2, 2, 254, 260, 3, 2, 2, 2, 255, 259, 7, 97, 2, 2, 256, 259, 5, 85, 43, 2, 257, 259, 5, 87, 44, 2, 258, 255, 3, 2, 2, 2, 258, 256, 3, 2, 2, 2, 258, 257, 3, 2, 2, 2, 259, 262, 3, 2, 2, 2, 260, 258, 3, 2, 2, 2, 260, 261, 3, 2, 2, 2, 261, 84, 3, 2, 2, 2, 262, 260, 3, 2, 2, 2, 263, 264, 9, 5, 2, 2, 264, 86, 3, 2, 2, 2, 265, 266, 9, 6, 2, 2, 266, 88, 3, 2, 2, 2, 14, 2, 150, 160, 173, 227, 233, 238, 243, 247, 253, 258, 260, 3, 8, 2, 2] \ No newline at end of file diff --git a/python/tvm/relay/grammar/py2/RelayLexer.py b/python/tvm/relay/grammar/py2/RelayLexer.py new file mode 100644 index 000000000000..be87421c2da6 --- /dev/null +++ b/python/tvm/relay/grammar/py2/RelayLexer.py @@ -0,0 
+1,209 @@ +# Generated from /home/sslyu/tvm/python/tvm/relay/grammar/Relay.g4 by ANTLR 4.7.2 +# encoding: utf-8 +from __future__ import print_function +from antlr4 import * +from io import StringIO +import sys + + + +def serializedATN(): + with StringIO() as buf: + buf.write(u"\3\u608b\ua72a\u8133\ub9ed\u417c\u3be7\u7786\u5964\2") + buf.write(u"*\u010b\b\1\4\2\t\2\4\3\t\3\4\4\t\4\4\5\t\5\4\6\t\6\4") + buf.write(u"\7\t\7\4\b\t\b\4\t\t\t\4\n\t\n\4\13\t\13\4\f\t\f\4\r") + buf.write(u"\t\r\4\16\t\16\4\17\t\17\4\20\t\20\4\21\t\21\4\22\t\22") + buf.write(u"\4\23\t\23\4\24\t\24\4\25\t\25\4\26\t\26\4\27\t\27\4") + buf.write(u"\30\t\30\4\31\t\31\4\32\t\32\4\33\t\33\4\34\t\34\4\35") + buf.write(u"\t\35\4\36\t\36\4\37\t\37\4 \t \4!\t!\4\"\t\"\4#\t#\4") + buf.write(u"$\t$\4%\t%\4&\t&\4\'\t\'\4(\t(\4)\t)\4*\t*\4+\t+\4,\t") + buf.write(u",\3\2\3\2\3\3\3\3\3\4\3\4\3\5\3\5\3\6\3\6\3\7\3\7\3\7") + buf.write(u"\3\b\3\b\3\b\3\b\3\b\3\t\3\t\3\t\3\t\3\n\3\n\3\13\3\13") + buf.write(u"\3\f\3\f\3\r\3\r\3\16\3\16\3\16\3\17\3\17\3\17\3\20\3") + buf.write(u"\20\3\20\3\20\3\21\3\21\3\22\3\22\3\22\3\22\3\22\3\22") + buf.write(u"\3\22\3\23\3\23\3\24\3\24\3\24\3\24\3\24\3\24\3\24\3") + buf.write(u"\25\6\25\u0095\n\25\r\25\16\25\u0096\3\25\3\25\3\26\3") + buf.write(u"\26\3\26\3\26\7\26\u009f\n\26\f\26\16\26\u00a2\13\26") + buf.write(u"\3\26\3\26\3\26\3\26\3\27\3\27\3\27\3\27\7\27\u00ac\n") + buf.write(u"\27\f\27\16\27\u00af\13\27\3\27\3\27\3\27\3\27\3\27\3") + buf.write(u"\30\3\30\3\31\3\31\3\32\3\32\3\33\3\33\3\34\3\34\3\35") + buf.write(u"\3\35\3\36\3\36\3\36\3\37\3\37\3\37\3 \3 \3 \3!\3!\3") + buf.write(u"!\3\"\3\"\3\"\3#\3#\3#\3$\3$\3$\3%\3%\3%\3%\3&\3&\3&") + buf.write(u"\3&\3&\3&\3&\3&\3&\5&\u00e4\n&\3\'\3\'\3\'\3\'\5\'\u00ea") + buf.write(u"\n\'\3\'\3\'\3\'\5\'\u00ef\n\'\3(\6(\u00f2\n(\r(\16(") + buf.write(u"\u00f3\3)\3)\5)\u00f8\n)\3)\3)\3*\3*\5*\u00fe\n*\3*\3") + buf.write(u"*\3*\7*\u0103\n*\f*\16*\u0106\13*\3+\3+\3,\3,\4\u00a0") + 
buf.write(u"\u00ad\2-\3\3\5\4\7\5\t\6\13\7\r\b\17\t\21\n\23\13\25") + buf.write(u"\f\27\r\31\16\33\17\35\20\37\21!\22#\23%\24\'\25)\26") + buf.write(u"+\27-\30/\31\61\32\63\33\65\34\67\359\36;\37= ?!A\"C") + buf.write(u"#E$G%I&K\'M(O)Q\2S*U\2W\2\3\2\7\5\2\13\f\17\17\"\"\4") + buf.write(u"\2GGgg\4\2--//\4\2C\\c|\3\2\62;\2\u0113\2\3\3\2\2\2\2") + buf.write(u"\5\3\2\2\2\2\7\3\2\2\2\2\t\3\2\2\2\2\13\3\2\2\2\2\r\3") + buf.write(u"\2\2\2\2\17\3\2\2\2\2\21\3\2\2\2\2\23\3\2\2\2\2\25\3") + buf.write(u"\2\2\2\2\27\3\2\2\2\2\31\3\2\2\2\2\33\3\2\2\2\2\35\3") + buf.write(u"\2\2\2\2\37\3\2\2\2\2!\3\2\2\2\2#\3\2\2\2\2%\3\2\2\2") + buf.write(u"\2\'\3\2\2\2\2)\3\2\2\2\2+\3\2\2\2\2-\3\2\2\2\2/\3\2") + buf.write(u"\2\2\2\61\3\2\2\2\2\63\3\2\2\2\2\65\3\2\2\2\2\67\3\2") + buf.write(u"\2\2\29\3\2\2\2\2;\3\2\2\2\2=\3\2\2\2\2?\3\2\2\2\2A\3") + buf.write(u"\2\2\2\2C\3\2\2\2\2E\3\2\2\2\2G\3\2\2\2\2I\3\2\2\2\2") + buf.write(u"K\3\2\2\2\2M\3\2\2\2\2O\3\2\2\2\2S\3\2\2\2\3Y\3\2\2\2") + buf.write(u"\5[\3\2\2\2\7]\3\2\2\2\t_\3\2\2\2\13a\3\2\2\2\rc\3\2") + buf.write(u"\2\2\17f\3\2\2\2\21k\3\2\2\2\23o\3\2\2\2\25q\3\2\2\2") + buf.write(u"\27s\3\2\2\2\31u\3\2\2\2\33w\3\2\2\2\35z\3\2\2\2\37}") + buf.write(u"\3\2\2\2!\u0081\3\2\2\2#\u0083\3\2\2\2%\u008a\3\2\2\2") + buf.write(u"\'\u008c\3\2\2\2)\u0094\3\2\2\2+\u009a\3\2\2\2-\u00a7") + buf.write(u"\3\2\2\2/\u00b5\3\2\2\2\61\u00b7\3\2\2\2\63\u00b9\3\2") + buf.write(u"\2\2\65\u00bb\3\2\2\2\67\u00bd\3\2\2\29\u00bf\3\2\2\2") + buf.write(u";\u00c1\3\2\2\2=\u00c4\3\2\2\2?\u00c7\3\2\2\2A\u00ca") + buf.write(u"\3\2\2\2C\u00cd\3\2\2\2E\u00d0\3\2\2\2G\u00d3\3\2\2\2") + buf.write(u"I\u00d6\3\2\2\2K\u00e3\3\2\2\2M\u00ee\3\2\2\2O\u00f1") + buf.write(u"\3\2\2\2Q\u00f5\3\2\2\2S\u00fd\3\2\2\2U\u0107\3\2\2\2") + buf.write(u"W\u0109\3\2\2\2YZ\7*\2\2Z\4\3\2\2\2[\\\7+\2\2\\\6\3\2") + buf.write(u"\2\2]^\7.\2\2^\b\3\2\2\2_`\7]\2\2`\n\3\2\2\2ab\7_\2\2") + buf.write(u"b\f\3\2\2\2cd\7k\2\2de\7h\2\2e\16\3\2\2\2fg\7g\2\2gh") + 
buf.write(u"\7n\2\2hi\7u\2\2ij\7g\2\2j\20\3\2\2\2kl\7n\2\2lm\7g\2") + buf.write(u"\2mn\7v\2\2n\22\3\2\2\2op\7?\2\2p\24\3\2\2\2qr\7=\2\2") + buf.write(u"r\26\3\2\2\2st\7}\2\2t\30\3\2\2\2uv\7\177\2\2v\32\3\2") + buf.write(u"\2\2wx\7h\2\2xy\7p\2\2y\34\3\2\2\2z{\7/\2\2{|\7@\2\2") + buf.write(u"|\36\3\2\2\2}~\7f\2\2~\177\7g\2\2\177\u0080\7h\2\2\u0080") + buf.write(u" \3\2\2\2\u0081\u0082\7<\2\2\u0082\"\3\2\2\2\u0083\u0084") + buf.write(u"\7V\2\2\u0084\u0085\7g\2\2\u0085\u0086\7p\2\2\u0086\u0087") + buf.write(u"\7u\2\2\u0087\u0088\7q\2\2\u0088\u0089\7t\2\2\u0089$") + buf.write(u"\3\2\2\2\u008a\u008b\7a\2\2\u008b&\3\2\2\2\u008c\u008d") + buf.write(u"\7x\2\2\u008d\u008e\7\62\2\2\u008e\u008f\7\60\2\2\u008f") + buf.write(u"\u0090\7\62\2\2\u0090\u0091\7\60\2\2\u0091\u0092\7\64") + buf.write(u"\2\2\u0092(\3\2\2\2\u0093\u0095\t\2\2\2\u0094\u0093\3") + buf.write(u"\2\2\2\u0095\u0096\3\2\2\2\u0096\u0094\3\2\2\2\u0096") + buf.write(u"\u0097\3\2\2\2\u0097\u0098\3\2\2\2\u0098\u0099\b\25\2") + buf.write(u"\2\u0099*\3\2\2\2\u009a\u009b\7\61\2\2\u009b\u009c\7") + buf.write(u"\61\2\2\u009c\u00a0\3\2\2\2\u009d\u009f\13\2\2\2\u009e") + buf.write(u"\u009d\3\2\2\2\u009f\u00a2\3\2\2\2\u00a0\u00a1\3\2\2") + buf.write(u"\2\u00a0\u009e\3\2\2\2\u00a1\u00a3\3\2\2\2\u00a2\u00a0") + buf.write(u"\3\2\2\2\u00a3\u00a4\7\f\2\2\u00a4\u00a5\3\2\2\2\u00a5") + buf.write(u"\u00a6\b\26\2\2\u00a6,\3\2\2\2\u00a7\u00a8\7\61\2\2\u00a8") + buf.write(u"\u00a9\7,\2\2\u00a9\u00ad\3\2\2\2\u00aa\u00ac\13\2\2") + buf.write(u"\2\u00ab\u00aa\3\2\2\2\u00ac\u00af\3\2\2\2\u00ad\u00ae") + buf.write(u"\3\2\2\2\u00ad\u00ab\3\2\2\2\u00ae\u00b0\3\2\2\2\u00af") + buf.write(u"\u00ad\3\2\2\2\u00b0\u00b1\7,\2\2\u00b1\u00b2\7\61\2") + buf.write(u"\2\u00b2\u00b3\3\2\2\2\u00b3\u00b4\b\27\2\2\u00b4.\3") + buf.write(u"\2\2\2\u00b5\u00b6\7,\2\2\u00b6\60\3\2\2\2\u00b7\u00b8") + buf.write(u"\7\61\2\2\u00b8\62\3\2\2\2\u00b9\u00ba\7-\2\2\u00ba\64") + buf.write(u"\3\2\2\2\u00bb\u00bc\7/\2\2\u00bc\66\3\2\2\2\u00bd\u00be") + 
buf.write(u"\7>\2\2\u00be8\3\2\2\2\u00bf\u00c0\7@\2\2\u00c0:\3\2") + buf.write(u"\2\2\u00c1\u00c2\7>\2\2\u00c2\u00c3\7?\2\2\u00c3<\3\2") + buf.write(u"\2\2\u00c4\u00c5\7@\2\2\u00c5\u00c6\7?\2\2\u00c6>\3\2") + buf.write(u"\2\2\u00c7\u00c8\7?\2\2\u00c8\u00c9\7?\2\2\u00c9@\3\2") + buf.write(u"\2\2\u00ca\u00cb\7#\2\2\u00cb\u00cc\7?\2\2\u00ccB\3\2") + buf.write(u"\2\2\u00cd\u00ce\7B\2\2\u00ce\u00cf\5S*\2\u00cfD\3\2") + buf.write(u"\2\2\u00d0\u00d1\7\'\2\2\u00d1\u00d2\5S*\2\u00d2F\3\2") + buf.write(u"\2\2\u00d3\u00d4\7\'\2\2\u00d4\u00d5\5O(\2\u00d5H\3\2") + buf.write(u"\2\2\u00d6\u00d7\7o\2\2\u00d7\u00d8\7w\2\2\u00d8\u00d9") + buf.write(u"\7v\2\2\u00d9J\3\2\2\2\u00da\u00db\7V\2\2\u00db\u00dc") + buf.write(u"\7t\2\2\u00dc\u00dd\7w\2\2\u00dd\u00e4\7g\2\2\u00de\u00df") + buf.write(u"\7H\2\2\u00df\u00e0\7c\2\2\u00e0\u00e1\7n\2\2\u00e1\u00e2") + buf.write(u"\7u\2\2\u00e2\u00e4\7g\2\2\u00e3\u00da\3\2\2\2\u00e3") + buf.write(u"\u00de\3\2\2\2\u00e4L\3\2\2\2\u00e5\u00e6\5O(\2\u00e6") + buf.write(u"\u00e7\7\60\2\2\u00e7\u00e9\5O(\2\u00e8\u00ea\5Q)\2\u00e9") + buf.write(u"\u00e8\3\2\2\2\u00e9\u00ea\3\2\2\2\u00ea\u00ef\3\2\2") + buf.write(u"\2\u00eb\u00ec\5O(\2\u00ec\u00ed\5Q)\2\u00ed\u00ef\3") + buf.write(u"\2\2\2\u00ee\u00e5\3\2\2\2\u00ee\u00eb\3\2\2\2\u00ef") + buf.write(u"N\3\2\2\2\u00f0\u00f2\5W,\2\u00f1\u00f0\3\2\2\2\u00f2") + buf.write(u"\u00f3\3\2\2\2\u00f3\u00f1\3\2\2\2\u00f3\u00f4\3\2\2") + buf.write(u"\2\u00f4P\3\2\2\2\u00f5\u00f7\t\3\2\2\u00f6\u00f8\t\4") + buf.write(u"\2\2\u00f7\u00f6\3\2\2\2\u00f7\u00f8\3\2\2\2\u00f8\u00f9") + buf.write(u"\3\2\2\2\u00f9\u00fa\5O(\2\u00faR\3\2\2\2\u00fb\u00fe") + buf.write(u"\7a\2\2\u00fc\u00fe\5U+\2\u00fd\u00fb\3\2\2\2\u00fd\u00fc") + buf.write(u"\3\2\2\2\u00fe\u0104\3\2\2\2\u00ff\u0103\7a\2\2\u0100") + buf.write(u"\u0103\5U+\2\u0101\u0103\5W,\2\u0102\u00ff\3\2\2\2\u0102") + buf.write(u"\u0100\3\2\2\2\u0102\u0101\3\2\2\2\u0103\u0106\3\2\2") + buf.write(u"\2\u0104\u0102\3\2\2\2\u0104\u0105\3\2\2\2\u0105T\3\2") + 
buf.write(u"\2\2\u0106\u0104\3\2\2\2\u0107\u0108\t\5\2\2\u0108V\3") + buf.write(u"\2\2\2\u0109\u010a\t\6\2\2\u010aX\3\2\2\2\16\2\u0096") + buf.write(u"\u00a0\u00ad\u00e3\u00e9\u00ee\u00f3\u00f7\u00fd\u0102") + buf.write(u"\u0104\3\b\2\2") + return buf.getvalue() + + +class RelayLexer(Lexer): + + atn = ATNDeserializer().deserialize(serializedATN()) + + decisionsToDFA = [ DFA(ds, i) for i, ds in enumerate(atn.decisionToState) ] + + T__0 = 1 + T__1 = 2 + T__2 = 3 + T__3 = 4 + T__4 = 5 + T__5 = 6 + T__6 = 7 + T__7 = 8 + T__8 = 9 + T__9 = 10 + T__10 = 11 + T__11 = 12 + T__12 = 13 + T__13 = 14 + T__14 = 15 + T__15 = 16 + T__16 = 17 + T__17 = 18 + SEMVER = 19 + WS = 20 + LINE_COMMENT = 21 + COMMENT = 22 + MUL = 23 + DIV = 24 + ADD = 25 + SUB = 26 + LT = 27 + GT = 28 + LE = 29 + GE = 30 + EQ = 31 + NE = 32 + GLOBAL_VAR = 33 + LOCAL_VAR = 34 + GRAPH_VAR = 35 + MUT = 36 + BOOL_LIT = 37 + FLOAT = 38 + NAT = 39 + CNAME = 40 + + channelNames = [ u"DEFAULT_TOKEN_CHANNEL", u"HIDDEN" ] + + modeNames = [ u"DEFAULT_MODE" ] + + literalNames = [ u"", + u"'('", u"')'", u"','", u"'['", u"']'", u"'if'", u"'else'", + u"'let'", u"'='", u"';'", u"'{'", u"'}'", u"'fn'", u"'->'", + u"'def'", u"':'", u"'Tensor'", u"'_'", u"'v0.0.2'", u"'*'", + u"'/'", u"'+'", u"'-'", u"'<'", u"'>'", u"'<='", u"'>='", u"'=='", + u"'!='", u"'mut'" ] + + symbolicNames = [ u"", + u"SEMVER", u"WS", u"LINE_COMMENT", u"COMMENT", u"MUL", u"DIV", + u"ADD", u"SUB", u"LT", u"GT", u"LE", u"GE", u"EQ", u"NE", u"GLOBAL_VAR", + u"LOCAL_VAR", u"GRAPH_VAR", u"MUT", u"BOOL_LIT", u"FLOAT", u"NAT", + u"CNAME" ] + + ruleNames = [ u"T__0", u"T__1", u"T__2", u"T__3", u"T__4", u"T__5", + u"T__6", u"T__7", u"T__8", u"T__9", u"T__10", u"T__11", + u"T__12", u"T__13", u"T__14", u"T__15", u"T__16", u"T__17", + u"SEMVER", u"WS", u"LINE_COMMENT", u"COMMENT", u"MUL", + u"DIV", u"ADD", u"SUB", u"LT", u"GT", u"LE", u"GE", u"EQ", + u"NE", u"GLOBAL_VAR", u"LOCAL_VAR", u"GRAPH_VAR", u"MUT", + u"BOOL_LIT", u"FLOAT", u"NAT", u"EXP", u"CNAME", 
u"LETTER", + u"DIGIT" ] + + grammarFileName = u"Relay.g4" + + def __init__(self, input=None, output=sys.stdout): + super(RelayLexer, self).__init__(input, output=output) + self.checkVersion("4.7.2") + self._interp = LexerATNSimulator(self, self.atn, self.decisionsToDFA, PredictionContextCache()) + self._actions = None + self._predicates = None + + diff --git a/python/tvm/relay/grammar/py2/RelayLexer.tokens b/python/tvm/relay/grammar/py2/RelayLexer.tokens new file mode 100644 index 000000000000..41f3ee62a86c --- /dev/null +++ b/python/tvm/relay/grammar/py2/RelayLexer.tokens @@ -0,0 +1,70 @@ +T__0=1 +T__1=2 +T__2=3 +T__3=4 +T__4=5 +T__5=6 +T__6=7 +T__7=8 +T__8=9 +T__9=10 +T__10=11 +T__11=12 +T__12=13 +T__13=14 +T__14=15 +T__15=16 +T__16=17 +T__17=18 +SEMVER=19 +WS=20 +LINE_COMMENT=21 +COMMENT=22 +MUL=23 +DIV=24 +ADD=25 +SUB=26 +LT=27 +GT=28 +LE=29 +GE=30 +EQ=31 +NE=32 +GLOBAL_VAR=33 +LOCAL_VAR=34 +GRAPH_VAR=35 +MUT=36 +BOOL_LIT=37 +FLOAT=38 +NAT=39 +CNAME=40 +'('=1 +')'=2 +','=3 +'['=4 +']'=5 +'if'=6 +'else'=7 +'let'=8 +'='=9 +';'=10 +'{'=11 +'}'=12 +'fn'=13 +'->'=14 +'def'=15 +':'=16 +'Tensor'=17 +'_'=18 +'v0.0.2'=19 +'*'=23 +'/'=24 +'+'=25 +'-'=26 +'<'=27 +'>'=28 +'<='=29 +'>='=30 +'=='=31 +'!='=32 +'mut'=36 diff --git a/python/tvm/relay/grammar/py2/RelayParser.py b/python/tvm/relay/grammar/py2/RelayParser.py new file mode 100644 index 000000000000..77f56bf0545a --- /dev/null +++ b/python/tvm/relay/grammar/py2/RelayParser.py @@ -0,0 +1,2311 @@ +# Generated from /home/sslyu/tvm/python/tvm/relay/grammar/Relay.g4 by ANTLR 4.7.2 +# encoding: utf-8 +from __future__ import print_function +from antlr4 import * +from io import StringIO +import sys + + +def serializedATN(): + with StringIO() as buf: + buf.write(u"\3\u608b\ua72a\u8133\ub9ed\u417c\u3be7\u7786\u5964\3") + buf.write(u"*\u014c\4\2\t\2\4\3\t\3\4\4\t\4\4\5\t\5\4\6\t\6\4\7\t") + buf.write(u"\7\4\b\t\b\4\t\t\t\4\n\t\n\4\13\t\13\4\f\t\f\4\r\t\r") + buf.write(u"\4\16\t\16\4\17\t\17\4\20\t\20\4\21\t\21\4\22\t\22\4") + 
buf.write(u"\23\t\23\3\2\3\2\3\3\3\3\7\3+\n\3\f\3\16\3.\13\3\3\3") + buf.write(u"\5\3\61\n\3\3\3\3\3\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3") + buf.write(u"\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\6\4H\n\4\r") + buf.write(u"\4\16\4I\3\4\3\4\3\4\3\4\3\4\3\4\7\4R\n\4\f\4\16\4U\13") + buf.write(u"\4\5\4W\n\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3") + buf.write(u"\4\5\4d\n\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\5\4n\n\4") + buf.write(u"\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4") + buf.write(u"\3\4\3\4\3\4\5\4\u0080\n\4\3\4\3\4\3\4\3\4\3\4\3\4\3") + buf.write(u"\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3") + buf.write(u"\4\7\4\u0096\n\4\f\4\16\4\u0099\13\4\5\4\u009b\n\4\3") + buf.write(u"\4\7\4\u009e\n\4\f\4\16\4\u00a1\13\4\3\5\3\5\5\5\u00a5") + buf.write(u"\n\5\3\5\3\5\3\5\3\5\3\5\5\5\u00ac\n\5\3\5\3\5\3\6\3") + buf.write(u"\6\3\6\5\6\u00b3\n\6\3\6\3\6\3\6\3\6\3\6\5\6\u00ba\n") + buf.write(u"\6\3\6\3\6\3\7\3\7\3\7\3\7\3\7\3\7\5\7\u00c4\n\7\3\b") + buf.write(u"\3\b\3\b\7\b\u00c9\n\b\f\b\16\b\u00cc\13\b\5\b\u00ce") + buf.write(u"\n\b\3\t\3\t\3\t\5\t\u00d3\n\t\3\n\3\n\3\n\7\n\u00d8") + buf.write(u"\n\n\f\n\16\n\u00db\13\n\5\n\u00dd\n\n\3\13\3\13\3\13") + buf.write(u"\3\13\3\f\3\f\3\f\3\f\3\f\3\f\7\f\u00e9\n\f\f\f\16\f") + buf.write(u"\u00ec\13\f\3\f\3\f\5\f\u00f0\n\f\3\r\3\r\3\r\3\r\3\r") + buf.write(u"\3\r\3\r\3\r\3\r\3\r\3\r\6\r\u00fd\n\r\r\r\16\r\u00fe") + buf.write(u"\3\r\3\r\3\r\3\r\3\r\3\r\3\r\3\r\3\r\3\r\3\r\3\r\5\r") + buf.write(u"\u010d\n\r\3\r\3\r\3\r\3\r\7\r\u0113\n\r\f\r\16\r\u0116") + buf.write(u"\13\r\5\r\u0118\n\r\3\r\3\r\3\r\3\r\3\r\5\r\u011f\n\r") + buf.write(u"\3\16\3\16\3\16\3\16\3\16\3\16\3\16\3\16\3\16\3\16\3") + buf.write(u"\16\6\16\u012c\n\16\r\16\16\16\u012d\3\16\3\16\5\16\u0132") + buf.write(u"\n\16\3\17\3\17\3\17\3\17\3\17\5\17\u0139\n\17\3\20\3") + buf.write(u"\20\3\21\3\21\3\21\3\21\3\22\3\22\3\22\5\22\u0144\n\22") + buf.write(u"\3\23\3\23\3\23\3\23\5\23\u014a\n\23\3\23\2\3\6\24\2") + 
buf.write(u"\4\6\b\n\f\16\20\22\24\26\30\32\34\36 \"$\2\6\3\2\31") + buf.write(u"\32\3\2\33\34\3\2\35 \3\2!\"\2\u0175\2&\3\2\2\2\4(\3") + buf.write(u"\2\2\2\6\177\3\2\2\2\b\u00a2\3\2\2\2\n\u00af\3\2\2\2") + buf.write(u"\f\u00c3\3\2\2\2\16\u00cd\3\2\2\2\20\u00cf\3\2\2\2\22") + buf.write(u"\u00dc\3\2\2\2\24\u00de\3\2\2\2\26\u00ef\3\2\2\2\30\u011e") + buf.write(u"\3\2\2\2\32\u0131\3\2\2\2\34\u0138\3\2\2\2\36\u013a\3") + buf.write(u"\2\2\2 \u013c\3\2\2\2\"\u0143\3\2\2\2$\u0149\3\2\2\2") + buf.write(u"&\'\7*\2\2\'\3\3\2\2\2(\60\7\25\2\2)+\5\n\6\2*)\3\2\2") + buf.write(u"\2+.\3\2\2\2,*\3\2\2\2,-\3\2\2\2-\61\3\2\2\2.,\3\2\2") + buf.write(u"\2/\61\5\6\4\2\60,\3\2\2\2\60/\3\2\2\2\61\62\3\2\2\2") + buf.write(u"\62\63\7\2\2\3\63\5\3\2\2\2\64\65\b\4\1\2\65\66\7\3\2") + buf.write(u"\2\66\67\5\6\4\2\678\7\4\2\28\u0080\3\2\2\29:\7\34\2") + buf.write(u"\2:\u0080\5\6\4\23;\u0080\5\b\5\2<=\7\3\2\2=\u0080\7") + buf.write(u"\4\2\2>?\7\3\2\2?@\5\6\4\2@A\7\5\2\2AB\7\4\2\2B\u0080") + buf.write(u"\3\2\2\2CD\7\3\2\2DG\5\6\4\2EF\7\5\2\2FH\5\6\4\2GE\3") + buf.write(u"\2\2\2HI\3\2\2\2IG\3\2\2\2IJ\3\2\2\2JK\3\2\2\2KL\7\4") + buf.write(u"\2\2L\u0080\3\2\2\2MV\7\6\2\2NS\5\6\4\2OP\7\5\2\2PR\5") + buf.write(u"\6\4\2QO\3\2\2\2RU\3\2\2\2SQ\3\2\2\2ST\3\2\2\2TW\3\2") + buf.write(u"\2\2US\3\2\2\2VN\3\2\2\2VW\3\2\2\2WX\3\2\2\2X\u0080\7") + buf.write(u"\7\2\2YZ\7\b\2\2Z[\7\3\2\2[\\\5\6\4\2\\]\7\4\2\2]^\5") + buf.write(u" \21\2^_\7\t\2\2_`\5 \21\2`\u0080\3\2\2\2ac\7\n\2\2b") + buf.write(u"d\7&\2\2cb\3\2\2\2cd\3\2\2\2de\3\2\2\2ef\5\20\t\2fg\7") + buf.write(u"\13\2\2gh\5\6\4\2hi\7\f\2\2ij\5\6\4\bj\u0080\3\2\2\2") + buf.write(u"km\7\n\2\2ln\7&\2\2ml\3\2\2\2mn\3\2\2\2no\3\2\2\2op\5") + buf.write(u"\20\t\2pq\7\13\2\2qr\7\r\2\2rs\5\6\4\2st\7\16\2\2tu\7") + buf.write(u"\f\2\2uv\5\6\4\7v\u0080\3\2\2\2wx\5$\23\2xy\7\13\2\2") + buf.write(u"yz\5\6\4\2z{\7\f\2\2{|\5\6\4\5|\u0080\3\2\2\2}\u0080") + buf.write(u"\5$\23\2~\u0080\5\"\22\2\177\64\3\2\2\2\1779\3\2\2\2") + 
buf.write(u"\177;\3\2\2\2\177<\3\2\2\2\177>\3\2\2\2\177C\3\2\2\2") + buf.write(u"\177M\3\2\2\2\177Y\3\2\2\2\177a\3\2\2\2\177k\3\2\2\2") + buf.write(u"\177w\3\2\2\2\177}\3\2\2\2\177~\3\2\2\2\u0080\u009f\3") + buf.write(u"\2\2\2\u0081\u0082\f\22\2\2\u0082\u0083\t\2\2\2\u0083") + buf.write(u"\u009e\5\6\4\23\u0084\u0085\f\21\2\2\u0085\u0086\t\3") + buf.write(u"\2\2\u0086\u009e\5\6\4\22\u0087\u0088\f\20\2\2\u0088") + buf.write(u"\u0089\t\4\2\2\u0089\u009e\5\6\4\21\u008a\u008b\f\17") + buf.write(u"\2\2\u008b\u008c\t\5\2\2\u008c\u009e\5\6\4\20\u008d\u008e") + buf.write(u"\f\6\2\2\u008e\u008f\7\f\2\2\u008f\u009e\5\6\4\7\u0090") + buf.write(u"\u0091\f\24\2\2\u0091\u009a\7\3\2\2\u0092\u0097\5\6\4") + buf.write(u"\2\u0093\u0094\7\5\2\2\u0094\u0096\5\6\4\2\u0095\u0093") + buf.write(u"\3\2\2\2\u0096\u0099\3\2\2\2\u0097\u0095\3\2\2\2\u0097") + buf.write(u"\u0098\3\2\2\2\u0098\u009b\3\2\2\2\u0099\u0097\3\2\2") + buf.write(u"\2\u009a\u0092\3\2\2\2\u009a\u009b\3\2\2\2\u009b\u009c") + buf.write(u"\3\2\2\2\u009c\u009e\7\4\2\2\u009d\u0081\3\2\2\2\u009d") + buf.write(u"\u0084\3\2\2\2\u009d\u0087\3\2\2\2\u009d\u008a\3\2\2") + buf.write(u"\2\u009d\u008d\3\2\2\2\u009d\u0090\3\2\2\2\u009e\u00a1") + buf.write(u"\3\2\2\2\u009f\u009d\3\2\2\2\u009f\u00a0\3\2\2\2\u00a0") + buf.write(u"\7\3\2\2\2\u00a1\u009f\3\2\2\2\u00a2\u00a4\7\17\2\2\u00a3") + buf.write(u"\u00a5\5\26\f\2\u00a4\u00a3\3\2\2\2\u00a4\u00a5\3\2\2") + buf.write(u"\2\u00a5\u00a6\3\2\2\2\u00a6\u00a7\7\3\2\2\u00a7\u00a8") + buf.write(u"\5\f\7\2\u00a8\u00ab\7\4\2\2\u00a9\u00aa\7\20\2\2\u00aa") + buf.write(u"\u00ac\5\30\r\2\u00ab\u00a9\3\2\2\2\u00ab\u00ac\3\2\2") + buf.write(u"\2\u00ac\u00ad\3\2\2\2\u00ad\u00ae\5 \21\2\u00ae\t\3") + buf.write(u"\2\2\2\u00af\u00b0\7\21\2\2\u00b0\u00b2\5$\23\2\u00b1") + buf.write(u"\u00b3\5\26\f\2\u00b2\u00b1\3\2\2\2\u00b2\u00b3\3\2\2") + buf.write(u"\2\u00b3\u00b4\3\2\2\2\u00b4\u00b5\7\3\2\2\u00b5\u00b6") + buf.write(u"\5\f\7\2\u00b6\u00b9\7\4\2\2\u00b7\u00b8\7\20\2\2\u00b8") + 
buf.write(u"\u00ba\5\30\r\2\u00b9\u00b7\3\2\2\2\u00b9\u00ba\3\2\2") + buf.write(u"\2\u00ba\u00bb\3\2\2\2\u00bb\u00bc\5 \21\2\u00bc\13\3") + buf.write(u"\2\2\2\u00bd\u00c4\5\16\b\2\u00be\u00c4\5\22\n\2\u00bf") + buf.write(u"\u00c0\5\16\b\2\u00c0\u00c1\7\5\2\2\u00c1\u00c2\5\22") + buf.write(u"\n\2\u00c2\u00c4\3\2\2\2\u00c3\u00bd\3\2\2\2\u00c3\u00be") + buf.write(u"\3\2\2\2\u00c3\u00bf\3\2\2\2\u00c4\r\3\2\2\2\u00c5\u00ca") + buf.write(u"\5\20\t\2\u00c6\u00c7\7\5\2\2\u00c7\u00c9\5\20\t\2\u00c8") + buf.write(u"\u00c6\3\2\2\2\u00c9\u00cc\3\2\2\2\u00ca\u00c8\3\2\2") + buf.write(u"\2\u00ca\u00cb\3\2\2\2\u00cb\u00ce\3\2\2\2\u00cc\u00ca") + buf.write(u"\3\2\2\2\u00cd\u00c5\3\2\2\2\u00cd\u00ce\3\2\2\2\u00ce") + buf.write(u"\17\3\2\2\2\u00cf\u00d2\5$\23\2\u00d0\u00d1\7\22\2\2") + buf.write(u"\u00d1\u00d3\5\30\r\2\u00d2\u00d0\3\2\2\2\u00d2\u00d3") + buf.write(u"\3\2\2\2\u00d3\21\3\2\2\2\u00d4\u00d9\5\24\13\2\u00d5") + buf.write(u"\u00d6\7\5\2\2\u00d6\u00d8\5\24\13\2\u00d7\u00d5\3\2") + buf.write(u"\2\2\u00d8\u00db\3\2\2\2\u00d9\u00d7\3\2\2\2\u00d9\u00da") + buf.write(u"\3\2\2\2\u00da\u00dd\3\2\2\2\u00db\u00d9\3\2\2\2\u00dc") + buf.write(u"\u00d4\3\2\2\2\u00dc\u00dd\3\2\2\2\u00dd\23\3\2\2\2\u00de") + buf.write(u"\u00df\7*\2\2\u00df\u00e0\7\13\2\2\u00e0\u00e1\5\6\4") + buf.write(u"\2\u00e1\25\3\2\2\2\u00e2\u00e3\7\6\2\2\u00e3\u00f0\7") + buf.write(u"\7\2\2\u00e4\u00e5\7\6\2\2\u00e5\u00ea\5$\23\2\u00e6") + buf.write(u"\u00e7\7\5\2\2\u00e7\u00e9\5$\23\2\u00e8\u00e6\3\2\2") + buf.write(u"\2\u00e9\u00ec\3\2\2\2\u00ea\u00e8\3\2\2\2\u00ea\u00eb") + buf.write(u"\3\2\2\2\u00eb\u00ed\3\2\2\2\u00ec\u00ea\3\2\2\2\u00ed") + buf.write(u"\u00ee\7\7\2\2\u00ee\u00f0\3\2\2\2\u00ef\u00e2\3\2\2") + buf.write(u"\2\u00ef\u00e4\3\2\2\2\u00f0\27\3\2\2\2\u00f1\u00f2\7") + buf.write(u"\3\2\2\u00f2\u011f\7\4\2\2\u00f3\u00f4\7\3\2\2\u00f4") + buf.write(u"\u00f5\5\30\r\2\u00f5\u00f6\7\5\2\2\u00f6\u00f7\7\4\2") + buf.write(u"\2\u00f7\u011f\3\2\2\2\u00f8\u00f9\7\3\2\2\u00f9\u00fc") + 
buf.write(u"\5\30\r\2\u00fa\u00fb\7\5\2\2\u00fb\u00fd\5\30\r\2\u00fc") + buf.write(u"\u00fa\3\2\2\2\u00fd\u00fe\3\2\2\2\u00fe\u00fc\3\2\2") + buf.write(u"\2\u00fe\u00ff\3\2\2\2\u00ff\u0100\3\2\2\2\u0100\u0101") + buf.write(u"\7\4\2\2\u0101\u011f\3\2\2\2\u0102\u011f\5\36\20\2\u0103") + buf.write(u"\u0104\7\23\2\2\u0104\u0105\7\6\2\2\u0105\u0106\5\32") + buf.write(u"\16\2\u0106\u0107\7\5\2\2\u0107\u0108\5\30\r\2\u0108") + buf.write(u"\u0109\7\7\2\2\u0109\u011f\3\2\2\2\u010a\u010c\7\17\2") + buf.write(u"\2\u010b\u010d\5\26\f\2\u010c\u010b\3\2\2\2\u010c\u010d") + buf.write(u"\3\2\2\2\u010d\u010e\3\2\2\2\u010e\u0117\7\3\2\2\u010f") + buf.write(u"\u0114\5\30\r\2\u0110\u0111\7\5\2\2\u0111\u0113\5\30") + buf.write(u"\r\2\u0112\u0110\3\2\2\2\u0113\u0116\3\2\2\2\u0114\u0112") + buf.write(u"\3\2\2\2\u0114\u0115\3\2\2\2\u0115\u0118\3\2\2\2\u0116") + buf.write(u"\u0114\3\2\2\2\u0117\u010f\3\2\2\2\u0117\u0118\3\2\2") + buf.write(u"\2\u0118\u0119\3\2\2\2\u0119\u011a\7\4\2\2\u011a\u011b") + buf.write(u"\7\20\2\2\u011b\u011f\5\30\r\2\u011c\u011f\7\24\2\2\u011d") + buf.write(u"\u011f\7)\2\2\u011e\u00f1\3\2\2\2\u011e\u00f3\3\2\2\2") + buf.write(u"\u011e\u00f8\3\2\2\2\u011e\u0102\3\2\2\2\u011e\u0103") + buf.write(u"\3\2\2\2\u011e\u010a\3\2\2\2\u011e\u011c\3\2\2\2\u011e") + buf.write(u"\u011d\3\2\2\2\u011f\31\3\2\2\2\u0120\u0121\7\3\2\2\u0121") + buf.write(u"\u0132\7\4\2\2\u0122\u0123\7\3\2\2\u0123\u0124\5\34\17") + buf.write(u"\2\u0124\u0125\7\5\2\2\u0125\u0126\7\4\2\2\u0126\u0132") + buf.write(u"\3\2\2\2\u0127\u0128\7\3\2\2\u0128\u012b\5\34\17\2\u0129") + buf.write(u"\u012a\7\5\2\2\u012a\u012c\5\34\17\2\u012b\u0129\3\2") + buf.write(u"\2\2\u012c\u012d\3\2\2\2\u012d\u012b\3\2\2\2\u012d\u012e") + buf.write(u"\3\2\2\2\u012e\u012f\3\2\2\2\u012f\u0130\7\4\2\2\u0130") + buf.write(u"\u0132\3\2\2\2\u0131\u0120\3\2\2\2\u0131\u0122\3\2\2") + buf.write(u"\2\u0131\u0127\3\2\2\2\u0132\33\3\2\2\2\u0133\u0134\7") + buf.write(u"\3\2\2\u0134\u0135\5\34\17\2\u0135\u0136\7\4\2\2\u0136") + 
buf.write(u"\u0139\3\2\2\2\u0137\u0139\7)\2\2\u0138\u0133\3\2\2\2") + buf.write(u"\u0138\u0137\3\2\2\2\u0139\35\3\2\2\2\u013a\u013b\7*") + buf.write(u"\2\2\u013b\37\3\2\2\2\u013c\u013d\7\r\2\2\u013d\u013e") + buf.write(u"\5\6\4\2\u013e\u013f\7\16\2\2\u013f!\3\2\2\2\u0140\u0144") + buf.write(u"\7(\2\2\u0141\u0144\7)\2\2\u0142\u0144\7\'\2\2\u0143") + buf.write(u"\u0140\3\2\2\2\u0143\u0141\3\2\2\2\u0143\u0142\3\2\2") + buf.write(u"\2\u0144#\3\2\2\2\u0145\u014a\5\2\2\2\u0146\u014a\7#") + buf.write(u"\2\2\u0147\u014a\7$\2\2\u0148\u014a\7%\2\2\u0149\u0145") + buf.write(u"\3\2\2\2\u0149\u0146\3\2\2\2\u0149\u0147\3\2\2\2\u0149") + buf.write(u"\u0148\3\2\2\2\u014a%\3\2\2\2$,\60ISVcm\177\u0097\u009a") + buf.write(u"\u009d\u009f\u00a4\u00ab\u00b2\u00b9\u00c3\u00ca\u00cd") + buf.write(u"\u00d2\u00d9\u00dc\u00ea\u00ef\u00fe\u010c\u0114\u0117") + buf.write(u"\u011e\u012d\u0131\u0138\u0143\u0149") + return buf.getvalue() + + +class RelayParser ( Parser ): + + grammarFileName = "Relay.g4" + + atn = ATNDeserializer().deserialize(serializedATN()) + + decisionsToDFA = [ DFA(ds, i) for i, ds in enumerate(atn.decisionToState) ] + + sharedContextCache = PredictionContextCache() + + literalNames = [ u"", u"'('", u"')'", u"','", u"'['", u"']'", + u"'if'", u"'else'", u"'let'", u"'='", u"';'", u"'{'", + u"'}'", u"'fn'", u"'->'", u"'def'", u"':'", u"'Tensor'", + u"'_'", u"'v0.0.2'", u"", u"", u"", + u"'*'", u"'/'", u"'+'", u"'-'", u"'<'", u"'>'", u"'<='", + u"'>='", u"'=='", u"'!='", u"", u"", + u"", u"'mut'" ] + + symbolicNames = [ u"", u"", u"", u"", + u"", u"", u"", u"", + u"", u"", u"", u"", + u"", u"", u"", u"", + u"", u"", u"", u"SEMVER", + u"WS", u"LINE_COMMENT", u"COMMENT", u"MUL", u"DIV", + u"ADD", u"SUB", u"LT", u"GT", u"LE", u"GE", u"EQ", + u"NE", u"GLOBAL_VAR", u"LOCAL_VAR", u"GRAPH_VAR", + u"MUT", u"BOOL_LIT", u"FLOAT", u"NAT", u"CNAME" ] + + RULE_opIdent = 0 + RULE_prog = 1 + RULE_expr = 2 + RULE_func = 3 + RULE_defn = 4 + RULE_argList = 5 + RULE_varList = 6 + RULE_var = 7 + 
RULE_attrList = 8 + RULE_attr = 9 + RULE_typeParamSeq = 10 + RULE_type_ = 11 + RULE_shapeSeq = 12 + RULE_shape = 13 + RULE_typeIdent = 14 + RULE_body = 15 + RULE_scalar = 16 + RULE_ident = 17 + + ruleNames = [ u"opIdent", u"prog", u"expr", u"func", u"defn", u"argList", + u"varList", u"var", u"attrList", u"attr", u"typeParamSeq", + u"type_", u"shapeSeq", u"shape", u"typeIdent", u"body", + u"scalar", u"ident" ] + + EOF = Token.EOF + T__0=1 + T__1=2 + T__2=3 + T__3=4 + T__4=5 + T__5=6 + T__6=7 + T__7=8 + T__8=9 + T__9=10 + T__10=11 + T__11=12 + T__12=13 + T__13=14 + T__14=15 + T__15=16 + T__16=17 + T__17=18 + SEMVER=19 + WS=20 + LINE_COMMENT=21 + COMMENT=22 + MUL=23 + DIV=24 + ADD=25 + SUB=26 + LT=27 + GT=28 + LE=29 + GE=30 + EQ=31 + NE=32 + GLOBAL_VAR=33 + LOCAL_VAR=34 + GRAPH_VAR=35 + MUT=36 + BOOL_LIT=37 + FLOAT=38 + NAT=39 + CNAME=40 + + def __init__(self, input, output=sys.stdout): + super(RelayParser, self).__init__(input, output=output) + self.checkVersion("4.7.2") + self._interp = ParserATNSimulator(self, self.atn, self.decisionsToDFA, self.sharedContextCache) + self._predicates = None + + + + + class OpIdentContext(ParserRuleContext): + + def __init__(self, parser, parent=None, invokingState=-1): + super(RelayParser.OpIdentContext, self).__init__(parent, invokingState) + self.parser = parser + + def CNAME(self): + return self.getToken(RelayParser.CNAME, 0) + + def getRuleIndex(self): + return RelayParser.RULE_opIdent + + def accept(self, visitor): + if hasattr(visitor, "visitOpIdent"): + return visitor.visitOpIdent(self) + else: + return visitor.visitChildren(self) + + + + + def opIdent(self): + + localctx = RelayParser.OpIdentContext(self, self._ctx, self.state) + self.enterRule(localctx, 0, self.RULE_opIdent) + try: + self.enterOuterAlt(localctx, 1) + self.state = 36 + self.match(RelayParser.CNAME) + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + 
self.exitRule() + return localctx + + + class ProgContext(ParserRuleContext): + + def __init__(self, parser, parent=None, invokingState=-1): + super(RelayParser.ProgContext, self).__init__(parent, invokingState) + self.parser = parser + + def SEMVER(self): + return self.getToken(RelayParser.SEMVER, 0) + + def EOF(self): + return self.getToken(RelayParser.EOF, 0) + + def expr(self): + return self.getTypedRuleContext(RelayParser.ExprContext,0) + + + def defn(self, i=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.DefnContext) + else: + return self.getTypedRuleContext(RelayParser.DefnContext,i) + + + def getRuleIndex(self): + return RelayParser.RULE_prog + + def accept(self, visitor): + if hasattr(visitor, "visitProg"): + return visitor.visitProg(self) + else: + return visitor.visitChildren(self) + + + + + def prog(self): + + localctx = RelayParser.ProgContext(self, self._ctx, self.state) + self.enterRule(localctx, 2, self.RULE_prog) + self._la = 0 # Token type + try: + self.enterOuterAlt(localctx, 1) + self.state = 38 + self.match(RelayParser.SEMVER) + self.state = 46 + self._errHandler.sync(self) + token = self._input.LA(1) + if token in [RelayParser.EOF, RelayParser.T__14]: + self.state = 42 + self._errHandler.sync(self) + _la = self._input.LA(1) + while _la==RelayParser.T__14: + self.state = 39 + self.defn() + self.state = 44 + self._errHandler.sync(self) + _la = self._input.LA(1) + + pass + elif token in [RelayParser.T__0, RelayParser.T__3, RelayParser.T__5, RelayParser.T__7, RelayParser.T__12, RelayParser.SUB, RelayParser.GLOBAL_VAR, RelayParser.LOCAL_VAR, RelayParser.GRAPH_VAR, RelayParser.BOOL_LIT, RelayParser.FLOAT, RelayParser.NAT, RelayParser.CNAME]: + self.state = 45 + self.expr(0) + pass + else: + raise NoViableAltException(self) + + self.state = 48 + self.match(RelayParser.EOF) + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + 
self.exitRule() + return localctx + + + class ExprContext(ParserRuleContext): + + def __init__(self, parser, parent=None, invokingState=-1): + super(RelayParser.ExprContext, self).__init__(parent, invokingState) + self.parser = parser + + + def getRuleIndex(self): + return RelayParser.RULE_expr + + + def copyFrom(self, ctx): + super(RelayParser.ExprContext, self).copyFrom(ctx) + + + class IdentExprContext(ExprContext): + + def __init__(self, parser, ctx): # actually a RelayParser.ExprContext) + super(RelayParser.IdentExprContext, self).__init__(parser) + self.copyFrom(ctx) + + def ident(self): + return self.getTypedRuleContext(RelayParser.IdentContext,0) + + + def accept(self, visitor): + if hasattr(visitor, "visitIdentExpr"): + return visitor.visitIdentExpr(self) + else: + return visitor.visitChildren(self) + + + class CallContext(ExprContext): + + def __init__(self, parser, ctx): # actually a RelayParser.ExprContext) + super(RelayParser.CallContext, self).__init__(parser) + self.copyFrom(ctx) + + def expr(self, i=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.ExprContext) + else: + return self.getTypedRuleContext(RelayParser.ExprContext,i) + + + def accept(self, visitor): + if hasattr(visitor, "visitCall"): + return visitor.visitCall(self) + else: + return visitor.visitChildren(self) + + + class NegContext(ExprContext): + + def __init__(self, parser, ctx): # actually a RelayParser.ExprContext) + super(RelayParser.NegContext, self).__init__(parser) + self.copyFrom(ctx) + + def SUB(self): + return self.getToken(RelayParser.SUB, 0) + def expr(self): + return self.getTypedRuleContext(RelayParser.ExprContext,0) + + + def accept(self, visitor): + if hasattr(visitor, "visitNeg"): + return visitor.visitNeg(self) + else: + return visitor.visitChildren(self) + + + class TupleContext(ExprContext): + + def __init__(self, parser, ctx): # actually a RelayParser.ExprContext) + super(RelayParser.TupleContext, self).__init__(parser) + self.copyFrom(ctx) + + 
def expr(self, i=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.ExprContext) + else: + return self.getTypedRuleContext(RelayParser.ExprContext,i) + + + def accept(self, visitor): + if hasattr(visitor, "visitTuple"): + return visitor.visitTuple(self) + else: + return visitor.visitChildren(self) + + + class ParensContext(ExprContext): + + def __init__(self, parser, ctx): # actually a RelayParser.ExprContext) + super(RelayParser.ParensContext, self).__init__(parser) + self.copyFrom(ctx) + + def expr(self): + return self.getTypedRuleContext(RelayParser.ExprContext,0) + + + def accept(self, visitor): + if hasattr(visitor, "visitParens"): + return visitor.visitParens(self) + else: + return visitor.visitChildren(self) + + + class FuncExprContext(ExprContext): + + def __init__(self, parser, ctx): # actually a RelayParser.ExprContext) + super(RelayParser.FuncExprContext, self).__init__(parser) + self.copyFrom(ctx) + + def func(self): + return self.getTypedRuleContext(RelayParser.FuncContext,0) + + + def accept(self, visitor): + if hasattr(visitor, "visitFuncExpr"): + return visitor.visitFuncExpr(self) + else: + return visitor.visitChildren(self) + + + class ScalarExprContext(ExprContext): + + def __init__(self, parser, ctx): # actually a RelayParser.ExprContext) + super(RelayParser.ScalarExprContext, self).__init__(parser) + self.copyFrom(ctx) + + def scalar(self): + return self.getTypedRuleContext(RelayParser.ScalarContext,0) + + + def accept(self, visitor): + if hasattr(visitor, "visitScalarExpr"): + return visitor.visitScalarExpr(self) + else: + return visitor.visitChildren(self) + + + class LetContext(ExprContext): + + def __init__(self, parser, ctx): # actually a RelayParser.ExprContext) + super(RelayParser.LetContext, self).__init__(parser) + self.copyFrom(ctx) + + def var(self): + return self.getTypedRuleContext(RelayParser.VarContext,0) + + def expr(self, i=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.ExprContext) + 
else: + return self.getTypedRuleContext(RelayParser.ExprContext,i) + + def MUT(self): + return self.getToken(RelayParser.MUT, 0) + + def accept(self, visitor): + if hasattr(visitor, "visitLet"): + return visitor.visitLet(self) + else: + return visitor.visitChildren(self) + + + class TensorContext(ExprContext): + + def __init__(self, parser, ctx): # actually a RelayParser.ExprContext) + super(RelayParser.TensorContext, self).__init__(parser) + self.copyFrom(ctx) + + def expr(self, i=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.ExprContext) + else: + return self.getTypedRuleContext(RelayParser.ExprContext,i) + + + def accept(self, visitor): + if hasattr(visitor, "visitTensor"): + return visitor.visitTensor(self) + else: + return visitor.visitChildren(self) + + + class IfElseContext(ExprContext): + + def __init__(self, parser, ctx): # actually a RelayParser.ExprContext) + super(RelayParser.IfElseContext, self).__init__(parser) + self.copyFrom(ctx) + + def expr(self): + return self.getTypedRuleContext(RelayParser.ExprContext,0) + + def body(self, i=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.BodyContext) + else: + return self.getTypedRuleContext(RelayParser.BodyContext,i) + + + def accept(self, visitor): + if hasattr(visitor, "visitIfElse"): + return visitor.visitIfElse(self) + else: + return visitor.visitChildren(self) + + + class GraphContext(ExprContext): + + def __init__(self, parser, ctx): # actually a RelayParser.ExprContext) + super(RelayParser.GraphContext, self).__init__(parser) + self.copyFrom(ctx) + + def ident(self): + return self.getTypedRuleContext(RelayParser.IdentContext,0) + + def expr(self, i=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.ExprContext) + else: + return self.getTypedRuleContext(RelayParser.ExprContext,i) + + + def accept(self, visitor): + if hasattr(visitor, "visitGraph"): + return visitor.visitGraph(self) + else: + return visitor.visitChildren(self) + + + class 
BinOpContext(ExprContext): + + def __init__(self, parser, ctx): # actually a RelayParser.ExprContext) + super(RelayParser.BinOpContext, self).__init__(parser) + self.op = None # Token + self.copyFrom(ctx) + + def expr(self, i=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.ExprContext) + else: + return self.getTypedRuleContext(RelayParser.ExprContext,i) + + def MUL(self): + return self.getToken(RelayParser.MUL, 0) + def DIV(self): + return self.getToken(RelayParser.DIV, 0) + def ADD(self): + return self.getToken(RelayParser.ADD, 0) + def SUB(self): + return self.getToken(RelayParser.SUB, 0) + def LT(self): + return self.getToken(RelayParser.LT, 0) + def GT(self): + return self.getToken(RelayParser.GT, 0) + def LE(self): + return self.getToken(RelayParser.LE, 0) + def GE(self): + return self.getToken(RelayParser.GE, 0) + def EQ(self): + return self.getToken(RelayParser.EQ, 0) + def NE(self): + return self.getToken(RelayParser.NE, 0) + + def accept(self, visitor): + if hasattr(visitor, "visitBinOp"): + return visitor.visitBinOp(self) + else: + return visitor.visitChildren(self) + + + + def expr(self, _p=0): + _parentctx = self._ctx + _parentState = self.state + localctx = RelayParser.ExprContext(self, self._ctx, _parentState) + _prevctx = localctx + _startState = 4 + self.enterRecursionRule(localctx, 4, self.RULE_expr, _p) + self._la = 0 # Token type + try: + self.enterOuterAlt(localctx, 1) + self.state = 125 + self._errHandler.sync(self) + la_ = self._interp.adaptivePredict(self._input,7,self._ctx) + if la_ == 1: + localctx = RelayParser.ParensContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + + self.state = 51 + self.match(RelayParser.T__0) + self.state = 52 + self.expr(0) + self.state = 53 + self.match(RelayParser.T__1) + pass + + elif la_ == 2: + localctx = RelayParser.NegContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 55 + self.match(RelayParser.SUB) + self.state = 56 + self.expr(17) + 
pass + + elif la_ == 3: + localctx = RelayParser.FuncExprContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 57 + self.func() + pass + + elif la_ == 4: + localctx = RelayParser.TupleContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 58 + self.match(RelayParser.T__0) + self.state = 59 + self.match(RelayParser.T__1) + pass + + elif la_ == 5: + localctx = RelayParser.TupleContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 60 + self.match(RelayParser.T__0) + self.state = 61 + self.expr(0) + self.state = 62 + self.match(RelayParser.T__2) + self.state = 63 + self.match(RelayParser.T__1) + pass + + elif la_ == 6: + localctx = RelayParser.TupleContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 65 + self.match(RelayParser.T__0) + self.state = 66 + self.expr(0) + self.state = 69 + self._errHandler.sync(self) + _la = self._input.LA(1) + while True: + self.state = 67 + self.match(RelayParser.T__2) + self.state = 68 + self.expr(0) + self.state = 71 + self._errHandler.sync(self) + _la = self._input.LA(1) + if not (_la==RelayParser.T__2): + break + + self.state = 73 + self.match(RelayParser.T__1) + pass + + elif la_ == 7: + localctx = RelayParser.TensorContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 75 + self.match(RelayParser.T__3) + self.state = 84 + self._errHandler.sync(self) + _la = self._input.LA(1) + if (((_la) & ~0x3f) == 0 and ((1 << _la) & ((1 << RelayParser.T__0) | (1 << RelayParser.T__3) | (1 << RelayParser.T__5) | (1 << RelayParser.T__7) | (1 << RelayParser.T__12) | (1 << RelayParser.SUB) | (1 << RelayParser.GLOBAL_VAR) | (1 << RelayParser.LOCAL_VAR) | (1 << RelayParser.GRAPH_VAR) | (1 << RelayParser.BOOL_LIT) | (1 << RelayParser.FLOAT) | (1 << RelayParser.NAT) | (1 << RelayParser.CNAME))) != 0): + self.state = 76 + self.expr(0) + self.state = 81 + self._errHandler.sync(self) + _la = self._input.LA(1) + 
while _la==RelayParser.T__2: + self.state = 77 + self.match(RelayParser.T__2) + self.state = 78 + self.expr(0) + self.state = 83 + self._errHandler.sync(self) + _la = self._input.LA(1) + + + + self.state = 86 + self.match(RelayParser.T__4) + pass + + elif la_ == 8: + localctx = RelayParser.IfElseContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 87 + self.match(RelayParser.T__5) + self.state = 88 + self.match(RelayParser.T__0) + self.state = 89 + self.expr(0) + self.state = 90 + self.match(RelayParser.T__1) + self.state = 91 + self.body() + self.state = 92 + self.match(RelayParser.T__6) + self.state = 93 + self.body() + pass + + elif la_ == 9: + localctx = RelayParser.LetContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 95 + self.match(RelayParser.T__7) + self.state = 97 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.MUT: + self.state = 96 + self.match(RelayParser.MUT) + + + self.state = 99 + self.var() + self.state = 100 + self.match(RelayParser.T__8) + self.state = 101 + self.expr(0) + self.state = 102 + self.match(RelayParser.T__9) + self.state = 103 + self.expr(6) + pass + + elif la_ == 10: + localctx = RelayParser.LetContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 105 + self.match(RelayParser.T__7) + self.state = 107 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.MUT: + self.state = 106 + self.match(RelayParser.MUT) + + + self.state = 109 + self.var() + self.state = 110 + self.match(RelayParser.T__8) + self.state = 111 + self.match(RelayParser.T__10) + self.state = 112 + self.expr(0) + self.state = 113 + self.match(RelayParser.T__11) + self.state = 114 + self.match(RelayParser.T__9) + self.state = 115 + self.expr(5) + pass + + elif la_ == 11: + localctx = RelayParser.GraphContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 117 + self.ident() + self.state = 118 + 
self.match(RelayParser.T__8) + self.state = 119 + self.expr(0) + self.state = 120 + self.match(RelayParser.T__9) + self.state = 121 + self.expr(3) + pass + + elif la_ == 12: + localctx = RelayParser.IdentExprContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 123 + self.ident() + pass + + elif la_ == 13: + localctx = RelayParser.ScalarExprContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 124 + self.scalar() + pass + + + self._ctx.stop = self._input.LT(-1) + self.state = 157 + self._errHandler.sync(self) + _alt = self._interp.adaptivePredict(self._input,11,self._ctx) + while _alt!=2 and _alt!=ATN.INVALID_ALT_NUMBER: + if _alt==1: + if self._parseListeners is not None: + self.triggerExitRuleEvent() + _prevctx = localctx + self.state = 155 + self._errHandler.sync(self) + la_ = self._interp.adaptivePredict(self._input,10,self._ctx) + if la_ == 1: + localctx = RelayParser.BinOpContext(self, RelayParser.ExprContext(self, _parentctx, _parentState)) + self.pushNewRecursionContext(localctx, _startState, self.RULE_expr) + self.state = 127 + if not self.precpred(self._ctx, 16): + from antlr4.error.Errors import FailedPredicateException + raise FailedPredicateException(self, "self.precpred(self._ctx, 16)") + self.state = 128 + localctx.op = self._input.LT(1) + _la = self._input.LA(1) + if not(_la==RelayParser.MUL or _la==RelayParser.DIV): + localctx.op = self._errHandler.recoverInline(self) + else: + self._errHandler.reportMatch(self) + self.consume() + self.state = 129 + self.expr(17) + pass + + elif la_ == 2: + localctx = RelayParser.BinOpContext(self, RelayParser.ExprContext(self, _parentctx, _parentState)) + self.pushNewRecursionContext(localctx, _startState, self.RULE_expr) + self.state = 130 + if not self.precpred(self._ctx, 15): + from antlr4.error.Errors import FailedPredicateException + raise FailedPredicateException(self, "self.precpred(self._ctx, 15)") + self.state = 131 + localctx.op = 
self._input.LT(1) + _la = self._input.LA(1) + if not(_la==RelayParser.ADD or _la==RelayParser.SUB): + localctx.op = self._errHandler.recoverInline(self) + else: + self._errHandler.reportMatch(self) + self.consume() + self.state = 132 + self.expr(16) + pass + + elif la_ == 3: + localctx = RelayParser.BinOpContext(self, RelayParser.ExprContext(self, _parentctx, _parentState)) + self.pushNewRecursionContext(localctx, _startState, self.RULE_expr) + self.state = 133 + if not self.precpred(self._ctx, 14): + from antlr4.error.Errors import FailedPredicateException + raise FailedPredicateException(self, "self.precpred(self._ctx, 14)") + self.state = 134 + localctx.op = self._input.LT(1) + _la = self._input.LA(1) + if not((((_la) & ~0x3f) == 0 and ((1 << _la) & ((1 << RelayParser.LT) | (1 << RelayParser.GT) | (1 << RelayParser.LE) | (1 << RelayParser.GE))) != 0)): + localctx.op = self._errHandler.recoverInline(self) + else: + self._errHandler.reportMatch(self) + self.consume() + self.state = 135 + self.expr(15) + pass + + elif la_ == 4: + localctx = RelayParser.BinOpContext(self, RelayParser.ExprContext(self, _parentctx, _parentState)) + self.pushNewRecursionContext(localctx, _startState, self.RULE_expr) + self.state = 136 + if not self.precpred(self._ctx, 13): + from antlr4.error.Errors import FailedPredicateException + raise FailedPredicateException(self, "self.precpred(self._ctx, 13)") + self.state = 137 + localctx.op = self._input.LT(1) + _la = self._input.LA(1) + if not(_la==RelayParser.EQ or _la==RelayParser.NE): + localctx.op = self._errHandler.recoverInline(self) + else: + self._errHandler.reportMatch(self) + self.consume() + self.state = 138 + self.expr(14) + pass + + elif la_ == 5: + localctx = RelayParser.LetContext(self, RelayParser.ExprContext(self, _parentctx, _parentState)) + self.pushNewRecursionContext(localctx, _startState, self.RULE_expr) + self.state = 139 + if not self.precpred(self._ctx, 4): + from antlr4.error.Errors import FailedPredicateException + 
raise FailedPredicateException(self, "self.precpred(self._ctx, 4)") + self.state = 140 + self.match(RelayParser.T__9) + self.state = 141 + self.expr(5) + pass + + elif la_ == 6: + localctx = RelayParser.CallContext(self, RelayParser.ExprContext(self, _parentctx, _parentState)) + self.pushNewRecursionContext(localctx, _startState, self.RULE_expr) + self.state = 142 + if not self.precpred(self._ctx, 18): + from antlr4.error.Errors import FailedPredicateException + raise FailedPredicateException(self, "self.precpred(self._ctx, 18)") + self.state = 143 + self.match(RelayParser.T__0) + self.state = 152 + self._errHandler.sync(self) + _la = self._input.LA(1) + if (((_la) & ~0x3f) == 0 and ((1 << _la) & ((1 << RelayParser.T__0) | (1 << RelayParser.T__3) | (1 << RelayParser.T__5) | (1 << RelayParser.T__7) | (1 << RelayParser.T__12) | (1 << RelayParser.SUB) | (1 << RelayParser.GLOBAL_VAR) | (1 << RelayParser.LOCAL_VAR) | (1 << RelayParser.GRAPH_VAR) | (1 << RelayParser.BOOL_LIT) | (1 << RelayParser.FLOAT) | (1 << RelayParser.NAT) | (1 << RelayParser.CNAME))) != 0): + self.state = 144 + self.expr(0) + self.state = 149 + self._errHandler.sync(self) + _la = self._input.LA(1) + while _la==RelayParser.T__2: + self.state = 145 + self.match(RelayParser.T__2) + self.state = 146 + self.expr(0) + self.state = 151 + self._errHandler.sync(self) + _la = self._input.LA(1) + + + + self.state = 154 + self.match(RelayParser.T__1) + pass + + + self.state = 159 + self._errHandler.sync(self) + _alt = self._interp.adaptivePredict(self._input,11,self._ctx) + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.unrollRecursionContexts(_parentctx) + return localctx + + + class FuncContext(ParserRuleContext): + + def __init__(self, parser, parent=None, invokingState=-1): + super(RelayParser.FuncContext, self).__init__(parent, invokingState) + self.parser = parser + + def argList(self): + return 
self.getTypedRuleContext(RelayParser.ArgListContext,0) + + + def body(self): + return self.getTypedRuleContext(RelayParser.BodyContext,0) + + + def typeParamSeq(self): + return self.getTypedRuleContext(RelayParser.TypeParamSeqContext,0) + + + def type_(self): + return self.getTypedRuleContext(RelayParser.Type_Context,0) + + + def getRuleIndex(self): + return RelayParser.RULE_func + + def accept(self, visitor): + if hasattr(visitor, "visitFunc"): + return visitor.visitFunc(self) + else: + return visitor.visitChildren(self) + + + + + def func(self): + + localctx = RelayParser.FuncContext(self, self._ctx, self.state) + self.enterRule(localctx, 6, self.RULE_func) + self._la = 0 # Token type + try: + self.enterOuterAlt(localctx, 1) + self.state = 160 + self.match(RelayParser.T__12) + self.state = 162 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.T__3: + self.state = 161 + self.typeParamSeq() + + + self.state = 164 + self.match(RelayParser.T__0) + self.state = 165 + self.argList() + self.state = 166 + self.match(RelayParser.T__1) + self.state = 169 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.T__13: + self.state = 167 + self.match(RelayParser.T__13) + self.state = 168 + self.type_() + + + self.state = 171 + self.body() + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class DefnContext(ParserRuleContext): + + def __init__(self, parser, parent=None, invokingState=-1): + super(RelayParser.DefnContext, self).__init__(parent, invokingState) + self.parser = parser + + def ident(self): + return self.getTypedRuleContext(RelayParser.IdentContext,0) + + + def argList(self): + return self.getTypedRuleContext(RelayParser.ArgListContext,0) + + + def body(self): + return self.getTypedRuleContext(RelayParser.BodyContext,0) + + + def typeParamSeq(self): + return 
self.getTypedRuleContext(RelayParser.TypeParamSeqContext,0) + + + def type_(self): + return self.getTypedRuleContext(RelayParser.Type_Context,0) + + + def getRuleIndex(self): + return RelayParser.RULE_defn + + def accept(self, visitor): + if hasattr(visitor, "visitDefn"): + return visitor.visitDefn(self) + else: + return visitor.visitChildren(self) + + + + + def defn(self): + + localctx = RelayParser.DefnContext(self, self._ctx, self.state) + self.enterRule(localctx, 8, self.RULE_defn) + self._la = 0 # Token type + try: + self.enterOuterAlt(localctx, 1) + self.state = 173 + self.match(RelayParser.T__14) + self.state = 174 + self.ident() + self.state = 176 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.T__3: + self.state = 175 + self.typeParamSeq() + + + self.state = 178 + self.match(RelayParser.T__0) + self.state = 179 + self.argList() + self.state = 180 + self.match(RelayParser.T__1) + self.state = 183 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.T__13: + self.state = 181 + self.match(RelayParser.T__13) + self.state = 182 + self.type_() + + + self.state = 185 + self.body() + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class ArgListContext(ParserRuleContext): + + def __init__(self, parser, parent=None, invokingState=-1): + super(RelayParser.ArgListContext, self).__init__(parent, invokingState) + self.parser = parser + + def varList(self): + return self.getTypedRuleContext(RelayParser.VarListContext,0) + + + def attrList(self): + return self.getTypedRuleContext(RelayParser.AttrListContext,0) + + + def getRuleIndex(self): + return RelayParser.RULE_argList + + def accept(self, visitor): + if hasattr(visitor, "visitArgList"): + return visitor.visitArgList(self) + else: + return visitor.visitChildren(self) + + + + + def argList(self): + + localctx = 
RelayParser.ArgListContext(self, self._ctx, self.state) + self.enterRule(localctx, 10, self.RULE_argList) + try: + self.state = 193 + self._errHandler.sync(self) + la_ = self._interp.adaptivePredict(self._input,16,self._ctx) + if la_ == 1: + self.enterOuterAlt(localctx, 1) + self.state = 187 + self.varList() + pass + + elif la_ == 2: + self.enterOuterAlt(localctx, 2) + self.state = 188 + self.attrList() + pass + + elif la_ == 3: + self.enterOuterAlt(localctx, 3) + self.state = 189 + self.varList() + self.state = 190 + self.match(RelayParser.T__2) + self.state = 191 + self.attrList() + pass + + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class VarListContext(ParserRuleContext): + + def __init__(self, parser, parent=None, invokingState=-1): + super(RelayParser.VarListContext, self).__init__(parent, invokingState) + self.parser = parser + + def var(self, i=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.VarContext) + else: + return self.getTypedRuleContext(RelayParser.VarContext,i) + + + def getRuleIndex(self): + return RelayParser.RULE_varList + + def accept(self, visitor): + if hasattr(visitor, "visitVarList"): + return visitor.visitVarList(self) + else: + return visitor.visitChildren(self) + + + + + def varList(self): + + localctx = RelayParser.VarListContext(self, self._ctx, self.state) + self.enterRule(localctx, 12, self.RULE_varList) + self._la = 0 # Token type + try: + self.enterOuterAlt(localctx, 1) + self.state = 203 + self._errHandler.sync(self) + _la = self._input.LA(1) + if (((_la) & ~0x3f) == 0 and ((1 << _la) & ((1 << RelayParser.GLOBAL_VAR) | (1 << RelayParser.LOCAL_VAR) | (1 << RelayParser.GRAPH_VAR) | (1 << RelayParser.CNAME))) != 0): + self.state = 195 + self.var() + self.state = 200 + self._errHandler.sync(self) + _alt = self._interp.adaptivePredict(self._input,17,self._ctx) + 
while _alt!=2 and _alt!=ATN.INVALID_ALT_NUMBER: + if _alt==1: + self.state = 196 + self.match(RelayParser.T__2) + self.state = 197 + self.var() + self.state = 202 + self._errHandler.sync(self) + _alt = self._interp.adaptivePredict(self._input,17,self._ctx) + + + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class VarContext(ParserRuleContext): + + def __init__(self, parser, parent=None, invokingState=-1): + super(RelayParser.VarContext, self).__init__(parent, invokingState) + self.parser = parser + + def ident(self): + return self.getTypedRuleContext(RelayParser.IdentContext,0) + + + def type_(self): + return self.getTypedRuleContext(RelayParser.Type_Context,0) + + + def getRuleIndex(self): + return RelayParser.RULE_var + + def accept(self, visitor): + if hasattr(visitor, "visitVar"): + return visitor.visitVar(self) + else: + return visitor.visitChildren(self) + + + + + def var(self): + + localctx = RelayParser.VarContext(self, self._ctx, self.state) + self.enterRule(localctx, 14, self.RULE_var) + self._la = 0 # Token type + try: + self.enterOuterAlt(localctx, 1) + self.state = 205 + self.ident() + self.state = 208 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.T__15: + self.state = 206 + self.match(RelayParser.T__15) + self.state = 207 + self.type_() + + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class AttrListContext(ParserRuleContext): + + def __init__(self, parser, parent=None, invokingState=-1): + super(RelayParser.AttrListContext, self).__init__(parent, invokingState) + self.parser = parser + + def attr(self, i=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.AttrContext) + else: + return 
self.getTypedRuleContext(RelayParser.AttrContext,i) + + + def getRuleIndex(self): + return RelayParser.RULE_attrList + + def accept(self, visitor): + if hasattr(visitor, "visitAttrList"): + return visitor.visitAttrList(self) + else: + return visitor.visitChildren(self) + + + + + def attrList(self): + + localctx = RelayParser.AttrListContext(self, self._ctx, self.state) + self.enterRule(localctx, 16, self.RULE_attrList) + self._la = 0 # Token type + try: + self.enterOuterAlt(localctx, 1) + self.state = 218 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.CNAME: + self.state = 210 + self.attr() + self.state = 215 + self._errHandler.sync(self) + _la = self._input.LA(1) + while _la==RelayParser.T__2: + self.state = 211 + self.match(RelayParser.T__2) + self.state = 212 + self.attr() + self.state = 217 + self._errHandler.sync(self) + _la = self._input.LA(1) + + + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class AttrContext(ParserRuleContext): + + def __init__(self, parser, parent=None, invokingState=-1): + super(RelayParser.AttrContext, self).__init__(parent, invokingState) + self.parser = parser + + def CNAME(self): + return self.getToken(RelayParser.CNAME, 0) + + def expr(self): + return self.getTypedRuleContext(RelayParser.ExprContext,0) + + + def getRuleIndex(self): + return RelayParser.RULE_attr + + def accept(self, visitor): + if hasattr(visitor, "visitAttr"): + return visitor.visitAttr(self) + else: + return visitor.visitChildren(self) + + + + + def attr(self): + + localctx = RelayParser.AttrContext(self, self._ctx, self.state) + self.enterRule(localctx, 18, self.RULE_attr) + try: + self.enterOuterAlt(localctx, 1) + self.state = 220 + self.match(RelayParser.CNAME) + self.state = 221 + self.match(RelayParser.T__8) + self.state = 222 + self.expr(0) + except RecognitionException as re: + 
localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class TypeParamSeqContext(ParserRuleContext): + + def __init__(self, parser, parent=None, invokingState=-1): + super(RelayParser.TypeParamSeqContext, self).__init__(parent, invokingState) + self.parser = parser + + def ident(self, i=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.IdentContext) + else: + return self.getTypedRuleContext(RelayParser.IdentContext,i) + + + def getRuleIndex(self): + return RelayParser.RULE_typeParamSeq + + def accept(self, visitor): + if hasattr(visitor, "visitTypeParamSeq"): + return visitor.visitTypeParamSeq(self) + else: + return visitor.visitChildren(self) + + + + + def typeParamSeq(self): + + localctx = RelayParser.TypeParamSeqContext(self, self._ctx, self.state) + self.enterRule(localctx, 20, self.RULE_typeParamSeq) + self._la = 0 # Token type + try: + self.state = 237 + self._errHandler.sync(self) + la_ = self._interp.adaptivePredict(self._input,23,self._ctx) + if la_ == 1: + self.enterOuterAlt(localctx, 1) + self.state = 224 + self.match(RelayParser.T__3) + self.state = 225 + self.match(RelayParser.T__4) + pass + + elif la_ == 2: + self.enterOuterAlt(localctx, 2) + self.state = 226 + self.match(RelayParser.T__3) + self.state = 227 + self.ident() + self.state = 232 + self._errHandler.sync(self) + _la = self._input.LA(1) + while _la==RelayParser.T__2: + self.state = 228 + self.match(RelayParser.T__2) + self.state = 229 + self.ident() + self.state = 234 + self._errHandler.sync(self) + _la = self._input.LA(1) + + self.state = 235 + self.match(RelayParser.T__4) + pass + + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class Type_Context(ParserRuleContext): + + def __init__(self, parser, parent=None, 
invokingState=-1): + super(RelayParser.Type_Context, self).__init__(parent, invokingState) + self.parser = parser + + + def getRuleIndex(self): + return RelayParser.RULE_type_ + + + def copyFrom(self, ctx): + super(RelayParser.Type_Context, self).copyFrom(ctx) + + + + class IntTypeContext(Type_Context): + + def __init__(self, parser, ctx): # actually a RelayParser.Type_Context) + super(RelayParser.IntTypeContext, self).__init__(parser) + self.copyFrom(ctx) + + def NAT(self): + return self.getToken(RelayParser.NAT, 0) + + def accept(self, visitor): + if hasattr(visitor, "visitIntType"): + return visitor.visitIntType(self) + else: + return visitor.visitChildren(self) + + + class TupleTypeContext(Type_Context): + + def __init__(self, parser, ctx): # actually a RelayParser.Type_Context) + super(RelayParser.TupleTypeContext, self).__init__(parser) + self.copyFrom(ctx) + + def type_(self, i=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.Type_Context) + else: + return self.getTypedRuleContext(RelayParser.Type_Context,i) + + + def accept(self, visitor): + if hasattr(visitor, "visitTupleType"): + return visitor.visitTupleType(self) + else: + return visitor.visitChildren(self) + + + class TypeIdentTypeContext(Type_Context): + + def __init__(self, parser, ctx): # actually a RelayParser.Type_Context) + super(RelayParser.TypeIdentTypeContext, self).__init__(parser) + self.copyFrom(ctx) + + def typeIdent(self): + return self.getTypedRuleContext(RelayParser.TypeIdentContext,0) + + + def accept(self, visitor): + if hasattr(visitor, "visitTypeIdentType"): + return visitor.visitTypeIdentType(self) + else: + return visitor.visitChildren(self) + + + class IncompleteTypeContext(Type_Context): + + def __init__(self, parser, ctx): # actually a RelayParser.Type_Context) + super(RelayParser.IncompleteTypeContext, self).__init__(parser) + self.copyFrom(ctx) + + + def accept(self, visitor): + if hasattr(visitor, "visitIncompleteType"): + return 
visitor.visitIncompleteType(self) + else: + return visitor.visitChildren(self) + + + class TensorTypeContext(Type_Context): + + def __init__(self, parser, ctx): # actually a RelayParser.Type_Context) + super(RelayParser.TensorTypeContext, self).__init__(parser) + self.copyFrom(ctx) + + def shapeSeq(self): + return self.getTypedRuleContext(RelayParser.ShapeSeqContext,0) + + def type_(self): + return self.getTypedRuleContext(RelayParser.Type_Context,0) + + + def accept(self, visitor): + if hasattr(visitor, "visitTensorType"): + return visitor.visitTensorType(self) + else: + return visitor.visitChildren(self) + + + class FuncTypeContext(Type_Context): + + def __init__(self, parser, ctx): # actually a RelayParser.Type_Context) + super(RelayParser.FuncTypeContext, self).__init__(parser) + self.copyFrom(ctx) + + def type_(self, i=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.Type_Context) + else: + return self.getTypedRuleContext(RelayParser.Type_Context,i) + + def typeParamSeq(self): + return self.getTypedRuleContext(RelayParser.TypeParamSeqContext,0) + + + def accept(self, visitor): + if hasattr(visitor, "visitFuncType"): + return visitor.visitFuncType(self) + else: + return visitor.visitChildren(self) + + + + def type_(self): + + localctx = RelayParser.Type_Context(self, self._ctx, self.state) + self.enterRule(localctx, 22, self.RULE_type_) + self._la = 0 # Token type + try: + self.state = 284 + self._errHandler.sync(self) + la_ = self._interp.adaptivePredict(self._input,28,self._ctx) + if la_ == 1: + localctx = RelayParser.TupleTypeContext(self, localctx) + self.enterOuterAlt(localctx, 1) + self.state = 239 + self.match(RelayParser.T__0) + self.state = 240 + self.match(RelayParser.T__1) + pass + + elif la_ == 2: + localctx = RelayParser.TupleTypeContext(self, localctx) + self.enterOuterAlt(localctx, 2) + self.state = 241 + self.match(RelayParser.T__0) + self.state = 242 + self.type_() + self.state = 243 + self.match(RelayParser.T__2) + 
self.state = 244 + self.match(RelayParser.T__1) + pass + + elif la_ == 3: + localctx = RelayParser.TupleTypeContext(self, localctx) + self.enterOuterAlt(localctx, 3) + self.state = 246 + self.match(RelayParser.T__0) + self.state = 247 + self.type_() + self.state = 250 + self._errHandler.sync(self) + _la = self._input.LA(1) + while True: + self.state = 248 + self.match(RelayParser.T__2) + self.state = 249 + self.type_() + self.state = 252 + self._errHandler.sync(self) + _la = self._input.LA(1) + if not (_la==RelayParser.T__2): + break + + self.state = 254 + self.match(RelayParser.T__1) + pass + + elif la_ == 4: + localctx = RelayParser.TypeIdentTypeContext(self, localctx) + self.enterOuterAlt(localctx, 4) + self.state = 256 + self.typeIdent() + pass + + elif la_ == 5: + localctx = RelayParser.TensorTypeContext(self, localctx) + self.enterOuterAlt(localctx, 5) + self.state = 257 + self.match(RelayParser.T__16) + self.state = 258 + self.match(RelayParser.T__3) + self.state = 259 + self.shapeSeq() + self.state = 260 + self.match(RelayParser.T__2) + self.state = 261 + self.type_() + self.state = 262 + self.match(RelayParser.T__4) + pass + + elif la_ == 6: + localctx = RelayParser.FuncTypeContext(self, localctx) + self.enterOuterAlt(localctx, 6) + self.state = 264 + self.match(RelayParser.T__12) + self.state = 266 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.T__3: + self.state = 265 + self.typeParamSeq() + + + self.state = 268 + self.match(RelayParser.T__0) + self.state = 277 + self._errHandler.sync(self) + _la = self._input.LA(1) + if (((_la) & ~0x3f) == 0 and ((1 << _la) & ((1 << RelayParser.T__0) | (1 << RelayParser.T__12) | (1 << RelayParser.T__16) | (1 << RelayParser.T__17) | (1 << RelayParser.NAT) | (1 << RelayParser.CNAME))) != 0): + self.state = 269 + self.type_() + self.state = 274 + self._errHandler.sync(self) + _la = self._input.LA(1) + while _la==RelayParser.T__2: + self.state = 270 + self.match(RelayParser.T__2) + self.state 
= 271 + self.type_() + self.state = 276 + self._errHandler.sync(self) + _la = self._input.LA(1) + + + + self.state = 279 + self.match(RelayParser.T__1) + self.state = 280 + self.match(RelayParser.T__13) + self.state = 281 + self.type_() + pass + + elif la_ == 7: + localctx = RelayParser.IncompleteTypeContext(self, localctx) + self.enterOuterAlt(localctx, 7) + self.state = 282 + self.match(RelayParser.T__17) + pass + + elif la_ == 8: + localctx = RelayParser.IntTypeContext(self, localctx) + self.enterOuterAlt(localctx, 8) + self.state = 283 + self.match(RelayParser.NAT) + pass + + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class ShapeSeqContext(ParserRuleContext): + + def __init__(self, parser, parent=None, invokingState=-1): + super(RelayParser.ShapeSeqContext, self).__init__(parent, invokingState) + self.parser = parser + + def shape(self, i=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.ShapeContext) + else: + return self.getTypedRuleContext(RelayParser.ShapeContext,i) + + + def getRuleIndex(self): + return RelayParser.RULE_shapeSeq + + def accept(self, visitor): + if hasattr(visitor, "visitShapeSeq"): + return visitor.visitShapeSeq(self) + else: + return visitor.visitChildren(self) + + + + + def shapeSeq(self): + + localctx = RelayParser.ShapeSeqContext(self, self._ctx, self.state) + self.enterRule(localctx, 24, self.RULE_shapeSeq) + self._la = 0 # Token type + try: + self.state = 303 + self._errHandler.sync(self) + la_ = self._interp.adaptivePredict(self._input,30,self._ctx) + if la_ == 1: + self.enterOuterAlt(localctx, 1) + self.state = 286 + self.match(RelayParser.T__0) + self.state = 287 + self.match(RelayParser.T__1) + pass + + elif la_ == 2: + self.enterOuterAlt(localctx, 2) + self.state = 288 + self.match(RelayParser.T__0) + self.state = 289 + self.shape() + self.state = 290 + 
self.match(RelayParser.T__2) + self.state = 291 + self.match(RelayParser.T__1) + pass + + elif la_ == 3: + self.enterOuterAlt(localctx, 3) + self.state = 293 + self.match(RelayParser.T__0) + self.state = 294 + self.shape() + self.state = 297 + self._errHandler.sync(self) + _la = self._input.LA(1) + while True: + self.state = 295 + self.match(RelayParser.T__2) + self.state = 296 + self.shape() + self.state = 299 + self._errHandler.sync(self) + _la = self._input.LA(1) + if not (_la==RelayParser.T__2): + break + + self.state = 301 + self.match(RelayParser.T__1) + pass + + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class ShapeContext(ParserRuleContext): + + def __init__(self, parser, parent=None, invokingState=-1): + super(RelayParser.ShapeContext, self).__init__(parent, invokingState) + self.parser = parser + + + def getRuleIndex(self): + return RelayParser.RULE_shape + + + def copyFrom(self, ctx): + super(RelayParser.ShapeContext, self).copyFrom(ctx) + + + + class ParensShapeContext(ShapeContext): + + def __init__(self, parser, ctx): # actually a RelayParser.ShapeContext) + super(RelayParser.ParensShapeContext, self).__init__(parser) + self.copyFrom(ctx) + + def shape(self): + return self.getTypedRuleContext(RelayParser.ShapeContext,0) + + + def accept(self, visitor): + if hasattr(visitor, "visitParensShape"): + return visitor.visitParensShape(self) + else: + return visitor.visitChildren(self) + + + class IntShapeContext(ShapeContext): + + def __init__(self, parser, ctx): # actually a RelayParser.ShapeContext) + super(RelayParser.IntShapeContext, self).__init__(parser) + self.copyFrom(ctx) + + def NAT(self): + return self.getToken(RelayParser.NAT, 0) + + def accept(self, visitor): + if hasattr(visitor, "visitIntShape"): + return visitor.visitIntShape(self) + else: + return visitor.visitChildren(self) + + + + def 
shape(self): + + localctx = RelayParser.ShapeContext(self, self._ctx, self.state) + self.enterRule(localctx, 26, self.RULE_shape) + try: + self.state = 310 + self._errHandler.sync(self) + token = self._input.LA(1) + if token in [RelayParser.T__0]: + localctx = RelayParser.ParensShapeContext(self, localctx) + self.enterOuterAlt(localctx, 1) + self.state = 305 + self.match(RelayParser.T__0) + self.state = 306 + self.shape() + self.state = 307 + self.match(RelayParser.T__1) + pass + elif token in [RelayParser.NAT]: + localctx = RelayParser.IntShapeContext(self, localctx) + self.enterOuterAlt(localctx, 2) + self.state = 309 + self.match(RelayParser.NAT) + pass + else: + raise NoViableAltException(self) + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class TypeIdentContext(ParserRuleContext): + + def __init__(self, parser, parent=None, invokingState=-1): + super(RelayParser.TypeIdentContext, self).__init__(parent, invokingState) + self.parser = parser + + def CNAME(self): + return self.getToken(RelayParser.CNAME, 0) + + def getRuleIndex(self): + return RelayParser.RULE_typeIdent + + def accept(self, visitor): + if hasattr(visitor, "visitTypeIdent"): + return visitor.visitTypeIdent(self) + else: + return visitor.visitChildren(self) + + + + + def typeIdent(self): + + localctx = RelayParser.TypeIdentContext(self, self._ctx, self.state) + self.enterRule(localctx, 28, self.RULE_typeIdent) + try: + self.enterOuterAlt(localctx, 1) + self.state = 312 + self.match(RelayParser.CNAME) + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class BodyContext(ParserRuleContext): + + def __init__(self, parser, parent=None, invokingState=-1): + super(RelayParser.BodyContext, self).__init__(parent, 
invokingState) + self.parser = parser + + def expr(self): + return self.getTypedRuleContext(RelayParser.ExprContext,0) + + + def getRuleIndex(self): + return RelayParser.RULE_body + + def accept(self, visitor): + if hasattr(visitor, "visitBody"): + return visitor.visitBody(self) + else: + return visitor.visitChildren(self) + + + + + def body(self): + + localctx = RelayParser.BodyContext(self, self._ctx, self.state) + self.enterRule(localctx, 30, self.RULE_body) + try: + self.enterOuterAlt(localctx, 1) + self.state = 314 + self.match(RelayParser.T__10) + self.state = 315 + self.expr(0) + self.state = 316 + self.match(RelayParser.T__11) + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class ScalarContext(ParserRuleContext): + + def __init__(self, parser, parent=None, invokingState=-1): + super(RelayParser.ScalarContext, self).__init__(parent, invokingState) + self.parser = parser + + + def getRuleIndex(self): + return RelayParser.RULE_scalar + + + def copyFrom(self, ctx): + super(RelayParser.ScalarContext, self).copyFrom(ctx) + + + + class ScalarFloatContext(ScalarContext): + + def __init__(self, parser, ctx): # actually a RelayParser.ScalarContext) + super(RelayParser.ScalarFloatContext, self).__init__(parser) + self.copyFrom(ctx) + + def FLOAT(self): + return self.getToken(RelayParser.FLOAT, 0) + + def accept(self, visitor): + if hasattr(visitor, "visitScalarFloat"): + return visitor.visitScalarFloat(self) + else: + return visitor.visitChildren(self) + + + class ScalarBoolContext(ScalarContext): + + def __init__(self, parser, ctx): # actually a RelayParser.ScalarContext) + super(RelayParser.ScalarBoolContext, self).__init__(parser) + self.copyFrom(ctx) + + def BOOL_LIT(self): + return self.getToken(RelayParser.BOOL_LIT, 0) + + def accept(self, visitor): + if hasattr(visitor, "visitScalarBool"): + return 
visitor.visitScalarBool(self) + else: + return visitor.visitChildren(self) + + + class ScalarIntContext(ScalarContext): + + def __init__(self, parser, ctx): # actually a RelayParser.ScalarContext) + super(RelayParser.ScalarIntContext, self).__init__(parser) + self.copyFrom(ctx) + + def NAT(self): + return self.getToken(RelayParser.NAT, 0) + + def accept(self, visitor): + if hasattr(visitor, "visitScalarInt"): + return visitor.visitScalarInt(self) + else: + return visitor.visitChildren(self) + + + + def scalar(self): + + localctx = RelayParser.ScalarContext(self, self._ctx, self.state) + self.enterRule(localctx, 32, self.RULE_scalar) + try: + self.state = 321 + self._errHandler.sync(self) + token = self._input.LA(1) + if token in [RelayParser.FLOAT]: + localctx = RelayParser.ScalarFloatContext(self, localctx) + self.enterOuterAlt(localctx, 1) + self.state = 318 + self.match(RelayParser.FLOAT) + pass + elif token in [RelayParser.NAT]: + localctx = RelayParser.ScalarIntContext(self, localctx) + self.enterOuterAlt(localctx, 2) + self.state = 319 + self.match(RelayParser.NAT) + pass + elif token in [RelayParser.BOOL_LIT]: + localctx = RelayParser.ScalarBoolContext(self, localctx) + self.enterOuterAlt(localctx, 3) + self.state = 320 + self.match(RelayParser.BOOL_LIT) + pass + else: + raise NoViableAltException(self) + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class IdentContext(ParserRuleContext): + + def __init__(self, parser, parent=None, invokingState=-1): + super(RelayParser.IdentContext, self).__init__(parent, invokingState) + self.parser = parser + + def opIdent(self): + return self.getTypedRuleContext(RelayParser.OpIdentContext,0) + + + def GLOBAL_VAR(self): + return self.getToken(RelayParser.GLOBAL_VAR, 0) + + def LOCAL_VAR(self): + return self.getToken(RelayParser.LOCAL_VAR, 0) + + def GRAPH_VAR(self): + return 
self.getToken(RelayParser.GRAPH_VAR, 0) + + def getRuleIndex(self): + return RelayParser.RULE_ident + + def accept(self, visitor): + if hasattr(visitor, "visitIdent"): + return visitor.visitIdent(self) + else: + return visitor.visitChildren(self) + + + + + def ident(self): + + localctx = RelayParser.IdentContext(self, self._ctx, self.state) + self.enterRule(localctx, 34, self.RULE_ident) + try: + self.state = 327 + self._errHandler.sync(self) + token = self._input.LA(1) + if token in [RelayParser.CNAME]: + self.enterOuterAlt(localctx, 1) + self.state = 323 + self.opIdent() + pass + elif token in [RelayParser.GLOBAL_VAR]: + self.enterOuterAlt(localctx, 2) + self.state = 324 + self.match(RelayParser.GLOBAL_VAR) + pass + elif token in [RelayParser.LOCAL_VAR]: + self.enterOuterAlt(localctx, 3) + self.state = 325 + self.match(RelayParser.LOCAL_VAR) + pass + elif token in [RelayParser.GRAPH_VAR]: + self.enterOuterAlt(localctx, 4) + self.state = 326 + self.match(RelayParser.GRAPH_VAR) + pass + else: + raise NoViableAltException(self) + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + + def sempred(self, localctx, ruleIndex, predIndex): + if self._predicates == None: + self._predicates = dict() + self._predicates[2] = self.expr_sempred + pred = self._predicates.get(ruleIndex, None) + if pred is None: + raise Exception("No predicate with index:" + str(ruleIndex)) + else: + return pred(localctx, predIndex) + + def expr_sempred(self, localctx, predIndex): + if predIndex == 0: + return self.precpred(self._ctx, 16) + + + if predIndex == 1: + return self.precpred(self._ctx, 15) + + + if predIndex == 2: + return self.precpred(self._ctx, 14) + + + if predIndex == 3: + return self.precpred(self._ctx, 13) + + + if predIndex == 4: + return self.precpred(self._ctx, 4) + + + if predIndex == 5: + return self.precpred(self._ctx, 18) + + + + + 
diff --git a/python/tvm/relay/grammar/py2/RelayVisitor.py b/python/tvm/relay/grammar/py2/RelayVisitor.py new file mode 100644 index 000000000000..eae67d8cff58 --- /dev/null +++ b/python/tvm/relay/grammar/py2/RelayVisitor.py @@ -0,0 +1,192 @@ +# Generated from /home/sslyu/tvm/python/tvm/relay/grammar/Relay.g4 by ANTLR 4.7.2 +from antlr4 import * + +# This class defines a complete generic visitor for a parse tree produced by RelayParser. + +class RelayVisitor(ParseTreeVisitor): + + # Visit a parse tree produced by RelayParser#opIdent. + def visitOpIdent(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#prog. + def visitProg(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#identExpr. + def visitIdentExpr(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#call. + def visitCall(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#neg. + def visitNeg(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#tuple. + def visitTuple(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#parens. + def visitParens(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#funcExpr. + def visitFuncExpr(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#scalarExpr. + def visitScalarExpr(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#let. + def visitLet(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#tensor. + def visitTensor(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#ifElse. + def visitIfElse(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#graph. 
+ def visitGraph(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#binOp. + def visitBinOp(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#func. + def visitFunc(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#defn. + def visitDefn(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#argList. + def visitArgList(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#varList. + def visitVarList(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#var. + def visitVar(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#attrList. + def visitAttrList(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#attr. + def visitAttr(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#typeParamSeq. + def visitTypeParamSeq(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#tupleType. + def visitTupleType(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#typeIdentType. + def visitTypeIdentType(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#tensorType. + def visitTensorType(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#funcType. + def visitFuncType(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#incompleteType. + def visitIncompleteType(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#intType. 
+ def visitIntType(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#shapeSeq. + def visitShapeSeq(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#parensShape. + def visitParensShape(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#intShape. + def visitIntShape(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#typeIdent. + def visitTypeIdent(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#body. + def visitBody(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#scalarFloat. + def visitScalarFloat(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#scalarInt. + def visitScalarInt(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#scalarBool. + def visitScalarBool(self, ctx): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#ident. 
+ def visitIdent(self, ctx): + return self.visitChildren(ctx) + + diff --git a/python/tvm/relay/grammar/py3/.gitattributes b/python/tvm/relay/grammar/py3/.gitattributes new file mode 100644 index 000000000000..4adf65fa2f3c --- /dev/null +++ b/python/tvm/relay/grammar/py3/.gitattributes @@ -0,0 +1,3 @@ +Relay* binary +Relay* linguist-generated=true +Relay* linguist-detectable=false \ No newline at end of file diff --git a/python/tvm/relay/grammar/py3/.gitignore b/python/tvm/relay/grammar/py3/.gitignore deleted file mode 100644 index d677ff551940..000000000000 --- a/python/tvm/relay/grammar/py3/.gitignore +++ /dev/null @@ -1 +0,0 @@ -Relay* diff --git a/python/tvm/relay/grammar/py3/Relay.interp b/python/tvm/relay/grammar/py3/Relay.interp new file mode 100644 index 000000000000..c6893d096168 --- /dev/null +++ b/python/tvm/relay/grammar/py3/Relay.interp @@ -0,0 +1,109 @@ +token literal names: +null +'(' +')' +',' +'[' +']' +'if' +'else' +'let' +'=' +';' +'{' +'}' +'fn' +'->' +'def' +':' +'Tensor' +'_' +'v0.0.2' +null +null +null +'*' +'/' +'+' +'-' +'<' +'>' +'<=' +'>=' +'==' +'!=' +null +null +null +'mut' +null +null +null +null + +token symbolic names: +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +null +SEMVER +WS +LINE_COMMENT +COMMENT +MUL +DIV +ADD +SUB +LT +GT +LE +GE +EQ +NE +GLOBAL_VAR +LOCAL_VAR +GRAPH_VAR +MUT +BOOL_LIT +FLOAT +NAT +CNAME + +rule names: +opIdent +prog +expr +func +defn +argList +varList +var +attrList +attr +typeParamSeq +type_ +shapeSeq +shape +typeIdent +body +scalar +ident + + +atn: +[3, 24715, 42794, 33075, 47597, 16764, 15335, 30598, 22884, 3, 42, 332, 4, 2, 9, 2, 4, 3, 9, 3, 4, 4, 9, 4, 4, 5, 9, 5, 4, 6, 9, 6, 4, 7, 9, 7, 4, 8, 9, 8, 4, 9, 9, 9, 4, 10, 9, 10, 4, 11, 9, 11, 4, 12, 9, 12, 4, 13, 9, 13, 4, 14, 9, 14, 4, 15, 9, 15, 4, 16, 9, 16, 4, 17, 9, 17, 4, 18, 9, 18, 4, 19, 9, 19, 3, 2, 3, 2, 3, 3, 3, 3, 7, 3, 43, 10, 3, 12, 3, 14, 3, 46, 11, 3, 3, 3, 5, 3, 49, 10, 3, 3, 
3, 3, 3, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 6, 4, 72, 10, 4, 13, 4, 14, 4, 73, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 7, 4, 82, 10, 4, 12, 4, 14, 4, 85, 11, 4, 5, 4, 87, 10, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 5, 4, 100, 10, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 5, 4, 110, 10, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 5, 4, 128, 10, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 7, 4, 150, 10, 4, 12, 4, 14, 4, 153, 11, 4, 5, 4, 155, 10, 4, 3, 4, 7, 4, 158, 10, 4, 12, 4, 14, 4, 161, 11, 4, 3, 5, 3, 5, 5, 5, 165, 10, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 5, 5, 172, 10, 5, 3, 5, 3, 5, 3, 6, 3, 6, 3, 6, 5, 6, 179, 10, 6, 3, 6, 3, 6, 3, 6, 3, 6, 3, 6, 5, 6, 186, 10, 6, 3, 6, 3, 6, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 5, 7, 196, 10, 7, 3, 8, 3, 8, 3, 8, 7, 8, 201, 10, 8, 12, 8, 14, 8, 204, 11, 8, 5, 8, 206, 10, 8, 3, 9, 3, 9, 3, 9, 5, 9, 211, 10, 9, 3, 10, 3, 10, 3, 10, 7, 10, 216, 10, 10, 12, 10, 14, 10, 219, 11, 10, 5, 10, 221, 10, 10, 3, 11, 3, 11, 3, 11, 3, 11, 3, 12, 3, 12, 3, 12, 3, 12, 3, 12, 3, 12, 7, 12, 233, 10, 12, 12, 12, 14, 12, 236, 11, 12, 3, 12, 3, 12, 5, 12, 240, 10, 12, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 6, 13, 253, 10, 13, 13, 13, 14, 13, 254, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 5, 13, 269, 10, 13, 3, 13, 3, 13, 3, 13, 3, 13, 7, 13, 275, 10, 13, 12, 13, 14, 13, 278, 11, 13, 5, 13, 280, 10, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 5, 13, 287, 10, 13, 3, 14, 3, 14, 3, 14, 3, 14, 3, 14, 3, 14, 3, 14, 3, 14, 3, 14, 3, 14, 3, 14, 6, 14, 300, 10, 14, 13, 14, 14, 14, 301, 3, 14, 3, 14, 5, 14, 306, 10, 14, 3, 15, 3, 15, 3, 15, 3, 15, 3, 15, 5, 15, 313, 10, 15, 3, 16, 3, 16, 3, 17, 3, 17, 3, 17, 3, 17, 3, 18, 3, 18, 3, 18, 5, 18, 
324, 10, 18, 3, 19, 3, 19, 3, 19, 3, 19, 5, 19, 330, 10, 19, 3, 19, 2, 3, 6, 20, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 2, 6, 3, 2, 25, 26, 3, 2, 27, 28, 3, 2, 29, 32, 3, 2, 33, 34, 2, 373, 2, 38, 3, 2, 2, 2, 4, 40, 3, 2, 2, 2, 6, 127, 3, 2, 2, 2, 8, 162, 3, 2, 2, 2, 10, 175, 3, 2, 2, 2, 12, 195, 3, 2, 2, 2, 14, 205, 3, 2, 2, 2, 16, 207, 3, 2, 2, 2, 18, 220, 3, 2, 2, 2, 20, 222, 3, 2, 2, 2, 22, 239, 3, 2, 2, 2, 24, 286, 3, 2, 2, 2, 26, 305, 3, 2, 2, 2, 28, 312, 3, 2, 2, 2, 30, 314, 3, 2, 2, 2, 32, 316, 3, 2, 2, 2, 34, 323, 3, 2, 2, 2, 36, 329, 3, 2, 2, 2, 38, 39, 7, 42, 2, 2, 39, 3, 3, 2, 2, 2, 40, 48, 7, 21, 2, 2, 41, 43, 5, 10, 6, 2, 42, 41, 3, 2, 2, 2, 43, 46, 3, 2, 2, 2, 44, 42, 3, 2, 2, 2, 44, 45, 3, 2, 2, 2, 45, 49, 3, 2, 2, 2, 46, 44, 3, 2, 2, 2, 47, 49, 5, 6, 4, 2, 48, 44, 3, 2, 2, 2, 48, 47, 3, 2, 2, 2, 49, 50, 3, 2, 2, 2, 50, 51, 7, 2, 2, 3, 51, 5, 3, 2, 2, 2, 52, 53, 8, 4, 1, 2, 53, 54, 7, 3, 2, 2, 54, 55, 5, 6, 4, 2, 55, 56, 7, 4, 2, 2, 56, 128, 3, 2, 2, 2, 57, 58, 7, 28, 2, 2, 58, 128, 5, 6, 4, 19, 59, 128, 5, 8, 5, 2, 60, 61, 7, 3, 2, 2, 61, 128, 7, 4, 2, 2, 62, 63, 7, 3, 2, 2, 63, 64, 5, 6, 4, 2, 64, 65, 7, 5, 2, 2, 65, 66, 7, 4, 2, 2, 66, 128, 3, 2, 2, 2, 67, 68, 7, 3, 2, 2, 68, 71, 5, 6, 4, 2, 69, 70, 7, 5, 2, 2, 70, 72, 5, 6, 4, 2, 71, 69, 3, 2, 2, 2, 72, 73, 3, 2, 2, 2, 73, 71, 3, 2, 2, 2, 73, 74, 3, 2, 2, 2, 74, 75, 3, 2, 2, 2, 75, 76, 7, 4, 2, 2, 76, 128, 3, 2, 2, 2, 77, 86, 7, 6, 2, 2, 78, 83, 5, 6, 4, 2, 79, 80, 7, 5, 2, 2, 80, 82, 5, 6, 4, 2, 81, 79, 3, 2, 2, 2, 82, 85, 3, 2, 2, 2, 83, 81, 3, 2, 2, 2, 83, 84, 3, 2, 2, 2, 84, 87, 3, 2, 2, 2, 85, 83, 3, 2, 2, 2, 86, 78, 3, 2, 2, 2, 86, 87, 3, 2, 2, 2, 87, 88, 3, 2, 2, 2, 88, 128, 7, 7, 2, 2, 89, 90, 7, 8, 2, 2, 90, 91, 7, 3, 2, 2, 91, 92, 5, 6, 4, 2, 92, 93, 7, 4, 2, 2, 93, 94, 5, 32, 17, 2, 94, 95, 7, 9, 2, 2, 95, 96, 5, 32, 17, 2, 96, 128, 3, 2, 2, 2, 97, 99, 7, 10, 2, 2, 98, 100, 7, 38, 2, 2, 99, 98, 3, 2, 2, 2, 99, 100, 3, 2, 2, 2, 100, 101, 3, 2, 2, 2, 101, 
102, 5, 16, 9, 2, 102, 103, 7, 11, 2, 2, 103, 104, 5, 6, 4, 2, 104, 105, 7, 12, 2, 2, 105, 106, 5, 6, 4, 8, 106, 128, 3, 2, 2, 2, 107, 109, 7, 10, 2, 2, 108, 110, 7, 38, 2, 2, 109, 108, 3, 2, 2, 2, 109, 110, 3, 2, 2, 2, 110, 111, 3, 2, 2, 2, 111, 112, 5, 16, 9, 2, 112, 113, 7, 11, 2, 2, 113, 114, 7, 13, 2, 2, 114, 115, 5, 6, 4, 2, 115, 116, 7, 14, 2, 2, 116, 117, 7, 12, 2, 2, 117, 118, 5, 6, 4, 7, 118, 128, 3, 2, 2, 2, 119, 120, 5, 36, 19, 2, 120, 121, 7, 11, 2, 2, 121, 122, 5, 6, 4, 2, 122, 123, 7, 12, 2, 2, 123, 124, 5, 6, 4, 5, 124, 128, 3, 2, 2, 2, 125, 128, 5, 36, 19, 2, 126, 128, 5, 34, 18, 2, 127, 52, 3, 2, 2, 2, 127, 57, 3, 2, 2, 2, 127, 59, 3, 2, 2, 2, 127, 60, 3, 2, 2, 2, 127, 62, 3, 2, 2, 2, 127, 67, 3, 2, 2, 2, 127, 77, 3, 2, 2, 2, 127, 89, 3, 2, 2, 2, 127, 97, 3, 2, 2, 2, 127, 107, 3, 2, 2, 2, 127, 119, 3, 2, 2, 2, 127, 125, 3, 2, 2, 2, 127, 126, 3, 2, 2, 2, 128, 159, 3, 2, 2, 2, 129, 130, 12, 18, 2, 2, 130, 131, 9, 2, 2, 2, 131, 158, 5, 6, 4, 19, 132, 133, 12, 17, 2, 2, 133, 134, 9, 3, 2, 2, 134, 158, 5, 6, 4, 18, 135, 136, 12, 16, 2, 2, 136, 137, 9, 4, 2, 2, 137, 158, 5, 6, 4, 17, 138, 139, 12, 15, 2, 2, 139, 140, 9, 5, 2, 2, 140, 158, 5, 6, 4, 16, 141, 142, 12, 6, 2, 2, 142, 143, 7, 12, 2, 2, 143, 158, 5, 6, 4, 7, 144, 145, 12, 20, 2, 2, 145, 154, 7, 3, 2, 2, 146, 151, 5, 6, 4, 2, 147, 148, 7, 5, 2, 2, 148, 150, 5, 6, 4, 2, 149, 147, 3, 2, 2, 2, 150, 153, 3, 2, 2, 2, 151, 149, 3, 2, 2, 2, 151, 152, 3, 2, 2, 2, 152, 155, 3, 2, 2, 2, 153, 151, 3, 2, 2, 2, 154, 146, 3, 2, 2, 2, 154, 155, 3, 2, 2, 2, 155, 156, 3, 2, 2, 2, 156, 158, 7, 4, 2, 2, 157, 129, 3, 2, 2, 2, 157, 132, 3, 2, 2, 2, 157, 135, 3, 2, 2, 2, 157, 138, 3, 2, 2, 2, 157, 141, 3, 2, 2, 2, 157, 144, 3, 2, 2, 2, 158, 161, 3, 2, 2, 2, 159, 157, 3, 2, 2, 2, 159, 160, 3, 2, 2, 2, 160, 7, 3, 2, 2, 2, 161, 159, 3, 2, 2, 2, 162, 164, 7, 15, 2, 2, 163, 165, 5, 22, 12, 2, 164, 163, 3, 2, 2, 2, 164, 165, 3, 2, 2, 2, 165, 166, 3, 2, 2, 2, 166, 167, 7, 3, 2, 2, 167, 168, 5, 12, 7, 2, 168, 171, 7, 4, 2, 
2, 169, 170, 7, 16, 2, 2, 170, 172, 5, 24, 13, 2, 171, 169, 3, 2, 2, 2, 171, 172, 3, 2, 2, 2, 172, 173, 3, 2, 2, 2, 173, 174, 5, 32, 17, 2, 174, 9, 3, 2, 2, 2, 175, 176, 7, 17, 2, 2, 176, 178, 5, 36, 19, 2, 177, 179, 5, 22, 12, 2, 178, 177, 3, 2, 2, 2, 178, 179, 3, 2, 2, 2, 179, 180, 3, 2, 2, 2, 180, 181, 7, 3, 2, 2, 181, 182, 5, 12, 7, 2, 182, 185, 7, 4, 2, 2, 183, 184, 7, 16, 2, 2, 184, 186, 5, 24, 13, 2, 185, 183, 3, 2, 2, 2, 185, 186, 3, 2, 2, 2, 186, 187, 3, 2, 2, 2, 187, 188, 5, 32, 17, 2, 188, 11, 3, 2, 2, 2, 189, 196, 5, 14, 8, 2, 190, 196, 5, 18, 10, 2, 191, 192, 5, 14, 8, 2, 192, 193, 7, 5, 2, 2, 193, 194, 5, 18, 10, 2, 194, 196, 3, 2, 2, 2, 195, 189, 3, 2, 2, 2, 195, 190, 3, 2, 2, 2, 195, 191, 3, 2, 2, 2, 196, 13, 3, 2, 2, 2, 197, 202, 5, 16, 9, 2, 198, 199, 7, 5, 2, 2, 199, 201, 5, 16, 9, 2, 200, 198, 3, 2, 2, 2, 201, 204, 3, 2, 2, 2, 202, 200, 3, 2, 2, 2, 202, 203, 3, 2, 2, 2, 203, 206, 3, 2, 2, 2, 204, 202, 3, 2, 2, 2, 205, 197, 3, 2, 2, 2, 205, 206, 3, 2, 2, 2, 206, 15, 3, 2, 2, 2, 207, 210, 5, 36, 19, 2, 208, 209, 7, 18, 2, 2, 209, 211, 5, 24, 13, 2, 210, 208, 3, 2, 2, 2, 210, 211, 3, 2, 2, 2, 211, 17, 3, 2, 2, 2, 212, 217, 5, 20, 11, 2, 213, 214, 7, 5, 2, 2, 214, 216, 5, 20, 11, 2, 215, 213, 3, 2, 2, 2, 216, 219, 3, 2, 2, 2, 217, 215, 3, 2, 2, 2, 217, 218, 3, 2, 2, 2, 218, 221, 3, 2, 2, 2, 219, 217, 3, 2, 2, 2, 220, 212, 3, 2, 2, 2, 220, 221, 3, 2, 2, 2, 221, 19, 3, 2, 2, 2, 222, 223, 7, 42, 2, 2, 223, 224, 7, 11, 2, 2, 224, 225, 5, 6, 4, 2, 225, 21, 3, 2, 2, 2, 226, 227, 7, 6, 2, 2, 227, 240, 7, 7, 2, 2, 228, 229, 7, 6, 2, 2, 229, 234, 5, 36, 19, 2, 230, 231, 7, 5, 2, 2, 231, 233, 5, 36, 19, 2, 232, 230, 3, 2, 2, 2, 233, 236, 3, 2, 2, 2, 234, 232, 3, 2, 2, 2, 234, 235, 3, 2, 2, 2, 235, 237, 3, 2, 2, 2, 236, 234, 3, 2, 2, 2, 237, 238, 7, 7, 2, 2, 238, 240, 3, 2, 2, 2, 239, 226, 3, 2, 2, 2, 239, 228, 3, 2, 2, 2, 240, 23, 3, 2, 2, 2, 241, 242, 7, 3, 2, 2, 242, 287, 7, 4, 2, 2, 243, 244, 7, 3, 2, 2, 244, 245, 5, 24, 13, 2, 245, 246, 7, 5, 2, 2, 246, 
247, 7, 4, 2, 2, 247, 287, 3, 2, 2, 2, 248, 249, 7, 3, 2, 2, 249, 252, 5, 24, 13, 2, 250, 251, 7, 5, 2, 2, 251, 253, 5, 24, 13, 2, 252, 250, 3, 2, 2, 2, 253, 254, 3, 2, 2, 2, 254, 252, 3, 2, 2, 2, 254, 255, 3, 2, 2, 2, 255, 256, 3, 2, 2, 2, 256, 257, 7, 4, 2, 2, 257, 287, 3, 2, 2, 2, 258, 287, 5, 30, 16, 2, 259, 260, 7, 19, 2, 2, 260, 261, 7, 6, 2, 2, 261, 262, 5, 26, 14, 2, 262, 263, 7, 5, 2, 2, 263, 264, 5, 24, 13, 2, 264, 265, 7, 7, 2, 2, 265, 287, 3, 2, 2, 2, 266, 268, 7, 15, 2, 2, 267, 269, 5, 22, 12, 2, 268, 267, 3, 2, 2, 2, 268, 269, 3, 2, 2, 2, 269, 270, 3, 2, 2, 2, 270, 279, 7, 3, 2, 2, 271, 276, 5, 24, 13, 2, 272, 273, 7, 5, 2, 2, 273, 275, 5, 24, 13, 2, 274, 272, 3, 2, 2, 2, 275, 278, 3, 2, 2, 2, 276, 274, 3, 2, 2, 2, 276, 277, 3, 2, 2, 2, 277, 280, 3, 2, 2, 2, 278, 276, 3, 2, 2, 2, 279, 271, 3, 2, 2, 2, 279, 280, 3, 2, 2, 2, 280, 281, 3, 2, 2, 2, 281, 282, 7, 4, 2, 2, 282, 283, 7, 16, 2, 2, 283, 287, 5, 24, 13, 2, 284, 287, 7, 20, 2, 2, 285, 287, 7, 41, 2, 2, 286, 241, 3, 2, 2, 2, 286, 243, 3, 2, 2, 2, 286, 248, 3, 2, 2, 2, 286, 258, 3, 2, 2, 2, 286, 259, 3, 2, 2, 2, 286, 266, 3, 2, 2, 2, 286, 284, 3, 2, 2, 2, 286, 285, 3, 2, 2, 2, 287, 25, 3, 2, 2, 2, 288, 289, 7, 3, 2, 2, 289, 306, 7, 4, 2, 2, 290, 291, 7, 3, 2, 2, 291, 292, 5, 28, 15, 2, 292, 293, 7, 5, 2, 2, 293, 294, 7, 4, 2, 2, 294, 306, 3, 2, 2, 2, 295, 296, 7, 3, 2, 2, 296, 299, 5, 28, 15, 2, 297, 298, 7, 5, 2, 2, 298, 300, 5, 28, 15, 2, 299, 297, 3, 2, 2, 2, 300, 301, 3, 2, 2, 2, 301, 299, 3, 2, 2, 2, 301, 302, 3, 2, 2, 2, 302, 303, 3, 2, 2, 2, 303, 304, 7, 4, 2, 2, 304, 306, 3, 2, 2, 2, 305, 288, 3, 2, 2, 2, 305, 290, 3, 2, 2, 2, 305, 295, 3, 2, 2, 2, 306, 27, 3, 2, 2, 2, 307, 308, 7, 3, 2, 2, 308, 309, 5, 28, 15, 2, 309, 310, 7, 4, 2, 2, 310, 313, 3, 2, 2, 2, 311, 313, 7, 41, 2, 2, 312, 307, 3, 2, 2, 2, 312, 311, 3, 2, 2, 2, 313, 29, 3, 2, 2, 2, 314, 315, 7, 42, 2, 2, 315, 31, 3, 2, 2, 2, 316, 317, 7, 13, 2, 2, 317, 318, 5, 6, 4, 2, 318, 319, 7, 14, 2, 2, 319, 33, 3, 2, 2, 2, 320, 324, 7, 40, 
2, 2, 321, 324, 7, 41, 2, 2, 322, 324, 7, 39, 2, 2, 323, 320, 3, 2, 2, 2, 323, 321, 3, 2, 2, 2, 323, 322, 3, 2, 2, 2, 324, 35, 3, 2, 2, 2, 325, 330, 5, 2, 2, 2, 326, 330, 7, 35, 2, 2, 327, 330, 7, 36, 2, 2, 328, 330, 7, 37, 2, 2, 329, 325, 3, 2, 2, 2, 329, 326, 3, 2, 2, 2, 329, 327, 3, 2, 2, 2, 329, 328, 3, 2, 2, 2, 330, 37, 3, 2, 2, 2, 36, 44, 48, 73, 83, 86, 99, 109, 127, 151, 154, 157, 159, 164, 171, 178, 185, 195, 202, 205, 210, 217, 220, 234, 239, 254, 268, 276, 279, 286, 301, 305, 312, 323, 329] \ No newline at end of file diff --git a/python/tvm/relay/grammar/py3/Relay.tokens b/python/tvm/relay/grammar/py3/Relay.tokens new file mode 100644 index 000000000000..41f3ee62a86c --- /dev/null +++ b/python/tvm/relay/grammar/py3/Relay.tokens @@ -0,0 +1,70 @@ +T__0=1 +T__1=2 +T__2=3 +T__3=4 +T__4=5 +T__5=6 +T__6=7 +T__7=8 +T__8=9 +T__9=10 +T__10=11 +T__11=12 +T__12=13 +T__13=14 +T__14=15 +T__15=16 +T__16=17 +T__17=18 +SEMVER=19 +WS=20 +LINE_COMMENT=21 +COMMENT=22 +MUL=23 +DIV=24 +ADD=25 +SUB=26 +LT=27 +GT=28 +LE=29 +GE=30 +EQ=31 +NE=32 +GLOBAL_VAR=33 +LOCAL_VAR=34 +GRAPH_VAR=35 +MUT=36 +BOOL_LIT=37 +FLOAT=38 +NAT=39 +CNAME=40 +'('=1 +')'=2 +','=3 +'['=4 +']'=5 +'if'=6 +'else'=7 +'let'=8 +'='=9 +';'=10 +'{'=11 +'}'=12 +'fn'=13 +'->'=14 +'def'=15 +':'=16 +'Tensor'=17 +'_'=18 +'v0.0.2'=19 +'*'=23 +'/'=24 +'+'=25 +'-'=26 +'<'=27 +'>'=28 +'<='=29 +'>='=30 +'=='=31 +'!='=32 +'mut'=36 diff --git a/python/tvm/relay/grammar/py3/RelayLexer.interp b/python/tvm/relay/grammar/py3/RelayLexer.interp new file mode 100644 index 000000000000..092b3589ab70 --- /dev/null +++ b/python/tvm/relay/grammar/py3/RelayLexer.interp @@ -0,0 +1,140 @@ +token literal names: +null +'(' +')' +',' +'[' +']' +'if' +'else' +'let' +'=' +';' +'{' +'}' +'fn' +'->' +'def' +':' +'Tensor' +'_' +'v0.0.2' +null +null +null +'*' +'/' +'+' +'-' +'<' +'>' +'<=' +'>=' +'==' +'!=' +null +null +null +'mut' +null +null +null +null + +token symbolic names: +null +null +null +null +null +null +null +null +null +null 
+null +null +null +null +null +null +null +null +null +SEMVER +WS +LINE_COMMENT +COMMENT +MUL +DIV +ADD +SUB +LT +GT +LE +GE +EQ +NE +GLOBAL_VAR +LOCAL_VAR +GRAPH_VAR +MUT +BOOL_LIT +FLOAT +NAT +CNAME + +rule names: +T__0 +T__1 +T__2 +T__3 +T__4 +T__5 +T__6 +T__7 +T__8 +T__9 +T__10 +T__11 +T__12 +T__13 +T__14 +T__15 +T__16 +T__17 +SEMVER +WS +LINE_COMMENT +COMMENT +MUL +DIV +ADD +SUB +LT +GT +LE +GE +EQ +NE +GLOBAL_VAR +LOCAL_VAR +GRAPH_VAR +MUT +BOOL_LIT +FLOAT +NAT +EXP +CNAME +LETTER +DIGIT + +channel names: +DEFAULT_TOKEN_CHANNEL +HIDDEN + +mode names: +DEFAULT_MODE + +atn: +[3, 24715, 42794, 33075, 47597, 16764, 15335, 30598, 22884, 2, 42, 267, 8, 1, 4, 2, 9, 2, 4, 3, 9, 3, 4, 4, 9, 4, 4, 5, 9, 5, 4, 6, 9, 6, 4, 7, 9, 7, 4, 8, 9, 8, 4, 9, 9, 9, 4, 10, 9, 10, 4, 11, 9, 11, 4, 12, 9, 12, 4, 13, 9, 13, 4, 14, 9, 14, 4, 15, 9, 15, 4, 16, 9, 16, 4, 17, 9, 17, 4, 18, 9, 18, 4, 19, 9, 19, 4, 20, 9, 20, 4, 21, 9, 21, 4, 22, 9, 22, 4, 23, 9, 23, 4, 24, 9, 24, 4, 25, 9, 25, 4, 26, 9, 26, 4, 27, 9, 27, 4, 28, 9, 28, 4, 29, 9, 29, 4, 30, 9, 30, 4, 31, 9, 31, 4, 32, 9, 32, 4, 33, 9, 33, 4, 34, 9, 34, 4, 35, 9, 35, 4, 36, 9, 36, 4, 37, 9, 37, 4, 38, 9, 38, 4, 39, 9, 39, 4, 40, 9, 40, 4, 41, 9, 41, 4, 42, 9, 42, 4, 43, 9, 43, 4, 44, 9, 44, 3, 2, 3, 2, 3, 3, 3, 3, 3, 4, 3, 4, 3, 5, 3, 5, 3, 6, 3, 6, 3, 7, 3, 7, 3, 7, 3, 8, 3, 8, 3, 8, 3, 8, 3, 8, 3, 9, 3, 9, 3, 9, 3, 9, 3, 10, 3, 10, 3, 11, 3, 11, 3, 12, 3, 12, 3, 13, 3, 13, 3, 14, 3, 14, 3, 14, 3, 15, 3, 15, 3, 15, 3, 16, 3, 16, 3, 16, 3, 16, 3, 17, 3, 17, 3, 18, 3, 18, 3, 18, 3, 18, 3, 18, 3, 18, 3, 18, 3, 19, 3, 19, 3, 20, 3, 20, 3, 20, 3, 20, 3, 20, 3, 20, 3, 20, 3, 21, 6, 21, 149, 10, 21, 13, 21, 14, 21, 150, 3, 21, 3, 21, 3, 22, 3, 22, 3, 22, 3, 22, 7, 22, 159, 10, 22, 12, 22, 14, 22, 162, 11, 22, 3, 22, 3, 22, 3, 22, 3, 22, 3, 23, 3, 23, 3, 23, 3, 23, 7, 23, 172, 10, 23, 12, 23, 14, 23, 175, 11, 23, 3, 23, 3, 23, 3, 23, 3, 23, 3, 23, 3, 24, 3, 24, 3, 25, 3, 25, 3, 26, 3, 26, 3, 27, 3, 27, 3, 28, 3, 28, 3, 29, 3, 29, 3, 
30, 3, 30, 3, 30, 3, 31, 3, 31, 3, 31, 3, 32, 3, 32, 3, 32, 3, 33, 3, 33, 3, 33, 3, 34, 3, 34, 3, 34, 3, 35, 3, 35, 3, 35, 3, 36, 3, 36, 3, 36, 3, 37, 3, 37, 3, 37, 3, 37, 3, 38, 3, 38, 3, 38, 3, 38, 3, 38, 3, 38, 3, 38, 3, 38, 3, 38, 5, 38, 228, 10, 38, 3, 39, 3, 39, 3, 39, 3, 39, 5, 39, 234, 10, 39, 3, 39, 3, 39, 3, 39, 5, 39, 239, 10, 39, 3, 40, 6, 40, 242, 10, 40, 13, 40, 14, 40, 243, 3, 41, 3, 41, 5, 41, 248, 10, 41, 3, 41, 3, 41, 3, 42, 3, 42, 5, 42, 254, 10, 42, 3, 42, 3, 42, 3, 42, 7, 42, 259, 10, 42, 12, 42, 14, 42, 262, 11, 42, 3, 43, 3, 43, 3, 44, 3, 44, 4, 160, 173, 2, 45, 3, 3, 5, 4, 7, 5, 9, 6, 11, 7, 13, 8, 15, 9, 17, 10, 19, 11, 21, 12, 23, 13, 25, 14, 27, 15, 29, 16, 31, 17, 33, 18, 35, 19, 37, 20, 39, 21, 41, 22, 43, 23, 45, 24, 47, 25, 49, 26, 51, 27, 53, 28, 55, 29, 57, 30, 59, 31, 61, 32, 63, 33, 65, 34, 67, 35, 69, 36, 71, 37, 73, 38, 75, 39, 77, 40, 79, 41, 81, 2, 83, 42, 85, 2, 87, 2, 3, 2, 7, 5, 2, 11, 12, 15, 15, 34, 34, 4, 2, 71, 71, 103, 103, 4, 2, 45, 45, 47, 47, 4, 2, 67, 92, 99, 124, 3, 2, 50, 59, 2, 275, 2, 3, 3, 2, 2, 2, 2, 5, 3, 2, 2, 2, 2, 7, 3, 2, 2, 2, 2, 9, 3, 2, 2, 2, 2, 11, 3, 2, 2, 2, 2, 13, 3, 2, 2, 2, 2, 15, 3, 2, 2, 2, 2, 17, 3, 2, 2, 2, 2, 19, 3, 2, 2, 2, 2, 21, 3, 2, 2, 2, 2, 23, 3, 2, 2, 2, 2, 25, 3, 2, 2, 2, 2, 27, 3, 2, 2, 2, 2, 29, 3, 2, 2, 2, 2, 31, 3, 2, 2, 2, 2, 33, 3, 2, 2, 2, 2, 35, 3, 2, 2, 2, 2, 37, 3, 2, 2, 2, 2, 39, 3, 2, 2, 2, 2, 41, 3, 2, 2, 2, 2, 43, 3, 2, 2, 2, 2, 45, 3, 2, 2, 2, 2, 47, 3, 2, 2, 2, 2, 49, 3, 2, 2, 2, 2, 51, 3, 2, 2, 2, 2, 53, 3, 2, 2, 2, 2, 55, 3, 2, 2, 2, 2, 57, 3, 2, 2, 2, 2, 59, 3, 2, 2, 2, 2, 61, 3, 2, 2, 2, 2, 63, 3, 2, 2, 2, 2, 65, 3, 2, 2, 2, 2, 67, 3, 2, 2, 2, 2, 69, 3, 2, 2, 2, 2, 71, 3, 2, 2, 2, 2, 73, 3, 2, 2, 2, 2, 75, 3, 2, 2, 2, 2, 77, 3, 2, 2, 2, 2, 79, 3, 2, 2, 2, 2, 83, 3, 2, 2, 2, 3, 89, 3, 2, 2, 2, 5, 91, 3, 2, 2, 2, 7, 93, 3, 2, 2, 2, 9, 95, 3, 2, 2, 2, 11, 97, 3, 2, 2, 2, 13, 99, 3, 2, 2, 2, 15, 102, 3, 2, 2, 2, 17, 107, 3, 2, 2, 2, 19, 111, 3, 2, 2, 2, 21, 113, 3, 
2, 2, 2, 23, 115, 3, 2, 2, 2, 25, 117, 3, 2, 2, 2, 27, 119, 3, 2, 2, 2, 29, 122, 3, 2, 2, 2, 31, 125, 3, 2, 2, 2, 33, 129, 3, 2, 2, 2, 35, 131, 3, 2, 2, 2, 37, 138, 3, 2, 2, 2, 39, 140, 3, 2, 2, 2, 41, 148, 3, 2, 2, 2, 43, 154, 3, 2, 2, 2, 45, 167, 3, 2, 2, 2, 47, 181, 3, 2, 2, 2, 49, 183, 3, 2, 2, 2, 51, 185, 3, 2, 2, 2, 53, 187, 3, 2, 2, 2, 55, 189, 3, 2, 2, 2, 57, 191, 3, 2, 2, 2, 59, 193, 3, 2, 2, 2, 61, 196, 3, 2, 2, 2, 63, 199, 3, 2, 2, 2, 65, 202, 3, 2, 2, 2, 67, 205, 3, 2, 2, 2, 69, 208, 3, 2, 2, 2, 71, 211, 3, 2, 2, 2, 73, 214, 3, 2, 2, 2, 75, 227, 3, 2, 2, 2, 77, 238, 3, 2, 2, 2, 79, 241, 3, 2, 2, 2, 81, 245, 3, 2, 2, 2, 83, 253, 3, 2, 2, 2, 85, 263, 3, 2, 2, 2, 87, 265, 3, 2, 2, 2, 89, 90, 7, 42, 2, 2, 90, 4, 3, 2, 2, 2, 91, 92, 7, 43, 2, 2, 92, 6, 3, 2, 2, 2, 93, 94, 7, 46, 2, 2, 94, 8, 3, 2, 2, 2, 95, 96, 7, 93, 2, 2, 96, 10, 3, 2, 2, 2, 97, 98, 7, 95, 2, 2, 98, 12, 3, 2, 2, 2, 99, 100, 7, 107, 2, 2, 100, 101, 7, 104, 2, 2, 101, 14, 3, 2, 2, 2, 102, 103, 7, 103, 2, 2, 103, 104, 7, 110, 2, 2, 104, 105, 7, 117, 2, 2, 105, 106, 7, 103, 2, 2, 106, 16, 3, 2, 2, 2, 107, 108, 7, 110, 2, 2, 108, 109, 7, 103, 2, 2, 109, 110, 7, 118, 2, 2, 110, 18, 3, 2, 2, 2, 111, 112, 7, 63, 2, 2, 112, 20, 3, 2, 2, 2, 113, 114, 7, 61, 2, 2, 114, 22, 3, 2, 2, 2, 115, 116, 7, 125, 2, 2, 116, 24, 3, 2, 2, 2, 117, 118, 7, 127, 2, 2, 118, 26, 3, 2, 2, 2, 119, 120, 7, 104, 2, 2, 120, 121, 7, 112, 2, 2, 121, 28, 3, 2, 2, 2, 122, 123, 7, 47, 2, 2, 123, 124, 7, 64, 2, 2, 124, 30, 3, 2, 2, 2, 125, 126, 7, 102, 2, 2, 126, 127, 7, 103, 2, 2, 127, 128, 7, 104, 2, 2, 128, 32, 3, 2, 2, 2, 129, 130, 7, 60, 2, 2, 130, 34, 3, 2, 2, 2, 131, 132, 7, 86, 2, 2, 132, 133, 7, 103, 2, 2, 133, 134, 7, 112, 2, 2, 134, 135, 7, 117, 2, 2, 135, 136, 7, 113, 2, 2, 136, 137, 7, 116, 2, 2, 137, 36, 3, 2, 2, 2, 138, 139, 7, 97, 2, 2, 139, 38, 3, 2, 2, 2, 140, 141, 7, 120, 2, 2, 141, 142, 7, 50, 2, 2, 142, 143, 7, 48, 2, 2, 143, 144, 7, 50, 2, 2, 144, 145, 7, 48, 2, 2, 145, 146, 7, 52, 2, 2, 146, 40, 3, 2, 2, 
2, 147, 149, 9, 2, 2, 2, 148, 147, 3, 2, 2, 2, 149, 150, 3, 2, 2, 2, 150, 148, 3, 2, 2, 2, 150, 151, 3, 2, 2, 2, 151, 152, 3, 2, 2, 2, 152, 153, 8, 21, 2, 2, 153, 42, 3, 2, 2, 2, 154, 155, 7, 49, 2, 2, 155, 156, 7, 49, 2, 2, 156, 160, 3, 2, 2, 2, 157, 159, 11, 2, 2, 2, 158, 157, 3, 2, 2, 2, 159, 162, 3, 2, 2, 2, 160, 161, 3, 2, 2, 2, 160, 158, 3, 2, 2, 2, 161, 163, 3, 2, 2, 2, 162, 160, 3, 2, 2, 2, 163, 164, 7, 12, 2, 2, 164, 165, 3, 2, 2, 2, 165, 166, 8, 22, 2, 2, 166, 44, 3, 2, 2, 2, 167, 168, 7, 49, 2, 2, 168, 169, 7, 44, 2, 2, 169, 173, 3, 2, 2, 2, 170, 172, 11, 2, 2, 2, 171, 170, 3, 2, 2, 2, 172, 175, 3, 2, 2, 2, 173, 174, 3, 2, 2, 2, 173, 171, 3, 2, 2, 2, 174, 176, 3, 2, 2, 2, 175, 173, 3, 2, 2, 2, 176, 177, 7, 44, 2, 2, 177, 178, 7, 49, 2, 2, 178, 179, 3, 2, 2, 2, 179, 180, 8, 23, 2, 2, 180, 46, 3, 2, 2, 2, 181, 182, 7, 44, 2, 2, 182, 48, 3, 2, 2, 2, 183, 184, 7, 49, 2, 2, 184, 50, 3, 2, 2, 2, 185, 186, 7, 45, 2, 2, 186, 52, 3, 2, 2, 2, 187, 188, 7, 47, 2, 2, 188, 54, 3, 2, 2, 2, 189, 190, 7, 62, 2, 2, 190, 56, 3, 2, 2, 2, 191, 192, 7, 64, 2, 2, 192, 58, 3, 2, 2, 2, 193, 194, 7, 62, 2, 2, 194, 195, 7, 63, 2, 2, 195, 60, 3, 2, 2, 2, 196, 197, 7, 64, 2, 2, 197, 198, 7, 63, 2, 2, 198, 62, 3, 2, 2, 2, 199, 200, 7, 63, 2, 2, 200, 201, 7, 63, 2, 2, 201, 64, 3, 2, 2, 2, 202, 203, 7, 35, 2, 2, 203, 204, 7, 63, 2, 2, 204, 66, 3, 2, 2, 2, 205, 206, 7, 66, 2, 2, 206, 207, 5, 83, 42, 2, 207, 68, 3, 2, 2, 2, 208, 209, 7, 39, 2, 2, 209, 210, 5, 83, 42, 2, 210, 70, 3, 2, 2, 2, 211, 212, 7, 39, 2, 2, 212, 213, 5, 79, 40, 2, 213, 72, 3, 2, 2, 2, 214, 215, 7, 111, 2, 2, 215, 216, 7, 119, 2, 2, 216, 217, 7, 118, 2, 2, 217, 74, 3, 2, 2, 2, 218, 219, 7, 86, 2, 2, 219, 220, 7, 116, 2, 2, 220, 221, 7, 119, 2, 2, 221, 228, 7, 103, 2, 2, 222, 223, 7, 72, 2, 2, 223, 224, 7, 99, 2, 2, 224, 225, 7, 110, 2, 2, 225, 226, 7, 117, 2, 2, 226, 228, 7, 103, 2, 2, 227, 218, 3, 2, 2, 2, 227, 222, 3, 2, 2, 2, 228, 76, 3, 2, 2, 2, 229, 230, 5, 79, 40, 2, 230, 231, 7, 48, 2, 2, 231, 233, 5, 79, 
40, 2, 232, 234, 5, 81, 41, 2, 233, 232, 3, 2, 2, 2, 233, 234, 3, 2, 2, 2, 234, 239, 3, 2, 2, 2, 235, 236, 5, 79, 40, 2, 236, 237, 5, 81, 41, 2, 237, 239, 3, 2, 2, 2, 238, 229, 3, 2, 2, 2, 238, 235, 3, 2, 2, 2, 239, 78, 3, 2, 2, 2, 240, 242, 5, 87, 44, 2, 241, 240, 3, 2, 2, 2, 242, 243, 3, 2, 2, 2, 243, 241, 3, 2, 2, 2, 243, 244, 3, 2, 2, 2, 244, 80, 3, 2, 2, 2, 245, 247, 9, 3, 2, 2, 246, 248, 9, 4, 2, 2, 247, 246, 3, 2, 2, 2, 247, 248, 3, 2, 2, 2, 248, 249, 3, 2, 2, 2, 249, 250, 5, 79, 40, 2, 250, 82, 3, 2, 2, 2, 251, 254, 7, 97, 2, 2, 252, 254, 5, 85, 43, 2, 253, 251, 3, 2, 2, 2, 253, 252, 3, 2, 2, 2, 254, 260, 3, 2, 2, 2, 255, 259, 7, 97, 2, 2, 256, 259, 5, 85, 43, 2, 257, 259, 5, 87, 44, 2, 258, 255, 3, 2, 2, 2, 258, 256, 3, 2, 2, 2, 258, 257, 3, 2, 2, 2, 259, 262, 3, 2, 2, 2, 260, 258, 3, 2, 2, 2, 260, 261, 3, 2, 2, 2, 261, 84, 3, 2, 2, 2, 262, 260, 3, 2, 2, 2, 263, 264, 9, 5, 2, 2, 264, 86, 3, 2, 2, 2, 265, 266, 9, 6, 2, 2, 266, 88, 3, 2, 2, 2, 14, 2, 150, 160, 173, 227, 233, 238, 243, 247, 253, 258, 260, 3, 8, 2, 2] \ No newline at end of file diff --git a/python/tvm/relay/grammar/py3/RelayLexer.py b/python/tvm/relay/grammar/py3/RelayLexer.py new file mode 100644 index 000000000000..fbf74bf1411b --- /dev/null +++ b/python/tvm/relay/grammar/py3/RelayLexer.py @@ -0,0 +1,203 @@ +# Generated from /home/sslyu/tvm/python/tvm/relay/grammar/Relay.g4 by ANTLR 4.7.2 +from antlr4 import * +from io import StringIO +from typing.io import TextIO +import sys + + + +def serializedATN(): + with StringIO() as buf: + buf.write("\3\u608b\ua72a\u8133\ub9ed\u417c\u3be7\u7786\u5964\2*") + buf.write("\u010b\b\1\4\2\t\2\4\3\t\3\4\4\t\4\4\5\t\5\4\6\t\6\4\7") + buf.write("\t\7\4\b\t\b\4\t\t\t\4\n\t\n\4\13\t\13\4\f\t\f\4\r\t\r") + buf.write("\4\16\t\16\4\17\t\17\4\20\t\20\4\21\t\21\4\22\t\22\4\23") + buf.write("\t\23\4\24\t\24\4\25\t\25\4\26\t\26\4\27\t\27\4\30\t\30") + buf.write("\4\31\t\31\4\32\t\32\4\33\t\33\4\34\t\34\4\35\t\35\4\36") + buf.write("\t\36\4\37\t\37\4 \t 
\4!\t!\4\"\t\"\4#\t#\4$\t$\4%\t%") + buf.write("\4&\t&\4\'\t\'\4(\t(\4)\t)\4*\t*\4+\t+\4,\t,\3\2\3\2\3") + buf.write("\3\3\3\3\4\3\4\3\5\3\5\3\6\3\6\3\7\3\7\3\7\3\b\3\b\3\b") + buf.write("\3\b\3\b\3\t\3\t\3\t\3\t\3\n\3\n\3\13\3\13\3\f\3\f\3\r") + buf.write("\3\r\3\16\3\16\3\16\3\17\3\17\3\17\3\20\3\20\3\20\3\20") + buf.write("\3\21\3\21\3\22\3\22\3\22\3\22\3\22\3\22\3\22\3\23\3\23") + buf.write("\3\24\3\24\3\24\3\24\3\24\3\24\3\24\3\25\6\25\u0095\n") + buf.write("\25\r\25\16\25\u0096\3\25\3\25\3\26\3\26\3\26\3\26\7\26") + buf.write("\u009f\n\26\f\26\16\26\u00a2\13\26\3\26\3\26\3\26\3\26") + buf.write("\3\27\3\27\3\27\3\27\7\27\u00ac\n\27\f\27\16\27\u00af") + buf.write("\13\27\3\27\3\27\3\27\3\27\3\27\3\30\3\30\3\31\3\31\3") + buf.write("\32\3\32\3\33\3\33\3\34\3\34\3\35\3\35\3\36\3\36\3\36") + buf.write("\3\37\3\37\3\37\3 \3 \3 \3!\3!\3!\3\"\3\"\3\"\3#\3#\3") + buf.write("#\3$\3$\3$\3%\3%\3%\3%\3&\3&\3&\3&\3&\3&\3&\3&\3&\5&\u00e4") + buf.write("\n&\3\'\3\'\3\'\3\'\5\'\u00ea\n\'\3\'\3\'\3\'\5\'\u00ef") + buf.write("\n\'\3(\6(\u00f2\n(\r(\16(\u00f3\3)\3)\5)\u00f8\n)\3)") + buf.write("\3)\3*\3*\5*\u00fe\n*\3*\3*\3*\7*\u0103\n*\f*\16*\u0106") + buf.write("\13*\3+\3+\3,\3,\4\u00a0\u00ad\2-\3\3\5\4\7\5\t\6\13\7") + buf.write("\r\b\17\t\21\n\23\13\25\f\27\r\31\16\33\17\35\20\37\21") + buf.write("!\22#\23%\24\'\25)\26+\27-\30/\31\61\32\63\33\65\34\67") + buf.write("\359\36;\37= ?!A\"C#E$G%I&K\'M(O)Q\2S*U\2W\2\3\2\7\5\2") + buf.write("\13\f\17\17\"\"\4\2GGgg\4\2--//\4\2C\\c|\3\2\62;\2\u0113") + buf.write("\2\3\3\2\2\2\2\5\3\2\2\2\2\7\3\2\2\2\2\t\3\2\2\2\2\13") + buf.write("\3\2\2\2\2\r\3\2\2\2\2\17\3\2\2\2\2\21\3\2\2\2\2\23\3") + buf.write("\2\2\2\2\25\3\2\2\2\2\27\3\2\2\2\2\31\3\2\2\2\2\33\3\2") + buf.write("\2\2\2\35\3\2\2\2\2\37\3\2\2\2\2!\3\2\2\2\2#\3\2\2\2\2") + buf.write("%\3\2\2\2\2\'\3\2\2\2\2)\3\2\2\2\2+\3\2\2\2\2-\3\2\2\2") + buf.write("\2/\3\2\2\2\2\61\3\2\2\2\2\63\3\2\2\2\2\65\3\2\2\2\2\67") + buf.write("\3\2\2\2\29\3\2\2\2\2;\3\2\2\2\2=\3\2\2\2\2?\3\2\2\2\2") + 
buf.write("A\3\2\2\2\2C\3\2\2\2\2E\3\2\2\2\2G\3\2\2\2\2I\3\2\2\2") + buf.write("\2K\3\2\2\2\2M\3\2\2\2\2O\3\2\2\2\2S\3\2\2\2\3Y\3\2\2") + buf.write("\2\5[\3\2\2\2\7]\3\2\2\2\t_\3\2\2\2\13a\3\2\2\2\rc\3\2") + buf.write("\2\2\17f\3\2\2\2\21k\3\2\2\2\23o\3\2\2\2\25q\3\2\2\2\27") + buf.write("s\3\2\2\2\31u\3\2\2\2\33w\3\2\2\2\35z\3\2\2\2\37}\3\2") + buf.write("\2\2!\u0081\3\2\2\2#\u0083\3\2\2\2%\u008a\3\2\2\2\'\u008c") + buf.write("\3\2\2\2)\u0094\3\2\2\2+\u009a\3\2\2\2-\u00a7\3\2\2\2") + buf.write("/\u00b5\3\2\2\2\61\u00b7\3\2\2\2\63\u00b9\3\2\2\2\65\u00bb") + buf.write("\3\2\2\2\67\u00bd\3\2\2\29\u00bf\3\2\2\2;\u00c1\3\2\2") + buf.write("\2=\u00c4\3\2\2\2?\u00c7\3\2\2\2A\u00ca\3\2\2\2C\u00cd") + buf.write("\3\2\2\2E\u00d0\3\2\2\2G\u00d3\3\2\2\2I\u00d6\3\2\2\2") + buf.write("K\u00e3\3\2\2\2M\u00ee\3\2\2\2O\u00f1\3\2\2\2Q\u00f5\3") + buf.write("\2\2\2S\u00fd\3\2\2\2U\u0107\3\2\2\2W\u0109\3\2\2\2YZ") + buf.write("\7*\2\2Z\4\3\2\2\2[\\\7+\2\2\\\6\3\2\2\2]^\7.\2\2^\b\3") + buf.write("\2\2\2_`\7]\2\2`\n\3\2\2\2ab\7_\2\2b\f\3\2\2\2cd\7k\2") + buf.write("\2de\7h\2\2e\16\3\2\2\2fg\7g\2\2gh\7n\2\2hi\7u\2\2ij\7") + buf.write("g\2\2j\20\3\2\2\2kl\7n\2\2lm\7g\2\2mn\7v\2\2n\22\3\2\2") + buf.write("\2op\7?\2\2p\24\3\2\2\2qr\7=\2\2r\26\3\2\2\2st\7}\2\2") + buf.write("t\30\3\2\2\2uv\7\177\2\2v\32\3\2\2\2wx\7h\2\2xy\7p\2\2") + buf.write("y\34\3\2\2\2z{\7/\2\2{|\7@\2\2|\36\3\2\2\2}~\7f\2\2~\177") + buf.write("\7g\2\2\177\u0080\7h\2\2\u0080 \3\2\2\2\u0081\u0082\7") + buf.write("<\2\2\u0082\"\3\2\2\2\u0083\u0084\7V\2\2\u0084\u0085\7") + buf.write("g\2\2\u0085\u0086\7p\2\2\u0086\u0087\7u\2\2\u0087\u0088") + buf.write("\7q\2\2\u0088\u0089\7t\2\2\u0089$\3\2\2\2\u008a\u008b") + buf.write("\7a\2\2\u008b&\3\2\2\2\u008c\u008d\7x\2\2\u008d\u008e") + buf.write("\7\62\2\2\u008e\u008f\7\60\2\2\u008f\u0090\7\62\2\2\u0090") + buf.write("\u0091\7\60\2\2\u0091\u0092\7\64\2\2\u0092(\3\2\2\2\u0093") + buf.write("\u0095\t\2\2\2\u0094\u0093\3\2\2\2\u0095\u0096\3\2\2\2") + 
buf.write("\u0096\u0094\3\2\2\2\u0096\u0097\3\2\2\2\u0097\u0098\3") + buf.write("\2\2\2\u0098\u0099\b\25\2\2\u0099*\3\2\2\2\u009a\u009b") + buf.write("\7\61\2\2\u009b\u009c\7\61\2\2\u009c\u00a0\3\2\2\2\u009d") + buf.write("\u009f\13\2\2\2\u009e\u009d\3\2\2\2\u009f\u00a2\3\2\2") + buf.write("\2\u00a0\u00a1\3\2\2\2\u00a0\u009e\3\2\2\2\u00a1\u00a3") + buf.write("\3\2\2\2\u00a2\u00a0\3\2\2\2\u00a3\u00a4\7\f\2\2\u00a4") + buf.write("\u00a5\3\2\2\2\u00a5\u00a6\b\26\2\2\u00a6,\3\2\2\2\u00a7") + buf.write("\u00a8\7\61\2\2\u00a8\u00a9\7,\2\2\u00a9\u00ad\3\2\2\2") + buf.write("\u00aa\u00ac\13\2\2\2\u00ab\u00aa\3\2\2\2\u00ac\u00af") + buf.write("\3\2\2\2\u00ad\u00ae\3\2\2\2\u00ad\u00ab\3\2\2\2\u00ae") + buf.write("\u00b0\3\2\2\2\u00af\u00ad\3\2\2\2\u00b0\u00b1\7,\2\2") + buf.write("\u00b1\u00b2\7\61\2\2\u00b2\u00b3\3\2\2\2\u00b3\u00b4") + buf.write("\b\27\2\2\u00b4.\3\2\2\2\u00b5\u00b6\7,\2\2\u00b6\60\3") + buf.write("\2\2\2\u00b7\u00b8\7\61\2\2\u00b8\62\3\2\2\2\u00b9\u00ba") + buf.write("\7-\2\2\u00ba\64\3\2\2\2\u00bb\u00bc\7/\2\2\u00bc\66\3") + buf.write("\2\2\2\u00bd\u00be\7>\2\2\u00be8\3\2\2\2\u00bf\u00c0\7") + buf.write("@\2\2\u00c0:\3\2\2\2\u00c1\u00c2\7>\2\2\u00c2\u00c3\7") + buf.write("?\2\2\u00c3<\3\2\2\2\u00c4\u00c5\7@\2\2\u00c5\u00c6\7") + buf.write("?\2\2\u00c6>\3\2\2\2\u00c7\u00c8\7?\2\2\u00c8\u00c9\7") + buf.write("?\2\2\u00c9@\3\2\2\2\u00ca\u00cb\7#\2\2\u00cb\u00cc\7") + buf.write("?\2\2\u00ccB\3\2\2\2\u00cd\u00ce\7B\2\2\u00ce\u00cf\5") + buf.write("S*\2\u00cfD\3\2\2\2\u00d0\u00d1\7\'\2\2\u00d1\u00d2\5") + buf.write("S*\2\u00d2F\3\2\2\2\u00d3\u00d4\7\'\2\2\u00d4\u00d5\5") + buf.write("O(\2\u00d5H\3\2\2\2\u00d6\u00d7\7o\2\2\u00d7\u00d8\7w") + buf.write("\2\2\u00d8\u00d9\7v\2\2\u00d9J\3\2\2\2\u00da\u00db\7V") + buf.write("\2\2\u00db\u00dc\7t\2\2\u00dc\u00dd\7w\2\2\u00dd\u00e4") + buf.write("\7g\2\2\u00de\u00df\7H\2\2\u00df\u00e0\7c\2\2\u00e0\u00e1") + buf.write("\7n\2\2\u00e1\u00e2\7u\2\2\u00e2\u00e4\7g\2\2\u00e3\u00da") + 
buf.write("\3\2\2\2\u00e3\u00de\3\2\2\2\u00e4L\3\2\2\2\u00e5\u00e6") + buf.write("\5O(\2\u00e6\u00e7\7\60\2\2\u00e7\u00e9\5O(\2\u00e8\u00ea") + buf.write("\5Q)\2\u00e9\u00e8\3\2\2\2\u00e9\u00ea\3\2\2\2\u00ea\u00ef") + buf.write("\3\2\2\2\u00eb\u00ec\5O(\2\u00ec\u00ed\5Q)\2\u00ed\u00ef") + buf.write("\3\2\2\2\u00ee\u00e5\3\2\2\2\u00ee\u00eb\3\2\2\2\u00ef") + buf.write("N\3\2\2\2\u00f0\u00f2\5W,\2\u00f1\u00f0\3\2\2\2\u00f2") + buf.write("\u00f3\3\2\2\2\u00f3\u00f1\3\2\2\2\u00f3\u00f4\3\2\2\2") + buf.write("\u00f4P\3\2\2\2\u00f5\u00f7\t\3\2\2\u00f6\u00f8\t\4\2") + buf.write("\2\u00f7\u00f6\3\2\2\2\u00f7\u00f8\3\2\2\2\u00f8\u00f9") + buf.write("\3\2\2\2\u00f9\u00fa\5O(\2\u00faR\3\2\2\2\u00fb\u00fe") + buf.write("\7a\2\2\u00fc\u00fe\5U+\2\u00fd\u00fb\3\2\2\2\u00fd\u00fc") + buf.write("\3\2\2\2\u00fe\u0104\3\2\2\2\u00ff\u0103\7a\2\2\u0100") + buf.write("\u0103\5U+\2\u0101\u0103\5W,\2\u0102\u00ff\3\2\2\2\u0102") + buf.write("\u0100\3\2\2\2\u0102\u0101\3\2\2\2\u0103\u0106\3\2\2\2") + buf.write("\u0104\u0102\3\2\2\2\u0104\u0105\3\2\2\2\u0105T\3\2\2") + buf.write("\2\u0106\u0104\3\2\2\2\u0107\u0108\t\5\2\2\u0108V\3\2") + buf.write("\2\2\u0109\u010a\t\6\2\2\u010aX\3\2\2\2\16\2\u0096\u00a0") + buf.write("\u00ad\u00e3\u00e9\u00ee\u00f3\u00f7\u00fd\u0102\u0104") + buf.write("\3\b\2\2") + return buf.getvalue() + + +class RelayLexer(Lexer): + + atn = ATNDeserializer().deserialize(serializedATN()) + + decisionsToDFA = [ DFA(ds, i) for i, ds in enumerate(atn.decisionToState) ] + + T__0 = 1 + T__1 = 2 + T__2 = 3 + T__3 = 4 + T__4 = 5 + T__5 = 6 + T__6 = 7 + T__7 = 8 + T__8 = 9 + T__9 = 10 + T__10 = 11 + T__11 = 12 + T__12 = 13 + T__13 = 14 + T__14 = 15 + T__15 = 16 + T__16 = 17 + T__17 = 18 + SEMVER = 19 + WS = 20 + LINE_COMMENT = 21 + COMMENT = 22 + MUL = 23 + DIV = 24 + ADD = 25 + SUB = 26 + LT = 27 + GT = 28 + LE = 29 + GE = 30 + EQ = 31 + NE = 32 + GLOBAL_VAR = 33 + LOCAL_VAR = 34 + GRAPH_VAR = 35 + MUT = 36 + BOOL_LIT = 37 + FLOAT = 38 + NAT = 39 + CNAME = 40 + + channelNames = 
[ u"DEFAULT_TOKEN_CHANNEL", u"HIDDEN" ] + + modeNames = [ "DEFAULT_MODE" ] + + literalNames = [ "", + "'('", "')'", "','", "'['", "']'", "'if'", "'else'", "'let'", + "'='", "';'", "'{'", "'}'", "'fn'", "'->'", "'def'", "':'", + "'Tensor'", "'_'", "'v0.0.2'", "'*'", "'/'", "'+'", "'-'", "'<'", + "'>'", "'<='", "'>='", "'=='", "'!='", "'mut'" ] + + symbolicNames = [ "", + "SEMVER", "WS", "LINE_COMMENT", "COMMENT", "MUL", "DIV", "ADD", + "SUB", "LT", "GT", "LE", "GE", "EQ", "NE", "GLOBAL_VAR", "LOCAL_VAR", + "GRAPH_VAR", "MUT", "BOOL_LIT", "FLOAT", "NAT", "CNAME" ] + + ruleNames = [ "T__0", "T__1", "T__2", "T__3", "T__4", "T__5", "T__6", + "T__7", "T__8", "T__9", "T__10", "T__11", "T__12", "T__13", + "T__14", "T__15", "T__16", "T__17", "SEMVER", "WS", "LINE_COMMENT", + "COMMENT", "MUL", "DIV", "ADD", "SUB", "LT", "GT", "LE", + "GE", "EQ", "NE", "GLOBAL_VAR", "LOCAL_VAR", "GRAPH_VAR", + "MUT", "BOOL_LIT", "FLOAT", "NAT", "EXP", "CNAME", "LETTER", + "DIGIT" ] + + grammarFileName = "Relay.g4" + + def __init__(self, input=None, output:TextIO = sys.stdout): + super().__init__(input, output) + self.checkVersion("4.7.2") + self._interp = LexerATNSimulator(self, self.atn, self.decisionsToDFA, PredictionContextCache()) + self._actions = None + self._predicates = None + + diff --git a/python/tvm/relay/grammar/py3/RelayLexer.tokens b/python/tvm/relay/grammar/py3/RelayLexer.tokens new file mode 100644 index 000000000000..41f3ee62a86c --- /dev/null +++ b/python/tvm/relay/grammar/py3/RelayLexer.tokens @@ -0,0 +1,70 @@ +T__0=1 +T__1=2 +T__2=3 +T__3=4 +T__4=5 +T__5=6 +T__6=7 +T__7=8 +T__8=9 +T__9=10 +T__10=11 +T__11=12 +T__12=13 +T__13=14 +T__14=15 +T__15=16 +T__16=17 +T__17=18 +SEMVER=19 +WS=20 +LINE_COMMENT=21 +COMMENT=22 +MUL=23 +DIV=24 +ADD=25 +SUB=26 +LT=27 +GT=28 +LE=29 +GE=30 +EQ=31 +NE=32 +GLOBAL_VAR=33 +LOCAL_VAR=34 +GRAPH_VAR=35 +MUT=36 +BOOL_LIT=37 +FLOAT=38 +NAT=39 +CNAME=40 +'('=1 +')'=2 +','=3 +'['=4 +']'=5 +'if'=6 +'else'=7 +'let'=8 +'='=9 +';'=10 +'{'=11 +'}'=12 
+'fn'=13 +'->'=14 +'def'=15 +':'=16 +'Tensor'=17 +'_'=18 +'v0.0.2'=19 +'*'=23 +'/'=24 +'+'=25 +'-'=26 +'<'=27 +'>'=28 +'<='=29 +'>='=30 +'=='=31 +'!='=32 +'mut'=36 diff --git a/python/tvm/relay/grammar/py3/RelayParser.py b/python/tvm/relay/grammar/py3/RelayParser.py new file mode 100644 index 000000000000..ff5cffc36a9f --- /dev/null +++ b/python/tvm/relay/grammar/py3/RelayParser.py @@ -0,0 +1,2307 @@ +# Generated from /home/sslyu/tvm/python/tvm/relay/grammar/Relay.g4 by ANTLR 4.7.2 +# encoding: utf-8 +from antlr4 import * +from io import StringIO +from typing.io import TextIO +import sys + + +def serializedATN(): + with StringIO() as buf: + buf.write("\3\u608b\ua72a\u8133\ub9ed\u417c\u3be7\u7786\u5964\3*") + buf.write("\u014c\4\2\t\2\4\3\t\3\4\4\t\4\4\5\t\5\4\6\t\6\4\7\t\7") + buf.write("\4\b\t\b\4\t\t\t\4\n\t\n\4\13\t\13\4\f\t\f\4\r\t\r\4\16") + buf.write("\t\16\4\17\t\17\4\20\t\20\4\21\t\21\4\22\t\22\4\23\t\23") + buf.write("\3\2\3\2\3\3\3\3\7\3+\n\3\f\3\16\3.\13\3\3\3\5\3\61\n") + buf.write("\3\3\3\3\3\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4") + buf.write("\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\6\4H\n\4\r\4\16\4I\3") + buf.write("\4\3\4\3\4\3\4\3\4\3\4\7\4R\n\4\f\4\16\4U\13\4\5\4W\n") + buf.write("\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\5\4d\n") + buf.write("\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\5\4n\n\4\3\4\3\4\3") + buf.write("\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4") + buf.write("\5\4\u0080\n\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4") + buf.write("\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\3\4\7\4\u0096\n\4") + buf.write("\f\4\16\4\u0099\13\4\5\4\u009b\n\4\3\4\7\4\u009e\n\4\f") + buf.write("\4\16\4\u00a1\13\4\3\5\3\5\5\5\u00a5\n\5\3\5\3\5\3\5\3") + buf.write("\5\3\5\5\5\u00ac\n\5\3\5\3\5\3\6\3\6\3\6\5\6\u00b3\n\6") + buf.write("\3\6\3\6\3\6\3\6\3\6\5\6\u00ba\n\6\3\6\3\6\3\7\3\7\3\7") + buf.write("\3\7\3\7\3\7\5\7\u00c4\n\7\3\b\3\b\3\b\7\b\u00c9\n\b\f") + buf.write("\b\16\b\u00cc\13\b\5\b\u00ce\n\b\3\t\3\t\3\t\5\t\u00d3") + 
buf.write("\n\t\3\n\3\n\3\n\7\n\u00d8\n\n\f\n\16\n\u00db\13\n\5\n") + buf.write("\u00dd\n\n\3\13\3\13\3\13\3\13\3\f\3\f\3\f\3\f\3\f\3\f") + buf.write("\7\f\u00e9\n\f\f\f\16\f\u00ec\13\f\3\f\3\f\5\f\u00f0\n") + buf.write("\f\3\r\3\r\3\r\3\r\3\r\3\r\3\r\3\r\3\r\3\r\3\r\6\r\u00fd") + buf.write("\n\r\r\r\16\r\u00fe\3\r\3\r\3\r\3\r\3\r\3\r\3\r\3\r\3") + buf.write("\r\3\r\3\r\3\r\5\r\u010d\n\r\3\r\3\r\3\r\3\r\7\r\u0113") + buf.write("\n\r\f\r\16\r\u0116\13\r\5\r\u0118\n\r\3\r\3\r\3\r\3\r") + buf.write("\3\r\5\r\u011f\n\r\3\16\3\16\3\16\3\16\3\16\3\16\3\16") + buf.write("\3\16\3\16\3\16\3\16\6\16\u012c\n\16\r\16\16\16\u012d") + buf.write("\3\16\3\16\5\16\u0132\n\16\3\17\3\17\3\17\3\17\3\17\5") + buf.write("\17\u0139\n\17\3\20\3\20\3\21\3\21\3\21\3\21\3\22\3\22") + buf.write("\3\22\5\22\u0144\n\22\3\23\3\23\3\23\3\23\5\23\u014a\n") + buf.write("\23\3\23\2\3\6\24\2\4\6\b\n\f\16\20\22\24\26\30\32\34") + buf.write("\36 \"$\2\6\3\2\31\32\3\2\33\34\3\2\35 \3\2!\"\2\u0175") + buf.write("\2&\3\2\2\2\4(\3\2\2\2\6\177\3\2\2\2\b\u00a2\3\2\2\2\n") + buf.write("\u00af\3\2\2\2\f\u00c3\3\2\2\2\16\u00cd\3\2\2\2\20\u00cf") + buf.write("\3\2\2\2\22\u00dc\3\2\2\2\24\u00de\3\2\2\2\26\u00ef\3") + buf.write("\2\2\2\30\u011e\3\2\2\2\32\u0131\3\2\2\2\34\u0138\3\2") + buf.write("\2\2\36\u013a\3\2\2\2 \u013c\3\2\2\2\"\u0143\3\2\2\2$") + buf.write("\u0149\3\2\2\2&\'\7*\2\2\'\3\3\2\2\2(\60\7\25\2\2)+\5") + buf.write("\n\6\2*)\3\2\2\2+.\3\2\2\2,*\3\2\2\2,-\3\2\2\2-\61\3\2") + buf.write("\2\2.,\3\2\2\2/\61\5\6\4\2\60,\3\2\2\2\60/\3\2\2\2\61") + buf.write("\62\3\2\2\2\62\63\7\2\2\3\63\5\3\2\2\2\64\65\b\4\1\2\65") + buf.write("\66\7\3\2\2\66\67\5\6\4\2\678\7\4\2\28\u0080\3\2\2\29") + buf.write(":\7\34\2\2:\u0080\5\6\4\23;\u0080\5\b\5\2<=\7\3\2\2=\u0080") + buf.write("\7\4\2\2>?\7\3\2\2?@\5\6\4\2@A\7\5\2\2AB\7\4\2\2B\u0080") + buf.write("\3\2\2\2CD\7\3\2\2DG\5\6\4\2EF\7\5\2\2FH\5\6\4\2GE\3\2") + buf.write("\2\2HI\3\2\2\2IG\3\2\2\2IJ\3\2\2\2JK\3\2\2\2KL\7\4\2\2") + 
buf.write("L\u0080\3\2\2\2MV\7\6\2\2NS\5\6\4\2OP\7\5\2\2PR\5\6\4") + buf.write("\2QO\3\2\2\2RU\3\2\2\2SQ\3\2\2\2ST\3\2\2\2TW\3\2\2\2U") + buf.write("S\3\2\2\2VN\3\2\2\2VW\3\2\2\2WX\3\2\2\2X\u0080\7\7\2\2") + buf.write("YZ\7\b\2\2Z[\7\3\2\2[\\\5\6\4\2\\]\7\4\2\2]^\5 \21\2^") + buf.write("_\7\t\2\2_`\5 \21\2`\u0080\3\2\2\2ac\7\n\2\2bd\7&\2\2") + buf.write("cb\3\2\2\2cd\3\2\2\2de\3\2\2\2ef\5\20\t\2fg\7\13\2\2g") + buf.write("h\5\6\4\2hi\7\f\2\2ij\5\6\4\bj\u0080\3\2\2\2km\7\n\2\2") + buf.write("ln\7&\2\2ml\3\2\2\2mn\3\2\2\2no\3\2\2\2op\5\20\t\2pq\7") + buf.write("\13\2\2qr\7\r\2\2rs\5\6\4\2st\7\16\2\2tu\7\f\2\2uv\5\6") + buf.write("\4\7v\u0080\3\2\2\2wx\5$\23\2xy\7\13\2\2yz\5\6\4\2z{\7") + buf.write("\f\2\2{|\5\6\4\5|\u0080\3\2\2\2}\u0080\5$\23\2~\u0080") + buf.write("\5\"\22\2\177\64\3\2\2\2\1779\3\2\2\2\177;\3\2\2\2\177") + buf.write("<\3\2\2\2\177>\3\2\2\2\177C\3\2\2\2\177M\3\2\2\2\177Y") + buf.write("\3\2\2\2\177a\3\2\2\2\177k\3\2\2\2\177w\3\2\2\2\177}\3") + buf.write("\2\2\2\177~\3\2\2\2\u0080\u009f\3\2\2\2\u0081\u0082\f") + buf.write("\22\2\2\u0082\u0083\t\2\2\2\u0083\u009e\5\6\4\23\u0084") + buf.write("\u0085\f\21\2\2\u0085\u0086\t\3\2\2\u0086\u009e\5\6\4") + buf.write("\22\u0087\u0088\f\20\2\2\u0088\u0089\t\4\2\2\u0089\u009e") + buf.write("\5\6\4\21\u008a\u008b\f\17\2\2\u008b\u008c\t\5\2\2\u008c") + buf.write("\u009e\5\6\4\20\u008d\u008e\f\6\2\2\u008e\u008f\7\f\2") + buf.write("\2\u008f\u009e\5\6\4\7\u0090\u0091\f\24\2\2\u0091\u009a") + buf.write("\7\3\2\2\u0092\u0097\5\6\4\2\u0093\u0094\7\5\2\2\u0094") + buf.write("\u0096\5\6\4\2\u0095\u0093\3\2\2\2\u0096\u0099\3\2\2\2") + buf.write("\u0097\u0095\3\2\2\2\u0097\u0098\3\2\2\2\u0098\u009b\3") + buf.write("\2\2\2\u0099\u0097\3\2\2\2\u009a\u0092\3\2\2\2\u009a\u009b") + buf.write("\3\2\2\2\u009b\u009c\3\2\2\2\u009c\u009e\7\4\2\2\u009d") + buf.write("\u0081\3\2\2\2\u009d\u0084\3\2\2\2\u009d\u0087\3\2\2\2") + buf.write("\u009d\u008a\3\2\2\2\u009d\u008d\3\2\2\2\u009d\u0090\3") + 
buf.write("\2\2\2\u009e\u00a1\3\2\2\2\u009f\u009d\3\2\2\2\u009f\u00a0") + buf.write("\3\2\2\2\u00a0\7\3\2\2\2\u00a1\u009f\3\2\2\2\u00a2\u00a4") + buf.write("\7\17\2\2\u00a3\u00a5\5\26\f\2\u00a4\u00a3\3\2\2\2\u00a4") + buf.write("\u00a5\3\2\2\2\u00a5\u00a6\3\2\2\2\u00a6\u00a7\7\3\2\2") + buf.write("\u00a7\u00a8\5\f\7\2\u00a8\u00ab\7\4\2\2\u00a9\u00aa\7") + buf.write("\20\2\2\u00aa\u00ac\5\30\r\2\u00ab\u00a9\3\2\2\2\u00ab") + buf.write("\u00ac\3\2\2\2\u00ac\u00ad\3\2\2\2\u00ad\u00ae\5 \21\2") + buf.write("\u00ae\t\3\2\2\2\u00af\u00b0\7\21\2\2\u00b0\u00b2\5$\23") + buf.write("\2\u00b1\u00b3\5\26\f\2\u00b2\u00b1\3\2\2\2\u00b2\u00b3") + buf.write("\3\2\2\2\u00b3\u00b4\3\2\2\2\u00b4\u00b5\7\3\2\2\u00b5") + buf.write("\u00b6\5\f\7\2\u00b6\u00b9\7\4\2\2\u00b7\u00b8\7\20\2") + buf.write("\2\u00b8\u00ba\5\30\r\2\u00b9\u00b7\3\2\2\2\u00b9\u00ba") + buf.write("\3\2\2\2\u00ba\u00bb\3\2\2\2\u00bb\u00bc\5 \21\2\u00bc") + buf.write("\13\3\2\2\2\u00bd\u00c4\5\16\b\2\u00be\u00c4\5\22\n\2") + buf.write("\u00bf\u00c0\5\16\b\2\u00c0\u00c1\7\5\2\2\u00c1\u00c2") + buf.write("\5\22\n\2\u00c2\u00c4\3\2\2\2\u00c3\u00bd\3\2\2\2\u00c3") + buf.write("\u00be\3\2\2\2\u00c3\u00bf\3\2\2\2\u00c4\r\3\2\2\2\u00c5") + buf.write("\u00ca\5\20\t\2\u00c6\u00c7\7\5\2\2\u00c7\u00c9\5\20\t") + buf.write("\2\u00c8\u00c6\3\2\2\2\u00c9\u00cc\3\2\2\2\u00ca\u00c8") + buf.write("\3\2\2\2\u00ca\u00cb\3\2\2\2\u00cb\u00ce\3\2\2\2\u00cc") + buf.write("\u00ca\3\2\2\2\u00cd\u00c5\3\2\2\2\u00cd\u00ce\3\2\2\2") + buf.write("\u00ce\17\3\2\2\2\u00cf\u00d2\5$\23\2\u00d0\u00d1\7\22") + buf.write("\2\2\u00d1\u00d3\5\30\r\2\u00d2\u00d0\3\2\2\2\u00d2\u00d3") + buf.write("\3\2\2\2\u00d3\21\3\2\2\2\u00d4\u00d9\5\24\13\2\u00d5") + buf.write("\u00d6\7\5\2\2\u00d6\u00d8\5\24\13\2\u00d7\u00d5\3\2\2") + buf.write("\2\u00d8\u00db\3\2\2\2\u00d9\u00d7\3\2\2\2\u00d9\u00da") + buf.write("\3\2\2\2\u00da\u00dd\3\2\2\2\u00db\u00d9\3\2\2\2\u00dc") + buf.write("\u00d4\3\2\2\2\u00dc\u00dd\3\2\2\2\u00dd\23\3\2\2\2\u00de") + 
buf.write("\u00df\7*\2\2\u00df\u00e0\7\13\2\2\u00e0\u00e1\5\6\4\2") + buf.write("\u00e1\25\3\2\2\2\u00e2\u00e3\7\6\2\2\u00e3\u00f0\7\7") + buf.write("\2\2\u00e4\u00e5\7\6\2\2\u00e5\u00ea\5$\23\2\u00e6\u00e7") + buf.write("\7\5\2\2\u00e7\u00e9\5$\23\2\u00e8\u00e6\3\2\2\2\u00e9") + buf.write("\u00ec\3\2\2\2\u00ea\u00e8\3\2\2\2\u00ea\u00eb\3\2\2\2") + buf.write("\u00eb\u00ed\3\2\2\2\u00ec\u00ea\3\2\2\2\u00ed\u00ee\7") + buf.write("\7\2\2\u00ee\u00f0\3\2\2\2\u00ef\u00e2\3\2\2\2\u00ef\u00e4") + buf.write("\3\2\2\2\u00f0\27\3\2\2\2\u00f1\u00f2\7\3\2\2\u00f2\u011f") + buf.write("\7\4\2\2\u00f3\u00f4\7\3\2\2\u00f4\u00f5\5\30\r\2\u00f5") + buf.write("\u00f6\7\5\2\2\u00f6\u00f7\7\4\2\2\u00f7\u011f\3\2\2\2") + buf.write("\u00f8\u00f9\7\3\2\2\u00f9\u00fc\5\30\r\2\u00fa\u00fb") + buf.write("\7\5\2\2\u00fb\u00fd\5\30\r\2\u00fc\u00fa\3\2\2\2\u00fd") + buf.write("\u00fe\3\2\2\2\u00fe\u00fc\3\2\2\2\u00fe\u00ff\3\2\2\2") + buf.write("\u00ff\u0100\3\2\2\2\u0100\u0101\7\4\2\2\u0101\u011f\3") + buf.write("\2\2\2\u0102\u011f\5\36\20\2\u0103\u0104\7\23\2\2\u0104") + buf.write("\u0105\7\6\2\2\u0105\u0106\5\32\16\2\u0106\u0107\7\5\2") + buf.write("\2\u0107\u0108\5\30\r\2\u0108\u0109\7\7\2\2\u0109\u011f") + buf.write("\3\2\2\2\u010a\u010c\7\17\2\2\u010b\u010d\5\26\f\2\u010c") + buf.write("\u010b\3\2\2\2\u010c\u010d\3\2\2\2\u010d\u010e\3\2\2\2") + buf.write("\u010e\u0117\7\3\2\2\u010f\u0114\5\30\r\2\u0110\u0111") + buf.write("\7\5\2\2\u0111\u0113\5\30\r\2\u0112\u0110\3\2\2\2\u0113") + buf.write("\u0116\3\2\2\2\u0114\u0112\3\2\2\2\u0114\u0115\3\2\2\2") + buf.write("\u0115\u0118\3\2\2\2\u0116\u0114\3\2\2\2\u0117\u010f\3") + buf.write("\2\2\2\u0117\u0118\3\2\2\2\u0118\u0119\3\2\2\2\u0119\u011a") + buf.write("\7\4\2\2\u011a\u011b\7\20\2\2\u011b\u011f\5\30\r\2\u011c") + buf.write("\u011f\7\24\2\2\u011d\u011f\7)\2\2\u011e\u00f1\3\2\2\2") + buf.write("\u011e\u00f3\3\2\2\2\u011e\u00f8\3\2\2\2\u011e\u0102\3") + buf.write("\2\2\2\u011e\u0103\3\2\2\2\u011e\u010a\3\2\2\2\u011e\u011c") + 
buf.write("\3\2\2\2\u011e\u011d\3\2\2\2\u011f\31\3\2\2\2\u0120\u0121") + buf.write("\7\3\2\2\u0121\u0132\7\4\2\2\u0122\u0123\7\3\2\2\u0123") + buf.write("\u0124\5\34\17\2\u0124\u0125\7\5\2\2\u0125\u0126\7\4\2") + buf.write("\2\u0126\u0132\3\2\2\2\u0127\u0128\7\3\2\2\u0128\u012b") + buf.write("\5\34\17\2\u0129\u012a\7\5\2\2\u012a\u012c\5\34\17\2\u012b") + buf.write("\u0129\3\2\2\2\u012c\u012d\3\2\2\2\u012d\u012b\3\2\2\2") + buf.write("\u012d\u012e\3\2\2\2\u012e\u012f\3\2\2\2\u012f\u0130\7") + buf.write("\4\2\2\u0130\u0132\3\2\2\2\u0131\u0120\3\2\2\2\u0131\u0122") + buf.write("\3\2\2\2\u0131\u0127\3\2\2\2\u0132\33\3\2\2\2\u0133\u0134") + buf.write("\7\3\2\2\u0134\u0135\5\34\17\2\u0135\u0136\7\4\2\2\u0136") + buf.write("\u0139\3\2\2\2\u0137\u0139\7)\2\2\u0138\u0133\3\2\2\2") + buf.write("\u0138\u0137\3\2\2\2\u0139\35\3\2\2\2\u013a\u013b\7*\2") + buf.write("\2\u013b\37\3\2\2\2\u013c\u013d\7\r\2\2\u013d\u013e\5") + buf.write("\6\4\2\u013e\u013f\7\16\2\2\u013f!\3\2\2\2\u0140\u0144") + buf.write("\7(\2\2\u0141\u0144\7)\2\2\u0142\u0144\7\'\2\2\u0143\u0140") + buf.write("\3\2\2\2\u0143\u0141\3\2\2\2\u0143\u0142\3\2\2\2\u0144") + buf.write("#\3\2\2\2\u0145\u014a\5\2\2\2\u0146\u014a\7#\2\2\u0147") + buf.write("\u014a\7$\2\2\u0148\u014a\7%\2\2\u0149\u0145\3\2\2\2\u0149") + buf.write("\u0146\3\2\2\2\u0149\u0147\3\2\2\2\u0149\u0148\3\2\2\2") + buf.write("\u014a%\3\2\2\2$,\60ISVcm\177\u0097\u009a\u009d\u009f") + buf.write("\u00a4\u00ab\u00b2\u00b9\u00c3\u00ca\u00cd\u00d2\u00d9") + buf.write("\u00dc\u00ea\u00ef\u00fe\u010c\u0114\u0117\u011e\u012d") + buf.write("\u0131\u0138\u0143\u0149") + return buf.getvalue() + + +class RelayParser ( Parser ): + + grammarFileName = "Relay.g4" + + atn = ATNDeserializer().deserialize(serializedATN()) + + decisionsToDFA = [ DFA(ds, i) for i, ds in enumerate(atn.decisionToState) ] + + sharedContextCache = PredictionContextCache() + + literalNames = [ "", "'('", "')'", "','", "'['", "']'", "'if'", + "'else'", "'let'", "'='", "';'", "'{'", "'}'", 
"'fn'", + "'->'", "'def'", "':'", "'Tensor'", "'_'", "'v0.0.2'", + "", "", "", "'*'", "'/'", + "'+'", "'-'", "'<'", "'>'", "'<='", "'>='", "'=='", + "'!='", "", "", "", "'mut'" ] + + symbolicNames = [ "", "", "", "", + "", "", "", "", + "", "", "", "", + "", "", "", "", + "", "", "", "SEMVER", "WS", + "LINE_COMMENT", "COMMENT", "MUL", "DIV", "ADD", "SUB", + "LT", "GT", "LE", "GE", "EQ", "NE", "GLOBAL_VAR", + "LOCAL_VAR", "GRAPH_VAR", "MUT", "BOOL_LIT", "FLOAT", + "NAT", "CNAME" ] + + RULE_opIdent = 0 + RULE_prog = 1 + RULE_expr = 2 + RULE_func = 3 + RULE_defn = 4 + RULE_argList = 5 + RULE_varList = 6 + RULE_var = 7 + RULE_attrList = 8 + RULE_attr = 9 + RULE_typeParamSeq = 10 + RULE_type_ = 11 + RULE_shapeSeq = 12 + RULE_shape = 13 + RULE_typeIdent = 14 + RULE_body = 15 + RULE_scalar = 16 + RULE_ident = 17 + + ruleNames = [ "opIdent", "prog", "expr", "func", "defn", "argList", + "varList", "var", "attrList", "attr", "typeParamSeq", + "type_", "shapeSeq", "shape", "typeIdent", "body", "scalar", + "ident" ] + + EOF = Token.EOF + T__0=1 + T__1=2 + T__2=3 + T__3=4 + T__4=5 + T__5=6 + T__6=7 + T__7=8 + T__8=9 + T__9=10 + T__10=11 + T__11=12 + T__12=13 + T__13=14 + T__14=15 + T__15=16 + T__16=17 + T__17=18 + SEMVER=19 + WS=20 + LINE_COMMENT=21 + COMMENT=22 + MUL=23 + DIV=24 + ADD=25 + SUB=26 + LT=27 + GT=28 + LE=29 + GE=30 + EQ=31 + NE=32 + GLOBAL_VAR=33 + LOCAL_VAR=34 + GRAPH_VAR=35 + MUT=36 + BOOL_LIT=37 + FLOAT=38 + NAT=39 + CNAME=40 + + def __init__(self, input:TokenStream, output:TextIO = sys.stdout): + super().__init__(input, output) + self.checkVersion("4.7.2") + self._interp = ParserATNSimulator(self, self.atn, self.decisionsToDFA, self.sharedContextCache) + self._predicates = None + + + + + class OpIdentContext(ParserRuleContext): + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + def CNAME(self): + return self.getToken(RelayParser.CNAME, 0) + + def 
getRuleIndex(self): + return RelayParser.RULE_opIdent + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitOpIdent" ): + return visitor.visitOpIdent(self) + else: + return visitor.visitChildren(self) + + + + + def opIdent(self): + + localctx = RelayParser.OpIdentContext(self, self._ctx, self.state) + self.enterRule(localctx, 0, self.RULE_opIdent) + try: + self.enterOuterAlt(localctx, 1) + self.state = 36 + self.match(RelayParser.CNAME) + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class ProgContext(ParserRuleContext): + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + def SEMVER(self): + return self.getToken(RelayParser.SEMVER, 0) + + def EOF(self): + return self.getToken(RelayParser.EOF, 0) + + def expr(self): + return self.getTypedRuleContext(RelayParser.ExprContext,0) + + + def defn(self, i:int=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.DefnContext) + else: + return self.getTypedRuleContext(RelayParser.DefnContext,i) + + + def getRuleIndex(self): + return RelayParser.RULE_prog + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitProg" ): + return visitor.visitProg(self) + else: + return visitor.visitChildren(self) + + + + + def prog(self): + + localctx = RelayParser.ProgContext(self, self._ctx, self.state) + self.enterRule(localctx, 2, self.RULE_prog) + self._la = 0 # Token type + try: + self.enterOuterAlt(localctx, 1) + self.state = 38 + self.match(RelayParser.SEMVER) + self.state = 46 + self._errHandler.sync(self) + token = self._input.LA(1) + if token in [RelayParser.EOF, RelayParser.T__14]: + self.state = 42 + self._errHandler.sync(self) + _la = self._input.LA(1) + while _la==RelayParser.T__14: + self.state = 39 + self.defn() + self.state 
= 44 + self._errHandler.sync(self) + _la = self._input.LA(1) + + pass + elif token in [RelayParser.T__0, RelayParser.T__3, RelayParser.T__5, RelayParser.T__7, RelayParser.T__12, RelayParser.SUB, RelayParser.GLOBAL_VAR, RelayParser.LOCAL_VAR, RelayParser.GRAPH_VAR, RelayParser.BOOL_LIT, RelayParser.FLOAT, RelayParser.NAT, RelayParser.CNAME]: + self.state = 45 + self.expr(0) + pass + else: + raise NoViableAltException(self) + + self.state = 48 + self.match(RelayParser.EOF) + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class ExprContext(ParserRuleContext): + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + + def getRuleIndex(self): + return RelayParser.RULE_expr + + + def copyFrom(self, ctx:ParserRuleContext): + super().copyFrom(ctx) + + + class IdentExprContext(ExprContext): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.ExprContext + super().__init__(parser) + self.copyFrom(ctx) + + def ident(self): + return self.getTypedRuleContext(RelayParser.IdentContext,0) + + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitIdentExpr" ): + return visitor.visitIdentExpr(self) + else: + return visitor.visitChildren(self) + + + class CallContext(ExprContext): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.ExprContext + super().__init__(parser) + self.copyFrom(ctx) + + def expr(self, i:int=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.ExprContext) + else: + return self.getTypedRuleContext(RelayParser.ExprContext,i) + + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitCall" ): + return visitor.visitCall(self) + else: + return visitor.visitChildren(self) + + + class NegContext(ExprContext): + + def 
__init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.ExprContext + super().__init__(parser) + self.copyFrom(ctx) + + def SUB(self): + return self.getToken(RelayParser.SUB, 0) + def expr(self): + return self.getTypedRuleContext(RelayParser.ExprContext,0) + + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitNeg" ): + return visitor.visitNeg(self) + else: + return visitor.visitChildren(self) + + + class TupleContext(ExprContext): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.ExprContext + super().__init__(parser) + self.copyFrom(ctx) + + def expr(self, i:int=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.ExprContext) + else: + return self.getTypedRuleContext(RelayParser.ExprContext,i) + + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitTuple" ): + return visitor.visitTuple(self) + else: + return visitor.visitChildren(self) + + + class ParensContext(ExprContext): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.ExprContext + super().__init__(parser) + self.copyFrom(ctx) + + def expr(self): + return self.getTypedRuleContext(RelayParser.ExprContext,0) + + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitParens" ): + return visitor.visitParens(self) + else: + return visitor.visitChildren(self) + + + class FuncExprContext(ExprContext): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.ExprContext + super().__init__(parser) + self.copyFrom(ctx) + + def func(self): + return self.getTypedRuleContext(RelayParser.FuncContext,0) + + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitFuncExpr" ): + return visitor.visitFuncExpr(self) + else: + return visitor.visitChildren(self) + + + class ScalarExprContext(ExprContext): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.ExprContext + super().__init__(parser) + 
self.copyFrom(ctx) + + def scalar(self): + return self.getTypedRuleContext(RelayParser.ScalarContext,0) + + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitScalarExpr" ): + return visitor.visitScalarExpr(self) + else: + return visitor.visitChildren(self) + + + class LetContext(ExprContext): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.ExprContext + super().__init__(parser) + self.copyFrom(ctx) + + def var(self): + return self.getTypedRuleContext(RelayParser.VarContext,0) + + def expr(self, i:int=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.ExprContext) + else: + return self.getTypedRuleContext(RelayParser.ExprContext,i) + + def MUT(self): + return self.getToken(RelayParser.MUT, 0) + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitLet" ): + return visitor.visitLet(self) + else: + return visitor.visitChildren(self) + + + class TensorContext(ExprContext): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.ExprContext + super().__init__(parser) + self.copyFrom(ctx) + + def expr(self, i:int=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.ExprContext) + else: + return self.getTypedRuleContext(RelayParser.ExprContext,i) + + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitTensor" ): + return visitor.visitTensor(self) + else: + return visitor.visitChildren(self) + + + class IfElseContext(ExprContext): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.ExprContext + super().__init__(parser) + self.copyFrom(ctx) + + def expr(self): + return self.getTypedRuleContext(RelayParser.ExprContext,0) + + def body(self, i:int=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.BodyContext) + else: + return self.getTypedRuleContext(RelayParser.BodyContext,i) + + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitIfElse" ): + 
return visitor.visitIfElse(self) + else: + return visitor.visitChildren(self) + + + class GraphContext(ExprContext): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.ExprContext + super().__init__(parser) + self.copyFrom(ctx) + + def ident(self): + return self.getTypedRuleContext(RelayParser.IdentContext,0) + + def expr(self, i:int=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.ExprContext) + else: + return self.getTypedRuleContext(RelayParser.ExprContext,i) + + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitGraph" ): + return visitor.visitGraph(self) + else: + return visitor.visitChildren(self) + + + class BinOpContext(ExprContext): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.ExprContext + super().__init__(parser) + self.op = None # Token + self.copyFrom(ctx) + + def expr(self, i:int=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.ExprContext) + else: + return self.getTypedRuleContext(RelayParser.ExprContext,i) + + def MUL(self): + return self.getToken(RelayParser.MUL, 0) + def DIV(self): + return self.getToken(RelayParser.DIV, 0) + def ADD(self): + return self.getToken(RelayParser.ADD, 0) + def SUB(self): + return self.getToken(RelayParser.SUB, 0) + def LT(self): + return self.getToken(RelayParser.LT, 0) + def GT(self): + return self.getToken(RelayParser.GT, 0) + def LE(self): + return self.getToken(RelayParser.LE, 0) + def GE(self): + return self.getToken(RelayParser.GE, 0) + def EQ(self): + return self.getToken(RelayParser.EQ, 0) + def NE(self): + return self.getToken(RelayParser.NE, 0) + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitBinOp" ): + return visitor.visitBinOp(self) + else: + return visitor.visitChildren(self) + + + + def expr(self, _p:int=0): + _parentctx = self._ctx + _parentState = self.state + localctx = RelayParser.ExprContext(self, self._ctx, _parentState) + _prevctx = localctx 
+ _startState = 4 + self.enterRecursionRule(localctx, 4, self.RULE_expr, _p) + self._la = 0 # Token type + try: + self.enterOuterAlt(localctx, 1) + self.state = 125 + self._errHandler.sync(self) + la_ = self._interp.adaptivePredict(self._input,7,self._ctx) + if la_ == 1: + localctx = RelayParser.ParensContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + + self.state = 51 + self.match(RelayParser.T__0) + self.state = 52 + self.expr(0) + self.state = 53 + self.match(RelayParser.T__1) + pass + + elif la_ == 2: + localctx = RelayParser.NegContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 55 + self.match(RelayParser.SUB) + self.state = 56 + self.expr(17) + pass + + elif la_ == 3: + localctx = RelayParser.FuncExprContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 57 + self.func() + pass + + elif la_ == 4: + localctx = RelayParser.TupleContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 58 + self.match(RelayParser.T__0) + self.state = 59 + self.match(RelayParser.T__1) + pass + + elif la_ == 5: + localctx = RelayParser.TupleContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 60 + self.match(RelayParser.T__0) + self.state = 61 + self.expr(0) + self.state = 62 + self.match(RelayParser.T__2) + self.state = 63 + self.match(RelayParser.T__1) + pass + + elif la_ == 6: + localctx = RelayParser.TupleContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 65 + self.match(RelayParser.T__0) + self.state = 66 + self.expr(0) + self.state = 69 + self._errHandler.sync(self) + _la = self._input.LA(1) + while True: + self.state = 67 + self.match(RelayParser.T__2) + self.state = 68 + self.expr(0) + self.state = 71 + self._errHandler.sync(self) + _la = self._input.LA(1) + if not (_la==RelayParser.T__2): + break + + self.state = 73 + self.match(RelayParser.T__1) + pass + + elif la_ == 7: + localctx = 
RelayParser.TensorContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 75 + self.match(RelayParser.T__3) + self.state = 84 + self._errHandler.sync(self) + _la = self._input.LA(1) + if (((_la) & ~0x3f) == 0 and ((1 << _la) & ((1 << RelayParser.T__0) | (1 << RelayParser.T__3) | (1 << RelayParser.T__5) | (1 << RelayParser.T__7) | (1 << RelayParser.T__12) | (1 << RelayParser.SUB) | (1 << RelayParser.GLOBAL_VAR) | (1 << RelayParser.LOCAL_VAR) | (1 << RelayParser.GRAPH_VAR) | (1 << RelayParser.BOOL_LIT) | (1 << RelayParser.FLOAT) | (1 << RelayParser.NAT) | (1 << RelayParser.CNAME))) != 0): + self.state = 76 + self.expr(0) + self.state = 81 + self._errHandler.sync(self) + _la = self._input.LA(1) + while _la==RelayParser.T__2: + self.state = 77 + self.match(RelayParser.T__2) + self.state = 78 + self.expr(0) + self.state = 83 + self._errHandler.sync(self) + _la = self._input.LA(1) + + + + self.state = 86 + self.match(RelayParser.T__4) + pass + + elif la_ == 8: + localctx = RelayParser.IfElseContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 87 + self.match(RelayParser.T__5) + self.state = 88 + self.match(RelayParser.T__0) + self.state = 89 + self.expr(0) + self.state = 90 + self.match(RelayParser.T__1) + self.state = 91 + self.body() + self.state = 92 + self.match(RelayParser.T__6) + self.state = 93 + self.body() + pass + + elif la_ == 9: + localctx = RelayParser.LetContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 95 + self.match(RelayParser.T__7) + self.state = 97 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.MUT: + self.state = 96 + self.match(RelayParser.MUT) + + + self.state = 99 + self.var() + self.state = 100 + self.match(RelayParser.T__8) + self.state = 101 + self.expr(0) + self.state = 102 + self.match(RelayParser.T__9) + self.state = 103 + self.expr(6) + pass + + elif la_ == 10: + localctx = RelayParser.LetContext(self, localctx) + 
self._ctx = localctx + _prevctx = localctx + self.state = 105 + self.match(RelayParser.T__7) + self.state = 107 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.MUT: + self.state = 106 + self.match(RelayParser.MUT) + + + self.state = 109 + self.var() + self.state = 110 + self.match(RelayParser.T__8) + self.state = 111 + self.match(RelayParser.T__10) + self.state = 112 + self.expr(0) + self.state = 113 + self.match(RelayParser.T__11) + self.state = 114 + self.match(RelayParser.T__9) + self.state = 115 + self.expr(5) + pass + + elif la_ == 11: + localctx = RelayParser.GraphContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 117 + self.ident() + self.state = 118 + self.match(RelayParser.T__8) + self.state = 119 + self.expr(0) + self.state = 120 + self.match(RelayParser.T__9) + self.state = 121 + self.expr(3) + pass + + elif la_ == 12: + localctx = RelayParser.IdentExprContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 123 + self.ident() + pass + + elif la_ == 13: + localctx = RelayParser.ScalarExprContext(self, localctx) + self._ctx = localctx + _prevctx = localctx + self.state = 124 + self.scalar() + pass + + + self._ctx.stop = self._input.LT(-1) + self.state = 157 + self._errHandler.sync(self) + _alt = self._interp.adaptivePredict(self._input,11,self._ctx) + while _alt!=2 and _alt!=ATN.INVALID_ALT_NUMBER: + if _alt==1: + if self._parseListeners is not None: + self.triggerExitRuleEvent() + _prevctx = localctx + self.state = 155 + self._errHandler.sync(self) + la_ = self._interp.adaptivePredict(self._input,10,self._ctx) + if la_ == 1: + localctx = RelayParser.BinOpContext(self, RelayParser.ExprContext(self, _parentctx, _parentState)) + self.pushNewRecursionContext(localctx, _startState, self.RULE_expr) + self.state = 127 + if not self.precpred(self._ctx, 16): + from antlr4.error.Errors import FailedPredicateException + raise FailedPredicateException(self, 
"self.precpred(self._ctx, 16)") + self.state = 128 + localctx.op = self._input.LT(1) + _la = self._input.LA(1) + if not(_la==RelayParser.MUL or _la==RelayParser.DIV): + localctx.op = self._errHandler.recoverInline(self) + else: + self._errHandler.reportMatch(self) + self.consume() + self.state = 129 + self.expr(17) + pass + + elif la_ == 2: + localctx = RelayParser.BinOpContext(self, RelayParser.ExprContext(self, _parentctx, _parentState)) + self.pushNewRecursionContext(localctx, _startState, self.RULE_expr) + self.state = 130 + if not self.precpred(self._ctx, 15): + from antlr4.error.Errors import FailedPredicateException + raise FailedPredicateException(self, "self.precpred(self._ctx, 15)") + self.state = 131 + localctx.op = self._input.LT(1) + _la = self._input.LA(1) + if not(_la==RelayParser.ADD or _la==RelayParser.SUB): + localctx.op = self._errHandler.recoverInline(self) + else: + self._errHandler.reportMatch(self) + self.consume() + self.state = 132 + self.expr(16) + pass + + elif la_ == 3: + localctx = RelayParser.BinOpContext(self, RelayParser.ExprContext(self, _parentctx, _parentState)) + self.pushNewRecursionContext(localctx, _startState, self.RULE_expr) + self.state = 133 + if not self.precpred(self._ctx, 14): + from antlr4.error.Errors import FailedPredicateException + raise FailedPredicateException(self, "self.precpred(self._ctx, 14)") + self.state = 134 + localctx.op = self._input.LT(1) + _la = self._input.LA(1) + if not((((_la) & ~0x3f) == 0 and ((1 << _la) & ((1 << RelayParser.LT) | (1 << RelayParser.GT) | (1 << RelayParser.LE) | (1 << RelayParser.GE))) != 0)): + localctx.op = self._errHandler.recoverInline(self) + else: + self._errHandler.reportMatch(self) + self.consume() + self.state = 135 + self.expr(15) + pass + + elif la_ == 4: + localctx = RelayParser.BinOpContext(self, RelayParser.ExprContext(self, _parentctx, _parentState)) + self.pushNewRecursionContext(localctx, _startState, self.RULE_expr) + self.state = 136 + if not 
self.precpred(self._ctx, 13): + from antlr4.error.Errors import FailedPredicateException + raise FailedPredicateException(self, "self.precpred(self._ctx, 13)") + self.state = 137 + localctx.op = self._input.LT(1) + _la = self._input.LA(1) + if not(_la==RelayParser.EQ or _la==RelayParser.NE): + localctx.op = self._errHandler.recoverInline(self) + else: + self._errHandler.reportMatch(self) + self.consume() + self.state = 138 + self.expr(14) + pass + + elif la_ == 5: + localctx = RelayParser.LetContext(self, RelayParser.ExprContext(self, _parentctx, _parentState)) + self.pushNewRecursionContext(localctx, _startState, self.RULE_expr) + self.state = 139 + if not self.precpred(self._ctx, 4): + from antlr4.error.Errors import FailedPredicateException + raise FailedPredicateException(self, "self.precpred(self._ctx, 4)") + self.state = 140 + self.match(RelayParser.T__9) + self.state = 141 + self.expr(5) + pass + + elif la_ == 6: + localctx = RelayParser.CallContext(self, RelayParser.ExprContext(self, _parentctx, _parentState)) + self.pushNewRecursionContext(localctx, _startState, self.RULE_expr) + self.state = 142 + if not self.precpred(self._ctx, 18): + from antlr4.error.Errors import FailedPredicateException + raise FailedPredicateException(self, "self.precpred(self._ctx, 18)") + self.state = 143 + self.match(RelayParser.T__0) + self.state = 152 + self._errHandler.sync(self) + _la = self._input.LA(1) + if (((_la) & ~0x3f) == 0 and ((1 << _la) & ((1 << RelayParser.T__0) | (1 << RelayParser.T__3) | (1 << RelayParser.T__5) | (1 << RelayParser.T__7) | (1 << RelayParser.T__12) | (1 << RelayParser.SUB) | (1 << RelayParser.GLOBAL_VAR) | (1 << RelayParser.LOCAL_VAR) | (1 << RelayParser.GRAPH_VAR) | (1 << RelayParser.BOOL_LIT) | (1 << RelayParser.FLOAT) | (1 << RelayParser.NAT) | (1 << RelayParser.CNAME))) != 0): + self.state = 144 + self.expr(0) + self.state = 149 + self._errHandler.sync(self) + _la = self._input.LA(1) + while _la==RelayParser.T__2: + self.state = 145 + 
self.match(RelayParser.T__2) + self.state = 146 + self.expr(0) + self.state = 151 + self._errHandler.sync(self) + _la = self._input.LA(1) + + + + self.state = 154 + self.match(RelayParser.T__1) + pass + + + self.state = 159 + self._errHandler.sync(self) + _alt = self._interp.adaptivePredict(self._input,11,self._ctx) + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.unrollRecursionContexts(_parentctx) + return localctx + + + class FuncContext(ParserRuleContext): + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + def argList(self): + return self.getTypedRuleContext(RelayParser.ArgListContext,0) + + + def body(self): + return self.getTypedRuleContext(RelayParser.BodyContext,0) + + + def typeParamSeq(self): + return self.getTypedRuleContext(RelayParser.TypeParamSeqContext,0) + + + def type_(self): + return self.getTypedRuleContext(RelayParser.Type_Context,0) + + + def getRuleIndex(self): + return RelayParser.RULE_func + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitFunc" ): + return visitor.visitFunc(self) + else: + return visitor.visitChildren(self) + + + + + def func(self): + + localctx = RelayParser.FuncContext(self, self._ctx, self.state) + self.enterRule(localctx, 6, self.RULE_func) + self._la = 0 # Token type + try: + self.enterOuterAlt(localctx, 1) + self.state = 160 + self.match(RelayParser.T__12) + self.state = 162 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.T__3: + self.state = 161 + self.typeParamSeq() + + + self.state = 164 + self.match(RelayParser.T__0) + self.state = 165 + self.argList() + self.state = 166 + self.match(RelayParser.T__1) + self.state = 169 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.T__13: + self.state = 167 + 
self.match(RelayParser.T__13) + self.state = 168 + self.type_() + + + self.state = 171 + self.body() + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class DefnContext(ParserRuleContext): + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + def ident(self): + return self.getTypedRuleContext(RelayParser.IdentContext,0) + + + def argList(self): + return self.getTypedRuleContext(RelayParser.ArgListContext,0) + + + def body(self): + return self.getTypedRuleContext(RelayParser.BodyContext,0) + + + def typeParamSeq(self): + return self.getTypedRuleContext(RelayParser.TypeParamSeqContext,0) + + + def type_(self): + return self.getTypedRuleContext(RelayParser.Type_Context,0) + + + def getRuleIndex(self): + return RelayParser.RULE_defn + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitDefn" ): + return visitor.visitDefn(self) + else: + return visitor.visitChildren(self) + + + + + def defn(self): + + localctx = RelayParser.DefnContext(self, self._ctx, self.state) + self.enterRule(localctx, 8, self.RULE_defn) + self._la = 0 # Token type + try: + self.enterOuterAlt(localctx, 1) + self.state = 173 + self.match(RelayParser.T__14) + self.state = 174 + self.ident() + self.state = 176 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.T__3: + self.state = 175 + self.typeParamSeq() + + + self.state = 178 + self.match(RelayParser.T__0) + self.state = 179 + self.argList() + self.state = 180 + self.match(RelayParser.T__1) + self.state = 183 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.T__13: + self.state = 181 + self.match(RelayParser.T__13) + self.state = 182 + self.type_() + + + self.state = 185 + self.body() + except RecognitionException as re: + 
localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class ArgListContext(ParserRuleContext): + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + def varList(self): + return self.getTypedRuleContext(RelayParser.VarListContext,0) + + + def attrList(self): + return self.getTypedRuleContext(RelayParser.AttrListContext,0) + + + def getRuleIndex(self): + return RelayParser.RULE_argList + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitArgList" ): + return visitor.visitArgList(self) + else: + return visitor.visitChildren(self) + + + + + def argList(self): + + localctx = RelayParser.ArgListContext(self, self._ctx, self.state) + self.enterRule(localctx, 10, self.RULE_argList) + try: + self.state = 193 + self._errHandler.sync(self) + la_ = self._interp.adaptivePredict(self._input,16,self._ctx) + if la_ == 1: + self.enterOuterAlt(localctx, 1) + self.state = 187 + self.varList() + pass + + elif la_ == 2: + self.enterOuterAlt(localctx, 2) + self.state = 188 + self.attrList() + pass + + elif la_ == 3: + self.enterOuterAlt(localctx, 3) + self.state = 189 + self.varList() + self.state = 190 + self.match(RelayParser.T__2) + self.state = 191 + self.attrList() + pass + + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class VarListContext(ParserRuleContext): + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + def var(self, i:int=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.VarContext) + else: + return self.getTypedRuleContext(RelayParser.VarContext,i) + + + def getRuleIndex(self): + 
return RelayParser.RULE_varList + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitVarList" ): + return visitor.visitVarList(self) + else: + return visitor.visitChildren(self) + + + + + def varList(self): + + localctx = RelayParser.VarListContext(self, self._ctx, self.state) + self.enterRule(localctx, 12, self.RULE_varList) + self._la = 0 # Token type + try: + self.enterOuterAlt(localctx, 1) + self.state = 203 + self._errHandler.sync(self) + _la = self._input.LA(1) + if (((_la) & ~0x3f) == 0 and ((1 << _la) & ((1 << RelayParser.GLOBAL_VAR) | (1 << RelayParser.LOCAL_VAR) | (1 << RelayParser.GRAPH_VAR) | (1 << RelayParser.CNAME))) != 0): + self.state = 195 + self.var() + self.state = 200 + self._errHandler.sync(self) + _alt = self._interp.adaptivePredict(self._input,17,self._ctx) + while _alt!=2 and _alt!=ATN.INVALID_ALT_NUMBER: + if _alt==1: + self.state = 196 + self.match(RelayParser.T__2) + self.state = 197 + self.var() + self.state = 202 + self._errHandler.sync(self) + _alt = self._interp.adaptivePredict(self._input,17,self._ctx) + + + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class VarContext(ParserRuleContext): + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + def ident(self): + return self.getTypedRuleContext(RelayParser.IdentContext,0) + + + def type_(self): + return self.getTypedRuleContext(RelayParser.Type_Context,0) + + + def getRuleIndex(self): + return RelayParser.RULE_var + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitVar" ): + return visitor.visitVar(self) + else: + return visitor.visitChildren(self) + + + + + def var(self): + + localctx = RelayParser.VarContext(self, self._ctx, self.state) + self.enterRule(localctx, 14, self.RULE_var) + self._la 
= 0 # Token type + try: + self.enterOuterAlt(localctx, 1) + self.state = 205 + self.ident() + self.state = 208 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.T__15: + self.state = 206 + self.match(RelayParser.T__15) + self.state = 207 + self.type_() + + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class AttrListContext(ParserRuleContext): + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + def attr(self, i:int=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.AttrContext) + else: + return self.getTypedRuleContext(RelayParser.AttrContext,i) + + + def getRuleIndex(self): + return RelayParser.RULE_attrList + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitAttrList" ): + return visitor.visitAttrList(self) + else: + return visitor.visitChildren(self) + + + + + def attrList(self): + + localctx = RelayParser.AttrListContext(self, self._ctx, self.state) + self.enterRule(localctx, 16, self.RULE_attrList) + self._la = 0 # Token type + try: + self.enterOuterAlt(localctx, 1) + self.state = 218 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.CNAME: + self.state = 210 + self.attr() + self.state = 215 + self._errHandler.sync(self) + _la = self._input.LA(1) + while _la==RelayParser.T__2: + self.state = 211 + self.match(RelayParser.T__2) + self.state = 212 + self.attr() + self.state = 217 + self._errHandler.sync(self) + _la = self._input.LA(1) + + + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class AttrContext(ParserRuleContext): + + def __init__(self, parser, 
parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + def CNAME(self): + return self.getToken(RelayParser.CNAME, 0) + + def expr(self): + return self.getTypedRuleContext(RelayParser.ExprContext,0) + + + def getRuleIndex(self): + return RelayParser.RULE_attr + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitAttr" ): + return visitor.visitAttr(self) + else: + return visitor.visitChildren(self) + + + + + def attr(self): + + localctx = RelayParser.AttrContext(self, self._ctx, self.state) + self.enterRule(localctx, 18, self.RULE_attr) + try: + self.enterOuterAlt(localctx, 1) + self.state = 220 + self.match(RelayParser.CNAME) + self.state = 221 + self.match(RelayParser.T__8) + self.state = 222 + self.expr(0) + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class TypeParamSeqContext(ParserRuleContext): + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + def ident(self, i:int=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.IdentContext) + else: + return self.getTypedRuleContext(RelayParser.IdentContext,i) + + + def getRuleIndex(self): + return RelayParser.RULE_typeParamSeq + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitTypeParamSeq" ): + return visitor.visitTypeParamSeq(self) + else: + return visitor.visitChildren(self) + + + + + def typeParamSeq(self): + + localctx = RelayParser.TypeParamSeqContext(self, self._ctx, self.state) + self.enterRule(localctx, 20, self.RULE_typeParamSeq) + self._la = 0 # Token type + try: + self.state = 237 + self._errHandler.sync(self) + la_ = self._interp.adaptivePredict(self._input,23,self._ctx) + if la_ == 1: + self.enterOuterAlt(localctx, 1) + 
self.state = 224 + self.match(RelayParser.T__3) + self.state = 225 + self.match(RelayParser.T__4) + pass + + elif la_ == 2: + self.enterOuterAlt(localctx, 2) + self.state = 226 + self.match(RelayParser.T__3) + self.state = 227 + self.ident() + self.state = 232 + self._errHandler.sync(self) + _la = self._input.LA(1) + while _la==RelayParser.T__2: + self.state = 228 + self.match(RelayParser.T__2) + self.state = 229 + self.ident() + self.state = 234 + self._errHandler.sync(self) + _la = self._input.LA(1) + + self.state = 235 + self.match(RelayParser.T__4) + pass + + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class Type_Context(ParserRuleContext): + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + + def getRuleIndex(self): + return RelayParser.RULE_type_ + + + def copyFrom(self, ctx:ParserRuleContext): + super().copyFrom(ctx) + + + + class IntTypeContext(Type_Context): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.Type_Context + super().__init__(parser) + self.copyFrom(ctx) + + def NAT(self): + return self.getToken(RelayParser.NAT, 0) + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitIntType" ): + return visitor.visitIntType(self) + else: + return visitor.visitChildren(self) + + + class TupleTypeContext(Type_Context): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.Type_Context + super().__init__(parser) + self.copyFrom(ctx) + + def type_(self, i:int=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.Type_Context) + else: + return self.getTypedRuleContext(RelayParser.Type_Context,i) + + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitTupleType" ): + return 
visitor.visitTupleType(self) + else: + return visitor.visitChildren(self) + + + class TypeIdentTypeContext(Type_Context): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.Type_Context + super().__init__(parser) + self.copyFrom(ctx) + + def typeIdent(self): + return self.getTypedRuleContext(RelayParser.TypeIdentContext,0) + + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitTypeIdentType" ): + return visitor.visitTypeIdentType(self) + else: + return visitor.visitChildren(self) + + + class IncompleteTypeContext(Type_Context): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.Type_Context + super().__init__(parser) + self.copyFrom(ctx) + + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitIncompleteType" ): + return visitor.visitIncompleteType(self) + else: + return visitor.visitChildren(self) + + + class TensorTypeContext(Type_Context): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.Type_Context + super().__init__(parser) + self.copyFrom(ctx) + + def shapeSeq(self): + return self.getTypedRuleContext(RelayParser.ShapeSeqContext,0) + + def type_(self): + return self.getTypedRuleContext(RelayParser.Type_Context,0) + + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitTensorType" ): + return visitor.visitTensorType(self) + else: + return visitor.visitChildren(self) + + + class FuncTypeContext(Type_Context): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.Type_Context + super().__init__(parser) + self.copyFrom(ctx) + + def type_(self, i:int=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.Type_Context) + else: + return self.getTypedRuleContext(RelayParser.Type_Context,i) + + def typeParamSeq(self): + return self.getTypedRuleContext(RelayParser.TypeParamSeqContext,0) + + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitFuncType" 
): + return visitor.visitFuncType(self) + else: + return visitor.visitChildren(self) + + + + def type_(self): + + localctx = RelayParser.Type_Context(self, self._ctx, self.state) + self.enterRule(localctx, 22, self.RULE_type_) + self._la = 0 # Token type + try: + self.state = 284 + self._errHandler.sync(self) + la_ = self._interp.adaptivePredict(self._input,28,self._ctx) + if la_ == 1: + localctx = RelayParser.TupleTypeContext(self, localctx) + self.enterOuterAlt(localctx, 1) + self.state = 239 + self.match(RelayParser.T__0) + self.state = 240 + self.match(RelayParser.T__1) + pass + + elif la_ == 2: + localctx = RelayParser.TupleTypeContext(self, localctx) + self.enterOuterAlt(localctx, 2) + self.state = 241 + self.match(RelayParser.T__0) + self.state = 242 + self.type_() + self.state = 243 + self.match(RelayParser.T__2) + self.state = 244 + self.match(RelayParser.T__1) + pass + + elif la_ == 3: + localctx = RelayParser.TupleTypeContext(self, localctx) + self.enterOuterAlt(localctx, 3) + self.state = 246 + self.match(RelayParser.T__0) + self.state = 247 + self.type_() + self.state = 250 + self._errHandler.sync(self) + _la = self._input.LA(1) + while True: + self.state = 248 + self.match(RelayParser.T__2) + self.state = 249 + self.type_() + self.state = 252 + self._errHandler.sync(self) + _la = self._input.LA(1) + if not (_la==RelayParser.T__2): + break + + self.state = 254 + self.match(RelayParser.T__1) + pass + + elif la_ == 4: + localctx = RelayParser.TypeIdentTypeContext(self, localctx) + self.enterOuterAlt(localctx, 4) + self.state = 256 + self.typeIdent() + pass + + elif la_ == 5: + localctx = RelayParser.TensorTypeContext(self, localctx) + self.enterOuterAlt(localctx, 5) + self.state = 257 + self.match(RelayParser.T__16) + self.state = 258 + self.match(RelayParser.T__3) + self.state = 259 + self.shapeSeq() + self.state = 260 + self.match(RelayParser.T__2) + self.state = 261 + self.type_() + self.state = 262 + self.match(RelayParser.T__4) + pass + + elif la_ 
== 6: + localctx = RelayParser.FuncTypeContext(self, localctx) + self.enterOuterAlt(localctx, 6) + self.state = 264 + self.match(RelayParser.T__12) + self.state = 266 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==RelayParser.T__3: + self.state = 265 + self.typeParamSeq() + + + self.state = 268 + self.match(RelayParser.T__0) + self.state = 277 + self._errHandler.sync(self) + _la = self._input.LA(1) + if (((_la) & ~0x3f) == 0 and ((1 << _la) & ((1 << RelayParser.T__0) | (1 << RelayParser.T__12) | (1 << RelayParser.T__16) | (1 << RelayParser.T__17) | (1 << RelayParser.NAT) | (1 << RelayParser.CNAME))) != 0): + self.state = 269 + self.type_() + self.state = 274 + self._errHandler.sync(self) + _la = self._input.LA(1) + while _la==RelayParser.T__2: + self.state = 270 + self.match(RelayParser.T__2) + self.state = 271 + self.type_() + self.state = 276 + self._errHandler.sync(self) + _la = self._input.LA(1) + + + + self.state = 279 + self.match(RelayParser.T__1) + self.state = 280 + self.match(RelayParser.T__13) + self.state = 281 + self.type_() + pass + + elif la_ == 7: + localctx = RelayParser.IncompleteTypeContext(self, localctx) + self.enterOuterAlt(localctx, 7) + self.state = 282 + self.match(RelayParser.T__17) + pass + + elif la_ == 8: + localctx = RelayParser.IntTypeContext(self, localctx) + self.enterOuterAlt(localctx, 8) + self.state = 283 + self.match(RelayParser.NAT) + pass + + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class ShapeSeqContext(ParserRuleContext): + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + def shape(self, i:int=None): + if i is None: + return self.getTypedRuleContexts(RelayParser.ShapeContext) + else: + return self.getTypedRuleContext(RelayParser.ShapeContext,i) + + + def 
getRuleIndex(self): + return RelayParser.RULE_shapeSeq + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitShapeSeq" ): + return visitor.visitShapeSeq(self) + else: + return visitor.visitChildren(self) + + + + + def shapeSeq(self): + + localctx = RelayParser.ShapeSeqContext(self, self._ctx, self.state) + self.enterRule(localctx, 24, self.RULE_shapeSeq) + self._la = 0 # Token type + try: + self.state = 303 + self._errHandler.sync(self) + la_ = self._interp.adaptivePredict(self._input,30,self._ctx) + if la_ == 1: + self.enterOuterAlt(localctx, 1) + self.state = 286 + self.match(RelayParser.T__0) + self.state = 287 + self.match(RelayParser.T__1) + pass + + elif la_ == 2: + self.enterOuterAlt(localctx, 2) + self.state = 288 + self.match(RelayParser.T__0) + self.state = 289 + self.shape() + self.state = 290 + self.match(RelayParser.T__2) + self.state = 291 + self.match(RelayParser.T__1) + pass + + elif la_ == 3: + self.enterOuterAlt(localctx, 3) + self.state = 293 + self.match(RelayParser.T__0) + self.state = 294 + self.shape() + self.state = 297 + self._errHandler.sync(self) + _la = self._input.LA(1) + while True: + self.state = 295 + self.match(RelayParser.T__2) + self.state = 296 + self.shape() + self.state = 299 + self._errHandler.sync(self) + _la = self._input.LA(1) + if not (_la==RelayParser.T__2): + break + + self.state = 301 + self.match(RelayParser.T__1) + pass + + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class ShapeContext(ParserRuleContext): + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + + def getRuleIndex(self): + return RelayParser.RULE_shape + + + def copyFrom(self, ctx:ParserRuleContext): + super().copyFrom(ctx) + + + + class ParensShapeContext(ShapeContext): + + def 
__init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.ShapeContext + super().__init__(parser) + self.copyFrom(ctx) + + def shape(self): + return self.getTypedRuleContext(RelayParser.ShapeContext,0) + + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitParensShape" ): + return visitor.visitParensShape(self) + else: + return visitor.visitChildren(self) + + + class IntShapeContext(ShapeContext): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.ShapeContext + super().__init__(parser) + self.copyFrom(ctx) + + def NAT(self): + return self.getToken(RelayParser.NAT, 0) + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitIntShape" ): + return visitor.visitIntShape(self) + else: + return visitor.visitChildren(self) + + + + def shape(self): + + localctx = RelayParser.ShapeContext(self, self._ctx, self.state) + self.enterRule(localctx, 26, self.RULE_shape) + try: + self.state = 310 + self._errHandler.sync(self) + token = self._input.LA(1) + if token in [RelayParser.T__0]: + localctx = RelayParser.ParensShapeContext(self, localctx) + self.enterOuterAlt(localctx, 1) + self.state = 305 + self.match(RelayParser.T__0) + self.state = 306 + self.shape() + self.state = 307 + self.match(RelayParser.T__1) + pass + elif token in [RelayParser.NAT]: + localctx = RelayParser.IntShapeContext(self, localctx) + self.enterOuterAlt(localctx, 2) + self.state = 309 + self.match(RelayParser.NAT) + pass + else: + raise NoViableAltException(self) + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class TypeIdentContext(ParserRuleContext): + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + def CNAME(self): + return self.getToken(RelayParser.CNAME, 0) + + def 
getRuleIndex(self): + return RelayParser.RULE_typeIdent + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitTypeIdent" ): + return visitor.visitTypeIdent(self) + else: + return visitor.visitChildren(self) + + + + + def typeIdent(self): + + localctx = RelayParser.TypeIdentContext(self, self._ctx, self.state) + self.enterRule(localctx, 28, self.RULE_typeIdent) + try: + self.enterOuterAlt(localctx, 1) + self.state = 312 + self.match(RelayParser.CNAME) + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class BodyContext(ParserRuleContext): + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + def expr(self): + return self.getTypedRuleContext(RelayParser.ExprContext,0) + + + def getRuleIndex(self): + return RelayParser.RULE_body + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitBody" ): + return visitor.visitBody(self) + else: + return visitor.visitChildren(self) + + + + + def body(self): + + localctx = RelayParser.BodyContext(self, self._ctx, self.state) + self.enterRule(localctx, 30, self.RULE_body) + try: + self.enterOuterAlt(localctx, 1) + self.state = 314 + self.match(RelayParser.T__10) + self.state = 315 + self.expr(0) + self.state = 316 + self.match(RelayParser.T__11) + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class ScalarContext(ParserRuleContext): + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + + def getRuleIndex(self): + return RelayParser.RULE_scalar + + + def copyFrom(self, ctx:ParserRuleContext): + 
super().copyFrom(ctx) + + + + class ScalarFloatContext(ScalarContext): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.ScalarContext + super().__init__(parser) + self.copyFrom(ctx) + + def FLOAT(self): + return self.getToken(RelayParser.FLOAT, 0) + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitScalarFloat" ): + return visitor.visitScalarFloat(self) + else: + return visitor.visitChildren(self) + + + class ScalarBoolContext(ScalarContext): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.ScalarContext + super().__init__(parser) + self.copyFrom(ctx) + + def BOOL_LIT(self): + return self.getToken(RelayParser.BOOL_LIT, 0) + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitScalarBool" ): + return visitor.visitScalarBool(self) + else: + return visitor.visitChildren(self) + + + class ScalarIntContext(ScalarContext): + + def __init__(self, parser, ctx:ParserRuleContext): # actually a RelayParser.ScalarContext + super().__init__(parser) + self.copyFrom(ctx) + + def NAT(self): + return self.getToken(RelayParser.NAT, 0) + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitScalarInt" ): + return visitor.visitScalarInt(self) + else: + return visitor.visitChildren(self) + + + + def scalar(self): + + localctx = RelayParser.ScalarContext(self, self._ctx, self.state) + self.enterRule(localctx, 32, self.RULE_scalar) + try: + self.state = 321 + self._errHandler.sync(self) + token = self._input.LA(1) + if token in [RelayParser.FLOAT]: + localctx = RelayParser.ScalarFloatContext(self, localctx) + self.enterOuterAlt(localctx, 1) + self.state = 318 + self.match(RelayParser.FLOAT) + pass + elif token in [RelayParser.NAT]: + localctx = RelayParser.ScalarIntContext(self, localctx) + self.enterOuterAlt(localctx, 2) + self.state = 319 + self.match(RelayParser.NAT) + pass + elif token in [RelayParser.BOOL_LIT]: + localctx = 
RelayParser.ScalarBoolContext(self, localctx) + self.enterOuterAlt(localctx, 3) + self.state = 320 + self.match(RelayParser.BOOL_LIT) + pass + else: + raise NoViableAltException(self) + + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + class IdentContext(ParserRuleContext): + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + def opIdent(self): + return self.getTypedRuleContext(RelayParser.OpIdentContext,0) + + + def GLOBAL_VAR(self): + return self.getToken(RelayParser.GLOBAL_VAR, 0) + + def LOCAL_VAR(self): + return self.getToken(RelayParser.LOCAL_VAR, 0) + + def GRAPH_VAR(self): + return self.getToken(RelayParser.GRAPH_VAR, 0) + + def getRuleIndex(self): + return RelayParser.RULE_ident + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitIdent" ): + return visitor.visitIdent(self) + else: + return visitor.visitChildren(self) + + + + + def ident(self): + + localctx = RelayParser.IdentContext(self, self._ctx, self.state) + self.enterRule(localctx, 34, self.RULE_ident) + try: + self.state = 327 + self._errHandler.sync(self) + token = self._input.LA(1) + if token in [RelayParser.CNAME]: + self.enterOuterAlt(localctx, 1) + self.state = 323 + self.opIdent() + pass + elif token in [RelayParser.GLOBAL_VAR]: + self.enterOuterAlt(localctx, 2) + self.state = 324 + self.match(RelayParser.GLOBAL_VAR) + pass + elif token in [RelayParser.LOCAL_VAR]: + self.enterOuterAlt(localctx, 3) + self.state = 325 + self.match(RelayParser.LOCAL_VAR) + pass + elif token in [RelayParser.GRAPH_VAR]: + self.enterOuterAlt(localctx, 4) + self.state = 326 + self.match(RelayParser.GRAPH_VAR) + pass + else: + raise NoViableAltException(self) + + except RecognitionException as re: + localctx.exception = re + 
self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + + + def sempred(self, localctx:RuleContext, ruleIndex:int, predIndex:int): + if self._predicates == None: + self._predicates = dict() + self._predicates[2] = self.expr_sempred + pred = self._predicates.get(ruleIndex, None) + if pred is None: + raise Exception("No predicate with index:" + str(ruleIndex)) + else: + return pred(localctx, predIndex) + + def expr_sempred(self, localctx:ExprContext, predIndex:int): + if predIndex == 0: + return self.precpred(self._ctx, 16) + + + if predIndex == 1: + return self.precpred(self._ctx, 15) + + + if predIndex == 2: + return self.precpred(self._ctx, 14) + + + if predIndex == 3: + return self.precpred(self._ctx, 13) + + + if predIndex == 4: + return self.precpred(self._ctx, 4) + + + if predIndex == 5: + return self.precpred(self._ctx, 18) + + + + + diff --git a/python/tvm/relay/grammar/py3/RelayVisitor.py b/python/tvm/relay/grammar/py3/RelayVisitor.py new file mode 100644 index 000000000000..64308dca1a3a --- /dev/null +++ b/python/tvm/relay/grammar/py3/RelayVisitor.py @@ -0,0 +1,198 @@ +# Generated from /home/sslyu/tvm/python/tvm/relay/grammar/Relay.g4 by ANTLR 4.7.2 +from antlr4 import * +if __name__ is not None and "." in __name__: + from .RelayParser import RelayParser +else: + from RelayParser import RelayParser + +# This class defines a complete generic visitor for a parse tree produced by RelayParser. + +class RelayVisitor(ParseTreeVisitor): + + # Visit a parse tree produced by RelayParser#opIdent. + def visitOpIdent(self, ctx:RelayParser.OpIdentContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#prog. + def visitProg(self, ctx:RelayParser.ProgContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#identExpr. 
+ def visitIdentExpr(self, ctx:RelayParser.IdentExprContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#call. + def visitCall(self, ctx:RelayParser.CallContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#neg. + def visitNeg(self, ctx:RelayParser.NegContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#tuple. + def visitTuple(self, ctx:RelayParser.TupleContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#parens. + def visitParens(self, ctx:RelayParser.ParensContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#funcExpr. + def visitFuncExpr(self, ctx:RelayParser.FuncExprContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#scalarExpr. + def visitScalarExpr(self, ctx:RelayParser.ScalarExprContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#let. + def visitLet(self, ctx:RelayParser.LetContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#tensor. + def visitTensor(self, ctx:RelayParser.TensorContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#ifElse. + def visitIfElse(self, ctx:RelayParser.IfElseContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#graph. + def visitGraph(self, ctx:RelayParser.GraphContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#binOp. + def visitBinOp(self, ctx:RelayParser.BinOpContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#func. + def visitFunc(self, ctx:RelayParser.FuncContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#defn. 
+ def visitDefn(self, ctx:RelayParser.DefnContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#argList. + def visitArgList(self, ctx:RelayParser.ArgListContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#varList. + def visitVarList(self, ctx:RelayParser.VarListContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#var. + def visitVar(self, ctx:RelayParser.VarContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#attrList. + def visitAttrList(self, ctx:RelayParser.AttrListContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#attr. + def visitAttr(self, ctx:RelayParser.AttrContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#typeParamSeq. + def visitTypeParamSeq(self, ctx:RelayParser.TypeParamSeqContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#tupleType. + def visitTupleType(self, ctx:RelayParser.TupleTypeContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#typeIdentType. + def visitTypeIdentType(self, ctx:RelayParser.TypeIdentTypeContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#tensorType. + def visitTensorType(self, ctx:RelayParser.TensorTypeContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#funcType. + def visitFuncType(self, ctx:RelayParser.FuncTypeContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#incompleteType. + def visitIncompleteType(self, ctx:RelayParser.IncompleteTypeContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#intType. 
+ def visitIntType(self, ctx:RelayParser.IntTypeContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#shapeSeq. + def visitShapeSeq(self, ctx:RelayParser.ShapeSeqContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#parensShape. + def visitParensShape(self, ctx:RelayParser.ParensShapeContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#intShape. + def visitIntShape(self, ctx:RelayParser.IntShapeContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#typeIdent. + def visitTypeIdent(self, ctx:RelayParser.TypeIdentContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#body. + def visitBody(self, ctx:RelayParser.BodyContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#scalarFloat. + def visitScalarFloat(self, ctx:RelayParser.ScalarFloatContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#scalarInt. + def visitScalarInt(self, ctx:RelayParser.ScalarIntContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#scalarBool. + def visitScalarBool(self, ctx:RelayParser.ScalarBoolContext): + return self.visitChildren(ctx) + + + # Visit a parse tree produced by RelayParser#ident. + def visitIdent(self, ctx:RelayParser.IdentContext): + return self.visitChildren(ctx) + + + +del RelayParser \ No newline at end of file diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py index 5f23e14d5559..dd0f54c664ca 100644 --- a/python/tvm/relay/ir_pass.py +++ b/python/tvm/relay/ir_pass.py @@ -17,324 +17,16 @@ # pylint: disable=no-else-return # pylint: disable=unidiomatic-typecheck """ -This file contains: -1. The set of passes for Relay, which exposes an interface for configuring the - passes and scripting them in Python. - -2. 
The pass manager for Relay which exposes different granularity of interfaces - for users to implement and use passes more conveniently. +This file contains the set of passes for Relay, which exposes an interface for +configuring the passes and scripting them in Python. """ -import types - from . import _ir_pass from . import _make from .expr import Expr from .ty import Type -from .base import RelayNode, register_relay_node from .module import Module -@register_relay_node -class PassInfo(RelayNode): - """The class that contains the meta data required by a pass. It is the - container of information needed by running an optimization or analysis. - This class can be extended by adding new members when more meta data is - needed. - - Parameters - ---------- - name : str - The pass name. - - opt_level : int - The optimization level of this pass. - - required : List[str] - The list of passes that are required by a certain pass. - """ - - def __init__(self, name, opt_level, required=None): - self.__init_handle_by_constructor__(_ir_pass.PassInfo, name, opt_level, - required) - - -@register_relay_node -class PassContext(RelayNode): - """The basis where a Relay optimization/analysis runs on. - Each pass context contains a number of auxiliary information that is used - to help an optimization pass. Such information includes the error reporter - to record the errors of during the optimization, etc. - """ - - def __init__(self): - self.__init_handle_by_constructor__(_ir_pass.PassContext) - - -@register_relay_node -class Pass(RelayNode): - """The base class of all passes. All methods here are just simple wrappers - that are implemented in the backend. They are defined for users to - conveniently interact with the base class. - """ - - def set_pass_context(self, pass_ctx): - """Setup the pass context for analysis and optimizations. This context - could be shared by different passes for sequential passes. 
- - Parameters - ---------- - pass_ctx : PassContext - The context that is used to help perform a certain pass or a series - of passes. - """ - if not isinstance(pass_ctx, PassContext): - raise TypeError("pass_ctx is expected to be the PassContext type") - _ir_pass.SetContext(self, pass_ctx) - - @property - def info(self): - """Get the pass meta.""" - return _ir_pass.Info(self) - - def __call__(self, mod): - """Execute the pass. Note that for sequential pass, the dependency among - different passes will be resolved in the backend. - - Parameters - ---------- - mod : tvm.relay.Module - The module that a certain optimization is performed on. - - Returns - ------- - mod : tvm.relay.Module - The updated module after applying this pass. - """ - return _ir_pass.RunPass(self, mod) - - -@register_relay_node -class ModulePass(Pass): - """A pass that works on tvm.relay.Module. Users don't need to interact with - this class directly. Instead, a module pass should be created through - `module_pass`, because the design of the `module_pass` API is flexible - enough to handle the creation of a module pass in different manners. In - addition, all members of a module pass can be accessed from the base class. - The same rule applies to FunctionPass and SequentialPass as well. - """ - - -@register_relay_node -class FunctionPass(Pass): - """A pass that works on each tvm.relay.Function in a module. A function - pass class should be created through `function_pass`. - """ - - -@register_relay_node -class SequentialPass(Pass): - """A pass that works on a sequence of pass objects. A sequential pass class - should be created through `sequential_pass`. - """ - - -def module_pass(pass_func=None, opt_level=None, name=None, required=None): - """Create a module pass. This function returns a callback when pass_func - is provided. Otherwise, it returns the created module level pass using the - given optimization function. 
- - Parameters - ---------- - pass_func : Optional[Callable[(Module/Function, PassContext) -> - Module/Function]] - The implemented optimization pass. - - opt_level : int - The optimization level of this module pass. - - name : Optional[str] - The name of the module pass. The name could be empty. In this case, the - name of the optimization function will be used as the pass name. - - required : Optional[List[str]] - The list of passes that the module pass is dependent on. - - Returns - ------- - create_module_pass : Union[Callable, ModulePass] - The callable that will create a module pass is returned when - pass_func is not passed in. Otherwise, a ModulePass object will be - directly created. - - Examples - -------- - The following code creates a module level pass and adds an abs function to - the module. - - .. code-block:: python - - @relay.ir_pass.module_pass(opt_level=2) - def transform(mod, ctx): - tp = relay.TensorType((10,), "float32") - x = relay.var("x", tp) - gv = relay.GlobalVar("var") - func = relay.Function([x], relay.abs(x)) - new_mod = relay.Module({gv: func}) - new_mod.update(mod) - return new_mod - - module_pass = transform - assert isinstance(module_pass, ir_pass.ModulePass) - assert module_pass.info.opt_level == 2 - - # Given a module m, the optimization could be invoked as the follwoing: - updated_mod = module_pass(m) - # Now a function abs should be added to the module m. 
- """ - - if opt_level is None: - raise ValueError("Please provide opt_level for the module pass.") - - required = required if required else [] - if not isinstance(required, (list, tuple)): - raise TypeError("Required is expected to be the type of " + - "list/tuple.") - - def create_module_pass(pass_func): - """Internal function that creates a module pass""" - if not isinstance(pass_func, (types.FunctionType, types.LambdaType)): - raise TypeError("pass_func must be a callable for Module pass") - - return _ir_pass.CreateModulePass(pass_func, opt_level, - name if name else pass_func.__name__, - required) - - if pass_func: - return create_module_pass(pass_func) - return create_module_pass - - -def function_pass(pass_func=None, opt_level=None, name=None, required=None): - """Create a function pass. This function returns a callback when pass_func - is provided. Otherwise, it returns the created function pass using the - given optimization function. - - Parameters - ---------- - pass_func : Optional[Callable[(Module/Function, PassContext) -> - Module/Function]] - The implemented optimization pass. - - opt_level : int - The optimization level of this module pass. - - name : Optional[str] - The name of the function pass. The name could be empty. In this case, the - name of the optimization function will be used as the pass name. - - required : Optional[List[str]] - The list of passes that the module pass is dependent on. - - Returns - ------- - create_function_pass : Union[Callable, FunctionPass] - The callable that will create a function pass is returned when - pass_func is not passed in. Otherwise, a FunctionPass object will be - created. - - Examples - -------- - The following code creates a function level pass that performs constant - folding. - - .. 
code-block:: python - - @relay.ir_pass.function_pass(opt_level=2) - def transform(func, ctx): - return ir_pass.fold_constant(func) - - function_pass = transform - assert isinstance(function_pass, ir_pass.FunctionPass) - assert function_pass.info.opt_level == 2 - - # Given a module m, the optimization could be invoked as the follwoing: - updated_mod = function_pass(m) - # Now constant folding should have been applied to every function in - # the provided module m. And the updated module will be returned. - """ - - if opt_level is None: - raise ValueError("Please provide opt_level for the funtion pass.") - - required = required if required else [] - if not isinstance(required, (list, tuple)): - raise TypeError("Required is expected to be the type of " + - "list/tuple.") - - def create_function_pass(pass_func): - """Internal function that creates a function pass""" - if not isinstance(pass_func, (types.FunctionType, types.LambdaType)): - raise TypeError("pass_func must be a callable for Module pass") - - return _ir_pass.CreateFunctionPass(pass_func, opt_level, - name if name else pass_func.__name__, - required) - - if pass_func: - return create_function_pass(pass_func) - return create_function_pass - - -def sequential_pass(passes=None, opt_level=2, name="sequential_pass", - required=None, disabled=None): - """Create a sequential pass using a defined optimization function from - Python. Some typical usage of the sequential pass are: - 1. Users provide a list of passes for optimization. - 2. Only an optimization level is provided so that the backend system has - to glob all passes at this level and below to perform the optimizations. - Note that users can also provide a series of passes that they don't want to - apply when running a sequential pass. Pass dependency will be resolved in - the backend as well. - - Parameters - ---------- - passes : Optional[List[Pass]] - A sequence of passes candidate for optimization. 
- - opt_level : Optional[int] - The optimization level of this sequential pass. - - name : Optional[str] - The name of the sequential pass. - - required : Optional[List[str]] - The list of passes that the sequential pass is dependent on. - - disabled : Optional[List[str]] - A list of disabled passes. - - Returns - ------- - ret : Pass - A sequential pass built through pass_func. - """ - - passes = passes if passes else [] - if not isinstance(passes, (list, tuple)): - raise TypeError("passes must be a list of Pass objects.") - - disabled = disabled if disabled else [] - if not isinstance(disabled, (list, tuple)): - raise TypeError("disabled must be a list or tuple of pass names") - - required = required if required else [] - if not isinstance(required, (list, tuple)): - raise TypeError("Required is expected to be the type of list/tuple.") - - return _ir_pass.CreateSequentialPass(passes, opt_level, name, required, - disabled) - - def post_order_visit(expr, fvisit): """Recursively visit the ir in post DFS order node, apply fvisit. 
Each node is guaranteed to be visited @@ -437,7 +129,7 @@ def well_formed(expr): Parameters ---------- - expr: tvm.relay.Expr + expr : tvm.relay.Expr The input expression Returns @@ -483,7 +175,7 @@ def free_vars(expr): Parameters ---------- - expr: tvm.relay.Expr + expr : tvm.relay.Expr The input expression Returns @@ -505,7 +197,7 @@ def bound_vars(expr): Parameters ---------- - expr: tvm.relay.Expr + expr : tvm.relay.Expr The input expression Returns @@ -521,7 +213,7 @@ def all_vars(expr): Parameters ---------- - expr: tvm.relay.Expr + expr : tvm.relay.Expr The input expression Returns @@ -537,9 +229,10 @@ def free_type_vars(expr, mod=None): Parameters ---------- - expr: Union[tvm.relay.Expr,tvm.relay.Type] + expr : Union[tvm.relay.Expr,tvm.relay.Type] The input expression/type - mod: tvm.relay.Module, optional + + mod : Optional[tvm.relay.Module] The global module Returns @@ -556,9 +249,10 @@ def bound_type_vars(expr, mod=None): Parameters ---------- - expr: Union[tvm.relay.Expr,tvm.relay.Type] + expr : Union[tvm.relay.Expr,tvm.relay.Type] The input expression/type - mod: tvm.relay.Module, optional + + mod : Optional[tvm.relay.Module] The global module Returns @@ -575,9 +269,9 @@ def all_type_vars(expr, mod=None): Parameters ---------- - expr: Union[tvm.relay.Expr,tvm.relay.Type] + expr : Union[tvm.relay.Expr,tvm.relay.Type] The input expression/type - mod: tvm.relay.Module, optional + mod : Optional[tvm.relay.Module] The global module Returns @@ -594,12 +288,12 @@ def simplify_inference(expr): Parameters ---------- - e: tvm.relay.Expr + expr : tvm.relay.Expr The input Expression Returns ------- - result: tvm.relay.Expr + result : tvm.relay.Expr An expression which is semantically equal to the input expression, but with some simplification """ @@ -612,32 +306,34 @@ def canonicalize_ops(expr): Parameters ---------- - e: tvm.relay.Expr + expr : tvm.relay.Expr The input Expression Returns ------- - result: tvm.relay.Expr + result : tvm.relay.Expr An expression 
without bias_add """ return _ir_pass.canonicalize_ops(expr) -def dead_code_elimination(expr): +def dead_code_elimination(expr, inline_once=False): """ Remove expressions which does not effect the program result (dead code). Parameters ---------- - e: tvm.relay.Expr + expr : tvm.relay.Expr The input Expression + inline_once : Optional[Bool] + Whether to inline binding that occur only once. Returns ------- - result: tvm.relay.Expr + result : tvm.relay.Expr An expression which is semantically equal to the input expression, but with dead code removed. """ - return _ir_pass.dead_code_elimination(expr) + return _ir_pass.dead_code_elimination(expr, inline_once) def alpha_equal(lhs, rhs): @@ -645,15 +341,15 @@ def alpha_equal(lhs, rhs): Parameters ---------- - lhs: tvm.relay.Expr + lhs : tvm.relay.Expr One of the input Expression. - rhs: tvm.relay.Expr + rhs : tvm.relay.Expr One of the input Expression. Returns ------- - result: bool + result : bool True iff lhs is alpha equal to rhs. """ return bool(_make._alpha_equal(lhs, rhs)) @@ -667,15 +363,15 @@ def graph_equal(lhs, rhs): Parameters ---------- - lhs: tvm.relay.Expr + lhs : tvm.relay.Expr One of the input Expression. - rhs: tvm.relay.Expr + rhs : tvm.relay.Expr One of the input Expression. Returns ------- - result: bool + result : bool True iff lhs is data-flow equivalent to rhs. """ return bool(_make._graph_equal(lhs, rhs)) @@ -686,12 +382,12 @@ def structural_hash(value): Parameters ---------- - expr: tvm.relay.Expr or tvm.relay.Type + expr : Union[tvm.relay.Expr, tvm.relay.Type] The expression to hash. Returns ------- - result: int + result : int The hash value """ if isinstance(value, Expr): @@ -852,12 +548,12 @@ def to_a_normal_form(expr, mod=None): expr : tvm.relay.Expr The input expression. - mod: Optional[tvm.relay.Module] + mod : Optional[tvm.relay.Module] The global module. Returns ------- - expr: tvm.relay.Expr + result : tvm.relay.Expr The output expression. 
""" return _ir_pass.to_a_normal_form(expr, mod) @@ -871,7 +567,7 @@ def to_graph_normal_form(expr): The input expression Returns ------- - expr : tvm.relay.Expr + result : tvm.relay.Expr The output expression """ return _ir_pass.to_graph_normal_form(expr) @@ -920,7 +616,7 @@ def get_total_mac_number(expr): Returns ------- - ret : int64 + result : int64 The number of MACs (multiply-accumulate) of a model """ return _ir_pass.GetTotalMacNumber(expr) @@ -935,17 +631,17 @@ def eliminate_common_subexpr(expr, fskip=None): expr : tvm.relay.Expr The input expression. - fskip: function + fskip : function The callback function that decides whether an expression should be skipped. Returns ------- - expr : tvm.relay.Expr + result : tvm.relay.Expr The output expression. """ return _ir_pass.eliminate_common_subexpr(expr, fskip) -def partial_evaluate(expr): +def partial_evaluate(expr, mod=None): """ Evaluate the static fragment of the code. @@ -954,9 +650,30 @@ def partial_evaluate(expr): expr : tvm.relay.Expr The input expression. + mod : Optional[tvm.relay.Module] + The global module + Returns ------- - expr : tvm.relay.Expr + result : tvm.relay.Expr The output expression. """ - return _ir_pass.partial_evaluate(expr) + return _ir_pass.partial_evaluate(expr, mod) + +def unmatched_cases(match, mod=None): + """ + Finds cases that the match expression does not catch, if any. + + Parameters + ---------- + match : tvm.relay.Match + The match expression + mod : Optional[tvm.relay.Module] + The module (defaults to an empty module) + + Returns + ------- + missing_patterns : [tvm.relay.Pattern] + Patterns that the match expression does not catch. 
+ """ + return _ir_pass.unmatched_cases(match, mod) diff --git a/python/tvm/relay/op/_algorithm.py b/python/tvm/relay/op/_algorithm.py index 57e716534ee5..09746be13e30 100644 --- a/python/tvm/relay/op/_algorithm.py +++ b/python/tvm/relay/op/_algorithm.py @@ -35,11 +35,31 @@ def compute_argsort(attrs, inputs, _, target): """Compute definition of argsort""" axis = get_const_int(attrs.axis) is_ascend = bool(get_const_int(attrs.is_ascend)) - dtype = str(attrs.dtype) - return [ - topi.argsort(inputs[0], None, axis=axis, is_ascend=is_ascend, \ - dtype=dtype, flag=False) - ] + dtype = attrs.dtype + return [topi.argsort(inputs[0], axis=axis, is_ascend=is_ascend, dtype=dtype)] register_pattern("argsort", OpPattern.OPAQUE) + + +@register_schedule("topk") +def schedule_topk(_, outs, target): + """Schedule definition of argsort""" + with target: + return topi.generic.schedule_topk(outs) + + +@register_compute("topk") +def compute_topk(attrs, inputs, _, target): + """Compute definition of argsort""" + k = get_const_int(attrs.k) + axis = get_const_int(attrs.axis) + ret_type = attrs.ret_type + is_ascend = bool(get_const_int(attrs.is_ascend)) + dtype = attrs.dtype + out = topi.topk(inputs[0], k, axis, ret_type, is_ascend, dtype) + out = out if isinstance(out, list) else [out] + return out + + +register_pattern("topk", OpPattern.OPAQUE) diff --git a/python/tvm/relay/op/_reduce.py b/python/tvm/relay/op/_reduce.py index b97e3a8ce993..b7c9a79a8ad9 100644 --- a/python/tvm/relay/op/_reduce.py +++ b/python/tvm/relay/op/_reduce.py @@ -30,6 +30,7 @@ def _schedule_reduce(_, outs, target): _reg.register_schedule("argmax", _schedule_reduce) _reg.register_schedule("argmin", _schedule_reduce) _reg.register_schedule("sum", _schedule_reduce) +_reg.register_schedule("all", _schedule_reduce) _reg.register_schedule("max", _schedule_reduce) _reg.register_schedule("min", _schedule_reduce) _reg.register_schedule("prod", _schedule_reduce) diff --git a/python/tvm/relay/op/_transform.py 
b/python/tvm/relay/op/_transform.py index 2eec6d03e7cd..95fb2ad18a25 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -23,6 +23,7 @@ schedule_injective = _reg.schedule_injective schedule_broadcast = _reg.schedule_injective +schedule_concatenate = _reg.schedule_concatenate _reg.register_schedule("collapse_sum_like", _schedule_reduce) @@ -46,7 +47,7 @@ _reg.register_schedule("transpose", schedule_injective) _reg.register_schedule("where", schedule_broadcast) _reg.register_schedule("stack", schedule_injective) -_reg.register_schedule("concatenate", schedule_injective) +_reg.register_schedule("concatenate", schedule_concatenate) _reg.register_schedule("_contrib_reverse_reshape", schedule_injective) _reg.register_schedule("gather_nd", schedule_injective) diff --git a/python/tvm/relay/op/algorithm.py b/python/tvm/relay/op/algorithm.py index 6451eb41aeb9..6f875919df4c 100644 --- a/python/tvm/relay/op/algorithm.py +++ b/python/tvm/relay/op/algorithm.py @@ -17,8 +17,9 @@ """Classic algorithm operation""" from __future__ import absolute_import as _abs from . import _make +from ..expr import TupleWrapper -def argsort(data, axis=-1, is_ascend=1, dtype="float32"): +def argsort(data, axis=-1, is_ascend=1, dtype="int32"): """Performs sorting along the given axis and returns an array of indicies having same shape as an input array that index data in sorted order. @@ -37,7 +38,7 @@ def argsort(data, axis=-1, is_ascend=1, dtype="float32"): Whether to sort in ascending or descending order. dtype : string, optional - DType of the output indices. + The data type of the output indices. Returns ------- @@ -45,3 +46,42 @@ def argsort(data, axis=-1, is_ascend=1, dtype="float32"): Tensor with same shape as data. """ return _make.argsort(data, axis, is_ascend, dtype) + + +def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int32"): + """Get the top k elements in an input tensor along the given axis. 
+ + ret_type specifies the return type, can be one of ("both", "values", "indices"). + + Parameters + ---------- + data : relay.Expr + The input data tensor. + + k : int, optional + Number of top elements to select. Return all elements if k < 1. + + axis : int, optional + Axis long which to sort the input tensor. + + ret_type: str, optional + The return type [both, values, indices]. + "both": return both top k data and indices. + "values": return top k data only. + "indices": return top k indices only. + + is_ascend : boolean, optional + Whether to sort in ascending or descending order. + + dtype : string, optional + The data type of the indices output. + + Returns + ------- + out : relay.Expr or List[relay.Expr] + The computed result. + """ + out = _make.topk(data, k, axis, ret_type, is_ascend, dtype) + if ret_type == "both": + return TupleWrapper(out, 2) + return out diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index b772c43e11cd..7bce9dd3c5b9 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -67,7 +67,7 @@ def conv2d(data, The weight expressions. strides : tuple of int, optional - The strides of convoltution. + The strides of convolution. padding : tuple of int, optional The padding of convolution on both sides of inputs before convolution. @@ -129,7 +129,7 @@ def conv2d_transpose(data, The weight expressions. strides : Tuple[int], optional - The strides of convoltution. + The strides of convolution. padding : Tuple[int], optional The padding of convolution on both sides of inputs. 
@@ -401,7 +401,7 @@ def upsampling(data, with data of shape (n, c, h, w) out will have a shape (n, c, h*scale, w*scale) - method indicates the algorithm to be used while calculating ghe out value + method indicates the algorithm to be used while calculating the out value and method can be one of ("BILINEAR", "NEAREST_NEIGHBOR") Parameters @@ -842,7 +842,7 @@ def contrib_conv2d_winograd_without_weight_transform(data, The Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3) strides : tuple of int, optional - The strides of convoltution. + The strides of convolution. padding : tuple of int, optional The padding of convolution on both sides of inputs before convolution. @@ -908,7 +908,7 @@ def contrib_conv2d_winograd_nnpack_without_weight_transform(data, The weight expressions. strides : tuple of int, optional - The strides of convoltution. + The strides of convolution. padding : tuple of int, optional The padding of convolution on both sides of inputs before convolution. @@ -975,7 +975,7 @@ def contrib_conv2d_nchwc(data, The kernel expressions. strides : tuple of int, optional - The strides of convoltution. + The strides of convolution. padding : tuple of int, optional The padding of convolution on both sides of inputs before convolution. @@ -1040,7 +1040,7 @@ def contrib_depthwise_conv2d_nchwc(data, The kernel expressions. strides : tuple of int, optional - The strides of convoltution. + The strides of convolution. padding : tuple of int, optional The padding of convolution on both sides of inputs before convolution. @@ -1156,7 +1156,7 @@ def deformable_conv2d(data, The weight expressions. strides : tuple of int, optional - The strides of convoltution. + The strides of convolution. padding : tuple of int, optional The padding of convolution on both sides of inputs before convolution. 
diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index 6ba207934d1b..906bf255d46e 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -219,6 +219,13 @@ def schedule_injective(attrs, outputs, target): with target: return topi.generic.schedule_injective(outputs) + +def schedule_concatenate(attrs, outputs, target): + """Generic schedule for concatinate.""" + with target: + return topi.generic.schedule_concatenate(outputs) + + __DEBUG_COUNTER__ = 0 def debug(expr, debug_func=None): diff --git a/python/tvm/relay/op/reduce.py b/python/tvm/relay/op/reduce.py index 9d58a92041f3..0f2594600b0a 100644 --- a/python/tvm/relay/op/reduce.py +++ b/python/tvm/relay/op/reduce.py @@ -39,7 +39,7 @@ def argmax(data, axis=None, keepdims=False, exclude=False): exclude : bool If `exclude` is true, reduction will be performed on the axes that are - NOT in axis instead. + NOT in axis instead. Returns ------- @@ -69,7 +69,7 @@ def argmin(data, axis=None, keepdims=False, exclude=False): exclude : bool If `exclude` is true, reduction will be performed on the axes that are - NOT in axis instead. + NOT in axis instead. Returns ------- @@ -100,7 +100,7 @@ def sum(data, axis=None, keepdims=False, exclude=False): exclude : bool If `exclude` is true, reduction will be performed on the axes that are - NOT in axis instead. + NOT in axis instead. Returns ------- @@ -111,6 +111,58 @@ def sum(data, axis=None, keepdims=False, exclude=False): return _make.sum(data, axis, keepdims, exclude) +def all(data, axis=None, keepdims=False, exclude=False): + """Computes the logical AND of boolean array elements over given axes. + + Parameters + ---------- + data : relay.Expr + The input boolean tensor + + axis : None or int or tuple of int + Axis or axes along which a sum is performed. The default, axis=None, + will sum all of the elements of the input array. If axis is + negative it counts from the last to the first axis. 
+ + keepdims : bool + If this is set to True, the axes which are reduced are left in the result as + dimensions with size one. With this option, the result will broadcast + correctly against the input array. + + exclude : bool + If `exclude` is true, reduction will be performed on the axes that are + NOT in axis instead. + + Returns + ------- + result : relay.Expr + The computed result. + + Examples + -------- + .. code-block:: python + + data = relay.Constant(tvm.nd.array([[[ True, True, True], + [ True, True, True], + [False, True, False]], + [[ True, False, False], + [ True, True, False], + [False, True, True]]])) + + relay.all(data, axis=1) + # [[False, True, False], + # [False, False, False]] + + relay.all(data, axis=0) + # [[ True, False, False], + # [ True, True, False], + # [False, True, False]] + + """ + axis = [axis] if axis and isinstance(axis, int) else axis + return _make.all(data, axis, keepdims, exclude) + + def max(data, axis=None, keepdims=False, exclude=False): """ Computes the max of array elements over given axes. @@ -131,7 +183,7 @@ def max(data, axis=None, keepdims=False, exclude=False): exclude : bool If `exclude` is true, reduction will be performed on the axes that are - NOT in axis instead. + NOT in axis instead. Returns ------- @@ -163,7 +215,7 @@ def min(data, axis=None, keepdims=False, exclude=False): exclude : bool If `exclude` is true, reduction will be performed on the axes that are - NOT in axis instead. + NOT in axis instead. Returns ------- @@ -194,7 +246,7 @@ def mean(data, axis=None, keepdims=False, exclude=False): exclude : bool If `exclude` is true, reduction will be performed on the axes that are - NOT in axis instead. + NOT in axis instead. Returns ------- @@ -225,7 +277,7 @@ def prod(data, axis=None, keepdims=False, exclude=False): exclude : bool If `exclude` is true, reduction will be performed on the axes that are - NOT in axis instead. + NOT in axis instead. 
Returns ------- diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 9c76b7e569dc..dce2258946cd 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -218,9 +218,10 @@ def take(data, indices, axis=None, mode="clip"): the flattened input array is used. mode : str, optional - Specifies how out-of-bound indices will behave. - clip - clip to the range (default) - wrap - wrap around the indices + Specifies how out-of-bound indices will behave [clip, wrap, fast]. + clip: clip to the range (default). + wrap: wrap around the indices. + fast: no clip or wrap around (user must make sure indices are in-bound). Returns ------- diff --git a/python/tvm/relay/op/vision/_vision.py b/python/tvm/relay/op/vision/_vision.py index 8c8c4cd9aaa3..7de118071aa4 100644 --- a/python/tvm/relay/op/vision/_vision.py +++ b/python/tvm/relay/op/vision/_vision.py @@ -82,7 +82,10 @@ def schedule_get_valid_counts(_, outs, target): def compute_get_valid_counts(attrs, inputs, _, target): """Compute definition of get_valid_counts""" score_threshold = get_const_float(attrs.score_threshold) - return topi.vision.get_valid_counts(inputs[0], score_threshold) + id_index = get_const_int(attrs.id_index) + score_index = get_const_int(attrs.score_index) + return topi.vision.get_valid_counts(inputs[0], score_threshold, + id_index, score_index) reg.register_pattern("vision.get_valid_counts", OpPattern.OPAQUE) diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py index ab34eb6e6cfb..d19dde306aca 100644 --- a/python/tvm/relay/op/vision/nms.py +++ b/python/tvm/relay/op/vision/nms.py @@ -20,7 +20,9 @@ from ...expr import TupleWrapper def get_valid_counts(data, - score_threshold): + score_threshold, + id_index=0, + score_index=1): """Get valid count of bounding boxes given a score threshold. Also moves valid boxes to the top of input data. 
@@ -32,6 +34,12 @@ def get_valid_counts(data, score_threshold : optional, float Lower limit of score for valid bounding boxes. + id_index : optional, int + index of the class categories, -1 to disable. + + score_index: optional, int + Index of the scores/confidence of boxes. + Returns ------- valid_count : relay.Expr @@ -40,7 +48,8 @@ def get_valid_counts(data, out_tensor : relay.Expr Rearranged data tensor. """ - return TupleWrapper(_make.get_valid_counts(data, score_threshold), 2) + return TupleWrapper(_make.get_valid_counts(data, score_threshold, + id_index, score_index), 2) def non_max_suppression(data, diff --git a/python/tvm/relay/parser.py b/python/tvm/relay/parser.py index 2d27b7b53f89..9218cae3de66 100644 --- a/python/tvm/relay/parser.py +++ b/python/tvm/relay/parser.py @@ -18,19 +18,6 @@ from __future__ import absolute_import from .. import register_func -def enabled(): - """Checks whether the parser is enabled, this allows users to - optionally support building the parser. - - We use this check before importing the parser. - """ - try: - # pylint: disable=unused-variable - from tvm.relay import _parser - return True - # pylint: disable=broad-except - except Exception: - return False @register_func("relay.fromtext") def fromtext(data, source_name=None): diff --git a/python/tvm/relay/prelude.py b/python/tvm/relay/prelude.py index ff823c3413fa..af0497e3801a 100644 --- a/python/tvm/relay/prelude.py +++ b/python/tvm/relay/prelude.py @@ -15,11 +15,16 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name -"""Adds certain standard global functions and ADT definitions to the module.""" +"""A prelude containing useful global functions and ADT definitions.""" +import os from .ty import GlobalTypeVar, TypeVar, FuncType, TupleType, scalar_type -from .expr import Var, Function, GlobalVar, Let, If, Tuple, TupleGetItem +from .expr import Var, Function, GlobalVar, Let, If, Tuple, TupleGetItem, const +from .op.tensor import add, subtract, equal from .adt import Constructor, TypeData, Clause, Match from .adt import PatternConstructor, PatternVar, PatternWildcard +from .parser import fromtext + +__PRELUDE_PATH__ = os.path.dirname(os.path.realpath(__file__)) class Prelude: """Contains standard definitions.""" @@ -61,39 +66,44 @@ def define_list_tl(self): cons_case = Clause(PatternConstructor(self.cons, [PatternVar(y), PatternVar(z)]), z) self.mod[self.tl] = Function([x], Match(x, [cons_case]), self.l(a), [a]) + def define_list_nth(self): """Defines a function to get the nth element of a list. - nth(l) : list[a] -> a + nth(l) : list[a] -> Tensor[(), int32] -> a """ self.nth = GlobalVar("nth") a = TypeVar("a") x = Var("x", self.l(a)) - n = Var("n", self.nat()) + n = Var("n", scalar_type('int32')) + + body = If(equal(n, const(0)), + self.hd(x), + self.nth(self.tl(x), subtract(n, const(1)))) + + self.mod[self.nth] = Function([x, n], body, a, [a]) - y = Var("y") - z_case = Clause(PatternConstructor(self.z), self.hd(x)) - s_case = Clause(PatternConstructor(self.s, [PatternVar(y)]), self.nth(self.tl(x), y)) - self.mod[self.nth] = Function([x, n], Match(n, [z_case, s_case]), a, [a]) def define_list_update(self): """Defines a function to update the nth element of a list and return the updated list. 
- update(l, i, v) : list[a] -> nat -> a -> list[a] + update(l, i, v) : list[a] -> Tensor[(), int32] -> a -> list[a] """ self.update = GlobalVar("update") a = TypeVar("a") l = Var("l", self.l(a)) - n = Var("n", self.nat()) + n = Var("n", scalar_type('int32')) v = Var("v", a) - y = Var("y") + body = If(equal(n, const(0)), + self.cons(v, self.tl(l)), + self.cons(self.hd(l), + self.update(self.tl(l), + subtract(n, const(1)), + v))) - z_case = Clause(PatternConstructor(self.z), self.cons(v, self.tl(l))) - s_case = Clause(PatternConstructor(self.s, [PatternVar(y)]), - self.cons(self.hd(l), self.update(self.tl(l), y, v))) + self.mod[self.update] = Function([l, n, v], body, self.l(a), [a]) - self.mod[self.update] = Function([l, n, v], Match(n, [z_case, s_case]), self.l(a), [a]) def define_list_map(self): """Defines a function for mapping a function over a list's @@ -114,6 +124,7 @@ def define_list_map(self): self.cons(f(y), self.map(f, z))) self.mod[self.map] = Function([f, x], Match(x, [nil_case, cons_case]), self.l(b), [a, b]) + def define_list_foldl(self): """Defines a left-way fold over a list. @@ -136,6 +147,7 @@ def define_list_foldl(self): self.mod[self.foldl] = Function([f, av, bv], Match(bv, [nil_case, cons_case]), a, [a, b]) + def define_list_foldr(self): """Defines a right-way fold over a list. @@ -158,6 +170,7 @@ def define_list_foldr(self): self.mod[self.foldr] = Function([f, bv, av], Match(av, [nil_case, cons_case]), b, [a, b]) + def define_list_foldr1(self): """Defines a right-way fold over a nonempty list. @@ -196,6 +209,7 @@ def define_list_concat(self): self.foldr(updater, l2, l1), self.l(a), [a]) + def define_list_filter(self): """Defines a function that filters a list. 
@@ -214,6 +228,7 @@ def define_list_filter(self): If(f(h), self.cons(h, self.filter(f, t)), self.filter(f, t))) self.mod[self.filter] = Function([f, l], Match(l, [nil_case, cons_case]), self.l(a), [a]) + def define_list_zip(self): """Defines a function that combines two lists into a list of tuples of their elements. @@ -238,6 +253,7 @@ def define_list_zip(self): self.mod[self.zip] = Function([l1, l2], Match(l1, [nil_case, outer_cons_case]), self.l(TupleType([a, b])), [a, b]) + def define_list_rev(self): """Defines a function that reverses a list. @@ -253,6 +269,7 @@ def define_list_rev(self): self.foldl(updater, self.nil(), l), self.l(a), [a]) + def define_list_map_accumr(self): """Defines an accumulative map, which is a fold that simulataneously updates an accumulator value and a list of results. @@ -282,6 +299,7 @@ def define_list_map_accumr(self): TupleType([a, self.l(c)]), [a, b, c]) + def define_list_map_accuml(self): """Defines an accumulative map, which is a fold that simulataneously updates an accumulator value and a list of results. @@ -321,6 +339,7 @@ def define_optional_adt(self): self.none = Constructor("none", [], self.optional) self.mod[self.optional] = TypeData(self.optional, [a], [self.some, self.none]) + def define_list_unfoldr(self): """Defines a function that builds up a list starting from a seed value. @@ -343,6 +362,7 @@ def define_list_unfoldr(self): self.mod[self.unfoldr] = Function([f, s], Match(f(s), [none_case, some_case]), self.l(b), [a, b]) + def define_list_unfoldl(self): """Defines a function that builds up a list starting from a seed value. @@ -362,52 +382,29 @@ def define_list_unfoldl(self): self.rev(self.unfoldr(f, s)), self.l(b), [a, b]) - def define_nat_adt(self): - """Defines a Peano (unary) natural number ADT. - Zero is represented by z(). 
s(n) adds 1 to a nat n.""" - self.nat = GlobalTypeVar("nat") - self.z = Constructor("z", [], self.nat) - self.s = Constructor("s", [self.nat()], self.nat) - self.mod[self.nat] = TypeData(self.nat, [], [self.z, self.s]) - - def define_nat_double(self): - """Defines a function that doubles a nat.""" - self.double = GlobalVar("double") - x = Var("x", self.nat()) - y = Var("y") - z_case = Clause(PatternConstructor(self.z), self.z()) - s_case = Clause(PatternConstructor(self.s, [PatternVar(y)]), - self.s(self.s(self.double(y)))) - self.mod[self.double] = Function([x], Match(x, [z_case, s_case])) - - def define_nat_add(self): - """Defines a function that adds two nats.""" - self.add = GlobalVar("add") - x = Var("x", self.nat()) - y = Var("y", self.nat()) - a = Var("a") - z_case = Clause(PatternConstructor(self.z), y) - s_case = Clause(PatternConstructor(self.s, [PatternVar(a)]), - self.s(self.add(a, y))) - self.mod[self.add] = Function([x, y], Match(x, [z_case, s_case])) def define_list_sum(self): - """Defines a function that computes the sum of a list of nats.""" + """Defines a function that computes the sum of a list of integer scalars.""" self.sum = GlobalVar("sum") - a = Var("a", self.l(self.nat())) - self.mod[self.sum] = Function([a], self.foldl(self.add, self.z(), a)) + a = Var("a", self.l(scalar_type('int32'))) + x = Var('x') + y = Var('y') + addf = Function([x, y], add(x, y)) + self.mod[self.sum] = Function([a], self.foldl(addf, const(0), a)) + def define_list_length(self): - """Defines a function that returns the length of a list as a nat""" + """Defines a function that returns the length of a list""" self.length = GlobalVar("length") a = TypeVar("a") x = Var("x", self.l(a)) y = Var("y") - nil_case = Clause(PatternConstructor(self.nil), self.z()) + nil_case = Clause(PatternConstructor(self.nil), const(0)) cons_case = Clause(PatternConstructor(self.cons, [PatternWildcard(), PatternVar(y)]), - self.s(self.length(y))) + add(const(1), self.length(y))) 
self.mod[self.length] = Function([x], - Match(x, [nil_case, cons_case]), None, [a]) + Match(x, [nil_case, cons_case]), scalar_type('int32'), [a]) + def define_tree_adt(self): """Defines a tree ADT. A tree can contain any type. @@ -420,6 +417,7 @@ def define_tree_adt(self): self.rose = Constructor("rose", [a, self.l(self.tree(a))], self.tree) self.mod[self.tree] = TypeData(self.tree, [a], [self.rose]) + def define_tree_map(self): """Defines a function that maps over a tree. The function is applied to each subtree's contents. @@ -439,71 +437,58 @@ def define_tree_map(self): self.mod[self.tmap] = Function([f, t], Match(t, [rose_case]), self.tree(b), [a, b]) + def define_tree_size(self): - """Defines a function that computes the size of a tree as a nat. + """Defines a function that computes the size of a tree. - Signature: fn(t : tree[a]) -> nat + Signature: fn(t : tree[a]) -> Tensor[(), int32] """ self.size = GlobalVar("size") a = TypeVar("a") t = Var("t", self.tree(a)) - x = Var("x", self.tree(a)) z = Var("z") rose_case = Clause(PatternConstructor(self.rose, [PatternWildcard(), PatternVar(z)]), - self.s(self.sum(self.map(Function([x], self.size(x)), z)))) + add(const(1), self.sum(self.map(self.size, z)))) self.mod[self.size] = Function([t], - Match(t, [rose_case]), self.nat(), [a]) - - def define_id(self): - """Defines a function that return it's argument. - - Signature: fn(x : a) -> a - """ - self.id = GlobalVar("id") - a = TypeVar("a") - x = Var("x", a) - self.mod[self.id] = Function([x], x, a, [a]) - - - def define_compose(self): - """Defines a function that compose two function. 
- - Signature: fn(f : fn(b) -> c, g : fn(a) -> b) -> fn(a) -> c - """ - self.compose = GlobalVar("compose") - a = TypeVar("a") - b = TypeVar("b") - c = TypeVar("c") - f = Var("f", FuncType([b], c)) - g = Var("g", FuncType([a], b)) - x = Var("x") - self.mod[self.compose] = Function([f, g], - Function([x], f(g(x))), - FuncType([a], c), - [a, b, c]) + Match(t, [rose_case]), scalar_type('int32'), [a]) def define_iterate(self): - """Define a function that take a number n, a function f, - and return a closure that apply f n time on it's argument. + """Defines a function that take a number n and a function f; + returns a closure that takes an argument and applies f + n times to its argument. - Signature: fn(n : nat, f : fn(a) -> a) -> fn(a) -> a + Signature: fn(f : fn(a) -> a, n : Tensor[(), int32]) -> fn(a) -> a """ self.iterate = GlobalVar("iterate") a = TypeVar("a") f = Var("f", FuncType([a], a)) - x = Var("x", self.nat()) - y = Var("y", self.nat()) - z_case = Clause(PatternConstructor(self.z), self.id) - s_case = Clause(PatternConstructor(self.s, [PatternVar(y)]), - self.compose(f, self.iterate(f, y))) + x = Var("x", scalar_type('int32')) + body = If(equal(x, const(0)), + self.id, + self.compose(f, + self.iterate(f, subtract(x, const(1))))) self.mod[self.iterate] = Function([f, x], - Match(x, [z_case, s_case]), + body, FuncType([a], a), [a]) + def load_prelude(self): + """ + Parses the portions of the Prelude written in Relay's text format and adds + them to the module. 
+ """ + prelude_file = os.path.join(__PRELUDE_PATH__, "prelude.rly") + with open(prelude_file) as prelude: + prelude = fromtext(prelude.read()) + self.mod.update(prelude) + self.id = self.mod.get_global_var("id") + self.compose = self.mod.get_global_var("compose") + + def __init__(self, mod): self.mod = mod + self.load_prelude() self.define_list_adt() self.define_list_hd() self.define_list_tl() @@ -522,9 +507,6 @@ def __init__(self, mod): self.define_list_unfoldr() self.define_list_unfoldl() - self.define_nat_adt() - self.define_nat_double() - self.define_nat_add() self.define_list_length() self.define_list_nth() self.define_list_update() @@ -534,6 +516,4 @@ def __init__(self, mod): self.define_tree_map() self.define_tree_size() - self.define_id() - self.define_compose() self.define_iterate() diff --git a/python/tvm/relay/prelude.rly b/python/tvm/relay/prelude.rly new file mode 100644 index 000000000000..35c794a6d479 --- /dev/null +++ b/python/tvm/relay/prelude.rly @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +v0.0.2 + +def @id[a](%x: a) -> a { + %x +} + +def @compose[a, b, c](%f: fn(b) -> c, %g: fn(a) -> b) { + fn (%x: a) -> c { + %f(%g(%x)) + } +} diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index e52ce142e5c3..61e895ac7efb 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -22,7 +22,7 @@ import topi from . import _quantize from .quantize import QAnnotateKind, current_qconfig -from .quantize import _conv_counter, _set_conv_counter +from .quantize import annotate_context from .. import expr as _expr from .. import op as _op from ..op import op as _reg @@ -116,7 +116,6 @@ def frewrite_with_guard(ref_call, new_args, ctx): return _register(frewrite) if frewrite is not None else _register -@register_func("relay.quantize.attach_simulated_quantize") def attach_simulated_quantize(data, kind, sign=True, rounding="round"): """Attach a simulated quantize operation after input data expr. @@ -133,11 +132,20 @@ def attach_simulated_quantize(data, kind, sign=True, rounding="round"): if data.attrs.kind == kind and data.attrs.sign == sign and data.attrs.rounding == rounding: return data + actx = annotate_context() + key = tuple([data, kind, sign, rounding]) + if key in actx.qnode_map: + return actx.qnode_map[key] + dom_scale = _expr.var("dom_scale") clip_min = _expr.var("clip_min") clip_max = _expr.var("clip_max") - return _quantize.simulated_quantize( + qnode = _quantize.simulated_quantize( data, dom_scale, clip_min, clip_max, kind, sign, rounding) + actx.qnode_map[key] = qnode + return qnode + +register_func("relay.quantize.attach_simulated_quantize", attach_simulated_quantize) @register_annotate_function("nn.contrib_conv2d_NCHWc") @@ -152,11 +160,13 @@ def conv2d_rewrite(ref_call, new_args, ctx): """Rewrite function for conv2d. Lhs of conv will be quantized to input field, and rhs of conv will be quantized to weight field. 
Output would be in activation field""" - cnt = _conv_counter() - if cnt < current_qconfig().skip_k_conv: - _set_conv_counter(cnt + 1) - return None - _set_conv_counter(cnt + 1) + actx = annotate_context() + if current_qconfig().skip_conv_layers is not None: + skipped_indices = [int(x) for x in current_qconfig().skip_conv_layers] + if actx.conv2d_counter() in skipped_indices: + actx.count_conv2d() + return None + actx.count_conv2d() lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) @@ -168,16 +178,26 @@ def conv2d_rewrite(ref_call, new_args, ctx): rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT) expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) +def check_to_skip(): + """Check the index of conv2d layer to decide whether to skip the current operator.""" + if current_qconfig().skip_conv_layers is not None: + skipped_indices = [int(x) for x in current_qconfig().skip_conv_layers] + if annotate_context().conv2d_counter() - 1 in skipped_indices: + return True + return False + + @register_annotate_function("nn.dense") def dense_rewrite(ref_call, new_args, ctx): """Rewrite function for dense. Lhs of dense will be quantized to input field, and rhs of dense will be quantized to weight field. 
Output would be in activation field.""" - cnt = _conv_counter() - if cnt < current_qconfig().skip_k_conv: + if check_to_skip(): return None + lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) @@ -194,7 +214,7 @@ def dense_rewrite(ref_call, new_args, ctx): @register_annotate_function("multiply") def multiply_rewrite(ref_call, new_args, ctx): """Rewrite function for multiply.""" - if _conv_counter() <= current_qconfig().skip_k_conv: + if check_to_skip(): return None lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) @@ -216,7 +236,7 @@ def multiply_rewrite(ref_call, new_args, ctx): @register_annotate_function("add") def add_rewrite(ref_call, new_args, ctx): """Rewrite function for add.""" - if _conv_counter() <= current_qconfig().skip_k_conv: + if check_to_skip(): return None lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) @@ -242,9 +262,24 @@ def add_rewrite(ref_call, new_args, ctx): return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) +@register_annotate_function("stop_fusion") +def stop_fusion_rewrite(ref_call, new_args, ctx): + """Rewrite function for add.""" + if check_to_skip(): + return None + + x_expr, x_kind = _get_expr_kind(new_args[0]) + if x_kind is None: + return None + + ret_expr = attach_simulated_quantize(x_expr, QAnnotateKind.INPUT) + ret_expr = _forward_op(ref_call, [ret_expr]) + return QAnnotateExpr(ret_expr, QAnnotateKind.INPUT) + + def identity_rewrite(ref_call, new_args, ctx): """Simply forward the original operation""" - if _conv_counter() <= current_qconfig().skip_k_conv: + if check_to_skip(): return None x_expr, x_kind = _get_expr_kind(new_args[0]) @@ -255,6 +290,7 @@ def identity_rewrite(ref_call, new_args, ctx): return QAnnotateExpr(ret_expr, x_kind) +register_annotate_function("clip", identity_rewrite) register_annotate_function("nn.relu", identity_rewrite) register_annotate_function("strided_slice", identity_rewrite) register_annotate_function("nn.avg_pool2d", identity_rewrite) @@ -262,8 
+298,9 @@ def identity_rewrite(ref_call, new_args, ctx): def pool2d_rewrite(ref_call, new_args, ctx): """Rewrite function for max pool2d""" - if _conv_counter() <= current_qconfig().skip_k_conv: + if check_to_skip(): return None + expr, x_kind = _get_expr_kind(new_args[0]) if x_kind is None: @@ -280,7 +317,7 @@ def pool2d_rewrite(ref_call, new_args, ctx): @register_annotate_function("concatenate") def concatenate_rewrite(ref_call, new_args, ctx): """Rewrite function for concatenate""" - if _conv_counter() <= current_qconfig().skip_k_conv: + if check_to_skip(): return None input_tuple = new_args[0] diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index 607ee1821c86..a7749d4892fb 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -21,8 +21,9 @@ from . import _quantize from .. import expr as _expr +from .. import module as _module from .. import ir_pass as _ir_pass -from .. import build_module as _build +from .. import transform as _transform from .. import op as _op from ... import make as _make from ..base import NodeBase, register_relay_node @@ -70,11 +71,10 @@ class QConfig(NodeBase): "dtype_weight": "int8", "dtype_activation": "int32", "global_scale": 8.0, - "skip_k_conv": 1, + "skip_conv_layers": [0], "round_for_shift": True, "store_lowbit_output": True, "debug_enabled_ops": None, - "use_stop_fusion": True } # pylint: disable=no-member @@ -136,8 +136,9 @@ def qconfig(**kwargs): global_scale: float The global scale for calibration. - skip_k_conv: int - The number of skipped conv2d. + skip_conv_layers: list + Specifying which layers to be skipped. Provide a list of indices + that indicate which conv2d layers to leave untouched. round_for_shift: boolean Whether to add bias for rounding during shift. @@ -146,9 +147,10 @@ def qconfig(**kwargs): Whether to store low-bit integer back as output before dequantizing. Some accelerators need this, e.g. VTA. 
- use_stop_fusion: boolean - Whether add stop_fusion when casting to dtype_activation. stop_fusion forces lowbit - results to be stored in memory. + debug_enabled_ops: None or list of str + Partially quantize specified operators for debugging. The default value + is None, which means will try to call all operartors' annotate rewrite + function. Returns ------- @@ -160,40 +162,38 @@ def qconfig(**kwargs): return _make.node("relay.quantize.QConfig", **node_args) -CONV_COUNTER = 0 +class AnnotateContext(object): + """A global singleton annotate scope""" + Current = None + def __init__(self): + self.qnode_map = dict() + self._conv2d_counter = 0 -def _conv_counter(): - """Get the global counter for conv2d.""" - return CONV_COUNTER + def __enter__(self): + self._conv2d_counter = 0 + return self + def conv2d_counter(self): + """Get the counter for conv2d.""" + return self._conv2d_counter -def _set_conv_counter(n): - """Set the value of the global conv2d counter.""" - global CONV_COUNTER - CONV_COUNTER = n + def count_conv2d(self): + """Increase the value of the conv2d counter by one.""" + self._conv2d_counter += 1 + def __exit__(self, ptype, value, traceback): + pass -def annotate(graph): - """Given a float32 graph, annotate will rewrite the graph - and return back a graph which simulates the error brought by - current quantization scheme. - Parameters - --------- - graph: Function - The original graph - - Returns - ------- - ret: Function - The graph after annotation - """ - _set_conv_counter(0) # reset counter - return _quantize.annotate(graph) +def annotate_context(): + """Get the global singleton scope""" + if AnnotateContext.Current is None: + AnnotateContext.Current = AnnotateContext() + return AnnotateContext.Current -def calibrate(graph, dataset=None): +def calibrate(graph, mod=None, ctx=None): """The calibrate procedure will try to calculate the content of dom_scale, nbit, clip_min, clip_max for every `simulated_quantize` operator. 
@@ -203,8 +203,11 @@ def calibrate(graph, dataset=None): graph: Function The simulation graph after annotation. - dataset: list of dict of Var -> NDArray - The calibration dataset. + mod: tvm.relay.Module + The module where calibration happens on. + + ctx: tvm.relay.PassContext + The pass context used for calibration. Returns ------- @@ -249,24 +252,52 @@ def _make_const(val): return _expr.bind(graph, const_params) -def realize(graph): - """The realize pass will transform the simulated quantized - graph, which computes with float32 actually, to a real low-bit - integer graph. It will replace the simulated_quantize with - several fine-grained operators like add, multiply, and shift - as more as possible for performance (fusion, etc.) +def annotate(): + """Given a float32 graph, this pass will rewrite the graph and return + a graph which simulates the error brought by the current quantization + scheme. - Parameters - --------- - graph: Function - The simulated graph after calibrating. + Returns + ------- + ret: tvm.relay.Pass + The registered pass for quantization annotation. + """ + return _quantize.QuantizeAnnotate() + + +def realize(): + """The realize pass will transform the simulated quantized graph, which + actually computes with float32, to a real low-bit integer graph. It will + replace the `simulated_quantize` with several fine-grained operators like + add, multiply, and shift as much as possible for better performance. Returns ------- - ret: Function - The graph after realization + ret: tvm.relay.Pass + The registered pass for quantization realization. + """ + return _quantize.QuantizeRealize() + + +def _bind_params(func, params): + """Bind the params to the expression. 
""" - return _quantize.realize(graph) + name_dict = {} + for arg in func.params: + name = arg.name_hint + if name in name_dict: + name_dict[name] = None + else: + name_dict[name] = arg + bind_dict = {} + for k, v in params.items(): + if k not in name_dict: + continue + arg = name_dict[k] + if arg is None: + raise ValueError("Multiple args in the function have name %s" % k) + bind_dict[arg] = _expr.const(v) + return _expr.bind(func, bind_dict) def quantize(graph, params=None, dataset=None): @@ -292,15 +323,29 @@ def quantize(graph, params=None, dataset=None): ret: Function The graph after quantization """ - opt_passes = ["SimplifyInference", - "FoldScaleAxis", - "FoldConstant", - "CanonicalizeOps"] - with _build.build_config(add_pass=opt_passes): - graph = _build.optimize(graph, params=params) - - graph = annotate(graph) - graph = calibrate(graph, dataset) - graph = realize(graph) - graph = _ir_pass.fold_constant(graph) - return graph + if params: + graph = _bind_params(graph, params) + + mod = _module.Module.from_expr(graph) + # Perform "SimplifyInference", "FoldScaleAxis", "FoldConstant", and + # "CanonicalizeOps" optimization before quantization. 
+ optimize = _transform.Sequential([_transform.SimplifyInference(), + _transform.FoldConstant(), + _transform.FoldScaleAxis(), + _transform.CanonicalizeOps(), + _transform.FoldConstant()]) + + calibrate_pass = _transform.function_pass(calibrate, opt_level=1, + name="QuantizeCalibrate") + quantize_seq = _transform.Sequential([annotate(), + calibrate_pass, + realize(), + _transform.FoldConstant()]) + with annotate_context(): + with _transform.PassContext(opt_level=3, + required_pass=["QuantizeAnnotate", + "QuantizeCalibrate", + "QuantizeRealize"]): + mod = optimize(mod) + mod = quantize_seq(mod) + return mod[mod.entry_func.name_hint] diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py index b4a8394e2659..7a5007bbfb8f 100644 --- a/python/tvm/relay/testing/__init__.py +++ b/python/tvm/relay/testing/__init__.py @@ -27,6 +27,8 @@ from . import squeezenet from . import vgg from . import densenet +from . import yolo_detection from .config import ctx_list from .init import create_workload +from .nat import add_nat_definitions, count, make_nat_value, make_nat_expr diff --git a/nnvm/python/nnvm/testing/darknet.py b/python/tvm/relay/testing/darknet.py similarity index 100% rename from nnvm/python/nnvm/testing/darknet.py rename to python/tvm/relay/testing/darknet.py diff --git a/python/tvm/relay/testing/nat.py b/python/tvm/relay/testing/nat.py new file mode 100644 index 000000000000..a76a340f113d --- /dev/null +++ b/python/tvm/relay/testing/nat.py @@ -0,0 +1,184 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Defines a unary natural number (Peano natural number) abstract +data type for Relay and provides some utility functions for it. +Nats are useful for testing purposes, as they make it easy to write +test cases for recursion and pattern matching.""" + +from tvm.relay.adt import Constructor, TypeData, Clause, Match, PatternConstructor, PatternVar +from tvm.relay.backend.interpreter import ConstructorValue +from tvm.relay.expr import Var, Function, GlobalVar +from tvm.relay.ty import GlobalTypeVar, TypeVar, FuncType + +def define_nat_adt(prelude): + """Defines a Peano (unary) natural number ADT. + Zero is represented by z(). s(n) adds 1 to a nat n. + Adds the fields nat, z, and s to the prelude, representing + (respectively) the nat ADT and the z and s constructors. + """ + prelude.nat = GlobalTypeVar("nat") + prelude.z = Constructor("z", [], prelude.nat) + prelude.s = Constructor("s", [prelude.nat()], prelude.nat) + prelude.mod[prelude.nat] = TypeData(prelude.nat, [], [prelude.z, prelude.s]) + + +def define_nat_double(prelude): + """Defines a function that doubles a nat. Adds a field called + 'double' to the prelude, giving the GlobalVar pointing to + the function.
+ """ + prelude.double = GlobalVar("double") + x = Var("x", prelude.nat()) + y = Var("y") + z_case = Clause(PatternConstructor(prelude.z), prelude.z()) + s_case = Clause(PatternConstructor(prelude.s, [PatternVar(y)]), + prelude.s(prelude.s(prelude.double(y)))) + prelude.mod[prelude.double] = Function([x], Match(x, [z_case, s_case])) + + +def define_nat_add(prelude): + """Defines a function that adds two nats and adds a field to the + prelude 'add' giving the GlobalVar pointing to that function. + """ + prelude.add = GlobalVar("add") + x = Var("x", prelude.nat()) + y = Var("y", prelude.nat()) + a = Var("a") + z_case = Clause(PatternConstructor(prelude.z), y) + s_case = Clause(PatternConstructor(prelude.s, [PatternVar(a)]), + prelude.s(prelude.add(a, y))) + prelude.mod[prelude.add] = Function([x, y], Match(x, [z_case, s_case])) + + +# versions of prelude functions that use nats instead of scalars + +def define_nat_nth(prelude): + """Defines a function to get the nth element of a list using + a nat to index into the list. + + nat_nth(l, n): fun(list[a], nat) -> a + """ + prelude.nat_nth = GlobalVar("nat_nth") + a = TypeVar("a") + x = Var("x", prelude.l(a)) + n = Var("n", prelude.nat()) + y = Var("y") + + z_case = Clause(PatternConstructor(prelude.z), prelude.hd(x)) + s_case = Clause(PatternConstructor(prelude.s, [PatternVar(y)]), + prelude.nat_nth(prelude.tl(x), y)) + + prelude.mod[prelude.nat_nth] = Function([x, n], + Match(n, [z_case, s_case]), + a, [a]) + + +def define_nat_update(prelude): + """Defines a function to update the nth element of a list and return the updated list.
+ + nat_update(l, i, v) : fun(list[a], nat, a) -> list[a] + """ + prelude.nat_update = GlobalVar("nat_update") + a = TypeVar("a") + # pylint: disable=invalid-name + l = Var("l", prelude.l(a)) + n = Var("n", prelude.nat()) + v = Var("v", a) + y = Var("y") + + z_case = Clause(PatternConstructor(prelude.z), + prelude.cons(v, prelude.tl(l))) + s_case = Clause(PatternConstructor(prelude.s, [PatternVar(y)]), + prelude.cons( + prelude.hd(l), + prelude.nat_update(prelude.tl(l), y, v))) + + prelude.mod[prelude.nat_update] = Function([l, n, v], + Match(n, [z_case, s_case]), + prelude.l(a), [a]) + + +def define_nat_iterate(prelude): + """Defines a function that takes a number n and a function f; + returns a closure that takes an argument and applies f + n times to its argument. + + Signature: fn(fn(a) -> a, nat) -> fn(a) -> a + """ + prelude.nat_iterate = GlobalVar("nat_iterate") + a = TypeVar("a") + f = Var("f", FuncType([a], a)) + x = Var("x", prelude.nat()) + y = Var("y", prelude.nat()) + + z_case = Clause(PatternConstructor(prelude.z), prelude.id) + s_case = Clause(PatternConstructor(prelude.s, [PatternVar(y)]), + prelude.compose(f, prelude.nat_iterate(f, y))) + + prelude.mod[prelude.nat_iterate] = Function([f, x], + Match(x, [z_case, s_case]), + FuncType([a], a), + [a]) + + +def add_nat_definitions(prelude): + """Given a Relay prelude, adds a Peano nat ADT, as well as functions + for adding nats and doubling nats. It also adds versions of + update, nth, and iterate that take nats instead of scalars (the + names are prefixed with 'nat_').""" + define_nat_adt(prelude) + define_nat_double(prelude) + define_nat_add(prelude) + define_nat_nth(prelude) + define_nat_update(prelude) + define_nat_iterate(prelude) + + +# helper functions for working with nats + + +def count(prelude, n): + """Takes a ConstructorValue corresponding to a nat ADT + and converts it into a Python integer. This is an example of + using an ADT value in Python. 
+ """ + assert isinstance(n, ConstructorValue) + if n.tag == prelude.z.tag: + return 0 + assert n.tag == prelude.s.tag + return 1 + count(prelude, n.fields[0]) + + +def make_nat_value(prelude, n): + """The inverse of count(): Given a non-negative Python integer, + constructs a ConstructorValue representing that value as a nat. + """ + if n == 0: + return ConstructorValue(prelude.z.tag, [], None, []) + return ConstructorValue(prelude.s.tag, [make_nat_value(prelude, n - 1)], None, []) + + +def make_nat_expr(prelude, n): + """Given a non-negative Python integer, constructs a Python + expression representing that integer's value as a nat. + """ + assert n >= 0 + ret = prelude.z() + while n > 0: + ret = prelude.s(ret) + n = n - 1 + return ret diff --git a/python/tvm/relay/testing/tf.py b/python/tvm/relay/testing/tf.py index d82ed0f46097..a56e6fe1782d 100644 --- a/python/tvm/relay/testing/tf.py +++ b/python/tvm/relay/testing/tf.py @@ -163,13 +163,10 @@ def get_workload_official(model_url, model_sub_path): model_sub_path: Sub path in extracted tar for the ftozen protobuf file. - temp_dir: TempDirectory - The temporary directory object to download the content. - Returns ------- - graph_def: graphdef - graph_def is the tensorflow workload for mobilenet. + model_path: str + Full path to saved model file """ @@ -200,7 +197,7 @@ def get_workload(model_path, model_sub_path=None): Returns ------- graph_def: graphdef - graph_def is the tensorflow workload for mobilenet. + graph_def is the tensorflow workload. 
""" diff --git a/nnvm/python/nnvm/testing/yolo_detection.py b/python/tvm/relay/testing/yolo_detection.py similarity index 100% rename from nnvm/python/nnvm/testing/yolo_detection.py rename to python/tvm/relay/testing/yolo_detection.py diff --git a/python/tvm/relay/transform.py b/python/tvm/relay/transform.py new file mode 100644 index 000000000000..5f47e5b446aa --- /dev/null +++ b/python/tvm/relay/transform.py @@ -0,0 +1,725 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +""" +Relay pass transformation infrastructure. +""" +import types +import inspect +import functools + +from tvm._ffi.runtime_ctypes import TVMContext +from . import _transform +from .base import RelayNode, register_relay_node +from .. import nd as _nd + + +@register_relay_node +class PassInfo(RelayNode): + """The class that contains the meta data required by a pass. It is the + container of information needed by running an optimization or analysis. + This class can be extended by adding new members when more meta data is + needed. + + Parameters + ---------- + opt_level : int + The optimization level of this pass. + + name : str + The pass name. + + required : List[str] + The list of passes that are required by a certain pass. 
+ """ + + def __init__(self, opt_level, name, required=None): + self.__init_handle_by_constructor__( + _transform.PassInfo, opt_level, name, required) + + +@register_relay_node +class PassContext(RelayNode): + """The basis where a Relay optimization/analysis runs on. + Each pass context contains a number of auxiliary information that is used + to help an optimization pass. Such information includes the error reporter + to record the errors of during the optimization, etc. + + opt_level : Optional[int] + The optimization level of this pass. + + fallback_device : Optional[Union[int, str, TVMContext]] + The fallback device type. It is also used as the default device for + operators that are not annotated during heterogeneous execution. + + required_pass : Optional[Union[List[str], Set[str], Tuple[str]]] + The list of passes that are required by a certain pass. + + disabled_pass : Optional[Union[List[str], Set[str], Tuple[str]]] + The list of passes that are disabled. + """ + def __init__(self, + opt_level=2, + fallback_device=_nd.cpu(), + required_pass=None, + disabled_pass=None): + if isinstance(fallback_device, str): + fallback_device = _nd.context(fallback_device).device_type + elif isinstance(fallback_device, TVMContext): + fallback_device = fallback_device.device_type + if not isinstance(fallback_device, int): + raise TypeError("required_pass is expected to be the type of " + + "int/str/TVMContext.") + + required = list(required_pass) if required_pass else [] + if not isinstance(required, (list, tuple)): + raise TypeError("required_pass is expected to be the type of " + + "list/tuple/set.") + + disabled = list(disabled_pass) if disabled_pass else [] + if not isinstance(disabled, (list, tuple)): + raise TypeError("disabled_pass is expected to be the type of " + + "list/tuple/set.") + + self.__init_handle_by_constructor__(_transform.PassContext, opt_level, + fallback_device, required, + disabled) + + def __enter__(self): + _transform.EnterPassContext(self) + return 
self + + def __exit__(self, ptype, value, trace): + _transform.ExitPassContext(self) + + @staticmethod + def current(): + """Return the current pass context.""" + return _transform.GetCurrentPassContext() + + +def build_config(opt_level=2, + fallback_device=_nd.cpu(), + required_pass=None, + disabled_pass=None): + """Configure the build behavior by setting config variables. + + Parameters + ---------- + opt_level: int, optional + Optimization level. The optimization pass name and level are as the + following: + + .. code-block:: python + + OPT_PASS_LEVEL = { + "SimplifyInference": 0, + "OpFusion": 1, + "FoldConstant": 2, + "CombineParallelConv2D": 3, + "FoldScaleAxis": 3, + "AlterOpLayout": 3, + "CanonicalizeOps": 3, + "EliminateCommonSubexpr": 3, + } + + fallback_device : int, str, or tvm.TVMContext, optional + The fallback device. It is also used as the default device for + operators without specified device during heterogeneous execution. + + required_pass: set of str, optional + Optimization passes that are required regardless of optimization level. + + disabled_pass: set of str, optional + Optimization passes to be disabled during optimization. + + Returns + ------- + pass_context: PassContext + The pass context for optimizations. + """ + return PassContext(opt_level, fallback_device, required_pass, + disabled_pass) + + +@register_relay_node +class Pass(RelayNode): + """The base class of all passes. All methods here are just simple wrappers + that are implemented in the backend. They are defined for users to + conveniently interact with the base class. + """ + + @property + def info(self): + """Get the pass meta.""" + return _transform.Info(self) + + def __call__(self, mod): + """Execute the pass. Note that for sequential pass, the dependency among + different passes will be resolved in the backend. + + Parameters + ---------- + mod : tvm.relay.Module + The module that a certain optimization is performed on. 
+ + Returns + ------- + mod : tvm.relay.Module + The updated module after applying this pass. + """ + return _transform.RunPass(self, mod) + + +@register_relay_node +class ModulePass(Pass): + """A pass that works on tvm.relay.Module. Users don't need to interact with + this class directly. Instead, a module pass should be created through + `module_pass`, because the design of the `module_pass` API is flexible + enough to handle the creation of a module pass in different manners. In + addition, all members of a module pass can be accessed from the base class. + The same rule applies to FunctionPass as well. + """ + + +@register_relay_node +class FunctionPass(Pass): + """A pass that works on each tvm.relay.Function in a module. A function + pass class should be created through `function_pass`. + """ + + +@register_relay_node +class Sequential(Pass): + """A pass that works on a sequence of pass objects. Multiple passes can be + executed sequentially using this class. + + Some typical usage of the sequential pass are: + 1. Users provide a list of passes for optimization. + 2. Only an optimization level is provided so that the backend system has + to glob all passes at this level and below to perform the optimizations. + Note that users can also provide a series of passes that they don't want to + apply when running a sequential pass. Pass dependency will be resolved in + the backend as well. + + Parameters + ---------- + passes : Optional[List[Pass]] + A sequence of passes candidate for optimization. + + opt_level : Optional[int] + The optimization level of this sequential pass. + + name : Optional[str] + The name of the sequential pass. + + required : Optional[List[str]] + The list of passes that the sequential pass is dependent on. 
+ """ + + def __init__(self, + passes=None, + opt_level=2, + name="sequential", + required=None): + passes = passes if passes else [] + if not isinstance(passes, (list, tuple)): + raise TypeError("passes must be a list of Pass objects.") + + required = required if required else [] + if not isinstance(required, (list, tuple)): + raise TypeError("Required is expected to be the type of list/tuple.") + + self.__init_handle_by_constructor__(_transform.Sequential, + passes, opt_level, name, required) + + +def InferType(): + """Infer the type of an expr. + + Returns + ------- + ret : tvm.relay.Pass + The registered type inference pass. + """ + return _transform.InferType() + + +def FoldScaleAxis(): + """Fold the scaling of axis into weights of conv2d/dense. This pass will + invoke both forward and backward scale folding. + + Returns + ------- + ret : tvm.relay.Pass + The registered pass to fold expressions. + + Note + ---- + Internally, we will call backward_fold_scale_axis before using + forward_fold_scale_axis. As backward folding targets common conv-bn + pattern. + """ + return _transform.FoldScaleAxis() + + +def SimplifyInference(): + """Simplify the data-flow graph for inference phase. An simplified expression + which is semantically equal to the input expression will be returned. + + Returns + ------- + ret: tvm.relay.Pass + The registered to perform operator simplification. + """ + return _transform.SimplifyInference() + + +def CanonicalizeOps(): + """ Canonicalize special operators to basic operators. + This can simplify followed analysis. (e.g. expanding bias_add to + expand_dims and broadcast_add.) + + Returns + ------- + ret: tvm.relay.Pass + The registered pass performing the canonicalization. + """ + return _transform.CanonicalizeOps() + + +def DeadCodeElimination(): + """ Remove expressions which does not effect the program result (dead code). + + Returns + ------- + ret: tvm.relay.Pass + The registered pass that eliminates the dead code in a Relay program. 
+ """ + return _transform.DeadCodeElimination() + + +def FoldConstant(): + """Fold the constant expression in expr. + + Returns + ------- + ret : tvm.relay.Pass + The registered pass for constant folding. + """ + return _transform.FoldConstant() + + +def FuseOps(fuse_opt_level=-1): + """Fuse operators in an expr to a larger operator according to some rules. + + Parameters + ---------- + fuse_opt_level : int + The level of fuse optimization. -1 indicates that the level will be + inferred from pass context. + + Returns + ------- + ret : tvm.relay.Pass + The registered pass for operator fusion. + """ + return _transform.FuseOps(fuse_opt_level) + + +def CombineParallelConv2D(min_num_branches=3): + """Combine multiple conv2d operators into one. + + Parameters + ---------- + min_num_branches : int + The minimum number of required parallel branches for performing this + optimization. + + Returns + ------- + ret: tvm.relay.Pass + The registered pass that combines parallel conv2d operators. + """ + return _transform.CombineParallelConv2D(min_num_branches) + + +def AlterOpLayout(): + """Alternate the layouts of operators or replace primitive operators with + other expressions. + This pass can be used for computing convolution in custom layouts or + other general weight pre-transformation. + + Returns + ------- + ret : tvm.relay.Pass + The registered pass that alters the layout of operators. + """ + return _transform.AlterOpLayout() + + +def RewriteAnnotatedOps(fallback_device): + """Rewrite the annotated program where annotation operators, e.g. + `on_deivce`, mark which device an expression should be scheduled to. + This pass helps heterogeneous execution where different operators may need + to be allocated on various devices. + + Parameters + ---------- + fallback_device : int + The fallback device type. It is also used as the default device for + operators with no annotated device. 
+ + Returns + ------- + ret: tvm.relay.Pass + The registered pass that rewrites an expression with annotated + `on_device` operators. + """ + return _transform.RewriteDeviceAnnotation(fallback_device) + + +def ToANormalForm(): + """Turn Graph Normal Form expression into A Normal Form Expression. + The scope of the root expression is the global scope. + The scope of any non root expression is the least common ancestor of all it's scope. + Values are ordered by post-DFS order in each scope. + + Returns + ------- + ret: tvm.relay.Pass + The registered pass that transforms an expression into A Normal Form. + """ + return _transform.ToANormalForm() + +def EtaExpand(): + """Add abstraction over a function + + Returns + ------- + ret: tvm.relay.Pass + The registered pass that eta expands an expression. + """ + return _transform.EtaExpand() + +def ToGraphNormalForm(): + """Turn A Normal Form expression into Graph Normal Form expression + + Returns + ------- + ret : tvm.relay.Pass + The registered pass that transforms an expression into Graph Normal Form. + """ + return _transform.ToGraphNormalForm() + + +def EliminateCommonSubexpr(fskip=None): + """Eliminate common subexpressions. + + Parameters + ---------- + fskip: Callable + The callback function that decides whether an expression should be + skipped. + + Returns + ------- + ret : tvm.relay.Pass + The registered pass that eliminates common subexpressions. + """ + return _transform.EliminateCommonSubexpr(fskip) + + +def PartialEvaluate(): + """Evaluate the static fragment of the code. + + Returns + ------- + ret : tvm.relay.Pass + The registered pass that performs partial evaluation on an expression. + """ + return _transform.PartialEvaluate() + +def CanonicalizeCast(): + """ + Canonicalize cast expressions to make operator fusion more efficient. + + Returns + ------- + ret : tvm.relay.Pass + The registered pass that canonicalizes cast expression. 
+ """ + return _transform.CanonicalizeCast() + +def _wrap_class_module_pass(pass_cls, pass_info): + """Wrap a python class as a module pass""" + class PyModulePass(ModulePass): + """Internal wrapper class to create a class instance.""" + def __init__(self, *args, **kwargs): + # initialize handle in case pass_cls creation fails + self.handle = None + inst = pass_cls(*args, **kwargs) + # it is important not to capture self to + # avoid a cyclic dependency + def _pass_func(mod, ctx): + return inst.transform_module(mod, ctx) + self.__init_handle_by_constructor__( + _transform.MakeModulePass, _pass_func, pass_info) + self._inst = inst + + def __getattr__(self, name): + # fall back to instance attribute if there is not any + return self._inst.__getattribute__(name) + + functools.update_wrapper(PyModulePass.__init__, pass_cls.__init__) + PyModulePass.__name__ = pass_cls.__name__ + PyModulePass.__doc__ = pass_cls.__doc__ + PyModulePass.__module__ = pass_cls.__module__ + return PyModulePass + + +def module_pass(pass_func=None, opt_level=None, name=None, required=None): + """Decorate a module pass. + + This function returns a callback when pass_func is provided. + Otherwise, it serves as a decorator function. + + pass_func can also be a class type with a method transform_module. + This function will create a decorated ModulePass using transform_module + as the pass function. + + Parameters + ---------- + pass_func : Optional[Callable[(Module, PassContext) -> Module]] + The transformation function or class. + + opt_level : int + The optimization level of this module pass. + + name : Optional[str] + The name of the module pass. The name could be empty. In this case, the + name of the optimization function will be used as the pass name. + + required : Optional[List[str]] + The list of passes that the module pass is dependent on.
+ + Returns + ------- + create_module_pass : Union[Callable, ModulePass] + A decorator will be returned if pass_func is not provided, + otherwise return the decorated result. + The returned decorator has two behaviors depending on the input: + A new ModulePass will be returned when we decorate a pass function. + A new ModulePass class will be returned when we decorate a class type. + + Examples + -------- + The following code block decorates a module pass class. + + .. code-block:: python + + @relay.transform.module_pass + class CustomPipeline: + def __init__(self, enable_fold): + self.enable_fold = enable_fold + self.cse = relay.transform.EliminateCommonSubexpr() + self.const_fold = relay.transform.FoldConstant() + + def transform_module(self, mod, ctx): + mod = self.cse(mod, ctx) + if self.enable_fold: + mod = self.const_fold(mod, ctx) + return mod + + # create an instance of customized pipeline + pipeline = CustomPipeline(enable_fold=False) + assert isinstance(pipeline, transform.ModulePass) + # run the pipeline. + output_module = pipeline(input_module) + + The following code creates a module pass by decorating + a user defined transform function. + + .. code-block:: python + + @relay.transform.module_pass(opt_level=2) + def transform(mod, ctx): + tp = relay.TensorType((10,), "float32") + x = relay.var("x", tp) + gv = relay.GlobalVar("var") + func = relay.Function([x], relay.abs(x)) + new_mod = relay.Module({gv: func}) + new_mod.update(mod) + return new_mod + + module_pass = transform + assert isinstance(module_pass, transform.ModulePass) + assert module_pass.info.opt_level == 2 + + # Given a module m, the optimization could be invoked as the following: + updated_mod = module_pass(m) + # Now a function abs should be added to the module m.
+ """ + if opt_level is None: + raise ValueError("Please provide opt_level for the module pass.") + + required = required if required else [] + if not isinstance(required, (list, tuple)): + raise TypeError("Required is expected to be the type of " + + "list/tuple.") + + def create_module_pass(pass_arg): + """Internal function that creates a module pass""" + fname = name if name else pass_arg.__name__ + info = PassInfo(opt_level, fname, required) + if inspect.isclass(pass_arg): + return _wrap_class_module_pass(pass_arg, info) + if not isinstance(pass_arg, (types.FunctionType, types.LambdaType)): + raise TypeError("pass_func must be a callable for Module pass") + return _transform.MakeModulePass(pass_arg, info) + + if pass_func: + return create_module_pass(pass_func) + return create_module_pass + + +def _wrap_class_function_pass(pass_cls, pass_info): + """Wrap a python class as function pass""" + class PyFunctionPass(FunctionPass): + """Internal wrapper class to create a class instance.""" + def __init__(self, *args, **kwargs): + # initialize handle in case pass_cls creation fails + self.handle = None + inst = pass_cls(*args, **kwargs) + # it is important not to capture self to + # avoid a cyclic dependency + def _pass_func(func, mod, ctx): + return inst.transform_function(func, mod, ctx) + self.__init_handle_by_constructor__( + _transform.MakeFunctionPass, _pass_func, pass_info) + self._inst = inst + + def __getattr__(self, name): + # fall back to instance attribute if there is not any + return self._inst.__getattribute__(name) + + functools.update_wrapper(PyFunctionPass.__init__, pass_cls.__init__) + PyFunctionPass.__name__ = pass_cls.__name__ + PyFunctionPass.__doc__ = pass_cls.__doc__ + PyFunctionPass.__module__ = pass_cls.__module__ + return PyFunctionPass + + +def function_pass(pass_func=None, opt_level=None, name=None, required=None): + """Decorate a function pass. + + This function returns a callback when pass_func + is provided.
Otherwise, it returns the created function pass using the + given optimization function. + + Parameters + ---------- + pass_func : Optional[Callable[(Function, Module, PassContext) -> Function]] + The transformation function or class. + + opt_level : int + The optimization level of this module pass. + + name : Optional[str] + The name of the function pass. The name could be empty. In this case, the + name of the optimization function will be used as the pass name. + + required : Optional[List[str]] + The list of passes that the module pass is dependent on. + + Returns + ------- + create_function_pass : Union[Callable, FunctionPass] + + A decorator will be returned if pass_func is not provided, + otherwise return the decorated result. + The returned decorator has two behaviors depending on the input: + A new FunctionPass will be returned when we decorate a pass function. + A new FunctionPass class will be returned when we decorate a class type. + + Examples + -------- + The following code block decorates a function pass class. + + .. code-block:: python + + @relay.transform.function_pass(opt_level=1) + class TestReplaceFunc: + def __init__(self, new_func): + self.new_func = new_func + + def transform_function(self, func, mod, ctx): + # just for demo purposes + # transform func to new_func + return self.new_func + + x = relay.var("x", shape=(10, 20)) + f1 = relay.Function([x], x) + f2 = relay.Function([x], relay.log(x)) + # fpass is now a special pass that replaces every + # function to f1 + fpass = TestReplaceFunc(f1) + # now every function in input_mod is replaced by f1 + res_mod = fpass(input_mod) + + + The following code creates a function pass by decorating + a user defined transform function. + + .. code-block:: python + + @relay.transform.function_pass(opt_level=2) + def transform(func, mod, ctx): + # my transformations here. 
+ return func + + function_pass = transform + assert isinstance(function_pass, transform.FunctionPass) + assert function_pass.info.opt_level == 2 + + # Given a module m, the optimization could be invoked as the following: + updated_mod = function_pass(m) + # Now constant folding should have been applied to every function in + # the provided module m. And the updated module will be returned. + """ + + if opt_level is None: + raise ValueError("Please provide opt_level for the function pass.") + + required = required if required else [] + if not isinstance(required, (list, tuple)): + raise TypeError("Required is expected to be the type of " + + "list/tuple.") + + def create_function_pass(pass_arg): + """Internal function that creates a function pass""" + fname = name if name else pass_arg.__name__ + info = PassInfo(opt_level, fname, required) + if inspect.isclass(pass_arg): + return _wrap_class_function_pass(pass_arg, info) + if not isinstance(pass_arg, (types.FunctionType, types.LambdaType)): + raise TypeError("pass_func must be a callable for Function pass") + return _transform.MakeFunctionPass(pass_arg, info) + + if pass_func: + return create_function_pass(pass_func) + return create_function_pass diff --git a/python/tvm/relay/transform.pyi b/python/tvm/relay/transform.pyi new file mode 100644 index 000000000000..343e89976b09 --- /dev/null +++ b/python/tvm/relay/transform.pyi @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import tvm +from .base import NodeBase + + +class PassContext(NodeBase): + def __init__(self): + ... + +class PassInfo(NodeBase): + name = ... # type: str + opt_level = ... # type: int + required = ... # type: list + + def __init__(self, name, opt_level, required): + ... # type: (str, int, list) -> None + + +class Pass(NodeBase): + def __init__(self): + ... + + +class ModulePass(Pass): + name = ... # type: str + opt_level = ... # type: int + pass_func = ... # type: Callable + required = ... # type: list + + def __init__(self, name, opt_level, pass_func, required): + # type: (str, int, Callable, list) -> None + ... + + +class FunctionPass(Pass): + name = ... # type: str + opt_level = ... # type: int + pass_func = ... # type: Callable + required = ... # type: list + + def __init__(self, name, opt_level, pass_func, required): + # type: (str, int, Callable, list) -> None + ... + + +class Sequential(Pass): + name = ... # type: str + opt_level = ... # type: int + passes = ... # type: list + required = ... # type: list + disabled = ... # type: list + + def __init__(self, name, opt_level, passes, required, disabled): + # type: (str, int, list, list, list) -> None + ...
diff --git a/python/tvm/target.py b/python/tvm/target.py index eff0088b37ce..828fff8e228c 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -133,7 +133,7 @@ def __enter__(self): return self def __exit__(self, ptype, value, trace): - _api_internal._ExitTargetScope() + _api_internal._ExitTargetScope(self) @register_node diff --git a/python/tvm/tensor.py b/python/tvm/tensor.py index ce7cbae385d9..db8fb272a551 100644 --- a/python/tvm/tensor.py +++ b/python/tvm/tensor.py @@ -147,7 +147,7 @@ def output(self, index): @property def num_outputs(self): - """Number of outputs of this op.""" + """Number of outputs from this op.""" return _api_internal._OpNumOutputs(self) @property @@ -166,7 +166,7 @@ class BaseComputeOp(Operation): """Compute operation.""" @property def axis(self): - """Represent axis of IterVar, defined when it is a ComputeOp""" + """Represent the IterVar axis, defined when it is a ComputeOp""" return self.__getattr__("axis") @property @@ -191,7 +191,7 @@ class ScanOp(Operation): """Scan operation.""" @property def scan_axis(self): - """Represent axis of scan, only defined when it is a ScanOp""" + """Represent the scan axis, only defined when it is a ScanOp""" return self.__getattr__("scan_axis") @@ -205,7 +205,7 @@ class HybridOp(Operation): """Hybrid operation.""" @property def axis(self): - """Represent axis of IterVar, also defined when it is a HybridOp""" + """Represent the IterVar axis, also defined when it is a HybridOp""" return self.__getattr__("axis") diff --git a/python/tvm/tensor_intrin.py b/python/tvm/tensor_intrin.py index f97e6b7579a1..2ef7a4bbb23e 100644 --- a/python/tvm/tensor_intrin.py +++ b/python/tvm/tensor_intrin.py @@ -50,20 +50,23 @@ class TensorIntrin(NodeBase): decl_tensor_intrin: Construct a TensorIntrin """ def __call__(self, *args, **kwargs): - tensors = [x.tensor for x in args] - regions = [_get_region(x) for x in args] + tensors = [x.tensor for x in args if isinstance(x, _tensor.TensorSlice)] + scalar_inputs = [x 
for x in args if not isinstance(x, _tensor.TensorSlice)] + regions = [_get_region(x) for x in args if isinstance(x, _tensor.TensorSlice)] reduce_axis = [] if "reduce_axis" in kwargs: reduce_axis = kwargs["reduce_axis"] if not isinstance(reduce_axis, (list, tuple)): reduce_axis = [reduce_axis] reduce_axis = _api.convert(reduce_axis) - return _api_internal._TensorIntrinCall(self, tensors, regions, reduce_axis) + if scalar_inputs: + scalar_inputs = _api.convert(scalar_inputs) + return _api_internal._TensorIntrinCall(self, tensors, regions, reduce_axis, scalar_inputs) def decl_tensor_intrin(op, fcompute, name="tensor_intrin", - binds=None): + binds=None, scalar_params=None): """Declare a tensor intrinsic function. Parameters @@ -96,6 +99,9 @@ def decl_tensor_intrin(op, requirement of the function. By default, a new compact buffer is created for each tensor in the argument. + scalar_params: a list of variables used by op, whose values will be passed + as scalar_inputs when the tensor intrinsic is called. 
+ Returns ------- intrin: TensorIntrin @@ -122,11 +128,15 @@ def decl_tensor_intrin(op, offset_factor=cfg.offset_factor)) binds_list.append(buf) - body = fcompute(binds_list[:len(inputs)], binds_list[len(inputs):]) + if scalar_params: + body = fcompute(binds_list[:len(inputs)], binds_list[len(inputs):], scalar_params) + else: + body = fcompute(binds_list[:len(inputs)], binds_list[len(inputs):]) + scalar_params = [] if isinstance(body, (_expr.Expr, _stmt.Stmt)): body = [body] body = [_make.Evaluate(x) if isinstance(x, _expr.Expr) else x for x in body] if len(body) < 3: body += [None] * (3 - len(body)) return _api_internal._TensorIntrin( - name, op, inputs, binds_list, *body) + name, op, inputs, binds_list, scalar_params, *body) diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 25466e08bdf9..02e2c7c67c99 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -18,8 +18,10 @@ [workspace] members = [ "common", + "macros", "runtime", "runtime/tests/test_tvm_basic", + "runtime/tests/test_tvm_dso", "runtime/tests/test_nnvm", "frontend", "frontend/tests/basics", diff --git a/rust/common/build.rs b/rust/common/build.rs index 5dac99ec54bb..919e0adc46c8 100644 --- a/rust/common/build.rs +++ b/rust/common/build.rs @@ -22,23 +22,30 @@ extern crate bindgen; use std::path::PathBuf; fn main() { + let tvm_home = option_env!("TVM_HOME").map(str::to_string).unwrap_or({ + let tvm_home = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .canonicalize() + .unwrap(); + tvm_home + .parent() + .unwrap() + .parent() + .unwrap() + .to_str() + .unwrap() + .to_string() + }); if cfg!(feature = "bindings") { println!("cargo:rerun-if-env-changed=TVM_HOME"); println!("cargo:rustc-link-lib=dylib=tvm_runtime"); - println!("cargo:rustc-link-search={}/build", env!("TVM_HOME")); + println!("cargo:rustc-link-search={}/build", tvm_home); } // @see rust-bindgen#550 for `blacklist_type` bindgen::Builder::default() - .header(format!( - "{}/include/tvm/runtime/c_runtime_api.h", - env!("TVM_HOME") - )) - 
.header(format!( - "{}/include/tvm/runtime/c_backend_api.h", - env!("TVM_HOME") - )) - .clang_arg(format!("-I{}/3rdparty/dlpack/include/", env!("TVM_HOME"))) + .header(format!("{}/include/tvm/runtime/c_runtime_api.h", tvm_home)) + .header(format!("{}/include/tvm/runtime/c_backend_api.h", tvm_home)) + .clang_arg(format!("-I{}/3rdparty/dlpack/include/", tvm_home)) .blacklist_type("max_align_t") .layout_tests(false) .derive_partialeq(true) diff --git a/rust/macros/Cargo.toml b/rust/macros/Cargo.toml new file mode 100644 index 000000000000..15773b625be9 --- /dev/null +++ b/rust/macros/Cargo.toml @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "tvm-macros" +version = "0.1.0" +license = "Apache-2.0" +description = "Proc macros used by the TVM crates." 
+repository = "https://github.com/dmlc/tvm" +readme = "README.md" +keywords = ["tvm"] +authors = ["TVM Contributors"] +edition = "2018" + +[lib] +proc-macro = true + +[dependencies] +goblin = "0.0.22" +proc-macro2 = "0.4" +proc-quote = "0.2" +syn = "0.15" diff --git a/rust/macros/src/lib.rs b/rust/macros/src/lib.rs new file mode 100644 index 000000000000..704f7c1de58b --- /dev/null +++ b/rust/macros/src/lib.rs @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#![feature(bind_by_move_pattern_guards, proc_macro_span)] + +extern crate proc_macro; + +use std::{fs::File, io::Read}; + +use proc_quote::quote; + +#[proc_macro] +pub fn import_module(input: proc_macro::TokenStream) -> proc_macro::TokenStream { + let obj_file_path = syn::parse_macro_input!(input as syn::LitStr); + + let mut path = obj_file_path.span().unwrap().source_file().path(); + path.pop(); // remove the filename + path.push(obj_file_path.value()); + + let mut fd = File::open(&path) + .unwrap_or_else(|_| panic!("Unable to find TVM object file at `{}`", path.display())); + let mut buffer = Vec::new(); + fd.read_to_end(&mut buffer).unwrap(); + + let fn_names = match goblin::Object::parse(&buffer).unwrap() { + goblin::Object::Elf(elf) => elf + .syms + .iter() + .filter_map(|s| { + if s.st_type() == 0 || goblin::elf::sym::type_to_str(s.st_type()) == "FILE" { + return None; + } + match elf.strtab.get(s.st_name) { + Some(Ok(name)) if name != "" => { + Some(syn::Ident::new(name, proc_macro2::Span::call_site())) + } + _ => None, + } + }) + .collect::>(), + goblin::Object::Mach(goblin::mach::Mach::Binary(obj)) => { + obj.symbols() + .filter_map(|s| match s { + Ok((name, nlist)) + if nlist.is_global() + && nlist.n_sect != 0 + && !name.ends_with("tvm_module_ctx") => + { + Some(syn::Ident::new( + if name.starts_with('_') { + // Mach objects prepend a _ to globals. + &name[1..] + } else { + &name + }, + proc_macro2::Span::call_site(), + )) + } + _ => None, + }) + .collect::>() + } + _ => panic!("Unsupported object format."), + }; + + let extern_fns = quote! { + mod ext { + extern "C" { + #( + pub(super) fn #fn_names( + args: *const tvm_runtime::ffi::TVMValue, + type_codes: *const std::os::raw::c_int, + num_args: std::os::raw::c_int + ) -> std::os::raw::c_int; + )* + } + } + }; + + let fns = quote! 
{ + use tvm_runtime::{ffi::TVMValue, TVMArgValue, TVMRetValue, FuncCallError}; + #extern_fns + + #( + pub fn #fn_names(args: &[TVMArgValue]) -> Result { + let (values, type_codes): (Vec, Vec) = args + .into_iter() + .map(|arg| { + let (val, code) = arg.to_tvm_value(); + (val, code as i32) + }) + .unzip(); + let exit_code = unsafe { + ext::#fn_names(values.as_ptr(), type_codes.as_ptr(), values.len() as i32) + }; + if exit_code == 0 { + Ok(TVMRetValue::default()) + } else { + Err(FuncCallError::get_with_context(stringify!(#fn_names).to_string())) + } + } + )* + }; + + proc_macro::TokenStream::from(fns) +} diff --git a/rust/runtime/Cargo.toml b/rust/runtime/Cargo.toml index 8e70565a6c13..3c81a93c9bbf 100644 --- a/rust/runtime/Cargo.toml +++ b/rust/runtime/Cargo.toml @@ -41,7 +41,11 @@ nom = {version = "4.0.0", default-features = false } serde = "1.0.59" serde_derive = "1.0.79" serde_json = "1.0.17" -tvm-common = { version = "0.1.0", path = "../common/" } +tvm-common = { version = "0.1", path = "../common" } +tvm-macros = { version = "0.1", path = "../macros" } [target.'cfg(not(target_env = "sgx"))'.dependencies] num_cpus = "1.8.0" + +[target.'cfg(not(any(target_arch = "wasm32", target_env = "sgx")))'.dependencies] +libloading = "0.5" diff --git a/rust/runtime/src/graph.rs b/rust/runtime/src/graph.rs index bff02f504a5e..cacd7a38a97f 100644 --- a/rust/runtime/src/graph.rs +++ b/rust/runtime/src/graph.rs @@ -164,7 +164,7 @@ impl<'a> TryFrom<&'a str> for Graph { /// ``` pub struct GraphExecutor<'m, 't> { graph: Graph, - op_execs: Vec>, + op_execs: Vec>, tensors: Vec>, } @@ -240,7 +240,7 @@ impl<'m, 't> GraphExecutor<'m, 't> { graph: &Graph, lib: &'m M, tensors: &Vec>, - ) -> Result>, Error> { + ) -> Result>, Error> { ensure!(graph.node_row_ptr.is_some(), "Missing node_row_ptr."); let node_row_ptr = graph.node_row_ptr.as_ref().unwrap(); @@ -279,7 +279,7 @@ impl<'m, 't> GraphExecutor<'m, 't> { }) .collect::, Error>>() .unwrap(); - let op: Box = box move || { + let op: Box = 
box move || { let args = dl_tensors .iter() .map(|t| t.into()) diff --git a/rust/runtime/src/lib.rs b/rust/runtime/src/lib.rs index c774d5bbc983..010fbf7d6a29 100644 --- a/rust/runtime/src/lib.rs +++ b/rust/runtime/src/lib.rs @@ -29,7 +29,6 @@ //! For examples of use, please refer to the multi-file tests in the `tests` directory. #![feature( - alloc, allocator_api, box_syntax, fn_traits, @@ -77,6 +76,7 @@ pub use tvm_common::{ packed_func::{self, *}, TVMArgValue, TVMRetValue, }; +pub use tvm_macros::import_module; pub use self::{array::*, errors::*, graph::*, module::*, threading::*, workspace::*}; diff --git a/rust/runtime/src/module/dso.rs b/rust/runtime/src/module/dso.rs new file mode 100644 index 000000000000..3442fad13bf9 --- /dev/null +++ b/rust/runtime/src/module/dso.rs @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +use std::{ + cell::RefCell, + collections::HashMap, + ffi::CStr, + os::raw::{c_char, c_int, c_void}, + pin::Pin, +}; + +use tvm_common::{ffi::BackendPackedCFunc, packed_func::PackedFunc}; + +use crate::{ + threading::{TVMBackendParallelBarrier, TVMBackendParallelLaunch}, + workspace::{TVMBackendAllocWorkspace, TVMBackendFreeWorkspace}, + TVMAPISetLastError, +}; + +use super::Module; + +const TVM_MAIN: &'static [u8] = b"__tvm_main__"; +const TVM_MODULE_CTX: &'static [u8] = b"__tvm_module_ctx"; + +/// A module backed by a Dynamic Shared Object (dylib). +pub struct DsoModule<'a> { + lib: libloading::Library, + packed_funcs: RefCell>, + _pin: std::marker::PhantomPinned, +} + +macro_rules! init_context_func { + ($lib:ident, $( ($fn:ident, $sig:ty) ),+ $(,)?) => { + unsafe { + $( + let fn_ptr = $lib.get::<*mut $sig>(concat!("__", stringify!($fn)).as_bytes()); + if let Ok(fn_ptr) = fn_ptr { + **fn_ptr = $fn; + } + )+ + } + }; +} + +impl<'a> DsoModule<'a> { + pub fn new>(filename: P) -> Result>, failure::Error> { + let lib = libloading::Library::new(filename)?; + + init_context_func!( + lib, + (TVMAPISetLastError, extern "C" fn(*const i8)), + ( + TVMBackendAllocWorkspace, + extern "C" fn(c_int, c_int, u64, c_int, c_int) -> *mut c_void + ), + ( + TVMBackendFreeWorkspace, + extern "C" fn(c_int, c_int, *mut c_void) -> c_int + ), + ( + TVMBackendParallelLaunch, + extern "C" fn(crate::threading::FTVMParallelLambda, *const c_void, usize) -> c_int + ), + ( + TVMBackendParallelBarrier, + extern "C" fn(usize, *const tvm_common::ffi::TVMParallelGroupEnv) + ), + ); + + // Pin the module in memory so that `ctx` pointer (below) is stable. 
+ let dso_mod = Box::pin(Self { + lib, + packed_funcs: RefCell::new(HashMap::new()), + _pin: std::marker::PhantomPinned, + }); + + unsafe { + if let Ok(ctx) = dso_mod.lib.get::<*mut *const c_void>(TVM_MODULE_CTX) { + **ctx = &dso_mod as *const _ as *const c_void; + } + } + + Ok(dso_mod) + } +} + +impl<'a> Module for DsoModule<'a> { + fn get_function>(&self, name: S) -> Option<&(dyn PackedFunc)> { + let name = name.as_ref(); + let func = match unsafe { + self.lib + .get::(if name.as_bytes() == TVM_MAIN { + // If __tvm_main__ is present, it contains the name of the + // actual main function. + match self + .lib + .get::<*const c_char>(TVM_MAIN) + .map(|p| CStr::from_ptr(*p)) + { + Ok(m) => m.to_bytes(), + _ => return None, + } + } else { + name.as_bytes() + }) + } { + Ok(func) => unsafe { func.into_raw() }, + Err(_) => return None, + }; + + self.packed_funcs.borrow_mut().insert( + name.to_string(), + &*Box::leak(super::wrap_backend_packed_func(name.to_string(), *func)), + ); + + self.packed_funcs.borrow().get(name).map(|f| *f) + } +} + +impl<'a> Drop for DsoModule<'a> { + fn drop(&mut self) { + self.packed_funcs + .replace(HashMap::new()) + .into_iter() + .map(|(_name, f)| unsafe { Box::from_raw(f as *const _ as *mut (dyn PackedFunc)) }) + .for_each(std::mem::drop); + } +} diff --git a/rust/runtime/src/module/mod.rs b/rust/runtime/src/module/mod.rs new file mode 100644 index 000000000000..2c7c107f6b30 --- /dev/null +++ b/rust/runtime/src/module/mod.rs @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#[cfg(not(any(target_arch = "wasm32", target_env = "sgx")))] +mod dso; +mod syslib; + +use tvm_common::{ + ffi::BackendPackedCFunc, + packed_func::{PackedFunc, TVMArgValue, TVMRetValue, TVMValue}, +}; + +#[cfg(not(any(target_arch = "wasm32", target_env = "sgx")))] +pub use dso::DsoModule; +pub use syslib::SystemLibModule; + +pub trait Module { + fn get_function>(&self, name: S) -> Option<&(dyn PackedFunc)>; +} + +// @see `WrapPackedFunc` in `llvm_module.cc`. +fn wrap_backend_packed_func(func_name: String, func: BackendPackedCFunc) -> Box { + box move |args: &[TVMArgValue]| { + let (values, type_codes): (Vec, Vec) = args + .into_iter() + .map(|arg| { + let (val, code) = arg.to_tvm_value(); + (val, code as i32) + }) + .unzip(); + let exit_code = func(values.as_ptr(), type_codes.as_ptr(), values.len() as i32); + if exit_code == 0 { + Ok(TVMRetValue::default()) + } else { + Err(tvm_common::errors::FuncCallError::get_with_context( + func_name.clone(), + )) + } + } +} diff --git a/rust/runtime/src/module.rs b/rust/runtime/src/module/syslib.rs similarity index 62% rename from rust/runtime/src/module.rs rename to rust/runtime/src/module/syslib.rs index 865338f848fa..227b8c727e8f 100644 --- a/rust/runtime/src/module.rs +++ b/rust/runtime/src/module/syslib.rs @@ -21,14 +21,9 @@ use std::{ collections::HashMap, convert::AsRef, ffi::CStr, os::raw::c_char, string::String, sync::Mutex, }; -use tvm_common::{ - ffi::BackendPackedCFunc, - packed_func::{PackedFunc, TVMArgValue, TVMRetValue, TVMValue}, -}; +use tvm_common::{ffi::BackendPackedCFunc, 
packed_func::PackedFunc}; -pub trait Module { - fn get_function>(&self, name: S) -> Option<&(dyn PackedFunc)>; -} +use super::Module; pub struct SystemLibModule; @@ -53,30 +48,6 @@ impl Default for SystemLibModule { } } -// @see `WrapPackedFunc` in `llvm_module.cc`. -pub(super) fn wrap_backend_packed_func( - func_name: String, - func: BackendPackedCFunc, -) -> Box { - box move |args: &[TVMArgValue]| { - let (values, type_codes): (Vec, Vec) = args - .into_iter() - .map(|arg| { - let (val, code) = arg.to_tvm_value(); - (val, code as i32) - }) - .unzip(); - let exit_code = func(values.as_ptr(), type_codes.as_ptr(), values.len() as i32); - if exit_code == 0 { - Ok(TVMRetValue::default()) - } else { - Err(tvm_common::errors::FuncCallError::get_with_context( - func_name.clone(), - )) - } - } -} - #[no_mangle] pub extern "C" fn TVMBackendRegisterSystemLibSymbol( cname: *const c_char, @@ -85,7 +56,7 @@ pub extern "C" fn TVMBackendRegisterSystemLibSymbol( let name = unsafe { CStr::from_ptr(cname).to_str().unwrap() }; SYSTEM_LIB_FUNCTIONS.lock().unwrap().insert( name.to_string(), - &*Box::leak(wrap_backend_packed_func(name.to_string(), func)), + &*Box::leak(super::wrap_backend_packed_func(name.to_string(), func)), ); return 0; } diff --git a/rust/runtime/src/threading.rs b/rust/runtime/src/threading.rs index 96143848f5e2..eb2f418473ed 100644 --- a/rust/runtime/src/threading.rs +++ b/rust/runtime/src/threading.rs @@ -42,7 +42,7 @@ use tvm_common::ffi::TVMParallelGroupEnv; #[cfg(target_env = "sgx")] use super::{TVMArgValue, TVMRetValue}; -type FTVMParallelLambda = +pub(crate) type FTVMParallelLambda = extern "C" fn(task_id: usize, penv: *const TVMParallelGroupEnv, cdata: *const c_void) -> i32; /// Holds a parallel job request made by a TVM library function. 
diff --git a/rust/runtime/tests/test_tvm_basic/build.rs b/rust/runtime/tests/test_tvm_basic/build.rs index ea3bfcb85136..3439f9c2efc7 100644 --- a/rust/runtime/tests/test_tvm_basic/build.rs +++ b/rust/runtime/tests/test_tvm_basic/build.rs @@ -19,13 +19,21 @@ extern crate ar; -use std::{env, path::Path, process::Command}; +use std::{path::PathBuf, process::Command}; use ar::Builder; use std::fs::File; fn main() { - let out_dir = env::var("OUT_DIR").unwrap(); + let mut out_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + out_dir.push("lib"); + + if !out_dir.is_dir() { + std::fs::create_dir(&out_dir).unwrap(); + } + + let obj_file = out_dir.join("test.o"); + let lib_file = out_dir.join("libtest.a"); let output = Command::new(concat!( env!("CARGO_MANIFEST_DIR"), @@ -35,7 +43,7 @@ fn main() { .output() .expect("Failed to execute command"); assert!( - Path::new(&format!("{}/test.o", out_dir)).exists(), + obj_file.exists(), "Could not build tvm lib: {}", String::from_utf8(output.stderr) .unwrap() @@ -45,9 +53,9 @@ fn main() { .unwrap_or("") ); - let mut builder = Builder::new(File::create(format!("{}/libtest.a", out_dir)).unwrap()); - builder.append_path(format!("{}/test.o", out_dir)).unwrap(); + let mut builder = Builder::new(File::create(lib_file).unwrap()); + builder.append_path(obj_file).unwrap(); println!("cargo:rustc-link-lib=static=test"); - println!("cargo:rustc-link-search=native={}", out_dir); + println!("cargo:rustc-link-search=native={}", out_dir.display()); } diff --git a/rust/runtime/tests/test_tvm_basic/src/main.rs b/rust/runtime/tests/test_tvm_basic/src/main.rs index 14bb7c20c680..a83078e5834a 100644 --- a/rust/runtime/tests/test_tvm_basic/src/main.rs +++ b/rust/runtime/tests/test_tvm_basic/src/main.rs @@ -22,13 +22,14 @@ extern crate ndarray; extern crate tvm_runtime; use ndarray::Array; -use tvm_runtime::{DLTensor, Module, SystemLibModule}; +use tvm_runtime::{DLTensor, Module as _, SystemLibModule}; + +mod tvm_mod { + import_module!("../lib/test.o"); 
+} fn main() { - let syslib = SystemLibModule::default(); - let add = syslib - .get_function("default_function") - .expect("main function not found"); + // try static let mut a = Array::from_vec(vec![1f32, 2., 3., 4.]); let mut b = Array::from_vec(vec![1f32, 0., 1., 0.]); let mut c = Array::from_vec(vec![0f32; 4]); @@ -36,6 +37,14 @@ fn main() { let mut a_dl: DLTensor = (&mut a).into(); let mut b_dl: DLTensor = (&mut b).into(); let mut c_dl: DLTensor = (&mut c).into(); + call_packed!(tvm_mod::default_function, &mut a_dl, &mut b_dl, &mut c_dl).unwrap(); + assert!(c.all_close(&e, 1e-8f32)); + + // try runtime + let syslib = SystemLibModule::default(); + let add = syslib + .get_function("default_function") + .expect("main function not found"); call_packed!(add, &mut a_dl, &mut b_dl, &mut c_dl).unwrap(); assert!(c.all_close(&e, 1e-8f32)); } diff --git a/rust/runtime/tests/test_tvm_dso/Cargo.toml b/rust/runtime/tests/test_tvm_dso/Cargo.toml new file mode 100644 index 000000000000..afe7f26e1220 --- /dev/null +++ b/rust/runtime/tests/test_tvm_dso/Cargo.toml @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "test-tvm-dso" +version = "0.0.0" +license = "Apache-2.0" +authors = ["TVM Contributors"] + +[dependencies] +ndarray="0.12" +tvm-runtime = { path = "../../" } diff --git a/rust/runtime/tests/test_tvm_dso/build.rs b/rust/runtime/tests/test_tvm_dso/build.rs new file mode 100644 index 000000000000..f1d9822b01a5 --- /dev/null +++ b/rust/runtime/tests/test_tvm_dso/build.rs @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +use std::{env, path::Path, process::Command}; + +fn main() { + let out_dir = env::var("OUT_DIR").unwrap(); + + let output = Command::new(concat!( + env!("CARGO_MANIFEST_DIR"), + "/src/build_test_lib.py" + )) + .arg(&out_dir) + .output() + .expect("Failed to execute command"); + assert!( + Path::new(&format!("{}/test.so", out_dir)).exists(), + "Could not build tvm lib: {}", + String::from_utf8(output.stderr) + .unwrap() + .trim() + .split("\n") + .last() + .unwrap_or("") + ); +} diff --git a/conda/cross-linux.cmake b/rust/runtime/tests/test_tvm_dso/src/build_test_lib.py old mode 100644 new mode 100755 similarity index 54% rename from conda/cross-linux.cmake rename to rust/runtime/tests/test_tvm_dso/src/build_test_lib.py index f84ba8e44a26..63b43a5f9bef --- a/conda/cross-linux.cmake +++ b/rust/runtime/tests/test_tvm_dso/src/build_test_lib.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -15,23 +16,25 @@ # specific language governing permissions and limitations # under the License. 
-# this one is important -set(CMAKE_SYSTEM_NAME Linux) -set(CMAKE_PLATFORM Linux) -#this one not so much -set(CMAKE_SYSTEM_VERSION 1) +"""Prepares a simple TVM library for testing.""" -# specify the cross compiler -set(CMAKE_C_COMPILER $ENV{CC}) +from os import path as osp +import sys -# where is the target environment -set(CMAKE_FIND_ROOT_PATH $ENV{PREFIX} $ENV{BUILD_PREFIX}/$ENV{HOST}/sysroot) +import tvm +from tvm.contrib import cc -# search for programs in the build host directories -set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) -# for libraries and headers in the target directories -set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +def main(): + n = tvm.var('n') + A = tvm.placeholder((n,), name='A') + B = tvm.placeholder((n,), name='B') + C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') + s = tvm.create_schedule(C.op) + s[C].parallel(s[C].op.axis[0]) + print(tvm.lower(s, [A, B, C], simple_mode=True)) + obj_file = osp.join(sys.argv[1], 'test.o') + tvm.build(s, [A, B, C], 'llvm').save(obj_file) + cc.create_shared(osp.join(sys.argv[1], 'test.so'), [obj_file]) -# god-awful hack because it seems to not run correct tests to determine this: -set(__CHAR_UNSIGNED___EXITCODE 1) +if __name__ == '__main__': + main() diff --git a/rust/runtime/tests/test_tvm_dso/src/main.rs b/rust/runtime/tests/test_tvm_dso/src/main.rs new file mode 100644 index 000000000000..953676cea5bb --- /dev/null +++ b/rust/runtime/tests/test_tvm_dso/src/main.rs @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +extern crate ndarray; +#[macro_use] +extern crate tvm_runtime; + +use ndarray::Array; +use tvm_runtime::{DLTensor, DsoModule, Module}; + +fn main() { + tvm_runtime::TVMGetLastError(); + let module = DsoModule::new(concat!(env!("OUT_DIR"), "/test.so")).unwrap(); + let add = module + .get_function("__tvm_main__") + .expect("main function not found"); + let mut a = Array::from_vec(vec![1f32, 2., 3., 4.]); + let mut b = Array::from_vec(vec![1f32, 0., 1., 0.]); + let mut c = Array::from_vec(vec![0f32; 4]); + let e = Array::from_vec(vec![2f32, 2., 4., 4.]); + let mut a_dl: DLTensor = (&mut a).into(); + let mut b_dl: DLTensor = (&mut b).into(); + let mut c_dl: DLTensor = (&mut c).into(); + call_packed!(add, &mut a_dl, &mut b_dl, &mut c_dl).unwrap(); + assert!(c.all_close(&e, 1e-8f32)); +} diff --git a/src/api/api_arith.cc b/src/api/api_arith.cc index fce73aabf6a7..f31f02b1eaf4 100644 --- a/src/api/api_arith.cc +++ b/src/api/api_arith.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -39,6 +39,7 @@ TVM_REGISTER_API("arith.intset_vector") TVM_REGISTER_API("arith.intset_interval") .set_body_typed(IntSet::interval); + TVM_REGISTER_API("arith.DetectLinearEquation") .set_body_typed(DetectLinearEquation); @@ -58,7 +59,6 @@ TVM_REGISTER_API("arith.DeduceBound") TVM_REGISTER_API("arith.DomainTouched") .set_body_typed(DomainTouched); - TVM_REGISTER_API("_IntervalSetGetMin") .set_body_method(&IntSet::min); @@ -71,11 +71,19 @@ TVM_REGISTER_API("_IntSetIsNothing") TVM_REGISTER_API("_IntSetIsEverything") .set_body_method(&IntSet::is_everything); +ConstIntBound MakeConstIntBound(int64_t min_value, int64_t max_value) { + return ConstIntBound(min_value, max_value); +} + TVM_REGISTER_API("arith._make_ConstIntBound") -.set_body_typed(ConstIntBoundNode::make); +.set_body_typed(MakeConstIntBound); + +ModularSet MakeModularSet(int64_t coeff, int64_t base) { + return ModularSet(coeff, base); +} TVM_REGISTER_API("arith._make_ModularSet") -.set_body_typed(ModularSetNode::make); +.set_body_typed(MakeModularSet); TVM_REGISTER_API("arith._CreateAnalyzer") .set_body([](TVMArgs args, TVMRetValue* ret) { @@ -103,6 +111,10 @@ TVM_REGISTER_API("arith._CreateAnalyzer") return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { *ret = self->canonical_simplify(args[0]); }); + } else if (name == "int_set") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + *ret = self->int_set(args[0], args[1]); + }); } else if (name == "bind") { return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { auto& sptr = args[1].node_sptr(); @@ -116,8 +128,8 @@ TVM_REGISTER_API("arith._CreateAnalyzer") return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { // can't use make_shared due to noexcept(false) 
decl in destructor, // see https://stackoverflow.com/a/43907314 - auto ctx = - std::shared_ptr(new ConstraintContext(self.get(), args[0])); + auto ctx = std::shared_ptr >( + new With(self.get(), args[0])); auto fexit = [ctx](TVMArgs, TVMRetValue*) mutable { ctx.reset(); }; diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc index 6195aac1b93f..e5b003cafb87 100644 --- a/src/api/api_pass.cc +++ b/src/api/api_pass.cc @@ -130,6 +130,7 @@ REGISTER_PASS(RewriteUnsafeSelect); REGISTER_PASS(Inline); REGISTER_PASS(IRTransform); REGISTER_PASS(VectorizeLoop); +REGISTER_PASS(SkipVectorize); REGISTER_PASS(UnrollLoop); REGISTER_PASS(InjectCopyIntrin); REGISTER_PASS(ThreadSync); @@ -151,6 +152,7 @@ REGISTER_PASS(LowerThreadAllreduce); REGISTER_PASS(LowerWarpMemory); REGISTER_PASS(RemapThreadAxis); REGISTER_PASS(LowerIntrin); +REGISTER_PASS(LowerCustomDatatypes); REGISTER_PASS(LowerTVMBuiltin); REGISTER_PASS(CombineContextCall); REGISTER_PASS(VerifyMemory); diff --git a/src/arithmetic/analyzer.cc b/src/arithmetic/analyzer.cc index 420d6f9c1d0d..10a1c7f041c3 100644 --- a/src/arithmetic/analyzer.cc +++ b/src/arithmetic/analyzer.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -31,7 +31,8 @@ Analyzer::Analyzer() : const_int_bound(this), modular_set(this), rewrite_simplify(this), - canonical_simplify(this) { + canonical_simplify(this), + int_set(this) { } void Analyzer::Bind(const VarExpr& v, const Expr& expr) { @@ -54,10 +55,12 @@ void Analyzer::Bind(const VarExpr& v, const Range& range) { // skip rewrite simplify } -ConstraintContext::ConstraintContext(Analyzer* analyzer, const Expr& constraint) { + +void ConstraintContext::EnterWithScope() { + CHECK(exit_ == nullptr); // entering the scope. - auto f0 = analyzer->const_int_bound.EnterConstraint(constraint); - auto f1 = analyzer->modular_set.EnterConstraint(constraint); + auto f0 = analyzer_->const_int_bound.EnterConstraint(constraint_); + auto f1 = analyzer_->modular_set.EnterConstraint(constraint_); // recovery function. exit_ = [f0, f1]() { if (f1 != nullptr) f1(); @@ -65,9 +68,14 @@ ConstraintContext::ConstraintContext(Analyzer* analyzer, const Expr& constraint) }; } +void ConstraintContext::ExitWithScope() { + CHECK(exit_ != nullptr); + exit_(); +} + bool Analyzer::CanProveGreaterEqual(const Expr& expr, int64_t lower_bound) { if (const auto* ptr = expr.as()) { - return ptr->value > lower_bound; + return ptr->value >= lower_bound; } auto bd = this->const_int_bound(this->rewrite_simplify(expr)); if (bd->min_value >= lower_bound) return true; diff --git a/src/arithmetic/bound_deducer.cc b/src/arithmetic/bound_deducer.cc index 89e556c6f75f..395a371f43af 100644 --- a/src/arithmetic/bound_deducer.cc +++ b/src/arithmetic/bound_deducer.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -30,12 +30,12 @@ #include #include +#include "int_set.h" namespace tvm { namespace arith { using namespace ir; -using HalideIR::Internal::Interval; // a visitor to find the path to the target variable // from a expression. @@ -293,7 +293,7 @@ IntSet DeduceBound(Expr v, Expr e, BoundDeducer d(v, e, hint_map, relax_map); d.Deduce(); if (!d.success) return IntSet::nothing(); - Expr min = Interval::neg_inf, max = Interval::pos_inf; + Expr min = neg_inf(), max = pos_inf(); if (d.is_greater) { min = d.result; } else { diff --git a/src/arithmetic/canonical_simplify.cc b/src/arithmetic/canonical_simplify.cc index 1bf1f84fb635..a50cbfb96591 100644 --- a/src/arithmetic/canonical_simplify.cc +++ b/src/arithmetic/canonical_simplify.cc @@ -18,7 +18,6 @@ */ /*! - * Copyright (c) 2019 by Contributors * \file canonical_simplify.cc * \brief Canonical form based simplification. */ @@ -763,7 +762,10 @@ Mutate_(const Mod* op, const Expr& self) { if (TryCompare(temp, cval) == kLT) { return temp; } else { - return SplitModConst(ToSplitExpr(temp), cval); + // contonue to use logic below. + a = extra; + psum = a.as(); + CHECK(psum != nullptr); } } } diff --git a/src/arithmetic/compute_expr.h b/src/arithmetic/compute_expr.h index ff2fb8dbd4ac..cc54bff596be 100644 --- a/src/arithmetic/compute_expr.h +++ b/src/arithmetic/compute_expr.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -27,8 +27,8 @@ #define TVM_ARITHMETIC_COMPUTE_EXPR_H_ #include -#include #include +#include namespace tvm { namespace arith { @@ -105,12 +105,12 @@ inline Expr ComputeExpr(Expr a, Expr b) { template<> inline Expr ComputeExpr(Expr a, Expr b) { - return HalideIR::Internal::Interval::make_max(a, b); + return max(a, b); } template<> inline Expr ComputeExpr(Expr a, Expr b) { - return HalideIR::Internal::Interval::make_min(a, b); + return min(a, b); } template diff --git a/src/arithmetic/const_fold.h b/src/arithmetic/const_fold.h index fbf8fe7e6f89..ec50aef5c51e 100644 --- a/src/arithmetic/const_fold.h +++ b/src/arithmetic/const_fold.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -206,6 +206,7 @@ inline Expr TryConstFold(Expr a, Expr b) { if (pa && pb) return IntImm::make(rtype, std::min(pa->value, pb->value)); if (fa && fb) return FloatImm::make(rtype, std::min(fa->value, fb->value)); }); + if (a.same_as(b)) return a; return Expr(); } @@ -216,6 +217,7 @@ inline Expr TryConstFold(Expr a, Expr b) { if (pa && pb) return IntImm::make(rtype, std::max(pa->value, pb->value)); if (fa && fb) return FloatImm::make(rtype, std::max(fa->value, fb->value)); }); + if (a.same_as(b)) return a; return Expr(); } @@ -307,6 +309,58 @@ inline Expr TryConstFold(Expr a) { return Expr(); } +/*! 
\brief Helper namespace for symbolic value limits */ +struct SymbolicLimits { + /*! \brief positive infinity */ + static Expr pos_inf_; + /*! \brief negative infinity */ + static Expr neg_inf_; +}; + +/*! + * \brief Opaque expression representing positive infinity. + * + * It can can only be used as parameter of by min/max + * for integer analysis and cannot be used in normal expressions. + * + * \return positive infinity. + */ +inline Expr pos_inf() { + return SymbolicLimits::pos_inf_; +} + +/*! + * \brief Check if value is positive infinity. + * \param value The value to be checked. + * + * \return The check result. + */ +inline bool is_pos_inf(const Expr& value) { + return value.same_as(SymbolicLimits::pos_inf_); +} + +/*! + * \brief Opaque expression representing negative infinity. + * + * It can can only be used as parameter of by min/max + * for integer analysis and cannot be used in normal expressions. + * + * \return negative infinity. + */ +inline Expr neg_inf() { + return SymbolicLimits::neg_inf_; +} + +/*! + * \brief Check if value is negative infinity. + * \param value The value to be checked. + * + * \return The check result. 
+ */ +inline bool is_neg_inf(const Expr& value) { + return value.same_as(SymbolicLimits::neg_inf_); +} + } // namespace arith } // namespace tvm #endif // TVM_ARITHMETIC_CONST_FOLD_H_ diff --git a/src/arithmetic/const_int_bound.cc b/src/arithmetic/const_int_bound.cc index bfd06c8ba255..ed8faba3509b 100644 --- a/src/arithmetic/const_int_bound.cc +++ b/src/arithmetic/const_int_bound.cc @@ -34,12 +34,12 @@ using namespace ir; TVM_REGISTER_NODE_TYPE(ConstIntBoundNode); -ConstIntBound ConstIntBoundNode::make( +ConstIntBound::ConstIntBound( int64_t min_value, int64_t max_value) { auto node = make_node(); node->min_value = min_value; node->max_value = max_value; - return ConstIntBound(node); + node_ = std::move(node); } TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) @@ -190,7 +190,7 @@ class ConstIntBoundAnalyzer::Impl : std::min(a.max_value, b_max_cap)); } else { return MakeBound(std::max(a.min_value, -b_max_cap), - std::min(a.max_value, b_max_cap)); + std::min(std::max(a.max_value, (int64_t)0), b_max_cap)); } } else { CHECK(!b.is_const(0)) << "mod by zero"; @@ -289,8 +289,8 @@ class ConstIntBoundAnalyzer::Impl : std::vector additional_info_; // constants: the limit value means umlimited // NOTE: kNegInf/kPosInf are used to represent infinity. - static const constexpr int64_t kNegInf = ConstIntBoundNode::kNegInf; - static const constexpr int64_t kPosInf = ConstIntBoundNode::kPosInf; + static const constexpr int64_t kNegInf = ConstIntBound::kNegInf; + static const constexpr int64_t kPosInf = ConstIntBound::kPosInf; static_assert(-kNegInf == kPosInf, "invariant of inf"); // internal helper functions /*! 
@@ -462,7 +462,7 @@ class ConstIntBoundAnalyzer::Impl : ConstIntBound ConstIntBoundAnalyzer::operator()(const Expr& expr) { Entry ret = impl_->VisitExpr(expr); - return ConstIntBoundNode::make(ret.min_value, ret.max_value); + return ConstIntBound(ret.min_value, ret.max_value); } void ConstIntBoundAnalyzer::Update(const Var& var, diff --git a/src/arithmetic/detect_linear_equation.cc b/src/arithmetic/detect_linear_equation.cc index 2fe21fef7e21..e584c8b1ce33 100644 --- a/src/arithmetic/detect_linear_equation.cc +++ b/src/arithmetic/detect_linear_equation.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -19,8 +19,8 @@ /*! * Copyright (c) 2017 by Contributors - * \file bound_deducer.cc - * \brief Utility to deduce bound of expression + * \file detect_linear_equation.cc + * \brief Utility to detect patterns in the expression. */ #include #include diff --git a/src/arithmetic/int_op_overflow.h b/src/arithmetic/int_op_overflow.h index 87f4f059e858..b78f21cb1dba 100644 --- a/src/arithmetic/int_op_overflow.h +++ b/src/arithmetic/int_op_overflow.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/src/arithmetic/int_set.cc b/src/arithmetic/int_set.cc index abbb7cd9744e..75a4aaf83ab6 100644 --- a/src/arithmetic/int_set.cc +++ b/src/arithmetic/int_set.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -18,201 +18,55 @@ */ /*! - * Copyright (c) 2017 by Contributors * \file int_set.cc * \brief The integer set functions */ #include -#include -#include #include -#include +#include +#include +#include #include -#include "compute_expr.h" -#include "int_set_internal.h" +#include "int_set.h" +#include "pattern_match.h" namespace tvm { namespace arith { -using HalideIR::Internal::Interval; -using namespace ir; - -inline IntSet IntSet::cover_interval() const { - if ((*this).as()) return *this; - const StrideSet* s = (*this).as(); - if (s) { - CHECK_NE(s->extents.size(), 0U); - Expr max = s->base.max; - for (size_t i = 0; i < s->extents.size(); ++i) { - max = max + s->extents[i] * s->strides[i] - s->strides[i]; - } - return IntervalSet::make(s->base.min, Simplify(max)); - } - LOG(FATAL) << "cannot convert set " << (*this)->type_key() << " to interval"; - return IntSet::everything(); -} - -Range IntSet::cover_range(Range max_range) const { - IntSet temp; - const IntervalSet* s_int = (*this).as(); - if (s_int == nullptr) { - temp = this->cover_interval(); - s_int = temp.as(); - } - if 
(s_int->i.is_bounded()) { - return Range::make_by_min_extent( - s_int->i.min, Simplify(s_int->i.max + 1 - s_int->i.min)); - } - return max_range; -} - -Expr IntSet::min() const { - const IntervalSet* s_int = (*this).as(); - CHECK(s_int); - return s_int->i.min; -} - -Expr IntSet::max() const { - const IntervalSet* s_int = (*this).as(); - CHECK(s_int); - return s_int->i.max; -} - -bool IntSet::is_nothing() const { - const IntervalSet* s_int = (*this).as(); - return (s_int && s_int->i.is_empty()); -} - -bool IntSet::is_everything() const { - const IntervalSet* s_int = (*this).as(); - return (s_int && s_int->i.is_everything()); -} +Expr SymbolicLimits::pos_inf_ = Var("pos_inf", Handle()); +Expr SymbolicLimits::neg_inf_ = Var("neg_inf", Handle()); -bool IntSet::is_single_point() const { - const IntervalSet* s_int = (*this).as(); - return (s_int && s_int->i.is_single_point()); +IntervalSet::IntervalSet(Expr min_value, Expr max_value) { + auto node = make_node(); + node->min_value = std::move(min_value); + node->max_value = std::move(max_value); + node_ = std::move(node); } -bool IntSet::can_prove_positive() const { - const IntervalSet* s_int = (*this).as(); - return (s_int && is_positive_const(ir::Simplify(s_int->i.min))); +IntervalSet MakeIntervalSet(Expr min_value, Expr max_value) { + return IntervalSet(min_value, max_value); } -bool IntSet::can_prove_negative() const { - const IntervalSet* s_int = (*this).as(); - return (s_int && is_negative_const(ir::Simplify(s_int->i.max))); -} +TVM_REGISTER_API("arith._make_IntervalSet") +.set_body_typed(MakeIntervalSet); -bool IntSet::can_prove_non_positive() const { - if (const IntervalSet* s_int = (*this).as()) { - auto max = ir::Simplify(s_int->i.max); - return is_zero(max) || is_negative_const(max); - } - return false; -} -bool IntSet::can_prove_non_negative() const { - if (const IntervalSet* s_int = (*this).as()) { - // Any reason why we should or should not use can_prove() to implement - // these functions? 
- auto min = ir::Simplify(s_int->i.min); - return is_zero(min) || is_positive_const(min); - } - return false; -} - - -SignType IntSet::sign_type() const { - if (can_prove_positive()) { - return kPositive; - } else if (can_prove_negative()) { - return kNegative; - } else if (is_single_point() && is_zero(point_value())) { - return kZero; +IntervalSet Intersect(Analyzer* analyzer, IntervalSet a, IntervalSet b) { + Expr max_value = min(a->max_value, b->max_value); + Expr min_value = max(a->min_value, b->min_value); + if ((max_value.type().is_int() || max_value.type().is_uint()) && + (min_value.type().is_int() || min_value.type().is_uint()) && + analyzer->CanProveGreaterEqual(min_value - max_value, 1)) { + return IntervalSet::Empty(); } else { - return kUnknown; - } -} -Expr IntSet::point_value() const { - const IntervalSet* s_int = (*this).as(); - CHECK(s_int && s_int->i.is_single_point()); - return s_int->i.min; -} - -IntSet IntSet::nothing() { - return IntervalSet::make(Interval::nothing()); -} - -IntSet IntSet::everything() { - return IntervalSet::make(Interval::everything()); -} - -IntSet IntSet::single_point(Expr x) { - return IntervalSet::make(Interval::single_point(x)); -} - -IntSet IntSet::range(Range r) { - // must make sure it can be matched back by MatchRange. - if (is_one(r->extent)) { - return IntSet::single_point(r->min); - } - if (is_positive_const(r->extent) && is_const(r->min)) { - return IntervalSet::make( - r->min, ComputeExpr(ComputeExpr(r->extent, r->min), 1)); - } - return IntervalSet::make(r->min, (r->extent + r->min) - 1); -} - -IntSet IntSet::interval(Expr min, Expr max) { - if (min.same_as(max)) { - return IntSet::single_point(min); - } - return IntervalSet::make(min, max); -} - -inline bool prove_equal(Expr lhs, Expr rhs) { - return is_zero(ir::Simplify(lhs - rhs)); -} - -// Check if a is created from b. 
-bool IntSet::match_range(const Range& b) const { - const IntSet& a = *this; - const IntervalSet* a_int = a.as(); - if (!a_int) return false; - const Interval& i = a_int->i; - return prove_equal(i.min, b->min) && - prove_equal(i.max, ComputeExpr(ComputeExpr(b->extent, b->min), 1)); -} - -inline bool MatchPoint(const IntSet& a, - const Expr& b) { - const IntervalSet* a_int = a.as(); - if (!a_int) return false; - const Interval& i = a_int->i; - return i.is_single_point() && i.min.same_as(b); -} - -IntSet Union(const Array& sets) { - if (sets.size() == 0) return IntSet::nothing(); - if (sets.size() == 1) return sets[0]; - Interval x = sets[0].cover_interval().as()->i; - for (size_t i = 1; i < sets.size(); ++i) { - IntSet s = sets[i].cover_interval(); - const Interval& y = s.as()->i; - x.include(y); + return IntervalSet(min_value, max_value); } - x.max = ir::Simplify(x.max); - x.min = ir::Simplify(x.min); - return IntervalSet::make(x); } -IntSet Intersect(const Array& sets) { - Interval x = sets[0].cover_interval().as()->i; - for (size_t i = 1; i < sets.size(); ++i) { - Interval y = sets[i].cover_interval().as()->i; - x = Interval::make_intersection(x, y); - } - return IntervalSet::make(x); +IntervalSet Union(Analyzer* analyzer, IntervalSet a, IntervalSet b) { + Expr max_value = max(a->max_value, b->max_value); + Expr min_value = min(a->min_value, b->min_value); + return IntervalSet(min_value, max_value); } // type traits @@ -227,407 +81,623 @@ struct is_logical_op { static const bool value = true; \ }; -// interval related. 
-template -inline IntSet CombineInterval(Interval a, Interval b) { - if (a.is_single_point() && b.is_single_point()) { - return IntSet::single_point(ComputeExpr(a.min, b.min)); - } - LOG(WARNING) << "Return Everything in CombineInterval " << OP::_type_key; - return IntSet::everything(); +TVM_DECLARE_LOGICAL_OP(And); +TVM_DECLARE_LOGICAL_OP(Or); +TVM_DECLARE_LOGICAL_OP(EQ); +TVM_DECLARE_LOGICAL_OP(NE); +TVM_DECLARE_LOGICAL_OP(GE); +TVM_DECLARE_LOGICAL_OP(GT); +TVM_DECLARE_LOGICAL_OP(LE); +TVM_DECLARE_LOGICAL_OP(LT); +TVM_DECLARE_LOGICAL_OP(Not); + +/*! + * \brief Combine two interval set under arithmetic operations. + * \note this can possibly relax the set. + */ +template +inline IntervalSet Combine(Analyzer* analyzer, + IntervalSet a, + IntervalSet b) { + if (a->IsSinglePoint() && b->IsSinglePoint()) { + Expr res = TryConstFold(a->min_value, b->min_value); + if (!res.defined()) res = Op::make(a->min_value, b->min_value); + return IntervalSet::SinglePoint(res); + } + if (is_logical_op::value) { + return IntervalSet(make_const(a->min_value.type(), 0), + make_const(a->min_value.type(), 1)); + } + if (a->IsEmpty()) return a; + if (b->IsEmpty()) return b; + if (a->IsEverything()) return a; + if (b->IsEverything()) return b; + return IntervalSet::Everything(); } template<> -inline IntSet CombineInterval(Interval a, Interval b) { - if (a.is_single_point() && b.is_single_point()) { - return IntSet::single_point(ComputeExpr(a.min, b.min)); - } - Interval r = Interval::everything(); - if (a.has_lower_bound() && b.has_lower_bound()) { - r.min = ComputeExpr(a.min, b.min); - } - if (a.has_upper_bound() && b.has_upper_bound()) { - r.max = ComputeExpr(a.max, b.max); - } - return IntervalSet::make(r); +inline IntervalSet Combine(Analyzer* analyer, + IntervalSet a, + IntervalSet b) { + if (a->IsSinglePoint() && b->IsSinglePoint()) { + return IntervalSet::SinglePoint(a->min_value + b->min_value); + } + if (a->IsEmpty()) return a; + if (b->IsEmpty()) return b; + Expr min_value = + 
a->HasLowerBound() && b->HasLowerBound() ? + a->min_value + b->min_value : neg_inf(); + Expr max_value = + a->HasUpperBound() && b->HasUpperBound() ? + a->max_value + b->max_value : pos_inf(); + return IntervalSet(min_value, max_value); } template<> -inline IntSet CombineInterval(Interval a, Interval b) { - if (a.is_single_point() && b.is_single_point()) { - return IntSet::single_point(ComputeExpr(a.min, b.min)); +inline IntervalSet Combine(Analyzer* analyer, + IntervalSet a, + IntervalSet b) { + if (a->IsSinglePoint() && b->IsSinglePoint()) { + return IntervalSet::SinglePoint(a->min_value - b->min_value); } - Interval r = Interval::everything(); - if (a.has_lower_bound() && b.has_upper_bound()) { - r.min = ComputeExpr(a.min, b.max); - } - if (a.has_upper_bound() && b.has_lower_bound()) { - r.max = ComputeExpr(a.max, b.min); - } - return IntervalSet::make(r); + if (a->IsEmpty()) return a; + if (b->IsEmpty()) return b; + Expr min_value = + a->HasLowerBound() && b->HasUpperBound() ? + a->min_value - b->max_value : neg_inf(); + Expr max_value = + a->HasUpperBound() && b->HasLowerBound() ? + a->max_value - b->min_value : pos_inf(); + return IntervalSet(min_value, max_value); } + template<> -inline IntSet CombineInterval(Interval a, Interval b) { - if (a.is_single_point() && b.is_single_point()) { - return IntSet::single_point(ComputeExpr(a.min, b.min)); - } - if (a.is_single_point() && !b.is_single_point()) { +inline IntervalSet Combine(Analyzer* analyzer, + IntervalSet a, + IntervalSet b) { + if (a->IsSinglePoint() && b->IsSinglePoint()) { + return IntervalSet::SinglePoint(a->min_value * b->min_value); + } + if (a->IsEmpty()) return a; + if (b->IsEmpty()) return b; + if (a->IsSinglePoint()) { std::swap(a, b); } - if (b.is_single_point()) { - if (is_zero(b.min)) return IntSet::single_point(0); - if (is_one(b.min)) return IntervalSet::make(a); - Expr e1 = a.has_lower_bound() ? ComputeExpr(a.min, b.min) : a.min; - Expr e2 = a.has_upper_bound() ? 
ComputeExpr(a.max, b.min) : a.max; - // no relaxation is needed in here due to set is inclusive - // TODO(tqchen): consider convert to StrideSet. - if (is_positive_const(b.min)) { - return IntervalSet::make(e1, e2); - } else if (is_negative_const(b.min)) { - return IntervalSet::make(e2, e1); - } else if (a.is_bounded()) { + if (b->IsSinglePoint()) { + if (is_zero(b->min_value)) return b; + if (is_one(b->min_value)) return a; + if (analyzer->CanProveGreaterEqual(b->min_value, 0)) { + Expr min_value = a->HasLowerBound() ? a->min_value * b->min_value : neg_inf(); + Expr max_value = a->HasUpperBound() ? a->max_value * b->min_value : pos_inf(); + return IntervalSet(min_value, max_value); + } else if (analyzer->CanProveGreaterEqual(-b->min_value, 1)) { + Expr min_value = a->HasUpperBound() ? a->max_value * b->min_value : neg_inf(); + Expr max_value = a->HasLowerBound() ? a->min_value * b->min_value : pos_inf(); + return IntervalSet(min_value, max_value); + } else if (a->HasUpperBound() && a->HasLowerBound()) { using ir::Select; - Expr cmp = b.min >= make_zero(b.min.type().element_of()); - return IntervalSet::make(Select::make(cmp, e1, e2), Select::make(cmp, e2, e1)); + Expr sign = b->min_value >= make_zero(b->min_value.type().element_of()); + Expr e1 = a->min_value * b->min_value; + Expr e2 = a->max_value * b->min_value; + return IntervalSet(Select::make(sign, e1, e2), Select::make(sign, e2, e1)); } } - LOG(WARNING) << "Return Everything in CombineInterval Mul"; - return IntSet::everything(); + DLOG(WARNING) << "Return Everything in CombineInterval Mul"; + return IntervalSet::Everything(); } template<> -inline IntSet CombineInterval
(Interval a, Interval b) { - if (a.is_single_point() && b.is_single_point()) { - return IntSet::single_point(ComputeExpr
(a.min, b.min)); - } - if (b.is_single_point()) { - if (is_zero(b.min)) { +inline IntervalSet Combine(Analyzer* analyzer, + IntervalSet a, + IntervalSet b) { + if (a->IsSinglePoint() && b->IsSinglePoint()) { + return IntervalSet::SinglePoint(a->min_value / b->min_value); + } + if (a->IsEmpty()) return a; + if (b->IsEmpty()) return b; + if (b->IsSinglePoint()) { + if (is_zero(b->min_value)) { LOG(FATAL) << "Divide by zero in CombineInterval Div"; } - if (is_one(b.min)) return IntervalSet::make(a); - Expr e1 = a.has_lower_bound() ? ComputeExpr
(a.min, b.min) : a.min; - Expr e2 = a.has_upper_bound() ? ComputeExpr
(a.max, b.min) : a.max; + if (is_one(b->min_value)) return a; // no relaxation is needed in here due to set is inclusive - if (is_positive_const(b.min)) { - return IntervalSet::make(e1, e2); - } else if (is_negative_const(b.min)) { - return IntervalSet::make(e2, e1); - } else if (a.is_bounded()) { + if (analyzer->CanProveGreaterEqual(b->min_value, 0)) { + Expr min_value = a->HasLowerBound() ? a->min_value / b->min_value : neg_inf(); + Expr max_value = a->HasUpperBound() ? a->max_value / b->min_value : pos_inf(); + return IntervalSet(min_value, max_value); + } else if (analyzer->CanProveGreaterEqual(-b->min_value, 1)) { + Expr min_value = a->HasUpperBound() ? a->max_value / b->min_value : neg_inf(); + Expr max_value = a->HasLowerBound() ? a->min_value / b->min_value : pos_inf(); + return IntervalSet(min_value, max_value); + } else if (a->HasUpperBound() && a->HasLowerBound()) { using ir::Select; - Expr cmp = b.min >= make_zero(b.min.type().element_of()); - return IntervalSet::make(Select::make(cmp, e1, e2), Select::make(cmp, e2, e1)); + Expr sign = b->min_value >= make_zero(b->min_value.type().element_of()); + Expr e1 = a->min_value / b->min_value; + Expr e2 = a->max_value / b->min_value; + return IntervalSet(Select::make(sign, e1, e2), Select::make(sign, e2, e1)); } } - LOG(WARNING) << "Return Everything in CombineInterval Div"; - return IntSet::everything(); + DLOG(WARNING) << "Return Everything in CombineInterval Div"; + return IntervalSet::Everything(); } template<> -inline IntSet CombineInterval(Interval a, Interval b) { - if (a.is_single_point() && b.is_single_point()) { - return IntSet::single_point(ComputeExpr(a.min, b.min)); +inline IntervalSet Combine(Analyzer* analyzer, + IntervalSet a, + IntervalSet b) { + if (a->IsSinglePoint() && b->IsSinglePoint()) { + return IntervalSet::SinglePoint(a->min_value % b->min_value); } - if (b.is_single_point()) { - Expr divisor = b.min; + if (a->IsEmpty()) return a; + if (b->IsEmpty()) return b; + + if 
(b->IsSinglePoint()) { + const Expr& divisor = b->min_value; if (is_zero(divisor)) { LOG(FATAL) << "Modular by zero in CombineInterval Mod"; } - return IntervalSet::make(make_zero(divisor.type()), divisor - 1); + // We need to add more bound constraints throughout the code. + // The logic below assumes a is non-negative, which usually + // is the case of our application. + // TODO(tqchen): add bound constraints for a. + if (analyzer->CanProveGreaterEqual(divisor, 0)) { + return IntervalSet(make_zero(divisor.type()), divisor - 1); + } else { + Expr bound = abs(divisor) - 1; + return IntervalSet(-bound, bound); + } } - - LOG(WARNING) << "Return Everything in CombineInterval Mod"; - return IntSet::everything(); + DLOG(WARNING) << "Return Everything in CombineInterval Mod"; + return IntervalSet::Everything(); } template<> -inline IntSet CombineInterval(Interval a, Interval b) { - if (a.is_single_point() && b.is_single_point()) { - return IntSet::single_point(ComputeExpr(a.min, b.min)); +inline IntervalSet Combine(Analyzer* analzyer, + IntervalSet a, + IntervalSet b) { + if (a->IsSinglePoint() && b->IsSinglePoint()) { + return IntervalSet::SinglePoint(max(a->min_value, b->min_value)); } - return IntervalSet::make(Interval::make_max(a.min, b.min), - Interval::make_max(a.max, b.max)); + if (a->IsEmpty()) return a; + if (b->IsEmpty()) return b; + return IntervalSet(max(a->min_value, b->min_value), + max(a->max_value, b->max_value)); } template<> -inline IntSet CombineInterval(Interval a, Interval b) { - if (a.is_single_point() && b.is_single_point()) { - return IntSet::single_point(ComputeExpr(a.min, b.min)); +inline IntervalSet Combine(Analyzer* analzyer, + IntervalSet a, + IntervalSet b) { + if (a->IsSinglePoint() && b->IsSinglePoint()) { + return IntervalSet::SinglePoint(min(a->min_value, b->min_value)); } - return IntervalSet::make(Interval::make_min(a.min, b.min), - Interval::make_min(a.max, b.max)); -} - -template -inline IntSet CombineInterval_(IntSet a, IntSet b) { 
- return CombineInterval( - a.as()->i, b.as()->i); -} - -// stride related -inline IntSet AsStrideSet(IntSet a) { - if (a.as()) return a; - const IntervalSet* s = a.as(); - CHECK(s->i.is_bounded()); - NodePtr n = make_node(); - n->base = s->i; - return IntSet(n); -} -template -inline IntSet CombineSets(IntSet a, IntSet b) { - return CombineInterval_(a.cover_interval(), b.cover_interval()); + if (a->IsEmpty()) return a; + if (b->IsEmpty()) return b; + return IntervalSet(min(a->min_value, b->min_value), + min(a->max_value, b->max_value)); } -template<> -inline IntSet CombineSets(IntSet a, IntSet b) { - const IntervalSet* a_int = a.as(); - const IntervalSet* b_int = b.as(); - if (a_int && is_zero(a_int->i.min)) return b; - if (b_int && is_zero(b_int->i.min)) return a; - a = AsStrideSet(a); - b = AsStrideSet(b); - const StrideSet* a_stride = a.as(); - const StrideSet* b_stride = b.as(); - auto n = make_node(*a_stride); - for (size_t i = 0; i < b_stride->extents.size(); ++i) { - n->extents.push_back(b_stride->extents[i]); - n->strides.push_back(b_stride->strides[i]); - } - n->base = CombineInterval( - a_stride->base, b_stride->base).as()->i; - return IntSet(n); -} - -inline IntSet NegateSet(IntSet a) { - const IntervalSet* a_int = a.as(); - if (a_int) { - if (a_int->i.is_single_point()) { - return IntSet::single_point(-a_int->i.min); - } else { - Interval r = Interval::everything(); - if (a_int->i.has_upper_bound()) { - r.min = -(a_int->i.max); - } - if (a_int->i.has_lower_bound()) { - r.max = -(a_int->i.min); - } - return IntervalSet::make(r); - } - } else { - return NegateSet(a.cover_interval()); +// internal helper function to get an interval set +IntervalSet ToIntervalSet(IntSet set) { + if (auto* node = set.as()) { + return GetRef(node); } + DLOG(INFO) << "cannot resolve int set " << set; + return IntervalSet::Everything(); } -template<> -inline IntSet CombineSets(IntSet a, IntSet b) { - return CombineSets(a, NegateSet(b)); -} - -TVM_DECLARE_LOGICAL_OP(And); 
-TVM_DECLARE_LOGICAL_OP(Or); -TVM_DECLARE_LOGICAL_OP(EQ); -TVM_DECLARE_LOGICAL_OP(NE); -TVM_DECLARE_LOGICAL_OP(GE); -TVM_DECLARE_LOGICAL_OP(GT); -TVM_DECLARE_LOGICAL_OP(LE); -TVM_DECLARE_LOGICAL_OP(LT); -TVM_DECLARE_LOGICAL_OP(Not); +using namespace ir; -// generic combine operations of two sets -template -inline IntSet Combine(const IntSet& a, const IntSet &b) { - if (is_logical_op::value) { - return IntervalSet::make(0, 1); +// Simplified version of int set evaluator that operates on IntervalSet +// We might use better set analysis in the future to replace the intervalset. +class IntervalSetEvaluator : + public ExprFunctor { + public: + IntervalSetEvaluator(Analyzer* analyzer, + const Map& dom_map, + bool eval_vec = false) + : analyzer_(analyzer), + dom_map_(dom_map), + eval_vec_(eval_vec) { } - const IntervalSet* a_int = a.as(); - const IntervalSet* b_int = b.as(); - if (a_int && a_int->i.is_everything()) return a; - if (b_int && b_int->i.is_everything()) return b; - if (a_int && b_int) { - return CombineInterval(a_int->i, b_int->i); + + IntervalSet Eval(const Expr& val) { + return this->VisitExpr(val); } - if (a_int && !(a_int->i.is_bounded())) { - return CombineInterval_(a, b.cover_interval()); + + IntervalSet VisitExpr_(const IntImm* op) final { + return IntervalSet::SinglePoint(GetRef(op)); } - if (b_int && !(b_int->i.is_bounded())) { - return CombineInterval_(a.cover_interval(), b); + + IntervalSet VisitExpr_(const UIntImm* op) final { + return IntervalSet::SinglePoint(GetRef(op)); } - return CombineSets(a, b); -} -class IntSetEvaluator : - public ExprFunctor { - public: - explicit IntSetEvaluator( - const std::unordered_map& dom_map, - bool eval_vec = false) - : dom_map_(dom_map), eval_vec_(eval_vec) {} - // Evaluate. 
- IntSet Eval(const Expr& e) { - return this->VisitExpr(e, e); - } - IntSet VisitExpr_(const IntImm* op, const Expr& e) final { - return IntSet::single_point(e); - } - IntSet VisitExpr_(const UIntImm* op, const Expr& e) final { - return IntSet::single_point(e); - } - IntSet VisitExpr_(const Variable* op, const Expr& e) final { - auto it = dom_map_.find(op); + IntervalSet VisitExpr_(const Variable* op) final { + Var var = GetRef(op); + auto it = dom_map_.find(var); if (it != dom_map_.end()) { - return it->second; + return ToIntervalSet((*it).second); } else { - return IntSet::single_point(e); + return IntervalSet::SinglePoint(var); } } - IntSet VisitExpr_(const Add* op, const Expr& e) final { - return Binary(op, e); + + IntervalSet VisitExpr_(const Add* op) final { + return VisitBinaryExpr_(op); } - IntSet VisitExpr_(const Sub* op, const Expr& e) final { - return Binary(op, e); + + IntervalSet VisitExpr_(const Sub* op) final { + return VisitBinaryExpr_(op); } - IntSet VisitExpr_(const Mul* op, const Expr& e) final { - return Binary(op, e); + + IntervalSet VisitExpr_(const Mul* op) final { + return VisitBinaryExpr_(op); } - IntSet VisitExpr_(const Div* op, const Expr& e) final { - return Binary(op, e); + + IntervalSet VisitExpr_(const Div* op) final { + return VisitBinaryExpr_(op); } - IntSet VisitExpr_(const Mod* op, const Expr& e) final { - return Binary(op, e); + + IntervalSet VisitExpr_(const Mod* op) final { + return VisitBinaryExpr_(op); } - IntSet VisitExpr_(const Min* op, const Expr& e) final { - return Binary(op, e); + + IntervalSet VisitExpr_(const Min* op) final { + return VisitBinaryExpr_(op); } - IntSet VisitExpr_(const Max* op, const Expr& e) final { - return Binary(op, e); + + IntervalSet VisitExpr_(const Max* op) final { + return VisitBinaryExpr_(op); } - IntSet VisitExpr_(const EQ* op, const Expr& e) final { - return Binary(op, e); + + IntervalSet VisitExpr_(const EQ* op) final { + return VisitBinaryExpr_(op); } - IntSet VisitExpr_(const NE* op, 
const Expr& e) final { - return Binary(op, e); + + IntervalSet VisitExpr_(const NE* op) final { + return VisitBinaryExpr_(op); } - IntSet VisitExpr_(const LT* op, const Expr& e) final { - return Binary(op, e); + + IntervalSet VisitExpr_(const LT* op) final { + return VisitBinaryExpr_(op); } - IntSet VisitExpr_(const LE* op, const Expr& e) final { - return Binary(op, e); + + IntervalSet VisitExpr_(const LE* op) final { + return VisitBinaryExpr_(op); } - IntSet VisitExpr_(const GT* op, const Expr& e) final { - return Binary(op, e); + + IntervalSet VisitExpr_(const GT* op) final { + return VisitBinaryExpr_(op); } - IntSet VisitExpr_(const GE* op, const Expr& e) final { - return Binary(op, e); + + IntervalSet VisitExpr_(const GE* op) final { + return VisitBinaryExpr_(op); } - IntSet VisitExpr_(const And* op, const Expr& e) final { - return Binary(op, e); + + IntervalSet VisitExpr_(const And* op) final { + return VisitBinaryExpr_(op); } - IntSet VisitExpr_(const Or* op, const Expr& e) final { - return Binary(op, e); + + IntervalSet VisitExpr_(const Or* op) final { + return VisitBinaryExpr_(op); } - IntSet VisitExpr_(const Ramp* op, const Expr& e) final { + + IntervalSet VisitExpr_(const Ramp* op) final { CHECK(eval_vec_); - IntSet base = Eval(op->base); - int vstride; - if (GetConstInt(op->stride, &vstride)) { + IntervalSet base = Eval(op->base); + PVar stride; + if (stride.Match(op->stride)) { Type t = op->base.type(); - if (vstride > 0) { + int64_t vstride = stride.Eval()->value; + if (vstride> 0) { return Combine( + analyzer_, base, - IntSet::interval(make_zero(t), - make_const(t, vstride * op->lanes -1))); + IntervalSet(make_zero(t), make_const(t, vstride * op->lanes - 1))); } else { return Combine( + analyzer_, base, - IntSet::interval(make_const(t, vstride * op->lanes + 1), - make_zero(t))); + IntervalSet(make_const(t, vstride * op->lanes + 1), make_zero(t))); } } - LOG(WARNING) << "cannot evaluate set on expression " << e; - return IntSet::everything(); + 
DLOG(WARNING) << "cannot evaluate set on expression " << GetRef(op); + return IntervalSet::Everything(); } - IntSet VisitExpr_(const Broadcast* op, const Expr& e) final { + + IntervalSet VisitExpr_(const Broadcast* op) final { CHECK(eval_vec_); - return Eval(op->value); + return VisitExpr(op->value); } - IntSet VisitExpr_(const Select* op, const Expr& e) final { - IntSet true_set = this->Eval(op->true_value); - IntSet false_set = this->Eval(op->false_value); - return Union({false_set, true_set}); + + IntervalSet VisitExpr_(const Select* op) final { + IntervalSet true_set = this->Eval(op->true_value); + IntervalSet false_set = this->Eval(op->false_value); + return Union(analyzer_, false_set, true_set); } - IntSet VisitExprDefault_(const Node* op, const Expr& e) final { - LOG(WARNING) << "cannot evaluate set type " << e->type_key(); - return IntSet::everything(); + + IntervalSet VisitExprDefault_(const Node* op) final { + DLOG(WARNING) << "cannot evaluate set type " << op->type_key(); + return IntervalSet::Everything(); } private: + // whether set is exactly single point that equals value. 
+ bool MatchPoint(const IntervalSet& set, + const Expr& value) const { + return set->min_value.same_as(value) && set->max_value.same_as(value); + } + template - inline IntSet Binary(const T* op, const Expr& e) { - IntSet a = this->Eval(op->a); - IntSet b = this->Eval(op->b); + inline IntervalSet VisitBinaryExpr_(const T* op) { + IntervalSet a = this->Eval(op->a); + IntervalSet b = this->Eval(op->b); if (MatchPoint(a, op->a) && MatchPoint(b, op->b)) { - return IntSet::single_point(e); + return IntervalSet::SinglePoint(GetRef(op)); } - return Combine(a, b); + return Combine(analyzer_, a, b); } - const std::unordered_map& dom_map_; + Analyzer* analyzer_; + const Map& dom_map_; bool eval_vec_{false}; }; +class IntSetAnalyzer::Impl { + public: + explicit Impl(Analyzer* analyzer) + : analyzer_(analyzer) { + } + + IntSet Eval(const Expr& expr, const Map& dom_map) const { + return IntervalSetEvaluator(analyzer_, dom_map).Eval(expr); + } + + private: + Analyzer* analyzer_; +}; + +IntSetAnalyzer::IntSetAnalyzer(Analyzer* parent) + : impl_(new Impl(parent)) { +} + +IntSetAnalyzer::~IntSetAnalyzer() { + delete impl_; +} + +IntSet IntSetAnalyzer::operator()(const Expr& expr, + const Map& dom_map) { + return impl_->Eval(expr, dom_map); +} + +// Quickly adapt to IntSet interface +// TODO(tqchen): revisit IntSet interface as well. 
+Range IntSet::cover_range(Range max_range) const { + IntSet temp; + const IntervalSetNode* s_int = (*this).as(); + CHECK(s_int != nullptr); + if (s_int->HasUpperBound() && s_int->HasLowerBound()) { + return Range::make_by_min_extent( + s_int->min_value, Simplify(s_int->max_value + 1 - s_int->min_value)); + } + return max_range; +} + +Expr IntSet::min() const { + const IntervalSetNode* s_int = (*this).as(); + CHECK(s_int); + return s_int->min_value; +} + +Expr IntSet::max() const { + const IntervalSetNode* s_int = (*this).as(); + CHECK(s_int); + return s_int->max_value; +} + +bool IntSet::is_nothing() const { + const IntervalSetNode* s_int = (*this).as(); + return (s_int && s_int->IsEmpty()); +} + +bool IntSet::is_everything() const { + const IntervalSetNode* s_int = (*this).as(); + return (s_int && s_int->IsEverything()); +} + +bool IntSet::is_single_point() const { + const IntervalSetNode* s_int = (*this).as(); + return (s_int && s_int->IsSinglePoint()); +} + +bool IntSet::can_prove_positive() const { + const IntervalSetNode* s_int = (*this).as(); + return (s_int && is_positive_const(ir::Simplify(s_int->min_value))); +} + +bool IntSet::can_prove_negative() const { + const IntervalSetNode* s_int = (*this).as(); + return (s_int && is_negative_const(ir::Simplify(s_int->max_value))); +} + +bool IntSet::can_prove_non_positive() const { + if (const auto* s_int = (*this).as()) { + auto max = ir::Simplify(s_int->max_value); + return is_zero(max) || is_negative_const(max); + } + return false; +} + +bool IntSet::can_prove_non_negative() const { + if (const IntervalSetNode* s_int = (*this).as()) { + auto min = ir::Simplify(s_int->min_value); + return is_zero(min) || is_positive_const(min); + } + return false; +} + +SignType IntSet::sign_type() const { + if (can_prove_positive()) { + return kPositive; + } else if (can_prove_negative()) { + return kNegative; + } else if (is_single_point() && is_zero(point_value())) { + return kZero; + } else { + return kUnknown; + } +} +Expr 
IntSet::point_value() const { + const IntervalSetNode* s_int = (*this).as(); + CHECK(s_int && s_int->IsSinglePoint()); + return s_int->min_value; +} + +IntSet IntSet::nothing() { + return IntervalSet::Empty(); +} + +IntSet IntSet::everything() { + return IntervalSet::Everything(); +} + +IntSet IntSet::single_point(Expr x) { + return IntervalSet::SinglePoint(x); +} + +IntSet IntSet::interval(Expr min, Expr max) { + if (min.same_as(max)) { + return IntSet::single_point(min); + } + return IntervalSet(min, max); +} + +// Range related code +inline bool ProveEqual(Expr lhs, Expr rhs) { + return is_zero(ir::Simplify(lhs - rhs)); +} + +IntSet IntSet::range(Range r) { + // must make sure it can be matched back by MatchRange. + if (is_one(r->extent)) { + return IntSet::single_point(r->min); + } + return IntervalSet(r->min, r->extent + r->min - 1); +} + +bool IntSet::match_range(const Range& b) const { + const IntSet& a = *this; + const IntervalSetNode* a_int = a.as(); + if (!a_int) return false; + return ProveEqual(a_int->min_value, b->min) && + ProveEqual(a_int->max_value, b->extent + b->min - 1); +} + +IntSet Union(const Array& sets) { + if (sets.size() == 0) return IntSet::nothing(); + if (sets.size() == 1) return sets[0]; + Analyzer ana; + IntervalSet x = ToIntervalSet(sets[0]); + for (size_t i = 1; i < sets.size(); ++i) { + x = Union(&ana, x, ToIntervalSet(sets[i])); + } + return IntervalSet(ir::Simplify(x->min_value), + ir::Simplify(x->max_value)); +} + +IntSet Intersect(const Array& sets) { + if (sets.size() == 0) return IntSet::nothing(); + if (sets.size() == 1) return sets[0]; + Analyzer ana; + IntervalSet x = ToIntervalSet(sets[0]); + for (size_t i = 1; i < sets.size(); ++i) { + x = Intersect(&ana, x, ToIntervalSet(sets[i])); + } + return IntervalSet(ir::Simplify(x->min_value), + ir::Simplify(x->max_value)); +} + +Map ConvertDomMap(const Map& dom_map) { + Map dmap; + for (auto kv : dom_map) { + dmap.Set(kv.first->var, kv.second); + } + return dmap; +} + +Map 
ConvertDomMap( + const std::unordered_map& dom_map) { + Map dmap; + for (auto kv : dom_map) { + dmap.Set(GetRef(kv.first), kv.second); + } + return dmap; +} + IntSet EvalSet(Expr e, - const std::unordered_map& dom_map) { - return IntSetEvaluator(dom_map, false).Eval(e); + const Map& dom_map) { + Analyzer ana; + return IntervalSetEvaluator(&ana, dom_map, false).Eval(e); } IntSet IntSet::vector(Expr x) { - std::unordered_map dmap; - return IntSetEvaluator(dmap, true).Eval(x); + Analyzer ana; + Map dmap; + return IntervalSetEvaluator(&ana, dmap, true).Eval(x); } IntSet EvalSet(Expr e, const Map& dom_map) { - std::unordered_map dmap; - for (auto kv : dom_map) { - dmap[kv.first->var.as()] = kv.second; - } - return EvalSet(e, dmap); + return EvalSet(e, ConvertDomMap(dom_map)); } -IntSet EvalSet(Range r, +IntSet EvalSet(Expr e, const std::unordered_map& dom_map) { - IntSetEvaluator m(dom_map); - IntSet min_set = m.Eval(r->min).cover_interval(); + return EvalSet(e, ConvertDomMap(dom_map)); +} + +IntSet EvalSet(Range r, + const Map& dom_map) { + Analyzer ana; + IntervalSetEvaluator m(&ana, dom_map); + IntervalSet min_set = m.Eval(r->min); // Simplifying first can give tighter bounds if r->min and r->extent share variables - Expr sum = ComputeExpr(ComputeExpr(r->min, r->extent), 1); - IntSet max_set = m.Eval(Simplify(sum)).cover_interval(); - const Interval& ni = min_set.as()->i; - const Interval& xi = max_set.as()->i; - if (!ni.has_lower_bound()) return IntSet::everything(); - if (!xi.has_upper_bound()) return IntSet::everything(); - return IntervalSet::make(ni.min, xi.max); + Expr sum = r->min + r->extent - 1; + IntervalSet max_set = m.Eval(Simplify(sum)); + if (!min_set->HasLowerBound()) return IntSet::everything(); + if (!max_set->HasUpperBound()) return IntSet::everything(); + return IntervalSet(min_set->min_value, max_set->max_value); } -IntSet EvalSet(IntSet s, +IntSet EvalSet(Range r, const std::unordered_map& dom_map) { - IntSetEvaluator m(dom_map); - s = 
s.cover_interval(); - const IntervalSet* s_int = s.as(); - Expr vmax = s_int->i.has_upper_bound() ? - m.Eval(s_int->i.max).cover_interval().max() : s_int->i.max; - Expr vmin = s_int->i.has_lower_bound() ? - m.Eval(s_int->i.min).cover_interval().min() : s_int->i.min; - return IntervalSet::make(vmin, vmax); + return EvalSet(r, ConvertDomMap(dom_map)); } -class SubExprIntSetEvaluator : public IntSetEvaluator { +IntSet EvalSet(IntSet s, + const std::unordered_map& dom_map) { + Analyzer ana; + auto dmap = ConvertDomMap(dom_map); + IntervalSetEvaluator m(&ana, dmap); + const IntervalSetNode* s_int = s.as(); + Expr vmax = s_int->HasUpperBound() ? + m.Eval(s_int->max_value).max() : s_int->max_value; + Expr vmin = s_int->HasLowerBound() ? + m.Eval(s_int->min_value).min() : s_int->min_value; + return IntervalSet(vmin, vmax); +} + +class SubExprIntervalSetEvaluator : public IntervalSetEvaluator { public: - explicit SubExprIntSetEvaluator( - const std::unordered_map& dom_map) - : IntSetEvaluator(dom_map) {} + explicit SubExprIntervalSetEvaluator( + Analyzer* analyzer, + const Map& dom_map) + : IntervalSetEvaluator(analyzer, dom_map) {} - IntSet VisitExpr(const Expr& n, const Expr& e) final { - IntSet ret = IntSetEvaluator::VisitExpr(n, e); + IntervalSet VisitExpr(const Expr& n) final { + IntervalSet ret = IntervalSetEvaluator::VisitExpr(n); expr_map[n] = ret; return ret; } @@ -635,28 +705,26 @@ class SubExprIntSetEvaluator : public IntSetEvaluator { ExprIntSetMap expr_map; }; -ExprIntSetMap EvalSetForEachSubExpr(Expr e, +ExprIntSetMap EvalSetForEachSubExpr( + Expr e, const std::unordered_map& dom_map) { - SubExprIntSetEvaluator m(dom_map); + Analyzer ana; + auto dmap = ConvertDomMap(dom_map); + SubExprIntervalSetEvaluator m(&ana, dmap); m.Eval(e); return m.expr_map; } IntSet EvalSet(Range r, const Map& dom_map) { - std::unordered_map dmap; - for (auto kv : dom_map) { - dmap[kv.first->var.as()] = kv.second; - } - return EvalSet(r, dmap); + return EvalSet(r, 
ConvertDomMap(dom_map)); } TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) -.set_dispatch([](const IntervalSet *op, IRPrinter *p) { - p->stream << "interval-set" - << "[" << op->i.min << ", " - << op->i.max << ']'; +.set_dispatch([](const IntervalSetNode *op, IRPrinter *p) { + p->stream << "IntervalSet" + << "[" << op->min_value << ", " + << op->max_value << ']'; }); - } // namespace arith } // namespace tvm diff --git a/src/arithmetic/int_set.h b/src/arithmetic/int_set.h new file mode 100644 index 000000000000..bf7fec24f78a --- /dev/null +++ b/src/arithmetic/int_set.h @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file int_set.h + * \brief Internal data structure for integer set. + */ +#ifndef TVM_ARITHMETIC_INT_SET_H_ +#define TVM_ARITHMETIC_INT_SET_H_ + +#include +#include +#include +#include "const_fold.h" + +namespace tvm { +namespace arith { + +/*! + * \brief Symbolic interval set. + * + * \note We intentionally keep the internal of IntSet private, + as we might change it later. + */ +class IntervalSetNode : public IntSetNode { + public: + /*! \brief Minimum value in the interval. */ + Expr min_value; + /*! \brief Maximum value in the interval. 
*/ + Expr max_value; + + // visitor overload. + void VisitAttrs(tvm::AttrVisitor* v) final { + v->Visit("min_value", &min_value); + v->Visit("max_value", &max_value); + } + + /*! \return Whether the interval has upper bound. */ + bool HasUpperBound() const { + return !is_pos_inf(max_value) && !IsEmpty(); + } + /*! \return Whether the interval has lower bound. */ + bool HasLowerBound() const { + return !is_neg_inf(min_value) && !IsEmpty(); + } + /*! \return Whether the interval is a single point. */ + bool IsSinglePoint() const { + return min_value.same_as(max_value); + } + /*! \return whether interval represent nothing */ + bool IsEmpty() const { + // during computations, either extreme could occur. + return is_pos_inf(min_value) || is_neg_inf(max_value); + } + /*! \return whether interval represent everything */ + bool IsEverything() const { + return is_neg_inf(min_value) && is_pos_inf(max_value); + } + + static constexpr const char* _type_key = "arith.IntervalSet"; + TVM_DECLARE_NODE_TYPE_INFO(IntervalSetNode, IntSetNode); +}; + +/*! + * \brief Interval set used for symbolic integer analysis. + * \sa IntervalSetNode + */ +class IntervalSet : public IntSet { + public: + /*! + * \brief Make a new instance of interval set. + * \param min_value The minimum value in the interval. + * \param max_value The maximum value in the interval. + * \return The created set. + */ + TVM_DLL IntervalSet(Expr min_value, Expr max_value); + + /*! + * \brief Create an IntervalSet that represents a single point. + * \param value The value to be represented. + * \return The result set. + */ + static IntervalSet SinglePoint(Expr value) { + return IntervalSet(value, value); + } + /*! + * \brief Create an IntervalSet that represents everything. + * \param value The value to be represented. + * \return The result set. + */ + static IntervalSet Everything() { + return IntervalSet(neg_inf(), pos_inf()); + } + /*! + * \brief Create an empty eet. + * \return The result set. 
+ */ + static IntervalSet Empty() { + return IntervalSet(pos_inf(), neg_inf()); + } + + TVM_DEFINE_NODE_REF_COW(IntervalSetNode); + TVM_DEFINE_NODE_REF_METHODS(IntervalSet, IntSet, IntervalSetNode); +}; + +/*! + * \brief Create union of two IntervalSets. + * \param analyzer The analyzer for simplification analysis. + * \param a The first set. + * \param b The second set. + * \return The result set. + */ +TVM_DLL IntervalSet Union(Analyzer* analyzer, IntervalSet a, IntervalSet b); + +/*! + * \brief Create insersection of two IntervalSets. + * \param analzyer The analyzer for simplification analysis. + * \param a The first set. + * \param b The second set. + * \return The result set. + */ +TVM_DLL IntervalSet Intersect(Analyzer *analzyer, IntervalSet a, IntervalSet b); + +} // namespace arith +} // namespace tvm + +#endif // TVM_ARITHMETIC_INT_SET_H_ diff --git a/src/arithmetic/int_set_internal.h b/src/arithmetic/int_set_internal.h deleted file mode 100644 index 8b675cfbffda..000000000000 --- a/src/arithmetic/int_set_internal.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Copyright (c) 2017 by Contributors - * \file int_set_internal.h - * \brief Implementations of integer set - */ -#ifndef TVM_ARITHMETIC_INT_SET_INTERNAL_H_ -#define TVM_ARITHMETIC_INT_SET_INTERNAL_H_ - -#include -#include -#include - -namespace tvm { -namespace arith { - -using HalideIR::Internal::Interval; - -/*! \brief Set of continuous interval */ -struct IntervalSet : public IntSetNode { - /*! \brief the internal interval*/ - Interval i; - - static IntSet make(Interval i) { - NodePtr n = - make_node(); - n->i = i; - return IntSet(n); - } - static IntSet make(Expr min, Expr max) { - NodePtr n = - make_node(); - n->i.min = min; - n->i.max = max; - return IntSet(n); - } - - static constexpr const char* _type_key = "IntervalSet"; - TVM_DECLARE_NODE_TYPE_INFO(IntervalSet, IntSetNode); -}; - -/*! - * \brief set represented by strided integers - * Reserved for cases where strided access is supported. - */ -struct StrideSet : public IntSetNode { - /*! \brief the base inetrval */ - Interval base; - /*! \brief additional extents in positive number */ - Array extents; - /*! \brief additional strides in positive number */ - Array strides; - - static constexpr const char* _type_key = "StrideSet"; - TVM_DECLARE_NODE_TYPE_INFO(StrideSet, IntSetNode); -}; - -} // namespace arith -} // namespace tvm - -#endif // TVM_ARITHMETIC_INT_SET_INTERNAL_H_ diff --git a/src/arithmetic/modular_set.cc b/src/arithmetic/modular_set.cc index 7701e04844fa..b3e943fc7631 100644 --- a/src/arithmetic/modular_set.cc +++ b/src/arithmetic/modular_set.cc @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include "pattern_match.h" namespace tvm { @@ -35,11 +37,12 @@ using namespace ir; TVM_REGISTER_NODE_TYPE(ModularSetNode); -ModularSet ModularSetNode::make(int64_t coeff, int64_t base) { +ModularSet::ModularSet(int64_t coeff, int64_t base) { auto node = make_node(); node->coeff = coeff; node->base = base; - return ModularSet(node); + // finish construction. 
+ node_ = std::move(node); } TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) @@ -366,13 +369,13 @@ class ModularSetAnalyzer::Impl : * \return Bound that represent everything dtype can represent. */ static Entry Nothing() { - return Entry(0, 1); + return Entry(0, 1); } }; ModularSet ModularSetAnalyzer::operator()(const Expr& expr) { Entry ret = impl_->VisitExpr(expr); - return ModularSetNode::make(ret.coeff, ret.base); + return ModularSet(ret.coeff, ret.base); } void ModularSetAnalyzer::Update(const Var& var, diff --git a/src/arithmetic/rewrite_simplify.cc b/src/arithmetic/rewrite_simplify.cc index 58d2b83a223a..ea6530631880 100644 --- a/src/arithmetic/rewrite_simplify.cc +++ b/src/arithmetic/rewrite_simplify.cc @@ -80,12 +80,6 @@ TryCompare(const Expr& x, int64_t val) { return kLT; } } - if (val == 0) { - ModularSet dmod = parent_->modular_set(diff); - if (dmod->base != 0) { - return kNE; - } - } ConstIntBound dbound = parent_->const_int_bound(diff); if (dbound->min_value > val) { return kGT; @@ -99,6 +93,12 @@ TryCompare(const Expr& x, int64_t val) { if (dbound->max_value <= val) { return kLE; } + if (val == 0) { + ModularSet dmod = parent_->modular_set(diff); + if (dmod->base != 0) { + return kNE; + } + } return kUnknown; } @@ -284,11 +284,39 @@ Mutate_(const Sub* op, const Expr& self) { CanProveEqual(((b1 - s2) - (b2 - s1)).Eval(), 0)); // modular-div simplification - // Always pre-condition on positive integer domain + // Note that c*(x/c) + x % c == x is true for every x and c != 0 even for truncated division TVM_TRY_REWRITE_IF(x - (x / c1) * c1, x % c1, - CanProveGreaterEqual(x.Eval(), 0) && c1.Eval()->value > 0); + c1.Eval()->value != 0); TVM_TRY_REWRITE_IF((x / c1) * c1 - x, 0 - (x % c1), - CanProveGreaterEqual(x.Eval(), 0) && c1.Eval()->value > 0); + c1.Eval()->value != 0); + TVM_TRY_REWRITE_IF(x - ((x + y) / c1) * c1, (x + y) % c1 - y, + c1.Eval()->value != 0); + TVM_TRY_REWRITE_IF(((x + y) / c1) * c1 - x, y - ((x + y) % c1), + c1.Eval()->value != 0); + 
TVM_TRY_REWRITE_IF(x - ((x - y) / c1) * c1, (x - y) % c1 + y, + c1.Eval()->value != 0); + TVM_TRY_REWRITE_IF(((x - y) / c1) * c1 - x, 0 - (x - y) % c1 - y, + c1.Eval()->value != 0); + + TVM_TRY_REWRITE_IF(x * c2 - (x / c1) * c3, (x % c1) * c2, + c1.Eval()->value != 0 && + c3.Eval()->value == c1.Eval()->value * c2.Eval()->value); + TVM_TRY_REWRITE_IF((x / c1) * c3 - x * c2, 0 - (x % c1) * c2, + c1.Eval()->value != 0 && + c3.Eval()->value == c1.Eval()->value * c2.Eval()->value); + TVM_TRY_REWRITE_IF(x * c2 - ((x + y) / c1) * c3, ((x + y) % c1 - y) * c2, + c1.Eval()->value != 0 && + c3.Eval()->value == c1.Eval()->value * c2.Eval()->value); + TVM_TRY_REWRITE_IF(((x + y) / c1) * c3 - x * c2, (y - ((x + y) % c1)) * c2, + c1.Eval()->value != 0 && + c3.Eval()->value == c1.Eval()->value * c2.Eval()->value); + TVM_TRY_REWRITE_IF(x * c2 - ((x - y) / c1) * c3, ((x - y) % c1 + y) * c2, + c1.Eval()->value != 0 && + c3.Eval()->value == c1.Eval()->value * c2.Eval()->value); + TVM_TRY_REWRITE_IF(((x - y) / c1) * c3 - x * c2, (0 - (x - y) % c1 - y) * c2, + c1.Eval()->value != 0 && + c3.Eval()->value == c1.Eval()->value * c2.Eval()->value); + TVM_TRY_REWRITE_IF((x + c1) / c3 - (x + c2) / c3, ((x + (c1 % c3)) % c3 + (c1 - c2)) / c3, CanProveGreaterEqual(x.Eval(), -c2.Eval()->value) && @@ -348,6 +376,7 @@ Mutate_(const Mul* op, const Expr& self) { // canonicalization TVM_TRY_RECURSIVE_REWRITE(x * (c1 * y), (x * y) * c1); + TVM_TRY_RECURSIVE_REWRITE(c1 * x, x * c1); TVM_TRY_RECURSIVE_REWRITE_IF( (x - y) * c1, (y - x) * (0 - c1), c1.Eval()->value < 0); @@ -396,6 +425,16 @@ Mutate_(const Div* op, const Expr& self) { // We adopt the default C division uses truncation instead of floordiv. // This means most rules need to check non-negativeness of the operands. + // TryConstFold doesn't work for negative cases because it is also used by legacy + // parts of tvm which still assume euclidean div. In this simplifier we assume that the division + // is truncated, so perform const folding again. 
+ // NOTE: trunc div required + if ((c1 / c2).Match(ret)) { + int64_t c1val = c1.Eval()->value; + int64_t c2val = c2.Eval()->value; + return make_const(op->type, c1val / c2val); + } + // while it is always true for trunc div // restrict to common case(positive div) TVM_TRY_REWRITE_IF((x / c1) / c2, x / (c1 * c2), @@ -595,10 +634,12 @@ Mutate_(const Mod* op, const Expr& self) { TVM_TRY_REWRITE_IF((x * c1 + y) % c2, y % c2, c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual((x * c1).Eval(), 0) && CanProveGreaterEqual(y.Eval(), 0)); TVM_TRY_REWRITE_IF((x + c1) % c2, x % c2, c2.Eval()->value > 0 && + c1.Eval()->value >= 0 && c1.Eval()->value % c2.Eval()->value == 0 && CanProveGreaterEqual(x.Eval(), 0)); @@ -606,7 +647,13 @@ Mutate_(const Mod* op, const Expr& self) { c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0 && CanProveGreaterEqual(x.Eval(), 0) && - CanProveGreaterEqual(y.Eval(), 0)); + CanProveGreaterEqual((y * c1).Eval(), 0)); + + // canonicalization: x % c == x % (-c) for truncated division + // NOTE: trunc div required + TVM_TRY_RECURSIVE_REWRITE_IF(x % c1, + x % PConst(make_const(op->type, -c1.Eval()->value)), + c1.Eval()->value < 0); // try modular analysis if ((x % c1).Match(ret)) { @@ -766,7 +813,9 @@ Mutate_(const Min* op, const Expr& self) { // canonicalization TVM_TRY_RECURSIVE_REWRITE(min(min(x, c1), y), min(min(x, y), c1)); - TVM_TRY_RECURSIVE_REWRITE(min(c1 - x, c2), c1 - max(x, c2 - c1)); + TVM_TRY_RECURSIVE_REWRITE_IF( + min(c1 - x, c2), c1 - max(x, c1 - c2), + c2.Eval()->value != 0); } // condition rules. @@ -914,7 +963,8 @@ Mutate_(const Max* op, const Expr& self) { // canonicalization TVM_TRY_RECURSIVE_REWRITE(max(max(x, c1), y), max(max(x, y), c1)); - TVM_TRY_RECURSIVE_REWRITE(max(c1 - x, c2), c1 - min(x, c2 - c1)); + TVM_TRY_RECURSIVE_REWRITE_IF( + max(c1 - x, c2), c1 - min(x, c1 - c2), c2.Eval()->value != 0); } // condition rules. 
@@ -1025,20 +1075,53 @@ Mutate_(const LT* op, const Expr& self) { TVM_TRY_REWRITE_IF(x * c1 < y * c1, y < x, c1.Eval()->value < 0); - // require c1 > 0 to work for any div mode TVM_TRY_REWRITE_IF(x * c2 < c1, x < (c1 - 1) / c2 + 1, c1.Eval()->value > 0 && c2.Eval()->value > 0); - TVM_TRY_REWRITE_IF(x / c1 < c2, x < c1 * c2, + // NOTE: trunc div required + TVM_TRY_REWRITE_IF(x * c2 < c1, x < c1 / c2, + c1.Eval()->value <= 0 && + c2.Eval()->value > 0); + // NOTE: trunc div required (euclidean is ok too, floored is not) + TVM_TRY_REWRITE_IF(x * c2 < c1, (c1 - 1) / c2 - 1 < x, c1.Eval()->value > 0 && + c2.Eval()->value < 0); + // NOTE: trunc div required (floored is ok too, euclidean is not) + TVM_TRY_REWRITE_IF(x * c2 < c1, c1 / c2 < x, + c1.Eval()->value <= 0 && + c2.Eval()->value < 0); + + // NOTE: trunc div required + TVM_TRY_REWRITE_IF(c1 < x * c2, (c1 + 1) / c2 - 1 < x, + c1.Eval()->value < 0 && c2.Eval()->value > 0); - TVM_TRY_REWRITE_IF(c1 < x * c2, c1 / c2 < x, c1.Eval()->value >= 0 && c2.Eval()->value > 0); + // NOTE: trunc div required (floored is ok too, euclidean is not) + TVM_TRY_REWRITE_IF(c1 < x * c2, x < (c1 + 1) / c2 + 1, + c1.Eval()->value < 0 && + c2.Eval()->value < 0); + // NOTE: trunc div required (euclidean is ok too, floored is not) + TVM_TRY_REWRITE_IF(c1 < x * c2, x < c1 / c2, + c1.Eval()->value >= 0 && + c2.Eval()->value < 0); + + TVM_TRY_REWRITE_IF(x / c1 < c2, x < c1 * c2, + c1.Eval()->value > 0 && + c2.Eval()->value > 0); + // NOTE: trunc div required + TVM_TRY_REWRITE_IF(x / c1 < c2, x < c1 * (c2 - 1) + 1, + c1.Eval()->value > 0 && + c2.Eval()->value <= 0); + TVM_TRY_REWRITE_IF(c1 < x / c2, (c1 + 1) * c2 - 1 < x, c1.Eval()->value >= 0 && c2.Eval()->value > 0); + // NOTE: trunc div required + TVM_TRY_REWRITE_IF(c1 < x / c2, c1 * c2 < x, + c1.Eval()->value < 0 && + c2.Eval()->value > 0); // division related simplificationx // invariance for any div mod: x - (x / c1) * c1 == x % c1 @@ -1200,11 +1283,11 @@ Mutate_(const Select* op, const 
Expr& self) { Expr cond = Mutate(op->condition); Expr true_value, false_value; { - ConstraintContext constraint(parent_, cond); + With constraint(parent_, cond); true_value = Mutate(op->true_value); } { - ConstraintContext constraint(parent_, Mutate(Not::make(cond))); + With constraint(parent_, Mutate(Not::make(cond))); false_value = Mutate(op->false_value); } if (is_zero(cond)) { @@ -1237,11 +1320,11 @@ Mutate_(const Call* op, const Expr& self) { Expr cond = Mutate(op->args[0]); Expr true_value, false_value; { - ConstraintContext constraint(parent_, cond); + With constraint(parent_, cond); true_value = Mutate(op->args[1]); } { - ConstraintContext constraint(parent_, Mutate(Not::make(cond))); + With constraint(parent_, Mutate(Not::make(cond))); false_value = Mutate(op->args[2]); } if (is_zero(cond)) { diff --git a/src/arithmetic/stmt_simplify.cc b/src/arithmetic/stmt_simplify.cc index c793214b92f4..403187eb39fd 100644 --- a/src/arithmetic/stmt_simplify.cc +++ b/src/arithmetic/stmt_simplify.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -48,11 +48,11 @@ class StmtSimplifier : public IRMutator { Expr condition = this->Mutate(op->condition); Stmt then_case, else_case; { - ConstraintContext ctx(&analyzer_, condition); + With ctx(&analyzer_, condition); then_case = this->Mutate(op->then_case); } if (op->else_case.defined()) { - ConstraintContext ctx(&analyzer_, Mutate(Not::make(condition))); + With ctx(&analyzer_, Mutate(Not::make(condition))); else_case = this->Mutate(op->else_case); } if (is_one(condition)) return then_case; @@ -94,7 +94,7 @@ class StmtSimplifier : public IRMutator { Stmt Mutate_(const AssertStmt* op, const Stmt& s) final { Expr condition = this->Mutate(op->condition); Expr message = this->Mutate(op->message); - ConstraintContext ctx(&analyzer_, condition); + With ctx(&analyzer_, condition); Stmt body = this->Mutate(op->body); if (condition.same_as(op->condition) && diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index 57e300fafec2..0a488f38457b 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -18,7 +18,6 @@ */ /*! - * Copyright (c) 2017 by Contributors * Compile executable modules. 
* \file build_module.cc */ @@ -84,7 +83,7 @@ Target CreateTarget(const std::string& target_name, t->device_type = kDLGPU; t->keys_array.push_back(ir::StringImm::make("cuda")); t->keys_array.push_back(ir::StringImm::make("gpu")); - t->max_num_threads = 512; + t->max_num_threads = 1024; t->thread_warp_size = 32; } else if (target_name == "rocm" || target_name == "opencl") { // For now assume rocm schedule for opencl @@ -148,8 +147,7 @@ TVM_REGISTER_API("_TargetCreate") TVM_REGISTER_API("_TargetFromString") .set_body([](TVMArgs args, TVMRetValue* ret) { std::string target_str = args[0]; - - *ret = Target::create(target_str); + *ret = Target::Create(target_str); }); std::vector TargetNode::keys() const { @@ -207,7 +205,7 @@ std::string GetDeviceName(const std::string& target_str) { return ""; } -Target Target::create(const std::string& target_str) { +Target Target::Create(const std::string& target_str) { if (target_str.length() == 0) { LOG(ERROR) << "target_str must not be empty"; } @@ -231,25 +229,24 @@ Target Target::create(const std::string& target_str) { struct TVMTargetThreadLocalEntry { /*! \brief The current target context */ std::stack context_stack; - - TVMTargetThreadLocalEntry() { - } }; /*! \brief Thread local store to hold the Target context stack. 
*/ typedef dmlc::ThreadLocalStore TVMTargetThreadLocalStore; -void Target::EnterTargetScope(const tvm::Target& target) { +void Target::EnterWithScope() { TVMTargetThreadLocalEntry *entry = TVMTargetThreadLocalStore::Get(); - entry->context_stack.push(target); + entry->context_stack.push(*this); } -void Target::ExitTargetScope() { +void Target::ExitWithScope() { TVMTargetThreadLocalEntry *entry = TVMTargetThreadLocalStore::Get(); + CHECK(!entry->context_stack.empty()); + CHECK(entry->context_stack.top().same_as(*this)); entry->context_stack.pop(); } -tvm::Target Target::current_target(bool allow_not_defined) { +tvm::Target Target::Current(bool allow_not_defined) { TVMTargetThreadLocalEntry *entry = TVMTargetThreadLocalStore::Get(); if (entry->context_stack.size() > 0) { return entry->context_stack.top(); @@ -311,7 +308,7 @@ bool LLVMEnabled() { /*! \return The default host target for a given device target */ Target DefaultTargetHost(Target target) { - if (target->device_type == kDLCPU) { + if (target.defined() && target->device_type == kDLCPU) { return target; } else { if (LLVMEnabled()) { @@ -392,7 +389,11 @@ Stmt BuildStmt(Schedule sch, if (loop_partition) { stmt = ir::LoopPartition(stmt, config->partition_const_loop); } - stmt = ir::VectorizeLoop(stmt); + if (config->disable_vectorize) { + stmt = ir::SkipVectorize(stmt); + } else { + stmt = ir::VectorizeLoop(stmt); + } stmt = ir::InjectVirtualThread(stmt); stmt = ir::InjectDoubleBuffer(stmt, config->double_buffer_split_loop); stmt = ir::StorageRewrite(stmt); @@ -570,7 +571,7 @@ runtime::Module build(const Map>& inputs, const BuildConfig& config) { Map> updated_input; for (const auto& it : inputs) { - auto target = Target::create(it.first); + auto target = Target::Create(it.first); updated_input.Set(target, it.second); } return build(updated_input, target_host, config); @@ -585,33 +586,35 @@ runtime::Module build(const Array& funcs, return build(inputs, target_host, config); } -BuildConfig build_config() { 
+BuildConfig BuildConfig::Create() { return BuildConfig(make_node()); } /*! \brief Entry to hold the BuildConfig context stack. */ struct TVMBuildConfigThreadLocalEntry { /*! \brief The default build config if the stack is empty */ - tvm::BuildConfig default_config; + BuildConfig default_config; /*! \brief The current build config context */ - std::stack context_stack; + std::stack context_stack; TVMBuildConfigThreadLocalEntry() : - default_config(build_config()) { + default_config(BuildConfig::Create()) { } }; /*! \brief Thread local store to hold the BuildConfig context stack. */ typedef dmlc::ThreadLocalStore TVMBuildConfigThreadLocalStore; -void BuildConfig::EnterBuildConfigScope(const tvm::BuildConfig& build_config) { +void BuildConfig::EnterWithScope() { TVMBuildConfigThreadLocalEntry *entry = TVMBuildConfigThreadLocalStore::Get(); - entry->context_stack.push(build_config); + entry->context_stack.push(*this); } -void BuildConfig::ExitBuildConfigScope() { +void BuildConfig::ExitWithScope() { TVMBuildConfigThreadLocalEntry *entry = TVMBuildConfigThreadLocalStore::Get(); + CHECK(!entry->context_stack.empty()); + CHECK(entry->context_stack.top().same_as(*this)); entry->context_stack.pop(); } @@ -642,6 +645,7 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) p->stream << "dump_pass_ir=" << op->dump_pass_ir << ", "; p->stream << "instrument_bound_checkers=" << op->instrument_bound_checkers << ", "; p->stream << "disable_select_rewriting=" << op->disable_select_rewriting; + p->stream << "disable_vectorize=" << op->disable_vectorize; p->stream << ")"; }); @@ -709,7 +713,7 @@ GenericFunc& GenericFunc::register_func(const std::vector& tags, void GenericFunc::CallPacked(TVMArgs args, TVMRetValue* ret) const { auto node = static_cast(node_.get()); - auto target = Target::current_target(true); + auto target = Target::Current(true); PackedFunc func; if (target.defined()) { @@ -735,16 +739,21 @@ TVM_REGISTER_API("_GetCurrentBuildConfig") *ret = BuildConfig::Current(); }); +class 
BuildConfig::Internal { + public: + static void EnterScope(BuildConfig target) { + target.EnterWithScope(); + } + static void ExitScope(BuildConfig target) { + target.ExitWithScope(); + } +}; + TVM_REGISTER_API("_EnterBuildConfigScope") -.set_body([](TVMArgs args, TVMRetValue* ret) { - BuildConfig target = args[0]; - BuildConfig::EnterBuildConfigScope(target); - }); +.set_body_typed(BuildConfig::Internal::EnterScope); TVM_REGISTER_API("_ExitBuildConfigScope") -.set_body([](TVMArgs args, TVMRetValue* ret) { - BuildConfig::ExitBuildConfigScope(); - }); +.set_body_typed(BuildConfig::Internal::ExitScope); TVM_REGISTER_API("_BuildConfigSetAddLowerPass") .set_body([](TVMArgs args, TVMRetValue* ret) { @@ -831,18 +840,23 @@ TVM_REGISTER_API("_GenericFuncCallFunc") TVM_REGISTER_API("_GetCurrentTarget") .set_body([](TVMArgs args, TVMRetValue* ret) { bool allow_not_defined = args[0]; - *ret = Target::current_target(allow_not_defined); + *ret = Target::Current(allow_not_defined); }); +class Target::Internal { + public: + static void EnterScope(Target target) { + target.EnterWithScope(); + } + static void ExitScope(Target target) { + target.ExitWithScope(); + } +}; + TVM_REGISTER_API("_EnterTargetScope") -.set_body([](TVMArgs args, TVMRetValue* ret) { - Target target = args[0]; - Target::EnterTargetScope(target); - }); +.set_body_typed(Target::Internal::EnterScope); TVM_REGISTER_API("_ExitTargetScope") -.set_body([](TVMArgs args, TVMRetValue* ret) { - Target::ExitTargetScope(); - }); +.set_body_typed(Target::Internal::ExitScope); } // namespace tvm diff --git a/src/codegen/codegen_aocl.cc b/src/codegen/codegen_aocl.cc index 6f899cbb0b53..03b9b6869d17 100644 --- a/src/codegen/codegen_aocl.cc +++ b/src/codegen/codegen_aocl.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -54,7 +54,7 @@ runtime::Module BuildAOCL(Array funcs, std::string target_str, std::string cmd = "aoc aocl.cl"; // AOCL supports fp64. cmd += " -Dcl_khr_fp64"; - Target target = Target::create(target_str); + Target target = Target::Create(target_str); if (target->device_name != "") { cmd += " -board=" + target->device_name; } diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc index ef92f9ae3175..22dde1c46389 100644 --- a/src/codegen/codegen_cuda.cc +++ b/src/codegen/codegen_cuda.cc @@ -57,6 +57,10 @@ std::string CodeGenCUDA::Finish() { decl_stream << "#include \n"; } + if (need_math_constants_h_) { + decl_stream << "#include \n"; + } + return CodeGenC::Finish(); } @@ -318,8 +322,19 @@ inline void PrintConst(const FloatImm* op, std::ostream& os, CodeGenCUDA* p) { / switch (op->type.bits()) { case 64: case 32: { std::ostringstream temp; - temp << std::scientific << op->value; - if (op->type.bits() == 32) temp << 'f'; + if (std::isinf(op->value)) { + if (op->value < 0) { + temp << "-"; + } + temp << ((op->type.bits() == 32) ? "CUDART_INF_F" : "CUDART_INF"); + p->need_math_constants_h_ = true; + } else if (std::isnan(op->value)) { + temp << ((op->type.bits() == 32) ? 
"CUDART_NAN_F" : "CUDART_NAN"); + p->need_math_constants_h_ = true; + } else { + temp << std::scientific << op->value; + if (op->type.bits() == 32) temp << 'f'; + } p->MarkConst(temp.str()); os << temp.str(); break; diff --git a/src/codegen/codegen_cuda.h b/src/codegen/codegen_cuda.h index 381784a13a57..acd759f33889 100644 --- a/src/codegen/codegen_cuda.h +++ b/src/codegen/codegen_cuda.h @@ -39,7 +39,9 @@ class CodeGenCUDA final : public CodeGenC { void Init(bool output_ssa); void AddFunction(LoweredFunc f); std::string Finish(); - bool need_include_path() { return (enable_fp16_ || enable_int8_); } + bool need_include_path() { + return (enable_fp16_ || enable_int8_ || need_math_constants_h_); + } // override behavior void VisitStmt_(const ir::For* op) final; void PrintStorageSync(const Call* op) final; @@ -70,6 +72,9 @@ class CodeGenCUDA final : public CodeGenC { bool enable_fp16_{false}; // whether enable int8 bool enable_int8_{false}; + // whether need math_constants.h + bool need_math_constants_h_{false}; + friend void PrintConst(const FloatImm* op, std::ostream& os, CodeGenCUDA* p); }; } // namespace codegen diff --git a/src/codegen/codegen_opencl.cc b/src/codegen/codegen_opencl.cc index 382124a7ed2d..0b33bf43c151 100644 --- a/src/codegen/codegen_opencl.cc +++ b/src/codegen/codegen_opencl.cc @@ -247,6 +247,19 @@ void CodeGenOpenCL::VisitExpr_(const Select* op, std::ostream& os) { // NOLINT( CodeGenC::VisitExpr_(op, os); } +void CodeGenOpenCL::VisitExpr_(const FloatImm *op, std::ostream& os) { // NOLINT(*) + if (std::isinf(op->value)) { + if (op->value < 0) { + os << "-"; + } + os << "INFINITY"; + } else if (std::isnan(op->value)) { + os << "NAN"; + } else { + CodeGenC::VisitExpr_(op, os); + } +} + runtime::Module BuildOpenCL(Array funcs) { using tvm::runtime::Registry; bool output_ssa = false; diff --git a/src/codegen/codegen_opencl.h b/src/codegen/codegen_opencl.h index 0eff3a633ba3..36a55a545cbd 100644 --- a/src/codegen/codegen_opencl.h +++ 
b/src/codegen/codegen_opencl.h @@ -59,6 +59,7 @@ class CodeGenOpenCL final : public CodeGenC { void VisitExpr_(const Broadcast* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const Call* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const Select* op, std::ostream& os) final; // NOLINT(*) + void VisitExpr_(const FloatImm *op, std::ostream& os) final; // NOLINT(*) private: // whether enable fp16 and fp64 extension diff --git a/src/codegen/codegen_vhls.cc b/src/codegen/codegen_vhls.cc index a18312fe6af5..4d86cc5b4b00 100644 --- a/src/codegen/codegen_vhls.cc +++ b/src/codegen/codegen_vhls.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -155,7 +155,7 @@ runtime::Module BuildSDAccel(Array funcs, std::string target_str) { std::string xclbin; if (const auto* f = Registry::Get("tvm_callback_sdaccel_compile")) { - Target target = Target::create(target_str); + Target target = Target::Create(target_str); xclbin = (*f)(kernel_info, target->device_name).operator std::string(); } else { LOG(FATAL) << "Cannot compile Vivado HLS code."; diff --git a/src/codegen/datatype/registry.cc b/src/codegen/datatype/registry.cc new file mode 100644 index 000000000000..28cc58204e8d --- /dev/null +++ b/src/codegen/datatype/registry.cc @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "registry.h" +#include + +namespace tvm { +namespace datatype { + +TVM_REGISTER_GLOBAL("_datatype_register").set_body([](TVMArgs args, TVMRetValue* ret) { + datatype::Registry::Global()->Register(args[0], static_cast(args[1].operator int())); +}); + +TVM_REGISTER_GLOBAL("_datatype_get_type_code").set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = datatype::Registry::Global()->GetTypeCode(args[0]); +}); + +TVM_REGISTER_GLOBAL("_datatype_get_type_name").set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = Registry::Global()->GetTypeName(args[0].operator int()); +}); + +TVM_REGISTER_GLOBAL("_datatype_get_type_registered").set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = Registry::Global()->GetTypeRegistered(args[0].operator int()); +}); + +Registry* Registry::Global() { + static Registry inst; + return &inst; +} + +void Registry::Register(const std::string& type_name, uint8_t type_code) { + CHECK(type_code >= kCustomBegin) << "Please choose a type code >= kCustomBegin for custom types"; + code_to_name_[type_code] = type_name; + name_to_code_[type_name] = type_code; +} + +uint8_t Registry::GetTypeCode(const std::string& type_name) { + CHECK(name_to_code_.find(type_name) != name_to_code_.end()) + << "Type name " << type_name << " not registered"; + return name_to_code_[type_name]; +} + +std::string Registry::GetTypeName(uint8_t type_code) { + 
CHECK(code_to_name_.find(type_code) != code_to_name_.end()) + << "Type code " << static_cast(type_code) << " not registered"; + return code_to_name_[type_code]; +} + +const runtime::PackedFunc* GetCastLowerFunc(const std::string& target, uint8_t type_code, + uint8_t src_type_code) { + std::ostringstream ss; + ss << "tvm.datatype.lower."; + ss << target << "."; + ss << "Cast" + << "."; + + if (datatype::Registry::Global()->GetTypeRegistered(type_code)) { + ss << datatype::Registry::Global()->GetTypeName(type_code); + } else { + ss << runtime::TypeCode2Str(type_code); + } + + ss << "."; + + if (datatype::Registry::Global()->GetTypeRegistered(src_type_code)) { + ss << datatype::Registry::Global()->GetTypeName(src_type_code); + } else { + ss << runtime::TypeCode2Str(src_type_code); + } + + return runtime::Registry::Get(ss.str()); +} + +const runtime::PackedFunc* GetFloatImmLowerFunc(const std::string& target, uint8_t type_code) { + std::ostringstream ss; + ss << "tvm.datatype.lower."; + ss << target; + ss << ".FloatImm."; + ss << datatype::Registry::Global()->GetTypeName(type_code); + return runtime::Registry::Get(ss.str()); +} + +uint64_t ConvertConstScalar(uint8_t type_code, double value) { + std::ostringstream ss; + ss << "tvm.datatype.convertconstscalar.float."; + ss << datatype::Registry::Global()->GetTypeName(type_code); + auto make_const_scalar_func = runtime::Registry::Get(ss.str()); + return (*make_const_scalar_func)(value).operator uint64_t(); +} + +} // namespace datatype +} // namespace tvm diff --git a/src/codegen/datatype/registry.h b/src/codegen/datatype/registry.h new file mode 100644 index 000000000000..d2e615765a18 --- /dev/null +++ b/src/codegen/datatype/registry.h @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef TVM_CODEGEN_DATATYPE_REGISTRY_H_ +#define TVM_CODEGEN_DATATYPE_REGISTRY_H_ + +#include +#include +#include +#include + +namespace tvm { +namespace datatype { + +/*! + * \brief Registry for custom datatypes. + * + * Adding custom datatypes currently requires two steps: + * 1. Register the datatype with the registry via a call to + * datatype::Registry::Register. This can also be done in Python + * directly---see the TVM globals registered in the corresponding .cc file. + * Currently, user should manually choose a type name and a type code, + * ensuring that neither conflict with existing types. + * 2. Use TVM_REGISTER_GLOBAL to register the lowering functions needed to + * lower the custom datatype. In general, these will look like: + * For Casts: tvm.datatype.lower..Cast.. + * Example: tvm.datatype.lower.llvm.Cast.myfloat.float for a Cast from + * float to myfloat. + * For other ops: tvm.datatype.lower... + * Examples: tvm.datatype.lower.llvm.Add.myfloat + * tvm.datatype.lower.llvm.FloatImm.posit + */ +class Registry { + public: + /*! + * \brief Get the global custom datatype registry singleton + */ + static Registry* Global(); + + /*! + * \brief Register custom datatype + * Register a custom datatype with the given type name and type code. 
Currently, the type code is + * manually allocated by the user, and the user must ensure that no two custom types share the + * same code. Generally, this should be straightforward, as the user will be manually registering + * all of their custom types. + * \param type_name The name of the type, e.g. "bfloat" + * \param type_code The type code, which should be greater than TVMTypeCode::kExtEnd + */ + void Register(const std::string& type_name, uint8_t type_code); + + /*! + * \brief Get type code from type name + * \param type_name The type name + * \return The type code + */ + uint8_t GetTypeCode(const std::string &type_name); + + /*! + * \brief Get type name from type code + * \param type_code The type code + * \return The type name + */ + std::string GetTypeName(uint8_t type_code); + + /*! + * \brief Get bool representing whether type is registered, given the type code + * \param type_code The type code + * \return bool representing whether the type is registered + */ + inline bool GetTypeRegistered(uint8_t type_code) { + return code_to_name_.find(type_code) != code_to_name_.end(); + } + + /*! + * \brief Get bool representing whether type is registered, given the type name + * \param type_name The type name + * \return bool representing whether the type is registered + */ + inline bool GetTypeRegistered(std::string type_name) { + return name_to_code_.find(type_name) != name_to_code_.end(); + } + + private: + // TODO(gus) is there a typedef for the code? + std::unordered_map code_to_name_; + std::unordered_map name_to_code_; +}; + +/*! + * \brief Convert scalar value to a custom datatype format + * \param type_code The custom datatype to convert to, specified by type code + * \param value The floating point value to convert + * \return The value, encoded in the bits of a uint64_t + */ +uint64_t ConvertConstScalar(uint8_t type_code, double value); + +/*! + * \brief Get lowering function for Cast ops + * \param target The target we are lowering to, e.g. 
"llvm" + * \param type_code The datatype being cast to + * \param src_type_code The datatype being cast from + * \return Lowering function for Cast ops for the provided target, type, and source type + */ +const runtime::PackedFunc* GetCastLowerFunc(const std::string& target, uint8_t type_code, + uint8_t src_type_code); + +/*! + * \brief Get lowering function for FloatImms + * \param target The target we are lowering to, e.g. "llvm" + * \param type_code The datatype of the FloatImm + * \return Lowering function for FloatImms for the provided target and type + */ +const runtime::PackedFunc* GetFloatImmLowerFunc(const std::string& target, uint8_t type_code); + +/*! + * \brief Get lowering function for other ops + * \param target The target we are lowering to, e.g. "llvm" + * \param type_code The datatype of the op + * \return Lowering function for other ops for the provided target and type + */ +#define DEFINE_GET_LOWER_FUNC_(OP) \ + inline const runtime::PackedFunc* Get##OP##LowerFunc(const std::string& target, \ + uint8_t type_code) { \ + return runtime::Registry::Get("tvm.datatype.lower." + target + "." #OP "." + \ + datatype::Registry::Global()->GetTypeName(type_code)); \ + } + +DEFINE_GET_LOWER_FUNC_(Add) +DEFINE_GET_LOWER_FUNC_(Sub) +DEFINE_GET_LOWER_FUNC_(Mul) +DEFINE_GET_LOWER_FUNC_(Div) +DEFINE_GET_LOWER_FUNC_(Mod) +DEFINE_GET_LOWER_FUNC_(Min) +DEFINE_GET_LOWER_FUNC_(Max) +DEFINE_GET_LOWER_FUNC_(EQ) +DEFINE_GET_LOWER_FUNC_(NE) +DEFINE_GET_LOWER_FUNC_(LT) +DEFINE_GET_LOWER_FUNC_(LE) +DEFINE_GET_LOWER_FUNC_(GT) +DEFINE_GET_LOWER_FUNC_(GE) +// Later changes may need to add more lowering functions as we support workloads with more ops. 
+ +} // namespace datatype +} // namespace tvm + +#endif // TVM_CODEGEN_DATATYPE_REGISTRY_H_ diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index 7946f906125f..1e56583a37fd 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -537,6 +537,14 @@ llvm::Value* CodeGenLLVM::CreateCast(Type from, Type to, llvm::Value* value) { if (value->getType() == target) return value; if (to.is_handle()) { return builder_->CreateBitCast(value, target); + } else if (to.is_uint() && to.bits() == 1) { + if (from.is_float()) { + llvm::Constant* zero = llvm::ConstantFP::get(LLVMType(from), 0.); + return builder_->CreateFCmpONE(value, zero); + } else { + llvm::Constant* zero = llvm::ConstantInt::get(LLVMType(from), 0); + return builder_->CreateICmpNE(value, zero); + } } else if (!from.is_float() && !to.is_float()) { return builder_->CreateIntCast(value, target, from.is_int()); } else if (from.is_float() && to.is_int()) { @@ -1134,7 +1142,7 @@ void CodeGenLLVM::VisitStmt_(const AttrStmt* op) { } void CodeGenLLVM::VisitStmt_(const AssertStmt* op) { - arith::ConstraintContext cctx(analyzer_.get(), op->condition); + With cctx(analyzer_.get(), op->condition); this->VisitStmt(op->body); } diff --git a/src/codegen/opt/build_cuda_on.cc b/src/codegen/opt/build_cuda_on.cc index fda239f0766f..e2a788f1bbd4 100644 --- a/src/codegen/opt/build_cuda_on.cc +++ b/src/codegen/opt/build_cuda_on.cc @@ -84,12 +84,13 @@ std::string NVRTCCompile(const std::string& code, bool include_path = false) { std::vector 
compile_params; std::vector param_cstrings{}; nvrtcProgram prog; - cudaDeviceProp device_prop; std::string cc = "30"; - cudaError_t e = cudaGetDeviceProperties(&device_prop, 0); + int major, minor; + cudaError_t e1 = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, 0); + cudaError_t e2 = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, 0); - if (e == cudaSuccess) { - cc = std::to_string(device_prop.major) + std::to_string(device_prop.minor); + if (e1 == cudaSuccess && e2 == cudaSuccess) { + cc = std::to_string(major) + std::to_string(minor); } else { LOG(WARNING) << "cannot detect compute capability from your device, " << "fall back to compute_30."; diff --git a/src/codegen/spirv/codegen_spirv.cc b/src/codegen/spirv/codegen_spirv.cc index e6fc0088dc81..fd113ca4614a 100644 --- a/src/codegen/spirv/codegen_spirv.cc +++ b/src/codegen/spirv/codegen_spirv.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -626,7 +626,7 @@ void CodeGenSPIRV::VisitStmt_(const AttrStmt* op) { } void CodeGenSPIRV::VisitStmt_(const AssertStmt* op) { - arith::ConstraintContext cctx(analyzer_.get(), op->condition); + With cctx(analyzer_.get(), op->condition); this->VisitStmt(op->body); } diff --git a/src/common/socket.h b/src/common/socket.h index 58705f16bf73..91f9f4e5cf0a 100644 --- a/src/common/socket.h +++ b/src/common/socket.h @@ -373,7 +373,7 @@ class TCPSocket : public Socket { } /*! 
* \brief decide whether the socket is at OOB mark - * \return 1 if at mark, 0 if not, -1 if an error occured + * \return 1 if at mark, 0 if not, -1 if an error occurred */ int AtMark() const { #ifdef _WIN32 diff --git a/src/contrib/cblas/cblas.cc b/src/contrib/cblas/cblas.cc index 4ca043f1bcfe..0f222e2f2a39 100644 --- a/src/contrib/cblas/cblas.cc +++ b/src/contrib/cblas/cblas.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -21,12 +21,11 @@ * Copyright (c) 2017 by Contributors * \file Use external cblas library call. */ +#include #include #include -#include #include "gemm_common.h" - extern "C" { #if USE_MKL_BLAS == 1 #include @@ -40,56 +39,148 @@ namespace contrib { using namespace runtime; -inline CBLAS_TRANSPOSE BooleanToTranspose(bool trans) { - return trans ? CblasTrans : CblasNoTrans; -} +inline CBLAS_TRANSPOSE BooleanToTranspose(bool trans) { return trans ? 
CblasTrans : CblasNoTrans; } struct CblasSgemmOp { typedef float TDatatype; - void operator()(bool ta, bool tb, - int M, int N, int K, - float alpha, float* A, int lda, - float* B, int ldb, - float beta, float* C, int ldc) { - cblas_sgemm(CblasColMajor, - BooleanToTranspose(ta), - BooleanToTranspose(tb), - M, N, K, - alpha, A, lda, - B, ldb, - beta, C, ldc); + void operator()(bool ta, bool tb, int M, int N, int K, float alpha, float* A, int lda, float* B, + int ldb, float beta, float* C, int ldc) { + cblas_sgemm(CblasColMajor, BooleanToTranspose(ta), BooleanToTranspose(tb), M, N, K, alpha, A, + lda, B, ldb, beta, C, ldc); } }; struct CblasDgemmOp { typedef double TDatatype; - void operator()(bool ta, bool tb, - int M, int N, int K, - double alpha, double* A, int lda, - double* B, int ldb, - double beta, double* C, int ldc) { - cblas_dgemm(CblasColMajor, - BooleanToTranspose(ta), - BooleanToTranspose(tb), - M, N, K, - alpha, A, lda, - B, ldb, - beta, C, ldc); + void operator()(bool ta, bool tb, int M, int N, int K, double alpha, double* A, int lda, + double* B, int ldb, double beta, double* C, int ldc) { + cblas_dgemm(CblasColMajor, BooleanToTranspose(ta), BooleanToTranspose(tb), M, N, K, alpha, A, + lda, B, ldb, beta, C, ldc); } }; +struct CblasSgemmBatchOp { + typedef float TDatatype; + void operator()(int batch_size, bool ta, bool tb, int M, int N, int K, float alpha, float* A, + int a_stride, int lda, float* B, int b_stride, int ldb, float beta, float* C, + int c_stride, int ldc) { + CBLAS_TRANSPOSE trans_a = BooleanToTranspose(ta); + CBLAS_TRANSPOSE trans_b = BooleanToTranspose(tb); +#if USE_MKL_BLAS == 1 + std::vector A_array(batch_size); + std::vector B_array(batch_size); + std::vector C_array(batch_size); + for (int i = 0; i < batch_size; ++i) { + A_array[i] = A + i * a_stride; + B_array[i] = B + i * b_stride; + C_array[i] = C + i * c_stride; + } + cblas_sgemm_batch(CblasColMajor, &trans_a, &trans_b, &M, &N, &K, &alpha, A_array.data(), &lda, + 
B_array.data(), &ldb, &beta, C_array.data(), &ldc, 1, &batch_size); +#else + for (int i = 0; i < batch_size; ++i) { + cblas_sgemm(CblasColMajor, trans_a, trans_b, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); + A += a_stride; + B += b_stride; + C += c_stride; + } +#endif + } +}; + +struct CblasSgemmBatchIterativeOp { + typedef float TDatatype; + void operator()(int batch_size, bool ta, bool tb, int M, int N, int K, float alpha, float* A, + int a_stride, int lda, float* B, int b_stride, int ldb, float beta, float* C, + int c_stride, int ldc) { + CBLAS_TRANSPOSE trans_a = BooleanToTranspose(ta); + CBLAS_TRANSPOSE trans_b = BooleanToTranspose(tb); + for (int i = 0; i < batch_size; ++i) { + cblas_sgemm(CblasColMajor, trans_a, trans_b, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); + A += a_stride; + B += b_stride; + C += c_stride; + } + } +}; + +struct CblasDgemmBatchOp { + typedef double TDatatype; + void operator()(int batch_size, bool ta, bool tb, int M, int N, int K, double alpha, double* A, + int a_stride, int lda, double* B, int b_stride, int ldb, double beta, double* C, + int c_stride, int ldc) { + CBLAS_TRANSPOSE trans_a = BooleanToTranspose(ta); + CBLAS_TRANSPOSE trans_b = BooleanToTranspose(tb); +#if USE_MKL_BLAS == 1 + std::vector A_array(batch_size); + std::vector B_array(batch_size); + std::vector C_array(batch_size); + for (int i = 0; i < batch_size; ++i) { + A_array[i] = A + i * a_stride; + B_array[i] = B + i * b_stride; + C_array[i] = C + i * c_stride; + } + cblas_dgemm_batch(CblasColMajor, &trans_a, &trans_b, &M, &N, &K, &alpha, A_array.data(), &lda, + B_array.data(), &ldb, &beta, C_array.data(), &ldc, 1, &batch_size); +#else + for (int i = 0; i < batch_size; ++i) { + cblas_dgemm(CblasColMajor, trans_a, trans_b, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); + A += a_stride; + B += b_stride; + C += c_stride; + } +#endif + } +}; + +struct CblasDgemmBatchIterativeOp { + typedef double TDatatype; + void operator()(int batch_size, bool ta, bool tb, int M, 
int N, int K, double alpha, double* A, + int a_stride, int lda, double* B, int b_stride, int ldb, double beta, double* C, + int c_stride, int ldc) { + CBLAS_TRANSPOSE trans_a = BooleanToTranspose(ta); + CBLAS_TRANSPOSE trans_b = BooleanToTranspose(tb); + for (int i = 0; i < batch_size; ++i) { + cblas_dgemm(CblasColMajor, trans_a, trans_b, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); + A += a_stride; + B += b_stride; + C += c_stride; + } + } +}; // matrix multiplication for row major TVM_REGISTER_GLOBAL("tvm.contrib.cblas.matmul") -.set_body([](TVMArgs args, TVMRetValue *ret) { - DLTensor* A = args[0]; - CHECK(TypeMatch(A->dtype, kDLFloat, 32) || - TypeMatch(A->dtype, kDLFloat, 64)); +.set_body([](TVMArgs args, TVMRetValue* ret) { + DLTensor* A = args[0]; + CHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64)); + + if (TypeMatch(A->dtype, kDLFloat, 32)) + CallGemm(args, ret, CblasSgemmOp()); + else + CallGemm(args, ret, CblasDgemmOp()); +}); - if (TypeMatch(A->dtype, kDLFloat, 32)) - CallGemm(args, ret, CblasSgemmOp()); - else - CallGemm(args, ret, CblasDgemmOp()); - }); +TVM_REGISTER_GLOBAL("tvm.contrib.cblas.batch_matmul") +.set_body([](TVMArgs args, TVMRetValue* ret) { + DLTensor* A = args[0]; + CHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64)); + if (TypeMatch(A->dtype, kDLFloat, 32)) { + CallBatchGemm(args, ret, CblasSgemmBatchOp()); + } else { + CallBatchGemm(args, ret, CblasDgemmBatchOp()); + } +}); + +TVM_REGISTER_GLOBAL("tvm.contrib.cblas.batch_matmul_iterative") +.set_body([](TVMArgs args, TVMRetValue* ret) { + DLTensor* A = args[0]; + CHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64)); + if (TypeMatch(A->dtype, kDLFloat, 32)) { + CallBatchGemm(args, ret, CblasSgemmBatchIterativeOp()); + } else { + CallBatchGemm(args, ret, CblasDgemmBatchIterativeOp()); + } +}); } // namespace contrib } // namespace tvm diff --git a/src/contrib/cblas/gemm_common.h 
b/src/contrib/cblas/gemm_common.h index fe38b2a67513..2bcefb2f26bb 100644 --- a/src/contrib/cblas/gemm_common.h +++ b/src/contrib/cblas/gemm_common.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -22,16 +22,17 @@ * \file tvm/contrib/gemm.h * \brief Shared implementation of gemm */ -#ifndef TVM_CONTRIB_CBLAS_GEMM_COMMON_H_ -#define TVM_CONTRIB_CBLAS_GEMM_COMMON_H_ +#pragma once + +#include +#include #include namespace tvm { namespace contrib { using namespace runtime; - -inline int ColumnStride(DLTensor* tensor) { +inline int ColumnStride(DLTensor *tensor) { // If the tensor itself is transposed then it will have strides // backward from what we expect. Regardless, the max of the strides // (the other stride is 1) is the column stride. @@ -42,8 +43,7 @@ inline int ColumnStride(DLTensor* tensor) { } } - -inline int ElementStride(DLTensor* tensor) { +inline int ElementStride(DLTensor *tensor) { if (tensor->strides) { return std::min(tensor->strides[0], tensor->strides[1]); } else { @@ -51,29 +51,26 @@ inline int ElementStride(DLTensor* tensor) { } } - // Reversed strides indicates an in-place transpose operation. -inline bool IsInPlaceTransposed(DLTensor* tensor) { +inline bool IsInPlaceTransposed(DLTensor *tensor) { return tensor->strides && (tensor->strides[1] > tensor->strides[0]); } - -inline int RowCount(DLTensor* tensor, bool trans) { +inline int RowCount(DLTensor *tensor, bool trans) { return tensor->shape[trans ? 1 : 0]; } - -inline int ColumnCount(DLTensor* tensor, bool trans) { +inline int ColumnCount(DLTensor *tensor, bool trans) { return tensor->shape[trans ? 
0 : 1]; } // Call a column major blas. Note that data is stored in tvm as row // major, so this we switch the arguments. -template +template inline void CallGemm(TVMArgs args, TVMRetValue *ret, TGemmOp op) { - DLTensor* A = args[0]; - DLTensor* B = args[1]; - DLTensor* C = args[2]; + DLTensor *A = args[0]; + DLTensor *B = args[1]; + DLTensor *C = args[2]; bool transa = args[3]; bool transb = args[4]; int bit_depth = sizeof(typename TGemmOp::TDatatype) * 8; @@ -96,25 +93,88 @@ inline void CallGemm(TVMArgs args, TVMRetValue *ret, TGemmOp op) { CHECK(TypeMatch(C->dtype, kDLFloat, bit_depth)); double alpha = args.size() > 5 ? args[5] : 1.0; double beta = args.size() > 6 ? args[6] : 0.0; - op(transb, - transa, - ColumnCount(B, transb), - RowCount(A, transa), - ColumnCount(A, transa), - static_cast(alpha), - reinterpret_cast(static_cast(B->data) - + B->byte_offset), + op(transb, transa, ColumnCount(B, transb), RowCount(A, transa), + ColumnCount(A, transa), static_cast(alpha), + reinterpret_cast( + static_cast(B->data) + B->byte_offset), ColumnStride(B), - reinterpret_cast(static_cast(A->data) - + A->byte_offset), - ColumnStride(A), - static_cast(beta), - reinterpret_cast(static_cast(C->data) - + C->byte_offset), + reinterpret_cast( + static_cast(A->data) + A->byte_offset), + ColumnStride(A), static_cast(beta), + reinterpret_cast( + static_cast(C->data) + C->byte_offset), ColumnStride(C)); } +inline int ColumnStride3D(DLTensor *tensor) { + // If the tensor itself is transposed then it will have strides + // backward from what we expect. Regardless, the max of the strides + // (the other stride is 1) is the column stride. + if (tensor->strides) { + return std::max(tensor->strides[1], tensor->strides[2]); + } else { + return tensor->shape[2]; + } +} +inline int ElementStride3D(DLTensor *tensor) { + if (tensor->strides) { + return std::min(tensor->strides[1], tensor->strides[2]); + } else { + return 1; + } +} +// Reversed strides indicates an in-place transpose operation. 
+inline bool IsInPlaceTransposed3D(DLTensor *tensor) { + return tensor->strides && (tensor->strides[2] > tensor->strides[1]); +} +inline int BatchCount3D(DLTensor *tensor) { return tensor->shape[0]; } +inline int RowCount3D(DLTensor *tensor, bool trans) { + return tensor->shape[trans ? 2 : 1]; +} +inline int ColumnCount3D(DLTensor *tensor, bool trans) { + return tensor->shape[trans ? 1 : 2]; +} +template +inline void CallBatchGemm(TVMArgs args, TVMRetValue *ret, TBatchGemmOp op) { + using DType = typename TBatchGemmOp::TDatatype; + DLTensor *A = args[0]; + DLTensor *B = args[1]; + DLTensor *C = args[2]; + bool transa = args[3]; + bool transb = args[4]; + int bit_depth = sizeof(DType) * 8; + CHECK_EQ(A->ndim, 3); + CHECK_EQ(B->ndim, 3); + CHECK_EQ(C->ndim, 3); + int batch_size = BatchCount3D(A); + CHECK_EQ(BatchCount3D(B), batch_size); + CHECK_EQ(BatchCount3D(C), batch_size); + CHECK_EQ(ElementStride(A), 1); + CHECK_EQ(ElementStride(B), 1); + CHECK_EQ(ElementStride(C), 1); + // C can never be transposed. + CHECK(!IsInPlaceTransposed3D(C)); + // Reversed strides indicates an in-place transpose operation. + transa = IsInPlaceTransposed3D(A) ? !transa : transa; + transb = IsInPlaceTransposed3D(B) ? !transb : transb; + CHECK(TypeMatch(B->dtype, kDLFloat, bit_depth)); + CHECK(TypeMatch(C->dtype, kDLFloat, bit_depth)); + double alpha = args.size() > 5 ? args[5] : 1.0; + double beta = args.size() > 6 ? 
args[6] : 0.0; + const int A_size = A->shape[1] * A->shape[2]; + const int B_size = B->shape[1] * B->shape[2]; + const int C_size = C->shape[1] * C->shape[2]; + DType *A_data = reinterpret_cast( + static_cast(A->data) + A->byte_offset); + DType *B_data = reinterpret_cast( + static_cast(B->data) + B->byte_offset); + DType *C_data = reinterpret_cast( + static_cast(C->data) + C->byte_offset); + op(batch_size, transb, transa, ColumnCount3D(B, transb), + RowCount3D(A, transa), ColumnCount3D(A, transa), static_cast(alpha), + B_data, B_size, ColumnStride3D(B), A_data, A_size, ColumnStride3D(A), + static_cast(beta), C_data, C_size, ColumnStride3D(C)); +} + } // namespace contrib } // namespace tvm - -#endif // TVM_CONTRIB_CBLAS_GEMM_COMMON_H_ diff --git a/src/contrib/sort/sort.cc b/src/contrib/sort/sort.cc index cf25e89b9109..87691f254c5c 100644 --- a/src/contrib/sort/sort.cc +++ b/src/contrib/sort/sort.cc @@ -34,14 +34,14 @@ namespace contrib { using namespace runtime; template -bool CompareAscend(const std::pair& lhs, - const std::pair& rhs) { +bool CompareAscend(const std::pair& lhs, + const std::pair& rhs) { return lhs.second < rhs.second; } template -bool CompareDescend(const std::pair& lhs, - const std::pair& rhs) { +bool CompareDescend(const std::pair& lhs, + const std::pair& rhs) { return lhs.second > rhs.second; } @@ -110,6 +110,41 @@ TVM_REGISTER_GLOBAL("tvm.contrib.sort.argsort_nms") } }); +template +void argsort(DLTensor* input, DLTensor* output, int32_t axis, bool is_ascend) { + auto data_ptr = static_cast(input->data); + auto out_ptr = static_cast(output->data); + std::vector > sorter; + + int axis_mul_before = 1; + int axis_mul_after = 1; + for (int i = 0; i < input->ndim; ++i) { + if (i < axis) { + axis_mul_before *= input->shape[i]; + } else if (i > axis) { + axis_mul_after *= input->shape[i]; + } + } + + for (int i = 0 ; i < axis_mul_before; ++i) { + for (int j = 0 ; j < axis_mul_after; ++j) { + sorter.clear(); + int64_t base_idx = i * input->shape[axis] 
* axis_mul_after + j; + for (int64_t k = 0; k < input->shape[axis]; ++k) { + int64_t full_idx = base_idx + k * axis_mul_after; + sorter.emplace_back(std::make_pair(k, data_ptr[full_idx])); + } + if (is_ascend) { + std::stable_sort(sorter.begin(), sorter.end(), CompareAscend); + } else { + std::stable_sort(sorter.begin(), sorter.end(), CompareDescend); + } + for (int64_t k = 0; k < input->shape[axis]; ++k) { + out_ptr[base_idx + k * axis_mul_after] = static_cast(sorter[k].first); + } + } + } +} // Argsort implemented C library sort. // Return indices of sorted tensor. @@ -124,25 +159,84 @@ TVM_REGISTER_GLOBAL("tvm.contrib.sort.argsort") DLTensor *output = args[1]; int32_t axis = args[2]; bool is_ascend = args[3]; - - auto dtype = input->dtype; - auto data_ptr = static_cast(input->data); - std::vector> sorter; - int64_t axis_mul_before = 1; - int64_t axis_mul_after = 1; - if (axis < 0) { axis = input->ndim + axis; } - - // Currently only supports input dtype to be float32. - CHECK_EQ(dtype.code, 2) << "Currently only supports input dtype " - "to be float32."; - CHECK_EQ(dtype.bits, 32) << "Currently only supports input dtype " - "to be float32."; CHECK_LT(axis, input->ndim) << "Axis out of boundary for " - "input ndim " << input->ndim; + "input ndim " << input->ndim; + + auto data_dtype = TVMType2String(input->dtype); + auto out_dtype = TVMType2String(output->dtype); + + if (data_dtype == "float32") { + if (out_dtype == "int32") { + argsort(input, output, axis, is_ascend); + } else if (out_dtype == "int64") { + argsort(input, output, axis, is_ascend); + } else if (out_dtype == "float32") { + argsort(input, output, axis, is_ascend); + } else if (out_dtype == "float64") { + argsort(input, output, axis, is_ascend); + } else { + LOG(FATAL) << "Unsupported output dtype: " << out_dtype; + } + } else if (data_dtype == "float64") { + if (out_dtype == "int32") { + argsort(input, output, axis, is_ascend); + } else if (out_dtype == "int64") { + argsort(input, output, axis, 
is_ascend); + } else if (out_dtype == "float32") { + argsort(input, output, axis, is_ascend); + } else if (out_dtype == "float64") { + argsort(input, output, axis, is_ascend); + } else { + LOG(FATAL) << "Unsupported output dtype: " << out_dtype; + } + } else if (data_dtype == "int32") { + if (out_dtype == "int32") { + argsort(input, output, axis, is_ascend); + } else if (out_dtype == "int64") { + argsort(input, output, axis, is_ascend); + } else if (out_dtype == "float32") { + argsort(input, output, axis, is_ascend); + } else if (out_dtype == "float64") { + argsort(input, output, axis, is_ascend); + } else { + LOG(FATAL) << "Unsupported output dtype: " << out_dtype; + } + } else if (data_dtype == "int64") { + if (out_dtype == "int32") { + argsort(input, output, axis, is_ascend); + } else if (out_dtype == "int64") { + argsort(input, output, axis, is_ascend); + } else if (out_dtype == "float32") { + argsort(input, output, axis, is_ascend); + } else if (out_dtype == "float64") { + argsort(input, output, axis, is_ascend); + } else { + LOG(FATAL) << "Unsupported output dtype: " << out_dtype; + } + } else { + LOG(FATAL) << "Unsupported input dtype: " << data_dtype; + } +}); +template +void topk(DLTensor* input, + DLTensor* out_values, + DLTensor* out_indices, + int k, + int axis, + bool is_ascend) { + DataType* data_ptr = static_cast(input->data); + DataType* values_ptr = (out_values == nullptr) ? nullptr : + static_cast(out_values->data); + IndicesType* indices_ptr = (out_indices == nullptr) ? 
nullptr : + static_cast(out_indices->data); + std::vector > sorter; + + int axis_mul_before = 1; + int axis_mul_after = 1; for (int i = 0; i < input->ndim; ++i) { if (i < axis) { axis_mul_before *= input->shape[i]; @@ -150,27 +244,124 @@ TVM_REGISTER_GLOBAL("tvm.contrib.sort.argsort") axis_mul_after *= input->shape[i]; } } + if (k < 1) { + k = input->shape[axis]; + } - int32_t current_sort_num = input->shape[axis]; - for (int64_t i = 0 ; i < axis_mul_before; ++i) { - for (int64_t j = 0 ; j < axis_mul_after; ++j) { + for (int i = 0 ; i < axis_mul_before; ++i) { + for (int j = 0 ; j < axis_mul_after; ++j) { sorter.clear(); - int64_t base_idx = i * input->shape[axis] * axis_mul_after + j; - for (int64_t k = 0; k < current_sort_num; ++k) { - int64_t full_idx = base_idx + k * axis_mul_after; - sorter.emplace_back(std::make_pair(k, *(data_ptr + full_idx))); + int64_t src_base_idx = i * input->shape[axis] * axis_mul_after + j; + int64_t dst_base_idx = i * k * axis_mul_after + j; + for (int64_t kk = 0; kk < input->shape[axis]; ++kk) { + int64_t full_idx = src_base_idx + kk * axis_mul_after; + sorter.emplace_back(std::make_pair(kk, data_ptr[full_idx])); } if (is_ascend) { - std::stable_sort(sorter.begin(), sorter.end(), CompareAscend); + std::stable_sort(sorter.begin(), sorter.end(), CompareAscend); } else { - std::stable_sort(sorter.begin(), sorter.end(), CompareDescend); + std::stable_sort(sorter.begin(), sorter.end(), CompareDescend); } - for (int32_t k = 0; k < input->shape[axis]; ++k) { - *(static_cast(output->data) + base_idx + k * axis_mul_after) - = k < static_cast(sorter.size()) ? sorter[k].first : k; + int64_t cnt = k > 0 ? 
k : input->shape[axis]; + for (int64_t kk = 0; kk < cnt; ++kk) { + if (indices_ptr != nullptr) { + indices_ptr[dst_base_idx + kk * axis_mul_after] = + static_cast(sorter[kk].first); + } + if (values_ptr != nullptr) { + values_ptr[dst_base_idx + kk * axis_mul_after] = + static_cast(sorter[kk].second); + } } } } +} + +// Argsort implemented C library sort. +// Return indices of sorted tensor. +// By default, the last axis will be used to sort. +// sort_num specify the number of elements to be sorted. +// If input tensor has dimension (d0, d1, ..., d(k-1), dk, d(k+1), ..., d(n-1)) +// and sort axis is dk. sort_num should have dimension of +// (d1, d2, ..., d(k-1), d(k+1), ..., dn). +TVM_REGISTER_GLOBAL("tvm.contrib.sort.topk") +.set_body([](TVMArgs args, TVMRetValue* ret) { + DLTensor* input = args[0]; + DLTensor* values_out = nullptr; + DLTensor* indices_out = nullptr; + int k = args[args.num_args - 4]; + int axis = args[args.num_args - 3]; + std::string ret_type = args[args.num_args - 2]; + bool is_ascend = args[args.num_args - 1]; + if (ret_type == "both") { + values_out = args[1]; + indices_out = args[2]; + } else if (ret_type == "values") { + values_out = args[1]; + } else if (ret_type == "indices") { + indices_out = args[1]; + } else { + LOG(FATAL) << "Unsupported ret type: " << ret_type; + } + if (axis < 0) { + axis = input->ndim + axis; + } + CHECK(axis >= 0 && axis < input->ndim) << "Axis out of boundary for input ndim " << input->ndim; + + auto data_dtype = TVMType2String(input->dtype); + auto out_dtype = (indices_out == nullptr) ? 
"int64" : TVMType2String(indices_out->dtype); + + if (data_dtype == "float32") { + if (out_dtype == "int32") { + topk(input, values_out, indices_out, k, axis, is_ascend); + } else if (out_dtype == "int64") { + topk(input, values_out, indices_out, k, axis, is_ascend); + } else if (out_dtype == "float32") { + topk(input, values_out, indices_out, k, axis, is_ascend); + } else if (out_dtype == "float64") { + topk(input, values_out, indices_out, k, axis, is_ascend); + } else { + LOG(FATAL) << "Unsupported output dtype: " << out_dtype; + } + } else if (data_dtype == "float64") { + if (out_dtype == "int32") { + topk(input, values_out, indices_out, k, axis, is_ascend); + } else if (out_dtype == "int64") { + topk(input, values_out, indices_out, k, axis, is_ascend); + } else if (out_dtype == "float32") { + topk(input, values_out, indices_out, k, axis, is_ascend); + } else if (out_dtype == "float64") { + topk(input, values_out, indices_out, k, axis, is_ascend); + } else { + LOG(FATAL) << "Unsupported output dtype: " << out_dtype; + } + } else if (data_dtype == "int32") { + if (out_dtype == "int32") { + topk(input, values_out, indices_out, k, axis, is_ascend); + } else if (out_dtype == "int64") { + topk(input, values_out, indices_out, k, axis, is_ascend); + } else if (out_dtype == "float32") { + topk(input, values_out, indices_out, k, axis, is_ascend); + } else if (out_dtype == "float64") { + topk(input, values_out, indices_out, k, axis, is_ascend); + } else { + LOG(FATAL) << "Unsupported output dtype: " << out_dtype; + } + } else if (data_dtype == "int64") { + if (out_dtype == "int32") { + topk(input, values_out, indices_out, k, axis, is_ascend); + } else if (out_dtype == "int64") { + topk(input, values_out, indices_out, k, axis, is_ascend); + } else if (out_dtype == "float32") { + topk(input, values_out, indices_out, k, axis, is_ascend); + } else if (out_dtype == "float64") { + topk(input, values_out, indices_out, k, axis, is_ascend); + } else { + LOG(FATAL) << "Unsupported 
output dtype: " << out_dtype; + } + } else { + LOG(FATAL) << "Unsupported input dtype: " << data_dtype; + } }); } // namespace contrib diff --git a/src/lang/expr_operator.cc b/src/lang/expr_operator.cc index 4504ee23f812..3f5254069b8d 100644 --- a/src/lang/expr_operator.cc +++ b/src/lang/expr_operator.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -188,7 +188,15 @@ Expr operator%(Expr a, Expr b) { return ir::Mod::make(a, b); } + Expr min(Expr a, Expr b) { + // inf-aware simplificaiton + using arith::is_pos_inf; + using arith::is_neg_inf; + if (is_pos_inf(a)) return b; + if (is_neg_inf(a)) return a; + if (is_pos_inf(b)) return a; + if (is_neg_inf(b)) return b; BinaryOpMatchTypes(a, b); Expr ret = arith::TryConstFold(a, b); if (ret.defined()) return ret; @@ -196,6 +204,13 @@ Expr min(Expr a, Expr b) { } Expr max(Expr a, Expr b) { + // inf-aware simplificaiton + using arith::is_pos_inf; + using arith::is_neg_inf; + if (is_pos_inf(a)) return a; + if (is_neg_inf(a)) return b; + if (is_pos_inf(b)) return b; + if (is_neg_inf(b)) return a; BinaryOpMatchTypes(a, b); Expr ret = arith::TryConstFold(a, b); if (ret.defined()) return ret; @@ -393,6 +408,16 @@ Expr sum(Expr source, Array rdom) { return ir::Reduce::make(combiner, {source}, rdom, make_const(Bool(1), true), 0); } +Expr all(Expr source, Array rdom) { + CHECK(source.type().is_bool()); + Var x("x", source.type()), y("y", source.type()); + Expr result = ir::And::make(x, y); + Expr identity_element = make_const(source.type(), true); + ir::CommReducer combiner = + ir::CommReducerNode::make({x}, {y}, {result}, {identity_element}); + return 
ir::Reduce::make(combiner, {source}, rdom, make_const(Bool(1), true), 0); +} + Expr max(Expr source, Array rdom) { Var x("x", source.type()), y("y", source.type()); Expr result = ir::Max::make(x, y); diff --git a/src/lang/tensor.cc b/src/lang/tensor.cc index bab7cf6d93ed..d885d7103606 100644 --- a/src/lang/tensor.cc +++ b/src/lang/tensor.cc @@ -83,6 +83,7 @@ TensorIntrin TensorIntrinNode::make(std::string name, Operation op, Array inputs, Array buffers, + Array scalar_params, Stmt body, Stmt reduce_init, Stmt reduce_update) { @@ -91,6 +92,7 @@ TensorIntrin TensorIntrinNode::make(std::string name, n->op = std::move(op); n->inputs = std::move(inputs); n->buffers = std::move(buffers); + n->scalar_params = std::move(scalar_params); n->body = std::move(body); n->reduce_init = std::move(reduce_init); n->reduce_update = std::move(reduce_update); @@ -110,12 +112,14 @@ TVM_REGISTER_NODE_TYPE(TensorIntrinNode); TensorIntrinCall TensorIntrinCallNode::make(TensorIntrin intrin, Array tensors, Array regions, - Array reduce_axis) { + Array reduce_axis, + Array scalar_inputs) { auto n = make_node(); n->intrin = std::move(intrin); n->tensors = std::move(tensors); n->regions = std::move(regions); n->reduce_axis = std::move(reduce_axis); + n->scalar_inputs = std::move(scalar_inputs); return TensorIntrinCall(n); } diff --git a/src/op/extern_op.cc b/src/op/extern_op.cc index e6c6039b610e..7023aebe17ad 100644 --- a/src/op/extern_op.cc +++ b/src/op/extern_op.cc @@ -72,7 +72,10 @@ Operation ExternOpNode::make(std::string name, CHECK_EQ(inputs.size(), input_placeholders.size()); for (size_t i = 0; i < inputs.size(); ++i) { CHECK_EQ(inputs[i]->dtype, input_placeholders[i]->dtype); - CHECK(inputs[i]->shape.same_as(input_placeholders[i]->shape)); + CHECK_EQ(inputs[i]->shape.size(), input_placeholders[i]->shape.size()); + for (size_t dim = 0; dim < inputs[i]->shape.size(); ++dim) { + CHECK(inputs[i]->shape[dim].same_as(input_placeholders[i]->shape[dim])); + } 
CHECK_EQ(input_placeholders[i]->strides.size(), 0U); } n->inputs = std::move(inputs); diff --git a/src/op/tensor_compute_op.cc b/src/op/tensor_compute_op.cc index ed768c2ba216..09e8af7d5cba 100644 --- a/src/op/tensor_compute_op.cc +++ b/src/op/tensor_compute_op.cc @@ -58,7 +58,8 @@ Operation TensorComputeOpNode::make(std::string name, int schedulable_ndim, TensorIntrin intrin, Array tensors, - Array regions) { + Array regions, + Array scalar_inputs) { auto n = make_node(); n->name = std::move(name); n->tag = std::move(tag); @@ -68,6 +69,7 @@ Operation TensorComputeOpNode::make(std::string name, n->intrin = std::move(intrin); n->inputs = std::move(tensors); n->input_regions = std::move(regions); + n->scalar_inputs = std::move(scalar_inputs); return Operation(n); } @@ -184,6 +186,19 @@ Stmt TensorComputeOpNode::BuildProvide( std::unordered_map vmap; ir::ArgBinder binder(&vmap); + // Map the expressions passed in the call to the TensorIntrin, to the placeholder + // variables + Array user_expr = this->scalar_inputs; + Array scalar_params = this->intrin->scalar_params; + Array sp_expr; + for (auto sp : scalar_params) { + Expr esp = sp; + sp_expr.push_back(esp); + } + CHECK_EQ(sp_expr.size(), user_expr.size()); + // TODO(jdavies-huawei): what name should be used here? + binder.BindArray(sp_expr, user_expr, this->name); + size_t tloc = stage->leaf_iter_vars.size(); ComputeLoopNest n = ComputeLoopNest::make(this, stage, dom_map, debug_keep_trivial_loop); diff --git a/src/pass/arg_binder.h b/src/pass/arg_binder.h index 9de3a13270dc..f235ea49faac 100644 --- a/src/pass/arg_binder.h +++ b/src/pass/arg_binder.h @@ -50,7 +50,7 @@ namespace ir { * - assert bufferB.shape[1] == n + 3 * * In general, this is a constraint solving problem. We have simplified assumption - * over the binding declaration, such that we require the variable occured in + * over the binding declaration, such that we require the variable occurred in * constraint must be declared in argument list. 
So it is illegal to have signature * f(tA(shape=(n+3))) without any argument variable corresponds to n, even though * it is already enough to derive n from the input argument. diff --git a/src/pass/loop_partition.cc b/src/pass/loop_partition.cc index 04bb9385b156..0a5b7410f3cf 100644 --- a/src/pass/loop_partition.cc +++ b/src/pass/loop_partition.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -28,7 +28,7 @@ #include #include #include -#include "../arithmetic/int_set_internal.h" +#include "../arithmetic/int_set.h" #include "../runtime/thread_storage_scope.h" namespace tvm { @@ -366,7 +366,7 @@ class LoopPartitioner : public IRMutator { std::pair> GetIntervalAndCondset(const Partition &partitions, - const arith::Interval &for_interval, + const arith::IntervalSet &for_interval, bool cond_value); inline Stmt MakeFor(const Node* op, Expr extent, Stmt body); @@ -374,6 +374,7 @@ class LoopPartitioner : public IRMutator { /* Candidate IRs that may be partitioned potentially */ std::unordered_map hint_map_; std::unordered_map relax_map_; + arith::Analyzer analyzer_; CandidateSelector selector; }; @@ -381,29 +382,17 @@ class LoopPartitioner : public IRMutator { // given in the second component provably have value given by cond_value std::pair> LoopPartitioner::GetIntervalAndCondset(const Partition &partitions, - const arith::Interval &for_interval, + const arith::IntervalSet &for_interval, bool cond_value) { Array sets; std::unordered_set cond_set; for (const auto &kv : partitions) { if (kv.first.second == cond_value) { - arith::Interval interval = kv.second.as()->i; - auto intersection = 
arith::Interval::make_intersection(interval, for_interval); - - // TODO(derisavi): the following if statement needs to be removed as soon as - // TVM uses commit a768f2f0 of HalideIR repo - if (intersection.min.same_as(arith::Interval::pos_inf) || - intersection.max.same_as(arith::Interval::neg_inf)) { - intersection = arith::Interval::nothing(); - } else if (intersection.min.type() == intersection.max.type() && - (intersection.min.type().is_int() || - intersection.min.type().is_uint()) && - can_prove(intersection.min > intersection.max)) { - intersection = arith::Interval::nothing(); - } - - if (!intersection.is_empty()) { + arith::IntervalSet interval = Downcast(kv.second); + arith::IntervalSet intersection = arith::Intersect( + &analyzer_, interval, for_interval); + if (!intersection->IsEmpty()) { sets.push_back(kv.second); cond_set.insert(kv.first.first); } @@ -476,11 +465,12 @@ Stmt LoopPartitioner::TryPartition(const Node* node, Expr max, Stmt body, bool partition_thread_scope) { + using namespace arith; PartitionFinder finder(var, hint_map_, relax_map_); finder.Visit(body); if (finder.partitions.empty()) return Stmt(); - arith::Interval for_interval(min, max); + arith::IntervalSet for_interval(min, max); bool cond_value; IntSet middle_interval; std::unordered_set cond_set; @@ -491,7 +481,7 @@ Stmt LoopPartitioner::TryPartition(const Node* node, // if such interval doesn't exist, find an interval in which all // conditions on var are false std::tie(middle_interval, cond_set) = - GetIntervalAndCondset(finder.partitions, for_interval, false); + GetIntervalAndCondset(finder.partitions, for_interval, false); if (middle_interval.is_nothing()) // we couldn't find an interval in which the condintions are provably true or false // Therefore, we can't partition the loop based on those conds @@ -501,7 +491,7 @@ Stmt LoopPartitioner::TryPartition(const Node* node, cond_value = true; } - arith::Interval middle_interval_i = middle_interval.as()->i; + IntervalSet 
middle_interval_i = Downcast(middle_interval); // middle_interval is the subrange of the loop variable range for which a // set of conditions are true (or false resp.) // The part of the loop variable range that is before (after resp.) that @@ -512,7 +502,7 @@ Stmt LoopPartitioner::TryPartition(const Node* node, Expr body_begin; Stmt pre_stmt; bool pre_stmt_recurse = true; - if (middle_interval_i.has_lower_bound()) { + if (middle_interval_i->HasLowerBound()) { body_begin = ir::Simplify(middle_interval.min()); if (!can_prove(body_begin == min)) { Expr cond = (body_begin - min >= 0); @@ -537,7 +527,7 @@ Stmt LoopPartitioner::TryPartition(const Node* node, Expr post_doubt_begin; Stmt post_stmt; bool post_stmt_recurse = true; - if (middle_interval_i.has_upper_bound()) { + if (middle_interval_i->HasUpperBound()) { post_doubt_begin = ir::Simplify(middle_interval.max() + 1); if (!can_prove(middle_interval.max() == max)) { // require the extent to be non-negative diff --git a/src/pass/lower_custom_datatypes.cc b/src/pass/lower_custom_datatypes.cc new file mode 100644 index 000000000000..7598ef49eee0 --- /dev/null +++ b/src/pass/lower_custom_datatypes.cc @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +/*! + * Copyright (c) 2019 by Contributors + * \file tvm/src/pass/lower_custom_datatypes.cc + * \brief Pass for lowering custom datatypes + */ + +#include +#include +#include +#include "../codegen/datatype/registry.h" + +namespace tvm { +namespace ir { + +/*! + * \brief Helper mutator to implement lowering of custom datatypes. + * + * Lowering datatypes works as follows: for every expression containing a custom + * datatype, we search for a global (registered by the implementer of the custom + * datatype) for lowering this type of expression, and uses it to lower the + * expression. + */ +class CustomDatatypesLowerer : public IRMutator { + public: + explicit CustomDatatypesLowerer(const std::string& target) : target_(target) {} + + inline Expr Mutate_(const Cast* op, const Expr& e) final { + auto type_code = op->type.code(); + auto src_type_code = op->value.type().code(); + // If either datatype is a registered custom datatype, we must lower. + bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(type_code) || + datatype::Registry::Global()->GetTypeRegistered(src_type_code); + Expr expr = IRMutator::Mutate_(op, e); + op = expr.as(); + if (toBeLowered) { + auto lower = datatype::GetCastLowerFunc(target_, type_code, src_type_code); + CHECK(lower) << "Cast lowering function for target " << target_ << " destination type " + << static_cast(type_code) << " source type " + << static_cast(src_type_code) << " not found"; + return (*lower)(expr); + } + return expr; + } + + inline Expr Mutate_(const FloatImm* imm, const Expr& e) final { + auto type_code = imm->type.code(); + if (datatype::Registry::Global()->GetTypeRegistered(type_code)) { + auto lower = datatype::GetFloatImmLowerFunc(target_, type_code); + CHECK(lower) << "FloatImm lowering function for target " << target_ << " type " + << static_cast(type_code) << " not found"; + return (*lower)(e); + } + return e; + } + + inline Stmt Mutate_(const Allocate* allocate, const Stmt& s) final { + bool 
toBeLowered = datatype::Registry::Global()->GetTypeRegistered(allocate->type.code()); + Stmt stmt = IRMutator::Mutate_(allocate, s); + allocate = stmt.as(); + + if (toBeLowered) { + auto new_allocate_type = UInt(allocate->type.bits(), allocate->type.lanes()); + return Allocate::make(allocate->buffer_var, new_allocate_type, allocate->extents, + allocate->condition, allocate->body, allocate->new_expr, + allocate->free_function); + } + return stmt; + } + + inline Expr Mutate_(const Load* load, const Expr& e) final { + bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(load->type.code()); + Expr expr = IRMutator::Mutate_(load, e); + load = expr.as(); + if (toBeLowered) { + auto new_load_type = UInt(load->type.bits()); + return Load::make(new_load_type, load->buffer_var, load->index, load->predicate); + } + return expr; + } + +#define DEFINE_MUTATE__(OP) \ + inline Expr Mutate_(const OP* op, const Expr& e) final { \ + auto type_code = op->type.code(); \ + bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(type_code); \ + Expr expr = IRMutator::Mutate_(op, e); \ + op = expr.as(); \ + if (toBeLowered) { \ + auto lower = datatype::Get##OP##LowerFunc(target_, type_code); \ + CHECK(lower) << #OP " lowering function for target " << target_ << " type " \ + << static_cast(type_code) << " not found"; \ + return (*lower)(expr); \ + } \ + return expr; \ + } + + DEFINE_MUTATE__(Add) + DEFINE_MUTATE__(Sub) + DEFINE_MUTATE__(Mul) + DEFINE_MUTATE__(Div) + DEFINE_MUTATE__(Mod) + DEFINE_MUTATE__(Min) + DEFINE_MUTATE__(Max) + DEFINE_MUTATE__(EQ) + DEFINE_MUTATE__(NE) + DEFINE_MUTATE__(LT) + DEFINE_MUTATE__(LE) + DEFINE_MUTATE__(GT) + DEFINE_MUTATE__(GE) + // Later changes may need to add more mutate functions as we support workloads with more ops. 
+ + private: + std::string target_; +}; + +LoweredFunc LowerCustomDatatypes(LoweredFunc f, const std::string& target) { + auto n = make_node(*f.operator->()); + n->body = CustomDatatypesLowerer(target).Mutate(n->body); + return LoweredFunc(n); +} + +} // namespace ir +} // namespace tvm diff --git a/src/pass/vectorize_loop.cc b/src/pass/vectorize_loop.cc index f87e80c2d030..8c3d383c1529 100644 --- a/src/pass/vectorize_loop.cc +++ b/src/pass/vectorize_loop.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -519,5 +519,23 @@ Stmt VectorizeLoop(Stmt stmt) { return LoopVectorizer().Mutate(stmt); } +class VectorizeSkipper : public IRMutator { + public: + Stmt Mutate_(const For* op, const Stmt& s) final { + Stmt stmt = IRMutator::Mutate_(op, s); + op = stmt.as(); + if (op->for_type == ForType::Vectorized) { + return For::make(op->loop_var, op->min, op->extent, ForType::Serial, op->device_api, + op->body); + } else { + return stmt; + } + } +}; + +Stmt SkipVectorize(Stmt stmt) { + return VectorizeSkipper().Mutate(stmt); +} + } // namespace ir } // namespace tvm diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 08a88d53350f..3feb7e4a4b54 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -18,18 +18,13 @@ */ /*! - * Copyright (c) 2019 by Contributors * \file relay/backend/build_module.cc * \brief Code generation for TVM's graph runtime. 
*/ - #include -#include +#include #include -#include -#include -#include -#include +#include #include #include "utils.h" @@ -38,88 +33,8 @@ namespace tvm { namespace relay { namespace backend { -/*! - * \brief Context name / index - * See: python/tvm/_ffi/runtime_ctypes.py - */ -struct ContextMap { - static const std::unordered_map mask2str; - static const std::unordered_map str2mask; - static std::string Mask2Str(int mask) { - CHECK_GT(mask2str.count(mask), 0) << "Unknown mask."; - return mask2str.at(mask); - } - static int Str2Mask(const std::string& str) { - CHECK_GT(str2mask.count(str), 0) << "Unknown context."; - return str2mask.at(str); - } -}; - -const std::unordered_map ContextMap::mask2str = { - {1, "cpu"}, - {2, "gpu"}, - {4, "opencl"}, - {5, "aocl"}, - {6, "sdaccel"}, - {7, "vulkan"}, - {8, "metal"}, - {9, "vpi"}, - {10, "rocm"}, - {11, "opengl"}, - {12, "ext_dev"} -}; - -const std::unordered_map ContextMap::str2mask = { - {"llvm", 1}, - {"cpu", 1}, - {"c", 1}, - {"gpu", 2}, - {"cuda", 2}, - {"nvptx", 2}, - {"cl", 4}, - {"opencl", 4}, - {"aocl", 5}, - {"aocl_sw_emu", 5}, - {"vulkan", 7}, - {"metal", 8}, - {"vpi", 9}, - {"rocm", 10}, - {"opengl", 11}, - {"ext_dev", 12} -}; - -/*! - * \brief A data structure to map the names of specific optimizations to - * numeric optimization levels - * - */ -struct OptPassLevel { - static const std::unordered_map _data; - /*! - * \brief Get level for an optimization pass - * - * \param key pass name - * \return int level - */ - int operator[](const std::string& key) const { - auto it = _data.find(key); - if (it == _data.end()) { - return -1; - } - return it->second; - } -}; - -const std::unordered_map OptPassLevel::_data = { - {"SimplifyInference", 0}, - {"OpFusion", 1}, - {"FoldConstant", 2}, - {"CombineParallelConv2D", 3}, - {"FoldScaleAxis", 3}, - {"AlterOpLayout", 3}, - {"CanonicalizeOps", 3}, - {"EliminateCommonSubexpr", 3} -}; +using TargetsMap = Map; +using namespace tvm::relay::transform; /*! 
* \brief Output of building module @@ -131,27 +46,6 @@ struct BuildOutput { std::unordered_map params; }; -/*! - * \brief Relay building config - * - */ -struct RelayBuildConfig { - int opt_level{2}; - std::string fallback_device{"llvm"}; - std::unordered_set enabled_pass; - std::unordered_set disabled_pass; - OptPassLevel OPT_PASS_LEVEL; - inline bool pass_enabled(const std::string& pass_name) const { - if (disabled_pass.count(pass_name)) { - return false; - } - if (enabled_pass.count(pass_name)) { - return true; - } - return opt_level >= OPT_PASS_LEVEL[pass_name]; - } -}; - /*! * \brief GraphCodegen module wrapper * @@ -164,14 +58,8 @@ struct GraphCodegen { } ~GraphCodegen() {} - void Init(runtime::Module* m, - Map targets) { - Array tgts; - for (auto kv : targets) { - tgts.push_back(kv.first); - tgts.push_back(kv.second); - } - CallFunc("init", m, tgts); + void Init(runtime::Module* m, TargetsMap targets) { + CallFunc("init", m, targets); } void Codegen(const Function& func) { @@ -211,18 +99,6 @@ struct GraphCodegen { } }; -template -R CallPackedFunc(const std::string &name, Args... args) { - auto pf = GetPackedFunc(name); - return (*pf)(std::forward(args)...); -} - -template -Function CallPackedFunc(const std::string &name, Args... args) { - auto pf = GetPackedFunc(name); - return (*pf)(std::forward(args)...); -} - /*! 
* \brief Relay build module * @@ -248,14 +124,7 @@ class RelayBuildModule : public runtime::ModuleNode { } else if (name == "build") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { CHECK_EQ(args.num_args, 3); - Array tmp = args[1]; - std::unordered_map targets; - for (size_t i = 0; i < tmp.size(); i += 2) { - auto k = tmp[i].as()->value; - auto v = tmp[i + 1].as()->value; - targets[k] = v; - } - this->Build(args[0], targets, args[2]); + this->Build(args[0], args[1], args[2]); }); } else if (name == "list_params") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { @@ -265,27 +134,6 @@ class RelayBuildModule : public runtime::ModuleNode { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetParams(); }); - } else if (name == "set_opt_level") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - CHECK_EQ(args.num_args, 1); - int level = args[0]; - this->SetOptLevel(level); - }); - } else if (name == "set_fallback_device") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - std::string dev = args[0]; - this->SetFallBackDev(dev); - }); - } else if (name == "add_pass") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - std::string pass_name = args[0]; - this->AddPass(pass_name); - }); - } else if (name == "disable_pass") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - std::string pass_name = args[0]; - this->DisablePass(pass_name); - }); } else if (name == "set_params") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { Map params = args[0]; @@ -307,30 +155,7 @@ class RelayBuildModule : public runtime::ModuleNode { const std::string& GetGraphJSON() { return ret_.graph_json; } - /*! - * \brief Add extra pass into build cfg - * - * \param pass_name name of pass - */ - void AddPass(const std::string& pass_name) { - cfg_.enabled_pass.insert(pass_name); - } - /*! 
- * \brief Disable a specific pass in cfg - * - * \param pass_name name of pass - */ - void DisablePass(const std::string& pass_name) { - cfg_.disabled_pass.insert(pass_name); - } - /*! - * \brief Set the Fallback device - * - * \param device name - */ - void SetFallBackDev(const std::string& dev) { - cfg_.fallback_device = dev; - } + /*! * \brief Get the Module object * @@ -345,8 +170,8 @@ class RelayBuildModule : public runtime::ModuleNode { * * \return Array names of params */ - Array ListParamNames() { - Array ret; + Array ListParamNames() { + Array ret; for (const auto& kv : params_) { ret.push_back(ir::StringImm::make(kv.first)); } @@ -376,15 +201,6 @@ class RelayBuildModule : public runtime::ModuleNode { params_[name] = data_in; } - /*! - * \brief Set the optimization level - * - * \param level - */ - void SetOptLevel(char level) { - cfg_.opt_level = level; - } - /*! * \brief type key * @@ -402,11 +218,11 @@ class RelayBuildModule : public runtime::ModuleNode { * \param target_host Host target device */ void Build(Function func, - const std::unordered_map& targets, - const std::string& target_host) { + const TargetsMap& targets, + const tvm::Target& target_host) { targets_ = targets; target_host_ = target_host; - BuildRelay(func, cfg_, params_); + BuildRelay(func, params_); } protected: @@ -416,8 +232,9 @@ class RelayBuildModule : public runtime::ModuleNode { * \param params params dict * \return relay::Function */ - relay::Function BindParamsByName(relay::Function func, - const std::unordered_map& params) { + relay::Function BindParamsByName( + relay::Function func, + const std::unordered_map& params) { std::unordered_map name_dict; std::unordered_set repeat_var; for (auto arg : func->params) { @@ -438,149 +255,147 @@ class RelayBuildModule : public runtime::ModuleNode { if (repeat_var.count(arg)) { LOG(FATAL) << "Multiple args in the function have name " << kv.first; } - auto e = CallPackedFunc("relay._make.Constant", kv.second); - bind_dict[arg] = e; + 
bind_dict[arg] = ConstantNode::make(kv.second); } - return CallPackedFunc("relay._expr.Bind", func, tvm::Map(bind_dict)); + Expr bound_expr = relay::Bind(func, bind_dict); + Function ret = Downcast(bound_expr); + CHECK(ret.defined()) + << "The returning type is expected to be a Relay Function." + << "\n"; + return ret; } /*! - * \brief Optimize Relay function + * \brief Optimize a Relay module. * - * \param func Input function - * \param target target device - * \param cfg Relay build config - * \param params params dict - * \return relay::Function + * \param relay_module The input Relay module where optmization will be + * applied on. + * \param targets The device type to `Target` mapping. + * \param params The param name to value mapping. + * + * \return relay::Module The updated Relay module after optimization. */ - relay::Function Optimize(relay::Function func, - const std::unordered_map& targets, - const RelayBuildConfig& cfg, - const std::unordered_map& params) { - if (params.size()) { - func = BindParamsByName(func, params); - } - if (cfg.pass_enabled("SimplifyInference")) { - func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); - func = CallPackedFunc("relay._ir_pass.simplify_inference", func); - } - if (cfg.pass_enabled("EliminateCommonSubexpr")) { - auto fskip = PackedFunc([](TVMArgs args, TVMRetValue* rv) { - Expr expr = args[0]; - if (expr.as()) { - auto call_node = expr.as(); - auto op_node = call_node->op.as(); - if (op_node->name == "cast") { - auto attrs = call_node->attrs.as(); - if (attrs->dtype == HalideIR::Int(32)) { - *rv = true; - } + relay::Module Optimize( + relay::Module relay_module, + const TargetsMap& targets, + const std::unordered_map& params) { + Array pass_seqs; + pass_seqs.push_back(transform::SimplifyInference()); + PackedFunc fskip = PackedFunc([](TVMArgs args, TVMRetValue* rv) { + Expr expr = args[0]; + if (expr.as()) { + auto call_node = expr.as(); + auto op_node = call_node->op.as(); + if (op_node->name == "cast") 
{ + auto attrs = call_node->attrs.as(); + if (attrs->dtype == HalideIR::Int(32)) { + *rv = true; } } - *rv = false; - }); - func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); - func = CallPackedFunc("relay._ir_pass.eliminate_common_subexpr", func, fskip); - } - if (cfg.pass_enabled("CombineParallelConv2D")) { - const int min_num_branches = 3; - func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); - func = CallPackedFunc("relay._ir_pass.CombineParallelConv2D", func, min_num_branches); - } - if (cfg.pass_enabled("FoldConstant")) { - func = CallPackedFunc("relay._ir_pass.FoldConstant", func); - } - if (cfg.pass_enabled("FoldScaleAxis")) { - func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); - func = CallPackedFunc("relay._ir_pass.backward_fold_scale_axis", func); - func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); - func = CallPackedFunc("relay._ir_pass.forward_fold_scale_axis", func); - func = CallPackedFunc("relay._ir_pass.FoldConstant", func); - } - if (cfg.pass_enabled("CanonicalizeOps")) { - func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); - func = CallPackedFunc("relay._ir_pass.canonicalize_ops", func); + } + *rv = false; + }); + pass_seqs.push_back(transform::EliminateCommonSubexpr(fskip)); + pass_seqs.push_back(transform::CombineParallelConv2D(3)); + pass_seqs.push_back(transform::FoldConstant()); + pass_seqs.push_back(transform::FoldScaleAxis()); + pass_seqs.push_back(transform::CanonicalizeCast()); + pass_seqs.push_back(transform::CanonicalizeOps()); + + // Alter layout transformation is only applied to homogeneous execution yet. 
+ if (targets.size() == 1) { + pass_seqs.push_back(transform::AlterOpLayout()); } - if (cfg.pass_enabled("AlterOpLayout")) { - if (targets.size() == 1) { - func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); - auto enter_pf = GetPackedFunc("_EnterTargetScope"); - auto exit_pf = GetPackedFunc("_ExitTargetScope"); - for (const auto& kv : targets) { - auto target = Target::create(kv.second); - (*enter_pf)(target); - func = CallPackedFunc("relay._ir_pass.AlterOpLayout", func); - (*exit_pf)(); - } - } else { - LOG(WARNING) << "AlterOpLayout pass is not enabled for heterogeneous" - << " execution yet."; + pass_seqs.push_back(transform::FoldConstant()); + + // Create a sequential pass and perform optimizations. + transform::Pass seq = transform::Sequential(pass_seqs); + if (targets.size() == 1) { + for (const auto& kv : targets) { + With tctx(kv.second); + relay_module = seq(relay_module); } + } else { + relay_module = seq(relay_module); } - if (cfg.pass_enabled("FoldConstant")) { - func = CallPackedFunc("relay._ir_pass.FoldConstant", func); + + // Handle heterogeneous compilation. + transform::PassContext pass_ctx = PassContext::Current(); + if (targets_.size() > 1) { + relay_module = + RunDeviceAnnotationPass(relay_module, pass_ctx->fallback_device); } - return func; + + // Fuse the operations if it is needed. + relay_module = transform::FuseOps()(relay_module); + relay_module = transform::InferType()(relay_module); + + return relay_module; } + + /*! + * \brief Create a default type. + * \param device_type The device type index. + * \return the default target for the device. + */ + Target CreateDefaultTarget(int device_type) { + std::string name = runtime::DeviceName(device_type); + if (name == "cpu") return Target::Create("llvm"); + if (name == "gpu") return Target::Create("cuda"); + return Target::Create(name); + } + /*! * \brief Update the target and fallback device required for heterogeneous * compilation. 
CPU is used as the fallback device if it wasn't provided. * Meanwhile, a CPU device type and "llvm" pair will be added to the target * dictionary in this case. * - * \param targets dictionary - * \param cfg - * \return Map + * \param fallback_device The fallback device for heterogeneous execution. */ - Map UpdateHeterogeneousInputs( - const std::unordered_map& targets, - const RelayBuildConfig& cfg) { - Map device_target; - std::unordered_map tmp_map; - auto fallback_idx = ContextMap::Str2Mask(cfg.fallback_device); - - for (const auto& kv : targets) { - tmp_map[ContextMap::Str2Mask(kv.first)] = kv.second; + void UpdateHeterogeneousInputs(int fallback_device) { + std::unordered_map tmp_map; + for (const auto& kv : targets_) { + tmp_map[kv.first->value] = kv.second; } - if (tmp_map.count(fallback_idx) == 0) { - tmp_map[fallback_idx] = cfg.fallback_device; + if (tmp_map.count(fallback_device) == 0) { + targets_.Set(fallback_device, CreateDefaultTarget(fallback_device)); } - for (const auto& kv : tmp_map) { - device_target.Set( - ir::IntImm::make(HalideIR::Int(64), kv.first), - ir::StringImm::make(kv.second)); - } - return device_target; } + /*! * \brief Execute the device annotation passes to update the input program and * target information. * - * \param func - * \param cfg - * \param targets_map_ptr - * \return Function + * \param relay_module The input Relay module. + * \param fallback_device The fallback device for heterogeneous execution. + * + * \return updated_module The updated module after device annotation. 
*/ - Function RunDeviceAnnotationPass( - Function func, - const RelayBuildConfig& cfg, - Map* targets_map_ptr) { - auto fallback_idx = ContextMap::Str2Mask(cfg.fallback_device); - func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); - func = CallPackedFunc("relay._ir_pass.RewriteDeviceAnnotation", func, fallback_idx); - auto device_map = CallPackedFunc >("relay._ir_pass.CollectDeviceInfo", - func, - nullptr); - if (device_map.size() == 0) { - auto annotation_map = - CallPackedFunc >("relay._ir_pass.CollectDeviceAnnotationOps", - func, - nullptr); - if (annotation_map.size() == 0) { - targets_map_ptr->Set( - ir::IntImm::make(HalideIR::Int(64), 0), - ir::StringImm::make(cfg.fallback_device)); + relay::Module RunDeviceAnnotationPass(const relay::Module& relay_module, + int fallback_device) { + UpdateHeterogeneousInputs(fallback_device); + auto rewrite = transform::RewriteAnnotatedOps(fallback_device); + auto updated_module = rewrite(relay_module); + CHECK(updated_module.defined()); + + tvm::Map device_map; + for (const auto& it : updated_module->functions) { + device_map = relay::CollectDeviceInfo(it.second); + if (!device_map.empty()) break; + } + + if (device_map.empty()) { + tvm::Map annotation_map; + for (const auto& it : relay_module->functions) { + annotation_map = relay::CollectDeviceAnnotationOps(it.second); + if (!annotation_map.empty()) break; + } + // None op is annotated but they are fallen back to the default device. + if (annotation_map.empty()) { + targets_.Set(0, CreateDefaultTarget(fallback_device)); } else { + // All ops are annotated to the same device type. int64_t dev_type = -1; for (auto kv : annotation_map) { dev_type = kv.second->value; @@ -594,70 +409,54 @@ class RelayBuildModule : public runtime::ModuleNode { << "found. 
Please check the " << "RewriteAnnotation pass."; } - targets_map_ptr->Set( - ir::IntImm::make(HalideIR::Int(64), 0), - ir::StringImm::make(ContextMap::Mask2Str(dev_type))); + targets_.Set(0, CreateDefaultTarget(dev_type)); } } - return func; + return updated_module; } /*! * \brief Build relay function to runtime module * * \param func Relay Function - * \param cfg Relay build config * \param params parameters */ - void BuildRelay(Function func, - const RelayBuildConfig& cfg, - const std::unordered_map ¶ms) { - // convert - tvm_cfg_ = build_config(); - Map device_target; - if (targets_.size() > 1) { - device_target = UpdateHeterogeneousInputs(targets_, cfg); - } else { - for (auto &kv : targets_) { - device_target.Set( - ir::IntImm::make(HalideIR::Int(64), ContextMap::Str2Mask(kv.first)), - ir::StringImm::make(kv.second)); - } - } - func = Optimize(func, targets_, cfg, params); - if (device_target.size() > 1) { - func = RunDeviceAnnotationPass(func, cfg, &device_target); + void BuildRelay( + Function func, + const std::unordered_map& params) { + if (params.size()) { + func = BindParamsByName(func, params); } - // TODO(@jroesch): use the passes directly. - func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); - func = CallPackedFunc("relay._ir_pass.FuseOps", func, cfg.opt_level, nullptr); - func = CallPackedFunc("relay._ir_pass.infer_type", func, nullptr); + // Perform Module->Module optimizations. + relay::Module relay_module = relay::ModuleNode::FromExpr(func); + relay_module = Optimize(relay_module, targets_, params); + CHECK(relay_module.defined()); + // Get the updated function. + func = relay_module->Lookup(relay_module->entry_func->name_hint); + + // Generate code for the updated function. 
graph_codegen_ = std::unique_ptr(new GraphCodegen()); - graph_codegen_->Init(nullptr, device_target); + graph_codegen_->Init(nullptr, targets_); graph_codegen_->Codegen(func); ret_.graph_json = graph_codegen_->GetJSON(); ret_.params = graph_codegen_->GetParams(); - auto target_host = Target::create(target_host_); - ret_.mod = tvm::build(graph_codegen_->GetLoweredFunc(), target_host, tvm_cfg_); + ret_.mod = tvm::build(graph_codegen_->GetLoweredFunc(), target_host_, + BuildConfig::Current()); } protected: std::unique_ptr graph_codegen_; /*! \brief target device */ - std::unordered_map targets_; + TargetsMap targets_; /*! \brief target host device */ - std::string target_host_; - /*! \brief frontend optimization configure */ - RelayBuildConfig cfg_; + tvm::Target target_host_; /*! \brief parameters */ std::unordered_map params_; /*! \brief building output */ BuildOutput ret_; - /*! \brief tvm building cfg */ - BuildConfig tvm_cfg_; }; runtime::Module RelayBuildCreate() { @@ -665,7 +464,8 @@ runtime::Module RelayBuildCreate() { return runtime::Module(exec); } -TVM_REGISTER_GLOBAL("relay.build_module._BuildModule").set_body([](TVMArgs args, TVMRetValue* rv) { +TVM_REGISTER_GLOBAL("relay.build_module._BuildModule") +.set_body([](TVMArgs args, TVMRetValue* rv) { *rv = RelayBuildCreate(); }); diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index a824c457107a..f11dd2875b80 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -344,7 +344,7 @@ class CompileEngineImpl : public CompileEngineNode { cache_[key] = value; } // Enforce use the target. - TargetContext target_ctx(key->target); + With target_scope(key->target); CHECK(!value->cached_func.defined()); auto spair = CreateSchedule(key->source_func, key->target); @@ -371,7 +371,7 @@ class CompileEngineImpl : public CompileEngineNode { cache_node->funcs = (*f)( spair.first, all_args, cache_node->func_name, key->source_func); } else { - tvm::BuildConfig bcfg = tvm::build_config(); + tvm::BuildConfig bcfg = BuildConfig::Create(); std::unordered_map binds; cache_node->funcs = tvm::lower(spair.first, all_args, cache_node->func_name, binds, bcfg); } diff --git a/src/relay/backend/graph_runtime_codegen.cc b/src/relay/backend/graph_runtime_codegen.cc index 415e0ec9c2a5..b14448c59166 100644 --- a/src/relay/backend/graph_runtime_codegen.cc +++ b/src/relay/backend/graph_runtime_codegen.cc @@ -51,7 +51,7 @@ using GraphAttrs = std::unordered_map; using GraphNodePtr = std::shared_ptr; using GraphInputNodePtr = std::shared_ptr; using GraphOpNodePtr = std::shared_ptr; -using TargetsMap = std::unordered_map; +using TargetsMap = std::unordered_map; /*! 
\brief Lowered outputs */ struct LoweredOutput { @@ -193,12 +193,10 @@ class GraphOpNode : public GraphNode { class GraphRuntimeCodegen : public ::tvm::relay::ExprFunctor(const Expr&)> { public: - GraphRuntimeCodegen(runtime::Module* mod, - const std::unordered_map& targets) : mod_(mod) { + GraphRuntimeCodegen(runtime::Module* mod, const TargetsMap& targets) + : mod_(mod) { compile_engine_ = CompileEngine::Global(); - for (auto &kv : targets) { - targets_[kv.first] = Target::create(kv.second); - } + targets_ = targets; } LoweredOutput Codegen(relay::Function func) { @@ -406,7 +404,7 @@ class GraphRuntimeCodegen auto pf0 = GetPackedFunc("relay.backend._make_CCacheKey"); auto pf1 = GetPackedFunc("relay.backend._CompileEngineLower"); auto &device_type = storage_device_map_[expr][1]; - auto call_dev_type = device_type[0]->value; //-> int to string + auto call_dev_type = device_type[0]->value; Target target; if (targets_.size() == 1) { // homogeneous execution. @@ -415,22 +413,17 @@ class GraphRuntimeCodegen } } else { // heterogeneous execution. 
- const auto call_dev_key = std::to_string(call_dev_type); std::string call_dev_name; if (call_dev_type == 0) { call_dev_name = "llvm"; } else { call_dev_name = runtime::DeviceName(call_dev_type); } - if (targets_.count(call_dev_name) == 0 && targets_.count(call_dev_key) == 0) { + if (targets_.count(call_dev_type) == 0) { LOG(FATAL) << "No target is provided for device " << call_dev_name; } - if (targets_.count(call_dev_key)) { - target = targets_[call_dev_key]; - } else { - target = targets_[call_dev_name]; - } + target = targets_[call_dev_type]; } CCacheKey key = (*pf0)(func, target); CachedFunc lowerd_func = (*pf1)(compile_engine_, key); @@ -604,30 +597,21 @@ class GraphRuntimeCodegenModule : public runtime::ModuleNode { virtual PackedFunc GetFunction(const std::string& name, const std::shared_ptr& sptr_to_self) { if (name == "init") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - CHECK_EQ(args.num_args, 2) << "The expected of arguments are: " - << "runtime::Module mod and Map targets"; - void* mod = args[0]; - auto& sptr = args[1].node_sptr(); - auto* node = static_cast(sptr.get()); - auto& tmp_targets = node->data; - std::unordered_map targets; - for (size_t i = 0; i < tmp_targets.size(); i += 2) { - std::string key; - auto sk = Expr(tmp_targets[i]).as(); - auto ik = Expr(tmp_targets[i]).as(); - if (sk) { - key = sk->value; - } - if (ik) { - key = std::to_string(ik->value); - } - auto v = Expr(tmp_targets[i + 1]).as(); - targets[key] = v->value; - } - codegen_ = std::make_shared( - reinterpret_cast(mod), targets); - }); + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + CHECK_EQ(args.num_args, 2) + << "The expected of arguments are: " + << "runtime::Module mod and Map targets"; + void* mod = args[0]; + Map tmp = args[1]; + TargetsMap targets; + for (const auto& it : tmp) { + auto dev_type = it.first.as(); + CHECK(dev_type); + targets[dev_type->value] = it.second; + } + codegen_ = std::make_shared( + 
reinterpret_cast(mod), targets); + }); } else if (name == "codegen") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { Function func = args[0]; diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc index d700c2036e21..1cc81d5174a5 100644 --- a/src/relay/backend/interpreter.cc +++ b/src/relay/backend/interpreter.cc @@ -103,11 +103,13 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) p->stream << "RefValueNode(" << node->value << ")"; }); -ConstructorValue ConstructorValueNode::make(Constructor constructor, - tvm::Array fields) { +ConstructorValue ConstructorValueNode::make(int tag, + tvm::Array fields, + Constructor constructor) { NodePtr n = make_node(); - n->constructor = constructor; + n->tag = tag; n->fields = fields; + n->constructor = constructor; return ConstructorValue(n); } @@ -117,7 +119,7 @@ TVM_REGISTER_API("relay._make.ConstructorValue") TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const ConstructorValueNode* node, tvm::IRPrinter* p) { - p->stream << "ConstructorValueNode(" << node->constructor + p->stream << "ConstructorValueNode(" << node->tag << "," << node->fields << ")"; }); @@ -448,7 +450,7 @@ class Interpreter : "fusing and lowering"; } if (auto con = call->op.as()) { - return ConstructorValueNode::make(GetRef(con), args); + return ConstructorValueNode::make(con->tag, args, GetRef(con)); } // Now we just evaluate and expect to find a closure. 
Value fn_val = Eval(call->op); @@ -544,9 +546,8 @@ class Interpreter : const ConstructorValueNode* cvn = v.as(); CHECK(cvn) << "need to be a constructor for match"; CHECK_NE(op->constructor->tag, -1); - CHECK_NE(cvn->constructor->tag, -1); - if (op->constructor->tag == cvn->constructor->tag) { - // todo(M.K.): should use ptr equality but it is broken + CHECK_NE(cvn->tag, -1); + if (op->constructor->tag == cvn->tag) { CHECK_EQ(op->patterns.size(), cvn->fields.size()); for (size_t i = 0; i < op->patterns.size(); ++i) { if (!VisitPattern(op->patterns[i], cvn->fields[i])) { diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 97f03c629cb7..dae90aad17d9 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include @@ -38,15 +38,22 @@ namespace tvm { namespace relay { + +namespace transform { + +Pass LambdaLift(); +Pass InlinePrimitives(); + +} // namespace transform + namespace vm { using namespace tvm::runtime; using namespace tvm::runtime::vm; +using namespace relay::transform; // (@jroesch): VM passes, eventually declare as passes. bool IsClosure(const Function& func); -Module LambdaLift(const Module& module); -Module InlinePrimitives(const Module& module); template using NodeMap = std::unordered_map; @@ -73,6 +80,8 @@ struct VMCompilerContext { ConstTensorShapeMap const_tensor_shape_map; // List of lowered functions std::vector lowered_funcs; + // The functions that have been lowered. + std::unordered_map seen_funcs; }; // Compute the constant pool, i.e a mapping from Constant node to constant index. 
@@ -94,13 +103,6 @@ struct ConstantPool : ExprVisitor { } } - void AddConstantTensorShape(TensorType expr, NDArray value) { - auto it = this->const_tensor_shape_map.find(expr); - if (it == this->const_tensor_shape_map.end()) { - this->const_tensor_shape_map.insert({expr, std::make_pair(index++, value)}); - } - } - void VisitExpr_(const ConstantNode* const_node) { auto konst = GetRef(const_node); auto it = this->const_map.find(konst); @@ -108,48 +110,6 @@ struct ConstantPool : ExprVisitor { this->const_map.insert({konst, index++}); } } - - NDArray GetTensorConstant(const TensorTypeNode* ttype) { - std::vector shapes; - for (auto sh : ttype->shape) { - shapes.push_back(Downcast(sh)->value); - } - int64_t s = shapes.size(); - DLContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; - auto shape_tensor = NDArray::Empty({s}, Type2TVMType(Int(64)), cpu_ctx); - int64_t* dims = static_cast(shape_tensor->data); - for (size_t i = 0; i < shapes.size(); ++i) { - dims[i] = shapes[i]; - } - return shape_tensor; - } - - void VisitExpr_(const CallNode* call_node) { - for (auto arg : call_node->args) { - this->VisitExpr(arg); - } - - Expr op = call_node->op; - auto func_node = op.as(); - if (func_node) { - auto ret_type = call_node->checked_type(); - if (const TensorTypeNode* ttype = ret_type.as()) { - auto shape = GetTensorConstant(ttype); - auto tensor_type = GetRef(ttype); - AddConstantTensorShape(tensor_type, shape); - } else if (const TupleTypeNode* ttype = ret_type.as()) { - for (size_t i = 0; i < ttype->fields.size(); ++i) { - auto f = ttype->fields[i]; - auto f_type = f.as(); - auto shape = GetTensorConstant(f_type); - auto tensor_type = GetRef(f_type); - AddConstantTensorShape(tensor_type, shape); - } - } - } - } }; std::tuple LayoutConstantPool(const Module& module) { @@ -177,9 +137,6 @@ struct VMCompiler : ExprFunctor { size_t registers_num; CompileEngine engine; - /*! \brief The functions that have been lowered. 
*/ - std::unordered_map seen_funcs; - /*! \brief Global shared meta data */ VMCompilerContext* context; @@ -200,6 +157,7 @@ struct VMCompiler : ExprFunctor { switch (instr.op) { case Opcode::AllocDatatype: case Opcode::AllocTensor: + case Opcode::AllocTensorReg: case Opcode::GetField: case Opcode::LoadConst: case Opcode::Select: @@ -254,13 +212,13 @@ struct VMCompiler : ExprFunctor { void VisitExpr_(const MatchNode* match_node) { auto match = GetRef(match_node); LOG(FATAL) << "translation of match nodes to the VM is" - << "currently unsupported" << std::endl; + << "currently unsupported"; } void VisitExpr_(const LetNode* let_node) { - DLOG(INFO) << let_node->value << std::endl; + DLOG(INFO) << let_node->value; this->VisitExpr(let_node->value); - DLOG(INFO) << this->last_register << std::endl; + DLOG(INFO) << this->last_register; var_register_map.insert({let_node->var, this->last_register}); this->VisitExpr(let_node->body); } @@ -273,7 +231,12 @@ struct VMCompiler : ExprFunctor { } void VisitExpr_(const GlobalVarNode* gvar) { - LOG(FATAL) << "Global variables should only appear in the call position"; + auto var = GetRef(gvar); + auto func = this->context->module->Lookup(var); + auto it = this->context->global_map.find(var); + CHECK(it != this->context->global_map.end()); + // Allocate closure with zero free vars + Emit(Instruction::AllocClosure(it->second, 0, {}, NewRegister())); } void VisitExpr_(const IfNode* if_node) { @@ -320,29 +283,51 @@ struct VMCompiler : ExprFunctor { } Instruction AllocTensorFromType(const TensorTypeNode* ttype) { - DataType dtype = ttype->dtype; - TVMType dltype = Type2TVMType(dtype); - + TVMType dltype = Type2TVMType(ttype->dtype); auto tensor_type = GetRef(ttype); - auto it = this->context->const_tensor_shape_map.find(tensor_type); - if (it == this->context->const_tensor_shape_map.end()) { - DLOG(INFO) << "Can not find constant shape for " << tensor_type; - } else { - Emit(Instruction::LoadConst(it->second.first, NewRegister())); + 
std::vector shape; + for (auto dim : tensor_type->shape) { + shape.push_back(Downcast(dim)->value); } - - return Instruction::AllocTensor(last_register, dltype, NewRegister()); + return Instruction::AllocTensor(shape, dltype, NewRegister()); } - void EmitInvokePrimitive(const Function& func, std::vector args_registers, + void EmitInvokePrimitive(const Function& func, + const std::vector& args_registers, const Type& ret_type) { + std::vector unpacked_arg_regs; std::vector allocs; - size_t return_num = 0; + + // Arity calculation must flatten tuples. + size_t arity = 0; + CHECK_EQ(func->params.size(), args_registers.size()); + for (size_t i = 0; i < func->params.size(); i++) { + auto ty = func->params[i]->checked_type(); + if (ty.as()) { + unpacked_arg_regs.push_back(args_registers[i]); + arity += 1; + } else if (auto tuple_ty = ty.as()) { + for (size_t f = 0; f < tuple_ty->fields.size(); f++) { + const auto& field = tuple_ty->fields[f]; + CHECK(field.as()) + << "only supports non-nested tuples currently " + << "found " << field; + auto dst = NewRegister(); + Emit(Instruction::GetField(args_registers[i], f, dst)); + unpacked_arg_regs.push_back(dst); + } + arity += tuple_ty->fields.size(); + } else { + LOG(FATAL) << "unsupported parameter type " << ty; + } + } + + size_t return_val_count = 0; if (const TensorTypeNode* ttype = ret_type.as()) { // Allocate space for the return tensor. 
auto alloc = AllocTensorFromType(ttype); allocs.push_back(alloc); - return_num = 1; + return_val_count = 1; } else if (const TupleTypeNode* ttype = ret_type.as()) { std::vector fields_registers; @@ -352,43 +337,42 @@ struct VMCompiler : ExprFunctor { allocs.push_back(AllocTensorFromType(f_type)); fields_registers.push_back(allocs.back().dst); } - return_num = ttype->fields.size(); + return_val_count = ttype->fields.size(); } else { LOG(FATAL) << "Unsupported return value type"; } + arity += return_val_count; for (auto& alloc : allocs) { Emit(alloc); - args_registers.push_back(alloc.dst); + unpacked_arg_regs.push_back(alloc.dst); } // Next generate the invoke instruction. CHECK(func->IsPrimitive()); - auto target = Target::create("llvm"); + auto target = Target::Create("llvm"); auto key = CCacheKeyNode::make(func, target); auto cfunc = engine->Lower(key); // TODO(jroesch): support lowered funcs for multiple targets CHECK_EQ(cfunc->funcs.size(), 1); auto op_index = -1; - if (seen_funcs.find(cfunc->funcs[0]) == seen_funcs.end()) { + if (this->context->seen_funcs.find(cfunc->funcs[0]) == this->context->seen_funcs.end()) { op_index = this->context->lowered_funcs.size(); this->context->lowered_funcs.push_back(cfunc->funcs[0]); - seen_funcs[cfunc->funcs[0]] = op_index; + this->context->seen_funcs[cfunc->funcs[0]] = op_index; } else { - op_index = seen_funcs[cfunc->funcs[0]]; + op_index = this->context->seen_funcs[cfunc->funcs[0]]; } - // If Tensor, 1 - // If Tuple, size of tuple - size_t arity = func->params.size() + return_num; - Emit(Instruction::InvokePacked(op_index, arity, return_num, args_registers)); - if (return_num > 1) { + Emit(Instruction::InvokePacked(op_index, arity, return_val_count, unpacked_arg_regs)); + + if (return_val_count > 1) { // return value is a tuple, we need to create a tuple std::vector fields_registers; - for (size_t i = func->params.size(); i < arity; ++i) { - fields_registers.push_back(args_registers[i]); + for (size_t i = arity - 
return_val_count; i < arity; ++i) { + fields_registers.push_back(unpacked_arg_regs[i]); } - Emit(Instruction::AllocDatatype(0, return_num, fields_registers, NewRegister())); + Emit(Instruction::AllocDatatype(0, return_val_count, fields_registers, NewRegister())); } } @@ -396,7 +380,6 @@ struct VMCompiler : ExprFunctor { std::vector args_registers; for (auto arg : call_node->args) { - CHECK(arg.as()) << "found: " << AsText(arg, false) << std::endl << arg; this->VisitExpr(arg); args_registers.push_back(last_register); } @@ -416,18 +399,14 @@ struct VMCompiler : ExprFunctor { auto func = this->context->module->Lookup(global); if (IsClosure(func)) { auto arity = func->params.size(); - std::vector free_var_registers; - for (size_t i = 0; i < arity; ++i) { - free_var_registers.push_back(var_register_map.at(func->params[i])); - } - Emit(Instruction::AllocClosure(it->second, arity, free_var_registers, NewRegister())); + Emit(Instruction::AllocClosure(it->second, arity, args_registers, NewRegister())); } else { Emit(Instruction::Invoke(it->second, args_registers, NewRegister())); } } else if (auto constructor_node = op.as()) { auto constructor = GetRef(constructor_node); - auto tag = GetConstructorTag(constructor); - Emit(Instruction::AllocDatatype(tag, call_node->args.size(), args_registers, NewRegister())); + Emit(Instruction::AllocDatatype(constructor->tag, call_node->args.size(), args_registers, + NewRegister())); } else if (auto var_node = op.as()) { VisitExpr(GetRef(var_node)); Emit(Instruction::InvokeClosure(last_register, args_registers, NewRegister())); @@ -436,18 +415,6 @@ struct VMCompiler : ExprFunctor { } } - size_t GetConstructorTag(tvm::relay::Constructor constructor) { - auto it = this->context->tag_map.find(constructor); - if (it != this->context->tag_map.end()) { - return it->second; - } else { - auto tag = this->context->tag_map.size(); - this->context->tag_map[constructor] = tag; - this->context->tag_index_map[tag] = constructor; - return tag; - } - } - 
void VisitExpr_(const FunctionNode* func_node) { if (!func_node->IsPrimitive()) { LOG(FATAL) << "local functions should have been removed by lambda lifting:" << std::endl @@ -502,7 +469,7 @@ void PopulatePackedFuncMap(const std::vector& lowered_funcs, runtime::Module mod; if (lowered_funcs.size() > 0) { // TODO(@jroesch): we need to read target from build config - Target target = Target::create("llvm"); + Target target = Target::Create("llvm"); if (const auto* f = runtime::Registry::Get("relay.backend.build")) { mod = (*f)(tvm::Array(lowered_funcs.begin(), lowered_funcs.end()), target); } else { @@ -516,7 +483,7 @@ void PopulatePackedFuncMap(const std::vector& lowered_funcs, } VMFunction CompileFunc(VMCompilerContext* context, const GlobalVar& var, const Function& func) { - DLOG(INFO) << "CompileFunc: " << std::endl << AsText(func, false) << std::endl; + DLOG(INFO) << "CompileFunc: " << var << std::endl << AsText(func, false); size_t params = func->params.size(); VMCompiler compiler(context); compiler.Compile(func); @@ -534,10 +501,13 @@ VMFunction CompileFunc(VMCompilerContext* context, const GlobalVar& var, const F } Module OptimizeModule(const Module& mod) { - ToANormalForm(mod->entry_func, mod); - InlinePrimitives(mod); - LambdaLift(mod); - return InlinePrimitives(mod); + transform::Sequential seq({transform::ToANormalForm(), + transform::InlinePrimitives(), + transform::LambdaLift(), + transform::InlinePrimitives()}); + auto pass_ctx = transform::PassContext::Create(); + tvm::With ctx(pass_ctx); + return seq(mod); } void PopulateGlobalMap(GlobalMap* global_map, const Module& mod) { diff --git a/src/relay/backend/vm/inline_primitives.cc b/src/relay/backend/vm/inline_primitives.cc index b033a37e42b8..1e561f8a8214 100644 --- a/src/relay/backend/vm/inline_primitives.cc +++ b/src/relay/backend/vm/inline_primitives.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include @@ -37,6 +37,21 @@ namespace tvm { namespace relay { 
namespace vm { +// TODO(@jroesch): write verifier + +/* This pass will eliminate primitives which have been lifted by the ANF + * transform inlining them directly into call sites. + * + * This makes VM related code generation easier as the call target is always + * a primitive function. + * + * let prim = fn(...) { ... }; + * prim(...) + * + * will become: + * + * (fn(...) { ... })(...) + */ struct PrimitiveInliner : ExprMutator { Module module_; std::unordered_map var_map; @@ -92,55 +107,46 @@ struct PrimitiveInliner : ExprMutator { } } - Function Inline(const Function& func) { - DLOG(INFO) << "Before inlining primitives: " << std::endl - << "func= " << AsText(func, false) << std::endl; - - auto inlined = FunctionNode::make(func->params, VisitExpr(func->body), func->ret_type, - func->type_params, func->attrs); - - inlined = Downcast(DeadCodeElimination(inlined)); - - DLOG(INFO) << "After inlining primitives" << std::endl - << "after_func= " << AsText(inlined, false) << std::endl; - return inlined; + Module Inline() { + auto gvar_funcs = module_->functions; + for (auto pair : gvar_funcs) { + auto global = pair.first; + auto func = pair.second; + DLOG(INFO) << "Before inlining primitives: " << global + << std::endl << AsText(func, false); + + func = FunctionNode::make(func->params, + VisitExpr(func->body), + func->ret_type, + func->type_params, + func->attrs); + module_->Add(global, func, true); + + DLOG(INFO) << "After inlining primitives: " << global + << std::endl << AsText(func, false); + } + return module_; } }; -// TODO(@jroesch): write verifier - -/* This pass will eliminate primitives which have been lifted by the ANF - * transform inlining them directly into call sites. - * - * This makes VM related code generation easier as the call target is always - * a primitive function. - * - * let prim = fn(...) { ... }; - * prim(...) - * - * will become: - * - * (fn(...) { ... })(...) 
- */ -Module InlinePrimitives(const Module& module) { - PrimitiveInliner inliner(module); +} // namespace vm - tvm::Map updates; +namespace transform { - // There is an ordering bug here. - for (auto pair : module->functions) { - auto global = pair.first; - auto func = pair.second; - updates.Set(global, inliner.Inline(func)); - } +Pass InlinePrimitives() { + runtime::TypedPackedFunc pass_func = + [=](Module m, PassContext pc) { + return relay::vm::PrimitiveInliner(m).Inline(); + }; + auto inline_pass = CreateModulePass(pass_func, 1, "Inline", {}); + // Eliminate dead code for each function after inlining. + return Sequential({inline_pass, DeadCodeElimination()}, "InlinePrimitives"); +} - for (auto pair : updates) { - module->Add(pair.first, pair.second, true); - } +TVM_REGISTER_API("relay._transform.InlinePrimitives") +.set_body_typed(InlinePrimitives); - return module; -} +} // namespace transform -} // namespace vm } // namespace relay } // namespace tvm diff --git a/src/relay/backend/vm/lambda_lift.cc b/src/relay/backend/vm/lambda_lift.cc index 13d8112440fb..668c024a8d55 100644 --- a/src/relay/backend/vm/lambda_lift.cc +++ b/src/relay/backend/vm/lambda_lift.cc @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -54,9 +55,14 @@ Function MarkClosure(const Function& func) { return FunctionSetAttr(func, kIsClosure, tvm::Integer(1)); } +/* The goal of this class is to lift out any nested functions into top-level + * functions. + * + * We will lift a function out into a global which takes the set of the free + * vars and then return the new created function. 
+ */ struct LambdaLifter : ExprMutator { Module module_; - std::vector> lifted_; explicit LambdaLifter(const Module& module) : module_(module) {} Expr VisitExpr_(const FunctionNode* func_node) final { @@ -71,8 +77,7 @@ struct LambdaLifter : ExprMutator { auto free_type_vars = FreeTypeVars(func, module_); auto body = Downcast(ExprMutator::VisitExpr_(func_node)); - // When performing this optimization there are two - // cases. + // When performing this optimization there are two cases. // // The first case in which we have no free variables // we can just lift the function into the global @@ -80,7 +85,7 @@ struct LambdaLifter : ExprMutator { // // // The second case requires that we generate a special - // function with makes a distinction between allocating + // function which makes a distinction between allocating // a closure, and then the code for the closure. // // We represent a closure allocation by lifting the @@ -92,7 +97,7 @@ struct LambdaLifter : ExprMutator { // function marked as a closure is used to emit allocation // code for the closure's environment. // - // The "inner" function is should be used to generate the + // The "inner" function should be used to generate the // code for the closure. Function lifted_func; if (free_vars.size() == 0) { @@ -107,16 +112,16 @@ struct LambdaLifter : ExprMutator { CHECK(lifted_func.defined()); auto name = GenerateName(lifted_func); - auto global = this->module_->GetGlobalVar(name); + auto global = GlobalVarNode::make(name); - lifted_.push_back({global, lifted_func}); + // Add the lifted function to the module. + module_->Add(global, lifted_func); if (free_vars.size() == 0) { return std::move(global); } else { - // If we need to allocate a closure - // we pass the variables in its environment - // here. + // If we need to allocate a closure, + // we pass the variables in its environment here. 
Array fvs; for (auto fv : free_vars) { fvs.push_back(fv); @@ -125,42 +130,39 @@ struct LambdaLifter : ExprMutator { } } - Function Lift(const Function& func) { - DLOG(INFO) << "Lifting: " << AsText(func, false) << std::endl; - return FunctionNode::make(func->params, VisitExpr(func->body), func->ret_type, - func->type_params, func->attrs); + Module Lift() { + // There is an ordering bug here. + auto glob_funcs = module_->functions; + for (auto pair : glob_funcs) { + auto func = pair.second; + DLOG(INFO) << "Lifting " << AsText(func, false); + func = FunctionNode::make(func->params, + VisitExpr(func->body), + func->ret_type, + func->type_params, + func->attrs); + module_->Add(pair.first, func, true); + } + return module_; } }; -/* The goal of this pass is to lift out any nested functions into top-level - * functions. - * - * We will lift the functions out into globals which take the set of the free vars - * and then return a function whcih has b - */ -Module LambdaLift(const Module& module) { - LambdaLifter lifter(module); - - tvm::Map updates; +} // namespace vm - // There is an ordering bug here. 
- for (auto pair : module->functions) { - auto global = pair.first; - auto func = pair.second; - updates.Set(global, lifter.Lift(func)); - } +namespace transform { - for (auto i = lifter.lifted_.begin(); i != lifter.lifted_.end(); i++) { - module->Add(i->first, i->second); - } +Pass LambdaLift() { + runtime::TypedPackedFunc pass_func = + [=](Module m, PassContext pc) { + return relay::vm::LambdaLifter(m).Lift(); + }; + return CreateModulePass(pass_func, 1, "LambdaLift", {}); +} - for (auto pair : updates) { - module->Add(pair.first, pair.second, true); - } +TVM_REGISTER_API("relay._transform.LambdaLift") +.set_body_typed(LambdaLift); - return module; -} +} // namespace transform -} // namespace vm } // namespace relay } // namespace tvm diff --git a/src/relay/backend/vm/vm.cc b/src/relay/backend/vm/vm.cc index 34d067b9c68c..cf0b952005fc 100644 --- a/src/relay/backend/vm/vm.cc +++ b/src/relay/backend/vm/vm.cc @@ -63,24 +63,21 @@ Object EvaluateModule(const Module& module, const std::vector ctxs, return res; } -Value VMToValue(const relay::Module& module, const relay::Type& type, Object obj) { - CHECK(module.defined() && type.defined()); +Value VMToValue(const relay::Module& module, Object obj) { + CHECK(module.defined()); switch (obj->tag) { case ObjectTag::kTensor: { - CHECK(type.as()) << "VM internal error: return value must be a tensor"; return TensorValueNode::make(ToNDArray(obj)); } case ObjectTag::kDatatype: { - // const auto* tuple_type - // const auto& data_type = obj.AsDatatype(); + const auto& data_type = obj.AsDatatype(); - // tvm::Array fields; - // for (size_t i = 0; i < data_type->fields.size(); ++i) { - // fields.push_back(VMToValue(tag_index_map, data_type->fields[i])); - // } + tvm::Array fields; + for (size_t i = 0; i < data_type->fields.size(); ++i) { + fields.push_back(VMToValue(module, data_type->fields[i])); + } - // return ConstructorValueNode::make(tag_index_map.at(data_type->tag), fields); - LOG(FATAL) << "fix me"; + return 
ConstructorValueNode::make(data_type->tag, fields); } default: LOG(FATAL) << "unsupported return value of type: " << obj->tag; @@ -141,8 +138,6 @@ TVM_REGISTER_API("relay._vm._evaluate_vm").set_body([](TVMArgs args, TVMRetValue LOG(FATAL) << "expected function or module"; } - auto return_type = module->Lookup(module->entry_func)->ret_type; - std::vector vm_args; for (auto i = 3; i < args.size(); i++) { Object obj = args[i]; @@ -151,7 +146,7 @@ TVM_REGISTER_API("relay._vm._evaluate_vm").set_body([](TVMArgs args, TVMRetValue auto result = EvaluateModule(module, {ctx}, vm_args); DLOG(INFO) << "Evaluate VM returning: result=" << result->tag; - *ret = VMToValue(module, return_type, result); + *ret = VMToValue(module, result); }); } // namespace vm diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc index 64706933fde3..e0ec10a87061 100644 --- a/src/relay/ir/expr.cc +++ b/src/relay/ir/expr.cc @@ -220,8 +220,8 @@ TVM_REGISTER_API("relay._make.Call") TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const CallNode* node, tvm::IRPrinter* p) { - p->stream << "CallNode(" << node->op << ", " << node->args << ", " - << node->attrs << ", " << node->type_args << ")"; + p->stream << "CallNode(" << node->op << ", " << node->args << ", " + << node->attrs << ", " << node->type_args << ")"; }); Let LetNode::make(Var var, Expr value, Expr body) { @@ -324,7 +324,7 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) TVM_REGISTER_API("relay._expr.TempExprRealize") .set_body_typed([](TempExpr temp) { - return temp->Realize(); + return temp->Realize(); }); } // namespace relay diff --git a/src/relay/ir/hash.cc b/src/relay/ir/hash.cc index c56c4ce17067..c57475476e58 100644 --- a/src/relay/ir/hash.cc +++ b/src/relay/ir/hash.cc @@ -219,6 +219,9 @@ class RelayHashHandler: size_t BindVar(const NodeRef& var) { size_t hash = std::hash()(var_counter++); CHECK_EQ(hash_map_.count(var), 0); + if (auto var_node = var.as()) { + hash = Combine(hash, 
TypeHash(var_node->type_annotation)); + } hash_map_[var] = hash; const auto* ty_param = var.as(); diff --git a/src/relay/ir/module.cc b/src/relay/ir/module.cc index 6b5fee82af89..58f614a3cc77 100644 --- a/src/relay/ir/module.cc +++ b/src/relay/ir/module.cc @@ -57,15 +57,11 @@ Module ModuleNode::make(tvm::Map global_funcs, return Module(n); } -GlobalVar ModuleNode::GetGlobalVar(const std::string& name) { +GlobalVar ModuleNode::GetGlobalVar(const std::string& name) const { auto it = global_var_map_.find(name); - if (it == global_var_map_.end()) { - auto gvar = GlobalVarNode::make(name); - global_var_map_.Set(name, gvar); - return gvar; - } else { - return (*it).second; - } + CHECK(it != global_var_map_.end()) + << "Cannot find global var " << name << " in the Module"; + return (*it).second; } void ModuleNode::AddUnchecked(const GlobalVar& var, @@ -84,7 +80,7 @@ void ModuleNode::AddUnchecked(const GlobalVar& var, global_var_map_.Set(var->name_hint, var); } -GlobalTypeVar ModuleNode::GetGlobalTypeVar(const std::string& name) { +GlobalTypeVar ModuleNode::GetGlobalTypeVar(const std::string& name) const { auto it = global_type_var_map_.find(name); CHECK(it != global_type_var_map_.end()) << "Cannot find global type var " << name << " in the Module"; @@ -137,26 +133,26 @@ void ModuleNode::Remove(const GlobalVar& var) { gvar_node->data.erase(var->name_hint); } -Function ModuleNode::Lookup(const GlobalVar& var) { +Function ModuleNode::Lookup(const GlobalVar& var) const { auto it = functions.find(var); CHECK(it != functions.end()) << "There is no definition of " << var->name_hint; return (*it).second; } -Function ModuleNode::Lookup(const std::string& name) { +Function ModuleNode::Lookup(const std::string& name) const { GlobalVar id = this->GetGlobalVar(name); return this->Lookup(id); } -TypeData ModuleNode::LookupDef(const GlobalTypeVar& var) { +TypeData ModuleNode::LookupDef(const GlobalTypeVar& var) const { auto it = type_definitions.find(var); CHECK(it != 
type_definitions.end()) << "There is no definition of " << var->var->name_hint; return (*it).second; } -TypeData ModuleNode::LookupDef(const std::string& name) { +TypeData ModuleNode::LookupDef(const std::string& name) const { GlobalTypeVar id = this->GetGlobalTypeVar(name); return this->LookupDef(id); } diff --git a/src/relay/ir/op.cc b/src/relay/ir/op.cc index 4a23f59f9637..b4303e7ac6b1 100644 --- a/src/relay/ir/op.cc +++ b/src/relay/ir/op.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -54,8 +54,8 @@ struct OpManager { std::vector frontend_funcs; // get singleton of the op manager static OpManager* Global() { - static OpManager inst; - return &inst; + static OpManager* inst = new OpManager(); + return inst; } }; diff --git a/src/relay/op/algorithm/sort.cc b/src/relay/op/algorithm/argsort.cc similarity index 94% rename from src/relay/op/algorithm/sort.cc rename to src/relay/op/algorithm/argsort.cc index 5777b79699b1..31aa88808a23 100644 --- a/src/relay/op/algorithm/sort.cc +++ b/src/relay/op/algorithm/argsort.cc @@ -18,9 +18,9 @@ */ /*! - * Copyright (c) 2018 by Contributors - * \file nms.cc - * \brief Non-maximum suppression operators + * Copyright (c) 2019 by Contributors + * \file argsort.cc + * \brief Argsort operators */ #include #include @@ -44,7 +44,6 @@ bool ArgsortRel(const Array& types, << types[0]; return false; } - CHECK_EQ(param->dtype, Float(32)); reporter->Assign(types[1], TensorTypeNode::make(data->shape, param->dtype)); return true; } @@ -74,5 +73,6 @@ input array along the given axis. 
.add_argument("data", "Tensor", "Input data.") .set_support_level(6) .add_type_rel("Argsort", ArgsortRel); + } // namespace relay } // namespace tvm diff --git a/src/relay/op/algorithm/topk.cc b/src/relay/op/algorithm/topk.cc new file mode 100644 index 000000000000..c88e2c3ea007 --- /dev/null +++ b/src/relay/op/algorithm/topk.cc @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file topk.cc + * \brief TopK operators + */ +#include +#include + +namespace tvm { +namespace relay { + +TVM_REGISTER_NODE_TYPE(TopKAttrs); + +bool TopKRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + // `types` contains: [data, result] + const TopKAttrs* param = attrs.as(); + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + CHECK(data); + int ndim = data->shape.size(); + int axis = param->axis; + if (axis < 0) { + axis += ndim; + } + CHECK(axis >= 0 && axis < ndim); + Array out_shape; + for (int i = 0; i < ndim; ++i) { + if (i != axis || param->k < 1) { + out_shape.push_back(data->shape[i]); + } else { + out_shape.push_back(param->k); + } + } + auto values_ty = TensorTypeNode::make(out_shape, data->dtype); + auto indices_ty = TensorTypeNode::make(out_shape, param->dtype); + if (param->ret_type == "both") { + reporter->Assign(types[1], TupleTypeNode::make({values_ty, indices_ty})); + } else if (param->ret_type == "values") { + reporter->Assign(types[1], values_ty); + } else if (param->ret_type == "indices") { + reporter->Assign(types[1], indices_ty); + } else { + LOG(FATAL) << "Unsupported ret type: " << param->ret_type; + } + return true; +} + +Expr MakeTopK(Expr data, + int k, + int axis, + std::string ret_type, + bool is_ascend, + DataType dtype) { + auto attrs = make_node(); + attrs->k = k; + attrs->axis = axis; + attrs->ret_type = ret_type; + attrs->is_ascend = is_ascend; + attrs->dtype = dtype; + static const Op& op = Op::Get("topk"); + return CallNode::make(op, {data}, Attrs(attrs), {}); +} + + +TVM_REGISTER_API("relay.op._make.topk") +.set_body_typed(MakeTopK); + +RELAY_REGISTER_OP("topk") +.describe(R"doc(Get the top k elements in an input tensor along the given axis. 
+)doc" TVM_ADD_FILELINE) +.set_num_inputs(1) +.set_attrs_type_key("relay.attrs.TopKAttrs") +.add_argument("data", "Tensor", "Input data.") +.set_support_level(6) +.add_type_rel("TopK", TopKRel); + +} // namespace relay +} // namespace tvm + diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc index 4dd763b45654..44c9f89aa9e7 100644 --- a/src/relay/op/nn/pooling.cc +++ b/src/relay/op/nn/pooling.cc @@ -70,7 +70,8 @@ bool Pool2DRel(const Array& types, CHECK_EQ(types.size(), 2); const auto* data = types[0].as(); - CHECK(data != nullptr); + if (data == nullptr) return false; + const auto dshape = data->shape; CHECK_GE(dshape.size(), 2U) << "Pool2D only support input >= 2-D: input must have height and width"; diff --git a/src/relay/op/tensor/reduce.cc b/src/relay/op/tensor/reduce.cc index a4ebd1e8d050..647e4d0f4f90 100644 --- a/src/relay/op/tensor/reduce.cc +++ b/src/relay/op/tensor/reduce.cc @@ -355,6 +355,43 @@ Example:: .set_attr("TOpPattern", kCommReduce); +Array AllCompute(const Attrs& attrs, + const Array& inputs, + const Type& out_type, + const Target& target) { + return ReduceCompute(attrs, inputs, out_type, target, topi::all); +} + + +RELAY_REGISTER_REDUCE_OP("all") +.describe(R"code(Computes the logical AND of boolean array elements over given axes. 
+ +Example:: + + data = [[[ True, True, True], + [ True, True, True], + [False, True, False]], + [[ True, False, False], + [ True, True, False], + [False, True, True]]] + + all(data, axis=1) + [[False, True, False], + [False, False, False]] + + all(data, axis=0) + [[ True, False, False], + [ True, True, False], + [False, True, False]] + +)code" TVM_ADD_FILELINE) +.set_attrs_type_key("relay.attrs.ReduceAttrs") +.set_support_level(4) +.add_type_rel("Reduce", ReduceRel) +.set_attr("FTVMCompute", AllCompute) +.set_attr("TOpPattern", kCommReduce); + + Array MaxCompute(const Attrs& attrs, const Array& inputs, const Type& out_type, diff --git a/src/relay/op/type_relations.cc b/src/relay/op/type_relations.cc index 16d09c46dfa2..5b147a489b44 100644 --- a/src/relay/op/type_relations.cc +++ b/src/relay/op/type_relations.cc @@ -108,8 +108,8 @@ bool BroadcastRel(const Array& types, const Attrs& attrs, const TypeReporter& reporter) { CHECK_EQ(types.size(), 3); - DLOG(INFO) << "In1:" << types[0] << ",In2:" << types[1] - << ",Out:" << types[2] << std::endl; + // DLOG(INFO) << "In1:" << types[0] << ",In2:" << types[1] + // << ",Out:" << types[2] << std::endl; if (auto t0 = ToTensorType(types[0])) { if (auto t1 = ToTensorType(types[1])) { CHECK_EQ(t0->dtype, t1->dtype); @@ -126,8 +126,8 @@ bool BroadcastCompRel(const Array& types, const Attrs& attrs, const TypeReporter& reporter) { CHECK_EQ(types.size(), 3); - DLOG(INFO) << "In1:" << types[0] << ",In2:" << types[1] - << ",Out:" << types[2] << std::endl; + // DLOG(INFO) << "In1:" << types[0] << ",In2:" << types[1] + // << ",Out:" << types[2] << std::endl; if (auto t0 = ToTensorType(types[0])) { if (auto t1 = ToTensorType(types[1])) { CHECK_EQ(t0->dtype, t1->dtype); diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc index 2e5661cdc4dc..c0160e7d7128 100644 --- a/src/relay/op/vision/nms.cc +++ b/src/relay/op/vision/nms.cc @@ -50,9 +50,13 @@ bool GetValidCountRel(const Array& types, } Expr MakeGetValidCounts(Expr data, 
- double score_threshold) { + double score_threshold, + int id_index, + int score_index) { auto attrs = make_node(); attrs->score_threshold = score_threshold; + attrs->id_index = id_index; + attrs->score_index = score_index; static const Op& op = Op::Get("vision.get_valid_counts"); return CallNode::make(op, {data}, Attrs(attrs), {}); } diff --git a/src/relay/pass/alter_op_layout.cc b/src/relay/pass/alter_op_layout.cc index f51c201d0b2a..d623393049a6 100644 --- a/src/relay/pass/alter_op_layout.cc +++ b/src/relay/pass/alter_op_layout.cc @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -338,17 +339,35 @@ Expr AlterOpLayoutRewrite(const Call &ref_call, // Limiations: // 1. the altered op should have the same number of arguments as the previous one // 2. do not support nested tuple arguments -TVM_REGISTER_API("relay._ir_pass.AlterOpLayout") -.set_body([](TVMArgs args, TVMRetValue *ret) { +Expr AlterOpLayout(const Expr& expr) { TransformMemorizer transformMemorizer(make_node()); auto fcontext = [&](const Call& call) -> NodeRef{ return transformMemorizer; }; - *ret = ForwardRewrite(args[0], AlterOpLayoutRewrite, fcontext); -}); + return ForwardRewrite(expr, AlterOpLayoutRewrite, fcontext); +} + +TVM_REGISTER_API("relay._ir_pass.AlterOpLayout") +.set_body_typed(AlterOpLayout); } // namespace alter_op_layout +namespace transform { + +Pass AlterOpLayout() { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast(relay::alter_op_layout::AlterOpLayout(f)); + }; + return CreateFunctionPass(pass_func, 3, "AlterOpLayout", + {ir::StringImm::make("InferType")}); +} + +TVM_REGISTER_API("relay._transform.AlterOpLayout") +.set_body_typed(AlterOpLayout); + +} // namespace transform + } // namespace relay } // namespace tvm diff --git a/src/relay/pass/canonicalize_cast.cc b/src/relay/pass/canonicalize_cast.cc new file mode 100644 index 000000000000..99f4a7f44e7e --- /dev/null +++ 
b/src/relay/pass/canonicalize_cast.cc @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file canonicalize_cast.cc + * \brief Canonicalize cast expressions to make operator fusion more efficient. + */ +#include +#include +#include +#include +#include "pattern_util.h" +#include "pass_util.h" + +namespace tvm { +namespace relay { + +// This pass finds upcast that is referred by multiple elemwise/broadcast operators, and creates a +// copy of it in each branch such that after fusion the previous function have output with fewer +// bits. +// +// Consider the following example: +// \code +// def @main(x: int8) { +// %1 = cast(%x, f32) +// %2 = exp(%1) +// %3 = log(%1) +// (%3, 4) +// } +// \endcode +// +// We would like to prevent sharing of the cast expression such that operator fusion can produce +// more efficient result as below. 
+// \code +// def @main(x: int8) { +// %1 = fn (%p1: i8) { +// exp(cast(%p1, f32) +// } +// %3 = %1(%x) +// %2 = fn (%p1: i8) { +// log(cast(%p1, f32) +// } +// %4 = %2(%x) +// (%3, 4) +// } +// \endcode +class CastCanonicalizer : public ExprMutator { + public: + Expr VisitExpr_(const CallNode* call) { + static auto fpattern = Op::GetAttr("TOpPattern"); + + if (const OpNode* opnode = call->op.as()) { + auto pattern = fpattern[GetRef(opnode)]; + if (pattern <= kBroadcast) { + Array call_args = call->args; + bool unchanged = true; + for (size_t i = 0; i < call_args.size(); ++i) { + Expr arg = call_args[i]; + Expr new_arg = GetNewCallArg(arg); + if (!arg.same_as(new_arg)) { + call_args.Set(i, new_arg); + unchanged = false; + } + } + if (unchanged) { + return GetRef(call); + } + return CallNode::make(call->op, call_args, call->attrs, call->type_args); + } + } + + Expr new_expr = ExprMutator::VisitExpr_(call); + return new_expr; + } + + private: + std::unordered_map ref_counter_; + + Expr GetNewCallArg(const Expr& e) { + // if e is a upcast and ref count > 1, create an copy; otherwise call the default visitor + + static auto& cast = Op::Get("cast"); + Expr new_expr = this->VisitExpr(e); + + if (const CallNode* call = e.as()) { + if (call->op.same_as(cast)) { + auto attrs = call->attrs.as(); + const auto* from_type = call->args[0]->type_as(); + CHECK(from_type); + + if (from_type->dtype.bits() < attrs->dtype.bits()) { + if (++ref_counter_[call] > 1) { + const CallNode* new_call = new_expr.as(); + CHECK(new_call); + CHECK(new_call->op.same_as(cast)); + return CallNode::make(new_call->op, new_call->args, new_call->attrs, + new_call->type_args); + } + } + } + } + return new_expr; + } +}; + +Expr CanonicalizeCast(const Expr& e) { + return CastCanonicalizer().Mutate(e); +} + +namespace transform { + +Pass CanonicalizeCast() { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast(CanonicalizeCast(f)); + }; + return 
CreateFunctionPass(pass_func, 3, "CanonicalizeCast", + {ir::StringImm::make("InferType")}); +} + +TVM_REGISTER_API("relay._transform.CanonicalizeCast") +.set_body_typed(CanonicalizeCast); + +} // namespace transform + +} // namespace relay +} // namespace tvm diff --git a/src/relay/pass/canonicalize_ops.cc b/src/relay/pass/canonicalize_ops.cc index 9a4602750195..ff9e2304a3bc 100644 --- a/src/relay/pass/canonicalize_ops.cc +++ b/src/relay/pass/canonicalize_ops.cc @@ -26,6 +26,7 @@ #include #include #include +#include #include "pattern_util.h" namespace tvm { @@ -63,5 +64,21 @@ Expr CanonicalizeOps(const Expr& e) { TVM_REGISTER_API("relay._ir_pass.canonicalize_ops") .set_body_typed(CanonicalizeOps); +namespace transform { + +Pass CanonicalizeOps() { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast(CanonicalizeOps(f)); + }; + return CreateFunctionPass(pass_func, 3, "CanonicalizeOps", + {ir::StringImm::make("InferType")}); +} + +TVM_REGISTER_API("relay._transform.CanonicalizeOps") +.set_body_typed(CanonicalizeOps); + +} // namespace transform + } // namespace relay } // namespace tvm diff --git a/src/relay/pass/combine_parallel_conv2d.cc b/src/relay/pass/combine_parallel_conv2d.cc index 7e76322d5a2a..c95c1ddf8e16 100644 --- a/src/relay/pass/combine_parallel_conv2d.cc +++ b/src/relay/pass/combine_parallel_conv2d.cc @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include "./expr_subst.h" @@ -357,5 +358,21 @@ Expr CombineParallelConv2D(const Expr& expr, uint64_t min_num_branches) { TVM_REGISTER_API("relay._ir_pass.CombineParallelConv2D") .set_body_typed(CombineParallelConv2D); +namespace transform { + +Pass CombineParallelConv2D(uint64_t min_num_branches) { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast(CombineParallelConv2D(f, min_num_branches)); + }; + return CreateFunctionPass(pass_func, 4, "CombineParallelConv2d", + 
{ir::StringImm::make("InferType")}); +} + +TVM_REGISTER_API("relay._transform.CombineParallelConv2D") +.set_body_typed(CombineParallelConv2D); + +} // namespace transform + } // namespace relay } // namespace tvm diff --git a/src/relay/pass/dead_code.cc b/src/relay/pass/dead_code.cc index 533c21429995..7e186f80df92 100644 --- a/src/relay/pass/dead_code.cc +++ b/src/relay/pass/dead_code.cc @@ -38,10 +38,10 @@ namespace relay { // calculate the dependency graph from expression class CalcDep : private ExprVisitor { public: - static Expr Eliminate(const Expr& e) { + static Expr Eliminate(const Expr& e, bool inline_once) { CalcDep cd; cd.Calculate(e); - Eliminator el(cd.expr_map_, cd.use_map_, cd.letrec_set_); + Eliminator el(cd.expr_map_, cd.use_map_, cd.letrec_set_, inline_once); return el(e); } @@ -117,15 +117,23 @@ class CalcDep : private ExprVisitor { VarMap expr_map_; VarMap use_map_; VarSet letrec_set_; + bool inline_once_; explicit Eliminator(const VarMap& expr_map, const VarMap& use_map, - const VarSet& letrec_set) : - expr_map_(expr_map), use_map_(use_map), letrec_set_(letrec_set) { } + const VarSet& letrec_set, + bool inline_once) : + expr_map_(expr_map), use_map_(use_map), letrec_set_(letrec_set), inline_once_(inline_once) { } friend CalcDep; bool HasLet(const Var& v) { - // TODO(@jroesch): MK fix me - return (use_map_[v] > 0 || (use_map_[v] != 0 && letrec_set_.count(v) != 0)); + switch (use_map_[v]) { + case 0: + return false; + case 1: + return letrec_set_.count(v) > 0 || !inline_once_; + default: + return true; + } } Expr VisitExpr_(const VarNode* op) final { @@ -144,12 +152,27 @@ class CalcDep : private ExprVisitor { }; }; -Expr DeadCodeElimination(const Expr& e) { - return CalcDep::Eliminate(e); +Expr DeadCodeElimination(const Expr& e, bool inline_once) { + return CalcDep::Eliminate(e, inline_once); } TVM_REGISTER_API("relay._ir_pass.dead_code_elimination") .set_body_typed(DeadCodeElimination); +namespace transform { + +Pass DeadCodeElimination(bool 
inline_once) { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast(DeadCodeElimination(f, inline_once)); + }; + return CreateFunctionPass(pass_func, 1, "DeadCodeElimination", {}); +} + +TVM_REGISTER_API("relay._transform.DeadCodeElimination") +.set_body_typed(DeadCodeElimination); + +} // namespace transform + } // namespace relay } // namespace tvm diff --git a/src/relay/pass/device_annotation.cc b/src/relay/pass/device_annotation.cc index 0139cc912849..8eeb493f1feb 100644 --- a/src/relay/pass/device_annotation.cc +++ b/src/relay/pass/device_annotation.cc @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -67,6 +68,7 @@ class ValidateAnnotation : private ExprVisitor { private: void VisitExpr_(const CallNode* call_node) final { + ExprVisitor::VisitExpr_(call_node); if (IsOnDeviceNode(call_node)) { int device_type = GetDeviceId(call_node); if (annotation_map_.count(call_node)) { @@ -85,7 +87,14 @@ class ValidateAnnotation : private ExprVisitor { annotation_map_.insert({node, GetDeviceId(call_node)}); } } - ExprVisitor::VisitExpr_(call_node); + } + + void VisitExpr_(const TupleGetItemNode* get_elem) final { + ExprVisitor::VisitExpr_(get_elem); + const auto* tn = get_elem->tuple.operator->(); + if (annotation_map_.count(tn)) { + annotation_map_.insert({get_elem, annotation_map_.at(tn)}); + } } /* @@ -176,7 +185,11 @@ class RewriteAnnotation : public ExprMutator { } Expr VisitExpr_(const CallNode* call_node) final { - if (IsOnDeviceNode(call_node) || IsDeviceCopyNode(call_node)) { + if (IsOnDeviceNode(call_node)) { + return this->VisitExpr(call_node->args[0]); + } + + if (IsDeviceCopyNode(call_node)) { return ExprMutator::VisitExpr_(call_node); } @@ -248,7 +261,9 @@ class RewriteAnnotation : public ExprMutator { if (src->is_type() || src->is_type()) { return annotation_map_.at(dst) != fallback_device_; } else { - return false; + // There shouldn't be any copy nodes between var/constant and 
another + // expression. + return !(src->is_type() || src->is_type()); } } else { return false; @@ -358,6 +373,9 @@ class DeviceInfo { public: void Visit(const Expr& expr) { if (const auto* fn = expr.as()) { + for (const auto& param : fn->params) { + this->VisitExpr(param); + } this->VisitExpr(fn->body); } else { this->VisitExpr(expr); @@ -402,7 +420,7 @@ class DeviceInfo { } void VisitExpr_(const VarNode* vn) final { - post_dfs_order_.push_back(std::make_pair(vn, has_copy_)); + post_dfs_order_.push_back(std::make_pair(vn, has_copy_)); } void VisitExpr_(const LetNode* ln) final { @@ -485,7 +503,52 @@ class DeviceInfo { Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device) { RewriteAnnotation rewrote = RewriteAnnotation(); - return rewrote.Rewrite(expr, fallback_device); + Expr new_expr = rewrote.Rewrite(expr, fallback_device); + + // Remove OnDevice operators. Note that these operators are only present at the + // leaves after annotation. Therefore, we can simply reconstruct the + // Function/Expr by removing them directly. 
+ if (const FunctionNode* fn = new_expr.as()) { + auto params = fn->params; + auto body = fn->body; + std::vector new_body; + if (const TupleNode* tuple = body.as()) { + for (const auto& field : tuple->fields) { + if (!IsOnDeviceNode(field.operator->())) { + new_body.push_back(field); + } + } + CHECK_GT(new_body.size(), 0U); + if (new_body.size() == 1) { + return FunctionNode::make(params, new_body[0], Type(nullptr), + fn->type_params, fn->attrs); + } else if (tuple->fields.size() == new_body.size()) { + return new_expr; + } else { + Tuple tuple_body = TupleNode::make(new_body); + return FunctionNode::make(params, tuple_body, Type(nullptr), + fn->type_params, fn->attrs); + } + } else { + return new_expr; + } + } else if (const TupleNode* tuple = new_expr.as()) { + std::vector new_fields; + for (const auto& field : tuple->fields) { + if (!IsOnDeviceNode(field.operator->())) { + new_fields.push_back(field); + } + } + CHECK_GT(new_fields.size(), 0U); + if (tuple->fields.size() == new_fields.size()) { + return new_fields.size() == 1 ? new_fields[0] : new_expr; + } else { + return new_fields.size() == 1 ? 
new_fields[0] + : TupleNode::make(new_fields); + } + } else { + return new_expr; + } } Map CollectDeviceInfo(const Expr& expr) { @@ -505,6 +568,21 @@ TVM_REGISTER_API("relay._ir_pass.RewriteDeviceAnnotation") TVM_REGISTER_API("relay._ir_pass.CollectDeviceAnnotationOps") .set_body_typed(CollectDeviceAnnotationOps); +namespace transform { + +Pass RewriteAnnotatedOps(int fallback_device) { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast(RewriteAnnotatedOps(f, fallback_device)); + }; + return CreateFunctionPass(pass_func, 1, "RewriteAnnotatedOps", + {ir::StringImm::make("InferType")}); +} + +TVM_REGISTER_API("relay._transform.RewriteDeviceAnnotation") +.set_body_typed(RewriteAnnotatedOps); + +} // namespace transform + } // namespace relay } // namespace tvm - diff --git a/src/relay/pass/eliminate_common_subexpr.cc b/src/relay/pass/eliminate_common_subexpr.cc index f8432f671855..883681adcaf4 100644 --- a/src/relay/pass/eliminate_common_subexpr.cc +++ b/src/relay/pass/eliminate_common_subexpr.cc @@ -29,6 +29,7 @@ */ #include #include +#include #include #include "./pattern_util.h" @@ -87,5 +88,21 @@ Expr EliminateCommonSubexpr(const Expr& expr, PackedFunc callback) { TVM_REGISTER_API("relay._ir_pass.eliminate_common_subexpr") .set_body_typed(EliminateCommonSubexpr); +namespace transform { + +Pass EliminateCommonSubexpr(PackedFunc fskip) { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast(EliminateCommonSubexpr(f, fskip)); + }; + return CreateFunctionPass(pass_func, 3, "EliminateCommonSubexpr", + {ir::StringImm::make("InferType")}); +} + +TVM_REGISTER_API("relay._transform.EliminateCommonSubexpr") +.set_body_typed(EliminateCommonSubexpr); + +} // namespace transform + } // namespace relay } // namespace tvm diff --git a/src/relay/pass/eta_expand.cc b/src/relay/pass/eta_expand.cc index 0193b9afc62e..3139d41d6393 100644 --- a/src/relay/pass/eta_expand.cc +++ 
b/src/relay/pass/eta_expand.cc @@ -67,5 +67,20 @@ Expr EtaExpand(const Expr& e, const Module& mod) { TVM_REGISTER_API("relay._ir_pass.eta_expand").set_body_typed(EtaExpand); +namespace transform { + +Pass EtaExpand() { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast(EtaExpand(f, m)); + }; + return CreateFunctionPass(pass_func, 1, "EtaExpand", {}); +} + +TVM_REGISTER_API("relay._transform.EtaExpand") +.set_body_typed(EtaExpand); + +} // namespace transform + } // namespace relay } // namespace tvm diff --git a/src/relay/pass/fold_constant.cc b/src/relay/pass/fold_constant.cc index 45aa449e72ab..815407038b08 100644 --- a/src/relay/pass/fold_constant.cc +++ b/src/relay/pass/fold_constant.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -26,6 +26,7 @@ #include #include #include +#include namespace tvm { namespace relay { @@ -203,10 +204,10 @@ Expr FoldConstant(const Expr& expr) { DLContext ctx; ctx.device_type = kDLCPU; ctx.device_id = 0; - Target target = Target::create("llvm"); + Target target = Target::Create("llvm"); // use a fresh build context // in case we are already in a build context. 
- BuildConfigContext fresh_build_ctx(build_config()); + With fresh_build_ctx(BuildConfig::Create()); return ConstantFolder(CreateInterpreter( Module(nullptr), ctx, target)).Mutate(expr); @@ -215,5 +216,20 @@ Expr FoldConstant(const Expr& expr) { TVM_REGISTER_API("relay._ir_pass.FoldConstant") .set_body_typed(FoldConstant); +namespace transform { + +Pass FoldConstant() { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast(FoldConstant(f)); + }; + return CreateFunctionPass(pass_func, 2, "FoldConstant", {}); +} + +TVM_REGISTER_API("relay._transform.FoldConstant") +.set_body_typed(FoldConstant); + +} // namespace transform + } // namespace relay } // namespace tvm diff --git a/src/relay/pass/fold_scale_axis.cc b/src/relay/pass/fold_scale_axis.cc index c738e3e3b731..53089807ace5 100644 --- a/src/relay/pass/fold_scale_axis.cc +++ b/src/relay/pass/fold_scale_axis.cc @@ -29,6 +29,7 @@ #include #include #include +#include #include "pattern_util.h" #include "pass_util.h" @@ -530,7 +531,7 @@ RELAY_REGISTER_OP("nn.conv2d") .set_attr("FScaleAxisForwardRewrite", Conv2DForwardRewrite); -Expr ForwardFoldScaleAxis(Expr data) { +Expr ForwardFoldScaleAxis(const Expr& data) { auto message = ForwardPrep().Prepare(data); auto fcontext = [&](const Call& call) -> NodeRef{ auto it = message.find(call.get()); @@ -942,7 +943,7 @@ RELAY_REGISTER_OP("nn.conv2d") RELAY_REGISTER_OP("nn.conv2d") .set_attr("FScaleAxisBackwardTransform", Conv2DBackwardTransform); -Expr BackwardFoldScaleAxis(Expr data) { +Expr BackwardFoldScaleAxis(const Expr& data) { return make_node()->Fold(data); } @@ -950,5 +951,42 @@ TVM_REGISTER_API("relay._ir_pass.backward_fold_scale_axis") .set_body_typed(BackwardFoldScaleAxis); } // namespace fold_scale_axis + +namespace transform { + +Pass ForwardFoldScaleAxis() { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast( + relay::fold_scale_axis::ForwardFoldScaleAxis(f)); + }; 
+ return CreateFunctionPass(pass_func, 3, "ForwardFoldScaleAxis", + {ir::StringImm::make("InferType")}); +} + +Pass BackwardFoldScaleAxis() { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast( + relay::fold_scale_axis::BackwardFoldScaleAxis(f)); + }; + return CreateFunctionPass(pass_func, 3, "BackwardFoldScaleAxis", + {ir::StringImm::make("InferType")}); +} + +Pass FoldScaleAxis() { + // FoldScaleAxis pass contains the following three passes. Therefore, we can + // register it as a sequential pass. + Pass pass = Sequential( + {BackwardFoldScaleAxis(), ForwardFoldScaleAxis(), FoldConstant()}, + "FoldScaleAxis"); + return pass; +} + +TVM_REGISTER_API("relay._transform.FoldScaleAxis") +.set_body_typed(FoldScaleAxis); + +} // namespace transform + } // namespace relay } // namespace tvm diff --git a/src/relay/pass/forward_rewrite.cc b/src/relay/pass/forward_rewrite.cc index 88a2d669da9f..8ad61270e33a 100644 --- a/src/relay/pass/forward_rewrite.cc +++ b/src/relay/pass/forward_rewrite.cc @@ -206,6 +206,37 @@ Expr ForwardRewrite(const Expr& expr, return ForwardRewriter(&rewrite_func, fcontext, fmulti_ref_trigger).Rewrite(expr); } +namespace transform { + +using std::function; + +Pass ForwardRewrite(const std::string& rewrite_map_attr_name, + function fcontext, + function fmulti_ref_trigger) { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast(ForwardRewrite(f, + rewrite_map_attr_name, + fcontext, + fmulti_ref_trigger)); + }; + return CreateFunctionPass(pass_func, 1, "ForwardRewrite", {}); +} + +Pass ForwardRewrite(const FForwardRewrite& rewrite_func, + function fcontext, + function fmulti_ref_trigger) { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast(ForwardRewrite(f, + rewrite_func, + fcontext, + fmulti_ref_trigger)); + }; + return CreateFunctionPass(pass_func, 1, "ForwardRewriteFunc", {}); +} + +} // namespace 
transform } // namespace relay } // namespace tvm diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index d0d0cab22432..9f940e54953b 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -29,6 +29,7 @@ #include #include #include +#include #include "./pattern_util.h" #include "../../common/arena.h" @@ -964,5 +965,23 @@ Expr FuseOps(const Expr& expr, int fuse_opt_level, const Module& module) { TVM_REGISTER_API("relay._ir_pass.FuseOps") .set_body_typed(FuseOps); + +namespace transform { + +Pass FuseOps(int fuse_opt_level) { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + int opt_level = fuse_opt_level == -1 ? pc->opt_level : fuse_opt_level; + return Downcast(FuseOps(f, opt_level, m)); + }; + return CreateFunctionPass(pass_func, 1, "FuseOps", + {ir::StringImm::make("InferType")}); +} + +TVM_REGISTER_API("relay._transform.FuseOps") +.set_body_typed(FuseOps); + +} // namespace transform + } // namespace relay } // namespace tvm diff --git a/src/relay/pass/gradient.cc b/src/relay/pass/gradient.cc index 5c5ea01ac2f3..91072b31a910 100644 --- a/src/relay/pass/gradient.cc +++ b/src/relay/pass/gradient.cc @@ -279,7 +279,7 @@ struct ReverseAD : ExprMutator { } std::vector orig_args; for (const auto& arg : args) { - orig_args.push_back(GetField(VisitExpr(arg), 0)); + orig_args.push_back(GetField(arg, 0)); } Expr orig = CallNode::make(op->op, orig_args, op->attrs, op->type_args); Var orig_var = ll->Push(orig); diff --git a/src/relay/pass/mac_count.cc b/src/relay/pass/mac_count.cc index c9ee4eec0337..3d77fabe6fe9 100644 --- a/src/relay/pass/mac_count.cc +++ b/src/relay/pass/mac_count.cc @@ -30,7 +30,9 @@ #include #include #include +#include #include +#include "pattern_util.h" namespace tvm { namespace relay { @@ -65,7 +67,7 @@ int64_t ConvMacCount(const Call& call_node) { } Array args = call_node->args; CHECK(args.size() == 2) - << "The number of input arguments of a CONV 2D node should be 2."; + << 
"The number of input arguments of a CONV 2D node should be 2."; const auto* conv_2d_attr = call_node->attrs.as(); const auto* data_type = args[0]->checked_type().as(); Array data_shape = data_type->shape; @@ -73,18 +75,21 @@ int64_t ConvMacCount(const Call& call_node) { int32_t C_ind = Layout(data_layout).IndexOf(LayoutAxis::Get('C')); int32_t c_ind = Layout(data_layout).IndexOf(LayoutAxis::Get('c')); CHECK(C_ind != -1) - << "There is no input channel dimension."; + << "There is no input channel dimension."; int64_t input_channel = static_cast(data_shape[C_ind].as()->value); if (c_ind != -1) input_channel *= static_cast(data_shape[c_ind].as()->value); Array kernel_size = conv_2d_attr->kernel_size; CHECK(kernel_size.size() == 2) - << "The dimension of the kernel size in Conv 2D should be 2."; + << "The dimension of the kernel in Conv 2D should be 2."; const auto* expr = call_node->checked_type().as(); Array output_tensor = expr->shape; CHECK(output_tensor.size() == 4 || output_tensor.size() == 5) - << "The dimension of the output tensor in Conv 2D should be 4 or 5."; - int64_t count = input_channel * GetCartesianProd(output_tensor) * GetCartesianProd(kernel_size); + << "The dimension of the output tensor in Conv 2D should be 4 or 5."; + int64_t count = GetCartesianProd(output_tensor) * GetCartesianProd(kernel_size); + CHECK_EQ(input_channel % conv_2d_attr->groups, 0) + << "The number of input channels is not divisble by groups."; + count *= input_channel/conv_2d_attr->groups; return count; } diff --git a/src/relay/pass/match_exhaustion.cc b/src/relay/pass/match_exhaustion.cc new file mode 100644 index 000000000000..173d6eacf528 --- /dev/null +++ b/src/relay/pass/match_exhaustion.cc @@ -0,0 +1,250 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file match_exhaustion.cc + * \brief Checking Relay match expression exhaustiveness. + * + * This file implements a function that checks whether a match + * expression is exhaustive, that is, whether a given match clause + * matches every possible case. This is important for ensuring + * code correctness, since hitting an unmatched case results in a + * dynamic error unless exhaustiveness is checked in advance. + */ +#include +#include +#include +#include +#include +#include + +namespace tvm { +namespace relay { + +/*! \brief Possible pattern match results */ +enum MatchResult : int { + kMatch = 0, // pattern matches + kClash = 1, // pattern conflicts + kUnspecified = 2, // ambiguous: candidate needs more constructors specified +}; + +class CandidateChecker : public PatternFunctor { + public: + explicit CandidateChecker() {} + + MatchResult Check(const Pattern& pat, const Pattern& candidate) { + return this->VisitPattern(pat, candidate); + } + + // for a constructor pattern, we must ensure that the candidate is + // a ConstructorPattern, that it has the same constructor, and + // that its fields match the subpatterns. 
+ MatchResult VisitPattern_(const PatternConstructorNode* op, const Pattern& cand) override { + auto* ctor_cand = cand.as(); + // attempting to match non-constructor to constructor pattern: need to specify + if (ctor_cand == nullptr) { + return MatchResult::kUnspecified; + } + + // check that constructors match + if (!op->constructor.same_as(ctor_cand->constructor)) { + return MatchResult::kClash; + } + + // now check that subpatterns match + CHECK(op->patterns.size() == ctor_cand->patterns.size()); + bool unspecified = false; + for (size_t i = 0; i < op->patterns.size(); i++) { + MatchResult submatch = this->Check(op->patterns[i], ctor_cand->patterns[i]); + // if we have a clash anywhere, then we can return clash + if (submatch == MatchResult::kClash) { + return MatchResult::kClash; + } + if (submatch == MatchResult::kUnspecified) { + unspecified = true; + } + } + // only return unspecified if we have ruled out a clash + if (unspecified) { + return MatchResult::kUnspecified; + } + return MatchResult::kMatch; + } + + // wildcard and var patterns always match + MatchResult VisitPattern_(const PatternWildcardNode*, const Pattern&) override { + return MatchResult::kMatch; + } + + MatchResult VisitPattern_(const PatternVarNode*, const Pattern&) override { + return MatchResult::kMatch; + } +}; + +// Returns list of arrays corresponding to Cartesian product of input list +Array> CartesianProduct(Array> fields) { + CHECK_NE(fields.size(), 0); + Array field_vals = fields[fields.size() - 1]; + Array> ret; + + // base case: this is the last field left + if (fields.size() == 1) { + for (auto val : field_vals) { + ret.push_back(Array{val}); + } + return ret; + } + + // if we have more fields left, get the sub-candidates by getting + // their cartesian product and appending the elements here onto those + Array> remaining_fields; + for (size_t i = 0; i < fields.size() - 1; i++) { + remaining_fields.push_back(fields[i]); + } + Array> candidates = 
CartesianProduct(remaining_fields); + for (auto val : field_vals) { + for (auto candidate : candidates) { + candidate.push_back(val); + ret.push_back(candidate); + } + } + return ret; +} + +// Expands all wildcards in the candidate pattern once, using the pattern +// to decide which constructors to insert. Returns a list of all possible expansions. +Array ExpandWildcards(const Pattern& clause_pat, const Pattern& cand, + const Module& mod) { + auto ctor_cand = cand.as(); + PatternConstructor clause_ctor = Downcast(clause_pat); + auto gtv = Downcast(clause_ctor->constructor->belong_to); + + // for a wildcard node, create constructor nodes with wildcards for all args + if (!ctor_cand) { + TypeData td = mod->LookupDef(gtv); + // for each constructor add a candidate + Array ret; + for (auto constructor : td->constructors) { + Array args; + for (auto inp : constructor->inputs) { + args.push_back(PatternWildcardNode::make()); + } + ret.push_back(PatternConstructorNode::make(constructor, args)); + } + return ret; + } + + // for constructors, we will expand the wildcards in any field + // that is an ADT + Array> values_by_field; + for (size_t i = 0; i < ctor_cand->constructor->inputs.size(); i++) { + auto* subpattern = clause_ctor->patterns[i].as(); + // for non-ADT fields, we can only have a wildcard for the value + if (!subpattern) { + values_by_field.push_back({PatternWildcardNode::make()}); + continue; + } + + // otherwise, recursively expand + values_by_field.push_back(ExpandWildcards(GetRef(subpattern), + ctor_cand->patterns[i], mod)); + } + + // generate new candidates using a cartesian product + auto all_subfields = CartesianProduct(values_by_field); + Array ret; + for (auto subfields : all_subfields) { + ret.push_back(PatternConstructorNode::make(ctor_cand->constructor, subfields)); + } + return ret; +} + +/*! + * \brief Finds cases that the match expression does not catch, if any. 
+ * \return Returns a list of cases that are not handled by the match + * expression. + */ +Array UnmatchedCases(const Match& match, const Module& mod) { + /* algorithm: + * candidates = { Wildcard } + * while candidates not empty { + * cand = candidates.pop() + * for clause in clauses { + * if clause fails: next clause + * if clause matches candidate: next candidate + * if candidate is not specific enough: + * candidates += expand_possible_wildcards(cand) + * next candidate + * } + * failed_candidates += { cand } + * } + * return failed_candidates + */ + std::stack candidates; + candidates.push(PatternWildcardNode::make()); + CandidateChecker checker; + + Array failures; + + while (!candidates.empty()) { + Pattern cand = candidates.top(); + candidates.pop(); + + bool failure = true; + for (auto clause : match->clauses) { + // if the check fails, we move on to the next + MatchResult check = checker.Check(clause->lhs, cand); + if (check == MatchResult::kClash) { + continue; + } + + // either success or we need to generate more candidates; + // either way, we're done with this candidate + failure = false; + if (check == MatchResult::kUnspecified) { + auto new_candidates = ExpandWildcards(clause->lhs, cand, mod); + for (auto candidate : new_candidates) { + candidates.push(candidate); + } + } + break; + } + + if (failure) { + failures.push_back(cand); + } + } + + return failures; +} + +// expose for testing only +TVM_REGISTER_API("relay._ir_pass.unmatched_cases") +.set_body_typed(const Match&, + const Module&)>([](const Match& match, + const Module& mod_ref) { + Module call_mod = mod_ref; + if (!call_mod.defined()) { + call_mod = ModuleNode::make({}, {}); + } + return UnmatchedCases(match, call_mod); + }); +} // namespace relay +} // namespace tvm diff --git a/src/relay/pass/partial_eval.cc b/src/relay/pass/partial_eval.cc index 5349532ca697..f1ca573d3e0e 100644 --- a/src/relay/pass/partial_eval.cc +++ b/src/relay/pass/partial_eval.cc @@ -74,28 +74,19 @@ * * The 
partial evaluator makes several assumptions, so there is room for improvement: * - * 0: The partial evaluator treats global variables as opaque. - * Doing PartialEval on a module level will solve this. - * - * 1: The partial evaluator assume all functions as terminating. - * We need to has a max_expand parameter that shrink on every compile time evaluation, - * to make sure PE does not infinite loop. - * Additionally, we might add a termination analysis pass that lift this requirement - * for function that analysis found terminating. - * - * 2: Every time an unknown effect happened, we clear the whole store. + * 0: Every time an unknown effect happened, we clear the whole store. * It is too conservative: if a local reference is created (and do not get passed outside), * An unknown global function call/global reference write can not modify it. * We can pair PE with escape analysis/alias analysis. * - * 3: We assume all unknown code has effect. Doing effect analysis can make the store more precise. + * 1: We assume all unknown code has effect. Doing effect analysis can make the store more precise. * - * 4: When doing pattern matching, we can simplify the match even for dynamic case. + * 2: When doing pattern matching, we can simplify the match even for dynamic case. * Right now it is all or nothing: either a complete match, or the original dynamic code. * Instead, we can get a match tree, pair it with the data and evaluate it to a normal form. * We then can reify the result. * - * 5: Every time a function is called, it's code will get expanded and partially evaluated. + * 3: Every time a function is called, its code will get expanded and partially evaluated. * We can do a binding time analysis to cache the result and avoid re-partial evaluation. * * These assumptions do not affect the correctness of the algorithm, however. 
@@ -104,11 +95,13 @@ #include #include #include +#include "../ir/type_functor.h" #include "pass_util.h" #include "let_list.h" namespace tvm { namespace relay { +namespace partial_eval { using namespace runtime; @@ -132,6 +125,8 @@ struct VarEqual { } }; +Expr PostProcess(const Expr&); + /*! \brief The base container type of Relay values. */ class StaticNode : public RelayNode { public: @@ -150,10 +145,20 @@ class Static : public NodeRef { using ContainerType = StaticNode; }; +using Time = size_t; + struct PStaticNode : Node { + static Time time() { + static Time time_ = 0; + Time ret = time_; + time_++; + return ret; + } Static pstatic; // may be null Expr dynamic; - PStaticNode(const Static& pstatic, const Expr& dynamic) : pstatic(pstatic), dynamic(dynamic) { } + Time created_time; + PStaticNode(const Static& pstatic, const Expr& dynamic) : + pstatic(pstatic), dynamic(dynamic), created_time(time()) { } explicit PStaticNode(const Expr& dynamic) : PStaticNode(Static(), dynamic) { } TVM_DECLARE_NODE_TYPE_INFO(PStaticNode, Node); }; @@ -341,6 +346,7 @@ class Store { }; PStatic HasStatic(const Static& stat, const Expr& dynamic) { + CHECK(stat.defined()); return PStatic(make_node(stat, dynamic)); } @@ -375,23 +381,86 @@ DLContext CPUContext() { } FInterpreter CPUInterpreter() { - Target target = Target::create("llvm"); + Target target = Target::Create("llvm"); // use a fresh build context // in case we are already in a build context. - BuildConfigContext fresh_build_ctx(build_config()); + With fresh_build_ctx(BuildConfig::Create()); return CreateInterpreter(Module(nullptr), CPUContext(), target); } +bool IsAtomic(const Expr& e) { + return e.as() || e.as() || e.as() || e.as(); +} + +using FuncId = int; + +/*! + * \brief Annotate a function with a FuncId. 
+ */ +struct WithFuncIdAttrs : public tvm::AttrsNode { + FuncId fid; + + TVM_DECLARE_ATTRS(WithFuncIdAttrs, "relay.attrs.WithFuncIdAttrs") { + TVM_ATTR_FIELD(fid) + .describe("The FuncId that an function is annotated with.") + .set_default(-1); + } +}; + +TVM_REGISTER_NODE_TYPE(WithFuncIdAttrs); + +Op WithFuncIdOp() { + static const Op& op = Op::Get("annotation.with_funcid"); + return op; +} + +Expr MkWithFuncId(const Expr& expr, FuncId fid) { + auto attrs = make_node(); + attrs->fid = fid; + return CallNode::make(WithFuncIdOp(), {expr}, Attrs(attrs), {}); +} + +RELAY_REGISTER_OP("annotation.with_funcid") +.describe(R"code(Annotate a function with a funcid.)code" +TVM_ADD_FILELINE) +.set_num_inputs(1) +.add_argument("func", "Function", "The input data."); + +Expr StripWithFuncId(const Expr& e); + +Expr DeDup(const Expr& e); + +Function AsFunc(const Expr& e) { + if (e.as()) { + return Downcast(e); + } else if (const CallNode* c = e.as()) { + CHECK(c->op.same_as(WithFuncIdOp())); + CHECK_EQ(c->args.size(), 1); + return AsFunc(c->args[0]); + } else { + LOG(FATAL) << "Unknown case"; + throw; + } +} + class PartialEvaluator : public ExprFunctor, public PatternFunctor { public: - PartialEvaluator(const tvm::Array& free_vars) { + PartialEvaluator(const tvm::Array& free_vars, + const Module& mod) : + mod_(mod) { for (const Var& v : free_vars) { env_.Insert(v, NoStatic(v)); } } + PStatic VisitExpr(const Expr& e, LetList* ll) final { + PStatic ret = ExprFunctor::VisitExpr(e, ll); + CHECK(IsAtomic(ret->dynamic)) << ret->dynamic; + return ret; + } + PStatic VisitExpr_(const ConstantNode* op, LetList* ll) final { return HasStatic(MkSTensor(op->data.CopyTo(context_)), ll->Push(GetRef(op))); } @@ -421,7 +490,20 @@ class PartialEvaluator : public ExprFunctor } PStatic VisitExpr_(const GlobalVarNode* op, LetList* ll) final { - return NoStatic(GetRef(op)); + GlobalVar gv = GetRef(op); + if (gv_map_.count(gv) == 0) { + if (mod_.defined()) { + Function func = mod_->Lookup(gv); + 
InitializeFuncId(func); + Func f = VisitFuncStatic(func, gv); + gv_map_.insert({gv, HasStatic(MkSFunc(f), gv)}); + func = AsFunc(PostProcess(VisitFuncDynamic(func, f))); + mod_->Update(gv, func); + } else { + gv_map_.insert({gv, NoStatic(gv)}); + } + } + return gv_map_.at(gv); } PStatic VisitExpr_(const LetNode* op, LetList* ll) final { @@ -485,6 +567,10 @@ class PartialEvaluator : public ExprFunctor } PStatic VisitExpr_(const CallNode* op, LetList* ll) final { + if (op->op.same_as(WithFuncIdOp())) { + CHECK_EQ(op->args.size(), 1); + return VisitExpr(op->args[0], ll); + } PStatic f = VisitExpr(op->op, ll); std::vector x; tvm::Array x_dyn; @@ -501,19 +587,40 @@ class PartialEvaluator : public ExprFunctor } } - PStatic VisitExpr_(const FunctionNode* op, LetList* ll) final { - Function func = GetRef(op); + struct TimeFrame { + PartialEvaluator* pe_; + FuncId fid_; + std::vector