diff --git a/CMakeLists.txt b/CMakeLists.txt
index 85abc8a4d3b4..c6906e832e39 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -80,11 +80,11 @@ tvm_option(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME "Build with Arm Compute Library gra
 # include directories
 include_directories(${CMAKE_INCLUDE_PATH})
 include_directories("include")
-include_directories(${DLPACK_PATH})
-include_directories(${DMLC_PATH})
-include_directories(${RANG_PATH})
-include_directories(${COMPILER_RT_PATH})
-include_directories(${PICOJSON_PATH})
+include_directories(SYSTEM ${DLPACK_PATH})
+include_directories(SYSTEM ${DMLC_PATH})
+include_directories(SYSTEM ${RANG_PATH})
+include_directories(SYSTEM ${COMPILER_RT_PATH})
+include_directories(SYSTEM ${PICOJSON_PATH})
 
 # initial variables
 set(TVM_LINKER_LIBS "")
@@ -123,14 +123,15 @@ if(MSVC)
   # DLL interface warning in c++
   add_compile_options(/wd4251)
 else(MSVC)
+  set(WARNING_FLAG -Wall)
   if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
     message(STATUS "Build in Debug mode")
-    set(CMAKE_C_FLAGS "-O0 -g -Wall -fPIC ${CMAKE_C_FLAGS}")
-    set(CMAKE_CXX_FLAGS "-O0 -g -Wall -fPIC ${CMAKE_CXX_FLAGS}")
+    set(CMAKE_C_FLAGS "-O0 -g ${WARNING_FLAG} -fPIC ${CMAKE_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "-O0 -g ${WARNING_FLAG} -fPIC ${CMAKE_CXX_FLAGS}")
     set(CMAKE_CUDA_FLAGS "-O0 -g -Xcompiler=-Wall -Xcompiler=-fPIC ${CMAKE_CUDA_FLAGS}")
   else()
-    set(CMAKE_C_FLAGS "-O2 -Wall -fPIC ${CMAKE_C_FLAGS}")
-    set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC ${CMAKE_CXX_FLAGS}")
+    set(CMAKE_C_FLAGS "-O2 ${WARNING_FLAG} -fPIC ${CMAKE_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "-O2 ${WARNING_FLAG} -fPIC ${CMAKE_CXX_FLAGS}")
     set(CMAKE_CUDA_FLAGS "-O2 -Xcompiler=-Wall -Xcompiler=-fPIC ${CMAKE_CUDA_FLAGS}")
     set(TVM_VISIBILITY_FLAG "")
     if (HIDE_PRIVATE_SYMBOLS)
@@ -142,6 +143,7 @@ else(MSVC)
       CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
     set(CMAKE_CXX_FLAGS "-faligned-new ${CMAKE_CXX_FLAGS}")
   endif()
+  include(cmake/modules/ClangFlags.cmake)
 
   # Detect if we're compiling for Hexagon.
   set(TEST_FOR_HEXAGON_CXX
@@ -365,6 +367,13 @@ set_property(TARGET tvm APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAGS}")
 add_library(tvm_runtime SHARED $<TARGET_OBJECTS:tvm_runtime_objs>)
 set_property(TARGET tvm_runtime APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAGS}")
 
+if(USE_MICRO)
+  # NOTE: cmake doesn't track dependencies at the file level across subdirectories. For the
+  # Unix Makefiles generator, we need to add these explicit target-level dependencies.
+  add_dependencies(tvm host_standalone_crt)
+  add_dependencies(tvm_runtime host_standalone_crt)
+endif()
+
 if(USE_CPP_RPC)
   add_subdirectory("apps/cpp_rpc")
 endif()
@@ -390,7 +399,7 @@ if(BUILD_FOR_HEXAGON)
   set_target_properties(tvm_runtime PROPERTIES LINK_FLAGS
                         "-Wl,--wrap=pthread_create")
 
-  target_include_directories(tvm_runtime
+  target_include_directories(tvm_runtime SYSTEM
     PUBLIC "${USE_HEXAGON_SDK}/libs/common/qurt/ADSPv62MP/include/posix"
     PUBLIC "${USE_HEXAGON_SDK}/libs/common/qurt/ADSPv62MP/include/qurt"
     PUBLIC "${USE_HEXAGON_SDK}/incs"
@@ -453,7 +462,7 @@ if(GTEST_INCLUDE_DIR AND GTEST_LIB)
     string(REPLACE ".cc" "" __execname ${__srcname})
     add_executable(${__execname} ${__srcpath})
     list(APPEND TEST_EXECS ${__execname})
-    target_include_directories(${__execname} PUBLIC ${GTEST_INCLUDE_DIR})
+    target_include_directories(${__execname} SYSTEM PUBLIC ${GTEST_INCLUDE_DIR})
     target_link_libraries(${__execname} ${TVM_TEST_LIBRARY_NAME} ${GTEST_LIB} pthread dl)
     set_target_properties(${__execname} PROPERTIES EXCLUDE_FROM_ALL 1)
     set_target_properties(${__execname} PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD 1)
diff --git a/Jenkinsfile b/Jenkinsfile
index f6faf2542209..9e3b0f5415eb 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -92,6 +92,19 @@ def init_git_win() {
   }
 }
 
+def cancel_previous_build() {
+  // cancel previous build if it is not on master.
+  if (env.BRANCH_NAME != "master") {
+    def buildNumber = env.BUILD_NUMBER as int
+    // Milestone API allows us to cancel previous build
+    // with the same milestone number
+    if (buildNumber > 1) milestone(buildNumber - 1)
+    milestone(buildNumber)
+  }
+}
+
+cancel_previous_build()
+
 stage("Sanity Check") {
   timeout(time: max_time, unit: 'MINUTES') {
     node('CPU') {
@@ -112,7 +125,11 @@ def make(docker_type, path, make_flag) {
       sh "${docker_run} ${docker_type} ./tests/scripts/task_build.sh ${path} ${make_flag}"
       // always run cpp test when build
       sh "${docker_run} ${docker_type} ./tests/scripts/task_cpp_unittest.sh"
-    } catch (exc) {
+    } catch (hudson.AbortException ae) {
+      // script exited due to user abort, directly throw instead of retry
+      if (ae.getMessage().contains('script returned exit code 143')) {
+        throw ae
+      }
       echo 'Incremental compilation failed. Fall back to build from scratch'
       sh "${docker_run} ${docker_type} ./tests/scripts/task_clean.sh ${path}"
       sh "${docker_run} ${docker_type} ./tests/scripts/task_build.sh ${path} ${make_flag}"
diff --git a/KEYS b/KEYS
index b338c3335f40..ba657b4eb2c7 100644
--- a/KEYS
+++ b/KEYS
@@ -186,3 +186,61 @@ N8yATW8CRIKO3IobUfLMDdPCLO7uzoW95cI35Y0l8JgK2NeU6tVZptP5mDogeAbq
 8PlimrXuzG9Bokct2SOO6Z51i6rSDo/ALj440EvWNw==
 =1xfH
 -----END PGP PUBLIC KEY BLOCK-----
+pub   rsa4096 2020-09-24 [SC]
+      6A0D4938D8C052C759AE2460ED03B26E4FC3509F
+uid           [ultimate] Ziheng Jiang <ziheng@apache.org>
+sig 3         ED03B26E4FC3509F 2020-09-24  Ziheng Jiang <ziheng@apache.org>
+sub   rsa4096 2020-09-24 [E]
+sig           ED03B26E4FC3509F 2020-09-24  Ziheng Jiang <ziheng@apache.org>
+
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+
+mQINBF9tEiUBEAC90om00alNSupM78ZZYMdwKZnJLIhAD22YARntVNVBuD9Znpuo
+BAYwjrWdAi/npwN+r+Pd7Oz6fMBCmB3e4tsrPnBzauGb6aKgjBMHcVEx0p1197kk
+WcGuKt4FNlHYfmc2sOOQre2GcIVOU1XuK8tAhgca78aorAlMtOqq+/ASnKcjRSjW
+0AOzlEKfaGVgst2UO7Fc/w59S3/qv1vBGKnlqLvsJU7kNR1gFotqsGAee5Vu7alQ
+WiHFJbW9ujLTPu7m8enFVuBGFkPsW89Yl/0mXnAKZFFNCHIQ9gkT+1bvZhx8ViJL
+4UeqG7wnSLSSIQz2UPBJYV5stxNtd9HS08Tfviv37shd1SSprFLoQDk87j7wF60b
+AR5IjbVgdprpmVNncO5pnyZwXXWVi7ZyiMSaW6wg+lkeQMGflxgL+05xOafJYgO6
+UepXqu1mc7Q4eVUyft/EPmdyvlg7Fo4T4Db2PnstonkZCyLogdaaJRuxCc0AR/O1
+oNaodrdjqydXVnP3d/gJ5gj78zeMPVbGbzwhpIwhfDouxftaU5zc6prBsMgY/os6
+XMe9bNZWpOLXZrmo/ovaiebmxT5ZYuFRGdeRl1/Y5CWE6Q9JM8euwKuskNQ4G0aY
+fVQ61Cxg4hmrnsv9YFAjf9PPWhpFHvILQoGSs3HbJCLFPphKf37gzfiZMQARAQAB
+tCBaaWhlbmcgSmlhbmcgPHppaGVuZ0BhcGFjaGUub3JnPokCTgQTAQgAOBYhBGoN
+STjYwFLHWa4kYO0Dsm5Pw1CfBQJfbRIlAhsDBQsJCAcCBhUKCQgLAgQWAgMBAh4B
+AheAAAoJEO0Dsm5Pw1CfBl0P/R04MxtqC4aI0fpdwmed55kGunL1W65phBgcOrDL
+58cv5dKJzUmfSUXw3QANcFSn9Q9Z2clj+a2aiGKV5cUiWN0Ny7y6wd3aVOXlRHHy
+f30aDO5Ug5RDYbcChTpen+kq9qDXSr/NxXYLWvhobMeXfiA9Priv49fFWEr17Kai
+NOuoix/eWA5WpnPMf/Rz4HibKcX/izXTW0NOH54jn+4P9M4ZwWbn0AXKoq2i3zF2
+vZavCStcscrfs+kihtEVvwUkyrmSIblIUdkPNxeo/jx7N9Fbu2zXbhl5JiBmBUMJ
+XyFUOBSUDUzA5EvWwXp0yatULOoCH/LIyt+lLdkyfDjsKmAavGf9CcFHVyDIG95N
+34/jECPwwVVkbauE0XYOwenOh+Be1goOA6nidB4QT/rGns7zvCNG8+3ttwA4aiE5
+3GrVWXiPaEMaoM56Phscek30GoLjB2gjvwgwa9oGTDYTu8Z4ifLk8qq8ij9uEG7V
+cKns+1C3ZvfdKi8SmOzj/v9krOi8N4YW03YS2Oq/cGPD/SttoMTOCPxi0PLR7uqy
+YXugsebxlJlXBNTeZx+iiKmkrsILjEd8pUChw79crtH2SGOPqIv1BsqObstLV04r
+iiywruqLRIGlsr8BtepCeEfzW9nJRw7W2571t7oD7QbkdCJ4WUyhMJH73+7KFEE2
+fKL5uQINBF9tEiUBEACmMcP8/zm88BmyhDjWV3ZrZ9cn0N3JJfSONt0AcyE5TZ2y
+20DnHkp3/lNK6EC0k7twtcce/cnKDbXQ/IpuJZwReq5SgmCoGbBZShjALtVCzQRm
+pSA6Wl0JBfw36/IdKUuf8LZtENqp3jgQkkT3TA+/bCh1KQLDYFoVQjUBLiWCDHiL
+iBV5L/PH97l93hkxbSDXrBemQRbr+xhA2TzwcmrjnscNCAXkwU9f1Ygh8zDHSJKB
+g7Ln+ot6QsPhNQEQWhju5xfAn9+kO8OWSAZF/lJTT2Wy+spDBP1ZnviQadWPj5HL
+n4G1qe4QWl08E9FtqVKC7r1YYzT4DlTU2AQ0bJqdvAtojX9ji2Hp4ov8xYPHzy3a
+ZRdDYNWN6i0mbpzj8SYojyEG5cy2j+nzGOYTEdpwW8pG2aCwRvnO+UqXNM3UyQk3
+9Tyfyzw6m9mlq9zaw/nfvOIA6Ns2QR5+UbplkpwVMqMAzZNyEV2wPe9B195MN6tq
+KcznzawD/W1ORccOxrpBXhN3sJSc5n8Uy5pHUHg9B/TdCSLpr7tqqS34gB+AcSUL
+NxjdLn72JHKxCp/wpg3Z4bmY5n/bh/D7Ovt7LP1D/MW9wiR3ls/PtNAK4+SV6oqt
+G1MNS0QgAitovF8dpmX+/zPKax7baZiJY/sDr9crfRvd6e+HYA3yDo08Z44MTwAR
+AQABiQI2BBgBCAAgFiEEag1JONjAUsdZriRg7QOybk/DUJ8FAl9tEiUCGwwACgkQ
+7QOybk/DUJ9Txw/+NXL6cKEIm4NQrBc0RmX37sELc5UnvpycV663OiPF9qHE9iML
+EUt/LBxrGUarplOA66EIkmmnekUgS8ujjhGOw152nSuZTgoPxX4ub6PI7Hi5lmqI
+ZtEpp8VoI+XxAdA5ecN5QNP7P/ovSIZwXvIF00YXqGp6keXi/qdYkylt4s6zLDiL
+ocfOZWt994JVIl30gogkw4PmcWx+PKXos+Hq1La7iZUn1pT5kEsN+fHpnh42sAGZ
+dhb+puB5tczhVJhL553Z6rh4BABd1DqAZihwelkRRvQUp0Fqgc2oxty5o5pdHZdS
+ulomOqGgERHsrtwzqD/n3iep3z22LiitZHsKZ0OoHl9e1YsvdsL5rImEz/FdWwgl
+muO2ZjY2KhuovFROCsGVgw3b9gzIjtE8FWE6wSz6qzKihBbI8YPtQqGJgnX2A01m
+AkfpGPb8430OghDCQFsrWkuTjmSw42ys1lALbK2yQGRuOCq0dIml1QdE6JfU8ceW
+QY1dhH7xpHQxlr9Tcv+enCc4UzCJOnXgkVUnD/u+TqKL9GoSFu6KQrC7jyvfY9t8
+Elf2ReXYVK/jGUePdDFurp+3KFlAHFuen2VZTcNZaUWoYoI84VDEh8/oPEPfzveJ
+/GhL5vbglB0H0aG8SVMaTfzr+nXHUVyOSrlYYk34O7bSimVrX6XDGPZpsXE=
+=nhJ/
+-----END PGP PUBLIC KEY BLOCK-----
diff --git a/README.md b/README.md
index 3f5b172f6268..f0e011bfc616 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@
 [Community](https://tvm.apache.org/community) |
 [Release Notes](NEWS.md)
 
-[![Build Status](https://ci.tvm.ai/buildStatus/icon?job=tvm/master)](https://ci.tvm.ai/job/tvm/job/master/)
+[![Build Status](https://ci.tlcpack.ai/buildStatus/icon?job=tvm/master)](https://ci.tlcpack.ai/job/tvm/job/master/)
 [![WinMacBuild](https://github.com/apache/incubator-tvm/workflows/WinMacBuild/badge.svg)](https://github.com/apache/incubator-tvm/actions?query=workflow%3AWinMacBuild)
 
 Apache TVM (incubating) is a compiler stack for deep learning systems. It is designed to close the gap between the
diff --git a/apps/tf_tvmdsoop/CMakeLists.txt b/apps/tf_tvmdsoop/CMakeLists.txt
index f4e83c528701..9be5da9d26de 100644
--- a/apps/tf_tvmdsoop/CMakeLists.txt
+++ b/apps/tf_tvmdsoop/CMakeLists.txt
@@ -22,8 +22,8 @@ set(BUILD_TVMDSOOP_ONLY ON)
 set(CMAKE_CURRENT_SOURCE_DIR ${TVM_ROOT})
 set(CMAKE_CURRENT_BINARY_DIR ${TVM_ROOT}/build)
 
-include_directories(${TVM_ROOT}/3rdparty/dlpack/include/)
-include_directories(${TVM_ROOT}/3rdparty/dmlc-core/include/)
+include_directories(SYSTEM ${TVM_ROOT}/3rdparty/dlpack/include/)
+include_directories(SYSTEM ${TVM_ROOT}/3rdparty/dmlc-core/include/)
 include_directories(${TVM_ROOT}/include)
 
 link_directories(${TVM_ROOT}/build)
diff --git a/cmake/config.cmake b/cmake/config.cmake
index e7e2a5a8a5c9..6ed660c86964 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -87,17 +87,6 @@ set(USE_OPENGL OFF)
 # Whether enable MicroTVM runtime
 set(USE_MICRO OFF)
 
-# Whether to enable SGX runtime
-#
-# Possible values for USE_SGX:
-# - /path/to/sgxsdk: path to Intel SGX SDK
-# - OFF: disable SGX
-#
-# SGX_MODE := HW|SIM
-set(USE_SGX OFF)
-set(SGX_MODE "SIM")
-set(RUST_SGX_SDK "/path/to/rust-sgx-sdk")
-
 # Whether enable RPC runtime
 set(USE_RPC ON)
diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
index 936bb681b7ff..2583e8f3c9ca 100644
--- a/cmake/modules/CUDA.cmake
+++ b/cmake/modules/CUDA.cmake
@@ -21,7 +21,7 @@ find_cuda(${USE_CUDA})
 if(CUDA_FOUND)
   # always set the includedir when cuda is available
   # avoid global retrigger of cmake
-  include_directories(${CUDA_INCLUDE_DIRS})
+  include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
 endif(CUDA_FOUND)
 
 if(USE_CUDA)
diff --git a/cmake/modules/ClangFlags.cmake b/cmake/modules/ClangFlags.cmake
new file mode 100644
index 000000000000..9a3ac05a2a5b
--- /dev/null
+++ b/cmake/modules/ClangFlags.cmake
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# If we are running clang >= 10.0 then enable more checking. Some of these warnings may not exist
+# in older versions of clang so we limit the use of older clang for these checks.
+if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+  EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE clang_full_version)
+  string (REGEX REPLACE ".*clang version ([0-9]+\\.[0-9]+).*" "\\1" CLANG_VERSION ${clang_full_version})
+  message(STATUS "CLANG_VERSION ${CLANG_VERSION}")
+  if (CLANG_VERSION VERSION_GREATER_EQUAL 10.0)
+    message(STATUS "Setting enhanced clang warning flags")
+
+    # These warnings are only enabled when clang's -Weverything flag is enabled
+    # but there is no harm in turning them off for all cases.
+    add_compile_options(
+      -Wno-c++98-compat
+      -Wno-c++98-compat-extra-semi
+      -Wno-c++98-compat-pedantic
+      -Wno-padded
+      -Wno-extra-semi
+      -Wno-extra-semi-stmt
+      -Wno-unused-parameter
+      -Wno-sign-conversion
+      -Wno-weak-vtables
+      -Wno-deprecated-copy-dtor
+      -Wno-global-constructors
+      -Wno-double-promotion
+      -Wno-float-equal
+      -Wno-missing-prototypes
+      -Wno-implicit-int-float-conversion
+      -Wno-implicit-float-conversion
+      -Wno-implicit-int-conversion
+      -Wno-float-conversion
+      -Wno-shorten-64-to-32
+      -Wno-covered-switch-default
+      -Wno-unused-exception-parameter
+      -Wno-return-std-move-in-c++11
+      -Wno-over-aligned
+      -Wno-undef
+      -Wno-inconsistent-missing-destructor-override
+      -Wno-unreachable-code
+      -Wno-deprecated-copy
+      -Wno-implicit-fallthrough
+      -Wno-unreachable-code-return
+      -Wno-non-virtual-dtor
+    )
+
+    # Here we have non-standard warnings that clang has available and are useful
+    # so enable them if we are using clang.
+    add_compile_options(
+      -Wreserved-id-macro
+      -Wused-but-marked-unused
+      -Wdocumentation-unknown-command
+      -Wcast-qual
+      -Wzero-as-null-pointer-constant
+
+      # These warnings should be enabled one at a time and fixed.
+      # To enable one of these warnings remove the `no-` after -W so
+      # -Wno-documentation -> -Wdocumentation
+      -Wno-documentation
+      -Wno-shadow-uncaptured-local
+      -Wno-shadow-field-in-constructor
+      -Wno-shadow
+      -Wno-shadow-field
+      -Wno-exit-time-destructors
+      -Wno-switch-enum
+      -Wno-old-style-cast
+      -Wno-gnu-anonymous-struct
+      -Wno-nested-anon-types
+    )
+
+  endif ()
+endif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake
index 30b4ccbc5618..9fc806cf9047 100644
--- a/cmake/modules/Hexagon.cmake
+++ b/cmake/modules/Hexagon.cmake
@@ -76,7 +76,7 @@ if(USE_HEXAGON_DEVICE STREQUAL "${PICK_SIM}")
   find_hexagon_toolchain()
   message(STATUS "Hexagon toolchain: ${HEXAGON_TOOLCHAIN}")
   file(GLOB RUNTIME_HEXAGON_SIM_SRCS src/runtime/hexagon/sim/*.cc)
-  include_directories("${HEXAGON_TOOLCHAIN}/include/iss")
+  include_directories(SYSTEM "${HEXAGON_TOOLCHAIN}/include/iss")
   link_directories("${HEXAGON_TOOLCHAIN}/lib/iss")
   list(APPEND TVM_RUNTIME_LINKER_LIBS "-lwrapper")
   ExternalProject_Add(sim_dev
@@ -91,11 +91,11 @@ elseif(USE_HEXAGON_DEVICE STREQUAL "${PICK_HW}")
   find_hexagon_toolchain()
   message(STATUS "Hexagon SDK: ${HEXAGON_SDK_ROOT}")
   file(GLOB RUNTIME_HEXAGON_DEVICE_SRCS src/runtime/hexagon/target/*.cc)
-  include_directories("${HEXAGON_SDK_ROOT}/incs/stddef")
-  include_directories("${HEXAGON_SDK_ROOT}/libs/common/rpcmem/inc")
+  include_directories(SYSTEM "${HEXAGON_SDK_ROOT}/incs/stddef")
+  include_directories(SYSTEM "${HEXAGON_SDK_ROOT}/libs/common/rpcmem/inc")
   include_directories(
-      "${HEXAGON_SDK_ROOT}/libs/common/remote/ship/android_Release_aarch64")
-  include_directories("${HEXAGON_TOOLCHAIN}/include/iss")
+      SYSTEM "${HEXAGON_SDK_ROOT}/libs/common/remote/ship/android_Release_aarch64")
+  include_directories(SYSTEM "${HEXAGON_TOOLCHAIN}/include/iss")
   list(APPEND TVM_RUNTIME_LINKER_LIBS "dl")
   if(BUILD_FOR_ANDROID)
     # Hexagon runtime uses __android_log_print, which is in liblog.
diff --git a/cmake/modules/LLVM.cmake b/cmake/modules/LLVM.cmake
index be6d0fb27242..5f8ace17111f 100644
--- a/cmake/modules/LLVM.cmake
+++ b/cmake/modules/LLVM.cmake
@@ -18,9 +18,11 @@
 # LLVM rules
 add_definitions(-DDMLC_USE_FOPEN64=0)
 
-if(NOT USE_LLVM STREQUAL "OFF")
+# Test if ${USE_LLVM} is not an explicit boolean false
+# It may be a boolean or a string
+if(NOT ${USE_LLVM} MATCHES ${IS_FALSE_PATTERN})
   find_llvm(${USE_LLVM})
-  include_directories(${LLVM_INCLUDE_DIRS})
+  include_directories(SYSTEM ${LLVM_INCLUDE_DIRS})
   add_definitions(${LLVM_DEFINITIONS})
   message(STATUS "Build with LLVM " ${LLVM_PACKAGE_VERSION})
   message(STATUS "Set TVM_LLVM_VERSION=" ${TVM_LLVM_VERSION})
diff --git a/cmake/modules/OpenCL.cmake b/cmake/modules/OpenCL.cmake
index 64ce3c3d3810..a82c080c4199 100644
--- a/cmake/modules/OpenCL.cmake
+++ b/cmake/modules/OpenCL.cmake
@@ -21,7 +21,7 @@ find_opencl(${USE_OPENCL})
 if(OpenCL_FOUND)
   # always set the includedir when cuda is available
   # avoid global retrigger of cmake
-  include_directories(${OpenCL_INCLUDE_DIRS})
+  include_directories(SYSTEM ${OpenCL_INCLUDE_DIRS})
 endif(OpenCL_FOUND)
 
 if(USE_SDACCEL)
diff --git a/cmake/modules/ROCM.cmake b/cmake/modules/ROCM.cmake
index a28b37ebe465..ec348f8b57f6 100644
--- a/cmake/modules/ROCM.cmake
+++ b/cmake/modules/ROCM.cmake
@@ -21,7 +21,7 @@ find_rocm(${USE_ROCM})
 if(ROCM_FOUND)
   # always set the includedir
   # avoid global retrigger of cmake
-  include_directories(${ROCM_INCLUDE_DIRS})
+  include_directories(SYSTEM ${ROCM_INCLUDE_DIRS})
   add_definitions(-D__HIP_PLATFORM_HCC__=1)
 endif(ROCM_FOUND)
diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake
index fd3a121ef795..a9fc66507d35 100644
--- a/cmake/modules/VTA.cmake
+++ b/cmake/modules/VTA.cmake
@@ -59,7 +59,7 @@ elseif(PYTHON)
     list(APPEND FSIM_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc)
     # Target lib: vta_fsim
     add_library(vta_fsim SHARED ${FSIM_RUNTIME_SRCS})
-    target_include_directories(vta_fsim PUBLIC ${VTA_HW_PATH}/include)
+    target_include_directories(vta_fsim SYSTEM PUBLIC ${VTA_HW_PATH}/include)
     foreach(__def ${VTA_DEFINITIONS})
       string(SUBSTRING ${__def} 3 -1 __strip_def)
       target_compile_definitions(vta_fsim PUBLIC ${__strip_def})
@@ -80,7 +80,7 @@ elseif(PYTHON)
     list(APPEND TSIM_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc)
     # Target lib: vta_tsim
     add_library(vta_tsim SHARED ${TSIM_RUNTIME_SRCS})
-    target_include_directories(vta_tsim PUBLIC ${VTA_HW_PATH}/include)
+    target_include_directories(vta_tsim SYSTEM PUBLIC ${VTA_HW_PATH}/include)
     foreach(__def ${VTA_DEFINITIONS})
       string(SUBSTRING ${__def} 3 -1 __strip_def)
       target_compile_definitions(vta_tsim PUBLIC ${__strip_def})
@@ -116,9 +116,9 @@ elseif(PYTHON)
       target_link_libraries(vta ${__cma_lib})
     elseif(${VTA_TARGET} STREQUAL "de10nano")  # DE10-Nano rules
       #target_compile_definitions(vta PUBLIC VTA_MAX_XFER=2097152) # (1<<21)
-      target_include_directories(vta PUBLIC ${VTA_HW_PATH}/src/de10nano)
-      target_include_directories(vta PUBLIC 3rdparty)
-      target_include_directories(vta PUBLIC
+      target_include_directories(vta SYSTEM PUBLIC ${VTA_HW_PATH}/src/de10nano)
+      target_include_directories(vta SYSTEM PUBLIC 3rdparty)
+      target_include_directories(vta SYSTEM PUBLIC
         "/usr/local/intelFPGA_lite/18.1/embedded/ds-5/sw/gcc/arm-linux-gnueabihf/include")
     endif()
   endif()
diff --git a/cmake/modules/Vulkan.cmake b/cmake/modules/Vulkan.cmake
index 29fcfe946361..4df8986c800c 100644
--- a/cmake/modules/Vulkan.cmake
+++ b/cmake/modules/Vulkan.cmake
@@ -29,7 +29,7 @@ tvm_option(USE_VULKAN_VALIDATION "Enable Vulkan API validation layers" OFF
 if(Vulkan_FOUND)
   # always set the includedir
   # avoid global retrigger of cmake
-  include_directories(${Vulkan_INCLUDE_DIRS})
+  include_directories(SYSTEM ${Vulkan_INCLUDE_DIRS})
 endif(Vulkan_FOUND)
 
 if(USE_VULKAN)
diff --git a/cmake/modules/contrib/BLAS.cmake b/cmake/modules/contrib/BLAS.cmake
index e8c8e22e3334..06c8755882d5 100644
--- a/cmake/modules/contrib/BLAS.cmake
+++ b/cmake/modules/contrib/BLAS.cmake
@@ -27,7 +27,7 @@ elseif(USE_BLAS STREQUAL "atlas" OR USE_BLAS STREQUAL "blas")
   message(STATUS "Use BLAS library " ${BLAS_LIBRARY})
 elseif(USE_BLAS STREQUAL "apple")
   find_library(BLAS_LIBRARY Accelerate)
-  include_directories(${BLAS_LIBRARY}/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers/)
+  include_directories(SYSTEM ${BLAS_LIBRARY}/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers/)
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY})
   list(APPEND RUNTIME_SRCS src/runtime/contrib/cblas/cblas.cc)
   message(STATUS "Use BLAS library " ${BLAS_LIBRARY})
@@ -57,7 +57,7 @@ if(USE_MKL OR USE_MKL_PATH)
   elseif(MSVC)
     find_library(BLAS_LIBRARY_MKL NAMES mkl_rt HINTS ${USE_MKL}/lib/ ${USE_MKL}/lib/intel64_win)
   endif()
-  include_directories(${USE_MKL}/include)
+  include_directories(SYSTEM ${USE_MKL}/include)
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY_MKL})
   list(APPEND RUNTIME_SRCS src/runtime/contrib/cblas/mkl.cc)
   add_definitions(-DUSE_MKL_BLAS=1)
@@ -69,7 +69,7 @@ if(IS_DIRECTORY ${USE_MKLDNN})
   if (MKLDNN_LIBRARY STREQUAL "MKLDNN_LIBRARY-NOTFOUND")
     message(WARNING "Cannot find MKLDNN library at ${USE_MKLDNN}.")
   else()
-    include_directories(${USE_MKLDNN}/include)
+    include_directories(SYSTEM ${USE_MKLDNN}/include)
     list(APPEND TVM_RUNTIME_LINKER_LIBS ${MKLDNN_LIBRARY})
     list(APPEND RUNTIME_SRCS src/runtime/contrib/cblas/mkldnn.cc)
     add_definitions(-DUSE_DNNL=1)
diff --git a/cmake/modules/contrib/EthosN.cmake b/cmake/modules/contrib/EthosN.cmake
index ca1f7daa8c8a..44d2a2a17ace 100644
--- a/cmake/modules/contrib/EthosN.cmake
+++ b/cmake/modules/contrib/EthosN.cmake
@@ -24,7 +24,7 @@ if(NOT USE_ETHOSN STREQUAL "OFF")
     message(FATAL_ERROR "Cannot find Ethos-N, USE_ETHOSN=" ${USE_ETHOSN})
 
   else()
-    include_directories(${ETHOSN_INCLUDE_DIRS})
+    include_directories(SYSTEM ${ETHOSN_INCLUDE_DIRS})
     add_definitions(${ETHOSN_DEFINITIONS})
 
     message(STATUS "Build with Ethos-N ${ETHOSN_PACKAGE_VERSION}")
diff --git a/cmake/util/FindCUDA.cmake b/cmake/util/FindCUDA.cmake
index f971c87f20a8..e6517517a6d0 100644
--- a/cmake/util/FindCUDA.cmake
+++ b/cmake/util/FindCUDA.cmake
@@ -37,7 +37,7 @@
 #
 macro(find_cuda use_cuda)
   set(__use_cuda ${use_cuda})
-  if(__use_cuda STREQUAL "ON")
+  if(${__use_cuda} MATCHES ${IS_TRUE_PATTERN})
     find_package(CUDA QUIET)
   elseif(IS_DIRECTORY ${__use_cuda})
     set(CUDA_TOOLKIT_ROOT_DIR ${__use_cuda})
@@ -91,7 +91,9 @@ macro(find_cuda use_cuda)
     find_library(CUDA_CUBLAS_LIBRARY cublas
                  ${CUDA_TOOLKIT_ROOT_DIR}/lib64
                  ${CUDA_TOOLKIT_ROOT_DIR}/lib)
-    find_library(CUDA_CUBLASLT_LIBRARY cublaslt
+    find_library(CUDA_CUBLASLT_LIBRARY
+      NAMES cublaslt cublasLt
+      PATHS
       ${CUDA_TOOLKIT_ROOT_DIR}/lib64
       ${CUDA_TOOLKIT_ROOT_DIR}/lib)
   endif(MSVC)
diff --git a/cmake/util/FindEthosN.cmake b/cmake/util/FindEthosN.cmake
index 7f0fb6499691..d33b55f0c7a9 100644
--- a/cmake/util/FindEthosN.cmake
+++ b/cmake/util/FindEthosN.cmake
@@ -60,7 +60,7 @@ macro(find_ethosn use_ethosn)
 
     set(ETHOSN_PACKAGE_VERSION "0.1.1")
 
-    if(USE_ETHOSN_HW STREQUAL "ON")
+    if(${USE_ETHOSN_HW} MATCHES ${IS_TRUE_PATTERN})
       # Runtime hardware support
       find_path(_DL_DIR NAMES Network.hpp
                 PATHS ${__ethosn_stack}/include/ethosn_driver_library)
@@ -79,7 +79,7 @@ macro(find_ethosn use_ethosn)
   endif(__ethosn_stack)
 
   if(NOT ETHOSN_FOUND)
-    if(__use_ethosn STREQUAL "ON")
+    if(${__use_ethosn} MATCHES ${IS_TRUE_PATTERN})
       message(WARNING "No cmake find_package available for Arm Ethos-N")
     endif()
diff --git a/cmake/util/FindLLVM.cmake b/cmake/util/FindLLVM.cmake
index d8378803023d..b8c5bf815bf5 100644
--- a/cmake/util/FindLLVM.cmake
+++ b/cmake/util/FindLLVM.cmake
@@ -19,10 +19,13 @@
 # Enhanced version of find llvm.
 #
 # Usage:
-#   find_llvm(${USE_LLVM})
+#   find_llvm(${USE_LLVM} [${LLVM_VERSION}])
 #
 # - When USE_LLVM=ON, use auto search
 # - When USE_LLVM=/path/to/llvm-config, use corresponding config
+# - The optional LLVM_VERSION=<version> which will find a matching version
+#   If LLVM_VERSION=10 then it will find 10.X.X, any version of 10
+#   If LLVM_VERSION=10.2 then it will find 10.2.X
 #
 # Provide variables:
 # - LLVM_INCLUDE_DIRS
@@ -32,19 +35,25 @@
 # - TVM_INFO_LLVM_VERSION
 #
 macro(find_llvm use_llvm)
+  if(${use_llvm} MATCHES ${IS_FALSE_PATTERN})
+    return()
+  endif()
   set(LLVM_CONFIG ${use_llvm})
-  if(LLVM_CONFIG STREQUAL "ON")
-    find_package(LLVM REQUIRED CONFIG)
+  if(${ARGC} EQUAL 2)
+    set(llvm_version_required ${ARGV1})
+  endif()
+
+  if(${LLVM_CONFIG} MATCHES ${IS_TRUE_PATTERN})
+    find_package(LLVM ${llvm_version_required} REQUIRED CONFIG)
     llvm_map_components_to_libnames(LLVM_LIBS "all")
     if (NOT LLVM_LIBS)
       message(STATUS "Not found - LLVM_LIBS")
       message(STATUS "Fall back to using llvm-config")
-      set(LLVM_CONFIG "llvm-config")
-    else()
-      set(LLVM_CONFIG "ON")
+      set(LLVM_CONFIG "${LLVM_TOOLS_BINARY_DIR}/llvm-config")
     endif()
   endif()
-  if(LLVM_CONFIG STREQUAL "ON")
+
+  if(LLVM_LIBS) # check if defined, not if it is true
     list (FIND LLVM_LIBS "LLVM" _llvm_dynlib_index)
     if (${_llvm_dynlib_index} GREATER -1)
       set(LLVM_LIBS LLVM)
@@ -55,7 +64,7 @@ macro(find_llvm use_llvm)
     endif()
     set(TVM_LLVM_VERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR})
     set(TVM_INFO_LLVM_VERSION "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}")
-  elseif(NOT LLVM_CONFIG STREQUAL "OFF")
+  else()
     # use llvm config
     message(STATUS "Use llvm-config=" ${LLVM_CONFIG})
     separate_arguments(LLVM_CONFIG)
@@ -145,13 +154,11 @@ macro(find_llvm use_llvm)
       list(APPEND LLVM_LIBS "${__flag}")
     endforeach()
   endif()
-  if(NOT LLVM_CONFIG STREQUAL "OFF")
-    message(STATUS "Found LLVM_INCLUDE_DIRS=" "${LLVM_INCLUDE_DIRS}")
-    message(STATUS "Found LLVM_DEFINITIONS=" "${LLVM_DEFINITIONS}")
-    message(STATUS "Found LLVM_LIBS=" "${LLVM_LIBS}")
-    message(STATUS "Found TVM_LLVM_VERSION=" ${TVM_LLVM_VERSION})
-    if (${TVM_LLVM_VERSION} LESS 40)
-      message(FATAL_ERROR "TVM requires LLVM 4.0 or higher.")
-    endif()
+  message(STATUS "Found LLVM_INCLUDE_DIRS=" "${LLVM_INCLUDE_DIRS}")
+  message(STATUS "Found LLVM_DEFINITIONS=" "${LLVM_DEFINITIONS}")
+  message(STATUS "Found LLVM_LIBS=" "${LLVM_LIBS}")
+  message(STATUS "Found TVM_LLVM_VERSION=" ${TVM_LLVM_VERSION})
+  if (${TVM_LLVM_VERSION} LESS 40)
+    message(FATAL_ERROR "TVM requires LLVM 4.0 or higher.")
   endif()
 endmacro(find_llvm)
diff --git a/cmake/util/FindOpenCL.cmake b/cmake/util/FindOpenCL.cmake
index 2510c01a1bec..9b9f8ec94593 100644
--- a/cmake/util/FindOpenCL.cmake
+++ b/cmake/util/FindOpenCL.cmake
@@ -58,7 +58,7 @@ macro(find_opencl use_opencl)
 
   # No user provided OpenCL include/libs found
   if(NOT OpenCL_FOUND)
-    if(__use_opencl STREQUAL "ON")
+    if(${__use_opencl} MATCHES ${IS_TRUE_PATTERN})
       find_package(OpenCL QUIET)
     endif()
   endif()
diff --git a/cmake/util/FindROCM.cmake b/cmake/util/FindROCM.cmake
index 78b242c8bec3..7d4e282956d9 100644
--- a/cmake/util/FindROCM.cmake
+++ b/cmake/util/FindROCM.cmake
@@ -21,8 +21,8 @@
 # Usage:
 #   find_rocm(${USE_ROCM})
 #
-# - When USE_VULKAN=ON, use auto search
-# - When USE_VULKAN=/path/to/vulkan-sdk-path, use the sdk
+# - When USE_ROCM=ON, use auto search
+# - When USE_ROCM=/path/to/rocm-sdk-path, use the sdk
 #
 # Provide variables:
 #
diff --git a/cmake/util/FindVulkan.cmake b/cmake/util/FindVulkan.cmake
index 00ed7dc61716..feb5eec74d89 100644
--- a/cmake/util/FindVulkan.cmake
+++ b/cmake/util/FindVulkan.cmake
@@ -53,7 +53,7 @@ macro(find_vulkan use_vulkan)
 
   # resort to find vulkan of option is on
   if(NOT Vulkan_FOUND)
-    if(__use_vulkan STREQUAL "ON")
+    if(${__use_vulkan} MATCHES ${IS_TRUE_PATTERN})
      find_package(Vulkan QUIET)
     endif()
   endif()
diff --git a/cmake/util/Util.cmake b/cmake/util/Util.cmake
index d105c82a3082..4e6762b14894 100644
--- a/cmake/util/Util.cmake
+++ b/cmake/util/Util.cmake
@@ -74,3 +74,13 @@ function(assign_source_group group)
     source_group("${group}\\${_source_path_msvc}" FILES "${_source}")
   endforeach()
 endfunction(assign_source_group)
+
+# From cmake documentation:
+# True if the constant is 1, ON, YES, TRUE, Y, or a non-zero number.
+# False if the constant is 0, OFF, NO, FALSE, N, IGNORE, NOTFOUND, the empty string, or ends in the suffix -NOTFOUND.
+# Named boolean constants are case-insensitive.
+#
+# While this regex does contain a check for an empty string, that check does not work;
+# cmake's regex implementation is weak.
+set(IS_FALSE_PATTERN "^[Oo][Ff][Ff]$|^0$|^[Ff][Aa][Ll][Ss][Ee]$|^[Nn][Oo]$|^[Nn][Oo][Tt][Ff][Oo][Uu][Nn][Dd]$|.*-[Nn][Oo][Tt][Ff][Oo][Uu][Nn][Dd]$|^$")
+set(IS_TRUE_PATTERN "^[Oo][Nn]$|^[1-9][0-9]*$|^[Tt][Rr][Uu][Ee]$|^[Yy][Ee][Ss]$|^[Yy]$")
diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh
index 2ad55c0e521e..2b8df74dab7b 100755
--- a/docker/install/ubuntu_install_python_package.sh
+++ b/docker/install/ubuntu_install_python_package.sh
@@ -21,4 +21,4 @@ set -u
 set -o pipefail
 
 # install libraries for python package on ubuntu
-pip3 install pylint==1.9.4 six numpy pytest cython decorator scipy tornado typed_ast pytest mypy orderedset attrs requests Pillow packaging
+pip3 install six numpy pytest cython decorator scipy tornado typed_ast pytest mypy orderedset attrs requests Pillow packaging
diff --git a/docs/api/python/auto_scheduler.rst b/docs/api/python/auto_scheduler.rst
index 85ff22f58b37..c5b8dccf1be2 100644
--- a/docs/api/python/auto_scheduler.rst
+++ b/docs/api/python/auto_scheduler.rst
@@ -18,18 +18,7 @@
 tvm.auto_scheduler
 ------------------
 .. automodule:: tvm.auto_scheduler
-
-tvm.auto_scheduler.auto_schedule
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. automodule:: tvm.auto_scheduler.auto_schedule
-
-.. autoclass:: tvm.auto_scheduler.auto_schedule.SearchTask
-
-.. autoclass:: tvm.auto_scheduler.auto_schedule.TuningOptions
-
-.. autofunction:: tvm.auto_scheduler.auto_schedule.create_task
-
-.. autofunction:: tvm.auto_scheduler.auto_schedule.auto_schedule
-
-
+   :members:
+   :imported-members:
+   :autosummary:
diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst
index e3399c57db26..1ff034a2cd8d 100644
--- a/docs/deploy/arm_compute_lib.rst
+++ b/docs/deploy/arm_compute_lib.rst
@@ -232,6 +232,8 @@ Operator support
 +----------------------+-------------------------------------------------------------------------+
 | reshape              | fp32, uint8                                                             |
 +----------------------+-------------------------------------------------------------------------+
+| maximum              | fp32                                                                    |
++----------------------+-------------------------------------------------------------------------+
 
 .. note::
     A composite operator is a series of operators that map to a single Arm Compute Library operator. You can view this
diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst
index 775991636167..2bb6e551b1a0 100644
--- a/docs/install/from_source.rst
+++ b/docs/install/from_source.rst
@@ -173,8 +173,8 @@ Method 1
    This method is **recommended for developers** who may change the codes.
 
    Set the environment variable `PYTHONPATH` to tell python where to find
-   the library. For example, assume we cloned `tvm` on the home directory
-   `~`. then we can added the following line in `~/.bashrc`.
+   the library. For example, assume we cloned `tvm` in the directory
+   `/path/to/tvm`; then we can add the following line to `~/.bashrc`.
    The changes will be immediately reflected once you pull the code and rebuild the project (no need to call ``setup`` again)
 
    .. code:: bash
@@ -242,7 +242,7 @@ tests in TVM. The easiest way to install GTest is from source.
    cd build
    cmake ..
    make
-   make install
+   sudo make install
 
 After installing GTest, the C++ tests can be built and started with ``./tests/scripts/task_cpp_unittest.sh`` or just built with ``make cpptest``.
diff --git a/docs/langref/hybrid_script.rst b/docs/langref/hybrid_script.rst
index 5fae67d59492..984e83c56a8e 100644
--- a/docs/langref/hybrid_script.rst
+++ b/docs/langref/hybrid_script.rst
@@ -38,12 +38,12 @@ you need to use ``tvm.te.hybrid.script`` decorator to indicate this is a hybrid
 .. code-block:: python
 
     @tvm.te.hybrid.script
-    def outer_product(a, b, c):
+    def outer_product(a, b):
+        c = output_tensor((100, 99), 'float32')
         for i in range(a.shape[0]):
             for j in range(b.shape[0]):
                 c[i, j] = a[i] * b[j]
-      return c
+        return c
 
     a = numpy.random.randn(100)
     b = numpy.random.randn(99)
     c = outer_product(a, b)
@@ -76,7 +76,7 @@ or ``tvm.container.Array``, to this function, it returns a op node:
 
     a = tvm.te.placeholder((100, ), name='a')
     b = tvm.te.placeholder((99, ), name='b')
-    c = outer_product(a, b, c) # return the output tensor(s) of the operator
+    c = outer_product(a, b) # return the output tensor(s) of the operator
 
 You can use any methods that can be applied on a TVM ``OpNode``, like create_schedule, although
 so far, the functionality of schedule is as limited as ``ExternOpNode``. At least, it can be built
@@ -230,5 +230,8 @@ Assert statement is supported, you can simply use it as it is in standard Python
 
 Keywords
 ~~~~~~~~
-- For keywords: ``serial``, ``range``, ``unroll``, ``parallel``, ``vectorize``, ``bind``, ``const_expr``
-- Math keywords: ``log``, ``exp``, ``sigmoid``, ``tanh``, ``power``, ``popcount``
+- For keywords: ``serial``, ``range``, ``unroll``, ``parallel``, ``vectorize``, ``bind``, ``const_range``
+- Math keywords: ``log``, ``exp``, ``sqrt``, ``rsqrt``, ``sigmoid``, ``tanh``, ``power``, ``popcount``, ``round``, ``ceil_div``
+- Allocate keywords: ``allocate``, ``output_tensor``
+- Data type keywords: ``uint8``, ``uint16``, ``uint32``, ``uint64``, ``int8``, ``int16``, ``int32``, ``int64``, ``float16``, ``float32``, ``float64``
+- Others: ``max_num_threads``
diff --git a/include/tvm/auto_scheduler/cost_model.h b/include/tvm/auto_scheduler/cost_model.h
index 89dcab29265d..f7a27895a7d0 100755
--- a/include/tvm/auto_scheduler/cost_model.h
+++ b/include/tvm/auto_scheduler/cost_model.h
@@ -70,6 +70,11 @@ class CostModelNode : public Object {
     LOG(FATAL) << "Not implemented";
   }
 
+  /*!
+   * \brief Default virtual destructor
+   */
+  virtual ~CostModelNode() {}
+
   static constexpr const char* _type_key = "auto_scheduler.CostModel";
   TVM_DECLARE_BASE_OBJECT_INFO(CostModelNode, Object);
 };
diff --git a/include/tvm/auto_scheduler/search_policy.h b/include/tvm/auto_scheduler/search_policy.h
index 176b10c1d7ea..ddb0dd284875 100755
--- a/include/tvm/auto_scheduler/search_policy.h
+++ b/include/tvm/auto_scheduler/search_policy.h
@@ -65,6 +65,7 @@
 #include <tvm/auto_scheduler/search_task.h>
 #include <tvm/node/node.h>
+#include <string>
 #include <unordered_set>
 #include <vector>
 
@@ -191,7 +192,7 @@ class SearchPolicyNode : public Object {
    * We store the string format of a state for redundancy check. This is used to make sure a
    * measured state will never be measured again.
    */
-  std::unordered_set<String> measured_states_set_;
+  std::unordered_set<std::string> measured_states_set_;
   /*! \brief The array of already measured states.
    * The good states can be used as the initial population in evolutionary search. */
   std::vector<State> measured_states_vector_;
diff --git a/include/tvm/ir/attrs.h b/include/tvm/ir/attrs.h
index 7981d58b0ead..0bd1ce6c2ac1 100644
--- a/include/tvm/ir/attrs.h
+++ b/include/tvm/ir/attrs.h
@@ -360,7 +360,7 @@ struct AttrInitEntry {
   }
   // override fields.
   // This function sets the lower bound of the attribute
-  TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED const T& begin) {
+  TSelf& set_lower_bound(const T& begin) {
     if (this->value_missing_) return *this;
     const T& val = *value_;
     if (begin > val) {
@@ -372,7 +372,7 @@ struct AttrInitEntry {
     return *this;
   }
   // This function sets the upper bound of the attribute
-  TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED const T& end) {
+  TSelf& set_upper_bound(const T& end) {
     if (this->value_missing_) return *this;
     const T& val = *value_;
     if (val > end) {
@@ -384,7 +384,7 @@ struct AttrInitEntry {
     return *this;
   }
   // set default when
-  TSelf& set_default(DMLC_ATTRIBUTE_UNUSED const T& value) {
+  TSelf& set_default(const T& value) {
     if (!value_missing_) return *this;
     *value_ = value;
     value_missing_ = false;
@@ -548,12 +548,12 @@ class AttrDocEntry {
   using TSelf = AttrDocEntry;
   explicit AttrDocEntry(ObjectPtr<AttrFieldInfoNode> info) : info_(info) {}
 
-  TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) {
+  TSelf& describe(const char* str) {
     info_->description = str;
     return *this;
   }
   template <typename T>
-  TSelf& set_default(DMLC_ATTRIBUTE_UNUSED const T& value) {
+  TSelf& set_default(const T& value) {
     std::ostringstream os;
     os << info_->type_info << ", default=" << value;
     info_->type_info = os.str();
diff --git a/include/tvm/node/container.h b/include/tvm/node/container.h
index 7c5484bfa7cb..74dabc168924 100644
--- a/include/tvm/node/container.h
+++ b/include/tvm/node/container.h
@@ -1104,7 +1104,7 @@ class DenseMapNode : public MapNode {
   friend class MapNode;
 };
 
-#define _TVM_DISPATCH_MAP(base, var, body) \
+#define TVM_DISPATCH_MAP(base, var, body) \
   { \
     using TSmall = SmallMapNode*; \
     using TDense = DenseMapNode*; \
@@ -1118,68 +1118,68 @@ class DenseMapNode : public MapNode {
     } \
   }
 
-#define _TVM_DISPATCH_MAP_CONST(base, var, body) \
-  { \
-    using TSmall = const SmallMapNode*; \
-    using TDense = const DenseMapNode*; \
-    uint64_t slots = base->slots_; \
-    if (slots <= SmallMapNode::kMaxSize) { \
-      TSmall var = static_cast<TSmall>(base); \
-      body; \
-    } else { \
-      TDense var = static_cast<TDense>(base); \
-      body; \
-    } \
+#define TVM_DISPATCH_MAP_CONST(base, var, body) \
+  { \
+    using TSmall = const SmallMapNode*; \
+    using TDense = const DenseMapNode*; \
+    uint64_t slots = base->slots_; \
+    if (slots <= SmallMapNode::kMaxSize) { \
+      TSmall var = static_cast<TSmall>(base); \
+      body; \
+    } else { \
+      TDense var = static_cast<TDense>(base); \
+      body; \
+    } \
   }
 
 inline MapNode::iterator::pointer MapNode::iterator::operator->() const {
-  _TVM_DISPATCH_MAP_CONST(self, p, { return p->DeRefItr(index); });
+  TVM_DISPATCH_MAP_CONST(self, p, { return p->DeRefItr(index); });
 }
 
 inline MapNode::iterator& MapNode::iterator::operator++() {
-  _TVM_DISPATCH_MAP_CONST(self, p, {
+  TVM_DISPATCH_MAP_CONST(self, p, {
     index = p->IncItr(index);
     return *this;
   });
 }
 
 inline MapNode::iterator& MapNode::iterator::operator--() {
-  _TVM_DISPATCH_MAP_CONST(self, p, {
+  TVM_DISPATCH_MAP_CONST(self, p, {
     index = p->IncItr(index);
     return *this;
   });
 }
 
 inline size_t MapNode::count(const key_type& key) const {
-  _TVM_DISPATCH_MAP_CONST(this, p, { return p->count(key); });
+  TVM_DISPATCH_MAP_CONST(this, p, { return p->count(key); });
 }
 
 inline const MapNode::mapped_type& MapNode::at(const MapNode::key_type& key) const {
-  _TVM_DISPATCH_MAP_CONST(this, p, { return p->at(key); });
+  TVM_DISPATCH_MAP_CONST(this, p, { return p->at(key); });
 }
 
 inline MapNode::mapped_type& MapNode::at(const MapNode::key_type& key) {
-  _TVM_DISPATCH_MAP(this, p, { return p->at(key); });
+  TVM_DISPATCH_MAP(this, p, { return p->at(key); });
 }
 
 inline MapNode::iterator MapNode::begin() const {
-  _TVM_DISPATCH_MAP_CONST(this, p, { return p->begin(); });
+  TVM_DISPATCH_MAP_CONST(this, p, { return p->begin(); });
 }
 
 inline MapNode::iterator MapNode::end() const {
-  _TVM_DISPATCH_MAP_CONST(this, p, { return p->end(); });
+  TVM_DISPATCH_MAP_CONST(this, p, { return p->end(); });
 }
 
 inline MapNode::iterator MapNode::find(const MapNode::key_type& key) const {
-  _TVM_DISPATCH_MAP_CONST(this, p, { return p->find(key); });
+  TVM_DISPATCH_MAP_CONST(this, p, { return p->find(key); });
 }
 
 inline void MapNode::erase(const MapNode::iterator& position) {
-  _TVM_DISPATCH_MAP(this, p, { return p->erase(position); });
+  TVM_DISPATCH_MAP(this, p, { return p->erase(position); });
 }
 
-#undef _TVM_DISPATCH_MAP
-#undef _TVM_DISPATCH_MAP_CONST
+#undef TVM_DISPATCH_MAP
+#undef TVM_DISPATCH_MAP_CONST
 
 inline ObjectPtr<Object> MapNode::Empty() { return SmallMapNode::Empty(); }
diff --git a/include/tvm/relay/analysis.h b/include/tvm/relay/analysis.h
index 00da9408408b..b2e7c500eddd 100644
--- a/include/tvm/relay/analysis.h
+++ b/include/tvm/relay/analysis.h
@@ -80,7 +80,7 @@ TVM_DLL bool BasicBlockNormalFormCheck(const Expr& e);
 *
  * For example, the expression `let x = 1 in let x = 2 in 3` bound x twice.
 *
- * `let f = (\x -> x) in let g = (\x -> x + 1) in f(g(2))` also bound x twice,
+ * `let f = (x -> x) in let g = (x -> x + 1) in f(g(2))` also bound x twice,
  * although x is not shadowed.
 *
  * \param expr the expression to check.
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index 6bfdb492fed0..fbe31a305ea5 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -596,11 +596,13 @@ struct Conv2DTransposeAttrs : public tvm::AttrsNode<Conv2DTransposeAttrs> {
 
 /*! \brief Attributes used in dilate operator */
 struct DilateAttrs : public tvm::AttrsNode<DilateAttrs> {
   Array<IndexExpr> strides;
+  double dilation_value;
 
   TVM_DECLARE_ATTRS(DilateAttrs, "relay.attrs.DilateAttrs") {
     TVM_ATTR_FIELD(strides)
         .set_default(Array<IndexExpr>({1, 1}))
         .describe("Dilation stride on each dimension, 1 means no dilation.");
+    TVM_ATTR_FIELD(dilation_value).set_default(0.0).describe("Value used to dilate the input.");
   }
 };
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index a03e15a5a836..683f5a28b4f4 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -381,6 +381,25 @@ struct OneHotAttrs : public tvm::AttrsNode<OneHotAttrs> {
   }
 };  // struct OneHotAttrs
 
+/*! \brief Attributes used in matrix_set_diag operator */
+struct MatrixSetDiagAttrs : public tvm::AttrsNode<MatrixSetDiagAttrs> {
+  int k1;
+  int k2;
+  bool super_diag_right_align;
+  bool sub_diag_right_align;
+
+  TVM_DECLARE_ATTRS(MatrixSetDiagAttrs, "relay.attrs.MatrixSetDiagAttrs") {
+    TVM_ATTR_FIELD(k1).set_default(0).describe("Lower limit (included) of the range of diagonals.");
+    TVM_ATTR_FIELD(k2).set_default(0).describe("Upper limit (included) of the range of diagonals.");
+    TVM_ATTR_FIELD(super_diag_right_align)
+        .set_default(true)
+        .describe("Bool, true iff super-diagonal is right aligned (left-padded).");
+    TVM_ATTR_FIELD(sub_diag_right_align)
+        .set_default(false)
+        .describe("Bool, true iff sub-diagonal is right aligned (left-padded).");
+  }
+};  // struct MatrixSetDiagAttrs
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_TRANSFORM_H_
diff --git a/include/tvm/topi/nn/dilate.h b/include/tvm/topi/nn/dilate.h
index a021402e097c..9b5a8047740e 100644
--- a/include/tvm/topi/nn/dilate.h
+++ b/include/tvm/topi/nn/dilate.h
@@ -55,19 +55,20 @@ PrimExpr all(Array<PrimExpr> args) {
 }
 
 /*!
- * \brief Dilate data with zeros
+ * \brief Dilate data with given dilation value (0 by default).
 *
  * \param x The input tensor, this can have any number of
  *          dimensions and any layout.
  * \param strides Dilation stride for each dimension. Stride 1
  *                means no dilation.
+ * \param dilation_value Value used to dilate the input.
  * \param name The name of the operation
  * \param tag The tag to mark the operation
 *
  * \return The output tensor.
  */
-inline Tensor dilate(const Tensor& x, Array<PrimExpr> strides, std::string name = "tensor",
-                     std::string tag = kInjective) {
+inline Tensor dilate(const Tensor& x, Array<PrimExpr> strides, double dilation_value,
+                     std::string name = "tensor", std::string tag = kInjective) {
   auto n = x->shape.size();
   CHECK_EQ(n, strides.size()) << "strides size (" << strides.size()
                               << ") must match dimension of x (" << n << ")";
@@ -94,7 +95,8 @@ inline Tensor dilate(const Tensor& x, Array<PrimExpr> strides, std::string name
         if (not_zero.size() > 0) {
           auto all_not_zero = all(not_zero);
-          return tvm::if_then_else(all_not_zero, x(index_tuple), make_const(x->dtype, 0));
+          return tvm::if_then_else(all_not_zero, x(index_tuple),
+                                   make_const(x->dtype, dilation_value));
         }
         return x(index_tuple);
       },
diff --git a/include/tvm/topi/nn/pooling.h b/include/tvm/topi/nn/pooling.h
index b6852ffd01c3..935d399a6604 100644
--- a/include/tvm/topi/nn/pooling.h
+++ b/include/tvm/topi/nn/pooling.h
@@ -336,7 +336,9 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x,
 
 inline bool find_depth_height_width(const std::string& layout, int* depth_axis, int* height_axis,
                                     int* width_axis) {
-  *depth_axis = -1, *height_axis = -1, *width_axis = -1;
+  *depth_axis = -1;
+  *height_axis = -1;
+  *width_axis = -1;
   int curr_idx = 0;
   for (size_t i = 0; i < layout.size(); ++i) {
     if ((layout[i] >= 'A' && layout[i] <= 'Z') || (layout[i] >= 'a' && layout[i] <= 'z')) {
diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h
index 2c0d102e35b1..e01eb703cb99 100644
--- a/include/tvm/topi/transform.h
+++ b/include/tvm/topi/transform.h
@@ -1524,29 +1524,60 @@ inline Tensor sparse_to_dense(const Tensor& sparse_indices, const Array
 }
 
 /*!
- * \brief Returns a tensor with the diagonal of input tensor replaced with the provided diagonal.
+ * \brief Returns a tensor with the diagonal of input tensor replaced with the provided diagonals.
  * \param input input tensor.
- * \param diagonal values to be filled in the diagonal.
+ * \param diagonal values to be filled in the diagonals.
+ * \param k1 lower limit (included) of the range of diagonals.
+ * \param k2 upper limit (included) of the range of diagonals.
+ * \param super_diag_right_align bool, true iff super-diagonal is right aligned (left-padded).
+ * \param sub_diag_right_align bool, true iff sub-diagonal is right aligned (left-padded).
  * \param name output tensor name.
 * \param tag output tensor tag.
  * \return new tensor with given diagonal values.
 */
-inline Tensor matrix_set_diag(const Tensor& input, const Tensor& diagonal,
+inline Tensor matrix_set_diag(const Tensor& input, const Tensor& diagonal, int k1, int k2,
+                              bool super_diag_right_align, bool sub_diag_right_align,
                               const std::string name = "T_matrix_set_diag",
                               const std::string tag = kInjective) {
   size_t ndim = input->shape.size() - 1;
 
+  bool only_one_diagonal = k1 == k2;
+
   return compute(
       input->shape,
       [&](const Array<Var>& iter_vars) {
         auto get_diag = [&]() {
           Array<PrimExpr> diagonal_indices;
-          for (size_t i = 0; i < ndim; i++) {
+          PrimExpr k, offset = 0;
+          for (size_t i = 0; i < ndim - 1; i++) {
             diagonal_indices.push_back(iter_vars[i]);
           }
+          if (only_one_diagonal) {
+            k = k1;
+          } else {
+            // Determining which diagonal/sub-diagonal/super-diagonal it is
+            k = iter_vars[ndim] - iter_vars[ndim - 1];
+            diagonal_indices.push_back(k2 - k);
+
+            // Calculating the offset in diagonal tensor for this diagonal
+            auto get_offset = [&](PrimExpr M, PrimExpr N) {
+              // offset = max_diagonal_length - diagonal_length
+              return diagonal->shape[diagonal->shape.size() - 1] - if_then_else(M < N, M, N);
+            };
+            offset = if_then_else(
+                k >= 0,
+                super_diag_right_align ? get_offset(input->shape[ndim] - k, input->shape[ndim - 1])
+                                       : 0,
+                sub_diag_right_align ? get_offset(input->shape[ndim], input->shape[ndim - 1] + k)
+                                     : 0);
+          }
+          diagonal_indices.push_back(if_then_else(k >= 0, iter_vars[ndim - 1], iter_vars[ndim]) +
+                                     offset);
           return diagonal(diagonal_indices);
         };
-        return if_then_else((PrimExpr)iter_vars[ndim] == iter_vars[ndim - 1], get_diag(),
+        return if_then_else((PrimExpr)iter_vars[ndim] - iter_vars[ndim - 1] >= k1,
+                            if_then_else((PrimExpr)iter_vars[ndim] - iter_vars[ndim - 1] <= k2,
+                                         get_diag(), input(iter_vars)),
                             input(iter_vars));
       },
       name, tag);
diff --git a/pyproject.toml b/pyproject.toml
index 2ea63215baee..8cf53c927c3d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,9 +43,6 @@ exclude = '''
   | src\/
   | vta\/
   | web\/
-  )/|python/topi/python/test_topi_pooling.py
-  |python/tvm/topi/testing/pool3d_python.py
-  |tests/lint/add_asf_header.py
-  |tests/lint/check_file_type.py
+  )/
 )
 '''
diff --git a/python/tvm/auto_scheduler/auto_schedule.py b/python/tvm/auto_scheduler/auto_schedule.py
index eae8b2527b6e..d8763dbc78d9 100644
--- a/python/tvm/auto_scheduler/auto_schedule.py
+++ b/python/tvm/auto_scheduler/auto_schedule.py
@@ -199,7 +199,7 @@ def auto_schedule(task, search_policy=None, tuning_options=TuningOptions()):
 
     Returns
     -------
-        A `te.schedule` and the a list of `te.Tensor` to be used in `tvm.lower` or `tvm.build`.
+        A `te.Schedule` and a list of `te.Tensor` to be used in `tvm.lower` or `tvm.build`.
     """
     if not isinstance(task, SearchTask):
         raise ValueError(
diff --git a/python/tvm/auto_scheduler/compute_dag.py b/python/tvm/auto_scheduler/compute_dag.py
index 68883a0d06ff..0115dbcf8ebe 100755
--- a/python/tvm/auto_scheduler/compute_dag.py
+++ b/python/tvm/auto_scheduler/compute_dag.py
@@ -39,16 +39,16 @@ class ComputeDAG(Object):
     subgraph) to a ComputeDAG. It keeps the input/output tensors, all operations in the DAG, and
     some static analysis results for the DAG (e.g. the total float operation count,
     consumer/producer relations of operations, whether an operation stage should
-    be tiled/compute inlined ...).
+    be tiled/compute inlined).
     These analyses can help the search policy to make decisions during the search.
     ComputeDAG is also responsible for the interaction between auto-scheduler's `LoopState` and
     TVM schedule (e.g. applying the `LoopState` transform steps to a TVM schedule, providing
-    `LoopState` with extra information got from TVM schedule ...).
+    `LoopState` with extra information got from TVM schedule).
 
     Parameters
     ----------
     compute : Union[List[Tensor], str]
-        `Tensor`s or workload key for a compute declaration.
+        Input/output tensors or workload key for a compute declaration.
     """
 
     def __init__(self, compute):
diff --git a/python/tvm/auto_scheduler/cost_model/cost_model.py b/python/tvm/auto_scheduler/cost_model/cost_model.py
index 83c8463210a6..32e276b31c6a 100644
--- a/python/tvm/auto_scheduler/cost_model/cost_model.py
+++ b/python/tvm/auto_scheduler/cost_model/cost_model.py
@@ -41,9 +41,9 @@ def update(self, inputs, results):
 
         Parameters
         ----------
-        inputs : List[MeasureInput]
+        inputs : List[auto_scheduler.measure.MeasureInput]
            The measurement inputs
-        results : List[MeasureResult]
+        results : List[auto_scheduler.measure.MeasureResult]
            The measurement results
         """
         _ffi_api.CostModelUpdate(self, inputs, results)
@@ -112,9 +112,9 @@ def update(self, inputs, results):
 
        Parameters
        ----------
-        inputs : List[MeasureInput]
+        inputs : List[auto_scheduler.measure.MeasureInput]
            The measurement inputs
-        results : List[MeasureResult]
+        results : List[auto_scheduler.measure.MeasureResult]
            The measurement results
        """
        raise NotImplementedError
diff --git a/python/tvm/auto_scheduler/cost_model/xgb_model.py b/python/tvm/auto_scheduler/cost_model/xgb_model.py
index 15193684b09c..3eb64df693f2 100644
--- a/python/tvm/auto_scheduler/cost_model/xgb_model.py
+++ b/python/tvm/auto_scheduler/cost_model/xgb_model.py
@@ -197,22 +197,26 @@ def predict(self, task, states):
 
     def predict_stages(self, task, states):
         """Predict the scores of all stages in states. This is the breakdown version of `predict`.
+
         Parameters
         ----------
         search_task : SearchTask
             The search task of states
         statse : List[State]
             The input states
+
         Returns
         -------
         scores: List[float]
             The predicted scores for all stages in all states in the packed format
+
         Note
         ----
         For faster data copy between c++ and python, the python part returns scores in a
         single flatten array using a packed format. The c++ part then unpacks the flatten array.

         The packed format is:
         {
+
            float scores[N];                 // scores[i] is the score for states[i].
            int   n_stage_0;                 // the number of stages in states[0]
            float stage_scores_0[[n_stage_0] // the scores for all stages in states[0]
            ...
            int   n_stage_i;                 // the number of stages in states[i]
            float stage_scores_1[n_stage_i]; // the scores for all stages in states[i]
            ...  // untill i == N - 1
+
         }
         To implement this format, we also store int as float, so we can store all numbers
         into a single float array.
diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py
index c57b39bb1d10..7648ebe92a65 100644
--- a/python/tvm/auto_scheduler/measure.py
+++ b/python/tvm/auto_scheduler/measure.py
@@ -25,8 +25,8 @@
 A builder builds the executable binary files and a runner runs the binary
 files to get the measurement results. The flow of data structures is
 
-                  `ProgramBuilder`                 `ProgramRunner`
-`MeasureInput` -----------------> `BuildResult` ----------------> `MeasureResult`
+  .               `ProgramBuilder`                 `ProgramRunner`
+  `MeasureInput` -----------------> `BuildResult` ----------------> `MeasureResult`
 
 We implement these in python to utilize python's multiprocessing and error handling.
 """
@@ -222,7 +222,7 @@ class LocalRunner(ProgramRunner):
         where the first "1" is warm up and will be discarded.
         The returned result contains `repeat` costs,
         each of which is an average of `number` costs.
-    min_repeat_ms : int = 0
+    min_repeat_ms : int = 100
         The minimum duration of one `repeat` in milliseconds.
         By default, one `repeat` contains `number` runs. If this parameter is set,
         the parameters `number` will be dynamically adjusted to meet the
@@ -244,7 +244,7 @@ def __init__(
         timeout=10,
         number=3,
         repeat=1,
-        min_repeat_ms=0,
+        min_repeat_ms=100,
         cooldown_interval=0.0,
         enable_cpu_cache_flush=False,
     ):
@@ -289,7 +289,7 @@ class RPCRunner(ProgramRunner):
         where the first "1" is warm up and will be discarded.
         The returned result contains `repeat` costs,
         each of which is an average of `number` costs.
-    min_repeat_ms : int = 0
+    min_repeat_ms : int = 100
         The minimum duration of one `repeat` in milliseconds.
         By default, one `repeat` contains `number` runs. If this parameter is set,
         the parameters `number` will be dynamically adjusted to meet the
@@ -316,7 +316,7 @@ def __init__(
         timeout=10,
         number=3,
         repeat=1,
-        min_repeat_ms=0,
+        min_repeat_ms=100,
         cooldown_interval=0.0,
         enable_cpu_cache_flush=False,
     ):
diff --git a/python/tvm/auto_scheduler/measure_record.py b/python/tvm/auto_scheduler/measure_record.py
index 0185d04e017c..c7ae19678a30 100644
--- a/python/tvm/auto_scheduler/measure_record.py
+++ b/python/tvm/auto_scheduler/measure_record.py
@@ -66,9 +66,9 @@ def read_lines(self, max_lines=None, skip_lines=0):
 
         Returns
         -------
-        inputs : List[MeasureInput]
+        inputs : List[auto_scheduler.measure.MeasureInput]
             The MeasureInputs loaded from the log file.
-        results : List[MeasureResult]
+        results : List[auto_scheduler.measure.MeasureResult]
             The MeasureResults loaded from the log file.
         """
         inputs, results = _ffi_api.RecordReaderReadLines(
@@ -95,7 +95,7 @@ def load_records(filename):
 
     Returns
     -------
-    logs : List[MeasureInput, MeasureResult]
+    logs : List[auto_scheduler.measure.MeasureInput, auto_scheduler.measure.MeasureResult]
     """
     return zip(*RecordReader(filename).read_lines())
@@ -133,9 +133,9 @@ def load_best(filename, workload_key=None, target=None):
 
     Returns
     -------
-    input : MeasureInput
+    input : auto_scheduler.measure.MeasureInput
        The best State's MeasureInput from this log fine.
-    result : MeasureResult
+    result : auto_scheduler.measure.MeasureResult
        The best State's MeasureResult from this log fine.
    """
    log_reader = RecordReader(filename)
diff --git a/python/tvm/auto_scheduler/search_policy.py b/python/tvm/auto_scheduler/search_policy.py
index a9d323622277..bf7e2eb16ae0 100644
--- a/python/tvm/auto_scheduler/search_policy.py
+++ b/python/tvm/auto_scheduler/search_policy.py
@@ -91,7 +91,7 @@ class SketchPolicy(SearchPolicy):
     ----------
     task : SearchTask
         The SearchTask for the computation declaration.
-    schedule_cost_model : CostModel = RandomModel()
+    program_cost_model : CostModel = RandomModel()
         The cost model to estimate the complete schedules.
     params : Optional[Dict[str, Any]]
         Parameters of the search policy.
@@ -105,9 +105,11 @@ class SketchPolicy(SearchPolicy):
         Callback functions called before the search process, usually used to do extra
         initializations.
         Possible callbacks:
-            - auto_scheduler.PreloadMeasuredStates
-            - auto_scheduler.PreloadCustomSketchRule
-        TODO(jcf94): Add these search callback implementations.
+
+        - auto_scheduler.PreloadMeasuredStates
+        - auto_scheduler.PreloadCustomSketchRule
+
+        TODO(jcf94): Add these search callback implementations.
     """
 
     DEFAULT_PARAMS = {
@@ -129,7 +131,7 @@ class SketchPolicy(SearchPolicy):
     def __init__(
         self,
         task,
-        schedule_cost_model=RandomModel(),
+        program_cost_model=RandomModel(),
         params=None,
         seed=None,
         verbose=1,
@@ -145,7 +147,7 @@ def __init__(
         self.__init_handle_by_constructor__(
             _ffi_api.SketchPolicy,
             task,
-            schedule_cost_model,
+            program_cost_model,
             params,
             seed or random.randint(1, 1 << 30),
             verbose,
diff --git a/python/tvm/auto_scheduler/workload_registry.py b/python/tvm/auto_scheduler/workload_registry.py
index f0c839800d70..1d9ee6da4f7a 100644
--- a/python/tvm/auto_scheduler/workload_registry.py
+++ b/python/tvm/auto_scheduler/workload_registry.py
@@ -55,13 +55,15 @@ def register_workload(func_name, f=None, override=False):
 
     Examples
     --------
-    @auto_scheduler.register_workload
-    def matmul(N, M, K):
-        A = te.placeholder((N, K), name='A')
-        B = te.placeholder((K, M), name='B')
-        k = te.reduce_axis((0, K), name='k')
-        C = te.compute((N, M), lambda i, j: tvm.sum(A[i][k] * B[k][j], axis=[k]), name='C')
-        return [A, B, C]
+    .. code-block:: python
+
+      @auto_scheduler.register_workload
+      def matmul(N, M, K):
+          A = te.placeholder((N, K), name='A')
+          B = te.placeholder((K, M), name='B')
+          k = te.reduce_axis((0, K), name='k')
+          C = te.compute((N, M), lambda i, j: tvm.sum(A[i][k] * B[k][j], axis=[k]), name='C')
+          return [A, B, C]
     """
     global WORKLOAD_FUNC_REGISTRY
diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py
index 6650f500a996..5a4e26b6d2ee 100644
--- a/python/tvm/autotvm/record.py
+++ b/python/tvm/autotvm/record.py
@@ -57,7 +57,7 @@ def measure_str_key(inp, include_config=True):
 
     Parameters
     ----------
-    inp: MeasureInput
+    inp: autotvm.measure.MeasureInput
         input for the measure
     include_config: bool, optional
         whether includes config in the str key
@@ -78,8 +78,8 @@ def encode(inp, result, protocol="json"):
 
     Parameters
     ----------
-    inp: autotvm.tuner.MeasureInput
-    result: autotvm.tuner.MeasureResult
+    inp: autotvm.measure.MeasureInput
+    result: autotvm.measure.MeasureResult
         pair of input/result
     protocol: str
         log protocol, json or pickle
@@ -135,7 +135,7 @@ def decode(row, protocol="json"):
 
     Returns
     -------
-    ret : tuple(autotvm.tuner.MeasureInput, autotvm.tuner.MeasureResult), or None
+    ret : tuple(autotvm.measure.MeasureInput, autotvm.measure.MeasureResult), or None
         The tuple of input and result, or None if input uses old version log format.
""" # pylint: disable=unused-variable @@ -204,8 +204,8 @@ def load_from_file(filename): Yields ------ - input: autotvm.tuner.MeasureInput - result: autotvm.tuner.MeasureResult + input: autotvm.measure.MeasureInput + result: autotvm.measure.MeasureResult """ for row in open(filename): if row and not row.startswith("#"): diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py index bfc49d50350d..bed02581270e 100644 --- a/python/tvm/autotvm/task/dispatcher.py +++ b/python/tvm/autotvm/task/dispatcher.py @@ -184,7 +184,7 @@ class ApplyHistoryBest(DispatchContext): Parameters ---------- - records : str or iterator of (MeasureInput, MeasureResult) + records : str or iterator of (autotvm.measure.MeasureInput, autotvm.measure.MeasureResult) Collection of tuning records. If is str, then it should be the filename of a records log file. Each row of this file is an encoded record pair. Otherwise, it is an iterator. @@ -205,7 +205,7 @@ def load(self, records): Parameters ---------- - records : str or iterator of (MeasureInput, MeasureResult) + records : str or iterator of (autotvm.measure.MeasureInput, autotvm.measure.MeasureResult) Collection of tuning records. If is str, then it should be the filename of a records log file. Each row of this file is an encoded record pair. Otherwise, it is an iterator. @@ -382,7 +382,7 @@ def __init__(self, records): """ Parameters ---------- - records : str or iterator of (MeasureInput, MeasureResult) + records : str or iterator of (autotvm.measure.MeasureInput, autotvm.measure.MeasureResult) Collection of tuning records. If is str, then it should be the filename of a records log file. Each row of this file is an encoded record pair. diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py index d700b64e5fd5..fb8cf57ed7c7 100644 --- a/python/tvm/autotvm/task/space.py +++ b/python/tvm/autotvm/task/space.py @@ -1078,7 +1078,7 @@ def fallback_with_reference_log(self, ref_log): Parameters ---------- - ref_log: List of (MeasureInput, MeasureResult) + ref_log: List of (autotvm.measure.MeasureInput, autotvm.measure.MeasureResult) The reference log """ knob_names = [x for x in self.space_map.keys() if isinstance(self.space_map[x], SplitSpace)] diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py index cbfe97328d67..9864ba005197 100644 --- a/python/tvm/autotvm/tuner/tuner.py +++ b/python/tvm/autotvm/tuner/tuner.py @@ -188,7 +188,7 @@ def load_history(self, data_set): Parameters ---------- - data_set: Array of (MeasureInput, MeasureResult) pair + data_set: Array of (autotvm.measure.MeasureInput, autotvm.measure.MeasureResult) pair Previous tuning records """ raise NotImplementedError() diff --git a/python/tvm/driver/tvmc/frontends.py b/python/tvm/driver/tvmc/frontends.py index 6275f779f778..0ed88216738b 100644 --- a/python/tvm/driver/tvmc/frontends.py +++ b/python/tvm/driver/tvmc/frontends.py @@ -154,6 +154,7 @@ def load(self, path): # pylint: disable=C0415 import onnx + # pylint: disable=E1101 model = onnx.load(path) # pylint: disable=E1101 diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py index 084f46716e15..3565040e1d76 100644 --- a/python/tvm/micro/session.py +++ b/python/tvm/micro/session.py @@ -22,10 +22,14 @@ from .._ffi import get_global_func from ..contrib import graph_runtime -from .base import _rpc_connect from ..rpc import RPCSession from .transport import TransportLogger +try: + from .base import _rpc_connect +except ImportError: + raise ImportError("micro 
tvm is not enabled. Set USE_MICRO to ON in config.cmake") + class Session: """MicroTVM Device Session diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py index e4d605aa4560..027d6bd76141 100644 --- a/python/tvm/relay/frontend/common.py +++ b/python/tvm/relay/frontend/common.py @@ -563,6 +563,23 @@ def infer_value_simulated(input_val, params): return output_value +def try_infer_value(val, on_success=None, on_failure=None): + """Try running infer_value on the input val, and if successful, return the inferred value or + pass it to on_success callback if provided. Otherwise, run on_failure callback if it is + provided, or return the input val as output. In each case, the second return value + indicates whether infer_value has succeeded or not. + """ + try: + ret = infer_value(val, {}).asnumpy() + if on_success: + return on_success(ret), True + return ret, True + except Exception: + if on_failure: + return on_failure(), False + return val, False + + def new_var(name_hint, type_annotation=None, shape=None, dtype="float32"): return _expr.var(name_hint, type_annotation, shape, dtype) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 467dd4fce2af..40ad44961405 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -748,12 +748,13 @@ class Prelu(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): assert len(inputs) == 2, "Prelu need 2 inputs, {} given".format(len(inputs)) + input_channels = infer_shape(inputs[0])[1] alpha_shape = infer_shape(inputs[1]) if len(alpha_shape) != 1: alpha = _op.reshape(inputs[1], (-1,)) else: alpha = inputs[1] - return _op.nn.prelu(inputs[0], alpha) + return _op.nn.prelu(inputs[0], _op.broadcast_to(alpha, [input_channels])) class Reciprocal(OnnxOpConverter): diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 9ceb9fc66ec4..063158f35dc3 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -16,7 +16,7 @@ # under the License. 
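The try_infer_value helper added to common.py above centralizes the try/except pattern around infer_value that the PyTorch frontend below previously repeated inline. A small, illustrative check of its contract:

    import numpy as np
    from tvm import relay
    from tvm.relay.frontend.common import try_infer_value

    # A foldable expression: infer_value succeeds and on_success converts the result.
    dim, ok = try_infer_value(relay.const(np.int64(4)), on_success=lambda arr: int(arr))
    assert ok and dim == 4

    # A free variable cannot be folded; the expression comes back unchanged with ok == False.
    x = relay.var("x", shape=(), dtype="int64")
    expr, ok = try_infer_value(x)
    assert not ok and expr is x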
# pylint: disable=import-self, too-many-lines, len-as-condition, no-else-return, unused-variable, too-many-nested-blocks # pylint: disable=consider-iterating-dictionary, invalid-name, unused-argument, unused-variable, broad-except -# pylint: disable=import-outside-toplevel, simplifiable-if-expression, unnecessary-comprehension +# pylint: disable=import-outside-toplevel, simplifiable-if-expression, cell-var-from-loop, unnecessary-lambda """PT: PyTorch frontend.""" import itertools import logging @@ -36,6 +36,7 @@ from .common import AttrCvt, get_relay_op from .common import infer_shape as _infer_shape from .common import infer_value as _infer_value +from .common import try_infer_value from .common import infer_value_simulated as _infer_value_simulated from .common import infer_type as _infer_type from ..prelude import Prelude, StaticTensorArrayOps @@ -185,11 +186,8 @@ def _impl(inputs, input_types): def _get_value(val, dtype): # dtype is a tvm dtype if isinstance(val, _expr.Expr): - try: - ret = _infer_value(_op.cast(val, dtype), {}).asnumpy() - ret = _expr.const(ret, dtype) - except Exception: - ret = _op.cast(val, dtype) + inp = _op.cast(val, dtype) + ret, _ = try_infer_value(inp, lambda ret: _expr.const(ret, dtype)) else: ret = _create_typed_const(val, dtype) return ret @@ -305,10 +303,7 @@ def _impl(inputs, input_types): dim = int(inputs[1]) stride = int(inputs[4]) if isinstance(inputs[2], _expr.Call): - try: - begin[dim] = np.asscalar(_infer_value(inputs[2], {}).asnumpy().astype(np.int)) - except Exception: - begin[dim] = inputs[2] + begin[dim], _ = try_infer_value(inputs[2], lambda ret: np.asscalar(ret.astype(np.int))) else: begin[dim] = int(inputs[2]) @@ -329,10 +324,9 @@ def _impl(inputs, input_types): target_end = int(inputs[3]) else: if isinstance(inputs[3], _expr.Expr): - try: - target_end = np.asscalar(_infer_value(inputs[3], {}).asnumpy().astype(np.int)) - except Exception: - target_end = inputs[3] + target_end, _ = try_infer_value( + inputs[3], lambda ret: np.asscalar(ret.astype(np.int)) + ) else: target_end = inputs[3] @@ -457,10 +451,7 @@ def _impl(inputs, input_types): sort = bool(inputs[4]) if isinstance(inputs[1], _expr.Expr): - try: - k = _infer_value(inputs[1], {}).asnumpy().tolist() - except Exception: - k = inputs[1] + k, _ = try_infer_value(inputs[1], lambda ret: ret.tolist()) else: k = inputs[1] @@ -546,15 +537,15 @@ def _full_impl(data, fill_value, dtype): size.append(dim) new_shape.append(dim) else: - try: - dim = int(_infer_value(dim, {}).asnumpy()) + dim, success = try_infer_value(dim, lambda ret: int(ret), lambda: 0) + new_shape.append(dim) + + if success: if isinstance(size, list): size.append(dim) - new_shape.append(dim) - except Exception: + else: size = None need_reshape = True - new_shape.append(0) else: if isinstance(size, list): size.append(dim) @@ -1346,12 +1337,11 @@ def _impl(inputs, input_types): if isinstance(s, _expr.Constant): tmp_shape.append(int(s.data.asnumpy())) elif isinstance(s, _expr.Expr): - try: - dim = int(_infer_value(s, {}).asnumpy()) - tmp_shape.append(dim) - except Exception: + dim, success = try_infer_value(s, lambda ret: int(ret)) + tmp_shape.append(dim) + + if not success: is_dyn = True - tmp_shape.append(s) else: tmp_shape.append(s) @@ -1822,13 +1812,14 @@ def _impl(inputs, input_types): # 6 means dtype = float # this happens when converting upsampling with scale factor cast_map = { + 5: "float16", 6: "float32", 7: "float64", 3: "int32", 4: "int64", } - cast_func = {6: float, 7: float, 3: int, 4: int} + cast_func = {5: float, 6: 
float, 7: float, 3: int, 4: int} ret = data if isinstance(data, _expr.Expr): @@ -2312,13 +2303,15 @@ def _impl(inputs, input_types): if isinstance(inputs[1], _expr.Expr): out_size = inputs[1] elif isinstance(inputs[1], list): - try: - infer_res = [_infer_value(size, {}) for size in inputs[1]] - out_size = [np.asscalar(res.asnumpy().astype(np.int)) for res in infer_res] - except Exception: - h = _op.expand_dims(inputs[1][0], axis=0) - w = _op.expand_dims(inputs[1][1], axis=0) - out_size = _op.concatenate([h, w], axis=0) + out_size = [] + for i in [0, 1]: + size, _ = try_infer_value( + inputs[1][i], + lambda ret: ret.astype(np.int), + lambda: _op.expand_dims(inputs[1][i], axis=0), + ) + out_size.append(size) + out_size = _op.concatenate(out_size, axis=0) data = inputs[0] align_corners = inputs[4] @@ -2429,21 +2422,21 @@ def _convert_data_type(input_type, default_dtype=None): return default_dtype input_type = input_type.lower() - if input_type in ["double", "torch.float64"]: + if input_type in ["double", "float64", "torch.float64"]: return "float64" - elif input_type in ["float", "torch.float32"]: + elif input_type in ["float", "float32", "torch.float32"]: return "float32" - elif input_type in ["half", "torch.float16"]: + elif input_type in ["half", "float16", "torch.float16"]: return "float16" - elif input_type in ["long", "torch.int64"]: + elif input_type in ["long", "int64", "torch.int64"]: return "int64" - elif input_type in ["int", "torch.int32"]: + elif input_type in ["int", "int32", "torch.int32"]: return "int32" - elif input_type in ["short", "torch.int16"]: + elif input_type in ["short", "int16", "torch.int16"]: return "int16" - elif input_type in ["char", "torch.int8"]: + elif input_type in ["char", "int8", "torch.int8"]: return "int8" - elif input_type in ["byte", "torch.uint8"]: + elif input_type in ["byte", "uint8", "torch.uint8"]: return "uint8" elif input_type in ["quint8", "torch.quint8"]: return "quint8" @@ -2851,7 +2844,7 @@ def _get_operator_nodes(nodes): return ops -def _get_relay_input_vars(graph, input_shapes, prelude, is_module=True, default_dtype="float32"): +def _get_relay_input_vars(graph, input_infos, prelude, is_module=True, default_dtype="float32"): """ Return Relay vars from input shapes and create entries based on expected graph inputs - to allow translation @@ -2862,17 +2855,17 @@ def _get_relay_input_vars(graph, input_shapes, prelude, is_module=True, default_ # a module has "self" as first input, which we do not need/want graph_inputs = graph_inputs[1:] - if not isinstance(input_shapes, list): - msg = "Graph inputs input_shapes should be a list" + if not isinstance(input_infos, list): + msg = "Graph inputs input_infos should be a list" raise RuntimeError(msg) - if len(graph_inputs) != len(input_shapes): - msg = "PyTorch has {} inputs and input_shapes lists {}.".format( - len(graph_inputs), len(input_shapes) + if len(graph_inputs) != len(input_infos): + msg = "PyTorch has {} inputs and input_infos lists {}.".format( + len(graph_inputs), len(input_infos) ) raise RuntimeError(msg) - def get_relay_ty(ishape, pt_type): + def get_relay_ty(ishape, itype, pt_type): if pt_type.kind() == "TensorType": if not (_is_int_seq(ishape) or len(ishape) == 0): msg = "Shape for Tensors must be lists of ints" @@ -2884,6 +2877,8 @@ def get_relay_ty(ishape, pt_type): msg = "Shapes of input list and information in the graph do not match" raise RuntimeError(msg) pt_dtype = pt_type.scalarType() + if not pt_dtype and itype: + pt_dtype = itype dtype = _convert_data_type(pt_dtype, 
default_dtype=default_dtype) return TensorType(ishape, dtype) elif pt_type.kind() == "TupleType": @@ -2891,37 +2886,45 @@ def get_relay_ty(ishape, pt_type): msg = "Shapes for tuples must be tuples" raise RuntimeError(msg) return TupleType( - [get_relay_ty(elem, pt_t) for elem, pt_t in zip(ishape, pt_type.elements())] + [get_relay_ty(elem, itype, pt_t) for elem, pt_t in zip(ishape, pt_type.elements())] ) elif pt_type.kind() == "ListType": if not isinstance(ishape, list): msg = "Shapes for lists must be lists" raise RuntimeError(msg) pt_elemtype = pt_type.getElementType() - elem_tys = [get_relay_ty(s, pt_elemtype) for s in ishape] + elem_tys = [get_relay_ty(s, itype, pt_elemtype) for s in ishape] if len(elem_tys) > 0 and not all(map(lambda ty: ty == elem_tys[0], elem_tys)): msg = "List elements need have identical types" raise RuntimeError(msg) return prelude.l(elem_tys[0]) elif pt_type.kind() == "OptionalType": # we do not support None yet, so we fill in the type - return get_relay_ty(ishape, pt_type.getElementType()) + return get_relay_ty(ishape, itype, pt_type.getElementType()) # TODO: scalar inputs raise NotImplementedError("unsupported input type") input_vars = {} - for num, inp in enumerate(input_shapes): + new_input_infos = [] + for num, inp in enumerate(input_infos): if not isinstance(inp, tuple): msg = "Graph input {} is not a tuple".format(num) raise RuntimeError(msg) if len(inp) != 2 or not isinstance(inp[0], str): - msg = "Graph input {} is not valid, expected ('name', shape)".format(inp) + msg = ( + "Graph input {} is not valid," + " expected ('name', shape) or ('name', (shape, dtype))".format(inp) + ) raise RuntimeError(msg) + if not isinstance(inp[1], tuple) or len(inp[1]) == 0 or not isinstance(inp[1][-1], str): + new_input_infos.append((inp[0], (inp[1], default_dtype))) + else: + new_input_infos.append(inp) input_types = [ - (name, get_relay_ty(shape, gi.type())) - for (name, shape), gi in zip(input_shapes, graph_inputs) + (name, get_relay_ty(info[0], info[1], gi.type())) + for (name, info), gi in zip(new_input_infos, graph_inputs) ] ir_inputs = [i.debugName() for i in graph_inputs] @@ -3252,7 +3255,7 @@ def get_all_op_names(graph): return set(node.kind() for node in nodes) -def from_pytorch(script_module, input_shapes, custom_convert_map=None, default_dtype="float32"): +def from_pytorch(script_module, input_infos, custom_convert_map=None, default_dtype="float32"): """Load PyTorch model in the form of a scripted PyTorch model and convert into relay. The companion parameters will be handled automatically. @@ -3262,10 +3265,15 @@ def from_pytorch(script_module, input_shapes, custom_convert_map=None, default_d TorchScripted PyTorch graph Note: We currently only support traces (ie: torch.jit.trace(model, input)) - input_shapes : List of tuples of input name and input dimensions - Graph level input shape list + input_infos: List of tuples of (input name, input shape) + or (input name, (input shape, input types)) + Graph level input shape and type list The same input names need to be used for deployment, so choose easy to remember names (such as: input0, input1) + e.g. 
+ [('input0', (1, 2)), ('input1', (3, 4))] + or + [('input0', ((1, 2), 'int')), ('input1', ((3, 4), 'float'))] custom_convert_map: Dictionary of str to Relay op A custom op conversion map in the same format as _convert_map above @@ -3297,7 +3305,7 @@ def from_pytorch(script_module, input_shapes, custom_convert_map=None, default_d is_module = isinstance(script_module, torch.jit.ScriptModule) params = script_module.state_dict() if is_module else {} outputs = _get_relay_input_vars( - graph, input_shapes, prelude, default_dtype=default_dtype, is_module=is_module + graph, input_infos, prelude, default_dtype=default_dtype, is_module=is_module ) param_vars, tensors, packed_param_map = convert_params(graph, params) tvm_params = {k: tvm.nd.array(v) for k, v in tensors.items()} diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py index 0c7df75489ae..77fdbbd4006c 100644 --- a/python/tvm/relay/op/contrib/arm_compute_lib.py +++ b/python/tvm/relay/op/contrib/arm_compute_lib.py @@ -337,3 +337,11 @@ def global_avg_pool2d(attrs, args): if attrs.layout != "NHWC": return False return True + + +@tvm.ir.register_op_attr("maximum", "target.arm_compute_lib") +def maximum(attrs, args): + """Check if the external ACL codegen for maximum should be used.""" + type_a = args[0].checked_type + type_b = args[1].checked_type + return (type_a.dtype == "float32") and (type_b.dtype == "float32") diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 086603550532..c83f6a943a31 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -668,7 +668,7 @@ def compute_cross_entropy(attrs, inputs, out_dtype): # dilate @reg.register_compute("nn.dilate") def compute_dilate(attrs, inputs, out_dtype): - return [topi.nn.dilate(inputs[0], attrs.strides)] + return [topi.nn.dilate(inputs[0], attrs.strides, attrs.dilation_value)] reg.register_broadcast_schedule("nn.dilate") diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 853cd4240b48..86a76ff28fa5 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1549,23 +1549,26 @@ def pad(data, pad_width, pad_value=0, pad_mode="constant"): return _make.pad(data, pad_width, pad_value, pad_mode) -def dilate(data, strides): - """Dilate data with zeros. +def dilate(data, strides, dilation_value=0.0): + """Dilate data with given dilation value (0 by default). Parameters ---------- data : tvm.relay.Expr n-D, can be any layout. - strides : + strides : tuple of <int> Dilation stride on each dimension, 1 means no dilation. + dilation_value : int/float, optional + Value used to dilate the input.
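To make dilation_value concrete: the holes inserted between elements take the given value instead of hard-coded zeros, and each output dimension becomes (n - 1) * stride + 1. A quick check against the numpy reference, which gains the same parameter later in this diff:

    import numpy as np
    from tvm.topi.testing import dilate_python

    x = np.array([[1.0, 2.0],
                  [3.0, 4.0]])
    y = dilate_python(x, (2, 2), dilation_value=7.0)  # (2, 2) -> (3, 3)
    # [[1. 7. 2.]
    #  [7. 7. 7.]
    #  [3. 7. 4.]]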
+ Returns ------- Output : tvm.relay.Expr The computed result """ - return _make.dilate(data, strides) + return _make.dilate(data, strides, dilation_value) def mirror_pad(data, pad_width, mode="SYMMETRIC"): diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 1833dfe2075d..54624ce35b55 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -204,6 +204,14 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), name="group_conv2d_nchw.generic", ) + elif layout == "NHWC": + assert kernel_layout == "HWIO" + logger.warning("group_conv2d with layout NHWC is not optimized for arm cpu.") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), + name="group_conv2d_nhwc.generic", + ) else: raise RuntimeError("Unsupported group_conv2d layout {} for arm cpu".format(layout)) return strategy diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 393854ea2536..56ae97652b79 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -256,6 +256,13 @@ def conv2d_strategy(attrs, inputs, out_type, target): wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), name="group_conv2d_nchw.generic", ) + elif layout == "NHWC": + assert kernel_layout == "HWIO" + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), + name="group_conv2d_nhwc.generic", + ) else: raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index bbe01f4feb6c..e2a82d396b22 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -175,6 +175,14 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), name="group_conv2d_nchw.generic", ) + elif layout == "NHWC": + assert kernel_layout == "HWIO" + logger.warning("group_conv2d is not optimized for x86.") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), + name="group_conv2d_nhwc.generic", + ) else: raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 8ccd14890581..14ac454aec64 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -1178,17 +1178,33 @@ def sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value=0 return _make.sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value) -def matrix_set_diag(data, diagonal): +def matrix_set_diag(data, diagonal, k=0, align="RIGHT_LEFT"): """ - Returns a tensor with the diagonal of input tensor replaced with the provided diagonal values. + Returns a tensor with the diagonals of input tensor replaced with the provided diagonal values. Parameters ---------- data : relay.Expr Input Tensor. + diagonal : relay.Expr Values to be filled in the diagonal. + k : int or tuple of int, optional + Diagonal Offset(s). The diagonal or range of diagonals to set. 
(0 by default) + Positive value means superdiagonal, 0 refers to the main diagonal, and + negative value means subdiagonals. k can be a single integer (for a single diagonal) + or a pair of integers specifying the low and high ends of a matrix band. + k[0] must not be larger than k[1]. + + align : string, optional + Some diagonals are shorter than max_diag_len and need to be padded. + align is a string specifying how superdiagonals and subdiagonals should be aligned, + respectively. There are four possible alignments: "RIGHT_LEFT" (default), "LEFT_RIGHT", + "LEFT_LEFT", and "RIGHT_RIGHT". "RIGHT_LEFT" aligns superdiagonals to the right + (left-pads the row) and subdiagonals to the left (right-pads the row). It is the packing + format LAPACK uses. cuSPARSE uses "LEFT_RIGHT", which is the opposite alignment. + Returns ------- result : relay.Expr @@ -1216,7 +1232,22 @@ def matrix_set_diag(data, diagonal): [7, 5, 7, 7], [7, 7, 6, 7]]] """ - return _make.matrix_set_diag(data, diagonal) + if isinstance(k, (tuple, list)): + k_one = k[0] + if len(k) >= 2: + k_two = k[1] + else: + k_two = k[0] + else: + k_one = k + k_two = k + + super_diag_right_align = align[:5] == "RIGHT" + sub_diag_right_align = align[-5:] == "RIGHT" + + return _make.matrix_set_diag( + data, diagonal, k_one, k_two, super_diag_right_align, sub_diag_right_align + ) def adv_index(inputs): diff --git a/python/tvm/relay/testing/yolo_detection.py b/python/tvm/relay/testing/yolo_detection.py index c457ce3dd01d..a387f3076bf5 100644 --- a/python/tvm/relay/testing/yolo_detection.py +++ b/python/tvm/relay/testing/yolo_detection.py @@ -196,41 +196,91 @@ def do_nms_sort(dets, classes, thresh): dets[j]["prob"][k] = 0 +def get_detections(im, det, thresh, names, classes): + "Draw the markings around the detected region" + labelstr = [] + category = -1 + detection = None + valid = False + for j in range(classes): + if det["prob"][j] > thresh: + if category == -1: + category = j + labelstr.append(names[j] + " " + str(round(det["prob"][j], 4))) + + if category > -1: + valid = True + imc, imh, imw = im.shape + width = int(imh * 0.006) + offset = category * 123457 % classes + red = _get_color(2, offset, classes) + green = _get_color(1, offset, classes) + blue = _get_color(0, offset, classes) + rgb = [red, green, blue] + b = det["bbox"] + left = int((b.x - b.w / 2.0) * imw) + right = int((b.x + b.w / 2.0) * imw) + top = int((b.y - b.h / 2.0) * imh) + bot = int((b.y + b.h / 2.0) * imh) + + if left < 0: + left = 0 + if right > imw - 1: + right = imw - 1 + if top < 0: + top = 0 + if bot > imh - 1: + bot = imh - 1 + + detection = { + "category": category, + "labelstr": labelstr, + "left": left, + "top": top, + "right": right, + "bot": bot, + "width": width, + "rgb": rgb, + } + + return valid, detection + + def draw_detections(font_path, im, dets, thresh, names, classes): "Draw the markings around the detected region" for det in dets: - labelstr = [] - category = -1 - for j in range(classes): - if det["prob"][j] > thresh: - if category == -1: - category = j - labelstr.append(names[j] + " " + str(round(det["prob"][j], 4))) - if category > -1: - imc, imh, imw = im.shape - width = int(imh * 0.006) - offset = category * 123457 % classes - red = _get_color(2, offset, classes) - green = _get_color(1, offset, classes) - blue = _get_color(0, offset, classes) - rgb = [red, green, blue] - b = det["bbox"] - left = int((b.x - b.w / 2.0) * imw) - right = int((b.x + b.w / 2.0) * imw) - top = int((b.y - b.h / 2.0) * imh) - bot = int((b.y + b.h / 2.0) * imh) - - if left 
< 0: - left = 0 - if right > imw - 1: - right = imw - 1 - if top < 0: - top = 0 - if bot > imh - 1: - bot = imh - 1 - _draw_box_width(im, left, top, right, bot, width, red, green, blue) - label = _get_label(font_path, "".join(labelstr), rgb) - _draw_label(im, top + width, left, label, rgb) + valid, detection = get_detections(im, det, thresh, names, classes) + if valid: + rgb = detection["rgb"] + label = _get_label(font_path, "".join(detection["labelstr"]), rgb) + _draw_box_width( + im, + detection["left"], + detection["top"], + detection["right"], + detection["bot"], + detection["width"], + rgb[0], + rgb[1], + rgb[2], + ) + _draw_label(im, detection["top"] + detection["width"], detection["left"], label, rgb) + + +def show_detections(im, dets, thresh, names, classes): + "Print the markings and the detected region" + for det in dets: + valid, detection = get_detections(im, det, thresh, names, classes) + if valid: + print( + "class:{} left:{} right:{} top:{} bottom:{}".format( + detection["labelstr"], + detection["left"], + detection["top"], + detection["right"], + detection["bot"], + ) + ) def _get_pixel(im, x, y, c): diff --git a/python/tvm/topi/generic/nn.py b/python/tvm/topi/generic/nn.py index 2b5624929bec..4bc3f97d850b 100644 --- a/python/tvm/topi/generic/nn.py +++ b/python/tvm/topi/generic/nn.py @@ -428,6 +428,23 @@ def schedule_group_conv2d_nchw(outs): return _default_schedule(outs, False) +def schedule_group_conv2d_nhwc(outs): + """Schedule for group_conv2d_nhwc + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of group_conv2d_nhwc + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + + def schedule_deformable_conv2d_nchw(outs): """Schedule for deformable_conv2d_nchw diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 524558441b21..c0e941c4acc7 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -827,6 +827,91 @@ def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtyp ) +def group_conv2d_nhwc(Input, Filter, stride, padding, dilation, groups, out_dtype=None): + """Group convolution operator in NHWC layout. + + Parameters + ---------- + Input : tvm.te.Tensor + 4-D with shape [batch, in_height, in_width, in_channel] + + Filter : tvm.te.Tensor + 4-D with shape [filter_height, filter_width, in_channel // groups, num_filter] + + stride : int or a list/tuple of two ints + Stride size, or [stride_height, stride_width] + + padding : int or a list/tuple of 2 or 4 ints + padding size, or + [pad_height, pad_width] for 2 ints, or + [pad_top, pad_left, pad_bottom, pad_right] for 4 ints + + dilation : int or a list/tuple of two ints + dilation size, or [dilation_height, dilation_width] + + groups : int + number of groups + + out_dtype : str + The output type. This is used for mixed precision. 
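The NHWC group convolution documented above (implementation just below) uses the standard output-size arithmetic, and each output filter reduces only over the input channels of its own group. Worked numbers for an assumed configuration:

    # Assumed: Input (1, 56, 56, 64), Filter (3, 3, 8, 128), groups=8,
    # stride=1, dilation=1, pad_top=pad_down=1.
    out_h = (56 - (3 - 1) * 1 - 1 + 1 + 1) // 1 + 1   # = 56
    # Input-channel base for output filter ff, as in the te.compute below:
    #   ff // (num_filter // groups) * (in_channel // groups) + rc
    ff = 100
    group = ff // (128 // 8)    # filters 96..111 form group 6
    base = group * (64 // 8)    # this filter reduces over input channels 48..55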
+ + Returns + ------- + Output : tvm.te.Tensor + 4-D with shape [batch, out_height, out_width, out_channel] + """ + if out_dtype is None: + out_dtype = Input.dtype + assert isinstance(stride, int) or len(stride) == 2 + assert isinstance(dilation, int) or len(dilation) == 2 + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + batch, in_height, in_width, in_channel = get_const_tuple(Input.shape) + kernel_h, kernel_w, _, num_filter = get_const_tuple(Filter.shape) + + assert in_channel % groups == 0, "input channels must divide group size" + assert num_filter % groups == 0, "output channels must divide group size" + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w)) + # compute the output shape + out_channel = num_filter + out_height = simplify( + (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1 + ) + out_width = simplify( + (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1 + ) + # compute graph + pad_before = [0, pad_top, pad_left, 0] + pad_after = [0, pad_down, pad_right, 0] + temp = pad(Input, pad_before, pad_after, name="pad_temp") + ry = te.reduce_axis((0, kernel_h), name="ry") + rx = te.reduce_axis((0, kernel_w), name="rx") + rc = te.reduce_axis((0, in_channel // groups), name="rc") + return te.compute( + (batch, out_height, out_width, out_channel), + lambda nn, yy, xx, ff: te.sum( + temp[ + nn, + yy * stride_h + ry * dilation_h, + xx * stride_w + rx * dilation_w, + ff // (num_filter // groups) * (in_channel // groups) + rc, + ].astype(out_dtype) + * Filter[ry, rx, rc, ff].astype(out_dtype), + axis=[ry, rx, rc], + ), + tag="group_conv2d_nhwc", + ) + + def unpack_NCHWc_to_nchw(packed_out, out_dtype): """Unpack conv2d_NCHWc output from layout NCHWc to NCHW diff --git a/python/tvm/topi/nn/conv2d_transpose.py b/python/tvm/topi/nn/conv2d_transpose.py index d1edbaae9c7f..f67f9c9c8a5a 100644 --- a/python/tvm/topi/nn/conv2d_transpose.py +++ b/python/tvm/topi/nn/conv2d_transpose.py @@ -97,11 +97,11 @@ def declaration_conv2d_transpose_impl(data, kernel, strides, padding, out_dtype, # convolution stage out_c = simplify(out_c) - out_h = simplify(in_h - filter_h + 1 + output_padding[0]) - out_w = simplify(in_w - filter_w + 1 + output_padding[1]) - dc = tvm.reduce_axis((0, in_c), name="dc") - dh = tvm.reduce_axis((0, filter_h), name="dh") - dw = tvm.reduce_axis((0, filter_w), name="dw") + out_h = simplify(in_h - filter_h + 1) + out_w = simplify(in_w - filter_w + 1) + dc = te.reduce_axis((0, in_c), name="dc") + dh = te.reduce_axis((0, filter_h), name="dh") + dw = te.reduce_axis((0, filter_w), name="dw") Output = te.compute( (batch, out_c, out_h, out_w), diff --git a/python/tvm/topi/nn/dilate.py b/python/tvm/topi/nn/dilate.py index 836e29a6812d..6980fea58173 100644 --- a/python/tvm/topi/nn/dilate.py +++ b/python/tvm/topi/nn/dilate.py @@ -23,8 +23,8 @@ @te.tag_scope(tag=tag.INJECTIVE + ",dilate") -def dilate(data, strides, name="DilatedInput"): - """Dilate data with zeros. +def dilate(data, strides, dilation_value=0.0, name="DilatedInput"): + """Dilate data with given dilation value (0 by default). Parameters ---------- @@ -34,6 +34,9 @@ def dilate(data, strides, name="DilatedInput"): strides : list / tuple of n ints Dilation stride on each dimension, 1 means no dilation. 
+ dilation_value : int/float, optional + Value used to dilate the input. + name : str, optional The name prefix operators generated @@ -62,7 +65,7 @@ def _dilate(*indices): if not_zero: not_zero = tvm.tir.all(*not_zero) return tvm.tir.if_then_else( - not_zero, data(*index_tuple), tvm.tir.const(0.0, data.dtype) + not_zero, data(*index_tuple), tvm.tir.const(dilation_value, data.dtype) ) return data(*index_tuple) diff --git a/python/tvm/topi/testing/dilate_python.py b/python/tvm/topi/testing/dilate_python.py index b4fff24a1d43..0ae611559729 100644 --- a/python/tvm/topi/testing/dilate_python.py +++ b/python/tvm/topi/testing/dilate_python.py @@ -19,7 +19,7 @@ import numpy as np -def dilate_python(input_np, strides): +def dilate_python(input_np, strides, dilation_value=0.0): """Dilate operation. Parameters ---------- @@ -30,6 +30,9 @@ def dilate_python(input_np, strides): strides : list / tuple of n ints Dilation stride on each dimension, 1 means no dilation. + dilation_value : int/float, optional + Value used to dilate the input. + Returns ------- output_np : numpy.ndarray @@ -45,7 +48,8 @@ def dilate_python(input_np, strides): for i in range(n): output_size += ((input_np.shape[i] - 1) * strides[i] + 1,) no_zero += ((range(0, output_size[i], strides[i])),) - output_np = np.zeros(shape=output_size) + output_np = np.ones(shape=output_size) + output_np = dilation_value * output_np output_np[np.ix_(*no_zero)] = input_np return output_np diff --git a/python/tvm/topi/testing/matrix_set_diag.py b/python/tvm/topi/testing/matrix_set_diag.py index 63edd0a6d637..81a8f6cccafe 100644 --- a/python/tvm/topi/testing/matrix_set_diag.py +++ b/python/tvm/topi/testing/matrix_set_diag.py @@ -19,20 +19,28 @@ import numpy as np -def matrix_set_diag(input_np, diagonal): +def matrix_set_diag(input_np, diagonal, k=0, align="RIGHT_LEFT"): """matrix_set_diag operator implemented in numpy. - Returns a numpy array with the diagonal of input array + Returns a numpy array with the diagonals of input array replaced with the provided diagonal values. Parameters ---------- - input : numpy.ndarray + input_np : numpy.ndarray Input Array. Shape = [D1, D2, D3, ... , Dn-1 , Dn] + diagonal : numpy.ndarray Values to be filled in the diagonal. - Shape = [D1, D2, D3, ... , Dn-1] + + k : int or tuple of int + Diagonal Offsets. + + align : string + Some diagonals are shorter than max_diag_len and need to be padded. + Possible Values: + ["RIGHT_LEFT" (default), "LEFT_RIGHT", "LEFT_LEFT", "RIGHT_RIGHT"] Returns ------- @@ -41,8 +49,36 @@ def matrix_set_diag(input_np, diagonal): Shape = [D1, D2, D3, ...
, Dn-1 , Dn] """ out = np.array(input_np, copy=True) - n = min(input_np.shape[-1], input_np.shape[-2]) - for i in range(n): - out[..., i, i] = diagonal[..., i] + cols = input_np.shape[-1] + rows = input_np.shape[-2] + + onlyOneDiagonal = True + if isinstance(k, (tuple, list)): + if len(k) < 2 or k[0] == k[1]: + k = k[0] + else: + onlyOneDiagonal = False + + if onlyOneDiagonal: + for i in range(diagonal.shape[-1]): + if k >= 0: + out[..., i, i + k] = diagonal[..., i] + else: + out[..., i - k, i] = diagonal[..., i] + else: + for ki in range(k[0], k[1] + 1): + diag_len = min(cols - max(ki, 0), rows + min(ki, 0)) + offset = 0 + if ki >= 0: + if align[:5] == "RIGHT": + offset = diagonal.shape[-1] - diag_len + else: + if align[-5:] == "RIGHT": + offset = diagonal.shape[-1] - diag_len + for i in range(diag_len): + if ki >= 0: + out[..., i, i + ki] = diagonal[..., k[1] - ki, i + offset] + else: + out[..., i - ki, i] = diagonal[..., k[1] - ki, i + offset] return out diff --git a/python/tvm/topi/testing/pool3d_python.py b/python/tvm/topi/testing/pool3d_python.py index 457c4015a405..8c687f737166 100644 --- a/python/tvm/topi/testing/pool3d_python.py +++ b/python/tvm/topi/testing/pool3d_python.py @@ -20,12 +20,20 @@ import numpy as np import tvm -def pool3d_ncdhw_python(np_data, kernel, - strides, padding, - out_shape, pool_type, - count_include_pad=True, - ceil_mode=False, dtype="float32"): + +def pool3d_ncdhw_python( + np_data, + kernel, + strides, + padding, + out_shape, + pool_type, + count_include_pad=True, + ceil_mode=False, + dtype="float32", +): """baseline for max_pool3d and avg_pool3d, default layout is "NCDHW""" + # fmt: off in_n, in_c, in_d, in_h, in_w = in_shape = np_data.shape if isinstance(kernel, int): k_d = k_h = k_w = kernel @@ -99,4 +107,5 @@ def pool3d_ncdhw_python(np_data, kernel, raise ValueError("pool type {} is not supported".format(pool_type)) ret_np = np.maximum(ret_np, fill_value) + # fmt: on return ret_np diff --git a/python/tvm/topi/transform.py b/python/tvm/topi/transform.py index 6af0828da448..c4e51a8858d1 100644 --- a/python/tvm/topi/transform.py +++ b/python/tvm/topi/transform.py @@ -806,17 +806,33 @@ def sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value=0 return cpp.sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value) -def matrix_set_diag(data, diagonal): +def matrix_set_diag(data, diagonal, k=0, align="RIGHT_LEFT"): """ - Returns a tensor with the diagonal of input tensor replaced with the provided diagonal values. + Returns a tensor with the diagonals of input tensor replaced with the provided diagonal values. Parameters ---------- data : relay.Expr Input Tensor. + diagonal : relay.Expr Values to be filled in the diagonal. + k : int or tuple of int, optional + Diagonal Offset(s). The diagonal or range of diagonals to set. (0 by default) + Positive value means superdiagonal, 0 refers to the main diagonal, and + negative value means subdiagonals. k can be a single integer (for a single diagonal) + or a pair of integers specifying the low and high ends of a matrix band. + k[0] must not be larger than k[1]. + + align : string, optional + Some diagonals are shorter than max_diag_len and need to be padded. + align is a string specifying how superdiagonals and subdiagonals should be aligned, + respectively. There are four possible alignments: "RIGHT_LEFT" (default), "LEFT_RIGHT", + "LEFT_LEFT", and "RIGHT_RIGHT". 
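Before the alignment description continues below, a worked offset example using the numpy reference implemented above (a single superdiagonal, so align does not come into play):

    import numpy as np
    from tvm.topi.testing import matrix_set_diag

    a = np.full((3, 4), 7)
    out = matrix_set_diag(a, np.array([10, 20, 30]), k=1)
    # [[ 7 10  7  7]
    #  [ 7  7 20  7]
    #  [ 7  7  7 30]]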
"RIGHT_LEFT" aligns superdiagonals to the right + (left-pads the row) and subdiagonals to the left (right-pads the row). It is the packing + format LAPACK uses. cuSPARSE uses "LEFT_RIGHT", which is the opposite alignment. + Returns ------- result : relay.Expr @@ -836,7 +852,7 @@ def matrix_set_diag(data, diagonal): diagonal = [[1, 2, 3], [4, 5, 6]] - relay.matrix_set_diag(input, diagonal) = + topi.matrix_set_diag(input, diagonal) = [[[1, 7, 7, 7], [7, 2, 7, 7], [7, 7, 3, 7]], @@ -844,7 +860,22 @@ def matrix_set_diag(data, diagonal): [7, 5, 7, 7], [7, 7, 6, 7]]] """ - return cpp.matrix_set_diag(data, diagonal) + if isinstance(k, (tuple, list)): + k_one = k[0] + if len(k) >= 2: + k_two = k[1] + else: + k_two = k[0] + else: + k_one = k + k_two = k + + super_diag_right_align = align[:5] == "RIGHT" + sub_diag_right_align = align[-5:] == "RIGHT" + + return cpp.matrix_set_diag( + data, diagonal, k_one, k_two, super_diag_right_align, sub_diag_right_align + ) def adv_index(data, indices): diff --git a/rust/tvm-sys/src/context.rs b/rust/tvm-sys/src/context.rs index 64b58b9f42c9..3747bfcba314 100644 --- a/rust/tvm-sys/src/context.rs +++ b/rust/tvm-sys/src/context.rs @@ -248,6 +248,18 @@ impl Display for Context { } } +impl<'a> From<&'a Context> for ArgValue<'a> { + fn from(ctx: &'a Context) -> Self { + DLContext::from(ctx).into() + } +} + +impl<'a> From for ArgValue<'a> { + fn from(ctx: Context) -> Self { + DLContext::from(ctx).into() + } +} + impl From for RetValue { fn from(ret_value: Context) -> RetValue { RetValue::Context(ret_value.into()) diff --git a/rust/tvm/examples/resnet/src/build_resnet.py b/rust/tvm/examples/resnet/src/build_resnet.py index 14e2eee9c48a..904f244e0a9a 100644 --- a/rust/tvm/examples/resnet/src/build_resnet.py +++ b/rust/tvm/examples/resnet/src/build_resnet.py @@ -74,8 +74,9 @@ def build(target_dir): block = get_model("resnet18_v1", pretrained=True) net, params = relay.frontend.from_mxnet(block, {"data": data_shape}) # we want a probability so add a softmax operator + func = net["main"] net = relay.Function( - net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + func.params, relay.nn.softmax(func.body), None, func.type_params, func.attrs ) else: # use random weights from relay.testing diff --git a/src/arith/const_int_bound.cc b/src/arith/const_int_bound.cc index fbb52a9ebe7a..876b7db188c6 100644 --- a/src/arith/const_int_bound.cc +++ b/src/arith/const_int_bound.cc @@ -85,7 +85,7 @@ struct ConstIntBoundAnalyzer::Entry { class ConstIntBoundAnalyzer::Impl : public ExprFunctor { public: - /*! \brief additional bound info about expr \in bound */ + /*! \brief additional bound info about expr in bound */ struct BoundInfo { /*! \brief The expr */ PrimExpr expr; diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc index 6c8498fa1912..a7aded6bc7e5 100644 --- a/src/arith/rewrite_simplify.cc +++ b/src/arith/rewrite_simplify.cc @@ -1535,7 +1535,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const CallNode* op) { } else if (op->op.same_as(tir::builtin::shift_left())) { if (op->args[0].as() && op->args[1].as()) { // the operator overload will eagerly constant fold. 
- return op->args[0] & op->args[1]; + return op->args[0] << op->args[1]; } } ExprDeepEqual expr_equal; diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc index ffc00941143c..a89fa4b0c77a 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.cc +++ b/src/auto_scheduler/search_policy/sketch_policy.cc @@ -27,10 +27,12 @@ #include "sketch_policy.h" #include +#include #include #include #include +#include #include #include #include @@ -45,7 +47,6 @@ namespace tvm { namespace auto_scheduler { /********** Sketch generation rules **********/ - static RuleSkipStage rule_skip_stage; static RuleAlwaysInline rule_always_inline; static RuleMultiLevelTiling rule_multi_level_tiling; @@ -58,7 +59,6 @@ static RuleSimplifyComputeWithConstTensor rule_simplify_compute_with_const_tenso static RuleSpecialComputeLocationGPU rule_special_compute_location_gpu; /********** Init population rules **********/ - static InitFillTileSize init_fill_tile_size; static InitChangeComputeLocation init_change_compute_location; static InitParallel init_parallel; @@ -66,23 +66,15 @@ static InitUnroll init_unroll; static InitVectorization init_vectorization; static InitThreadBind init_thread_bind; -/********** Mutation rules **********/ - -static MutateTileSize mutate_tile_size; -static MutateMaxUnrollFactor mutate_max_unroll_factor; -static MutateComputeLocation mutate_compute_location; -static MutateParallel mutate_parallel; - /********** Sketch policy **********/ - TVM_REGISTER_NODE_TYPE(SketchPolicyNode); -SketchPolicy::SketchPolicy(SearchTask task, CostModel schedule_cost_model, +SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model, Map params, int seed, int verbose, Optional> init_search_callbacks) { auto node = make_object(); node->search_task = std::move(task); - node->schedule_cost_model = std::move(schedule_cost_model); + node->program_cost_model = std::move(program_cost_model); node->rand_gen = std::mt19937(seed); node->params = std::move(params); node->verbose = verbose; @@ -97,18 +89,32 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel schedule_cost_model, node->RunCallbacks(init_search_callbacks.value()); } - // Notice: Some rules require us to skip all the rest rules after they are applied. - // So the rules below should be ordered carefully. + // NOTE: There are strong dependency among the rules below, + // so the order to push them into the vector should be considered carefully. 
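Since the constructor argument is renamed end to end (including the Python keyword argument in search_policy.py earlier in this diff), downstream callers must be updated. A hypothetical call site, with the SearchTask construction elided:

    from tvm import auto_scheduler

    # task = auto_scheduler.SearchTask(...)  # assumed to already exist
    policy = auto_scheduler.SketchPolicy(
        task,
        program_cost_model=auto_scheduler.XGBModel(),  # was: schedule_cost_model=...
    )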
if (IsCPUTask(node->search_task)) { - // The default sketch rules for CPU policy + // Sketch Generation Rules node->sketch_rules.push_back(&rule_always_inline); node->sketch_rules.push_back(&rule_simplify_compute_with_const_tensor); node->sketch_rules.push_back(&rule_add_rfactor); node->sketch_rules.push_back(&rule_add_cache_write_stage); node->sketch_rules.push_back(&rule_multi_level_tiling_with_fusion); node->sketch_rules.push_back(&rule_multi_level_tiling); - } else if (IsCUDATask(node->search_task)) { - // The default sketch rules for CUDA policy + node->sketch_rules.push_back(&rule_skip_stage); + + // Initial Population Generation Rules + node->init_rules.push_back(&init_fill_tile_size); + node->init_rules.push_back(&init_change_compute_location); + node->init_rules.push_back(&init_parallel); + node->init_rules.push_back(&init_unroll); + node->init_rules.push_back(&init_vectorization); + + // Mutation Rules for Evolutionary Search + node->mutation_rules.push_back(std::make_shared(0.90)); + node->mutation_rules.push_back(std::make_shared(0.04)); + node->mutation_rules.push_back(std::make_shared(0.05)); + node->mutation_rules.push_back(std::make_shared(0.01)); + } else if (IsGPUTask(node->search_task)) { + // Sketch Generation Rules node->sketch_rules.push_back(&rule_add_cache_read_stage); node->sketch_rules.push_back(&rule_always_inline); node->sketch_rules.push_back(&rule_special_compute_location_gpu); @@ -117,32 +123,20 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel schedule_cost_model, node->sketch_rules.push_back(&rule_add_cache_write_stage); node->sketch_rules.push_back(&rule_multi_level_tiling_with_fusion); node->sketch_rules.push_back(&rule_multi_level_tiling); - } else { - LOG(FATAL) << "No default sketch rules for target: " << task->target; - } - node->sketch_rules.push_back(&rule_skip_stage); // This should always be the last rule + node->sketch_rules.push_back(&rule_skip_stage); - node->init_rules.push_back(&init_fill_tile_size); // This should always be the first rule - if (IsCPUTask(node->search_task)) { - // The default init population rules for CPU policy - node->init_rules.push_back(&init_change_compute_location); - node->init_rules.push_back(&init_parallel); - node->init_rules.push_back(&init_unroll); - node->init_rules.push_back(&init_vectorization); - } else if (IsCUDATask(node->search_task)) { - // The default init population rules for CUDA policy + // Initial Population Generation Rules + node->init_rules.push_back(&init_fill_tile_size); node->init_rules.push_back(&init_thread_bind); node->init_rules.push_back(&init_unroll); + + // Mutation Rules for Evolutionary Search + node->mutation_rules.push_back(std::make_shared(0.90)); + node->mutation_rules.push_back(std::make_shared(0.10)); } else { - LOG(FATAL) << "No default init rules for target: " << task->target; + LOG(FATAL) << "No default sketch rules for target: " << task->target; } - // The default mutation rules. 
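The mutation rules above now carry selection weights (0.90, 0.04, 0.05, 0.01 in the CPU branch) rather than being applied from a flat list; during evolutionary search a rule is drawn with probability proportional to its weight. A sketch of that prefix-sum sampling scheme (helper names are assumptions mirroring ComputePrefixSumProb and RandomChoose):

    import random

    weights = [0.90, 0.04, 0.05, 0.01]          # as pushed in the CPU branch above
    prefix, acc = [], 0.0
    for w in weights:
        acc += w
        prefix.append(acc)
    prefix = [p / acc for p in prefix]          # [0.90, 0.94, 0.99, 1.00]

    def random_choose(prefix_sum_probs):
        r = random.random()
        return next(i for i, p in enumerate(prefix_sum_probs) if r <= p)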
- node->mutation_rules.push_back(&mutate_tile_size); - node->mutation_rules.push_back(&mutate_max_unroll_factor); - node->mutation_rules.push_back(&mutate_compute_location); - node->mutation_rules.push_back(&mutate_parallel); - data_ = std::move(node); } @@ -169,7 +163,7 @@ State SketchPolicyNode::Search(int n_trials, int early_stopping, int num_measure if (!inputs.empty()) { // Retrain cost models before the next search round PrintTitle("Train cost model", verbose); - schedule_cost_model->Update(inputs, results); + program_cost_model->Update(inputs, results); } // Search one round to get promising states @@ -179,9 +173,7 @@ State SketchPolicyNode::Search(int n_trials, int early_stopping, int num_measure // Infer bound. This is necessary for computing the correct ToStr() for redundancy check best_states = search_task->compute_dag.InferBound(best_states); - PruneInvalidState(search_task, &best_states); random_states = search_task->compute_dag.InferBound(random_states); - PruneInvalidState(search_task, &random_states); // Pick `num_measure_per_iter` states to measure, check hash to remove already measured state // Also pick some random states to do eps-greedy @@ -242,14 +234,16 @@ Array SketchPolicyNode::SearchOneRound(int num_random_states, Array( GetDoubleParam(params, SketchParamKey::EvolutionarySearch::use_measured_ratio) * population)); - bool is_cost_model_reasonable = !schedule_cost_model->IsInstance(); + bool is_cost_model_reasonable = !program_cost_model->IsInstance(); // 1. Generate sketches - const Array& sketches = GenerateSketches(); + if (sketch_cache_.empty()) { + sketch_cache_ = GenerateSketches(); + } // 2. Sample the init population Array init_population = SampleInitPopulation( - sketches, is_cost_model_reasonable ? population - num_use_measured : population); + sketch_cache_, is_cost_model_reasonable ? population - num_use_measured : population); // 3. If the cost model is useless (i.e. 
RandomCostModel), just random pick some generated // states, else perform evolutionary search @@ -260,7 +254,7 @@ Array SketchPolicyNode::SearchOneRound(int num_random_states, Array SketchPolicyNode::GenerateSketches() { // A map that maps state to its current working position (stage_id) std::unordered_map cur_stage_id_map; - cur_stage_id_map[init_state] = static_cast(init_state->stages.size() - 1); + cur_stage_id_map[init_state] = static_cast(init_state->stages.size()) - 1; // Derivation rule based enumeration Array out_states; @@ -341,28 +335,44 @@ Array SketchPolicyNode::GenerateSketches() { Array SketchPolicyNode::SampleInitPopulation(const Array& sketches, int out_size) { int fail_ct = 0; Array out_states; + std::vector rand_gens; + rand_gens.reserve(out_size); + for (int i = 0; i < out_size; i++) { + rand_gens.push_back(std::mt19937(rand_gen())); + } auto tic_begin = std::chrono::high_resolution_clock::now(); while (static_cast(out_states.size()) < out_size && fail_ct < out_size) { - // Random choose a starting sketch - // TODO(jcf94, merrymercy): Maybe choose sketches in different possibility for they may have - // different potential on generating state with better performance - State tmp_s = sketches[(rand_gen)() % sketches.size()]; - - // Derivation rule based enumeration - bool valid = true; - for (const auto& rule : init_rules) { - if (rule->Apply(this, &tmp_s) == PopulationGenerationRule::ResultKind::kInvalid) { - valid = false; - break; + std::vector temp_states(out_size); + + support::parallel_for(0, out_size - out_states.size(), + [this, &temp_states, &sketches, &rand_gens](int index) { + // Random choose a starting sketch + // TODO(jcf94, merrymercy): Maybe choose sketches in different + // possibility for they may have different potential on generating state + // with better performance + State tmp_s = sketches[(rand_gens[index])() % sketches.size()]; + // Derivation rule based enumeration + bool valid = true; + for (const auto& rule : init_rules) { + if (rule->Apply(this, &tmp_s, &rand_gens[index]) == + PopulationGenerationRule::ResultKind::kInvalid) { + valid = false; + break; + } + } + if (valid) { + temp_states[index] = std::move(tmp_s); + } + }); + + for (int i = 0; i < out_size; i++) { + if (temp_states[i].defined()) { + out_states.push_back(std::move(temp_states[i])); + } else { + fail_ct++; } } - - if (valid) { - out_states.push_back(std::move(tmp_s)); - } else { - fail_ct++; - } } double duration = std::chrono::duration_cast>( @@ -379,7 +389,7 @@ Array SketchPolicyNode::EvolutionarySearch(const Array& init_popul Array best_states; auto tic_begin = std::chrono::high_resolution_clock::now(); - size_t population = init_population.size(); + size_t population = GetIntParam(params, SketchParamKey::EvolutionarySearch::population); int num_iters = GetIntParam(params, SketchParamKey::EvolutionarySearch::num_iters); double mutation_prob = GetDoubleParam(params, SketchParamKey::EvolutionarySearch::mutation_prob); @@ -390,135 +400,102 @@ Array SketchPolicyNode::EvolutionarySearch(const Array& init_popul Array* pnow = &states_buf1; Array* pnext = &states_buf2; - // The set of explored states to avoid redundancy. - std::unordered_set explored_set; - - // The heap to maintain the so far best states. 
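A key detail of the parallelized SampleInitPopulation above is the vector of pre-seeded generators, one per parallel index, so sampling stays reproducible for a given master seed no matter how the work is scheduled. The same pattern in plain Python:

    import numpy as np
    from concurrent.futures import ThreadPoolExecutor

    master = np.random.default_rng(0)
    # One child generator per index, all seeded up front by the master generator.
    rngs = [np.random.default_rng(int(s)) for s in master.integers(0, 2**31, size=8)]

    def sample(i):
        return int(rngs[i].integers(0, 100))   # index i draws only from stream i

    with ThreadPoolExecutor() as ex:
        out = list(ex.map(sample, range(8)))   # identical for any worker count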
+ // A heap to keep the best states during evolution using StateHeapItem = std::pair; auto cmp = [](const StateHeapItem& left, const StateHeapItem& right) { return left.second > right.second; }; - using StateHeap = std::priority_queue, decltype(cmp)>; - StateHeap heap(cmp); - auto update_heap = [&heap, &explored_set](const Array& states, - const std::vector& scores, const int out_size) { - float max_score = 0.0; - for (size_t i = 0; i < states.size(); ++i) { - const State& state = states[i]; + std::vector heap; + std::unordered_set in_heap(measured_states_set_); + heap.reserve(out_size); + + // auxiliary global variables + std::vector pop_scores; + std::vector pop_selection_probs; + float max_score = 0.0; + pop_scores.reserve(population); + pop_selection_probs.reserve(population); + std::uniform_real_distribution<> dis(0.0, 1.0); + + // mutation rules + int mutation_success_ct, mutation_fail_ct; + mutation_success_ct = mutation_fail_ct = 0; + std::vector rule_weights; + std::vector rule_selection_probs; + for (const auto& rule : mutation_rules) { + rule_weights.push_back(rule->weight); + } + ComputePrefixSumProb(rule_weights, &rule_selection_probs); + + // Genetic Algorithm + for (int k = 0; k < num_iters + 1; ++k) { + // Maintain the heap + *pnow = search_task->compute_dag.InferBound(*pnow); + PruneInvalidState(search_task, pnow); + program_cost_model->Predict(search_task, *pnow, &pop_scores); + + for (size_t i = 0; i < pnow->size(); ++i) { + const State& state = (*pnow)[i]; std::string state_str = state.ToStr(); - // Skip redundant states. - if (explored_set.count(state_str) > 0) { - continue; - } - explored_set.insert(state_str); - - if (static_cast(heap.size()) < out_size) { - // Directly push item if the heap is not full yet. - heap.push({state, scores[i]}); - } else if (scores[i] > heap.top().second) { - // Replace the worst state in the heap with the new state. - heap.pop(); - heap.push({state, scores[i]}); + if (in_heap.count(state_str) == 0) { + if (static_cast(heap.size()) < out_size) { + heap.emplace_back((*pnow)[i], pop_scores[i]); + std::push_heap(heap.begin(), heap.end(), cmp); + in_heap.insert(state_str); + } else if (pop_scores[i] > heap.front().second) { + std::string old_state_str = heap.front().first.ToStr(); + in_heap.erase(old_state_str); + in_heap.insert(state_str); + + std::pop_heap(heap.begin(), heap.end(), cmp); + heap.back() = StateHeapItem(state, pop_scores[i]); + std::push_heap(heap.begin(), heap.end(), cmp); + } + if (pop_scores[i] > max_score) { + max_score = pop_scores[i]; + } } - max_score = (scores[i] > max_score) ? scores[i] : max_score; } - return max_score; - }; - // Cost model predicted scores. - std::vector scores; - scores.reserve(population); - - // The function to generate prefix sum probabilities based on the given scores. - auto assign_prob = [](const std::vector& scores, std::vector* prefix_sum_probs) { - // Compute selection probabilities. 
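The loop above keeps the best `out_size` states in a min-heap keyed by predicted score, while the in_heap set (seeded from measured_states_set_) rejects states that are already present or were measured before. Equivalent bookkeeping with heapq:

    import heapq

    out_size = 3
    heap, in_heap = [], set()        # min-heap of (score, state_str)

    def update(state_str, score):
        if state_str in in_heap:     # redundancy check
            return
        if len(heap) < out_size:
            heapq.heappush(heap, (score, state_str))
            in_heap.add(state_str)
        elif score > heap[0][0]:     # better than the current worst
            _, evicted = heapq.heappushpop(heap, (score, state_str))
            in_heap.discard(evicted)
            in_heap.add(state_str)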
- double sum = 0.0; - prefix_sum_probs->resize(scores.size()); - for (size_t i = 0; i < scores.size(); ++i) { - sum += std::max(scores[i], 0.0f); - (*prefix_sum_probs)[i] = sum; + // Print statistical information + if (k % 5 == 0 || k == num_iters) { + StdCout(verbose) << "GA Iter: " << k << std::fixed << std::setprecision(4) + << "\tMax score: " << max_score << "\tMin score: " << heap.front().second + << "\t#Pop: " << pnow->size() << "\t#M+: " << mutation_success_ct / (k + 1) + << "\t#M-: " << mutation_fail_ct / (k + 1) << std::endl; } - for (size_t i = 0; i < scores.size(); ++i) { - (*prefix_sum_probs)[i] /= sum; + if (k == num_iters) { + break; } - }; - // State selection probabilities. - std::uniform_real_distribution<> uniform_dist(0.0, 1.0); - std::vector state_select_probs; - state_select_probs.reserve(population); + // Compute selection probability + ComputePrefixSumProb(pop_scores, &pop_selection_probs); - // Mutation rule selection probabilities. - std::vector rule_select_probs; - rule_select_probs.reserve(mutation_rules.size()); - std::vector rule_levels; - for (const auto& rule : mutation_rules) { - rule_levels.push_back(rule->GetLevel(search_task)); - } - assign_prob(rule_levels, &rule_select_probs); - - // Evaluate the init populations. - *pnow = search_task->compute_dag.InferBound(*pnow); - PruneInvalidState(search_task, pnow); - CHECK_GT(pnow->size(), 0) << "All initial populations are invalid"; - schedule_cost_model->Predict(search_task, *pnow, &scores); - - // Maintain the best states in the heap. - float max_score = update_heap(*pnow, scores, out_size); - - // Genetic algorithm. - for (auto iter_idx = 1; iter_idx <= num_iters; ++iter_idx) { - // Assign the selection probability to each state based on the cost model scores. - assign_prob(scores, &state_select_probs); - - // TODO(@comaniac): Perform cross over. - - // Perform mutations. - size_t fail_ct = 0; - while (pnext->size() < population && fail_ct < population * 2) { - // Select a state to be mutated. - State tmp_s = (*pnow)[RandomChoose(state_select_probs, &rand_gen)]; - if (uniform_dist(rand_gen) < mutation_prob) { - // Select a rule and mutate the state. - const auto& rule = mutation_rules[RandomChoose(rule_select_probs, &rand_gen)]; - if (rule->Apply(this, &tmp_s) == PopulationGenerationRule::ResultKind::kValid) { + // Do mutation + while (pnext->size() < population) { + State tmp_s = (*pnow)[RandomChoose(pop_selection_probs, &rand_gen)]; + + if (dis(rand_gen) < mutation_prob) { + const auto& rule = mutation_rules[RandomChoose(rule_selection_probs, &rand_gen)]; + if (rule->Apply(this, &tmp_s, &rand_gen) == PopulationGenerationRule::ResultKind::kValid) { pnext->push_back(std::move(tmp_s)); + mutation_success_ct++; } else { - fail_ct++; + mutation_fail_ct++; } } else { - // Do not mutate this state in this round. pnext->push_back(std::move(tmp_s)); } } - // Evaluate the new populations. - *pnext = search_task->compute_dag.InferBound(*pnext); - PruneInvalidState(search_task, pnext); - - // Throw away all states generated in this iterations if all new states are invalid. - if (pnext->size() > 0) { - std::swap(pnext, pnow); - schedule_cost_model->Predict(search_task, *pnow, &scores); - - // Maintain the best states in the heap. - float iter_max_score = update_heap(*pnow, scores, out_size); - max_score = (iter_max_score > max_score) ? 
iter_max_score : max_score; - } + std::swap(pnext, pnow); pnext->clear(); - - if (iter_idx % 5 == 0 || iter_idx == num_iters) { - StdCout(verbose) << "GA Iter: " << iter_idx << std::fixed << std::setprecision(4) - << "\tMax Score: " << max_score << "\tPop Size: " << pnow->size() - << std::endl; - } } - // Copy best states in the heap to the output. - while (!heap.empty()) { - auto item = heap.top(); - heap.pop(); + // Copy best states in the heap to out_states + std::sort(heap.begin(), heap.end(), cmp); + for (auto& item : heap) { best_states.push_back(std::move(item.first)); } @@ -580,10 +557,10 @@ Array SketchPolicyNode::PickStatesWithEpsGreedy(const Array } TVM_REGISTER_GLOBAL("auto_scheduler.SketchPolicy") - .set_body_typed([](SearchTask task, CostModel schedule_cost_model, - Map params, int seed, int verbose, + .set_body_typed([](SearchTask task, CostModel program_cost_model, Map params, + int seed, int verbose, Optional> init_search_callbacks) { - return SketchPolicy(task, schedule_cost_model, params, seed, verbose, init_search_callbacks); + return SketchPolicy(task, program_cost_model, params, seed, verbose, init_search_callbacks); }); TVM_REGISTER_GLOBAL("auto_scheduler.SketchPolicyGenerateSketches") diff --git a/src/auto_scheduler/search_policy/sketch_policy.h b/src/auto_scheduler/search_policy/sketch_policy.h index 2d93d8775c86..21aaa6ef7b90 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.h +++ b/src/auto_scheduler/search_policy/sketch_policy.h @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -88,15 +89,15 @@ struct SketchParamKey { class SketchPolicyNode : public SearchPolicyNode { public: /*! \brief The cost model to estimate the complete schedules. */ - CostModel schedule_cost_model; + CostModel program_cost_model; /*! \brief The parameters map for this search policy. */ Map params; /*! \brief The rules to generate sketches. */ std::vector sketch_rules; - /*! \brief The rules to generate initial states. */ + /*! \brief The rules to generate initial population. */ std::vector init_rules; - /*! \brief The rules to mutate states. */ - std::vector mutation_rules; + /*! \brief The rules to mutate states in the evolutionary search. */ + std::vector> mutation_rules; /*! \brief Random generator. */ std::mt19937 rand_gen; /*! \brief Memorize split space for Split. */ @@ -154,6 +155,9 @@ class SketchPolicyNode : public SearchPolicyNode { /*! \brief The number of states to measure per iteration. */ int num_measure_per_iter_; + + /*! \brief The cached sketches */ + Array sketch_cache_; }; /*! @@ -165,14 +169,14 @@ class SketchPolicy : public SearchPolicy { /*! * \brief The constructor. * \param task The SearchTask for the computation declaration. - * \param schedule_cost_model The cost model for complete programs. + * \param program_cost_model The cost model for complete programs. * \param params The parameters map for this search process. * \param seed The random seed of this search process. * \param verbose Verbose level. 0 for silent, 1 to output information during schedule * search. * \param init_search_callbacks SearchCallback to be called before schedule search. 
*/ - SketchPolicy(SearchTask task, CostModel schedule_cost_model, Map params, + SketchPolicy(SearchTask task, CostModel program_cost_model, Map params, int seed, int verbose, Optional> init_search_callbacks); TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(SketchPolicy, SearchPolicy, SketchPolicyNode); diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc index dab6e4d65f20..089ac774d1d6 100644 --- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc +++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc @@ -34,6 +34,9 @@ namespace tvm { namespace auto_scheduler { +static std::vector auto_unroll_configs_cpu = {0, 16, 64, 512}; +static std::vector auto_unroll_configs_gpu = {0, 16, 64, 512, 1024}; + /********** Sketch Generation Rule **********/ /********** RuleSkipStage **********/ @@ -436,8 +439,8 @@ std::vector> RuleSpecialComputeLocationGPU::Apply( /********** Init Population **********/ -PopulationGenerationRule::ResultKind InitFillTileSize::Apply(SketchPolicyNode* policy, - State* state) const { +PopulationGenerationRule::ResultKind InitFillTileSize::Apply(SketchPolicyNode* policy, State* state, + std::mt19937* rand_gen) const { StateNode* pstate = state->CopyOnWrite(); // Scan the transformation history and randomly fill tiles size for all SplitStep for (size_t step_id = 0; step_id < (*state)->transform_steps.size(); ++step_id) { @@ -458,7 +461,7 @@ PopulationGenerationRule::ResultKind InitFillTileSize::Apply(SketchPolicyNode* p const auto& candidate_lens = policy->split_memo.GetFactorizationSchemes( extent, ps->lengths.size(), GetIntParam(policy->params, SketchParamKey::max_innermost_split_factor)); - const auto& candidate_lengths = candidate_lens[(policy->rand_gen)() % candidate_lens.size()]; + const auto& candidate_lengths = candidate_lens[(*rand_gen)() % candidate_lens.size()]; pstate->transform_steps.Set( step_id, @@ -472,9 +475,8 @@ PopulationGenerationRule::ResultKind InitFillTileSize::Apply(SketchPolicyNode* p return ResultKind::kValid; } -PopulationGenerationRule::ResultKind MutateComputeLocationCommon(SketchPolicyNode* policy, - State* state, - bool infer_bound = true) { +PopulationGenerationRule::ResultKind InitChangeComputeLocation::Apply( + SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const { if (GetIntParam(policy->params, SketchParamKey::disable_change_compute_location)) { return PopulationGenerationRule::ResultKind::kValid; } @@ -490,83 +492,10 @@ PopulationGenerationRule::ResultKind MutateComputeLocationCommon(SketchPolicyNod continue; } - int target_stage_id = GetSingleConsumerId(policy->search_task, *state, stage_id); - if (target_stage_id < 0) { - continue; - } - const Stage& target_stage = (*state)->stages[target_stage_id]; - - std::vector> candidates; - bool target_compute_at_other = target_stage->compute_at == ComputeAtKind::kIter; - bool target_is_tiled = IsTiled(target_stage); - - bool visited_reduce = false; - // enumerate compute_at location at target_stage - // TODO(merrymercy): More analysis here to make smarter choices - for (size_t i = 0; i < target_stage->iters.size(); ++i) { - const Iterator& target_iter = target_stage->iters[i]; - if (target_iter->iter_kind == IteratorKind::kReduction) { - visited_reduce = true; - if (!target_is_tiled) { // Do not go into reduce iter - break; - } - } else if (target_iter->iter_kind == IteratorKind::kSpatial) { - if (visited_reduce) { // Do not go into inner tile - break; - } - } + std::vector> candidates = + 
GetComputeLocationCandidates(policy->search_task, *state, stage_id); - if (target_iter->annotation == IteratorAnnotation::kUnroll) { - // Do not go into the unroll region of const tensor indices - break; - } - - if (GetExtent(target_iter) == 1) { - // Skip iterators with length of 1 - continue; - } - if (target_compute_at_other && target_iter->iter_kind == IteratorKind::kSpatial && - StrEndsWith(target_iter->name, ".0")) { - // Skip the first level iterators if target stage compute_at another stage - // In this case, the lengths of first level iterators are always one - continue; - } - candidates.emplace_back(target_stage_id, i); - - if ((*state)->attach_map->iter_to_attached_stages.count(std::make_pair(target_stage_id, i))) { - break; - } - } - - // if the target_stage is already compute_at another stage X, try also compute_at X - // We call stage X as `target_target_stage` - if (target_compute_at_other) { - int target_target_stage_id; - target_target_stage_id = (*state)->attach_map->stage_to_attach_iter.at(target_stage_id).first; - const Stage& target_target_stage = (*state)->stages[target_target_stage_id]; - - for (size_t i = 0; i < target_target_stage->iters.size(); ++i) { - const Iterator& target_target_iter = target_target_stage->iters[i]; - if (target_target_iter->iter_kind == IteratorKind::kReduction || - (*state)->attach_map->iter_to_attached_stages.count( - std::make_pair(target_target_stage_id, i))) { - break; - } - - if (target_target_iter->annotation == IteratorAnnotation::kUnroll) { - // Do not go into the unroll region of const tensor indices - break; - } - - if (GetExtent(target_target_iter) == 1) { // skip iterators with length of 1 - continue; - } - - candidates.emplace_back(target_target_stage_id, i); - } - } - - int choice = (policy->rand_gen)() % (candidates.size() + 2); + int choice = (*rand_gen)() % (candidates.size() + 2); if (choice == 0) { if (!HasReduceIter(stage)) { @@ -585,19 +514,16 @@ PopulationGenerationRule::ResultKind MutateComputeLocationCommon(SketchPolicyNod } } - if (infer_bound) { + try { *state = policy->search_task->compute_dag.InferBound(*state); + } catch (std::exception& e) { + return PopulationGenerationRule::ResultKind::kInvalid; } return PopulationGenerationRule::ResultKind::kValid; } -PopulationGenerationRule::ResultKind InitChangeComputeLocation::Apply(SketchPolicyNode* policy, - State* state) const { - return MutateComputeLocationCommon(policy, state, true); -} - -PopulationGenerationRule::ResultKind InitParallel::Apply(SketchPolicyNode* policy, - State* state) const { +PopulationGenerationRule::ResultKind InitParallel::Apply(SketchPolicyNode* policy, State* state, + std::mt19937* rand_gen) const { std::function annotate_parallel; annotate_parallel = [&annotate_parallel](const SketchPolicyNode& policy, State* state, @@ -661,11 +587,10 @@ PopulationGenerationRule::ResultKind InitParallel::Apply(SketchPolicyNode* polic return ResultKind::kValid; } -PopulationGenerationRule::ResultKind InitUnroll::Apply(SketchPolicyNode* policy, - State* state) const { - std::vector auto_unroll_configs = IsGPUTask(policy->search_task) - ? std::vector({0, 16, 64, 512, 1024}) - : std::vector({0, 16, 64, 512}); +PopulationGenerationRule::ResultKind InitUnroll::Apply(SketchPolicyNode* policy, State* state, + std::mt19937* rand_gen) const { + std::vector& auto_unroll_configs = + IsGPUTask(policy->search_task) ? 
auto_unroll_configs_gpu : auto_unroll_configs_cpu; for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) { const Stage& stage = (*state)->stages[stage_id]; // Skip the inlined stage and placeholder stage @@ -704,7 +629,7 @@ PopulationGenerationRule::ResultKind InitUnroll::Apply(SketchPolicyNode* policy, if (HasReduceIter(stage)) { // Use auto unroll for multi level tiled stage - int value = auto_unroll_configs[(policy->rand_gen)() % auto_unroll_configs.size()]; + int value = auto_unroll_configs[(*rand_gen)() % auto_unroll_configs.size()]; state->pragma(stage_id, (*state)->stages[stage_id]->iters[0], std::string("auto_unroll_max_step") + "$" + std::to_string(value)); } @@ -714,7 +639,8 @@ PopulationGenerationRule::ResultKind InitUnroll::Apply(SketchPolicyNode* policy, } PopulationGenerationRule::ResultKind InitVectorization::Apply(SketchPolicyNode* policy, - State* state) const { + State* state, + std::mt19937* rand_gen) const { for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) { const Stage& stage = (*state)->stages[stage_id]; // Skip the inlined stage and placeholder stage @@ -758,7 +684,7 @@ PopulationGenerationRule::ResultKind InitVectorization::Apply(SketchPolicyNode* if (num_fusible > 1) { // Select a random range to fuse - num_fusible = 1 + (policy->rand_gen)() % (num_fusible - 1); + num_fusible = 1 + (*rand_gen)() % (num_fusible - 1); } if (num_fusible == 1) { @@ -772,8 +698,8 @@ PopulationGenerationRule::ResultKind InitVectorization::Apply(SketchPolicyNode* return ResultKind::kValid; } -PopulationGenerationRule::ResultKind InitThreadBind::Apply(SketchPolicyNode* policy, - State* state) const { +PopulationGenerationRule::ResultKind InitThreadBind::Apply(SketchPolicyNode* policy, State* state, + std::mt19937* rand_gen) const { std::set multi_level_tiling_root_set; for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) { if (NeedsMultilevelTiling(policy->search_task, *state, stage_id)) { @@ -801,6 +727,10 @@ PopulationGenerationRule::ResultKind InitThreadBind::Apply(SketchPolicyNode* pol // Deal with the cross-thread reduction generated by RuleCrossThreadReduction if (HasCrossThreadReduction(*state, stage_id)) { + if (stage->compute_at != ComputeAtKind::kRoot) { + continue; + } + Iterator fused_it; *state = std::move(FuseAllOuterSpaceIterators(*state, stage_id, &fused_it)); state->bind(stage_id, fused_it, IteratorAnnotation::kBlockX); @@ -922,8 +852,8 @@ PopulationGenerationRule::ResultKind InitThreadBind::Apply(SketchPolicyNode* pol return ResultKind::kValid; } -PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* policy, - State* state) const { +PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* policy, State* state, + std::mt19937* rand_gen) const { int max_innermost_split_factor = GetIntParam(policy->params, SketchParamKey::max_innermost_split_factor); @@ -952,7 +882,7 @@ PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* pol const SplitStepNode* ps; do { - step_id = split_step_ids[(policy->rand_gen)() % split_step_ids.size()]; + step_id = split_step_ids[(*rand_gen)() % split_step_ids.size()]; ps = (*state)->transform_steps[step_id].as(); CHECK(ps != nullptr); extent = GetIntImm(ps->extent.value()); @@ -973,7 +903,7 @@ PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* pol // Random permute the tile size order. 
std::vector random_perm; - RandomPermutation(lengths.size(), &random_perm, &(policy->rand_gen)); + RandomPermutation(lengths.size(), &random_perm, rand_gen); // Try to divide a factor from one tile size and multiple it to another. for (size_t i = 0; i < random_perm.size(); ++i) { @@ -983,6 +913,7 @@ PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* pol continue; } + // Divide one factor from lengths[src_idx] and multiply it to lengths[dst_idx] size_t dst_idx = random_perm[(i + 1) % random_perm.size()]; const std::vector& factors = policy->split_memo.GetFactors(length); CHECK_GE(factors.size(), 1); @@ -1000,9 +931,9 @@ PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* pol // Failed on this dst_idx, try next one. continue; } - divide_factor = factors[1 + (policy->rand_gen)() % (max_factor_index)]; + divide_factor = factors[1 + (*rand_gen)() % (max_factor_index)]; } else { - divide_factor = factors[1 + (policy->rand_gen)() % (factors.size() - 1)]; + divide_factor = factors[1 + (*rand_gen)() % (factors.size() - 1)]; } // Divide one factor from lengths[src_idx] and multiply it to lengths[dst_idx]. @@ -1017,6 +948,8 @@ PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* pol } } + CHECK_LE(GetIntImm(new_lengths.back()), max_innermost_split_factor); + StateNode* pstate = state->CopyOnWrite(); pstate->transform_steps.Set( step_id, SplitStep(ps->stage_id, ps->iter_id, ps->extent, @@ -1027,43 +960,103 @@ PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* pol return ResultKind::kInvalid; } -PopulationGenerationRule::ResultKind MutateMaxUnrollFactor::Apply(SketchPolicyNode* policy, - State* state) const { +PopulationGenerationRule::ResultKind MutateAutoUnroll::Apply(SketchPolicyNode* policy, State* state, + std::mt19937* rand_gen) const { // Extract all auto_unroll_max_step pragma steps. - std::vector annotate_steps; + std::vector pragma_steps; for (size_t i = 0; i < (*state)->transform_steps.size(); ++i) { if (auto ps = (*state)->transform_steps[i].as()) { if (StrStartsWith(ps->pragma_type, "auto_unroll_max_step")) { - annotate_steps.push_back(i); + pragma_steps.push_back(i); } } } - if (annotate_steps.empty()) { + if (pragma_steps.empty()) { return ResultKind::kInvalid; } - // Random pick up one unroll factor candidate. - auto cands = (IsGPUTask(policy->search_task)) ? &gpu_unroll_cands_ : &cpu_unroll_cands_; - auto new_factor = std::to_string((*cands)[(policy->rand_gen)() % cands->size()]); + std::vector& auto_unroll_configs = + IsGPUTask(policy->search_task) ? auto_unroll_configs_gpu : auto_unroll_configs_cpu; - // Random pick up and mutate an unroll step. 
-  auto step_id = annotate_steps[(policy->rand_gen)() % annotate_steps.size()];
+  // Randomly pick an auto unroll pragma step
+  auto step_id = pragma_steps[(*rand_gen)() % pragma_steps.size()];
   auto ps = (*state)->transform_steps[step_id].as();
   CHECK(ps);
+
+  // Mutate its value to a random candidate
+  auto val = std::to_string(auto_unroll_configs[(*rand_gen)() % auto_unroll_configs.size()]);
   StateNode* pstate = state->CopyOnWrite();
-  pstate->transform_steps.Set(step_id,
-                              PragmaStep(ps->stage_id, ps->iter_id,
-                                         std::string("auto_unroll_max_step") + "$" + new_factor));
+  pstate->transform_steps.Set(step_id, PragmaStep(ps->stage_id, ps->iter_id,
+                                                  std::string("auto_unroll_max_step") + "$" + val));
   return ResultKind::kValid;
 }
 
 PopulationGenerationRule::ResultKind MutateComputeLocation::Apply(SketchPolicyNode* policy,
-                                                                  State* state) const {
-  return MutateComputeLocationCommon(policy, state, false);
+                                                                  State* state,
+                                                                  std::mt19937* rand_gen) const {
+  if (GetIntParam(policy->params, SketchParamKey::disable_change_compute_location)) {
+    return PopulationGenerationRule::ResultKind::kInvalid;
+  }
+
+  // Extract all compute_at steps.
+  std::vector compute_at_steps;
+  for (size_t s = 0; s < (*state)->transform_steps.size(); ++s) {
+    if (auto ps = (*state)->transform_steps[s].as()) {
+      int stage_inc = GetTargetStageIDInState(*state, s) - ps->stage_id;
+
+      if (IsTiled((*state)->stages[ps->stage_id + stage_inc])) {
+        continue;
+      }
+
+      if (NeedsMultilevelTiling(policy->search_task, *state, ps->stage_id + stage_inc)) {
+        continue;
+      }
+      compute_at_steps.push_back(s);
+    }
+  }
+  if (compute_at_steps.empty()) {
+    return PopulationGenerationRule::ResultKind::kInvalid;
+  }
+
+  // Randomly pick one step
+  size_t step_id = compute_at_steps[(*rand_gen)() % compute_at_steps.size()];
+  auto ps = (*state)->transform_steps[step_id].as();
+  int stage_inc = GetTargetStageIDInState(*state, step_id) - ps->stage_id;
+  CHECK(ps != nullptr);
+
+  std::vector> candidates =
+      GetComputeLocationCandidates(policy->search_task, *state, ps->stage_id + stage_inc);
+
+  if (candidates.empty()) {
+    return PopulationGenerationRule::ResultKind::kInvalid;
+  }
+
+  int choice = (*rand_gen)() % (candidates.size());
+  int new_compute_at_stage_id = candidates[choice].first;
+  int new_compute_at_iter_id = candidates[choice].second;
+
+  // Replay a new state.
+  State tmp_s = policy->search_task->compute_dag->init_state;
+  for (size_t s = 0; s < (*state)->transform_steps.size(); ++s) {
+    if (s == step_id) {
+      tmp_s.CopyOnWrite()->transform_steps.push_back(
+          ComputeAtStep(ps->stage_id, new_compute_at_stage_id - stage_inc, new_compute_at_iter_id));
+    } else {
+      tmp_s.CopyOnWrite()->transform_steps.push_back((*state)->transform_steps[s]);
+    }
+    try {
+      StepApplyToState(tmp_s->transform_steps.back(), &tmp_s, policy->search_task->compute_dag);
+    } catch (dmlc::Error& e) {
+      return PopulationGenerationRule::ResultKind::kInvalid;
+    }
+  }
+
+  *state = tmp_s;
+  return PopulationGenerationRule::ResultKind::kValid;
 }
 
-PopulationGenerationRule::ResultKind MutateParallel::Apply(SketchPolicyNode* policy,
-                                                           State* state) const {
+PopulationGenerationRule::ResultKind MutateParallel::Apply(SketchPolicyNode* policy, State* state,
+                                                           std::mt19937* rand_gen) const {
   // This mutation rule only focuses on a case that parallel was added to
   // the outermost loop and the loop is generated by fusing other loops.
   // In short, we mutate the fusion step before the parallel step.
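The two mutation styles above are worth contrasting: `MutateAutoUnroll` edits the selected `PragmaStep` in place, while `MutateComputeLocation` replays the whole transform history from `init_state` with one `ComputeAtStep` substituted, and rejects the mutation if any replayed step throws. A generic sketch of that replay-with-substitution shape; `Step`, `State`, and `ApplyStep` are toy stand-ins here, not the TVM types:

```cpp
#include <stdexcept>
#include <utility>
#include <vector>

struct Step { int payload; };               // stands in for a transform step
struct State { std::vector<Step> steps; };  // stands in for the replayed state

// Stand-in for StepApplyToState: applying a step may fail in the new context.
void ApplyStep(const Step& s, State* st) {
  if (s.payload < 0) throw std::runtime_error("step invalid in this context");
  st->steps.push_back(s);
}

// Rebuild from scratch, swapping in `replacement` at position `step_id`.
// Returning false rejects the whole mutation and leaves `out` untouched.
bool ReplayWithSubstitution(const std::vector<Step>& history, size_t step_id,
                            const Step& replacement, State* out) {
  State tmp;
  for (size_t i = 0; i < history.size(); ++i) {
    try {
      ApplyStep(i == step_id ? replacement : history[i], &tmp);
    } catch (const std::exception&) {
      return false;
    }
  }
  *out = std::move(tmp);
  return true;
}
```

Replaying costs more than in-place editing, but it keeps all derived state consistent with the substituted step, which is presumably why the code recomputes stage ids through `GetTargetStageIDInState` before building the new `ComputeAtStep`.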
@@ -1087,7 +1080,7 @@ PopulationGenerationRule::ResultKind MutateParallel::Apply(SketchPolicyNode* pol
   }
 
   // Randomly pick one parallel step.
-  size_t step_id = parallel_steps[(policy->rand_gen)() % parallel_steps.size()];
+  size_t step_id = parallel_steps[(*rand_gen)() % parallel_steps.size()];
   auto ps = (*state)->transform_steps[step_id].as();
   CHECK(ps);
   size_t stage_id = ps->stage_id;
@@ -1126,7 +1119,7 @@ PopulationGenerationRule::ResultKind MutateParallel::Apply(SketchPolicyNode* pol
 
   // Mutate the fusion iters and replay the mutated fused/annotation steps.
   int iter_offset = 0;
-  if (RandomChoose(fuse_dir, &(policy->rand_gen)) == 0) {
+  if (RandomChoose(fuse_dir, rand_gen) == 0) {
     fused_ids.pop_back();
     iter_offset = 1;
   } else {
diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.h b/src/auto_scheduler/search_policy/sketch_policy_rules.h
index 418fbda6a030..928efc518827 100644
--- a/src/auto_scheduler/search_policy/sketch_policy_rules.h
+++ b/src/auto_scheduler/search_policy/sketch_policy_rules.h
@@ -124,7 +124,7 @@ DEFINE_SKETCH_GENERATION_RULE(RuleSpecialComputeLocationGPU);
 
 /********** Init Population **********/
 
-/*! \brief The base class for derivation rules used in the initial population. */
+/*! \brief The base class for rules used to annotate the sketches to get the initial population. */
 class PopulationGenerationRule {
  public:
   /*! \brief Result enumeration of the apply function. */
@@ -137,19 +137,24 @@ class PopulationGenerationRule {
    * \param state The state to apply this rule, update inplace.
    * \return The result of this rule, indicate if there's any valid state generated.
    */
-  virtual ResultKind Apply(SketchPolicyNode* policy, State* state) const = 0;
+  virtual ResultKind Apply(SketchPolicyNode* policy, State* state,
+                           std::mt19937* rand_gen) const = 0;
+
+  /*! \brief The destructor */
+  virtual ~PopulationGenerationRule() = default;
 };
 
-#define DEFINE_INIT_POPULATION_RULE(rule_name) \
-  class rule_name : public PopulationGenerationRule { \
-   public: \
-    ResultKind Apply(SketchPolicyNode* policy, State* state) const final; \
+// A helper to define population initialization rules
+#define DEFINE_INIT_POPULATION_RULE(rule_name) \
+  class rule_name : public PopulationGenerationRule { \
+   public: \
+    ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const final; \
   };
 
 /*! \brief The rule that fills the incomplete SplitSteps. */
 DEFINE_INIT_POPULATION_RULE(InitFillTileSize);
 
-/*! \brief The rule that randomly changes the computation location for some stages, which do not
+/*! \brief The rule that randomly changes the computation location for some stages that do not
  * need tiling and are not strictly inlineable(e.g. data padding). */
 DEFINE_INIT_POPULATION_RULE(InitChangeComputeLocation);
 
@@ -170,50 +175,37 @@ DEFINE_INIT_POPULATION_RULE(InitThreadBind);
 
 /*! \brief The base class for mutation rules used in the evolutionary search. */
 class PopulationMutationRule : public PopulationGenerationRule {
  public:
-  /*!
-   * \brief Get the priority level of this mutation rule.
-   * \return The priority level of this mutation rule. Higher the better.
+  /*! \brief The constructor
+   * \param selection_weight the probability of applying this rule is
+   *        proportional to this weight
    */
-  virtual int GetLevel(const SearchTask& task) const = 0;
+  explicit PopulationMutationRule(double selection_weight) : weight(selection_weight) {}
+
+  /*! \brief The weight of this rule */
+  double weight;
 };
 
-// A helper to define mutation rules with a constant rule level.
-#define DEFINE_MUTATE_POPULATION_RULE(rule_name, rule_level) \
-  class rule_name : public PopulationMutationRule { \
-   public: \
-    ResultKind Apply(SketchPolicyNode* policy, State* state) const final; \
-    int GetLevel(const SearchTask& task) const final { return rule_level; } \
+// A helper to define mutation rules used in the evolutionary search
+#define DEFINE_MUTATE_POPULATION_RULE(rule_name) \
+  class rule_name : public PopulationMutationRule { \
+   public: \
+    explicit rule_name(double weight) : PopulationMutationRule(weight) {} \
+    ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const final; \
   };
 
 /*! \brief The rule that mutates tile size by randomly dividing a tile size by a factor
            and multipling it to another tile size. */
-DEFINE_MUTATE_POPULATION_RULE(MutateTileSize, 100);
-
-/*! \brief The rule that mutates the fusion iterators annotated by parallel. */
-DEFINE_MUTATE_POPULATION_RULE(MutateParallel, 50);
-
-/*! \brief The rule that mutates the factor of a randomly selected auto max unroll step. */
-class MutateMaxUnrollFactor : public PopulationMutationRule {
- public:
-  ResultKind Apply(SketchPolicyNode* policy, State* state) const final;
-  int GetLevel(const SearchTask& task) const final { return 10; }
+DEFINE_MUTATE_POPULATION_RULE(MutateTileSize);
 
-  const std::vector cpu_unroll_cands_ = {0, 16, 64, 512, 1024};
-  const std::vector gpu_unroll_cands_ = {0, 16, 64, 512};
-};
+/*! \brief The rule that mutates the number of fused outer iterators annotated by parallel. */
+DEFINE_MUTATE_POPULATION_RULE(MutateParallel);
 
-/*! \brief The rule that randomly changes the computation location for some stages, which do not
+/*! \brief The rule that randomly changes the computation location for some stages that do not
  * need tiling and are not strictly inlineable(e.g. data padding). */
-class MutateComputeLocation : public PopulationMutationRule {
- public:
-  ResultKind Apply(SketchPolicyNode* policy, State* state) const final;
-  int GetLevel(const SearchTask& task) const final {
-    if (IsGPUTask(task)) {
-      return 0;
-    }
-    return 5;
-  }
-};
+DEFINE_MUTATE_POPULATION_RULE(MutateComputeLocation);
+
+/*! \brief The rule that mutates the value of a randomly selected auto unroll pragma step.
*/ +DEFINE_MUTATE_POPULATION_RULE(MutateAutoUnroll); } // namespace auto_scheduler } // namespace tvm diff --git a/src/auto_scheduler/search_policy/utils.cc b/src/auto_scheduler/search_policy/utils.cc index 62ffce4dc875..174ca3105a4b 100644 --- a/src/auto_scheduler/search_policy/utils.cc +++ b/src/auto_scheduler/search_policy/utils.cc @@ -67,6 +67,87 @@ Array GetSpatialSplitStepIds(const State& s, int stage_id) { return spatial_split_step_ids; } +std::vector> GetComputeLocationCandidates(const SearchTask& task, + const State& state, int stage_id) { + int target_stage_id = GetSingleConsumerId(task, state, stage_id); + if (target_stage_id < 0) { + return {}; + } + const Stage& target_stage = state->stages[target_stage_id]; + + std::vector> candidates; + bool target_compute_at_other = target_stage->compute_at == ComputeAtKind::kIter; + bool target_is_tiled = IsTiled(target_stage); + + bool visited_reduce = false; + // Enumerate compute_at location at target_stage + // TODO(merrymercy): More analysis here to make smarter choices + for (size_t i = 0; i < target_stage->iters.size(); ++i) { + const Iterator& target_iter = target_stage->iters[i]; + if (target_iter->iter_kind == IteratorKind::kReduction) { + visited_reduce = true; + if (!target_is_tiled) { // Do not go into reduce iter + break; + } + } else if (target_iter->iter_kind == IteratorKind::kSpatial) { + if (visited_reduce) { // Do not go into inner tile + break; + } + } + + if (target_iter->annotation == IteratorAnnotation::kUnroll) { + // Do not go into the unroll region of const tensor indices + break; + } + + if (GetExtent(target_iter) == 1) { + // Skip iterators with length of 1 + continue; + } + if (target_compute_at_other && target_iter->iter_kind == IteratorKind::kSpatial && + StrEndsWith(target_iter->name, ".0")) { + // Skip the first level iterators if target stage compute_at another stage + // In this case, the lengths of first level iterators are always one + continue; + } + candidates.emplace_back(target_stage_id, i); + + if (state->attach_map->iter_to_attached_stages.count(std::make_pair(target_stage_id, i))) { + break; + } + } + + // if the target_stage is already compute_at another stage X, try also compute_at X + // We call stage X as `target_target_stage` + if (target_compute_at_other) { + int target_target_stage_id; + target_target_stage_id = state->attach_map->stage_to_attach_iter.at(target_stage_id).first; + const Stage& target_target_stage = state->stages[target_target_stage_id]; + + for (size_t i = 0; i < target_target_stage->iters.size(); ++i) { + const Iterator& target_target_iter = target_target_stage->iters[i]; + if (target_target_iter->iter_kind == IteratorKind::kReduction || + state->attach_map->iter_to_attached_stages.count( + std::make_pair(target_target_stage_id, i))) { + break; + } + + if (target_target_iter->annotation == IteratorAnnotation::kUnroll) { + // Do not go into the unroll region of const tensor indices + break; + } + + if (GetExtent(target_target_iter) == 1) { // skip iterators with length of 1 + continue; + } + + candidates.emplace_back(target_target_stage_id, i); + } + } + + return candidates; +} + State DoMultiLevelTiling(const State& state, int stage_id, const std::string& format, std::vector* spatial_split_step_ids) { // Temporal object to be used if the input pointer is nullptr @@ -327,25 +408,62 @@ void PruneInvalidState(const SearchTask& task, Array* states) { } if (pt == 0) { - LOG(INFO) << "All states are invalid."; + LOG(FATAL) << "Internal error: All states are invalid."; } else { 
    states->resize(pt);
  }
}

+/********** SplitFactorizationMemo **********/
+
+void SplitFactorizationMemo::ReadWriteLock::GetRead() {
+  std::unique_lock lock(cv_mutex_);
+  // Wake up and get the mutex lock if there's no writing thread
+  cv_.wait(lock, [this]() { return !this->is_writing_; });
+  read_count_++;
+}
+
+void SplitFactorizationMemo::ReadWriteLock::GetWrite() {
+  std::unique_lock lock(cv_mutex_);
+  // Wake up and get the mutex lock if there's no reading or writing threads
+  cv_.wait(lock, [this]() { return this->read_count_ == 0 && !this->is_writing_; });
+  is_writing_ = true;
+}
+
+void SplitFactorizationMemo::ReadWriteLock::UnlockRead() {
+  std::lock_guard lock(cv_mutex_);
+  read_count_--;
+  // Notify the other blocked threads if this is the last reading thread
+  if (read_count_ == 0) {
+    cv_.notify_one();
+  }
+}
+
+void SplitFactorizationMemo::ReadWriteLock::UnlockWrite() {
+  std::lock_guard lock(cv_mutex_);
+  is_writing_ = false;
+  // Notify the other blocked threads
+  cv_.notify_one();
+}
+
 const Array>& SplitFactorizationMemo::GetFactorizationSchemes(
     int extent, int n_lengths, int max_innermost_factor) {
   QueryKey key = std::make_tuple(extent, n_lengths, max_innermost_factor);
-  auto it = memory_.find(key);
-  if (it != memory_.end()) {
+  const auto& const_memory = memory_;
+  lock_.GetRead();
+  const auto& it = const_memory.find(key);
+  const auto& memory_end = const_memory.end();
+  lock_.UnlockRead();
+  if (it != memory_end) {
     return it->second;
   }
+  lock_.GetWrite();
   tmp_stack_ = Array(n_lengths, Integer());
   results_ = &memory_[key];
   n_lengths_ = n_lengths;
   DfsEnumerate(0, extent, max_innermost_factor);
+  lock_.UnlockWrite();
   return *results_;
 }
@@ -383,6 +501,8 @@ const std::vector& SplitFactorizationMemo::GetFactors(int n) {
   return res;
 }
 
+/********** Utils interface API for ffi **********/
+
 TVM_REGISTER_GLOBAL("auto_scheduler.SearchPolicyUtilsIsTiled")
     .set_body_typed([](const Stage& stage) { return IsTiled(stage); });
diff --git a/src/auto_scheduler/search_policy/utils.h b/src/auto_scheduler/search_policy/utils.h
index d2ba1289a5b5..6c0fb4c4dcf4 100644
--- a/src/auto_scheduler/search_policy/utils.h
+++ b/src/auto_scheduler/search_policy/utils.h
@@ -32,6 +32,7 @@
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -372,7 +373,8 @@ inline bool HasSingleElementwiseMatchedConsumer(const SearchTask& task, const St
     *target_stage_id = *consumers.begin();
     if (ElementwiseMatch(task, state, stage_id, *target_stage_id) &&
         (!(HasReduceIter(state->stages[stage_id]) &&
-           HasReduceIter(state->stages[*target_stage_id])))) {
+           HasReduceIter(state->stages[*target_stage_id]))) &&
+        (!StrEndsWith(state->stages[*target_stage_id]->op->name, ".shared"))) {
       return true;
     }
   }
@@ -535,6 +537,22 @@ inline Iterator GetLastReduceIteratorInOutermostReduceTile(const Stage& stage) {
   return stage->iters[0];
 }
 
+/*! \brief Get the target stage id of a history step in the new state.
+ * We need this because the stage_id in the history may be stale due to later steps */
+inline int GetTargetStageIDInState(const State& s, int step_id) {
+  int stage_inc = 0;
+
+  for (size_t i = step_id + 1; i < s->transform_steps.size(); ++i) {
+    if (s->transform_steps[i]->IsInstance() ||
+        s->transform_steps[i]->IsInstance() ||
+        s->transform_steps[i]->IsInstance()) {
+      if (s->transform_steps[i]->stage_id <= s->transform_steps[step_id]->stage_id + stage_inc)
+        stage_inc++;
+    }
+  }
+  return s->transform_steps[step_id]->stage_id + stage_inc;
+}
+
 /*! \brief Get all split steps for one stage.
 */
inline void GetSplitStepIds(const State& s, int stage_id, std::vector* split_step_ids) {
  for (int i = static_cast(s->transform_steps.size()) - 1; i >= 0; --i) {
@@ -615,6 +633,32 @@ inline Array RandomSampleStates(const Array& in_states, std::mt199
   return out_states;
 }
 
+/*! \brief Compute prefix-sum probability based on the given weights */
+inline void ComputePrefixSumProb(const std::vector& weights,
+                                 std::vector* prefix_sum_probs) {
+  // Compute selection probabilities.
+  float sum = 0.0;
+  prefix_sum_probs->resize(weights.size());
+  for (size_t i = 0; i < weights.size(); ++i) {
+    sum += std::max(weights[i], 0.0f);
+    (*prefix_sum_probs)[i] = sum;
+  }
+  for (size_t i = 0; i < weights.size(); ++i) {
+    (*prefix_sum_probs)[i] /= sum;
+  }
+}
+
+/*! \brief Randomly choose an index according to a prefix sum probability. */
+inline int RandomChoose(const std::vector& prefix_sum_probs, std::mt19937* random_gen) {
+  std::uniform_real_distribution<> dis(0.0, 1.0);
+  double x = dis(*random_gen);
+
+  CHECK(!prefix_sum_probs.empty());
+
+  return std::lower_bound(prefix_sum_probs.begin(), prefix_sum_probs.end(), x) -
+         prefix_sum_probs.begin();
+}
+
 /*! \brief Print a title */
 inline void PrintTitle(const std::string& title, int verbose) {
   StdCout(verbose) << Chars('-', 60) << "\n"
@@ -637,6 +681,33 @@ class SplitFactorizationMemo {
  private:
  void DfsEnumerate(int now, int remaining_length, int max_innermost_factor);
 
+  /*!
+   * \brief A simple implementation of read-write lock.
+   * The guarded block can be read by multiple threads at the same time, while other operations will
+   * be blocked if one thread is writing.
+   * \note Writing threads will wait until all reading threads have finished. If there are multiple
+   * writing threads, their processing order is not guaranteed.
+   */
+  class ReadWriteLock {
+   public:
+    /*! \brief The method to get the read lock. A thread can read if there are no other
+     * writing threads. */
+    void GetRead();
+    /*! \brief The method to get the write lock. A thread can write if there are no other
+     * reading or writing threads. */
+    void GetWrite();
+    /*! \brief The method to release the read lock. */
+    void UnlockRead();
+    /*! \brief The method to release the write lock. */
+    void UnlockWrite();
+
+   private:
+    uint32_t read_count_ = 0;
+    bool is_writing_ = false;
+    std::mutex cv_mutex_;
+    std::condition_variable cv_;
+  } lock_;
+
  std::unordered_map>> memory_;
 
  int n_lengths_;
@@ -648,6 +719,10 @@ class SplitFactorizationMemo {
 /*! \brief Get the indexes of SplitStep that processes on spatial iterator. */
 Array GetSpatialSplitStepIds(const State& s, int stage_id);
 
+/*! \brief Get the possible compute locations for a stage. */
+std::vector> GetComputeLocationCandidates(const SearchTask& task,
+                                                              const State& state, int stage_id);
+
 // Apply multi-level tiling structure according to a string format,
 // where "S" stands a space level, "R" stands for a reduction level.
 // For example, if the format is "SSRSRS", then we will
@@ -662,17 +737,6 @@ State DoMultiLevelTiling(const State& state, int stage_id, const std::string& fo
 State FollowTiling(const State& state, int stage_id, const std::vector& split_step_ids,
                    int n_split);
 
-// Random choose an index according to a prefix sum probability.
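`ComputePrefixSumProb` plus `RandomChoose` implement roulette-wheel (fitness-proportionate) selection: the normalized prefix sums form a CDF over the candidates, and `std::lower_bound` on a uniform draw inverts that CDF in O(log n). A self-contained sketch combining the two helpers, assuming at least one strictly positive weight so the normalization is well defined:

```cpp
#include <algorithm>
#include <random>
#include <vector>

// Sample an index with probability proportional to max(weight, 0).
int WeightedChoice(const std::vector<float>& weights, std::mt19937* rng) {
  std::vector<double> prefix(weights.size());
  double sum = 0.0;
  for (size_t i = 0; i < weights.size(); ++i) {
    sum += std::max(weights[i], 0.0f);  // negative weights count as zero
    prefix[i] = sum;
  }
  for (double& p : prefix) p /= sum;    // normalize: last entry becomes 1.0
  std::uniform_real_distribution<> dis(0.0, 1.0);
  double x = dis(*rng);
  // First prefix entry >= x maps the draw back to a candidate index.
  return std::lower_bound(prefix.begin(), prefix.end(), x) - prefix.begin();
}
```

The evolutionary search uses this twice per iteration: once over `pop_scores` to pick a parent state and once over `rule_weights` to pick which mutation rule to apply to it.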
-inline int RandomChoose(const std::vector& prefix_sum_probs, std::mt19937* random_gen) { - std::uniform_real_distribution<> dis(0.0, 1.0); - double x = dis(*random_gen); - - CHECK(!prefix_sum_probs.empty()); - - return std::lower_bound(prefix_sum_probs.begin(), prefix_sum_probs.end(), x) - - prefix_sum_probs.begin(); -} - // Prune invalid states and return the results in-place. void PruneInvalidState(const SearchTask& task, Array* states); diff --git a/src/auto_scheduler/transform_step.cc b/src/auto_scheduler/transform_step.cc index cec83bb93515..2a9349739752 100755 --- a/src/auto_scheduler/transform_step.cc +++ b/src/auto_scheduler/transform_step.cc @@ -780,7 +780,9 @@ Array ApplySplitToState(State* state, int stage_id, int iter_id, res = Iterator(name, Range(), it->iter_kind, IteratorAnnotation::kNone); tosplit_min = NullOpt; tosplit_extent = NullOpt; - concrete = false; + if (!l.defined()) { + concrete = false; + } } outs.push_back(std::move(res)); } diff --git a/src/node/attr_registry.h b/src/node/attr_registry.h index 7b233a865f55..01d2b68c471b 100644 --- a/src/node/attr_registry.h +++ b/src/node/attr_registry.h @@ -37,7 +37,7 @@ namespace tvm { /*! - * \breif Implementation of registry with attributes. + * \brief Implementation of registry with attributes. * * \tparam EntryType The type of the registry entry. * \tparam KeyType The actual key that is used to lookup the attributes. diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc index 7c32f31fad89..d21cb1f2d9b3 100644 --- a/src/node/structural_hash.cc +++ b/src/node/structural_hash.cc @@ -57,7 +57,7 @@ class VarCountingSHashHandler : public SHashReducer::Handler { * the correct value. */ ObjectRef object; - /*! \biref The partially reduce hash value.*/ + /*! \brief The partially reduce hash value.*/ size_t reduced_hash; /*! \brief The expected location in the result stack. */ size_t result_stack_index = std::numeric_limits::max(); diff --git a/src/parser/diagnostic.h b/src/parser/diagnostic.h index 2eb38b312242..085d1c4ea8fb 100644 --- a/src/parser/diagnostic.h +++ b/src/parser/diagnostic.h @@ -139,7 +139,7 @@ DiagnosticBuilder Diagnostic::Help(Span span) { } /*! \brief A diagnostic context for recording errors against a source file. - * TODO(@jroesch): convert source map and improve in follow up PR, the parser + * TODO(jroesch): convert source map and improve in follow up PR, the parser * assumes a single global file for now. */ struct DiagnosticContext { @@ -160,7 +160,7 @@ struct DiagnosticContext { Render(std::cout); } - // TODO(@jroesch): eventually modularize the rendering interface to provide control of how to + // TODO(jroesch): eventually modularize the rendering interface to provide control of how to // format errors. void Render(std::ostream& ostream) { for (auto diagnostic : diagnostics) { diff --git a/src/printer/tir_hybrid_printer.cc b/src/printer/tir_hybrid_printer.cc index b58e5fc2e9a0..0fadf17c11a3 100644 --- a/src/printer/tir_hybrid_printer.cc +++ b/src/printer/tir_hybrid_printer.cc @@ -68,7 +68,7 @@ class TIRHybridPrinter : public StmtFunctor, std::unordered_set var_not_in_headers; /*! \brief buffer collector (buffer defined in BufferMap and BufferAllocation)*/ std::unordered_set buf_not_in_headers; - /*! \breif Map from Var to thread env name */ + /*! \brief Map from Var to thread env name */ std::unordered_map var_env_map_; /*! 
\brief Map from Var to Doc */ std::unordered_map memo_var_; diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 6ba1ce777f4f..2b08f45b2582 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -353,7 +353,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor { } /*! * \brief Check if we can release token. - * \tok The token to be released. + * \param tok The token to be released. */ void CheckForRelease(StorageToken* tok) { CHECK_GE(tok->storage_id, 0); diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 8ab1ab1be66d..38ebe421d38d 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -972,9 +972,10 @@ bool DilateRel(const Array& types, int num_inputs, const Attrs& attrs, } // Positional relay function to create dilate operator used by frontend FFI. -Expr MakeDilate(Expr data, Array strides) { +Expr MakeDilate(Expr data, Array strides, double dilation_value = 0.0) { auto attrs = make_object(); attrs->strides = std::move(strides); + attrs->dilation_value = std::move(dilation_value); static const Op& op = Op::Get("nn.dilate"); return Call(op, {data}, Attrs(attrs), {}); } @@ -983,7 +984,7 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.dilate").set_body_typed(MakeDilate); RELAY_REGISTER_OP("nn.dilate") .describe(R"code( -Dilate data with zeros. +Dilate data with given dilation value (0 by default). )code" TVM_ADD_FILELINE) .set_num_inputs(1) .add_argument("x", "1D Tensor", "Data to dilate.") diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index c9807c39764e..441a76ef98d3 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -3126,6 +3126,8 @@ RELAY_REGISTER_OP("sparse_to_dense") .set_attr("FTVMCompute", SparseToDenseCompute); // relay.matrix_set_diag +TVM_REGISTER_NODE_TYPE(MatrixSetDiagAttrs); + bool MatrixSetDiagRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // `types` contains: [input, diagonal, result] @@ -3137,13 +3139,28 @@ bool MatrixSetDiagRel(const Array& types, int num_inputs, const Attrs& att const auto* diagonal = types[1].as(); CHECK(diagonal); + const auto param = attrs.as(); + CHECK_GE(param->k2, param->k1); + int d_ndims = diagonal->shape.size(); - for (int i = 0; i < d_ndims - 1; i++) { + int i_ndims = input->shape.size(); + + reporter->Assert(input->shape[i_ndims - 2] > -param->k1); + reporter->Assert(input->shape[i_ndims - 1] > param->k2); + + for (int i = 0; i < d_ndims - 2; i++) { reporter->AssertEQ(input->shape[i], diagonal->shape[i]); } - auto min_dim = if_then_else(input->shape[d_ndims - 1] >= input->shape[d_ndims], - input->shape[d_ndims], input->shape[d_ndims - 1]); - reporter->Assert(diagonal->shape[d_ndims - 1] >= min_dim); + if (param->k1 != param->k2) { + reporter->AssertEQ(diagonal->shape[d_ndims - 2], param->k2 - param->k1 + 1); + } else if (d_ndims >= 2) { + reporter->AssertEQ(input->shape[d_ndims - 2], diagonal->shape[d_ndims - 2]); + } + auto max_diag_len = if_then_else(input->shape[i_ndims - 2] + (param->k2 > 0 ? param->k2 : 0) <= + input->shape[i_ndims - 1] + (param->k1 < 0 ? -param->k1 : 0), + input->shape[i_ndims - 2] + (param->k2 > 0 ? param->k2 : 0), + input->shape[i_ndims - 1] + (param->k1 < 0 ? 
-param->k1 : 0)); + reporter->AssertEQ(diagonal->shape[d_ndims - 1], max_diag_len); reporter->Assign(types[2], TensorType(input->shape, input->dtype)); return true; @@ -3151,22 +3168,37 @@ bool MatrixSetDiagRel(const Array& types, int num_inputs, const Attrs& att Array MatrixSetDiagCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { - return Array{topi::matrix_set_diag(inputs[0], inputs[1])}; -} - -Expr MakeMatrixSetDiag(Expr input, Expr diagonal) { + const auto* param = attrs.as(); + CHECK(param != nullptr); + return Array{topi::matrix_set_diag(inputs[0], inputs[1], param->k1, param->k2, + param->super_diag_right_align, + param->sub_diag_right_align)}; +} + +Expr MakeMatrixSetDiag(Expr input, Expr diagonal, int k1, int k2, bool super_diag_right_align, + bool sub_diag_right_align) { + auto attrs = make_object(); + attrs->k1 = k1; + attrs->k2 = k2; + attrs->super_diag_right_align = super_diag_right_align; + attrs->sub_diag_right_align = sub_diag_right_align; static const Op& op = Op::Get("matrix_set_diag"); - return Call(op, {input, diagonal}, Attrs(), {}); + return Call(op, {input, diagonal}, Attrs(attrs), {}); } TVM_REGISTER_GLOBAL("relay.op._make.matrix_set_diag").set_body_typed(MakeMatrixSetDiag); RELAY_REGISTER_OP("matrix_set_diag") .describe( - R"code(Returns a tensor with the diagonal of input tensor replaced with the provided diagonal values. + R"code(Returns a tensor with the diagonals of input tensor replaced with the provided diagonal values. **input** Input tensor. **diagonal** Values to be filled in the diagonal. + **k1** Lower limit (included) of the range of diagonals. + **k2** Upper limit (included) of the range of diagonals. + **super_diag_right_align** Bool, true iff super-diagonal is right aligned (left-padded). + **sub_diag_right_align** Bool, true iff sub-diagonal is right aligned (left-padded). )code" TVM_ADD_FILELINE) + .set_attrs_type() .set_num_inputs(2) .add_argument("input", "Tensor", "Input Tensor.") .add_argument("diagonal", "Tensor", "Values to be filled in the diagonal.") diff --git a/src/relay/transforms/annotate_target.cc b/src/relay/transforms/annotate_target.cc index c307d75a9aba..74c236ae3280 100644 --- a/src/relay/transforms/annotate_target.cc +++ b/src/relay/transforms/annotate_target.cc @@ -35,9 +35,10 @@ namespace tvm { namespace relay { namespace annotate_target { -const PackedFunc* make_begin_op = +static const PackedFunc* make_begin_op = runtime::Registry::Get("relay.op.annotation._make.compiler_begin"); -const PackedFunc* make_end_op = runtime::Registry::Get("relay.op.annotation._make.compiler_end"); +static const PackedFunc* make_end_op = + runtime::Registry::Get("relay.op.annotation._make.compiler_end"); // A helper class to insert annotation boundaries for a program region that will // be handled by a specific compiler. 
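On the `static const PackedFunc*` change just above (and the matching one in partition_graph.cc below): a namespace-scope pointer is not itself const even when it points to const, so by default it has external linkage and is exported from the translation unit; `static` gives it internal linkage, keeping each file's cached registry lookup private and ruling out clashes between identically named definitions. A miniature of the linkage rule, with illustrative names and two hypothetical source files sketched in one block:

```cpp
// helper_a.cc
const char* cached_op_name = "compiler_end";  // pointer-to-const, but the pointer
                                              // itself is mutable: external linkage,
                                              // so a second non-static definition of
                                              // this name elsewhere is an ODR violation

// helper_b.cc
static const char* cached_op_name = "x";      // internal linkage: private to this
                                              // file, cannot collide with helper_a.cc

// Contrast: a genuinely const object is internal by default in C++, e.g.
// `const char* const cached_op_name = ...;` would not need `static`.
```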
diff --git a/src/relay/transforms/convert_layout.cc b/src/relay/transforms/convert_layout.cc index 9a71642aac13..65fdeda5f6cd 100644 --- a/src/relay/transforms/convert_layout.cc +++ b/src/relay/transforms/convert_layout.cc @@ -88,22 +88,23 @@ class ConvertTransformMemorizer : public TransformMemorizer { Expr new_e; bool modified = false; if (fconvert_layout.count(op)) { - tvm::Array tinfos; - for (auto expr : ref_call->args) { - auto ttype = expr->type_as(); - tinfos.push_back(tvm::te::placeholder(ttype->shape, ttype->dtype)); - } - auto desired_layouts = operator->()->desired_layouts_; - if (desired_layouts.find(op->name) == desired_layouts.end()) { - LOG(FATAL) << "Desired layout(s) not specified for op: " << op->name; - } - Array op_desired_layouts = desired_layouts.at(op->name); - Expr altered_value = - fconvert_layout[op](ref_call->attrs, new_args, tinfos, op_desired_layouts); - if (altered_value.defined()) { - new_e = altered_value; - modified = true; + if (desired_layouts.find(op->name) != desired_layouts.end()) { + tvm::Array tinfos; + for (auto expr : ref_call->args) { + auto ttype = expr->type_as(); + tinfos.push_back(tvm::te::placeholder(ttype->shape, ttype->dtype)); + } + + Array op_desired_layouts = desired_layouts.at(op->name); + Expr altered_value = + fconvert_layout[op](ref_call->attrs, new_args, tinfos, op_desired_layouts); + if (altered_value.defined()) { + new_e = altered_value; + modified = true; + } + } else { + LOG(WARNING) << "Desired layout(s) not specified for op: " << op->name; } } if (!modified) { diff --git a/src/relay/transforms/gradient.cc b/src/relay/transforms/gradient.cc index 9c472542cc91..bf8105080317 100644 --- a/src/relay/transforms/gradient.cc +++ b/src/relay/transforms/gradient.cc @@ -338,7 +338,7 @@ Expr FirstOrderGradient(const Expr& re, const Optional& mod) { TVM_REGISTER_GLOBAL("relay._transform.first_order_gradient").set_body_typed(FirstOrderGradient); -Type bpt = RelayRefType(FuncType({}, TupleType(Array()), {}, {})); +static Type bpt = RelayRefType(FuncType({}, TupleType(Array()), {}, {})); struct ReverseADType : TypeMutator { Type VisitType_(const TensorTypeNode* ttn) final { diff --git a/src/relay/transforms/infer_layout_util.h b/src/relay/transforms/infer_layout_util.h index 9868ee5d03db..5cc180e8e2e3 100644 --- a/src/relay/transforms/infer_layout_util.h +++ b/src/relay/transforms/infer_layout_util.h @@ -142,7 +142,7 @@ inline Array> BinaryBroadcastLayout(const Attrs& attrs, if (!layouts[0].defined() && !layouts[1].defined()) { // both undefined, infer fails - return Array>{{Layout::Undef()}, {Layout::Undef()}}; + return Array>{{Layout::Undef(), Layout::Undef()}, {Layout::Undef()}}; } else if (!layouts[0].defined() || !layouts[1].defined()) { // only one is defined, use shape information to help infer int defined_idx = layouts[0].defined() ? 0 : 1; @@ -157,7 +157,7 @@ inline Array> BinaryBroadcastLayout(const Attrs& attrs, // only know the tensor with smaller dimensions, // so we cannot infer the final broadcasted output. // fails in this case. 
- return Array>{{Layout::Undef()}, {Layout::Undef()}}; + return Array>{{Layout::Undef(), Layout::Undef()}, {Layout::Undef()}}; } } else if (layouts[0].defined() && layouts[1].defined() && (layouts[0].ndim() == 0 || layouts[1].ndim() == 0)) { diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc index d9555570d9fa..a82dc0a0697c 100644 --- a/src/relay/transforms/partition_graph.cc +++ b/src/relay/transforms/partition_graph.cc @@ -428,7 +428,8 @@ IRModule RemoveDefaultAnnotations(IRModule module) { */ // New annotations would be required to be added for each flattened output -const PackedFunc* make_end_op = runtime::Registry::Get("relay.op.annotation._make.compiler_end"); +static const PackedFunc* make_end_op = + runtime::Registry::Get("relay.op.annotation._make.compiler_end"); IRModule FlattenTupleOutputs(IRModule module) { class TupleOutFlattener : public ExprRewriter { diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index 1c860b8ffb3f..9895ff6987ad 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -148,7 +148,7 @@ void DeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) { FreeDataSpace(ctx, pt TVMStreamHandle DeviceAPI::CreateStream(TVMContext ctx) { LOG(FATAL) << "Device does not support stream api."; - return 0; + return nullptr; } void DeviceAPI::FreeStream(TVMContext ctx, TVMStreamHandle stream) { @@ -462,7 +462,7 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, void* resource_handle, TVMPacked API_BEGIN(); if (fin == nullptr) { *out = new PackedFunc([func, resource_handle](TVMArgs args, TVMRetValue* rv) { - int ret = func((TVMValue*)args.values, (int*)args.type_codes, // NOLINT(*) + int ret = func(const_cast(args.values), const_cast(args.type_codes), args.num_args, rv, resource_handle); if (ret != 0) { throw dmlc::Error(TVMGetLastError() + ::dmlc::StackTrace()); @@ -473,7 +473,7 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, void* resource_handle, TVMPacked // so fin will be called when the lambda went out of scope. std::shared_ptr rpack(resource_handle, fin); *out = new PackedFunc([func, rpack](TVMArgs args, TVMRetValue* rv) { - int ret = func((TVMValue*)args.values, (int*)args.type_codes, // NOLINT(*) + int ret = func(const_cast(args.values), const_cast(args.type_codes), args.num_args, rv, rpack.get()); if (ret != 0) { throw dmlc::Error(TVMGetLastError() + ::dmlc::StackTrace()); diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc index f2d2fca64055..3fb4e0a3575f 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc @@ -31,6 +31,7 @@ #ifdef TVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB #include #include +#include #include #include #include @@ -139,12 +140,13 @@ class ACLRuntime : public JSONRuntimeBase { CreateGlobalPoolingLayer(&layer_, node); } else if ("reshape" == op_name) { CreateReshapeLayer(&layer_, node); + } else if ("maximum" == op_name) { + CreateMaximumLayer(&layer_, node); } else { LOG(FATAL) << "Unsupported op: " << op_name; } } } - this->layer_.function->prepare(); if (num_pools > 0) mm->populate(this->allocator_, num_pools); } @@ -401,6 +403,21 @@ class ACLRuntime : public JSONRuntimeBase { layer->function = function; } + /*! + * \brief Create a maximum layer. + * + * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function. + * \param node The JSON representation of the operator. 
+ */ + void CreateMaximumLayer(CachedLayer* layer, const JSONGraphNode& node) { + layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0])); + layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[1])); + layer->outputs.push_back(MakeACLTensorFromJSONNode(node)); + auto function = std::make_shared(); + function->configure(&layer->inputs[0], &layer->inputs[1], &layer->outputs[0]); + layer->function = function; + } + /*! \brief Allow ACL functions to request auxiliary memory from TVM. */ ACLAllocator allocator_; /*! diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index b69ecf26808e..f7b88ccdd964 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -217,7 +217,7 @@ class CUDADeviceAPI final : public DeviceAPI { private: static void GPUCopy(const void* from, void* to, size_t size, cudaMemcpyKind kind, cudaStream_t stream) { - if (stream != 0) { + if (stream != nullptr) { CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream)); } else { CUDA_CALL(cudaMemcpy(to, from, size, kind)); diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc index 498a9b703a7b..bf844c1ad798 100644 --- a/src/runtime/cuda/cuda_module.cc +++ b/src/runtime/cuda/cuda_module.cc @@ -169,9 +169,9 @@ class CUDAWrappedFunc { } CUstream strm = static_cast(CUDAThreadEntry::ThreadLocal()->stream); ThreadWorkLoad wl = thread_axis_cfg_.Extract(args); - CUresult result = - cuLaunchKernel(fcache_[device_id], wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2), - wl.block_dim(0), wl.block_dim(1), wl.block_dim(2), 0, strm, void_args, 0); + CUresult result = cuLaunchKernel(fcache_[device_id], wl.grid_dim(0), wl.grid_dim(1), + wl.grid_dim(2), wl.block_dim(0), wl.block_dim(1), + wl.block_dim(2), 0, strm, void_args, nullptr); if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { const char* msg; cuGetErrorName(result, &msg); diff --git a/src/runtime/hexagon/sim/driver/CMakeLists.txt b/src/runtime/hexagon/sim/driver/CMakeLists.txt index 8632b491f259..a98cfe07fcdc 100644 --- a/src/runtime/hexagon/sim/driver/CMakeLists.txt +++ b/src/runtime/hexagon/sim/driver/CMakeLists.txt @@ -56,6 +56,8 @@ target_include_directories(sim_dev PUBLIC "." PUBLIC ".." 
PUBLIC "../../../../../include" +) +target_include_directories(sim_dev SYSTEM PUBLIC "../../../../../3rdparty/dlpack/include" ) diff --git a/src/runtime/hexagon/target/fastrpc/CMakeLists.txt b/src/runtime/hexagon/target/fastrpc/CMakeLists.txt index 072b9ca62fb2..0d790d76f7f7 100644 --- a/src/runtime/hexagon/target/fastrpc/CMakeLists.txt +++ b/src/runtime/hexagon/target/fastrpc/CMakeLists.txt @@ -27,10 +27,10 @@ endif() set(FASTRPC_SRC "${CMAKE_CURRENT_SOURCE_DIR}") include_directories(include) -include_directories(${HEXAGON_SDK_ROOT}/incs) -include_directories(${HEXAGON_SDK_ROOT}/incs/stddef) +include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs) +include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/stddef) include_directories( - ${HEXAGON_SDK_ROOT}/libs/common/remote/ship/android_Release_aarch64) + SYSTEM ${HEXAGON_SDK_ROOT}/libs/common/remote/ship/android_Release_aarch64) set(QAIC_EXE "${HEXAGON_SDK_ROOT}/tools/qaic/Ubuntu16/qaic") set(QAIC_FLAGS @@ -96,9 +96,9 @@ if("${FASTRPC_LIBS}" STREQUAL "SKEL") endif() include_directories( - ${HEXAGON_SDK_ROOT}/libs/common/qurt/${HEXARCH_DIR}/include/qurt) + SYSTEM ${HEXAGON_SDK_ROOT}/libs/common/qurt/${HEXARCH_DIR}/include/qurt) include_directories( - ${HEXAGON_SDK_ROOT}/libs/common/qurt/${HEXARCH_DIR}/include/posix) + SYSTEM ${HEXAGON_SDK_ROOT}/libs/common/qurt/${HEXARCH_DIR}/include/posix) # Extra compile flags (both C and C++). set(EXTRA_COMP_FLAGS @@ -158,11 +158,11 @@ if("${FASTRPC_LIBS}" STREQUAL "SKEL") else() # Stub libraries. # - include_directories(${HEXAGON_SDK_ROOT}/incs/a1std) - include_directories(${HEXAGON_SDK_ROOT}/incs/qlist) - include_directories(${HEXAGON_SDK_ROOT}/libs/common/rpcmem/inc) + include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/a1std) + include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/qlist) + include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/libs/common/rpcmem/inc) link_directories( - ${HEXAGON_SDK_ROOT}/libs/common/remote/ship/android_Release_aarch64) + SYSTEM ${HEXAGON_SDK_ROOT}/libs/common/remote/ship/android_Release_aarch64) add_library(tvm_remote_nd_stub SHARED "${FASTRPC_SRC}/include/${TVM_REMOTE_ND_H}" diff --git a/src/runtime/library_module.cc b/src/runtime/library_module.cc index 651e19cdbd37..a5935491fcd7 100644 --- a/src/runtime/library_module.cc +++ b/src/runtime/library_module.cc @@ -74,7 +74,7 @@ PackedFunc WrapPackedFunc(TVMBackendPackedCFunc faddr, const ObjectPtr& TVMValue ret_value; int ret_type_code = kTVMNullptr; int ret = (*faddr)(const_cast(args.values), const_cast(args.type_codes), - args.num_args, &ret_value, &ret_type_code, NULL); + args.num_args, &ret_value, &ret_type_code, nullptr); CHECK_EQ(ret, 0) << TVMGetLastError(); if (ret_type_code != kTVMNullptr) { *rv = TVMRetValue::MoveFromCHost(ret_value, ret_type_code); diff --git a/src/support/socket.h b/src/support/socket.h index 3ccfaaab5ab5..d70f956a51fb 100644 --- a/src/support/socket.h +++ b/src/support/socket.h @@ -131,9 +131,9 @@ struct SockAddr { hints.ai_family = PF_UNSPEC; hints.ai_flags = AI_PASSIVE; hints.ai_socktype = SOCK_STREAM; - addrinfo* res = NULL; - int sig = getaddrinfo(host, NULL, &hints, &res); - CHECK(sig == 0 && res != NULL) << "cannot obtain address of " << host; + addrinfo* res = nullptr; + int sig = getaddrinfo(host, nullptr, &hints, &res); + CHECK(sig == 0 && res != nullptr) << "cannot obtain address of " << host; switch (res->ai_family) { case AF_INET: { sockaddr_in* addr4 = reinterpret_cast(&addr); @@ -403,7 +403,7 @@ class TCPSocket : public Socket { * \return The accepted socket connection. 
*/ TCPSocket Accept() { - SockType newfd = accept(sockfd, NULL, NULL); + SockType newfd = accept(sockfd, nullptr, nullptr); if (newfd == INVALID_SOCKET) { Socket::Error("Accept"); } diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc index 205a8a44c1ea..1f6eedde0b21 100644 --- a/src/target/llvm/codegen_amdgpu.cc +++ b/src/target/llvm/codegen_amdgpu.cc @@ -106,7 +106,7 @@ class CodeGenAMDGPU : public CodeGenLLVM { llvm::Type* type = llvm::ArrayType::get(DTypeToLLVMType(op->dtype), constant_size); // Allocate shared memory in global, address_space = 3 llvm::GlobalVariable* global = new llvm::GlobalVariable( - *module_, type, false, llvm::GlobalValue::PrivateLinkage, 0, ".shared", nullptr, + *module_, type, false, llvm::GlobalValue::PrivateLinkage, nullptr, ".shared", nullptr, llvm::GlobalValue::NotThreadLocal, shared_address_space); if (global->getAlignment() < static_cast<uint32_t>(info.alignment)) { #if TVM_LLVM_VERSION >= 100 diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index 127889d52fbd..53104542417e 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -225,8 +225,9 @@ void CodeGenCPU::AddMainFunction(const std::string& entry_func_name) { llvm::Function* f = module_->getFunction(entry_func_name); CHECK(f) << "Function " << entry_func_name << "does not in module"; llvm::Type* type = llvm::ArrayType::get(t_char_, entry_func_name.length() + 1); - llvm::GlobalVariable* global = new llvm::GlobalVariable( - *module_, type, true, llvm::GlobalValue::WeakAnyLinkage, 0, runtime::symbol::tvm_module_main); + llvm::GlobalVariable* global = + new llvm::GlobalVariable(*module_, type, true, llvm::GlobalValue::WeakAnyLinkage, nullptr, + runtime::symbol::tvm_module_main); #if TVM_LLVM_VERSION >= 100 global->setAlignment(llvm::Align(1)); #else @@ -349,7 +350,7 @@ llvm::Value* CodeGenCPU::CreateCallExtern(Type ret_type, String global_symbol, llvm::GlobalVariable* CodeGenCPU::InitContextPtr(llvm::Type* p_type, std::string name) { llvm::GlobalVariable* gv = new llvm::GlobalVariable( - *module_, p_type, false, llvm::GlobalValue::LinkOnceAnyLinkage, 0, name); + *module_, p_type, false, llvm::GlobalValue::LinkOnceAnyLinkage, nullptr, name); #if TVM_LLVM_VERSION >= 100 gv->setAlignment(llvm::Align(data_layout_->getTypeAllocSize(p_type))); #else @@ -552,8 +553,9 @@ void CodeGenCPU::CreateParallelLaunch(const Stmt& body, int num_task) { } llvm::Value* CodeGenCPU::CreateStaticHandle() { - llvm::GlobalVariable* gv = new llvm::GlobalVariable( - *module_, t_void_p_, false, llvm::GlobalValue::PrivateLinkage, 0, "__tvm_static_handle"); + llvm::GlobalVariable* gv = + new llvm::GlobalVariable(*module_, t_void_p_, false, llvm::GlobalValue::PrivateLinkage, + nullptr, "__tvm_static_handle"); #if TVM_LLVM_VERSION >= 100 gv->setAlignment(llvm::Align(data_layout_->getTypeAllocSize(t_void_p_))); #else diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc index c52f9b06929e..a7e96c95e07f 100644 --- a/src/target/llvm/codegen_hexagon.cc +++ b/src/target/llvm/codegen_hexagon.cc @@ -237,7 +237,7 @@ llvm::Value* CodeGenHexagon::CreateCallExtern(Type ret_type, String global_symbo llvm::GlobalVariable* CodeGenHexagon::InitContextPtr(llvm::Type* p_type, std::string name) { llvm::GlobalVariable* gv = new llvm::GlobalVariable( - *module_, p_type, false, llvm::GlobalValue::LinkOnceAnyLinkage, 0, name); + *module_, p_type, false, llvm::GlobalValue::LinkOnceAnyLinkage, nullptr, name); #if TVM_LLVM_VERSION >= 100
gv->setAlignment(llvm::Align(data_layout_->getTypeAllocSize(p_type))); #else diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 4ffd6b25c237..cb04e6b8055b 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -628,8 +628,8 @@ llvm::Constant* CodeGenLLVM::GetConstString(const std::string& str) { auto it = str_map_.find(str); if (it != str_map_.end()) return it->second; llvm::Type* type = llvm::ArrayType::get(t_char_, str.length() + 1); - llvm::GlobalVariable* global = - new llvm::GlobalVariable(*module_, type, true, llvm::GlobalValue::PrivateLinkage, 0, ".str"); + llvm::GlobalVariable* global = new llvm::GlobalVariable( + *module_, type, true, llvm::GlobalValue::PrivateLinkage, nullptr, ".str"); #if TVM_LLVM_VERSION >= 100 global->setAlignment(llvm::Align(1)); #else diff --git a/src/target/llvm/codegen_nvptx.cc b/src/target/llvm/codegen_nvptx.cc index e9999f1ca283..601df86d10ba 100644 --- a/src/target/llvm/codegen_nvptx.cc +++ b/src/target/llvm/codegen_nvptx.cc @@ -82,7 +82,7 @@ class CodeGenNVPTX : public CodeGenLLVM { llvm::Type* type = llvm::ArrayType::get(DTypeToLLVMType(op->dtype), constant_size); // Allocate shared memory in global, address_space = 3 llvm::GlobalVariable* global = new llvm::GlobalVariable( - *module_, type, false, llvm::GlobalValue::PrivateLinkage, 0, ".shared", nullptr, + *module_, type, false, llvm::GlobalValue::PrivateLinkage, nullptr, ".shared", nullptr, llvm::GlobalValue::NotThreadLocal, shared_address_space); #if TVM_LLVM_VERSION >= 100 global->setAlignment(llvm::Align(info.alignment)); diff --git a/src/tir/transforms/lower_custom_datatypes.cc b/src/tir/transforms/lower_custom_datatypes.cc index 7fd2352a3d19..ae9584f488af 100644 --- a/src/tir/transforms/lower_custom_datatypes.cc +++ b/src/tir/transforms/lower_custom_datatypes.cc @@ -97,7 +97,7 @@ class CustomDatatypesLowerer : public StmtExprMutator { return expr; } -#define DEFINE_MUTATE__(OP, NodeName) \ +#define DEFINE_MUTATE(OP, NodeName) \ inline PrimExpr VisitExpr_(const NodeName* op) final { \ auto type_code = op->dtype.code(); \ bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(type_code); \ @@ -112,19 +112,19 @@ class CustomDatatypesLowerer : public StmtExprMutator { return expr; \ } - DEFINE_MUTATE__(Add, AddNode); - DEFINE_MUTATE__(Sub, SubNode); - DEFINE_MUTATE__(Mul, MulNode); - DEFINE_MUTATE__(Div, DivNode); - DEFINE_MUTATE__(Mod, ModNode); - DEFINE_MUTATE__(Min, MinNode); - DEFINE_MUTATE__(Max, MaxNode); - DEFINE_MUTATE__(EQ, EQNode); - DEFINE_MUTATE__(NE, NENode); - DEFINE_MUTATE__(LT, LTNode); - DEFINE_MUTATE__(LE, LENode); - DEFINE_MUTATE__(GT, GTNode); - DEFINE_MUTATE__(GE, GENode); + DEFINE_MUTATE(Add, AddNode); + DEFINE_MUTATE(Sub, SubNode); + DEFINE_MUTATE(Mul, MulNode); + DEFINE_MUTATE(Div, DivNode); + DEFINE_MUTATE(Mod, ModNode); + DEFINE_MUTATE(Min, MinNode); + DEFINE_MUTATE(Max, MaxNode); + DEFINE_MUTATE(EQ, EQNode); + DEFINE_MUTATE(NE, NENode); + DEFINE_MUTATE(LT, LTNode); + DEFINE_MUTATE(LE, LENode); + DEFINE_MUTATE(GT, GTNode); + DEFINE_MUTATE(GE, GENode); // Later changes may need to add more mutate functions as we support workloads with more ops. 
private: diff --git a/src/topi/nn.cc b/src/topi/nn.cc index 4a209b2f2932..c03d1b056d35 100644 --- a/src/topi/nn.cc +++ b/src/topi/nn.cc @@ -75,7 +75,7 @@ TVM_REGISTER_GLOBAL("topi.nn.batch_matmul").set_body([](TVMArgs args, TVMRetValu /* Ops from nn/dilate.h */ TVM_REGISTER_GLOBAL("topi.nn.dilate").set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = nn::dilate(args[0], args[1]); + *rv = nn::dilate(args[0], args[1], args[2]); }); /* Ops from nn/flatten.h */ diff --git a/src/topi/schedule.cc b/src/topi/schedule.cc index ead803b84cf9..83457ced9f16 100644 --- a/src/topi/schedule.cc +++ b/src/topi/schedule.cc @@ -21,7 +21,6 @@ * \brief Registration of TVM schedules * \file schedule.cc */ -#define TOPI_REDUCE_ATLEAST1D 0 #include #include diff --git a/src/topi/transform.cc b/src/topi/transform.cc index bf7e1e67c247..d79952e2494f 100644 --- a/src/topi/transform.cc +++ b/src/topi/transform.cc @@ -177,7 +177,11 @@ TVM_REGISTER_GLOBAL("topi.one_hot").set_body([](TVMArgs args, TVMRetValue* rv) { }); TVM_REGISTER_GLOBAL("topi.matrix_set_diag").set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = matrix_set_diag(args[0], args[1]); + int k1 = args[2]; + int k2 = args[3]; + bool super_diag_right_align = args[4]; + bool sub_diag_right_align = args[5]; + *rv = matrix_set_diag(args[0], args[1], k1, k2, super_diag_right_align, sub_diag_right_align); }); TVM_REGISTER_GLOBAL("topi.adv_index").set_body([](TVMArgs args, TVMRetValue* rv) { diff --git a/tests/python/contrib/test_arm_compute_lib/test_maximum.py b/tests/python/contrib/test_arm_compute_lib/test_maximum.py new file mode 100644 index 000000000000..8ddb901946fc --- /dev/null +++ b/tests/python/contrib/test_arm_compute_lib/test_maximum.py @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Arm Compute Library integration reshape tests.""" + +import numpy as np + +import tvm +from tvm import relay + +from .infrastructure import ( + skip_runtime_test, + skip_codegen_test, + build_and_run, + verify, + verify_codegen, +) +from .infrastructure import Device + + +def _get_model(input_shape, dtype, var_names): + """Return a model and any parameters it may have.""" + a = relay.var(next(var_names), shape=input_shape, dtype=dtype) + b = relay.var(next(var_names), shape=input_shape, dtype=dtype) + max = relay.maximum(a, b) + return max + + +def _get_expected_codegen(shape, dtype): + node = { + "op": "kernel", + "name": "maximum", + "inputs": [[0, 0, 0], [1, 0, 0]], + "attrs": { + "num_inputs": "2", + "num_outputs": "1", + "shape": [[list(shape)]], + "dtype": [[dtype]], + }, + } + + inputs = [ + {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}, + {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}, + ] + inputs.append(node) + return inputs + + +def test_maximum(): + Device.load("test_config.json") + + if skip_runtime_test(): + return + + device = Device() + np.random.seed(0) + + for dtype, low, high, atol, rtol in [ + ("float32", -127, 128, 0.001, 0.001), + ("float32", -1, 1, 0.001, 0.001), + ]: + inputs = { + "a": tvm.nd.array(np.random.uniform(low, high, (100, 100)).astype(dtype)), + "b": tvm.nd.array(np.random.uniform(low, high, (100, 100)).astype(dtype)), + } + outputs = [] + func = _get_model(inputs["a"].shape, dtype, iter(inputs)) + + for acl in [False, True]: + outputs.append(build_and_run(func, inputs, 1, None, device, enable_acl=acl)[0]) + + verify(outputs, atol=1e-7, rtol=1e-7) + + +def test_codegen_maximum(): + if skip_codegen_test(): + return + + shape = (100, 100) + inputs = {"a", "b"} + for dtype in ["float32"]: + args = (shape, dtype) + func = _get_model(*args, iter(inputs)) + exp_codegen = _get_expected_codegen(*args) + verify_codegen(func, exp_codegen, 1) + + +if __name__ == "__main__": + test_maximum() + test_codegen_maximum() diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index bda0f472148b..1aeb430de52f 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -1969,6 +1969,7 @@ def verify_prelu(x_shape, a_shape): verify_prelu([3, 4, 5, 6], [1, 4, 1, 1]) verify_prelu([1, 8, 5, 6], [1, 8, 1, 1]) verify_prelu([2, 12, 16, 16], [1, 12, 1, 1]) + verify_prelu([2, 12, 16, 16], [1]) # Test alpha broadcasting. 
@tvm.testing.uses_gpu diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 83ba22b7c1d9..8c1143646426 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -17,6 +17,7 @@ # pylint: disable=import-self, invalid-name, unused-argument """Unit tests for various models and operators""" from time import time +import os import sys from scipy.stats import t as tdistr import numpy as np @@ -1519,6 +1520,10 @@ class ToDouble(Module): def forward(self, x): return x.double() + class ToFloat16(Module): + def forward(self, x): + return x.to(torch.float16) + verify_model(ToCPU().eval(), torch.rand((1, 3, 32, 32))) verify_model(ToFloat().eval(), torch.zeros((1, 3, 32, 32), dtype=torch.int)) verify_model(ToFloat().eval(), torch.tensor(2, dtype=torch.int)) @@ -1526,6 +1531,8 @@ def forward(self, x): verify_model(ToInt().eval(), torch.tensor(0.8)) verify_model(ToLong().eval(), torch.tensor(0.8)) verify_model(ToDouble().eval(), torch.tensor(0.8)) + verify_model(ToFloat16().eval(), torch.tensor(2, dtype=torch.float32)) + verify_model(ToFloat16().eval(), torch.zeros((1, 3, 32, 32), dtype=torch.int)) @tvm.testing.uses_gpu @@ -3286,6 +3293,43 @@ def test_forward_pretrained_bert_base_uncased(): print("TVM top-1 id: {}, token: {}".format(tvm_pred_idx, tvm_pred_token)) +def test_convert_torch_script_with_input_types(): + def model_fn(x, y): + x = x.to(dtype=torch.int32) + y = x + y + return y + + ishape = (4, 5) + input_x = torch.rand(ishape, dtype=torch.float32) + input_y = torch.randint(low=0, high=100, size=ishape, dtype=torch.int32) + inputs = [input_x, input_y] + script_module = torch.jit.trace(model_fn, inputs) + + fname = "tmp.pt" + torch.jit.save(script_module, fname) + loaded = torch.jit.load(fname) + os.remove(fname) + + verify_model(loaded.eval(), input_data=inputs) + + def expected(x_shape, y_shape): + # use a fixed order of args so alpha equal check can pass + x = relay.var("x", shape=x_shape, dtype="float32") + y = relay.var("y", shape=y_shape, dtype="int32") + args = [x, y] + x1 = relay.cast(x, "int32") + y1 = relay.add(x1, y) + mod = tvm.IRModule.from_expr(relay.Function(args, y1)) + return mod["main"] + + input_infos = [("input0", (ishape, "float")), ("input1", (ishape, "int"))] + mod, params = relay.frontend.from_pytorch(loaded, input_infos) + + expected_mod = expected(ishape, ishape) + + assert tvm.ir.structural_equal(expected_mod, mod["main"], map_free_vars=True) + + if __name__ == "__main__": # some structural tests test_forward_traced_function() @@ -3446,3 +3490,6 @@ def test_forward_pretrained_bert_base_uncased(): # Test bert model test_forward_pretrained_bert_base_uncased() + + # Test convert torch script(jit) with specific inputs' types + test_convert_torch_script_with_input_types() diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index d24a733f7655..c13f679e6108 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -740,18 +740,24 @@ def test_any_pad(): verify_any_pad(any_dims(4), ((1, 0), (1, 3), (0, 2), (9, 0)), (13, 11, 3, 1)) -def verify_any_dilate(data_shape, strides, static_data_shape): +def verify_any_dilate(data_shape, strides, static_data_shape, dilation_value=None): assert len(data_shape) == len(strides) mod = tvm.IRModule() dtype = "float32" data = relay.var("data", shape=data_shape, dtype=dtype) - y = relay.nn.dilate(data, strides) + if dilation_value is None: + y = relay.nn.dilate(data, strides) + else: 
+ y = relay.nn.dilate(data, strides, dilation_value) mod["main"] = relay.Function([data], y) data_np = np.random.uniform(size=static_data_shape).astype(dtype) ref_shape = tuple( (static_data_shape[i] - 1) * strides[i] + 1 for i in range(len(static_data_shape)) ) - ref_out = np.zeros(shape=ref_shape, dtype=dtype) + if dilation_value is None: + dilation_value = 0.0 + ref_out = np.ones(shape=ref_shape, dtype=dtype) + ref_out = dilation_value * ref_out ref_out[tuple(slice(None, None, strides[i]) for i in range(len(data_shape)))] = data_np check_result([data_np], mod, ref_out) @@ -766,6 +772,7 @@ def test_any_dilate(): verify_any_dilate(any_dims(3), (1, 1, 5), (1, 2, 3)) verify_any_dilate(any_dims(3), (3, 7, 5), (1, 2, 3)) verify_any_dilate(any_dims(4), (3, 7, 1, 5), (1, 2, 3, 4)) + verify_any_dilate(any_dims(4), (3, 7, 1, 5), (1, 2, 3, 4), 1.0) def verify_any_softmax(data_shape, axis, static_data_shape, ref_out_shape): diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index 8ad40a617b34..bc565682d932 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -533,12 +533,10 @@ def _verify(indices_shape, depth, on_value, off_value, axis, dtype): @tvm.testing.uses_gpu def test_matrix_set_diag(): - def _verify(input_shape, dtype): - diagonal_shape = list(input_shape[:-2]) - diagonal_shape.append(min(input_shape[-2], input_shape[-1])) + def _verify(input_shape, diagonal_shape, dtype, k=0, align="RIGHT_LEFT"): input = relay.var("input", relay.TensorType(input_shape, dtype)) diagonal = relay.var("diagonal", relay.TensorType(diagonal_shape, dtype)) - out = relay.matrix_set_diag(input, diagonal) + out = relay.matrix_set_diag(input, diagonal, k, align) in_type = run_infer_type(input) out_type = run_infer_type(out) @@ -547,7 +545,7 @@ def _verify(input_shape, dtype): func = relay.Function([input, diagonal], out) input_np = np.random.randint(-100, 100, size=input_shape).astype(dtype) diagonal_np = np.random.randint(-100, 100, size=diagonal_shape).astype(dtype) - out_np = tvm.topi.testing.matrix_set_diag(input_np, diagonal_np) + out_np = tvm.topi.testing.matrix_set_diag(input_np, diagonal_np, k, align) for target, ctx in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: @@ -555,9 +553,12 @@ def _verify(input_shape, dtype): out_relay = intrp.evaluate(func)(input_np, diagonal_np) tvm.testing.assert_allclose(out_relay.asnumpy(), out_np) - _verify((2, 2), "float32") - _verify((4, 3, 3), "int32") - _verify((2, 3, 4), "float32") + _verify((2, 2), (2,), "float32") + _verify((4, 3, 3), (4, 3), "int32") + _verify((2, 3, 4), (2, 3), "float32", 1) + _verify((2, 3, 4), (2, 4, 3), "int32", (-1, 2), "LEFT_RIGHT") + _verify((2, 3, 4), (2, 4, 3), "int32", (-1, 2), "LEFT_LEFT") + _verify((2, 3, 4), (2, 4, 3), "int32", (-1, 2), "RIGHT_RIGHT") if __name__ == "__main__": diff --git a/tests/python/relay/test_pass_convert_op_layout.py b/tests/python/relay/test_pass_convert_op_layout.py index 9954c0143a68..c765de48d7f8 100644 --- a/tests/python/relay/test_pass_convert_op_layout.py +++ b/tests/python/relay/test_pass_convert_op_layout.py @@ -52,6 +52,33 @@ def expected(): assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) +def test_qnn_binary_no_convert_layout(): + def before(): + x = relay.var("x", shape=(2, 2)) + y = relay.var("y", shape=(1, 2)) + return relay.Function( + [x, y], + relay.qnn.op.add( + x, + y, + lhs_scale=relay.const(0.0156863, "float32"), + lhs_zero_point=relay.const(127, "int32"), + 
rhs_scale=relay.const(0.0117647, "float32"), + rhs_zero_point=relay.const(85, "int32"), + output_scale=relay.const(0.0235294, "float32"), + output_zero_point=relay.const(128, "int32"), + ), + ) + + def expected(): + return before() + + a = before() + a = run_opt_pass(a, transform.ConvertLayout({})) + b = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + + def test_conv_convert_layout(): def before(): x = relay.var("x", shape=(1, 56, 56, 64)) @@ -1041,7 +1068,57 @@ def expected(): assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) +def test_no_desired_layout(): + def before(): + x = relay.var("x", shape=(1, 64, 56, 56)) + weight1 = relay.var("weight1", shape=(64, 64, 3, 3)) + y = relay.nn.conv2d( + x, + weight1, + channels=64, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + ) + rois = relay.var("rois", shape=(32, 5)) + y = relay.vision.roi_align( + y, rois, pooled_size=(14, 14), spatial_scale=0.0625, sample_ratio=2, layout="NCHW" + ) + y = relay.Function(analysis.free_vars(y), y) + return y + + def expected(): + x = relay.var("x", shape=(1, 64, 56, 56)) + weight1 = relay.var("weight1", shape=(64, 64, 3, 3)) + x = relay.layout_transform(x, "NCHW", "NHWC") + weight1 = relay.layout_transform(weight1, "OIHW", "HWIO") + y = relay.nn.conv2d( + x, + weight1, + channels=64, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + ) + y = relay.layout_transform(y, "NHWC", "NCHW") + rois = relay.var("rois", shape=(32, 5)) + y = relay.vision.roi_align( + y, rois, pooled_size=(14, 14), spatial_scale=0.0625, sample_ratio=2, layout="NCHW" + ) + y = relay.Function(analysis.free_vars(y), y) + return y + + a = before() + a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NHWC", "HWIO"]})) + b = run_opt_pass(expected(), transform.InferType()) + + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + + if __name__ == "__main__": + test_qnn_binary_no_convert_layout() test_no_convert_layout() test_conv_convert_layout() test_conv_nhwc_convert_layout() @@ -1061,3 +1138,4 @@ def expected(): test_conv_roi_align_convert_layout() test_default_keyword() test_different_ops_convert_layout() + test_no_desired_layout() diff --git a/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py b/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py index 742892d19fa9..267cfbe4c990 100644 --- a/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py +++ b/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py @@ -62,23 +62,18 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch( - device, _conv2d_transpose_nchw_implement - ) - B = fcompute( - A, - W, - [stride_height, stride_width], - [pad_top, pad_left, pad_bottom, pad_right], - A.dtype, - output_padding, - ) - C = topi.nn.relu(B) - s1 = fschedule([B]) - s2 = fschedule([C]) + def check(fcompute, fschedule, device, ctx): + B = fcompute( + A, + W, + [stride_height, stride_width], + [pad_top, pad_left, pad_bottom, pad_right], + A.dtype, + output_padding, + ) + C = topi.nn.relu(B) + s1 = fschedule([B]) + s2 = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) @@ -91,6 +86,22 @@ def check_device(device, ctx): tvm.testing.assert_allclose(b.asnumpy(), 
b_np, rtol=1e-5) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + def check_generic(device, ctx): + print("Running generic on target: %s" % device) + with tvm.target.Target(device): + fcompute, fschedule = _conv2d_transpose_nchw_implement["generic"] + check(fcompute, fschedule, device, ctx) + + check_generic("llvm", tvm.cpu(0)) + + def check_device(device, ctx): + print("Running on target: %s" % device) + with tvm.target.Target(device): + fcompute, fschedule = tvm.topi.testing.dispatch( + device, _conv2d_transpose_nchw_implement + ) + check(fcompute, fschedule, device, ctx) + for device, ctx in tvm.testing.enabled_targets(): check_device(device, ctx) diff --git a/tests/python/topi/python/test_topi_dilate.py b/tests/python/topi/python/test_topi_dilate.py index ab5c61dce406..0ee51a6c7bf4 100644 --- a/tests/python/topi/python/test_topi_dilate.py +++ b/tests/python/topi/python/test_topi_dilate.py @@ -26,12 +26,18 @@ def test_dilate(): target = "llvm" ctx = tvm.cpu(0) - def _test_dilate(input_size, strides): + def _test_dilate(input_size, strides, dilation_value=None): Input = te.placeholder((input_size)) - Output = topi.nn.dilate(Input, strides) + if dilation_value is None: + Output = topi.nn.dilate(Input, strides) + else: + Output = topi.nn.dilate(Input, strides, dilation_value) schedule = te.create_schedule(Output.op) input_np = np.random.uniform(size=input_size).astype(Input.dtype) - output_np = tvm.topi.testing.dilate_python(input_np, strides) + if dilation_value is None: + output_np = tvm.topi.testing.dilate_python(input_np, strides) + else: + output_np = tvm.topi.testing.dilate_python(input_np, strides, dilation_value) input_tvm = tvm.nd.array(input_np, ctx=ctx) output_size = topi.util.get_const_tuple(Output.shape) output_tvm = tvm.nd.array(np.zeros(shape=output_size).astype(Output.dtype), ctx=ctx) @@ -47,6 +53,7 @@ def _test_dilate(input_size, strides): _test_dilate((1, 32, 32, 3, 3), (2, 2, 2, 2, 2)) _test_dilate((1, 32, 32, 32, 3, 3), (1, 1, 1, 2, 2, 2)) _test_dilate((1, 32, 32, 32, 3, 3), (2, 2, 2, 1, 1, 1)) + _test_dilate((1, 32, 32, 32, 3, 3), (2, 2, 2, 1, 1, 1), 1.0) if __name__ == "__main__": diff --git a/tests/python/topi/python/test_topi_group_conv2d.py b/tests/python/topi/python/test_topi_group_conv2d.py index d97716bbb3db..45e77d19082d 100644 --- a/tests/python/topi/python/test_topi_group_conv2d.py +++ b/tests/python/topi/python/test_topi_group_conv2d.py @@ -35,6 +35,10 @@ "gpu": (topi.cuda.group_conv2d_nchw, topi.cuda.schedule_group_conv2d_nchw), } +_group_conv2d_nhwc_implement = { + "generic": (topi.nn.group_conv2d_nhwc, topi.generic.schedule_group_conv2d_nhwc), +} + def verify_group_conv2d_nchw( batch, @@ -275,6 +279,119 @@ def check_device(device): check_device(device) +def verify_group_conv2d_nhwc( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation, + groups, + add_bias=False, + add_relu=False, +): + print( + "Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d)" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups) + ) + + in_height = in_width = in_size + + A = te.placeholder((batch, in_height, in_width, in_channel), name="A") + W = te.placeholder((kernel, kernel, in_channel // groups, num_filter), name="W") + bias = te.placeholder((1, 1, num_filter), name="bias") + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + bias_shape = get_const_tuple(bias.shape) + dtype = A.dtype + + @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_nhwc") + def get_ref_data(): 
+ a_np = np.random.uniform(size=a_shape).astype(dtype) + w_np = np.random.uniform(size=w_shape).astype(dtype) + b_np = np.random.uniform(size=bias_shape).astype(dtype) + dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1)) + c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding, groups).astype( + dtype + ) + + if add_bias: + b_np = np.random.uniform(size=bias_shape).astype(dtype) + c_np += b_np + if add_relu: + c_np = np.maximum(c_np, 0) + + return a_np, w_np, b_np, c_np + + a_np, w_np, b_np, c_np = get_ref_data() + + def check_device(device): + ctx = tvm.context(device, 0) + if not tvm.testing.device_enabled(device): + print("Skip because %s is not enabled" % device) + return + + print("Running on target: %s" % device) + with tvm.target.Target(device): + fcompute, fschedule = tvm.topi.testing.dispatch(device, _group_conv2d_nhwc_implement) + C = fcompute(A, W, stride, padding, dilation, groups, dtype) + if add_bias: + C = topi.add(C, bias) + if add_relu: + C = topi.nn.relu(C) + s = fschedule([C]) + + a = tvm.nd.array(a_np, ctx) + w = tvm.nd.array(w_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + if add_bias: + func = tvm.build( + s, + [A, W, bias, C], + device, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" + % ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation, + groups, + ), + ) + func(a, w, b, c) + else: + func = tvm.build( + s, + [A, W, C], + device, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" + % ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation, + groups, + ), + ) + func(a, w, c) + tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + + for device in ["llvm"]: + check_device(device) + + @tvm.testing.uses_gpu def test_group_conv2d_nchw(): # ResNeXt-50 workload @@ -325,6 +442,30 @@ def test_group_conv2d_NCHWc_int8(): verify_group_conv2d_NCHWc_int8(9, 128, 56, 128, 3, 1, 1, 1, 32) +def test_group_conv2d_nhwc(): + # ResNeXt-50 workload + verify_group_conv2d_nhwc(1, 128, 56, 128, 3, 1, 1, 1, 32) + verify_group_conv2d_nhwc(1, 256, 56, 256, 3, 2, 1, 1, 32) + verify_group_conv2d_nhwc(1, 256, 28, 256, 3, 1, 1, 1, 32) + verify_group_conv2d_nhwc(1, 512, 28, 512, 3, 2, 1, 1, 32) + verify_group_conv2d_nhwc(1, 512, 14, 512, 3, 1, 1, 1, 32) + verify_group_conv2d_nhwc(1, 1024, 14, 1024, 3, 2, 1, 1, 32) + verify_group_conv2d_nhwc(1, 1024, 7, 1024, 3, 1, 1, 1, 32) + + # bias, relu + verify_group_conv2d_nhwc(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True) + verify_group_conv2d_nhwc(1, 128, 56, 128, 3, 1, 1, 1, 32, add_bias=True) + verify_group_conv2d_nhwc(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True, add_bias=True) + + # dilation + verify_group_conv2d_nhwc(1, 128, 56, 128, 3, 1, 1, 2, 32) + + # batch size + verify_group_conv2d_nhwc(2, 128, 56, 128, 3, 1, 1, 1, 32) + verify_group_conv2d_nhwc(9, 128, 56, 128, 3, 1, 1, 1, 32) + + if __name__ == "__main__": test_group_conv2d_nchw() test_group_conv2d_NCHWc_int8() + test_group_conv2d_nhwc() diff --git a/tests/python/topi/python/test_topi_transform.py b/tests/python/topi/python/test_topi_transform.py index a32d41a27e17..f18b5397eefe 100644 --- a/tests/python/topi/python/test_topi_transform.py +++ b/tests/python/topi/python/test_topi_transform.py @@ -715,12 +715,10 @@ def check_device(device, ctx): check_device(device, ctx) -def verify_matrix_set_diag(input_shape, dtype): - diagonal_shape = list(input_shape[:-2]) - diagonal_shape.append(min(input_shape[-2], input_shape[-1])) 
+def verify_matrix_set_diag(input_shape, diagonal_shape, dtype, k=0, align="RIGHT_LEFT"): input = te.placeholder(shape=input_shape, name="input", dtype=dtype) diagonal = te.placeholder(shape=diagonal_shape, name="diagonal", dtype=dtype) - matrix_set_diag_result = topi.transform.matrix_set_diag(input, diagonal) + matrix_set_diag_result = topi.transform.matrix_set_diag(input, diagonal, k, align) def check_device(device, ctx): ctx = tvm.context(device, 0) @@ -730,7 +728,7 @@ def check_device(device, ctx): fn = tvm.build(s, [input, diagonal, matrix_set_diag_result], device, name="matrix_set_diag") input_npy = np.random.randint(-100, 100, size=input_shape).astype(dtype) diagonal_npy = np.random.randint(-100, 100, size=diagonal_shape).astype(dtype) - out_npy = tvm.topi.testing.matrix_set_diag(input_npy, diagonal_npy) + out_npy = tvm.topi.testing.matrix_set_diag(input_npy, diagonal_npy, k, align) input_nd = tvm.nd.array(input_npy, ctx) diagonal_nd = tvm.nd.array(diagonal_npy, ctx) out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(matrix_set_diag_result.dtype), ctx) @@ -1165,9 +1163,12 @@ def test_sparse_to_dense(): @tvm.testing.uses_gpu def test_matrix_set_diag(): for dtype in ["float32", "int32"]: - verify_matrix_set_diag((2, 2), dtype) - verify_matrix_set_diag((4, 3, 3), dtype) - verify_matrix_set_diag((2, 3, 4), dtype) + verify_matrix_set_diag((2, 2), (2,), dtype) + verify_matrix_set_diag((4, 3, 3), (4, 3), dtype) + verify_matrix_set_diag((2, 3, 4), (2, 3), dtype, 1) + verify_matrix_set_diag((2, 3, 4), (2, 4, 3), dtype, (-1, 2), "LEFT_RIGHT") + verify_matrix_set_diag((2, 3, 4), (2, 4, 3), dtype, (-1, 2), "LEFT_LEFT") + verify_matrix_set_diag((2, 3, 4), (2, 4, 3), dtype, (-1, 2), "RIGHT_RIGHT") @tvm.testing.uses_gpu diff --git a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py index 0571ede488d1..c3afa6c65627 100644 --- a/tests/python/unittest/test_arith_rewrite_simplify.py +++ b/tests/python/unittest/test_arith_rewrite_simplify.py @@ -916,6 +916,12 @@ def test_cast_simplify(): ck.verify(tvm.tir.Cast(dtype1, tvm.tir.const(i, dtype2)), tvm.tir.const(i, dtype1)) +def test_shift_left_simplify(): + ck = RewriteChecker() + z = tvm.tir.op.call_intrin("int32", "tir.shift_left", 1, 10) + ck.verify(z, tvm.tir.const(1 << 10, "int32")) + + if __name__ == "__main__": test_floordiv_index_simplify() test_floormod_index_simplify() @@ -932,3 +938,4 @@ def test_cast_simplify(): test_logical_simplify() test_let_simplify() test_cast_simplify() + test_shift_left_simplify() diff --git a/tests/python/unittest/test_auto_scheduler_common.py b/tests/python/unittest/test_auto_scheduler_common.py index 33e498ecdfd0..eaf328c6303a 100644 --- a/tests/python/unittest/test_auto_scheduler_common.py +++ b/tests/python/unittest/test_auto_scheduler_common.py @@ -40,6 +40,19 @@ def matmul_auto_scheduler_test(N, M, K): return [A, B, C] +@auto_scheduler.register_workload +def double_matmul_auto_scheduler_test(N): + A = te.placeholder((N, N), name="A", dtype="float32") + B = te.placeholder((N, N), name="B", dtype="float32") + C = te.placeholder((N, N), name="C", dtype="float32") + k = te.reduce_axis((0, N), name="k") + D = te.compute((N, N), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="D") + k = te.reduce_axis((0, N), name="k") + E = te.compute((N, N), lambda i, j: te.sum(D[i][k] * C[k][j], axis=[k]), name="E") + + return [A, B, C, E] + + # Test for register_workload with different name @auto_scheduler.register_workload("matmul_auto_scheduler_test_rename_1") def 
matmul_auto_scheduler_test_rename_0(N, M, K): diff --git a/tests/python/unittest/test_auto_scheduler_evolutionary_search.py b/tests/python/unittest/test_auto_scheduler_evolutionary_search.py index eb706b7e6976..bf6efd0bf11d 100644 --- a/tests/python/unittest/test_auto_scheduler_evolutionary_search.py +++ b/tests/python/unittest/test_auto_scheduler_evolutionary_search.py @@ -47,7 +47,7 @@ def test_evo_search(): workload_key = auto_scheduler.make_workload_key(matmul_auto_scheduler_test, (10, 10, 4)) dag = auto_scheduler.ComputeDAG(workload_key) task = auto_scheduler.SearchTask(dag, workload_key, tvm.target.Target("llvm")) - policy = auto_scheduler.SketchPolicy(task, schedule_cost_model=MockCostModel(), verbose=0) + policy = auto_scheduler.SketchPolicy(task, program_cost_model=MockCostModel(), verbose=0) states = policy.sample_initial_population(50) pruned_states = [] for state in states: diff --git a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py index aba27840a61f..caa1d6a99f40 100644 --- a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py +++ b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py @@ -45,7 +45,7 @@ def test_layout_rewrite_correctness(): workload = matmul_auto_scheduler_test workload_key = auto_scheduler.make_workload_key(workload, (N, N, N)) dag = auto_scheduler.ComputeDAG(workload_key) - target = tvm.target.create(target) + target = tvm.target.Target(target) task = auto_scheduler.SearchTask(dag, workload_key, target) with tempfile.NamedTemporaryFile() as fp: diff --git a/tests/python/unittest/test_auto_scheduler_search_policy.py b/tests/python/unittest/test_auto_scheduler_search_policy.py index 6ec96a6f544a..04b54b2858cf 100644 --- a/tests/python/unittest/test_auto_scheduler_search_policy.py +++ b/tests/python/unittest/test_auto_scheduler_search_policy.py @@ -57,7 +57,7 @@ def search_common( search_policy = auto_scheduler.EmptyPolicy(task) elif search_policy == "sketch": search_policy = auto_scheduler.SketchPolicy( - task, schedule_cost_model=cost_model, init_search_callbacks=init_search_callbacks + task, program_cost_model=cost_model, init_search_callbacks=init_search_callbacks ) tuning_options = auto_scheduler.TuningOptions( diff --git a/tests/python/unittest/test_auto_scheduler_sketch_generation.py b/tests/python/unittest/test_auto_scheduler_sketch_generation.py index fa67756833bf..5a687daf686a 100644 --- a/tests/python/unittest/test_auto_scheduler_sketch_generation.py +++ b/tests/python/unittest/test_auto_scheduler_sketch_generation.py @@ -25,6 +25,7 @@ from test_auto_scheduler_common import ( matmul_auto_scheduler_test, + double_matmul_auto_scheduler_test, conv2d_nchw_bn_relu_auto_scheduler_test, max_pool2d_auto_scheduler_test, min_nm_auto_scheduler_test, @@ -73,9 +74,9 @@ def assert_has_cross_thread_reduction(state, stage_id): def test_cpu_matmul_sketch(): sketches = generate_sketches(matmul_auto_scheduler_test, (512, 512, 512), "llvm") """ 3 multi-level tiling sketches - 0 - Multi-level tiling - 1 - Multi-level tiling with cache write on position 0 - 2 - Multi-level tiling with cache write on position 1 + No.0 : Multi-level tiling + No.1 : Multi-level tiling with cache write on position 0 + No.2 : Multi-level tiling with cache write on position 1 """ assert len(sketches) == 3 # Sketch 0 @@ -92,11 +93,11 @@ def test_cpu_matmul_sketch(): sketches = generate_sketches(matmul_auto_scheduler_test, (8, 8, 512), "llvm") """ 2 rfactor sketches + 3 multi-level tiling sketches - 0 - 
Rfactor with factor position 0 - 1 - Rfactor with factor position 1 - 2 - Multi-level tiling - 3 - Multi-level tiling with cache write on position 0 - 4 - Multi-level tiling with cache write on position 1 + No.0 : Rfactor with factor position 0 + No.1 : Rfactor with factor position 1 + No.2 : Multi-level tiling + No.3 : Multi-level tiling with cache write on position 0 + No.4 : Multi-level tiling with cache write on position 1 """ assert len(sketches) == 5 # Sketch 0 @@ -116,15 +117,20 @@ def test_cpu_matmul_sketch(): assert_compute_at_condition(sketches[4].stages[2], "iter") assert sketches[3] != sketches[4] + sketches = generate_sketches(double_matmul_auto_scheduler_test, (512,), "llvm") + """ 3 multi-level tiling sketches for one matmul, so 3 * 3 = 9 sketches in total """ + assert len(sketches) == 9 + assert_is_tiled(sketches[8].stages[5]) + def test_cpu_conv2d_bn_relu_sketch(): sketches = generate_sketches( conv2d_nchw_bn_relu_auto_scheduler_test, (1, 56, 56, 512, 512, 3, 1, 1), "llvm" ) """ 3 multi-level tiling sketches - 0 - Conv2d multi-level tiling with fusion on position 0 - 1 - Conv2d multi-level tiling with fusion on position 1 - 2 - Conv2d multi-level tiling without fusion + No.0 : Conv2d multi-level tiling with fusion on position 0 + No.1 : Conv2d multi-level tiling with fusion on position 1 + No.2 : Conv2d multi-level tiling without fusion """ assert len(sketches) == 3 # Sketch 0 @@ -164,9 +170,9 @@ def test_cpu_max_pool2d_sketch(): def test_cpu_min_sketch(): sketches = generate_sketches(min_nm_auto_scheduler_test, (10, 1024), "llvm") """ 2 rfactor sketches + 1 default sketch - 0 - Rfactor with factor position 0 - 1 - Rfactor with factor position 1 - 2 - Default sketch + No.0 : Rfactor with factor position 0 + No.1 : Rfactor with factor position 1 + No.2 : Default sketch """ assert len(sketches) == 3 # Sketch 0 @@ -209,9 +215,9 @@ def test_cpu_conv2d_winograd_sketch(): conv2d_winograd_nhwc_auto_scheduler_test, (1, 28, 28, 128, 128, 3, 1, 1), "llvm" ) """ 3 multi-level tiling sketches - 0 - Bgemm multi-level tiling - 1 - Bgemm multi-level tiling with cache write on position 0 - 2 - Bgemm multi-level tiling with cache write on position 1 + No.0 : Bgemm multi-level tiling + No.1 : Bgemm multi-level tiling with cache write on position 0 + No.2 : Bgemm multi-level tiling with cache write on position 1 """ assert len(sketches) == 3 # Sketch 0 @@ -277,6 +283,12 @@ def test_cuda_matmul_sketch(): assert_compute_at_condition(sketches[1].stages[4], "iter") assert_is_tiled(sketches[1].stages[5]) + sketches = generate_sketches(double_matmul_auto_scheduler_test, (512,), "cuda") + """ 1 multi-level tiling sketch for one matmul, so 1 x 1 = 1 sketch in total """ + assert len(sketches) == 1 + assert_compute_at_condition(sketches[0].stages[5], "root") + assert_compute_at_condition(sketches[0].stages[6], "iter") + @tvm.testing.requires_cuda def test_cuda_conv2d_bn_relu_sketch(): diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py new file mode 100644 index 000000000000..74b37754c913 --- /dev/null +++ b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py @@ -0,0 +1,191 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +.. _auto-scheduler-conv-gpu: + +Auto-scheduling a convolution layer for GPU +=========================================== +**Author**: `Lianmin Zheng `_, \ + `Chengfan Jia `_ + + +Different from the existing :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any templates. +The auto-scheduler is template-free, so users only need to write the computation declaration without +any schedule commands or templates. +The auto-scheduler can automatically generate a large +search space and find a good schedule in the space. + +We use a convolution layer as an example in this tutorial. +""" + +import numpy as np +import tvm +from tvm import te, testing, auto_scheduler, topi +from tvm.topi.testing import conv2d_nchw_python + +###################################################################### +# Define the computation +# ^^^^^^^^^^^^^^^^^^^^^^ +# To begin with, let us define the computation of a convolution layer. +# The function should return the list of input/output tensors. +# From these tensors, the auto-scheduler can get the whole computational graph. + + +@auto_scheduler.register_workload +def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): + data = te.placeholder((N, CI, H, W), name="data") + kernel = te.placeholder((CO, CI, KH, KW), name="kernel") + bias = te.placeholder((1, CO, 1, 1), name="bias") + conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype="float32") + out = topi.nn.relu(conv + bias) + return [data, kernel, bias, out] + + +###################################################################### +# Create the search task +# ^^^^^^^^^^^^^^^^^^^^^^ +# We then create a search task for the last convolution layer in ResNet. + +target = tvm.target.Target("cuda") + +# the last layer in ResNet +N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1) +task = auto_scheduler.create_task(conv2d_layer, (N, H, W, CO, CI, KH, KW, strides, padding), target) + +# Inspect the computational graph +print(task.compute_dag) + +###################################################################### +# Next, we set parameters for the auto-scheduler. These parameters +# mainly specify how we do the measurement during the search and auto-tuning. +# +# * :code:`measure_ctx` launches a different process for measurement. This +# provides isolation. It can protect the master process from GPU crashes +# that happen during measurement and avoid other runtime conflicts. +# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. +# This can warm up the GPU, which is necessary to get accurate measurement results. +# Typically, we recommend a value > 300 ms. +# * :code:`num_measure_trials` is the number of measurement trials we can use during the search. +# We only make 10 trials in this tutorial for a fast demonstration.
In practice, 1000 is a +# good value for the search to converge. You can do more trials according to your time budget. +# * In addition, we use :code:`RecordToFile` to dump measurement records into a file `conv2d.json`. +# The measurement records can be used to query the history best, resume the search, +# and do more analyses later. +# * see :any:`auto_scheduler.TuningOptions`, +# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. + +measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300) +tune_option = auto_scheduler.TuningOptions( + num_measure_trials=10, + runner=measure_ctx.runner, + measure_callbacks=[auto_scheduler.RecordToFile("conv2d.json")], +) + +###################################################################### +# Run the search +# ^^^^^^^^^^^^^^ +# Now we get all inputs ready. Pretty simple, isn't it? +# We can kick off the search and let the auto-scheduler do its magic. +# After some measurement trials, it will return the best schedule it found. + +sch, args = auto_scheduler.auto_schedule(task, tuning_options=tune_option) + +###################################################################### +# We can lower the schedule to see the IR after auto-scheduling. +# The auto-scheduler correctly performs optimizations including multi-level tiling, +# cooperative fetching, unrolling and operator fusion. + +print(tvm.lower(sch, args, simple_mode=True)) + +###################################################################### +# Check correctness and evaluate performance +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# We build the binary and check its correctness and performance. + +func = tvm.build(sch, args, target) + +# check correctness +data_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32) +weight_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32) +bias_np = np.random.uniform(size=(1, CO, 1, 1)).astype(np.float32) +conv_np = conv2d_nchw_python(data_np, weight_np, strides, padding) +out_np = np.maximum(conv_np + bias_np, 0.0) + +ctx = tvm.gpu() +data_tvm = tvm.nd.array(data_np, ctx=ctx) +weight_tvm = tvm.nd.array(weight_np, ctx=ctx) +bias_tvm = tvm.nd.array(bias_np, ctx=ctx) +out_tvm = tvm.nd.empty(out_np.shape, ctx=ctx) +func(data_tvm, weight_tvm, bias_tvm, out_tvm) + +# Check results +tvm.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3) + +# Evaluate execution time +evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500) +print( + "Execution time of this operator: %.3f ms" + % (np.median(evaluator(data_tvm, weight_tvm, bias_tvm, out_tvm).results) * 1000) +) + +###################################################################### +# Using the record file +# ^^^^^^^^^^^^^^^^^^^^^ +# During the search, all measurement records are dumped into the record +# file "conv2d.json". The measurement records can be used to re-apply search results, +# resume the search, and perform other analyses. + +###################################################################### +# Here is an example where we load the best schedule from a file, +# print the equivalent python schedule API, and build the binary again. + +# Load the measurement record for the best schedule +inp, res = auto_scheduler.load_best("conv2d.json", task.workload_key) + +# Print equivalent python schedule API. This can be used for debugging and +# learning the behavior of the auto-scheduler. +print("Equivalent python schedule:") +print(task.compute_dag.print_python_code_from_state(inp.state)) + +# Rebuild the binary.
This shows how you can apply the best schedule from a +# log file without rerunning the search. +sch, args = task.compute_dag.apply_steps_from_state(inp.state) +func = tvm.build(sch, args, target) + +###################################################################### +# A more complicated example is to resume the search. +# In this case, we need to create the search policy and cost model by ourselves +# and resume the status of the search policy and cost model from the log file. +# In the example below we resume the status and do 5 more trials. + + +log_file = "conv2d.json" +cost_model = auto_scheduler.XGBModel() +cost_model.update_from_file(log_file) +search_policy = auto_scheduler.SketchPolicy( + task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)] +) +tune_option = auto_scheduler.TuningOptions( + num_measure_trials=5, + runner=measure_ctx.runner, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], +) +sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options=tune_option) + +# kill the measurement process +del measure_ctx diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py index 1a9af42510eb..e5f9d7e9a27d 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/auto_scheduler/tune_matmul_x86.py @@ -37,7 +37,7 @@ ###################################################################### # Define the computation # ^^^^^^^^^^^^^^^^^^^^^^ -# To begin with, we define the computation of a matmul with bias add. +# To begin with, let us define the computation of a matmul with bias add. # The function should return the list of input/output tensors. # From these tensors, the auto-scheduler can get the whole computational graph. @@ -59,9 +59,13 @@ def matmul_add(N, L, M, dtype): # Create the search task # ^^^^^^^^^^^^^^^^^^^^^^ # We then create a search task with N=L=M=128 and dtype="float32" +# If your machine supports AVX instructions, you can +# +# - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2 +# - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512 target = tvm.target.Target("llvm") -task = auto_scheduler.create_task(matmul_add, (128, 128, 128, "float32"), target) +task = tvm.auto_scheduler.create_task(matmul_add, (128, 128, 128, "float32"), target) # Inspect the computational graph print(task.compute_dag) @@ -69,13 +73,13 @@ def matmul_add(N, L, M, dtype): ###################################################################### # Next, we set parameters for the auto-scheduler. # -# * `num_measure_trials` is the number of measurement trials we can use during the search. +# * :code:`num_measure_trials` is the number of measurement trials we can use during the search. # We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a # good value for the search to converge. You can do more trials according to your time budget. -# * In addition, we use `RecordToFile` to dump measurement records into a file `matmul.json`. +# * In addition, we use :code:`RecordToFile` to dump measurement records into a file `matmul.json`. # The measurement records can be used to query the history best, resume the search, # and do more analyses later.
-# * see :any:`auto_schedule.TuningOptions`: for more parameters +# * see :any:`auto_scheduler.TuningOptions` for more parameters tune_option = auto_scheduler.TuningOptions( num_measure_trials=10, measure_callbacks=[auto_scheduler.RecordToFile("matmul.json")] @@ -93,25 +97,38 @@ def matmul_add(N, L, M, dtype): ###################################################################### # We can lower the schedule to see the IR after auto-scheduling. # The auto-scheduler correctly performs optimizations including multi-level tiling, -# parallelization, vectorization, unrolling and fusion. +# parallelization, vectorization, unrolling and operator fusion. print(tvm.lower(sch, args, simple_mode=True)) ###################################################################### -# Check correctness -# ^^^^^^^^^^^^^^^^^ -# We build the binary and check its correctness +# Check correctness and evaluate performance +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# We build the binary and check its correctness and performance. func = tvm.build(sch, args) a_np = np.random.uniform(size=(128, 128)).astype(np.float32) b_np = np.random.uniform(size=(128, 128)).astype(np.float32) c_np = np.random.uniform(size=(128, 128)).astype(np.float32) -d_np = a_np.dot(b_np) + c_np - -d_tvm = tvm.nd.empty(d_np.shape) -func(tvm.nd.array(a_np), tvm.nd.array(b_np), tvm.nd.array(c_np), d_tvm) +out_np = a_np.dot(b_np) + c_np + +ctx = tvm.cpu() +a_tvm = tvm.nd.array(a_np, ctx=ctx) +b_tvm = tvm.nd.array(b_np, ctx=ctx) +c_tvm = tvm.nd.array(c_np, ctx=ctx) +out_tvm = tvm.nd.empty(out_np.shape, ctx=ctx) +func(a_tvm, b_tvm, c_tvm, out_tvm) + +# Check results +tvm.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3) + +# Evaluate execution time. +evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500) +print( + "Execution time of this operator: %.3f ms" + % (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000) +) -tvm.testing.assert_allclose(d_np, d_tvm.asnumpy(), rtol=1e-3) ###################################################################### # Using the record file @@ -129,6 +146,7 @@ def matmul_add(N, L, M, dtype): # Print equivalent python schedule API. This can be used for debugging and # learning the behavior of the auto-scheduler. +print("Equivalent python schedule:") print(task.compute_dag.print_python_code_from_state(inp.state)) # Rebuild the binary. This shows how you can apply the best schedule from a @@ -161,13 +179,16 @@ def resume_search(task, log_file): # .. note:: # We cannot run the line above because of the conflict between # python's multiprocessing and tvm's thread pool. -# After running a tvm generated binary (L112), the python's multiprocessing -# library will hang forever. -# You have to make sure that you don't run any tvm generated binaries before -# calling ansor's search. To run the L156 above, you should comment out L112-114. +# After running a tvm generated binary, Python's multiprocessing library +# will hang forever. You have to make sure that you don't run any tvm +# generated binaries before calling the auto-scheduler's search. +# To run the function above, you should comment out all code in +# the "Check correctness and evaluate performance" section. # # You should be careful about this problem in your applications. # There are other workarounds for this problem. # For example, you can start a new thread/process (with the builtin python library # threading or multiprocessing) and run the tvm binaries in the new thread/process. # This provides an isolation and avoids the conflict in the main thread/process. +# You can also use :any:`auto_scheduler.LocalRPCMeasureContext` for auto-scheduler, +# as shown in the GPU tutorial (:ref:`auto-scheduler-conv-gpu`).
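The process-isolation workaround described in the note above can be sketched as follows (illustrative only, not part of the tutorial; a trivial vector-add kernel stands in for the tuned binary):

import multiprocessing

import numpy as np
import tvm
from tvm import te


def _run_tvm_binary_in_child():
    # Build and execute a TVM-generated binary entirely inside the child
    # process, so the parent process never runs generated code itself.
    n = 128
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    func = tvm.build(te.create_schedule(B.op), [A, B], "llvm")
    a = tvm.nd.array(np.random.uniform(size=n).astype("float32"))
    b = tvm.nd.empty((n,), "float32")
    func(a, b)


if __name__ == "__main__":
    proc = multiprocessing.Process(target=_run_tvm_binary_in_child)
    proc.start()
    proc.join()
    # The parent has not executed any TVM-generated binary, so it is still
    # safe to start the auto-scheduler search from here.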
diff --git a/tutorials/frontend/from_darknet.py b/tutorials/frontend/from_darknet.py index bbfb410764a5..4cbafaf3c14a 100644 --- a/tutorials/frontend/from_darknet.py +++ b/tutorials/frontend/from_darknet.py @@ -195,6 +195,7 @@ names = [x.strip() for x in content] +tvm.relay.testing.yolo_detection.show_detections(img, dets, thresh, names, last_layer.classes) tvm.relay.testing.yolo_detection.draw_detections( font_path, img, dets, thresh, names, last_layer.classes ) diff --git a/tutorials/get_started/tensor_expr_get_started.py b/tutorials/get_started/tensor_expr_get_started.py index 76e02625ef04..0ca2243c7c3b 100644 --- a/tutorials/get_started/tensor_expr_get_started.py +++ b/tutorials/get_started/tensor_expr_get_started.py @@ -30,6 +30,7 @@ from __future__ import absolute_import, print_function import tvm +import tvm.testing from tvm import te import numpy as np diff --git a/tutorials/language/reduction.py b/tutorials/language/reduction.py index ecefc28776b7..cffa10e6cbb2 100644 --- a/tutorials/language/reduction.py +++ b/tutorials/language/reduction.py @@ -28,6 +28,7 @@ from __future__ import absolute_import, print_function import tvm +import tvm.testing from tvm import te import numpy as np diff --git a/tutorials/language/scan.py b/tutorials/language/scan.py index fdb6ec99eae6..5f513208d56d 100644 --- a/tutorials/language/scan.py +++ b/tutorials/language/scan.py @@ -25,6 +25,7 @@ from __future__ import absolute_import, print_function import tvm +import tvm.testing from tvm import te import numpy as np diff --git a/tutorials/topi/intro_topi.py b/tutorials/topi/intro_topi.py index c9812fff6d2c..2459cf249506 100644 --- a/tutorials/topi/intro_topi.py +++ b/tutorials/topi/intro_topi.py @@ -26,6 +26,7 @@ from __future__ import absolute_import, print_function import tvm +import tvm.testing from tvm import te from tvm import topi import numpy as np diff --git a/web/package.json b/web/package.json index 25fca5088e78..1f52a0781e60 100644 --- a/web/package.json +++ b/web/package.json @@ -20,7 +20,7 @@ "@types/node": "^12.12.37", "@typescript-eslint/eslint-plugin": "^2.29.0", "@typescript-eslint/parser": "^2.29.0", - "@webgpu/types": "^0.0.24", + "@webgpu/types": "^0.0.31", "eslint": "^6.8.0", "jest": "^26.0.1", "rollup": "^2.7.6", diff --git a/web/src/rpc_server.ts b/web/src/rpc_server.ts index 542558aa157f..c63dcf3a9ae3 100644 --- a/web/src/rpc_server.ts +++ b/web/src/rpc_server.ts @@ -252,8 +252,8 @@ export class RPCServer { this.logger ); try { - const gpuDevice: GPUDevice | undefined = await detectGPUDevice(); - if (gpuDevice !== undefined) { + const gpuDevice: GPUDevice | undefined | null = await detectGPUDevice(); + if (gpuDevice !== undefined && gpuDevice !== null) { const label = gpuDevice.label?.toString() || "WebGPU"; this.log("Initialize GPU device: " + label); inst.initWebGPU(gpuDevice); diff --git a/web/src/webgpu.ts b/web/src/webgpu.ts index 640f7b4a7163..f12837f421f8 100644 --- a/web/src/webgpu.ts +++ b/web/src/webgpu.ts @@ -27,10 +27,10 @@ export type GPUPointer = number; /** * DetectGPU device in the environment.
*/ -export async function detectGPUDevice(): Promise<GPUDevice | undefined> { +export async function detectGPUDevice(): Promise<GPUDevice | undefined | null> { if (typeof navigator !== "undefined" && navigator.gpu !== undefined) { const adapter = await navigator.gpu.requestAdapter(); - return await adapter.requestDevice(); + return await adapter?.requestDevice(); } else { return undefined; } @@ -235,11 +235,14 @@ export class WebGPUContext { nbytes: number ): void { // Perhaps it would be more useful to use a staging buffer? - const [gpuTemp, cpuTemp] = this.device.createBufferMapped({ + const gpuTemp = this.device.createBuffer({ + mappedAtCreation: true, size: nbytes, - usage: GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC, + usage: GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC }); + const cpuTemp = gpuTemp.getMappedRange(); + const viewU8 = new Uint8Array(cpuTemp); viewU8.set(this.memory.loadRawBytes(from, nbytes)); gpuTemp.unmap(); @@ -281,8 +284,9 @@ export class WebGPUContext { this.device.defaultQueue.submit([copyCommands]); this.numPendingReads += 1; - const readEvent = gpuTemp.mapReadAsync().then((data: ArrayBuffer) => { - this.memory.storeRawBytes(to, new Uint8Array(data)); + + const readEvent = gpuTemp.mapAsync(GPUMapMode.READ).then((data: unknown) => { + this.memory.storeRawBytes(to, new Uint8Array(data as ArrayBuffer)); this.numPendingReads -= 1; gpuTemp.destroy(); });