From e443752d7d9e2a2cfac54bb4c809f8aee5a5f76e Mon Sep 17 00:00:00 2001
From: Siyuan Feng <hzfengsy@sjtu.edu.cn>
Date: Mon, 17 Feb 2025 14:15:16 +0800
Subject: [PATCH 01/10] Clean up TE schedule C++ code

---
 include/tvm/driver/driver_api.h               |   40 -
 include/tvm/relax/analysis.h                  |    1 +
 include/tvm/relax/op_attr_types.h             |    1 -
 include/tvm/te/autodiff.h                     |   96 --
 include/tvm/te/operation.h                    |  236 +---
 include/tvm/te/schedule.h                     | 1010 --------------
 include/tvm/te/schedule_pass.h                |  112 --
 include/tvm/te/tensor_intrin.h                |  145 --
 include/tvm/topi/cuda/dense.h                 |  154 --
 include/tvm/topi/cuda/injective.h             |   83 --
 include/tvm/topi/cuda/pooling.h               |  187 ---
 include/tvm/topi/cuda/reduction.h             |  199 ---
 include/tvm/topi/cuda/softmax.h               |  103 --
 include/tvm/topi/generic/default.h            |   83 --
 include/tvm/topi/generic/extern.h             |   69 -
 include/tvm/topi/generic/injective.h          |   77 -
 include/tvm/topi/rocm/dense.h                 |   99 --
 include/tvm/topi/rocm/injective.h             |   67 -
 include/tvm/topi/rocm/pooling.h               |   68 -
 include/tvm/topi/rocm/softmax.h               |   55 -
 include/tvm/topi/x86/bnn.h                    |  131 --
 include/tvm/topi/x86/default.h                |  105 --
 include/tvm/topi/x86/injective.h              |   85 --
 python/tvm/te/__init__.py                     |    2 +-
 python/tvm/te/operation.py                    |   24 +-
 python/tvm/te/tensor.py                       |   10 -
 src/driver/driver_api.cc                      |   80 --
 src/relax/analysis/layout_transformation.cc   |    1 +
 src/target/codegen.cc                         |   11 -
 src/target/llvm/codegen_llvm.cc               |    4 +-
 src/te/autodiff/ad_simplify.cc                | 1239 -----------------
 src/te/autodiff/ad_utils.cc                   |  197 ---
 src/te/autodiff/ad_utils.h                    |  135 --
 src/te/autodiff/adjoint.cc                    |  157 ---
 src/te/autodiff/jacobian.cc                   |  365 -----
 src/te/operation/compute_op.cc                |  375 -----
 src/te/operation/compute_op.h                 |  104 --
 src/te/operation/create_primfunc.cc           |    3 +-
 src/te/operation/cross_thread_reduction.cc    |  237 ----
 src/te/operation/extern_op.cc                 |   84 --
 src/te/operation/graph.cc                     |   90 ++
 src/te/operation/graph.h                      |   62 +
 src/te/operation/hybrid_op.cc                 |  486 -------
 src/te/operation/hybrid_op.h                  |   94 --
 src/te/operation/op_utils.cc                  |  290 ----
 src/te/operation/op_utils.h                   |   96 --
 src/te/operation/placeholder_op.cc            |   27 -
 src/te/operation/scan_op.cc                   |  146 --
 src/te/operation/tensor_compute_op.cc         |  259 ----
 src/te/operation/tensorize.cc                 |  496 -------
 src/te/schedule/auto_inline_elem_wise.cc      |  125 --
 src/te/schedule/bound.cc                      |  262 ----
 src/te/schedule/graph.cc                      |  418 ------
 src/te/schedule/graph.h                       |  130 --
 src/te/schedule/message_passing.cc            |  744 ----------
 src/te/schedule/message_passing.h             |  121 --
 src/te/schedule/operation_inline.cc           |   90 --
 src/te/schedule/operation_inline.h            |   48 -
 src/te/schedule/schedule_dataflow_rewrite.cc  |  978 -------------
 src/te/schedule/schedule_lang.cc              | 1078 --------------
 src/te/schedule/schedule_ops.cc               |  437 ------
 .../schedule/schedule_postproc_to_primfunc.cc |  447 ------
 src/te/schedule/verify_compact_buffer.cc      |   63 -
 src/te/tensor.cc                              |   61 -
 src/tir/transforms/storage_flatten.cc         |    1 +
 src/topi/schedule.cc                          |  319 -----
 .../rocm/reduction.h => src/topi/utils.cc     |   38 +-
 tests/cpp/build_module_test.cc                |   67 -
 68 files changed, 179 insertions(+), 13528 deletions(-)
 delete mode 100644 include/tvm/te/autodiff.h
 delete mode 100644 include/tvm/te/schedule.h
 delete mode 100644 include/tvm/te/schedule_pass.h
 delete mode 100644 include/tvm/te/tensor_intrin.h
 delete mode 100644 include/tvm/topi/cuda/dense.h
 delete mode 100644 include/tvm/topi/cuda/injective.h
 delete mode 100644 include/tvm/topi/cuda/pooling.h
 delete mode 100644 include/tvm/topi/cuda/reduction.h
 delete mode 100644 include/tvm/topi/cuda/softmax.h
 delete mode 100644 include/tvm/topi/generic/default.h
 delete mode 100644 include/tvm/topi/generic/extern.h
 delete mode 100644 include/tvm/topi/generic/injective.h
 delete mode 100644 include/tvm/topi/rocm/dense.h
 delete mode 100644 include/tvm/topi/rocm/injective.h
 delete mode 100644 include/tvm/topi/rocm/pooling.h
 delete mode 100644 include/tvm/topi/rocm/softmax.h
 delete mode 100644 include/tvm/topi/x86/bnn.h
 delete mode 100644 include/tvm/topi/x86/default.h
 delete mode 100644 include/tvm/topi/x86/injective.h
 delete mode 100644 src/te/autodiff/ad_simplify.cc
 delete mode 100644 src/te/autodiff/ad_utils.cc
 delete mode 100644 src/te/autodiff/ad_utils.h
 delete mode 100644 src/te/autodiff/adjoint.cc
 delete mode 100644 src/te/autodiff/jacobian.cc
 delete mode 100644 src/te/operation/compute_op.h
 delete mode 100644 src/te/operation/cross_thread_reduction.cc
 create mode 100644 src/te/operation/graph.cc
 create mode 100644 src/te/operation/graph.h
 delete mode 100644 src/te/operation/hybrid_op.cc
 delete mode 100644 src/te/operation/hybrid_op.h
 delete mode 100644 src/te/operation/op_utils.cc
 delete mode 100644 src/te/operation/op_utils.h
 delete mode 100644 src/te/operation/tensor_compute_op.cc
 delete mode 100644 src/te/operation/tensorize.cc
 delete mode 100644 src/te/schedule/auto_inline_elem_wise.cc
 delete mode 100644 src/te/schedule/bound.cc
 delete mode 100644 src/te/schedule/graph.cc
 delete mode 100644 src/te/schedule/graph.h
 delete mode 100644 src/te/schedule/message_passing.cc
 delete mode 100644 src/te/schedule/message_passing.h
 delete mode 100644 src/te/schedule/operation_inline.cc
 delete mode 100644 src/te/schedule/operation_inline.h
 delete mode 100644 src/te/schedule/schedule_dataflow_rewrite.cc
 delete mode 100644 src/te/schedule/schedule_lang.cc
 delete mode 100644 src/te/schedule/schedule_ops.cc
 delete mode 100644 src/te/schedule/schedule_postproc_to_primfunc.cc
 delete mode 100644 src/te/schedule/verify_compact_buffer.cc
 delete mode 100644 src/topi/schedule.cc
 rename include/tvm/topi/rocm/reduction.h => src/topi/utils.cc (54%)
 delete mode 100644 tests/cpp/build_module_test.cc

diff --git a/include/tvm/driver/driver_api.h b/include/tvm/driver/driver_api.h
index fffcab49667c..eaf737088bf4 100644
--- a/include/tvm/driver/driver_api.h
+++ b/include/tvm/driver/driver_api.h
@@ -35,14 +35,9 @@
 #include <tvm/runtime/packed_func.h>
 #include <tvm/support/with.h>
 #include <tvm/target/target.h>
-#include <tvm/te/schedule_pass.h>
 #include <tvm/tir/function.h>
 
 #include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
 
 namespace tvm {
 using tvm::transform::Pass;
@@ -105,41 +100,6 @@ TVM_DLL IRModule LowerPrimFunc(tvm::tir::PrimFunc func, const std::string& name,
  * \return The result module.
  */
 
-TVM_DLL IRModule LowerSchedule(te::Schedule sch, const Array<te::Tensor>& args,
-                               const std::string& name,
-                               const std::unordered_map<te::Tensor, tir::Buffer>& binds,
-                               GlobalVarSupply global_var_supply, bool simple_mode = false);
-
-/*!
- * \brief Build an IRModule given a TE schedule, args and binds. This function also applies
- * the lowering passes defined in CreatePassList.
- * \param sch The TE schedule to lower.
- * \param args The arguments to the function (Array of Tensor, Buffer and Vars)
- * \param name The name of the lowered function.
- * \param binds Buffer assignments.
- * \param global_var_supply The GlobalVarSupply to be used in the module.
- * \param simple_mode Disables the loop partition pass. Defaults to false.
- * \return The result module.
- */
-TVM_DLL IRModule LowerSchedule(te::Schedule sch, const Array<ObjectRef>& args,
-                               const std::string& name,
-                               const std::unordered_map<te::Tensor, tir::Buffer>& binds,
-                               GlobalVarSupply global_var_supply, bool simple_mode = false);
-
-/*!
- * \brief Create an IRModule out of a TE Schedule. It does not apply lowering passes. If you want
- * to apply lowering passes as well, use LowerSchedule.
- * \param sch The schedule
- * \param args The arguments to the function.
- * \param name The name of the lowered function.
- * \param binds Buffer assignments.
- * \param global_var_supply The GlobalVarSupply to be used in the module and when creating
- * GlobalVars.
- * \return The result module.
- */
-IRModule ScheduleToModule(te::Schedule sch, const Array<ObjectRef>& args, const std::string& name,
-                          const std::unordered_map<te::Tensor, tir::Buffer>& binds,
-                          GlobalVarSupply global_var_supply);
 /*!
  * \brief Build a device and host module for a specific target from an IRModule.
  * \param funcs The functions to be built.
diff --git a/include/tvm/relax/analysis.h b/include/tvm/relax/analysis.h
index 2de2f4fd36d5..b658758e3c8f 100644
--- a/include/tvm/relax/analysis.h
+++ b/include/tvm/relax/analysis.h
@@ -31,6 +31,7 @@
 #include <tvm/relax/op_attr_types.h>
 #include <tvm/relax/struct_info.h>
 #include <tvm/tir/function.h>
+#include <tvm/tir/index_map.h>
 
 #include <functional>
 #include <utility>
diff --git a/include/tvm/relax/op_attr_types.h b/include/tvm/relax/op_attr_types.h
index 434a89a28871..e5dd1aaba133 100644
--- a/include/tvm/relax/op_attr_types.h
+++ b/include/tvm/relax/op_attr_types.h
@@ -26,7 +26,6 @@
 
 #include <tvm/relax/expr.h>
 #include <tvm/relax/struct_info.h>
-#include <tvm/te/schedule.h>
 #include <tvm/te/tensor.h>
 
 namespace tvm {
diff --git a/include/tvm/te/autodiff.h b/include/tvm/te/autodiff.h
deleted file mode 100644
index e2d379969c65..000000000000
--- a/include/tvm/te/autodiff.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file tvm/te/autodiff.h
- * \brief Automatic differentiation of tensor expressions.
- */
-
-#ifndef TVM_TE_AUTODIFF_H_
-#define TVM_TE_AUTODIFF_H_
-
-#include <tvm/runtime/object.h>
-#include <tvm/tir/expr.h>
-
-#include "tensor.h"
-
-namespace tvm {
-/*! \brief Tensor expression language DSL. */
-namespace te {
-
-/*!
- * \brief Take the derivative of the expression with respect to the given variable.
- * \param expr The expression to differentiate.
- * \param var The variable to differentiate with respect to.
- * \return The expression for the derivative.
- */
-PrimExpr Derivative(const PrimExpr& expr, const Var& var);
-
-/*!
- * \brief Get the tensor representing the Jacobian of the output with respect to the input.
- *
- *  Note that if \p output depends on \p input indirectly (by using some other tensor
- *  depending on \p input), this dependency won't contribute to the resulting Jacobian.
- *  For such cases use the function ::Gradient.
- *
- * \param output The tensor to differentiate.
- * \param input The input tensor, which \p output should directly use.
- * \return The tensor representing the Jacobian of shape `output.shape + input.shape`.
- */
-Tensor Jacobian(const Tensor& output, const Tensor& input);
-
-/*!
- * \brief The building block for reverse-mode AD.
- *
- *  Differentiate \p output wrt \p input and multiply the result by \p head on the left using tensor
- *  dot product. \p input must be an immediate dependency of \p output (must be called from within
- *  the body of \p output). That is, the function will compute one summand of the adjoint for \p
- * input given the adjoint for \p output (which is called \p head here).
- *
- * \param output The tensor to differentiate.
- * \param input The input tensor, which \p output should directly use.
- * \param head The adjoint of \p output. Must be of shape `prefix + output.shape`
- * \return The tensor of shape `prefix + input.shape`
- *         representing the partial adjoint of \p input wrt one of its consumers (output)
- */
-Tensor VectorJacobianProduct(const Tensor& output, const Tensor& input, const Tensor& head);
-
-/*!
- * \brief Perform reverse mode automatic differentiation.
- *
- *  Each item of the `result` field of the result is an adjoint for the corresponding item of
- *  \p inputs, i.e. \p head multiplied by the Jacobian of \p output with respect to the
- *  corresponding item of \p inputs.
- *
- * \param output The tensor to differentiate.
- * \param inputs The array of input tensors. When the array is empty, will perform differentiation
- *               wrt all tensors the output depends on.
- * \param head The adjoint of the output, in other words, some tensor, by which the Jacobians
- *             will be multiplied (using tensordot axes=`output.shape`).
- *             Its shape must be of the form `prefix + output.shape`. If the null pointer is
- * provided, the identity tensor of shape `output.shape + output.shape` will be used. \return An
- * array of adjoints corresponding to \p inputs.
- */
-TVM_DLL Array<Tensor> Gradient(const Tensor& output, const Array<Tensor>& inputs,
-                               const Tensor& head = Tensor());
-
-}  // namespace te
-}  // namespace tvm
-
-#endif  // TVM_TE_AUTODIFF_H_
diff --git a/include/tvm/te/operation.h b/include/tvm/te/operation.h
index f5753afa560f..4b3ce6991871 100644
--- a/include/tvm/te/operation.h
+++ b/include/tvm/te/operation.h
@@ -25,7 +25,6 @@
 #define TVM_TE_OPERATION_H_
 
 #include <tvm/arith/analyzer.h>
-#include <tvm/te/schedule.h>
 #include <tvm/te/tensor.h>
 #include <tvm/tir/buffer.h>
 #include <tvm/tir/expr.h>
@@ -65,11 +64,6 @@ class TVM_DLL OperationNode : public Object {
   virtual ~OperationNode() {}
   /*! \return number of outputs */
   virtual int num_outputs() const = 0;
-  /*!
-   * \return The list of iteration variable at root
-   * \note root_iter_vars decides the shape of the outputs.
-   */
-  virtual Array<IterVar> root_iter_vars() const = 0;
   /*!
    * \brief Get data type. i-th output tensor.
    * \param i The output index.
@@ -87,59 +81,6 @@ class TVM_DLL OperationNode : public Object {
    * \return List of input tensors.
    */
   virtual Array<Tensor> InputTensors() const = 0;
-  /*!
-   * \brief Replace the input of the operation by pattern specified by rmap.
-   *
-   * \param self The reference to self.
-   * \param rmap The replacement map.
-   * \return self if nothing is replaced, otherwise return replaced op.
-   */
-  virtual Operation ReplaceInputs(const Operation& self,
-                                  const std::unordered_map<Tensor, Tensor>& rmap) const = 0;
-  /*!
-   * \brief Propagate the bounds to inputs
-   * \param self The reference to self.
-   * \param analyzer The analyzer to be used in the function.
-   * \param dom_map the domain map of Variables(corresponds to root_iter_vars)
-   * \param out_dom_map The output domain.
-   *  The function is only asked to fill the bounds for Tensors that
-   *  is already in the out_dom_map
-   */
-  virtual void PropBoundToInputs(const Operation& self, arith::Analyzer* analyzer,
-                                 const std::unordered_map<const VarNode*, IntSet>& dom_map,
-                                 std::unordered_map<Tensor, TensorDom>* out_dom_map) const = 0;
-  /*!
-   * \brief Gather the bound from output tensor.
-   *  Set the range of each root_iter_vars in the op to out_dom_map
-   *
-   * \param self The reference to self.
-   * \param tensor_dom Domain map of Tensor->access set of each dimension.
-   * \param out_dom_map The output domain map of each IterVar to be setted.
-   */
-  virtual void GatherBound(const Operation& self,
-                           const std::unordered_map<Tensor, TensorDom>& tensor_dom,
-                           std::unordered_map<IterVar, Range>* out_dom_map) const = 0;
-  /*!
-   * \brief Build the Realize statement that realizes
-   *   the op's output tensors.
-   * \param stage the op's stage.
-   * \param realize_map The realization domain map of the operators.
-   * \param body The body that is going to get
-   * \param storage_scope The storage scope associated with this realization
-   * \return A realization statement that wraps body.
-   */
-  virtual Stmt BuildRealize(const Stage& stage,
-                            const std::unordered_map<IterVar, Range>& realize_map, const Stmt& body,
-                            String storage_scope = "") const = 0;
-  /*!
-   * \brief Build the statement that provide the output tensors.
-   * \param stage The schedule stage of the op.
-   * \param dom_map The domain map of all iteration domains.
-   * \param debug_keep_trivial_loop Whether keep trivial loops with extent of 1
-   * \return A statement that add production and wraps consumer.
-   */
-  virtual Stmt BuildProvide(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                            bool debug_keep_trivial_loop) const = 0;
 
   static constexpr const char* _type_key = "Operation";
 
@@ -157,21 +98,9 @@ class PlaceholderOpNode : public OperationNode {
   DataType dtype;
   // override behavior.
   int num_outputs() const final;
-  Array<IterVar> root_iter_vars() const final;
   DataType output_dtype(size_t i) const final;
   Array<PrimExpr> output_shape(size_t i) const final;
   Array<Tensor> InputTensors() const final;
-  Operation ReplaceInputs(const Operation& self,
-                          const std::unordered_map<Tensor, Tensor>& rmap) const final;
-  void PropBoundToInputs(const Operation& self, arith::Analyzer* analyzer,
-                         const std::unordered_map<const VarNode*, IntSet>& dom_map,
-                         std::unordered_map<Tensor, TensorDom>* out_dom_map) const final;
-  void GatherBound(const Operation& self, const std::unordered_map<Tensor, TensorDom>& tensor_dom,
-                   std::unordered_map<IterVar, Range>* out_dom_map) const final;
-  Stmt BuildRealize(const Stage& stage, const std::unordered_map<IterVar, Range>& realize_map,
-                    const Stmt& body, String storage_scope = "") const final;
-  Stmt BuildProvide(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                    bool debug_keep_trivial_loop) const final;
 
   void VisitAttrs(AttrVisitor* v) {
     v->Visit("name", &name);
@@ -198,8 +127,7 @@ class PlaceholderOp : public Operation {
 
 /*!
  * \brief A Compute op that compute a tensor on certain domain.
- * This is the base class for ComputeOp (operating on a scalar at a time) and
- * TensorComputeOp (operating on a TensorSlice at a time)
+ * This is the base class for ComputeOp (operating on a scalar at a time)
  */
 class TVM_DLL BaseComputeOpNode : public OperationNode {
  public:
@@ -208,13 +136,7 @@ class TVM_DLL BaseComputeOpNode : public OperationNode {
   /*! \brief IterVar on each reduction axis, if the body is a Reduce */
   Array<IterVar> reduce_axis;
   // override functions
-  Array<IterVar> root_iter_vars() const final;
   Array<PrimExpr> output_shape(size_t idx) const final;
-  void GatherBound(const Operation& self, const std::unordered_map<Tensor, TensorDom>& tensor_dom,
-                   std::unordered_map<IterVar, Range>* out_dom_map) const final;
-  Stmt BuildRealize(const Stage& stage, const std::unordered_map<IterVar, Range>& realize_map,
-                    const Stmt& body, String storage_scope = "") const final;
-  virtual size_t num_schedulable_dims() const = 0;
 
   static constexpr const char* _type_key = "BaseComputeOp";
   TVM_DECLARE_BASE_OBJECT_INFO(BaseComputeOpNode, OperationNode);
@@ -233,14 +155,6 @@ class TVM_DLL ComputeOpNode : public BaseComputeOpNode {
   int num_outputs() const final;
   DataType output_dtype(size_t i) const final;
   Array<Tensor> InputTensors() const final;
-  Operation ReplaceInputs(const Operation& self,
-                          const std::unordered_map<Tensor, Tensor>& rmap) const final;
-  void PropBoundToInputs(const Operation& self, arith::Analyzer* analyzer,
-                         const std::unordered_map<const VarNode*, IntSet>& dom_map,
-                         std::unordered_map<Tensor, TensorDom>* out_dom_map) const final;
-  Stmt BuildProvide(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                    bool debug_keep_trivial_loop) const final;
-  size_t num_schedulable_dims() const final;
 
   void VisitAttrs(AttrVisitor* v) {
     v->Visit("name", &name);
@@ -268,66 +182,6 @@ class ComputeOp : public Operation {
   TVM_DEFINE_OBJECT_REF_COW_METHOD(ComputeOpNode);
 };
 
-/*!
- * \brief A TenorCompute op that compute a tensor with an tensor intrinsic.
- */
-class TensorComputeOpNode : public BaseComputeOpNode {
- public:
-  /*! \brief number of axes that can be scheduled */
-  int schedulable_ndim;
-  /*! \brief TensorIntrin used to compute */
-  TensorIntrin intrin;
-  /*! \brief input tensors of intrin */
-  Array<Tensor> inputs;
-  /*! \brief region of input tensors */
-  Array<Region> input_regions;
-  /*! \brief scalar expression inputs */
-  Array<PrimExpr> scalar_inputs;
-  /*! \brief constructor */
-  TensorComputeOpNode() {}
-  // override functions
-  int num_outputs() const final;
-  DataType output_dtype(size_t i) const final;
-  Array<Tensor> InputTensors() const final;
-  Operation ReplaceInputs(const Operation& self,
-                          const std::unordered_map<Tensor, Tensor>& rmap) const final;
-  void PropBoundToInputs(const Operation& self, arith::Analyzer* analyzer,
-                         const std::unordered_map<const VarNode*, IntSet>& dom_map,
-                         std::unordered_map<Tensor, TensorDom>* out_dom_map) const final;
-  Stmt BuildProvide(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                    bool debug_keep_trivial_loop) const final;
-  size_t num_schedulable_dims() const final;
-
-  void VisitAttrs(AttrVisitor* v) {
-    v->Visit("name", &name);
-    v->Visit("tag", &tag);
-    v->Visit("axis", &axis);
-    v->Visit("reduce_axis", &reduce_axis);
-    v->Visit("schedulable_ndim", &schedulable_ndim);
-    v->Visit("intrin", &intrin);
-    v->Visit("inputs", &inputs);
-    v->Visit("input_regions", &input_regions);
-    v->Visit("scalar_inputs", &scalar_inputs);
-  }
-
-  static constexpr const char* _type_key = "TensorComputeOp";
-  TVM_DECLARE_FINAL_OBJECT_INFO(TensorComputeOpNode, BaseComputeOpNode);
-};
-
-/*!
- * \brief Managed reference to TensorComputeOpNode
- * \sa TensorComputeOpNode
- */
-class TensorComputeOp : public Operation {
- public:
-  TVM_DLL TensorComputeOp(std::string name, std::string tag, Array<IterVar> axis,
-                          Array<IterVar> reduce_axis, int schedulable_ndim, TensorIntrin intrin,
-                          Array<Tensor> tensors, Array<Region> regions,
-                          Array<PrimExpr> scalar_inputs);
-
-  TVM_DEFINE_OBJECT_REF_METHODS(TensorComputeOp, Operation, TensorComputeOpNode);
-};
-
 /*!
  * \brief Symbolic scan.
  */
@@ -360,21 +214,9 @@ class ScanOpNode : public OperationNode {
   ScanOpNode() {}
   // override behavior.
   int num_outputs() const final;
-  Array<IterVar> root_iter_vars() const final;
   DataType output_dtype(size_t i) const final;
   Array<PrimExpr> output_shape(size_t i) const final;
   Array<Tensor> InputTensors() const final;
-  Operation ReplaceInputs(const Operation& self,
-                          const std::unordered_map<Tensor, Tensor>& rmap) const final;
-  void PropBoundToInputs(const Operation& self, arith::Analyzer* analyzer,
-                         const std::unordered_map<const VarNode*, IntSet>& dom_map,
-                         std::unordered_map<Tensor, TensorDom>* out_dom_map) const final;
-  void GatherBound(const Operation& self, const std::unordered_map<Tensor, TensorDom>& tensor_dom,
-                   std::unordered_map<IterVar, Range>* out_dom_map) const final;
-  Stmt BuildRealize(const Stage& stage, const std::unordered_map<IterVar, Range>& realize_map,
-                    const Stmt& body, String storage_scope = "") const final;
-  Stmt BuildProvide(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                    bool debug_keep_trivial_loop) const final;
 
   void VisitAttrs(AttrVisitor* v) {
     v->Visit("name", &name);
@@ -423,21 +265,9 @@ class ExternOpNode : public OperationNode {
   ExternOpNode() {}
   // override functions
   int num_outputs() const final;
-  Array<IterVar> root_iter_vars() const final;
   DataType output_dtype(size_t i) const final;
   Array<PrimExpr> output_shape(size_t i) const final;
   Array<Tensor> InputTensors() const final;
-  Operation ReplaceInputs(const Operation& self,
-                          const std::unordered_map<Tensor, Tensor>& rmap) const final;
-  void PropBoundToInputs(const Operation& self, arith::Analyzer* analyzer,
-                         const std::unordered_map<const VarNode*, IntSet>& dom_map,
-                         std::unordered_map<Tensor, TensorDom>* out_dom_map) const final;
-  void GatherBound(const Operation& self, const std::unordered_map<Tensor, TensorDom>& tensor_dom,
-                   std::unordered_map<IterVar, Range>* out_dom_map) const final;
-  Stmt BuildRealize(const Stage& stage, const std::unordered_map<IterVar, Range>& realize_map,
-                    const Stmt& body, String storage_scope = "") const final;
-  Stmt BuildProvide(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                    bool debug_keep_trivial_loop) const final;
 
   void VisitAttrs(AttrVisitor* v) {
     v->Visit("name", &name);
@@ -466,70 +296,6 @@ class ExternOp : public Operation {
   TVM_DEFINE_OBJECT_REF_METHODS(ExternOp, Operation, ExternOpNode);
 };
 
-/*!
- * \brief A computation operator that generated by hybrid script.
- */
-class HybridOpNode : public OperationNode {
- public:
-  /*! \brief The input tensors */
-  Array<Tensor> inputs;
-  /*! \brief Symbolic placeholder representation of outputs */
-  Array<Tensor> outputs;
-  /*! \brief The axis of iterations */
-  Array<IterVar> axis;
-  /*! \brief the statement that generates the computation. This is
-   * slightly different from the body in ExternOpNode. All the output
-   * tensors keep its own name specified by users in the script.
-   * However, when compilation, these tensors will be placed by those
-   * actual output tensors. */
-  Stmt body;
-
-  /*! \brief constructor */
-  HybridOpNode() {}
-  // override functions
-  int num_outputs() const final;
-  Array<IterVar> root_iter_vars() const final;
-  DataType output_dtype(size_t i) const final;
-  Array<PrimExpr> output_shape(size_t i) const final;
-  Array<Tensor> InputTensors() const final;
-  Operation ReplaceInputs(const Operation& self,
-                          const std::unordered_map<Tensor, Tensor>& rmap) const final;
-  void PropBoundToInputs(const Operation& self, arith::Analyzer* analyzer,
-                         const std::unordered_map<const VarNode*, IntSet>& dom_map,
-                         std::unordered_map<Tensor, TensorDom>* out_dom_map) const final;
-  void GatherBound(const Operation& self, const std::unordered_map<Tensor, TensorDom>& tensor_dom,
-                   std::unordered_map<IterVar, Range>* out_dom_map) const final;
-  Stmt BuildRealize(const Stage& stage, const std::unordered_map<IterVar, Range>& realize_map,
-                    const Stmt& body, String storage_scope = "") const final;
-  Stmt BuildProvide(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                    bool debug_keep_trivial_loop) const final;
-
-  void VisitAttrs(AttrVisitor* v) {
-    v->Visit("name", &name);
-    v->Visit("tag", &tag);
-    v->Visit("attrs", &attrs);
-    v->Visit("inputs", &inputs);
-    v->Visit("outputs", &outputs);
-    v->Visit("axis", &axis);
-    v->Visit("body", &body);
-  }
-
-  static constexpr const char* _type_key = "HybridOp";
-  TVM_DECLARE_FINAL_OBJECT_INFO(HybridOpNode, OperationNode);
-};
-
-/*!
- * \brief Managed reference to HybridOpNode
- * \sa HybridOpNode
- */
-class HybridOp : public Operation {
- public:
-  TVM_DLL HybridOp(std::string name, std::string tag, Map<String, ObjectRef> attrs,
-                   Array<Tensor> inputs, Array<Tensor> outputs, Stmt body);
-
-  TVM_DEFINE_OBJECT_REF_METHODS(HybridOp, Operation, HybridOpNode);
-};
-
 /*!
  * \brief Construct a new Var expression
  * \param name_hint The name hint for the expression
diff --git a/include/tvm/te/schedule.h b/include/tvm/te/schedule.h
deleted file mode 100644
index 47787b2c99fe..000000000000
--- a/include/tvm/te/schedule.h
+++ /dev/null
@@ -1,1010 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file tvm/te/schedule.h
- * \brief Define a schedule.
- */
-// Acknowledgement: Many schedule primitives originate from Halide and Loopy.
-#ifndef TVM_TE_SCHEDULE_H_
-#define TVM_TE_SCHEDULE_H_
-
-#include <tvm/support/with.h>
-#include <tvm/te/tensor.h>
-#include <tvm/te/tensor_intrin.h>
-#include <tvm/tir/expr.h>
-#include <tvm/tir/index_map.h>
-
-#include <string>
-#include <unordered_map>
-
-namespace tvm {
-namespace te {
-// Node container for Stage
-class StageNode;
-// Node container for Schedule
-class ScheduleNode;
-// Node container for IterVarRelation
-class IterVarRelationNode;
-// Attribute of itervar.
-class IterVarAttrNode;
-
-/*! \brief the attachment type */
-enum AttachType : int {
-  kGroupRoot = 1,
-  kInline = 2,
-  kInlinedAlready = 3,
-  kScope = 4,
-  kScanUpdate = 5
-};
-
-/*! \brief Stage, contains scheduling for a stage of computation. */
-class Stage : public ObjectRef {
- public:
-  Stage() {}
-  explicit Stage(ObjectPtr<Object> n) : ObjectRef(n) {}
-  /*!
-   * \brief create a new schedule for op.
-   * \param op The operator in the schedule
-   * \param sch The schedule which current stage belongs to
-   */
-  explicit Stage(Operation op, const ScheduleNode* sch);
-  /*!
-   * \brief access the internal node container
-   * \return the pointer to the internal node container
-   */
-  inline const StageNode* operator->() const;
-  /*!
-   * \brief access the internal node container
-   * \return the pointer to the internal node container
-   */
-  inline StageNode* operator->();
-  /*!
-   * \brief set the memory scope of the stage
-   * \param scope The memory scope.
-   */
-  TVM_DLL Stage& set_scope(std::string scope);  // NOLINT(*)
-  /*!
-   * \brief specify the schedule to be computed at the parent schedule's scope.
-   * \param parent The parent schedule.
-   * \param scope The iteration point to carry the schedule.
-   * \return reference to self.
-   */
-  TVM_DLL Stage& compute_at(Stage parent, IterVar scope);  // NOLINT(*)
-  /*!
-   * \brief Compute the function inline.
-   * \return reference to self.
-   */
-  TVM_DLL Stage& compute_inline();  // NOLINT(*)
-  /*!
-   * \brief Compute the function at group root.
-   * \return reference to self.
-   */
-  TVM_DLL Stage& compute_root();  // NOLINT(*)
-  /*!
-   * \brief Bind the IterVar to thread index.
-   *
-   * \param ivar The IterVar to be bound.
-   * \param thread_ivar The thread axis to be bound.
-   * \return reference to self.
-   */
-  TVM_DLL Stage& bind(IterVar ivar, IterVar thread_ivar);
-  /*!
-   * \brief Set the predicate to determine whether a store to the array should be performed.
-   *  Use this when there are multiple threads performing the same store and we only
-   *  need one of them to do the store.
-   *
-   * \note This is a dangerous scheduling primitive that can change behavior of program.
-   *    Only do when we are certain that thare are duplicated stores.
-   * \param predicate The condition to be checked.
-   * \return reference to self.
-   */
-  TVM_DLL Stage& set_store_predicate(PrimExpr predicate);
-  /*!
-   * \brief Specify environment threads that launched around the group's scope.
-   *  This can only be used in group stage.
-   * \param threads The threads to be launched around the scope.
-   * \note Each thread can only appear in one env_threads.
-   *    This is a beta feature.
-   * \return reference to self.
-   */
-  TVM_DLL Stage& env_threads(Array<IterVar> threads);
-  /*!
-   * \brief Split the parent by factor, generate
-   * \param parent The parent iteration domain.
-   * \param factor The split factor of the loop.
-   * \param p_outer The result outer domain
-   * \param p_inner The result inner domain.
-   * \param disable_predication If enabled, don't create a predicate for guarding the
-   * loop. This can be useful when splitting with scalable factors that the schedule writer
-   * knows are divisible by the loop bound.
-   * Warning: enabling this feature may result in incorrect code generation if not used carefully.
-   * \return reference to self.
-   */
-  TVM_DLL Stage& split(IterVar parent, PrimExpr factor, IterVar* p_outer, IterVar* p_inner,
-                       bool disable_predication = false);  // NOLINT(*)
-  /*!
-   * \brief Split the iteration with given number of parts.
-   *
-   * \param parent The parent domain.
-   * \param nparts The number of parts in the outer domain.
-   * \param p_outer The result outer domain.
-   * \param p_inner The result inner domain.
-   * \param disable_predication If enabled, don't create a predicate for guarding the
-   * loop. This can be useful when splitting with scalable factors that the schedule writer
-   * knows are divisible by the loop bound.
-   * Warning: enabling this feature may result in incorrect code generation if not used carefully.
-   * \return reference to self.
-   */
-  TVM_DLL Stage& split_by_nparts(IterVar parent, PrimExpr nparts, IterVar* p_outer,
-                                 IterVar* p_inner, bool disable_predication = false);  // NOLINT(*)
-  /*!
-   * \brief Fuse the inner outer domain to the target
-   * \param outer The outer domain to be fused.
-   * \param inner The inner domain to be fused
-   * \param p_target The result target domain.
-   * \return reference to self.
-   */
-  TVM_DLL Stage& fuse(IterVar outer, IterVar inner, IterVar* p_target);  // NOLINT(*)
-  /*!
-   * \brief Fuse all the axes together into a single axis.
-   *
-   * \param axes All the axes to be fused.
-   * \param p_target The result target domain.
-   *
-   * \note axes can be an empty array,
-   *       in that case, a singleton IterVar is created and
-   *       inserted to the outermost loop.
-   *       The fuse of empty array is used to support zero-dimension tensors.
-   *
-   * \return reference to self.
-   */
-  TVM_DLL Stage& fuse(const Array<IterVar>& axes, IterVar* p_target);  // NOLINT(*)
-  /*!
-   * \brief Reorder the iteration
-   * \param order The order of iteration variable.
-   * \return reference to self.
-   */
-  TVM_DLL Stage& reorder(const Array<IterVar>& order);  // NOLINT(*)
-  /*!
-   * \brief Perform tiling on two dimensions
-   *  The final loop order from outmost to inner most are
-   *  [x_outer, y_outer, x_inner, y_inner]
-   *
-   * \param x_parent The original x dimension
-   * \param y_parent The original y dimension
-   * \param x_factor The stride factor on x axis
-   * \param y_factor The stride factor on y axis
-   * \param p_x_outer Outer axis of x dimension
-   * \param p_y_outer Outer axis of y dimension
-   * \param p_x_inner Inner axis of x dimension
-   * \param p_y_inner Inner axis of y dimension
-   * \return reference to self.
-   */
-  TVM_DLL Stage& tile(IterVar x_parent, IterVar y_parent,  // NOLINT(*)
-                      PrimExpr x_factor, PrimExpr y_factor, IterVar* p_x_outer, IterVar* p_y_outer,
-                      IterVar* p_x_inner, IterVar* p_y_inner);
-  /*!
-   * \brief Vectorize iteration.
-   * \param var The axis to be vectorized.
-   * \return reference to self.
-   */
-  TVM_DLL Stage& vectorize(IterVar var);  // NOLINT(*)
-  /*!
-   * \brief Replace computation of the current stage by tensor intrinsic f.
-   * \param var The axis marks beginning of tensorization.
-   *  Every operations inside the axis(include axis itself is tensorized).
-   * \param f The Tensor compute intrinsics.
-   * \return reference to self.
-   */
-  TVM_DLL Stage& tensorize(IterVar var, TensorIntrin f);  // NOLINT(*)
-  /*!
-   * \brief Unroll iteration.
-   * \param var The axis to be unrolled.
-   * \return reference to self.
-   */
-  TVM_DLL Stage& unroll(IterVar var);  // NOLINT(*)
-  /*!
-   * \brief Parallelize iteration.
-   * \param var The axis to be parallelized.
-   * \return reference to self.
-   */
-  TVM_DLL Stage& parallel(IterVar var);  // NOLINT(*)
-  /*!
-   * \brief Annotate the iteration with pragma
-   *
-   * \param var The axis to be parallelized.
-   * \param pragma_type The pragma type.
-   * \param pragma_value The pragma value
-   *
-   * \return reference to self.
-   */
-  TVM_DLL Stage& pragma(IterVar var, const std::string& pragma_type,
-                        const PrimExpr& pragma_value = PrimExpr());  // NOLINT(*)
-  /*!
-   * \brief Fetch data in advance.
-   * \param domain the tensor to be prefetched
-   * \param var the iteration point at which to apply prefetching
-   * \param offset the number of iterations be to fetched in advance
-   * \return reference to self
-   */
-  TVM_DLL Stage& prefetch(const Tensor& domain, IterVar var, PrimExpr offset);  // NOLINT(*)
-  /*!
-   * \brief Set alignment requirement for specific dimension.
-   *
-   *  Such that stride[axis] == k * factor + offset for some k.
-   *
-   * \param axis The dimension to be specified for alignment.
-   * \param factor The factor multiple of alignment
-   * \param offset The required offset factor.
-   * \return reference to self
-   */
-  TVM_DLL Stage& storage_align(IterVar axis, int factor, int offset);  // NOLINT(*)
-  /*!
-   * \brief Compute current stage with double buffering.
-   * \return reference to self.
-   */
-  TVM_DLL Stage& double_buffer();  // NOLINT(*)
-  /*!
-   * \brief Compute current stage with rolling buffering.
-   * \return reference to self.
-   */
-  TVM_DLL Stage& rolling_buffer();  // NOLINT(*)
-  /*!
-   * \brief Defines a layout transformation to be applied to the buffer.
-   *
-   * The map from initial_index to final_index must be an
-   * invertible affine transformation.
-   *
-   * \param initial_indices An array of variables to represent a
-   * value's location in the tensor, using the pre-transformation
-   * layout.  These variables are used as binding occurrences to
-   * represent the initial indices when applying the initial->final
-   * mapping, and should not occur elsewhere in the
-   * Schedule. (i.e. Pass in newly constructed variables, not the
-   * initial IterVar::var)
-   *
-   * \param final_indices An array of expressions, giving the
-   * value's location in the tensor, using the post-transformation layout.
-   * Expressions should be in terms of the variables given in
-   * initial_indices.
-   *
-   * \param out_iter_vars An optional output location for the updated
-   * loop iteration variables.
-   *
-   * \return reference to self
-   */
-  TVM_DLL Stage& transform_layout(const Array<Var>& initial_indices,
-                                  const Array<PrimExpr>& final_indices,
-                                  Array<IterVar>* out_iter_vars = nullptr);
-  /*! \brief Defines separators between groups of axes.
-   *
-   * Used to define `BufferNode::axis_separators`, which has
-   * additional details.
-   *
-   * \param axis_separators A list of axis separators.
-   */
-  TVM_DLL Stage& set_axis_separators(const Array<IntImm>& axis_separators);
-  /*!
-   * \brief whether the stage has been scheduled.
-   * \return whether the stage has been scheduled.
-   */
-  bool is_scheduled() const;
-  /*!
-   * \brief Get attachment spec of current stage.
-   *  If the stage compute at Group root, this function
-   *  will traverse the group function to get the
-   *  final spec from the group.
-   * \return A stage representing the attach spec of the group.
-   */
-  Stage GetAttachSpec() const;
-  // declare container type
-  using ContainerType = StageNode;
-};
-
-/*!
- * \brief Global schedule container
- *  For operations and all the operations they depend on.
- *  The schedule per Operation is named as stage.
- */
-class Schedule : public ObjectRef {
- public:
-  Schedule() {}
-  explicit Schedule(ObjectPtr<Object> n) : ObjectRef(n) {}
-  /*!
-   * \brief Create a schedule for array of ops(and their dependencies).
-   * \param ops The ops to be scheduled.
-   */
-  TVM_DLL explicit Schedule(Array<Operation> ops);
-  /*!
-   * \brief Get a copy of current schedule.
-   * \return The copied schedule.
-   */
-  Schedule copy() const;
-  /*!
-   * \brief Get the stage corresponds to the op
-   * \param op The operation.
-   */
-  TVM_DLL Stage operator[](const Operation& op);
-  /*!
-   * \brief Short hand for getting the stage of tensor's operation.
-   * \param tensor The tensor
-   * \return The stage corresponding to the tensor's op
-   */
-  TVM_DLL Stage operator[](const Tensor& tensor) { return this->operator[](tensor->op); }
-  /*!
-   * \brief Create a new stage group for all intermediate
-   *  operations between inputs and outputs.
-   *
-   * \param outputs The output boundary of the group.
-   * \param inputs The input boundary of the group.
-   * \param include_inputs Whether include inputs if they are reachable from outputs.
-   * \return The new grouped stage.
-   */
-  TVM_DLL Stage create_group(const Array<Tensor>& outputs, const Array<Tensor>& inputs,
-                             bool include_inputs = false);
-  /*!
-   * \brief create a cache read of original tensor for readers.
-   *  This will mutate the body of the readers.
-   *  A new stage will be created for the tensor.
-   * \param tensor The tensor cached.
-   * \param scope The scope of the cache.
-   * \param readers The readers to redirect to the tensor.
-   * \return The created tensor.
-   */
-  TVM_DLL Tensor cache_read(const Tensor& tensor, const std::string& scope,
-                            const Array<Operation>& readers);
-  /*!
-   * \brief Create a cache write tensor for producing tensor.
-   *  The tensor will take over body of original tensor op.
-   *
-   *  This function can be used to do data layout transformation.
-   *  If there is a split/fuse/reorder on the data parallel axis of tensor
-   *  before cache_write is called. The intermediate cache stores
-   *  the data in the layout as the iteration order of leave axis.
-   *  The data will be transformed back to the original layout in the original tensor.
-   *  User can further call compute_inline to inline the original layout and keep
-   *  the data stored in the transformed layout.
-   *
-   * \param tensor The tensors to be produced.
-   * \param scope The scope of the storage.
-   * \return The created tensor.
-   */
-  TVM_DLL Array<Tensor> cache_write(const Array<Tensor>& tensor, const std::string& scope);
-  /*!
-   * \brief Create a cache write tensor for producing tensor.
-   *  The tensor will take over body of original tensor op.
-   *
-   *  This function can be used to do data layout transformation.
-   *  If there is a split/fuse/reorder on the data parallel axis of tensor
-   *  before cache_write is called. The intermediate cache stores
-   *  the data in the layout as the iteration order of leave axis.
-   *  The data will be transformed back to the original layout in the original tensor.
-   *  User can further call compute_inline to inline the original layout and keep
-   *  the data stored in the transformed layout.
-   *
-   * \param tensor The tensor to be produced.
-   * \param scope The scope of the storage.
-   * \return The created tensor.
-   */
-  TVM_DLL Tensor cache_write(const Tensor& tensor, const std::string& scope);
-  /*!
-   * \brief Factor a reduction axis in tensor's schedule to be an explicit axis.
-   * This will create a new stage that generated the new tensor with axis
-   * as the first dimension. The tensor's body will be rewritten as a reduction
-   * over the factored tensor.
-   *
-   *  P. Suriana, A. Adams and S. Kamil. Parallel associative reductions in halide. CGO'17
-   *
-   * \param tensor The tensor to be factored.
-   * \param axis The reduction axis in tensor's schedule to be factored.
-   * \param factor_axis The position where the new axis is placed.
-   * \return The created factored tensors.
-   */
-  TVM_DLL Array<Tensor> rfactor(const Tensor& tensor, const IterVar& axis, int factor_axis = 0);
-  /*!
-   * \brief Normalize the schedule.
-   *  This is needed before bound inference.
-   *  Insert necessary RebaseNode to make sure all leaf_iter_vars
-   *  are in form [0, extent)
-   *
-   * \return A normalized schedule, can be same as current one.
-   */
-  Schedule normalize();
-
-  /*!
-   * \brief Normalize the schedule for feature extraction in auto-scheduler.
-   * This is similar to `Schedule::normalize`, but we do aggressive simplification
-   * to the TE compute with const_matrix=True for faster compilation and feature extraction.
-   * The resulted schedule may be wrong, but it is good enough for feature extraction
-   * purposes.
-   *
-   * \return A normalized schedule, can be same as current one.
-   */
-  Schedule normalize_for_feature_extraction();
-
-  /*!
-   * \brief access the internal node container
-   * \return the pointer to the internal node container
-   */
-  inline const ScheduleNode* operator->() const;
-  /*!
-   * \brief access the internal node container
-   * \return the pointer to the internal node container
-   */
-  inline ScheduleNode* operator->();
-  // declare container type
-  using ContainerType = ScheduleNode;
-};
-
-/*!
- * \brief Context helper to collect debug information of Schedule.
- *
- *  Attach With<ScheduleContext>(schedule_instance, primitive_name)
- *  inside function body of schedule primitives to collect the
- *  snapshot of schedule status and corresponding primitive name
- */
-class ScheduleContext {
- private:
-  friend class With<ScheduleContext>;
-  ScheduleContext(const ScheduleNode* sch_node, String current_primitive_name);
-  void EnterWithScope();
-  void ExitWithScope();
-
-  /*! \brief Schedule instance to store information for debug */
-  Schedule sch_;
-  /*! \brief String representing which primitive has been applied to sch_ */
-  String current_primitive_name_;
-};
-
-/*!
- * \brief The schedule relation between IterVars
- *  can be Split, Fuse.
- */
-class IterVarRelation : public ObjectRef {
- public:
-  IterVarRelation() {}
-  explicit IterVarRelation(ObjectPtr<Object> n) : ObjectRef(n) {}
-  /*!
-   * \brief access the internal node container
-   * \return the pointer to the internal node container
-   */
-  inline const IterVarRelationNode* operator->() const;
-};
-
-/*!
- * \brief Additional scheduable attributes about IterVar.
- */
-class IterVarAttr : public ObjectRef {
- public:
-  IterVarAttr() {}
-  explicit IterVarAttr(ObjectPtr<Object> n) : ObjectRef(n) {}
-  /*!
-   * \brief access the internal node container
-   * \return the pointer to the internal node container
-   */
-  inline const IterVarAttrNode* operator->() const;
-};
-
-/*!
- * \brief represents a stage.
- *
- *  relations form a Directed acylic hypergraph in bipartite manner.
- *  With each node is represented by a IterVar,
- *  and each hyper-edge is represented by a IterVarRelation.
- *  The relations connects the IterVars in the graph.
- *
- *  Besides typical stage that corresponds to operations.
- *  There is also group stage, which groups stages together.
- *  Each stage's group(given by group) represent an constraint,
- *  the stage can only be attached to stages within the group.
- *
- *  The group stage node can be attached to IterVars as in normal stage.
- */
-class StageNode : public Object {
- public:
-  /*!
-   * \brief The operation of stage, can be different from original op.
-   *  If it is null, then this stage is a group stage.
-   */
-  Operation op;
-  /*!
-   * \brief The original operator.
-   *  The op field can change during schedule to alternate the dataflow,
-   *  while origin_op remains fixed.
-   */
-  Operation origin_op;
-  /*! \brief All the nodes in the iter var
-   *
-   * Each element of all_iter_vars represents an iteration variable
-   * that may appear within this stage's computation.  Any element
-   * of `all_iter_vars` that is in `leaf_iter_vars` represents a
-   * variable that is directly defined and usable within the stage's
-   * computation.  All other elements of `all_iter_vars` represent
-   * variables whose value must be computed from the variables in
-   * `leaf_iter_vars`.  (e.g. Support index k has been split by
-   * ``ko, ki = s.split(k, factor=4)``.  ko and ki will appear in
-   * `leaf_iter_vars`, while k will not, and must be computed as
-   * `4*ko + ki`.
-   */
-  Array<IterVar> all_iter_vars;
-  /*! \brief The current active leaf iter vars in the stage.
-   *
-   * Each element of leaf_iter_vars will either be replaced with the
-   * bound index (e.g. threadIdx.x), or will be expanded into a loop
-   * over the variable's extent.  `leaf_iter_vars` is a subset of
-   * `all_iter_vars`.
-   */
-  Array<IterVar> leaf_iter_vars;
-  /*!
-   * \brief Specify threads to be launched at the stage.
-   *  This is only valid for composite ops such as Scan.
-   * \note Experimental primitive: used for thread persistence.
-   */
-  Array<IterVar> env_threads;
-  /*!
-   * \brief The predicate under which store can happen
-   *  Use this when there can be duplicated threads doing the same store.
-   * \note Experimental primitive: used by cross thread-reduction.
-   */
-  PrimExpr store_predicate;
-  /*! \brief The relation bwteen of IterVars */
-  Array<IterVarRelation> relations;
-  /*! \brief additional attributes about iter var. */
-  Map<IterVar, IterVarAttr> iter_var_attrs;
-  /*! \brief The attachment type of the schedule */
-  AttachType attach_type{kGroupRoot};
-  /*! \brief The attach point of this schedule. */
-  IterVar attach_ivar;
-  /*! \brief The stage this node attaches to */
-  Stage attach_stage;
-  /*! \brief The schedule current stage is attached to */
-  const ScheduleNode* attach_sch;
-  /*! \brief The thread storage scope level of the stage */
-  std::string scope;
-  /*! \brief Whether this is an output stage */
-  bool is_output{false};
-  /*! \brief Whether apply double buffer optimization to this stage */
-  bool double_buffer{false};
-  /*! \brief Whether apply rolling buffer optimization to this stage */
-  bool rolling_buffer{false};
-  /*! \brief Layout transformations to be applied onto the stage's tensors. */
-  Array<IndexMap> layout_transforms;
-  /*! \brief List of axes after which to divide physical axes.
-   *
-   * Used to populate `BufferNode::axis_separators`, which has
-   * additional details.
-   */
-  Array<IntImm> axis_separators;
-  /*!
-   * \brief The parent group of the current stage.
-   *  The stage cannot be assigned to stages outside the group.
-   */
-  Stage group;
-  /*! \brief Number of direct child stages, only used for group stage.*/
-  int num_child_stages{0};
-
-  void VisitAttrs(AttrVisitor* v) {
-    v->Visit("op", &op);
-    v->Visit("origin_op", &origin_op);
-    v->Visit("all_iter_vars", &all_iter_vars);
-    v->Visit("leaf_iter_vars", &leaf_iter_vars);
-    v->Visit("env_threads", &env_threads);
-    v->Visit("relations", &relations);
-    v->Visit("iter_var_attrs", &iter_var_attrs);
-    v->Visit("attach_type", &attach_type);
-    v->Visit("attach_ivar", &attach_ivar);
-    v->Visit("attach_stage", &attach_stage);
-    v->Visit("scope", &scope);
-    v->Visit("is_output", &is_output);
-    v->Visit("double_buffer", &double_buffer);
-    v->Visit("layout_transforms", &layout_transforms);
-    v->Visit("axis_separators", &axis_separators);
-    v->Visit("group", &group);
-    v->Visit("num_child_stages", &num_child_stages);
-  }
-
-  static constexpr const char* _type_key = "Stage";
-  TVM_DECLARE_FINAL_OBJECT_INFO(StageNode, Object);
-};
-
-/*! \brief node container for schedule */
-class ScheduleNode : public Object {
- public:
-  /*! \brief The output operations in original data flow graph */
-  Array<Operation> outputs;
-  /*!
-   * \brief list of all stages for ops.
-   * The stages are sorted in dependency order.
-   */
-  Array<Stage> stages;
-  /*!
-   * \brief List of all stage groups.
-   */
-  Array<Stage> groups;
-  /*! \brief map of original operation to the stages */
-  Map<Operation, Stage> stage_map;
-  /*!
-   * \brief Internal stage map to map internal ops to stages.
-   *  This is created on demand and can be invalidated.
-   */
-  std::unordered_map<const Object*, Stage> op2stage_cache_;
-  /*!
-   * \brief list of all transformed schedules
-   * User can display the optimization strategy via TEDD step by step to check
-   * the order and effect of primitives. Set "te.keep_schedule_record" in
-   * PassContext config as true to enable recording.
-   */
-  Array<Schedule> schedule_record;
-  /*!
-   * \brief list of all applied primitive names.
-   */
-  Array<String> primitive_record;
-  /*!
-   * \brief Flag to keep schedule record or not.
-   */
-  Optional<Bool> keep_schedule_record;
-
-  void VisitAttrs(AttrVisitor* v) {
-    v->Visit("outputs", &outputs);
-    v->Visit("stages", &stages);
-    v->Visit("groups", &groups);
-    v->Visit("stage_map", &stage_map);
-    v->Visit("schedule_record", &schedule_record);
-    v->Visit("primitive_record", &primitive_record);
-    v->Visit("keep_schedule_record", &keep_schedule_record);
-  }
-
-  /*! \brief Initialize temp cache. */
-  void InitCache();
-  /*! \brief Invalidate temp cache. */
-  void InvalidateCache();
-
-  /*!
-   * \brief Check if the schedule contains an Operation.
-   * \param op The candidate Operation.
-   * \return true if the schedule has the Operation. Otherwise, false.
-   */
-  TVM_DLL bool Contain(const Operation& op) const;
-
-  /*!
-   * \brief Check if the schedule contains a Tensor.
-   * \param tensor The candidate tensor.
-   * \return true if the schedule has the tensor. Otherwise, false.
-   */
-  TVM_DLL bool Contain(const Tensor& tensor) const { return Contain(tensor->op); }
-
-  static constexpr const char* _type_key = "Schedule";
-  TVM_DECLARE_FINAL_OBJECT_INFO(ScheduleNode, Object);
-};
-
-/*!
- * \brief Create a schedule for array of ops(and their dependencies).
- * \param ops The ops to be scheduled.
- * \return sch The created Schedule.
- */
-inline Schedule create_schedule(Array<Operation> ops) { return Schedule(ops); }
-
-/*! \brief node container for IterVar attr */
-class IterVarAttrNode : public Object {
- public:
-  /*! \brief The iteration type. */
-  IterVarType iter_type{kDataPar};
-  /*! \brief The thread this iter Var binds, can be null */
-  IterVar bind_thread;
-  /*! \brief List of tensor to be prefetched in this loop */
-  Array<Tensor> prefetch_data;
-  /*! \brief The offset used in each prefetch */
-  Array<PrimExpr> prefetch_offset;
-  /*!
-   * \brief Tensor intrinsic used in tensorization,
-   *   when the axis is marked as Tensorized
-   */
-  TensorIntrin tensor_intrin;
-  /*! \brief Alignment factor of buffer dimension */
-  int dim_align_factor{0};
-  /*! \brief Alignment offset of buffer dimension */
-  int dim_align_offset{0};
-  /*!
-   * \brief Additional pragma keys, array of StringImm
-   */
-  Array<PrimExpr> pragma_keys;
-  /*!
-   * \brief Additional values of pragma, if any
-   */
-  Array<PrimExpr> pragma_values;
-
-  void VisitAttrs(AttrVisitor* v) {
-    v->Visit("iter_type", &iter_type);
-    v->Visit("bind_thread", &bind_thread);
-    v->Visit("prefetch_data", &prefetch_data);
-    v->Visit("prefetch_offset", &prefetch_offset);
-    v->Visit("tensor_intrin", &tensor_intrin);
-    v->Visit("dim_align_factor", &dim_align_factor);
-    v->Visit("dim_align_offset", &dim_align_offset);
-    v->Visit("pragma_keys", &pragma_keys);
-    v->Visit("pragma_values", &pragma_values);
-  }
-
-  static constexpr const char* _type_key = "IterVarAttr";
-  TVM_DECLARE_FINAL_OBJECT_INFO(IterVarAttrNode, Object);
-};
-
-/*! \brief base node of iteration var */
-class IterVarRelationNode : public Object {
- public:
-  static constexpr const char* _type_key = "IterVarRelation";
-  TVM_DECLARE_BASE_OBJECT_INFO(IterVarRelationNode, Object);
-};
-
-/*!
- * \brief Split the parent domain into product of
- *  outer and iter.
- */
-class SplitNode : public IterVarRelationNode {
- public:
-  /*! \brief The parent domain */
-  IterVar parent;
-  /*! \brief The outer domain */
-  IterVar outer;
-  /*! \brief The inner domain */
-  IterVar inner;
-  /*! \brief The split factor */
-  PrimExpr factor;
-  /*! \brief Number of parts, only factor or nparts can be given */
-  PrimExpr nparts;
-  /*! \brief Whether to disable generation of predication. */
-  bool disable_predication;
-
-  void VisitAttrs(AttrVisitor* v) {
-    v->Visit("parent", &parent);
-    v->Visit("outer", &outer);
-    v->Visit("inner", &inner);
-    v->Visit("factor", &factor);
-    v->Visit("nparts", &nparts);
-    v->Visit("disable_predication", &disable_predication);
-  }
-
-  static constexpr const char* _type_key = "Split";
-  TVM_DECLARE_FINAL_OBJECT_INFO(SplitNode, IterVarRelationNode);
-};
-
-/*!
- * \brief Managed reference to SplitNode
- * \sa SplitNode
- */
-class Split : public IterVarRelation {
- public:
-  TVM_DLL Split(IterVar parent, IterVar outer, IterVar inner, PrimExpr factor, PrimExpr nparts,
-                bool disable_predication);
-
-  TVM_DEFINE_OBJECT_REF_METHODS(Split, IterVarRelation, SplitNode);
-};
-
-/*!
- * \brief Fuse two domains into one domain.
- */
-class FuseNode : public IterVarRelationNode {
- public:
-  /*! \brief The outer domain */
-  IterVar outer;
-  /*! \brief The inner domain */
-  IterVar inner;
-  /*! \brief The target domain */
-  IterVar fused;
-
-  void VisitAttrs(AttrVisitor* v) {
-    v->Visit("outer", &outer);
-    v->Visit("inner", &inner);
-    v->Visit("fused", &fused);
-  }
-
-  static constexpr const char* _type_key = "Fuse";
-  TVM_DECLARE_FINAL_OBJECT_INFO(FuseNode, IterVarRelationNode);
-};
-
-/*!
- * \brief Managed reference to FuseNode
- * \sa FuseNode
- */
-class Fuse : public IterVarRelation {
- public:
-  TVM_DLL Fuse(IterVar outer, IterVar inner, IterVar fused);
-
-  TVM_DEFINE_OBJECT_REF_METHODS(Fuse, IterVarRelation, FuseNode);
-};
-
-/*!
- * \brief Rebase the iteration to make min to be 0.
- *  This is useful to normalize the Schedule
- *  to make every leaf variable's min to be 0.
- */
-class RebaseNode : public IterVarRelationNode {
- public:
-  /*! \brief The parent domain */
-  IterVar parent;
-  /*! \brief The inner domain */
-  IterVar rebased;
-
-  void VisitAttrs(AttrVisitor* v) {
-    v->Visit("parent", &parent);
-    v->Visit("rebased", &rebased);
-  }
-
-  static constexpr const char* _type_key = "Rebase";
-  TVM_DECLARE_FINAL_OBJECT_INFO(RebaseNode, IterVarRelationNode);
-};
-
-/*!
- * \brief Managed reference to RebaseNode
- * \sa RebaseNode
- */
-class Rebase : public IterVarRelation {
- public:
-  TVM_DLL Rebase(IterVar parent, IterVar rebased);
-
-  TVM_DEFINE_OBJECT_REF_METHODS(Rebase, IterVarRelation, RebaseNode);
-};
-
-/*!
- * \brief Singleton iterator [0, 1)
- */
-class SingletonNode : public IterVarRelationNode {
- public:
-  /*! \brief The singleton iterator */
-  IterVar iter;
-
-  void VisitAttrs(AttrVisitor* v) { v->Visit("iter", &iter); }
-
-  static constexpr const char* _type_key = "Singleton";
-  TVM_DECLARE_FINAL_OBJECT_INFO(SingletonNode, IterVarRelationNode);
-};
-
-/*!
- * \brief Managed reference to SingletonNode
- * \sa SingletonNode
- */
-class Singleton : public IterVarRelation {
- public:
-  TVM_DLL explicit Singleton(IterVar iter);
-
-  TVM_DEFINE_OBJECT_REF_METHODS(Singleton, IterVarRelation, SingletonNode);
-};
-
-/*!
- * \brief Transform iterator according to some arbitrary expression.
- */
-class TransformNode : public IterVarRelationNode {
- public:
-  /*! \brief The loop variables that were replaced by the transformation.
-   *
-   * Prior to applying a layout transformation, these represent the
-   * loops to iterate over a tensor as it is being computed, following
-   * a row-major traversal of the tensor's original shape in the
-   * compute definition.
-   */
-  Array<IterVar> original_variables;
-
-  /*! \brief The variables generated by the transformation.
-   *
-   * After to applying a layout transformation, these represent the
-   * loops to iterate over a tensor as it is being computed, following
-   * a row-major traversal of the transformed shape of the tensor.
-   */
-  Array<IterVar> transformed_variables;
-
-  /*! \brief Map from the original variables to the transformed variables.
-   *
-   * Used to determine iterator ranges over the transformed variables.
-   */
-  IndexMap forward_transformation;
-
-  /*! \brief Map from transformed variables to the original variables
-   *
-   * Used to rewrite expressions containing the original loop iterators
-   * in terms of the transformed loop iterators.
-   */
-  IndexMap inverse_transformation;
-
-  void VisitAttrs(AttrVisitor* v) {
-    v->Visit("original_variables", &original_variables);
-    v->Visit("transformed_variables", &transformed_variables);
-    v->Visit("forward_transformation", &forward_transformation);
-    v->Visit("inverse_transformation", &inverse_transformation);
-  }
-
-  static constexpr const char* _type_key = "Transform";
-  TVM_DECLARE_FINAL_OBJECT_INFO(TransformNode, IterVarRelationNode);
-};
-
-class Transform : public IterVarRelation {
- public:
-  TVM_DLL explicit Transform(Array<IterVar> original_variables,
-                             Array<IterVar> transformed_variables, IndexMap forward_transformation,
-                             IndexMap inverse_transformation);
-
-  TVM_DEFINE_OBJECT_REF_METHODS(Transform, IterVarRelation, TransformNode);
-};
-
-/*! \brief Container for specialization conditions. */
-class SpecializedConditionNode : public Object {
- public:
-  /*!
-   * \brief List of conditions in conjunctive joint form (CNF).
-   *   Each condition should be a simple expression, e.g., n > 16, m % 8 == 0, etc.,
-   *   where n, m are tvm::Var that represents a dimension in the tensor shape.
-   */
-  Array<PrimExpr> clauses;
-
-  void VisitAttrs(AttrVisitor* v) { v->Visit("clauses", &clauses); }
-
-  static constexpr const char* _type_key = "SpecializedCondition";
-  TVM_DECLARE_FINAL_OBJECT_INFO(SpecializedConditionNode, Object);
-};
-
-/*!
- * \brief Specialized condition to enable op specialization
- */
-class SpecializedCondition : public ObjectRef {
- public:
-  /*!
-   * \brief construct from conditions
-   * \param conditions The clauses in the specialized condition.
-   */
-  TVM_DLL SpecializedCondition(Array<PrimExpr> conditions);  // NOLINT(*)
-
-  /*!
-   * \brief Get the current specialized condition.
-   * \return the current specialized condition.
-   */
-  TVM_DLL static SpecializedCondition Current();
-
-  TVM_DEFINE_OBJECT_REF_METHODS(SpecializedCondition, ObjectRef, SpecializedConditionNode);
-  class Internal;
-
- private:
-  // enable with syntax.
-  friend class Internal;
-  friend class With<SpecializedCondition>;
-  /*! \brief Push a new specialized condition onto the thread local stack. */
-  TVM_DLL void EnterWithScope();
-  /*! \brief Pop a specialized condition off the thread local context stack. */
-  TVM_DLL void ExitWithScope();
-};
-
-// implementations
-inline const StageNode* Stage::operator->() const { return static_cast<const StageNode*>(get()); }
-inline StageNode* Stage::operator->() { return static_cast<StageNode*>(get_mutable()); }
-
-inline const ScheduleNode* Schedule::operator->() const {
-  return static_cast<const ScheduleNode*>(get());
-}
-inline ScheduleNode* Schedule::operator->() { return static_cast<ScheduleNode*>(get_mutable()); }
-
-inline const IterVarRelationNode* IterVarRelation::operator->() const {
-  return static_cast<const IterVarRelationNode*>(get());
-}
-
-inline const IterVarAttrNode* IterVarAttr::operator->() const {
-  return static_cast<const IterVarAttrNode*>(get());
-}
-
-}  // namespace te
-}  // namespace tvm
-#endif  // TVM_TE_SCHEDULE_H_
diff --git a/include/tvm/te/schedule_pass.h b/include/tvm/te/schedule_pass.h
deleted file mode 100644
index 3f9da5fb5d5a..000000000000
--- a/include/tvm/te/schedule_pass.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file tvm/te/schedule_pass.h
- * \brief  Collection of Schedule pass functions.
- *
- *  These passes works on the schedule hyper-graph
- *  and infers information such as bounds, check conditions
- *  read/write dependencies between the IterVar
- */
-#ifndef TVM_TE_SCHEDULE_PASS_H_
-#define TVM_TE_SCHEDULE_PASS_H_
-
-#include <tvm/te/schedule.h>
-#include <tvm/tir/function.h>
-
-namespace tvm {
-namespace te {
-
-/*!
- * \brief To automatically inline the element-wise operations.
- *
- * \param sch The schedule to be inlined.
- */
-void AutoInlineElemWise(Schedule sch);
-
-/*!
- * \brief To automatically inline the broadcast operations.
- *
- * \param sch The schedule to be inlined.
- */
-void AutoInlineBroarcast(Schedule sch);
-
-/*!
- * \brief To automatically inline operations with injective writes
- *   (i.e. writes without reduction or sequential loops). Note
- *   that in this case, guarantees about contiguity, transpose, stride,
- *   alignemnt and memory footprint in general do not hold.
- *
- * \param sch The schedule to be inlined.
- */
-TVM_DLL void AutoInlineInjective(Schedule sch);
-
-/*!
- * \brief Infer the bound of all iteration variables relates to the schedule.
- *
- * \param sch The root schedule to infer all the bounds.
- * \return the result bound of the iteration Variable
- */
-Map<IterVar, Range> InferBound(const Schedule& sch);
-
-/*!
- * \brief Verify if there is any argument bound to compact buffer.
- *
- * \param stmt The stmt to be verified.
- * \return true if there is any buffer_bind_scope attribute found,
- *        otherwise, false.
- */
-bool VerifyCompactBuffer(const Stmt& stmt);
-
-/*!
- * \brief Schedule s' dependent operations.
- *
- * \param s The schedule to be realized
- * \param dom_map The domain of each iter vars.
- * \param debug_keep_trivial_loop Whether keep trivial loops with extent of 1 during lowering.
- *                                This is a debug feature for dataflow/axis analysis.
- *                                Note: If this is true, The lowered IR may be incorrect,
- *                                because we will also delete the init part of reduction
- * \return the result Stmt
- */
-Stmt ScheduleOps(Schedule s, Map<IterVar, Range> dom_map, bool debug_keep_trivial_loop);
-
-/*!
- * \brief Postprocessing the Stmt generated by ScheduleOps to create
- *        a PrimFunc that can then be used for further TIR optimizations.
- *
- *  Perform this translation before running any TIR optimizations.
- *
- *  List of actions taken by the function:
- *  - Remove occurrences of te::Tensor, te::Operation in the IR
- *    and replace them by corresponding IR nodes via tir::Buffer.
- *  - Add annotation of extern buffers using the buffer_map field
- *    in the PrimFunc type.
- *
- * \param arg_list Array of Tensor/Var/Buffer arguments to the function.
- * \param body The body of the function.
- * \param bindings potential Tensor to Buffer bindings for the Tensors in the body.
- */
-PrimFunc SchedulePostProcToPrimFunc(Array<ObjectRef> arg_list, Stmt body,
-                                    Optional<Map<Tensor, Buffer>> bindings);
-
-}  // namespace te
-}  // namespace tvm
-#endif  // TVM_TE_SCHEDULE_PASS_H_
diff --git a/include/tvm/te/tensor_intrin.h b/include/tvm/te/tensor_intrin.h
deleted file mode 100644
index 22f29defbb64..000000000000
--- a/include/tvm/te/tensor_intrin.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file tvm/te/tensor_intrin.h
- * \brief Tensor intrinsic operations.
- */
-#ifndef TVM_TE_TENSOR_INTRIN_H_
-#define TVM_TE_TENSOR_INTRIN_H_
-
-#include <tvm/te/tensor.h>
-#include <tvm/tir/buffer.h>
-
-#include <string>
-
-namespace tvm {
-namespace te {
-
-/*! \brief Node to represent a Tensor intrinsic operator */
-class TensorIntrinNode : public Object {
- public:
-  /*! \brief The name of the intrinsic */
-  std::string name;
-  /*! \brief The operation this intrinsics is carrying out */
-  Operation op;
-  /*! \brief List of inputs of operator, placeholder in postdfs order */
-  Array<Tensor> inputs;
-  /*!
-   * \brief Symbolic buffers of each output/input tensor
-   *  buffers[0:len(inputs)] are buffers of the inputs.
-   *  buffers[len(inputs):] are buffers of each output.
-   *
-   * \note When a field in Buffer is Var, it means we can be flexible
-   *  wrt that field and Var can occur in body.
-   *  When it is a constant, it means we can only take data in that shape.
-   */
-  Array<Buffer> buffers;
-  /*! \brief List of scalar variables, used in body. These placeholders
-   *  will be bound to expressions passed in when the TensorIntrin is called
-   * from a TensorComputeOp.
-   */
-  Array<Var> scalar_params;
-  /*! \brief The normal statement to execute the intrinsic */
-  Stmt body;
-  /*!
-   * \brief Special statement for reduction op, can be None
-   *  reset the value of output buffer to identity value.
-   */
-  Stmt reduce_init;
-  /*!
-   * \brief Special statement for reduction op, can be None
-   *  Reduce: do a reduction of current output buffer with the result.
-   */
-  Stmt reduce_update;
-  /*! \brief constructor */
-  TensorIntrinNode() {}
-
-  void VisitAttrs(AttrVisitor* v) {
-    v->Visit("name", &name);
-    v->Visit("op", &op);
-    v->Visit("inputs", &inputs);
-    v->Visit("buffers", &buffers);
-    v->Visit("scalar_params", &scalar_params);
-    v->Visit("body", &body);
-    v->Visit("reduce_init", &reduce_init);
-    v->Visit("reduce_update", &reduce_update);
-  }
-
-  static constexpr const char* _type_key = "TensorIntrin";
-  TVM_DECLARE_FINAL_OBJECT_INFO(TensorIntrinNode, Object);
-};
-
-/*!
- * \brief Managed reference to TensorIntrinNode
- * \sa TensorIntrinNode
- */
-class TensorIntrin : public ObjectRef {
- public:
-  TVM_DLL TensorIntrin(std::string name, Operation op, Array<Tensor> inputs, Array<Buffer> buffers,
-                       Array<Var> scalar_params, Stmt body, Stmt reduce_init, Stmt reduce_update);
-
-  TVM_DEFINE_OBJECT_REF_METHODS(TensorIntrin, ObjectRef, TensorIntrinNode);
-};
-
-class TensorIntrinCallNode : public Object {
- public:
-  /*! \brief the tensor intrinsic */
-  TensorIntrin intrin;
-  /*! \brief input tensors of the intrinsic */
-  Array<Tensor> tensors;
-  /*! \brief regions of input tensors */
-  Array<Region> regions;
-
-  /*!
-   * \brief IterVar on each reduction axis, if the
-   * intrin will use the reduce axis
-   */
-  Array<IterVar> reduce_axis;
-
-  /*! \brief scalar expression inputs */
-  Array<PrimExpr> scalar_inputs;
-
-  void VisitAttrs(AttrVisitor* v) {
-    v->Visit("intrin", &intrin);
-    v->Visit("tensors", &tensors);
-    v->Visit("regions", &regions);
-    v->Visit("reduce_axis", &reduce_axis);
-    v->Visit("scalar_inputs", &scalar_inputs);
-  }
-
-  static constexpr const char* _type_key = "TensorIntrinCall";
-  TVM_DECLARE_FINAL_OBJECT_INFO(TensorIntrinCallNode, Object);
-};
-
-/*!
- * \brief Managed reference to TensorIntrinCallNode
- * \sa TensorIntrinCallNode
- */
-class TensorIntrinCall : public ObjectRef {
- public:
-  TVM_DLL TensorIntrinCall(TensorIntrin intrin, Array<Tensor> tensors, Array<Region> regions,
-                           Array<IterVar> reduce_axis, Array<PrimExpr> scalar_inputs);
-
-  TVM_DEFINE_OBJECT_REF_METHODS(TensorIntrinCall, ObjectRef, TensorIntrinCallNode);
-};
-
-}  // namespace te
-}  // namespace tvm
-#endif  // TVM_TE_TENSOR_INTRIN_H_
diff --git a/include/tvm/topi/cuda/dense.h b/include/tvm/topi/cuda/dense.h
deleted file mode 100644
index 7fd3107b6c32..000000000000
--- a/include/tvm/topi/cuda/dense.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file cuda/dense.h
- * \brief CUDA schedule for dense operation
- */
-#ifndef TVM_TOPI_CUDA_DENSE_H_
-#define TVM_TOPI_CUDA_DENSE_H_
-
-#include <tvm/target/generic_func.h>
-#include <tvm/te/operation.h>
-#include <tvm/te/schedule_pass.h>
-#include <tvm/topi/contrib/cublas.h>
-#include <tvm/topi/detail/array_utils.h>
-#include <tvm/topi/generic/extern.h>
-#include <tvm/topi/nn/dense.h>
-#include <tvm/topi/tags.h>
-
-namespace tvm {
-namespace topi {
-
-using namespace tvm::te;
-
-namespace cuda {
-/*!
- * \brief Implementation of dense for CUDA backend
- *
- * \param target The target device
- * \param data Tensor with shape [batch, in_dim]
- * \param weight Tensor with shape [out_dim, in_dim]
- * \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor()
- * \param out_dtype Output data type. Used for mixed precision.
- *
- * \return Tensor with shape [batch, out_dim]
- */
-inline tvm::te::Tensor dense_cuda(const Target& target, const tvm::te::Tensor& data,
-                                  const tvm::te::Tensor& weight, const tvm::te::Tensor& bias,
-                                  const DataType& out_dtype) {
-  ICHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data";
-  ICHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight";
-  if (bias.defined()) {
-    ICHECK_EQ(bias->shape.size(), 1) << "dense requires 1-D bias";
-  }
-
-  auto batch = data->shape[0];
-  auto in_dim = data->shape[1];
-  auto out_dim = weight->shape[0];
-
-  if (target->GetLibs().count("cublas")) {
-    ICHECK_EQ(data->dtype, out_dtype) << "Mixed precision not supported.";
-    auto mm = topi::contrib::cublas_matmul(data, weight, false, true);
-    if (bias.defined()) {
-      mm = tvm::te::compute(
-          {batch, out_dim}, [&](Var i, Var j) { return mm(i, j) + bias(j); }, "tensor", kBroadcast);
-    }
-
-    return mm;
-  } else {
-    return topi::nn::dense(data, weight, bias, out_dtype);
-  }
-}
-
-/*!
- * \brief Create a CUDA schedule for dense
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule schedule_dense(const Target& target, const Array<Tensor>& outs) {
-  if (target->kind->name == "cuda" && target->GetLibs().count("cublas")) {
-    return topi::generic::schedule_extern(target, outs);
-  }
-
-  Array<Operation> out_ops;
-  for (auto t : outs) {
-    out_ops.push_back(t->op);
-  }
-  auto s = create_schedule(out_ops);
-
-  auto _schedule = [&](const Tensor& dense) {
-    auto num_thread = 64;
-    auto k = dense->op.as<ComputeOpNode>()->reduce_axis[0];
-    IterVar ko, kf;
-    s[dense].split(k, num_thread, &ko, &kf);
-    auto dense_f = s.rfactor(dense, kf)[0];
-
-    Tensor out;
-    if (detail::contains(s->outputs, dense->op)) {
-      out = dense;
-    } else {
-      out = outs[0]->op.output(0);
-      s[dense].compute_at(s[out], s[out]->op.as<ComputeOpNode>()->axis[1]);
-    }
-    s[out].bind(s[out]->op.as<ComputeOpNode>()->axis[0],
-                tvm::te::thread_axis(Range(), "blockIdx.y"));
-    s[out].bind(s[out]->op.as<ComputeOpNode>()->axis[1],
-                tvm::te::thread_axis(Range(), "blockIdx.x"));
-
-    auto tx = s[dense]->op.as<ComputeOpNode>()->reduce_axis[0];
-    auto thread_x = tvm::te::thread_axis(Range(), "threadIdx.x");
-    s[dense].bind(tx, thread_x);
-    s[dense_f].compute_at(s[dense], tx);
-    s[dense].set_store_predicate(static_cast<PrimExpr>(thread_x) == 0);
-    s[out].set_store_predicate(static_cast<PrimExpr>(thread_x) == 0);
-  };
-
-  std::function<void(Operation)> traverse;
-  traverse = [&](const Operation& op) {
-    // Inline all one-to-one-mapping operators except the last stage (output)
-    if (is_broadcast(op->tag)) {
-      if (!detail::contains(s->outputs, op)) {
-        s[op].compute_inline();
-      }
-      for (auto tensor : op->InputTensors()) {
-        if (tensor->op->InputTensors().size() > 0) {
-          traverse(tensor->op);
-        }
-      }
-    } else if (op->tag == "dense") {
-      // If tag starts with global_pool
-      auto dense = op.output(0);
-      _schedule(dense);
-    } else {
-      LOG(ERROR) << "Unsupported operator " << op->tag;
-    }
-  };
-
-  traverse(outs[0]->op);
-  return s;
-}
-
-}  // namespace cuda
-}  // namespace topi
-}  // namespace tvm
-#endif  // TVM_TOPI_CUDA_DENSE_H_
diff --git a/include/tvm/topi/cuda/injective.h b/include/tvm/topi/cuda/injective.h
deleted file mode 100644
index 79ec338aae0e..000000000000
--- a/include/tvm/topi/cuda/injective.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file cuda/injective.h
- * \brief CUDA schedule for injective operations
- */
-#ifndef TVM_TOPI_CUDA_INJECTIVE_H_
-#define TVM_TOPI_CUDA_INJECTIVE_H_
-
-#include <tvm/target/generic_func.h>
-#include <tvm/te/operation.h>
-#include <tvm/te/schedule_pass.h>
-#include <tvm/topi/detail/fuse.h>
-#include <tvm/topi/tags.h>
-
-namespace tvm {
-namespace topi {
-
-using namespace tvm::te;
-
-namespace cuda {
-
-/*!
- * \brief Updates an existing schedule for the given injective ops.
- *
- * \param sch The schedule to update.
- * \param out The tensor representing the injective op.
- *
- * \return The updated schedule.
- */
-inline Schedule schedule_injective_from_existing(Schedule sch, const Tensor& out) {
-  auto fused = detail::Fuse(sch[out], sch[out]->op.as<ComputeOpNode>()->axis);
-  auto target = Target::Current(false);
-  int num_thread = target->GetAttr<Integer>("max_num_threads").value().IntValue();
-  IterVar bx, tx;
-  sch[out].split(fused, num_thread, &bx, &tx);
-  sch[out].bind(bx, thread_axis(Range(), "blockIdx.x"));
-  sch[out].bind(tx, thread_axis(Range(), "threadIdx.x"));
-  return sch;
-}
-
-/*!
- * \brief Create a CUDA schedule for the given output tensors.
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule schedule_injective(const Target& target, const Array<Tensor>& outs) {
-  Array<Operation> out_ops;
-  for (auto t : outs) {
-    out_ops.push_back(t->op);
-  }
-  auto s = create_schedule(out_ops);
-  tvm::te::AutoInlineInjective(s);
-  for (auto out : outs) {
-    schedule_injective_from_existing(s, out);
-  }
-  return s;
-}
-
-}  // namespace cuda
-}  // namespace topi
-}  // namespace tvm
-#endif  // TVM_TOPI_CUDA_INJECTIVE_H_
diff --git a/include/tvm/topi/cuda/pooling.h b/include/tvm/topi/cuda/pooling.h
deleted file mode 100644
index 92be03123602..000000000000
--- a/include/tvm/topi/cuda/pooling.h
+++ /dev/null
@@ -1,187 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file cuda/pooling.h
- * \brief CUDA schedule for pooling operations
- */
-#ifndef TVM_TOPI_CUDA_POOLING_H_
-#define TVM_TOPI_CUDA_POOLING_H_
-
-#include <tvm/target/generic_func.h>
-#include <tvm/te/operation.h>
-#include <tvm/te/schedule_pass.h>
-#include <tvm/topi/detail/array_utils.h>
-#include <tvm/topi/detail/fuse.h>
-#include <tvm/topi/tags.h>
-
-namespace tvm {
-namespace topi {
-
-using namespace tvm::te;
-
-namespace cuda {
-
-/*!
- * \brief Create a CUDA schedule for pool
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule schedule_pool(const Target& target, const Array<Tensor>& outs) {
-  Array<Operation> out_ops;
-  for (auto t : outs) {
-    out_ops.push_back(t->op);
-  }
-  auto s = create_schedule(out_ops);
-
-  auto _schedule = [&](const Tensor& padded_input, const Tensor& pool) {
-    if (padded_input->op->IsInstance<ComputeOpNode>()) {
-      s[padded_input].compute_inline();
-    }
-    int num_thread = target->GetAttr<Integer>("max_num_threads").value().IntValue();
-    Tensor out;
-    Tensor OL;
-    if (detail::contains(s->outputs, pool->op)) {
-      out = pool;
-      OL = s.cache_write(pool, "local");
-    } else {
-      out = outs[0]->op.output(0);
-      s[pool].set_scope("local");
-    }
-    auto fused = detail::Fuse(s[out], s[out]->op.as<ComputeOpNode>()->axis);
-    IterVar bx, tx;
-    s[out].split(fused, num_thread, &bx, &tx);
-    s[out].bind(bx, tvm::te::thread_axis(Range(), "blockIdx.x"));
-    s[out].bind(tx, tvm::te::thread_axis(Range(), "threadIdx.x"));
-    if (detail::contains(s->outputs, pool->op)) {
-      s[OL].compute_at(s[out], tx);
-    } else {
-      s[pool].compute_at(s[out], tx);
-    }
-  };
-
-  std::function<void(Operation)> traverse;
-  traverse = [&](const Operation& op) {
-    // Inline all one-to-one-mapping operators except the last stage (output)
-    if (is_broadcast(op->tag)) {
-      if (!detail::contains(s->outputs, op)) {
-        s[op].compute_inline();
-      }
-      for (auto tensor : op->InputTensors()) {
-        if (tensor->op->InputTensors().size() > 0) {
-          traverse(tensor->op);
-        }
-      }
-    } else if (op->tag.rfind("pool", 0) == 0) {
-      // If tag starts with pool
-      auto padded_input = op->InputTensors()[0];
-      auto pool = op.output(0);
-      _schedule(padded_input, pool);
-    } else {
-      LOG(ERROR) << "Unsupported operator " << op->tag;
-    }
-  };
-
-  traverse(outs[0]->op);
-  return s;
-}
-
-/*!
- * \brief Create a CUDA schedule for global_pool
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule schedule_global_pool(const Target& target, const Array<Tensor>& outs) {
-  Array<Operation> out_ops;
-  for (auto t : outs) {
-    out_ops.push_back(t->op);
-  }
-  auto s = create_schedule(out_ops);
-
-  auto _schedule = [&](const Tensor& pool) {
-    auto num_thread = 8;
-    auto block_x = tvm::te::thread_axis(Range(), "blockIdx.x");
-    auto block_y = tvm::te::thread_axis(Range(), "blockIdx.y");
-    auto thread_x = tvm::te::thread_axis(Range(0, num_thread), "threadIdx.x");
-    auto thread_y = tvm::te::thread_axis(Range(0, num_thread), "threadIdx.y");
-    Tensor out;
-    Tensor OL;
-    if (detail::contains(s->outputs, pool->op)) {
-      out = pool;
-      OL = s.cache_write(pool, "local");
-    } else {
-      out = outs[0]->op.output(0);
-      s[pool].set_scope("local");
-    }
-
-    auto i = s[out]->op.as<ComputeOpNode>()->axis[0];
-    auto c = s[out]->op.as<ComputeOpNode>()->axis[1];
-
-    IterVar by, ty;
-    s[out].split(i, num_thread, &by, &ty);
-    IterVar bx, tx;
-    s[out].split(c, num_thread, &bx, &tx);
-    s[out].reorder({by, bx, ty, tx});
-    s[out].bind(ty, thread_y);
-    s[out].bind(tx, thread_x);
-    s[out].bind(by, block_y);
-    s[out].bind(bx, block_x);
-
-    if (detail::contains(s->outputs, pool->op)) {
-      s[OL].compute_at(s[out], tx);
-    } else {
-      s[pool].compute_at(s[out], tx);
-    }
-  };
-
-  std::function<void(Operation)> traverse;
-  traverse = [&](const Operation& op) {
-    // Inline all one-to-one-mapping operators except the last stage (output)
-    if (is_broadcast(op->tag)) {
-      if (!detail::contains(s->outputs, op)) {
-        s[op].compute_inline();
-      }
-      for (auto tensor : op->InputTensors()) {
-        if (tensor->op->InputTensors().size() > 0) {
-          traverse(tensor->op);
-        }
-      }
-    } else if (op->tag.rfind("global_pool", 0) == 0) {
-      // If tag starts with global_pool
-      auto pool = op.output(0);
-      _schedule(pool);
-    } else {
-      LOG(ERROR) << "Unsupported operator " << op->tag;
-    }
-  };
-
-  traverse(outs[0]->op);
-  return s;
-}
-
-}  // namespace cuda
-}  // namespace topi
-}  // namespace tvm
-#endif  // TVM_TOPI_CUDA_POOLING_H_
diff --git a/include/tvm/topi/cuda/reduction.h b/include/tvm/topi/cuda/reduction.h
deleted file mode 100644
index b1905d844250..000000000000
--- a/include/tvm/topi/cuda/reduction.h
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file cuda/reduction.h
- * \brief CUDA schedule for reduction operations
- */
-#ifndef TVM_TOPI_CUDA_REDUCTION_H_
-#define TVM_TOPI_CUDA_REDUCTION_H_
-
-#include <tvm/target/generic_func.h>
-#include <tvm/te/operation.h>
-#include <tvm/te/schedule_pass.h>
-#include <tvm/topi/detail/fuse.h>
-#include <tvm/topi/tags.h>
-
-namespace tvm {
-namespace topi {
-
-using namespace tvm::te;
-
-namespace cuda {
-/*!
- * \brief Schedule a given reduce operation.
- *
- * \param target The target to generate a schedule for.
- * \param op The operation representing the injective operation.
- * \param sch The schedule to apply this scheduling to
- * \param is_idx_reduce Pass true to schedule a reduce op that returns
- * an index, such as argmax or argmin.
- *
- * \return The schedule given by sch
- */
-Schedule ScheduleReduce(const Target& target, Operation op, Schedule sch,
-                        bool is_idx_reduce = false) {
-  Tensor data_out;
-  Tensor data_in;
-
-  if (!is_idx_reduce) {
-    data_in = op->InputTensors()[0];
-    data_out = op.output(0);
-  } else {
-    data_out = op->InputTensors()[0];
-  }
-
-  auto out_stage = sch[data_out];
-  ICHECK_GT(out_stage->op.as<ComputeOpNode>()->reduce_axis.size(), 0)
-      << "reduce_axis must be greater than zero";
-
-  bool all_reduce;
-  int num_thread;
-  IterVar block_x, thread_x, thread_y;
-
-  if (out_stage->op.as<ComputeOpNode>()->axis.size() > 0) {
-    all_reduce = false;
-    num_thread = 32;
-    if (target->kind->name == "opencl" || target->kind->name == "metal") {
-      // Without this, CL_INVALID_WORK_GROUP_SIZE occurs with python tests.
-      // Don't know why.
-      num_thread = 16;
-    }
-    block_x = tvm::te::thread_axis(Range(), "blockIdx.x");
-    thread_x = tvm::te::thread_axis(Range(0, num_thread), "threadIdx.x");
-    thread_y = tvm::te::thread_axis(Range(0, num_thread), "threadIdx.y");
-  } else {
-    all_reduce = true;
-    num_thread = target->GetAttr<Integer>("max_num_threads").value().IntValue();
-    thread_x = tvm::te::thread_axis(Range(0, num_thread), "threadIdx.x");
-  }
-
-  auto fused_reduce = detail::Fuse(out_stage, out_stage->op.as<ComputeOpNode>()->reduce_axis);
-
-  IterVar ko, ki;
-  out_stage.split(fused_reduce, num_thread, &ko, &ki);
-  auto data_out_rf = sch.rfactor(data_out, ki)[0];
-  auto tx = out_stage->op.as<ComputeOpNode>()->reduce_axis[0];
-  out_stage.bind(tx, thread_x);
-  sch[data_out_rf].compute_at(out_stage, tx);
-
-  Tensor real_output;
-  Tensor temp_idx_input, temp_val_input;
-  if (is_idx_reduce) {
-    real_output = op.output(0);
-    temp_idx_input = data_out->op.output(0);
-    temp_val_input = data_out->op.output(1);
-  } else {
-    real_output = data_out;
-  }
-
-  auto stage_real = sch[real_output];
-  if (!all_reduce) {
-    // Fuse and split the axis
-    auto fused_outer = detail::Fuse(stage_real, stage_real->op.as<ComputeOpNode>()->axis);
-    IterVar bx, outer_in;
-    stage_real.split(fused_outer, num_thread, &bx, &outer_in);
-
-    // Bind the axes to threads and blocks
-    stage_real.bind(outer_in, thread_y);
-    stage_real.bind(bx, block_x);
-    if (is_idx_reduce) {
-      sch[temp_idx_input].compute_at(stage_real, outer_in);
-      sch[temp_val_input].compute_at(stage_real, outer_in);
-    }
-  } else {
-    if (is_idx_reduce) {
-      sch[temp_idx_input].compute_at(stage_real, stage_real->op.as<ComputeOpNode>()->axis[0]);
-      sch[temp_val_input].compute_at(stage_real, stage_real->op.as<ComputeOpNode>()->axis[0]);
-    }
-  }
-
-  stage_real.set_store_predicate(static_cast<PrimExpr>(thread_x) == 0);
-  return sch;
-}
-
-/*!
- * \brief Recursively traverse operator inputs, setting injective inputs
- * to be computed inline.
- *
- * \param s The schedule we are building
- * \param op The current op in the traversal
- */
-void TraverseBeforeReduce(Schedule s, Operation op) {
-  if (op->IsInstance<PlaceholderOpNode>()) {
-    return;
-  } else if (is_injective(op->tag)) {
-    s[op].compute_inline();
-    for (auto tensor : op->InputTensors()) {
-      TraverseBeforeReduce(s, tensor->op);
-    }
-  } else {
-    LOG(ERROR) << "Unsupported operator " << op->tag;
-  }
-}
-
-/*!
- * \brief Schedule a reduce op, then invoke TraverseBeforeReduce on each
- * of the op's inputs.
- *
- * \param target The target to generate a schedule for.
- * \param s The schedule we are building
- * \param op The reduce op
- */
-void TraverseAfterReduce(const Target& target, Schedule s, Operation op) {
-  if (is_broadcast(op->tag)) {
-    LOG(ERROR) << "Elementwise op after reduce is not yet supported";
-  } else if (op->tag == kCommReduce) {
-    ScheduleReduce(target, op, s, false);
-    for (auto tensor : op->InputTensors()) {
-      TraverseBeforeReduce(s, tensor->op);
-    }
-  } else if (op->tag == kCommReduceIdx) {
-    ScheduleReduce(target, op, s, true);
-    for (auto tensor : op->InputTensors()[0]->op->InputTensors()) {
-      TraverseBeforeReduce(s, tensor->op);
-    }
-  } else {
-    LOG(ERROR) << "Unsupported operator " << op->tag;
-  }
-}
-
-/*!
- * \brief Create a CUDA schedule for a reduce operation.
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-Schedule schedule_reduce(const Target& target, Array<Tensor> outs) {
-  ICHECK_EQ(outs.size(), 1) << "outs must have size 1";
-  Array<Operation> out_ops;
-  for (auto t : outs) {
-    out_ops.push_back(t->op);
-  }
-  auto s = create_schedule(out_ops);
-  TraverseAfterReduce(target, s, outs[0]->op);
-  return s;
-}
-
-}  // namespace cuda
-}  // namespace topi
-}  // namespace tvm
-#endif  // TVM_TOPI_CUDA_REDUCTION_H_
diff --git a/include/tvm/topi/cuda/softmax.h b/include/tvm/topi/cuda/softmax.h
deleted file mode 100644
index 19613cbbdf19..000000000000
--- a/include/tvm/topi/cuda/softmax.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file cuda/injective.h
- * \brief CUDA schedule for injective operations
- */
-#ifndef TVM_TOPI_CUDA_SOFTMAX_H_
-#define TVM_TOPI_CUDA_SOFTMAX_H_
-
-#include <tvm/target/generic_func.h>
-#include <tvm/te/operation.h>
-#include <tvm/te/schedule_pass.h>
-#include <tvm/topi/detail/fuse.h>
-#include <tvm/topi/tags.h>
-
-namespace tvm {
-namespace topi {
-
-using namespace tvm::te;
-
-namespace cuda {
-
-/*!
- * \brief Create a CUDA schedule for the given softmax output tensors.
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule schedule_softmax(const Target& target, const Array<Tensor>& outs) {
-  Array<Operation> out_ops;
-  for (auto t : outs) {
-    out_ops.push_back(t->op);
-  }
-  auto s = create_schedule(out_ops);
-
-  auto softmax = outs[0];
-  tvm::te::Tensor max_elem;
-  tvm::te::Tensor expsum;
-  tvm::te::Tensor exp;
-  bool has_exp = false;
-
-  auto tag = softmax->op.as<ComputeOpNode>()->tag;
-  if (tag == "softmax_output") {
-    expsum = softmax->op->InputTensors()[1];
-    exp = softmax->op->InputTensors()[0];
-    max_elem = s[exp]->op->InputTensors()[1];
-    has_exp = true;
-  } else if (tag == "log_softmax_output") {
-    max_elem = softmax->op->InputTensors()[1];
-    expsum = softmax->op->InputTensors()[2];
-  } else {
-    LOG(ERROR) << "Tag is expected to be softmax_output or log_softmax_output. Got " << tag;
-  }
-
-  int num_thread = 64;
-  auto block_x = tvm::te::thread_axis(Range(), "blockIdx.x");
-  auto thread_x = tvm::te::thread_axis(Range(0, num_thread), "threadIdx.x");
-
-  if (has_exp) {
-    s[exp].bind(exp->op.as<ComputeOpNode>()->axis[0], block_x);
-  }
-
-  s[max_elem].bind(max_elem->op.as<ComputeOpNode>()->axis[0], block_x);
-
-  auto k = expsum->op.as<ComputeOpNode>()->reduce_axis[0];
-  IterVar ko, ki;
-  s[expsum].split(k, num_thread, &ko, &ki);
-  auto EF = s.rfactor(expsum, ki)[0];
-  s[expsum].bind(s[expsum]->op.as<ComputeOpNode>()->axis[0], block_x);
-  s[expsum].bind(s[expsum]->op.as<ComputeOpNode>()->reduce_axis[0], thread_x);
-  s[EF].compute_at(s[expsum], s[expsum]->op.as<ComputeOpNode>()->reduce_axis[0]);
-  s[expsum].set_store_predicate(thread_x->var == 0);
-
-  IterVar tx, xi;
-  s[softmax].split_by_nparts(softmax->op.as<ComputeOpNode>()->axis[1], num_thread, &tx, &xi);
-  s[softmax].bind(tx, thread_x);
-
-  return s;
-}
-
-}  // namespace cuda
-}  // namespace topi
-}  // namespace tvm
-#endif  // TVM_TOPI_CUDA_SOFTMAX_H_
diff --git a/include/tvm/topi/generic/default.h b/include/tvm/topi/generic/default.h
deleted file mode 100644
index 752b6ad1537e..000000000000
--- a/include/tvm/topi/generic/default.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file generic/default.h
- * \brief Generic default schedule
- */
-#ifndef TVM_TOPI_GENERIC_DEFAULT_H_
-#define TVM_TOPI_GENERIC_DEFAULT_H_
-
-#include <tvm/target/generic_func.h>
-#include <tvm/te/operation.h>
-#include <tvm/te/schedule_pass.h>
-#include <tvm/topi/detail/fuse.h>
-#include <tvm/topi/tags.h>
-
-namespace tvm {
-namespace topi {
-
-using namespace tvm::te;
-
-namespace generic {
-/*!
- * \brief Create a generic default schedule for the given output tensors.
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule default_schedule(const Target& target, const Array<Tensor>& outs) {
-  Array<Operation> out_ops;
-  for (auto t : outs) {
-    out_ops.push_back(t->op);
-  }
-  auto s = create_schedule(out_ops);
-  return s;
-}
-
-/*!
- * \brief Create a generic default schedule for the given output tensors, and apply
- * auto inline
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule default_schedule_auto_inline(const Target& target, const Array<Tensor>& outs) {
-  Array<Operation> out_ops;
-  for (auto t : outs) {
-    out_ops.push_back(t->op);
-  }
-  auto s = create_schedule(out_ops);
-  auto x = outs[0];
-  tvm::te::AutoInlineInjective(s);
-  auto axis = s[x]->op.as<ComputeOpNode>()->axis;
-  if (axis.size() > 0) {
-    detail::Fuse(s[x], axis);
-  }
-  return s;
-}
-
-}  // namespace generic
-}  // namespace topi
-}  // namespace tvm
-#endif  // TVM_TOPI_GENERIC_DEFAULT_H_
diff --git a/include/tvm/topi/generic/extern.h b/include/tvm/topi/generic/extern.h
deleted file mode 100644
index 0f1f408fdc5f..000000000000
--- a/include/tvm/topi/generic/extern.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file generic/extern.h
- * \brief Schedule for extern followed by injective ops
- */
-#ifndef TVM_TOPI_GENERIC_EXTERN_H_
-#define TVM_TOPI_GENERIC_EXTERN_H_
-
-#include <tvm/target/generic_func.h>
-#include <tvm/te/operation.h>
-#include <tvm/te/schedule_pass.h>
-#include <tvm/topi/detail/fuse.h>
-#include <tvm/topi/generic/injective.h>
-#include <tvm/topi/tags.h>
-
-namespace tvm {
-namespace topi {
-
-using namespace tvm::te;
-
-namespace generic {
-/*!
- * \brief Schedule an extern op followed by injective operations
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the op.
- */
-inline Schedule schedule_extern(const Target& target, const Array<Tensor>& outs) {
-  Array<Operation> out_ops;
-  for (auto t : outs) {
-    out_ops.push_back(t->op);
-  }
-  auto s = create_schedule(out_ops);
-
-  tvm::te::AutoInlineInjective(s);
-  for (auto out : outs) {
-    if (out->op->IsInstance<ExternOpNode>()) {
-      continue;
-    }
-    tvm::GenericFunc::Get("schedule_injective_from_existing")(s, out);
-  }
-
-  return s;
-}
-
-}  // namespace generic
-}  // namespace topi
-}  // namespace tvm
-#endif  // TVM_TOPI_GENERIC_EXTERN_H_
diff --git a/include/tvm/topi/generic/injective.h b/include/tvm/topi/generic/injective.h
deleted file mode 100644
index c48c03eee065..000000000000
--- a/include/tvm/topi/generic/injective.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file generic/injective.h
- * \brief Generic schedule for injective operations
- */
-#ifndef TVM_TOPI_GENERIC_INJECTIVE_H_
-#define TVM_TOPI_GENERIC_INJECTIVE_H_
-
-#include <tvm/target/generic_func.h>
-#include <tvm/te/operation.h>
-#include <tvm/te/schedule_pass.h>
-#include <tvm/topi/detail/fuse.h>
-#include <tvm/topi/tags.h>
-
-namespace tvm {
-namespace topi {
-
-using namespace tvm::te;
-
-namespace generic {
-
-/*!
- * \brief Updates an existing schedule for the given injective ops.
- *
- * \param sch The schedule to update.
- * \param out The tensor representing the injective op.
- *
- * \return The updated schedule.
- */
-inline Schedule schedule_injective_from_existing(Schedule sch, const Tensor& out) {
-  detail::Fuse(sch[out], sch[out]->op.as<ComputeOpNode>()->axis);
-  return sch;
-}
-
-/*!
- * \brief Create a generic schedule for the given injective ops.
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule schedule_injective(const Target& target, const Array<Tensor>& outs) {
-  Array<Operation> out_ops;
-  for (auto t : outs) {
-    out_ops.push_back(t->op);
-  }
-  auto s = create_schedule(out_ops);
-  tvm::te::AutoInlineInjective(s);
-  auto x = outs[0];
-  schedule_injective_from_existing(s, x);
-
-  return s;
-}
-
-}  // namespace generic
-}  // namespace topi
-}  // namespace tvm
-#endif  // TVM_TOPI_GENERIC_INJECTIVE_H_
diff --git a/include/tvm/topi/rocm/dense.h b/include/tvm/topi/rocm/dense.h
deleted file mode 100644
index b861e6c89a67..000000000000
--- a/include/tvm/topi/rocm/dense.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file rocm/dense.h
- * \brief rocm schedule for dense operation
- */
-#ifndef TVM_TOPI_ROCM_DENSE_H_
-#define TVM_TOPI_ROCM_DENSE_H_
-
-#include <tvm/target/generic_func.h>
-#include <tvm/te/operation.h>
-#include <tvm/topi/contrib/rocblas.h>
-#include <tvm/topi/cuda/dense.h>
-#include <tvm/topi/detail/array_utils.h>
-#include <tvm/topi/generic/extern.h>
-#include <tvm/topi/nn/dense.h>
-#include <tvm/topi/tags.h>
-
-namespace tvm {
-namespace topi {
-
-using namespace tvm::te;
-
-namespace rocm {
-/*!
- * \brief Implementation of dense for rocm backend
- *
- * \param target The target device
- * \param data Tensor with shape [batch, in_dim]
- * \param weight Tensor with shape [out_dim, in_dim]
- * \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor()
- * \param out_dtype Output data type. Used for mixed precision.
- *
- * \return Tensor with shape [batch, out_dim]
- */
-inline tvm::te::Tensor dense_rocm(const Target& target, const tvm::te::Tensor& data,
-                                  const tvm::te::Tensor& weight, const tvm::te::Tensor& bias,
-                                  const DataType& out_dtype) {
-  ICHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data";
-  ICHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight";
-  if (bias.defined()) {
-    ICHECK_EQ(bias->shape.size(), 1) << "dense requires 1-D bias";
-  }
-
-  auto batch = data->shape[0];
-  auto in_dim = data->shape[1];
-  auto out_dim = weight->shape[0];
-
-  if (target->GetLibs().count("rocblas")) {
-    ICHECK_EQ(data->dtype, out_dtype) << "Mixed precision not supported.";
-    auto mm = topi::contrib::rocblas_matmul(data, weight, false, true);
-    if (bias.defined()) {
-      mm = tvm::te::compute(
-          {batch, out_dim}, [&](Var i, Var j) { return mm(i, j) + bias(j); }, "tensor", kBroadcast);
-    }
-
-    return mm;
-  } else {
-    return topi::nn::dense(data, weight, bias, out_dtype);
-  }
-}
-
-/*!
- * \brief Create a rocm schedule for dense
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule schedule_dense(const Target& target, const Array<Tensor>& outs) {
-  if (target->kind->name == "rocm" && target->GetLibs().count("rocblas")) {
-    return topi::generic::schedule_extern(target, outs);
-  }
-
-  return topi::cuda::schedule_dense(target, outs);
-}
-
-}  // namespace rocm
-}  // namespace topi
-}  // namespace tvm
-#endif  // TVM_TOPI_ROCM_DENSE_H_
diff --git a/include/tvm/topi/rocm/injective.h b/include/tvm/topi/rocm/injective.h
deleted file mode 100644
index 295d930e5cd8..000000000000
--- a/include/tvm/topi/rocm/injective.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file rocm/injective.h
- * \brief rocm schedule for injective operations
- */
-#ifndef TVM_TOPI_ROCM_INJECTIVE_H_
-#define TVM_TOPI_ROCM_INJECTIVE_H_
-
-#include <tvm/target/generic_func.h>
-#include <tvm/te/operation.h>
-#include <tvm/topi/cuda/injective.h>
-#include <tvm/topi/detail/fuse.h>
-#include <tvm/topi/tags.h>
-
-namespace tvm {
-namespace topi {
-
-using namespace tvm::te;
-
-namespace rocm {
-
-/*!
- * \brief Updates an existing schedule for the given injective ops.
- *
- * \param sch The schedule to update.
- * \param out The tensor representing the injective op.
- *
- * \return The updated schedule.
- */
-inline Schedule schedule_injective_from_existing(Schedule sch, const Tensor& out) {
-  return topi::cuda::schedule_injective_from_existing(sch, out);
-}
-
-/*!
- * \brief Create a rocm schedule for the given output tensors.
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule schedule_injective(const Target& target, const Array<Tensor>& outs) {
-  return topi::cuda::schedule_injective(target, outs);
-}
-
-}  // namespace rocm
-}  // namespace topi
-}  // namespace tvm
-#endif  // TVM_TOPI_ROCM_INJECTIVE_H_
diff --git a/include/tvm/topi/rocm/pooling.h b/include/tvm/topi/rocm/pooling.h
deleted file mode 100644
index 993c32bf36ad..000000000000
--- a/include/tvm/topi/rocm/pooling.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file rocm/pooling.h
- * \brief rocm schedule for pooling operations
- */
-#ifndef TVM_TOPI_ROCM_POOLING_H_
-#define TVM_TOPI_ROCM_POOLING_H_
-
-#include <tvm/target/generic_func.h>
-#include <tvm/te/operation.h>
-#include <tvm/topi/cuda/pooling.h>
-#include <tvm/topi/detail/array_utils.h>
-#include <tvm/topi/detail/fuse.h>
-#include <tvm/topi/tags.h>
-
-namespace tvm {
-namespace topi {
-
-using namespace tvm::te;
-
-namespace rocm {
-
-/*!
- * \brief Create a rocm schedule for pool
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule schedule_pool(const Target& target, const Array<Tensor>& outs) {
-  return topi::cuda::schedule_pool(target, outs);
-}
-
-/*!
- * \brief Create a rocm schedule for global_pool
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule schedule_global_pool(const Target& target, const Array<Tensor>& outs) {
-  return topi::cuda::schedule_global_pool(target, outs);
-}
-
-}  // namespace rocm
-}  // namespace topi
-}  // namespace tvm
-#endif  // TVM_TOPI_ROCM_POOLING_H_
diff --git a/include/tvm/topi/rocm/softmax.h b/include/tvm/topi/rocm/softmax.h
deleted file mode 100644
index a2ffd2c46e66..000000000000
--- a/include/tvm/topi/rocm/softmax.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file rocm/injective.h
- * \brief ROCM schedule for injective operations
- */
-#ifndef TVM_TOPI_ROCM_SOFTMAX_H_
-#define TVM_TOPI_ROCM_SOFTMAX_H_
-
-#include <tvm/target/generic_func.h>
-#include <tvm/te/operation.h>
-#include <tvm/topi/cuda/softmax.h>
-#include <tvm/topi/detail/fuse.h>
-#include <tvm/topi/tags.h>
-
-namespace tvm {
-namespace topi {
-
-using namespace tvm::te;
-
-namespace rocm {
-
-/*!
- * \brief Create a rocm schedule for the given softmax output tensors.
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule schedule_softmax(const Target& target, const Array<Tensor>& outs) {
-  return topi::cuda::schedule_softmax(target, outs);
-}
-
-}  // namespace rocm
-}  // namespace topi
-}  // namespace tvm
-#endif  // TVM_TOPI_ROCM_SOFTMAX_H_
diff --git a/include/tvm/topi/x86/bnn.h b/include/tvm/topi/x86/bnn.h
deleted file mode 100644
index c8a7235536b5..000000000000
--- a/include/tvm/topi/x86/bnn.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file x86/bnn.h
- * \brief x86 schedule for binary operations
- */
-#ifndef TVM_TOPI_X86_BNN_H_
-#define TVM_TOPI_X86_BNN_H_
-
-#include <tvm/target/generic_func.h>
-#include <tvm/te/operation.h>
-#include <tvm/topi/detail/fuse.h>
-#include <tvm/topi/tags.h>
-
-namespace tvm {
-namespace topi {
-
-using namespace tvm::te;
-
-namespace x86 {
-/*!
- * \brief Create a generic schedule for binarize_pack
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule schedule_binarize_pack(const Target& target, const Array<Tensor>& outs) {
-  Array<Operation> out_ops;
-  for (auto t : outs) {
-    out_ops.push_back(t->op);
-  }
-  auto s = create_schedule(out_ops);
-
-  auto _schedule = [&](const Tensor& out) {
-    s[out].parallel(out->op.as<ComputeOpNode>()->axis[0]);
-  };
-
-  std::function<void(Operation)> traverse;
-  traverse = [&](const Operation& op) {
-    if (op->tag == "binarize_pack") {
-      _schedule(op.output(0));
-    } else {
-      LOG(ERROR) << "Unsupported operator " << op->tag;
-    }
-  };
-
-  traverse(outs[0]->op);
-  return s;
-}
-
-/*!
- * \brief Create a generic schedule for binary_dense
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule schedule_binary_dense(const Target& target, const Array<Tensor>& outs) {
-  Array<Operation> out_ops;
-  for (auto t : outs) {
-    out_ops.push_back(t->op);
-  }
-  auto s = create_schedule(out_ops);
-
-  auto _schedule = [&](const Tensor& A, const Tensor& B, const Tensor& C) {
-    IterVar co, ci;
-    s[C].split(s[C]->op.as<ComputeOpNode>()->reduce_axis[0], 8, &co, &ci);
-    s[C].parallel(s[C]->op.as<ComputeOpNode>()->axis[0]);
-
-    Tensor out;
-    if (detail::contains(s->outputs, C->op)) {
-      out = C;
-    } else {
-      out = outs[0]->op.output(0);
-    }
-
-    IterVar xo, xi;
-    s[out].split(out->op.as<ComputeOpNode>()->axis[1], 8, &xo, &xi);
-    s[out].vectorize(xi);
-  };
-
-  std::function<void(Operation)> traverse;
-  traverse = [&](const Operation& op) {
-    // Inline all one-to-one-mapping operators except the last stage (output)
-    if (is_broadcast(op->tag)) {
-      if (!detail::contains(s->outputs, op)) {
-        s[op].compute_inline();
-      }
-      for (auto tensor : op->InputTensors()) {
-        if (tensor->op->InputTensors().size() > 0) {
-          traverse(tensor->op);
-        }
-      }
-    } else if (op->tag == "binary_dense") {
-      auto output = op.output(0);
-      auto data = op->InputTensors()[0];
-      auto weight = op->InputTensors()[1];
-      _schedule(data, weight, output);
-    } else {
-      LOG(ERROR) << "Unsupported operator " << op->tag;
-    }
-  };
-
-  traverse(outs[0]->op);
-  return s;
-}
-
-}  // namespace x86
-}  // namespace topi
-}  // namespace tvm
-#endif  // TVM_TOPI_X86_BNN_H_
diff --git a/include/tvm/topi/x86/default.h b/include/tvm/topi/x86/default.h
deleted file mode 100644
index 9c9856040261..000000000000
--- a/include/tvm/topi/x86/default.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file x86/default.h
- * \brief default x86 schedule
- */
-#ifndef TVM_TOPI_X86_DEFAULT_H_
-#define TVM_TOPI_X86_DEFAULT_H_
-
-#include <tvm/target/generic_func.h>
-#include <tvm/te/operation.h>
-#include <tvm/te/schedule_pass.h>
-#include <tvm/topi/detail/fuse.h>
-#include <tvm/topi/tags.h>
-
-namespace tvm {
-namespace topi {
-
-using namespace tvm::te;
-
-namespace x86 {
-/*!
- * \brief Helper to create a default x86 schedule for the given ops.
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- * \param auto_inline Whether to apply the auto inline step.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule MakeDefaultSchedule(const Target& target, const Array<Tensor>& outs,
-                                    bool auto_inline) {
-  Array<Operation> out_ops;
-  for (auto t : outs) {
-    out_ops.push_back(t->op);
-  }
-  auto s = create_schedule(out_ops);
-  auto x = outs[0];
-  auto axis = s[x]->op.as<ComputeOpNode>()->axis;
-
-  if (auto_inline) {
-    tvm::te::AutoInlineInjective(s);
-    if (axis.size() > 0) {
-      detail::Fuse(s[x], axis);
-    }
-    return s;
-  }
-
-  if (axis.size() == 4) {
-    auto n = axis[0];
-    auto c = axis[1];
-    auto fused = detail::Fuse(s[x], {n, c});  // for nhwc layout, fuse n and h
-    s[x].parallel(fused);
-  } else {
-    s[x].parallel(axis[0]);
-  }
-
-  return s;
-}
-
-/*!
- * \brief Create a default x86 schedule for the given ops.
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule default_schedule(const Target& target, const Array<Tensor>& outs) {
-  return MakeDefaultSchedule(target, outs, false);
-}
-
-/*!
- * \brief Create a default x86 schedule for the given ops, with auto inline
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule default_schedule_auto_inline(const Target& target, const Array<Tensor>& outs) {
-  return MakeDefaultSchedule(target, outs, true);
-}
-
-}  // namespace x86
-}  // namespace topi
-}  // namespace tvm
-#endif  // TVM_TOPI_X86_DEFAULT_H_
diff --git a/include/tvm/topi/x86/injective.h b/include/tvm/topi/x86/injective.h
deleted file mode 100644
index cc1f48c86005..000000000000
--- a/include/tvm/topi/x86/injective.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file x86/injective.h
- * \brief x86 schedule for injective ops
- */
-#ifndef TVM_TOPI_X86_INJECTIVE_H_
-#define TVM_TOPI_X86_INJECTIVE_H_
-
-#include <tvm/target/generic_func.h>
-#include <tvm/te/operation.h>
-#include <tvm/topi/detail/fuse.h>
-#include <tvm/topi/tags.h>
-
-namespace tvm {
-namespace topi {
-
-using namespace tvm::te;
-
-namespace x86 {
-
-/*!
- * \brief Updates an existing schedule for the given injective ops.
- *
- * \param sch The schedule to update.
- * \param out The tensor representing the injective op.
- *
- * \return The updated schedule.
- */
-inline Schedule schedule_injective_from_existing(Schedule sch, const Tensor& out) {
-  auto axis = sch[out]->op.as<ComputeOpNode>()->axis;
-  if (axis.size() == 4) {
-    auto n = axis[0];
-    auto c = axis[1];
-    auto fused = detail::Fuse(sch[out], {n, c});  // for nhwc layout, fuse n and h
-    sch[out].parallel(fused);
-  } else if (!axis.empty()) {
-    sch[out].parallel(axis[0]);
-  }
-  return sch;
-}
-
-/*!
- * \brief Create an x86 schedule for the given injective ops.
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-inline Schedule schedule_injective(const Target& target, const Array<Tensor>& outs) {
-  Array<Operation> out_ops;
-  for (auto t : outs) {
-    out_ops.push_back(t->op);
-  }
-  auto s = create_schedule(out_ops);
-  tvm::te::AutoInlineInjective(s);
-
-  auto x = outs[0];
-  schedule_injective_from_existing(s, x);
-
-  return s;
-}
-
-}  // namespace x86
-}  // namespace topi
-}  // namespace tvm
-#endif  // TVM_TOPI_X86_INJECTIVE_H_
diff --git a/python/tvm/te/__init__.py b/python/tvm/te/__init__.py
index e7b394ebf76c..b31853bea666 100644
--- a/python/tvm/te/__init__.py
+++ b/python/tvm/te/__init__.py
@@ -35,4 +35,4 @@
 from .operation import create_prim_func
 from .operation import extern_primfunc
 
-from .tensor import PlaceholderOp, ComputeOp, TensorComputeOp, ScanOp, ExternOp
+from .tensor import PlaceholderOp, ComputeOp, ScanOp, ExternOp
diff --git a/python/tvm/te/operation.py b/python/tvm/te/operation.py
index a9681c6df040..8d72fc794011 100644
--- a/python/tvm/te/operation.py
+++ b/python/tvm/te/operation.py
@@ -130,26 +130,10 @@ def compute(shape, fcompute, name="compute", tag="", attrs=None, varargs_names=N
     dim_var = [tvm.tir.IterVar((0, s), x, 0) for x, s in zip(arg_names, shape[:out_ndim])]
     body = fcompute(*[v.var for v in dim_var])
 
-    if isinstance(body, _tensor.TensorIntrinCall):
-        for i, s in enumerate(shape[out_ndim:]):
-            var_name = "ax" + str(i)
-            dim_var.append(tvm.tir.IterVar((0, s), var_name, 4))
-        op_node = _ffi_api.TensorComputeOp(
-            name,
-            tag,
-            dim_var,
-            body.reduce_axis,
-            out_ndim,
-            body.intrin,
-            body.tensors,
-            body.regions,
-            body.scalar_inputs,
-        )
-    else:
-        if not isinstance(body, (list, tuple)):
-            body = [body]
-        body = convert(body)
-        op_node = _ffi_api.ComputeOp(name, tag, attrs, dim_var, body)
+    if not isinstance(body, (list, tuple)):
+        body = [body]
+    body = convert(body)
+    op_node = _ffi_api.ComputeOp(name, tag, attrs, dim_var, body)
 
     num = op_node.num_outputs
     outputs = tuple(op_node.output(i) for i in range(num))
diff --git a/python/tvm/te/tensor.py b/python/tvm/te/tensor.py
index 53ab9d0b5b59..70c1287cc844 100644
--- a/python/tvm/te/tensor.py
+++ b/python/tvm/te/tensor.py
@@ -48,11 +48,6 @@ def dtype(self):
         return self.tensor.dtype
 
 
-@tvm._ffi.register_object
-class TensorIntrinCall(Object):
-    """Intermediate structure for calling a tensor intrinsic."""
-
-
 @tvm._ffi.register_object
 class Tensor(DataProducer, _expr.ExprOp):
     """Tensor object, to construct, see function.Tensor"""
@@ -172,11 +167,6 @@ class ComputeOp(BaseComputeOp):
     """Scalar operation."""
 
 
-@tvm._ffi.register_object
-class TensorComputeOp(BaseComputeOp):
-    """Tensor operation."""
-
-
 @tvm._ffi.register_object
 class ScanOp(Operation):
     """Scan operation."""
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index 86c1d44f0108..83ed8d261d2f 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -288,54 +288,6 @@ IRModule ApplyPasses(IRModule mod, transform::Sequential seq) {
   return mod;
 }
 
-// Convert te schedule to IRModule
-IRModule ScheduleToModule(te::Schedule sch, const Array<ObjectRef>& args, const std::string& name,
-                          const std::unordered_map<te::Tensor, tir::Buffer>& binds,
-                          GlobalVarSupply global_var_supply) {
-  sch = sch.normalize();
-
-  transform::PassContext pass_ctx = transform::PassContext::Current();
-  bool debug_keep_trivial_loop =
-      pass_ctx->GetConfig<Bool>("tir.debug_keep_trivial_loop", Bool(false)).value();
-
-  // Before TIR transformation.
-  tir::Stmt stmt = te::ScheduleOps(sch, te::InferBound(sch), debug_keep_trivial_loop);
-  bool compact = te::VerifyCompactBuffer(stmt);
-
-  Map<te::Tensor, tir::Buffer> out_binds;
-  Array<ObjectRef> out_arg_list;
-  GetBinds(args, compact, binds, &out_binds, &out_arg_list);
-
-  // Build the function, converting from te::Tensor to tir::Buffer
-  tir::PrimFunc f = te::SchedulePostProcToPrimFunc(out_arg_list, std::move(stmt), out_binds);
-  f = WithAttr(std::move(f), "global_symbol", runtime::String(name));
-
-  // Mark this schedule as being converted from an TE schedule. Makes sure that
-  // the correct TE passes are run.
-  f = WithAttr(std::move(f), "from_legacy_te_schedule", Bool(true));
-
-  bool noalias = pass_ctx->GetConfig<Bool>("tir.noalias", Bool(true)).value();
-
-  if (noalias) {
-    f = WithAttr(std::move(f), "tir.noalias", Bool(true));
-  }
-  GlobalVar global_var = global_var_supply->UniqueGlobalFor(name, false);
-  return IRModule(Map<GlobalVar, BaseFunc>({{global_var, f}}));
-}
-
-TVM_REGISTER_GLOBAL("driver.schedule_to_module")
-    .set_body_typed([](te::Schedule sch, const Array<ObjectRef>& args, const String& name,
-                       const Map<te::Tensor, tir::Buffer>& binds) {
-      std::unordered_map<te::Tensor, tir::Buffer> c_binds;
-      // Check to make sure binds is not null before doing the conversion;
-      if (binds.defined()) {
-        for (auto kv : binds) {
-          c_binds.insert({kv.first, kv.second});
-        }
-      }
-      IRModule mod = ScheduleToModule(std::move(sch), args, name, c_binds, GlobalVarSupply());
-      return mod;
-    });
 
 IRModule LowerModule(IRModule mod, bool simple_mode) {
   Array<transform::Pass> pass_list = CreatePassList(simple_mode);
@@ -367,38 +319,6 @@ TVM_REGISTER_GLOBAL("driver.lower_primfunc")
       return LowerPrimFunc(std::move(func), name, simple_mode);
     });
 
-IRModule LowerSchedule(te::Schedule sch, const Array<te::Tensor>& args, const std::string& name,
-                       const std::unordered_map<te::Tensor, tir::Buffer>& binds,
-                       GlobalVarSupply global_var_supply, bool simple_mode) {
-  Array<ObjectRef> ref_args;
-  for (ObjectRef x : args) {
-    ref_args.push_back(x);
-  }
-  return LowerSchedule(std::move(sch), ref_args, name, binds, global_var_supply, simple_mode);
-}
-
-IRModule LowerSchedule(te::Schedule sch, const Array<ObjectRef>& args, const std::string& name,
-                       const std::unordered_map<te::Tensor, tir::Buffer>& binds,
-                       GlobalVarSupply global_var_supply, bool simple_mode) {
-  IRModule mod = ScheduleToModule(std::move(sch), args, name, binds, global_var_supply);
-  // Get the legacy TE pass list
-  Array<transform::Pass> pass_list = CreatePassList(simple_mode);
-  return LowerWithPassList(mod, pass_list);
-}
-
-TVM_REGISTER_GLOBAL("driver.lower_schedule")
-    .set_body_typed([](te::Schedule sch, const Array<ObjectRef>& args, const String& name,
-                       const Map<te::Tensor, tir::Buffer>& binds, bool simple_mode) {
-      std::unordered_map<te::Tensor, tir::Buffer> c_binds;
-      // Check to make sure binds is not null before doing the conversion;
-      if (binds.get() != nullptr) {
-        for (auto kv : binds) {
-          c_binds.insert({kv.first, kv.second});
-        }
-      }
-      return LowerSchedule(std::move(sch), args, name, c_binds, GlobalVarSupply(), simple_mode);
-    });
-
 /**
  * This function takes the input module that contains both the device and host opts.
  * Then, it applies transformation on the original module before splitting into separate modules for
diff --git a/src/relax/analysis/layout_transformation.cc b/src/relax/analysis/layout_transformation.cc
index 25ce1a58f276..f0658dabb398 100644
--- a/src/relax/analysis/layout_transformation.cc
+++ b/src/relax/analysis/layout_transformation.cc
@@ -26,6 +26,7 @@
 #include <tvm/arith/iter_affine_map.h>
 #include <tvm/relax/analysis.h>
 #include <tvm/tir/analysis.h>
+#include <tvm/tir/index_map.h>
 #include <tvm/tir/stmt_functor.h>
 
 #include "../../support/array.h"
diff --git a/src/target/codegen.cc b/src/target/codegen.cc
index 49c250b8dc64..e5e2c4297c8e 100644
--- a/src/target/codegen.cc
+++ b/src/target/codegen.cc
@@ -43,17 +43,6 @@
 namespace tvm {
 namespace codegen {
 
-/*!
- * \brief TIRToRuntime conversion specific to a TargetKind
- *
- * This function is responsible for scanning an IRModule for appropriate Target-specific functions
- and generating a Runtime module representing the compiled output
- *
- * \param ir_module Unified IRModule
- * \param target Target to filter on or retrieve arguments from
- * \return Runtime Module containing compiled functions
- */
-using FTVMTIRToRuntime = tvm::runtime::TypedPackedFunc<runtime::Module(IRModule, Target)>;
 
 runtime::Module Build(IRModule mod, Target target) {
   if (transform::PassContext::Current()
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 1f9cc09b1e2f..043fc26daf91 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -1819,7 +1819,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const BufferLoadNode* op) {
   auto make_load = [this, &loads](TypedPointer buffer_ptr, int /* subelement_i */,
                                   llvm::Value* predicate, int alignment, bool is_volatile) {
     llvm::Instruction* load = nullptr;
-    if (predicate != NULL) {
+    if (predicate != nullptr) {
       ICHECK(!is_volatile)
           << "The masked load intrinsic does not support declaring load as volatile.";
 #if TVM_LLVM_VERSION >= 130
@@ -1972,7 +1972,7 @@ void CodeGenLLVM::VisitStmt_(const BufferStoreNode* op) {
       to_store = builder_->CreateExtractElement(value, subelement_i);
     }
 
-    if (predicate != NULL) {
+    if (predicate != nullptr) {
       ICHECK(!is_volatile)
           << "The masked store intrinsic does not support declaring store as volatile.";
 #if TVM_LLVM_VERSION >= 110
diff --git a/src/te/autodiff/ad_simplify.cc b/src/te/autodiff/ad_simplify.cc
deleted file mode 100644
index 26047e879e9b..000000000000
--- a/src/te/autodiff/ad_simplify.cc
+++ /dev/null
@@ -1,1239 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file ad_simplify.cc
- * \brief Simplify tensor compute generated by tensor-level autodiff.
- *
- * The major simplification we do in this file is to eliminate
- * the Jacobian tensor created by autodiff.
- *
- * Jacobian tensor is sparse because one output element usually relates
- * to a small portion of the inputs. For example, element-wise function has a one-to-one mapping
- * between input tensor and output tensor, thus the Jacobian is diagonal.
- *
- * Generally, we have Out_{\beta} = f( In_{A \alpha} ) in which A is a matrix,
- * \alpha and \beta are vectors represent the indices of In and Out respectively.
- * i.e., the non-zero Jacobian indices is a linear combination of the input indices.
- * Thereby we solve linear equations of \beta = A \alpha,
- * as well as linear inequalities of their domain ranges.
- *
- * Refer to Urban S, van der Smagt P. Automatic differentiation for tensor algebras[J].
- * arXiv preprint arXiv:1711.01348, 2017. for more details.
- *
- * Implement-wise, we extract the equations in the compute definition via NonzeronessCondition,
- * replace the compute expression with solved new axes, and create a selection node
- * (non-zero-condition ? new_compute_expression : 0).
- *
- * Due to TVM's restriction, we also lift the reduction to the top of the compute stage.
- *
- */
-#include <tvm/arith/analyzer.h>
-#include <tvm/arith/int_solver.h>
-#include <tvm/runtime/registry.h>
-#include <tvm/te/autodiff.h>
-#include <tvm/tir/analysis.h>
-#include <tvm/tir/stmt_functor.h>
-
-#include <iterator>
-#include <memory>
-#include <optional>
-#include <utility>
-
-#include "ad_utils.h"
-
-namespace tvm {
-namespace te {
-
-using arith::DivMode;
-using arith::kFloorDiv;
-using arith::kSimplifyRewriteCanonicalRewrite;
-using arith::kTruncDiv;
-
-// Combine all expressions from the container using &&.
-template <class container>
-PrimExpr All(const container& c) {
-  PrimExpr res;
-  for (const auto& e : c) {
-    if (res.get()) {
-      res = res && e;
-    } else {
-      res = e;
-    }
-  }
-  if (res.get()) {
-    return res;
-  } else {
-    return const_true();
-  }
-}
-
-Map<Var, Range> IterVarsToMap(const Array<IterVar>& itervars) {
-  Map<Var, Range> res;
-  for (const IterVar& v : itervars) {
-    res.Set(v->var, v->dom);
-  }
-  return res;
-}
-
-// Given a map from vars to ranges create an array of itervars
-Array<IterVar> IterVarsFromMap(const Array<Var>& vars, const Map<Var, Range>& vranges,
-                               IterVarType iter_type = kDataPar, std::string thread_tag = "") {
-  Array<IterVar> res;
-  for (const Var& v : vars) {
-    ICHECK(vranges.count(v)) << "A range for the variable " << v << " was not provided in map "
-                             << vranges;
-    res.push_back(IterVar(vranges[v], v, iter_type, thread_tag));
-  }
-  return res;
-}
-
-Array<Var> IterVarsToVars(const Array<IterVar>& itervars) {
-  Array<Var> res;
-  for (const IterVar& v : itervars) {
-    res.push_back(v->var);
-  }
-  return res;
-}
-
-template <typename ValueType>
-bool is_const_value(const PrimExpr& e, ValueType value) {
-  static_assert(std::is_integral<ValueType>::value,
-                "Comparison to non-integer values is forbidden.");
-  if (const tir::IntImmNode* i = e.as<tir::IntImmNode>()) {
-    return i->value == value;
-  } else if (const tir::FloatImmNode* i = e.as<tir::FloatImmNode>()) {
-    return i->value == value;
-  } else if (const tir::CastNode* c = e.as<tir::CastNode>()) {
-    return is_const_value(c->value, value);
-  } else if (const tir::BroadcastNode* b = e.as<tir::BroadcastNode>()) {
-    return is_const_value(b->value, value);
-  } else {
-    return false;
-  }
-}
-
-// Return true if this combiner is just a sum.
-bool IsSumCombiner(const CommReducer& combiner, const Map<Var, Range>& vranges) {
-  arith::Analyzer analyzer;
-  analyzer.Bind(vranges);
-  if (combiner->result.size() != 1) {
-    return false;
-  }
-
-  if (!is_const_value(
-          analyzer.Simplify(combiner->identity_element[0], kSimplifyRewriteCanonicalRewrite), 0)) {
-    return false;
-  }
-
-  PrimExpr combiner_result =
-      analyzer.Simplify(combiner->result[0], kSimplifyRewriteCanonicalRewrite);
-
-  return tir::ExprDeepEqual()(combiner_result, combiner->lhs[0] + combiner->rhs[0]) ||
-         tir::ExprDeepEqual()(combiner_result, combiner->rhs[0] + combiner->lhs[0]);
-}
-
-bool CanFactorZeroFromCombiner(const CommReducer& combiner, int value_index,
-                               const Map<Var, Range>& vranges) {
-  arith::Analyzer analyzer;
-  analyzer.Bind(vranges);
-  if (!is_const_value(analyzer.Simplify(combiner->identity_element[value_index],
-                                        kSimplifyRewriteCanonicalRewrite),
-                      0)) {
-    return false;
-  }
-
-  PrimExpr zero = make_zero(combiner->result[value_index].dtype());
-  PrimExpr in = Substitute(combiner->result[value_index], {{combiner->lhs[value_index], zero},
-                                                           {combiner->rhs[value_index], zero}});
-  in = analyzer.Simplify(in, kSimplifyRewriteCanonicalRewrite);
-
-  return is_const_value(in, 0);
-}
-
-struct NonzeroConditionResult {
-  PrimExpr cond;
-  PrimExpr value;
-
-  PrimExpr to_expr() const { return Select(cond, value, make_zero(value.dtype())); }
-
-  friend std::ostream& operator<<(std::ostream& os, const NonzeroConditionResult& r) {
-    return os << r.to_expr();
-  }
-};
-
-// The implementation of NonzeroCondition
-// transform expression to cond ? value : 0
-class NonzeroConditionFunctor : public ExprFunctor<NonzeroConditionResult(const PrimExpr&)> {
- public:
-  NonzeroConditionResult NonzeroCondition(const PrimExpr& e) {
-    if (e.dtype().is_bool()) {
-      // Boolean expressions are non-zero whenever they are true themselves
-      return {e, const_true()};
-    } else {
-      return VisitExpr(e);
-    }
-  }
-
-  // Most of the cases are implemented using helpers below
-  result_type VisitExpr_(const VarNode* op) final { return Default_(GetRef<PrimExpr>(op)); }
-  result_type VisitExpr_(const IntImmNode* op) final { return Const_(GetRef<IntImm>(op)); }
-  result_type VisitExpr_(const FloatImmNode* op) final { return Const_(GetRef<FloatImm>(op)); }
-  result_type VisitExpr_(const StringImmNode* op) final { return Default_(GetRef<PrimExpr>(op)); }
-  result_type VisitExpr_(const AddNode* op) final { return BinOpAddLike_(GetRef<Add>(op)); }
-  result_type VisitExpr_(const SubNode* op) final { return BinOpAddLike_(GetRef<Sub>(op)); }
-  result_type VisitExpr_(const MulNode* op) final { return BinOpMulLike_(GetRef<Mul>(op)); }
-  result_type VisitExpr_(const DivNode* op) final { return BinOpDivLike_(GetRef<Div>(op)); }
-  result_type VisitExpr_(const ModNode* op) final { return BinOpDivLike_(GetRef<Mod>(op)); }
-  result_type VisitExpr_(const FloorDivNode* op) final {
-    return BinOpDivLike_(GetRef<FloorDiv>(op));
-  }
-  result_type VisitExpr_(const FloorModNode* op) final {
-    return BinOpDivLike_(GetRef<FloorMod>(op));
-  }
-  result_type VisitExpr_(const MinNode* op) final { return BinOpAddLike_(GetRef<Min>(op)); }
-  result_type VisitExpr_(const MaxNode* op) final { return BinOpAddLike_(GetRef<Max>(op)); }
-
-  result_type VisitExpr_(const CastNode* op) final {
-    auto nz_a = NonzeroCondition(op->value);
-    return {nz_a.cond, Cast(op->dtype, nz_a.value)};
-  }
-
-  result_type VisitExpr_(const SelectNode* op) final {
-    PrimExpr cond = op->condition, true_val = op->true_value, false_val = op->false_value;
-    auto nz_a = NonzeroCondition(true_val);
-    auto nz_b = NonzeroCondition(false_val);
-
-    // If the false part is zero, we can get rid of the select
-    if (is_const_value(nz_b.value, 0)) {
-      PrimExpr new_cond = analyzer_.Simplify(nz_a.cond && cond, kSimplifyRewriteCanonicalRewrite);
-      return {new_cond, nz_a.value};
-    }
-
-    // If the true part is zero, we can also get rid of the select
-    if (is_const_value(nz_a.value, 0)) {
-      PrimExpr new_cond = analyzer_.Simplify(nz_b.cond && !cond, kSimplifyRewriteCanonicalRewrite);
-      return {new_cond, nz_b.value};
-    }
-
-    // Otherwise we retain the select and combine the conditions into this
-    PrimExpr new_cond = analyzer_.Simplify((cond && nz_a.cond) || (!cond && nz_b.cond),
-                                           kSimplifyRewriteCanonicalRewrite);
-    if (nz_a.value.same_as(true_val) && nz_b.value.same_as(false_val)) {
-      return {new_cond, GetRef<PrimExpr>(op)};
-    } else {
-      return {new_cond, Select(cond, nz_a.value, nz_b.value)};
-    }
-  }
-
-  result_type VisitExpr_(const CallNode* op) final {
-    if (op->op.same_as(op_if_then_else_)) {
-      PrimExpr cond = op->args[0], true_val = op->args[1], false_val = op->args[2];
-      auto nz_a = NonzeroCondition(true_val);
-      auto nz_b = NonzeroCondition(false_val);
-
-      // We don't have as much freedom here as in the select case
-      // since the `if` must be preserved in any case
-      PrimExpr new_cond = analyzer_.Simplify((cond && nz_a.cond) || (!cond && nz_b.cond),
-                                             kSimplifyRewriteCanonicalRewrite);
-      if (nz_a.value.same_as(true_val) && nz_b.value.same_as(false_val)) {
-        return {new_cond, GetRef<PrimExpr>(op)};
-      } else {
-        return {new_cond, if_then_else(cond, nz_a.value, nz_b.value)};
-      }
-    } else {
-      return Default_(GetRef<PrimExpr>(op));
-    }
-  }
-
-  result_type VisitExpr_(const ProducerLoadNode* op) final {
-    return Default_(GetRef<PrimExpr>(op));
-  }
-
-  NonzeroConditionResult Default_(const PrimExpr& e) {
-    // This is always correct, so it's the default
-    return {const_true(), e};
-  }
-
-  template <class T>
-  NonzeroConditionResult Const_(const T& op) {
-    if (op->value == 0) {
-      return {const_false(), op};
-    } else {
-      return {const_true(), op};
-    }
-  }
-
-  template <class T>
-  NonzeroConditionResult BinOpAddLike_(const T& op) {
-    auto nz_a = NonzeroCondition(op->a);
-    auto nz_b = NonzeroCondition(op->b);
-
-    // For addition and similar ops the result may be nonzero if either of the arguments is
-    // nonzero, so we combine the conditions with Or.
-    if (tir::ExprDeepEqual()(nz_a.cond, nz_b.cond)) {
-      // If the conditions are the same, we don't need Or
-      if (nz_a.value.same_as(op->a) && nz_b.value.same_as(op->b)) {
-        return {nz_a.cond, op};
-      } else {
-        return {nz_a.cond, T(nz_a.value, nz_b.value)};
-      }
-    } else {
-      // Otherwise use Or
-      PrimExpr new_cond =
-          analyzer_.Simplify(nz_a.cond || nz_b.cond, kSimplifyRewriteCanonicalRewrite);
-      // A little optimization: if the combined condition is the same as one of the inner
-      // conditions, we don't need to guard the inner value with a select, otherwise
-      // we create a select in the `to_expr` call.
-      PrimExpr new_a = tir::ExprDeepEqual()(nz_a.cond, new_cond) ? nz_a.value : nz_a.to_expr();
-      PrimExpr new_b = tir::ExprDeepEqual()(nz_b.cond, new_cond) ? nz_b.value : nz_b.to_expr();
-      PrimExpr new_expr = T(new_a, new_b);
-      return {new_cond, new_expr};
-    }
-  }
-
-  template <class T>
-  NonzeroConditionResult BinOpMulLike_(const T& op) {
-    auto nz_a = NonzeroCondition(op->a);
-    auto nz_b = NonzeroCondition(op->b);
-
-    // For multiplication and similar ops the result may be nonzero if
-    // both the arguments are nonzero, so we combine with And.
-    PrimExpr new_cond =
-        analyzer_.Simplify(nz_a.cond && nz_b.cond, kSimplifyRewriteCanonicalRewrite);
-
-    if (nz_a.value.same_as(op->a) && nz_b.value.same_as(op->b)) {
-      return {new_cond, op};
-    } else {
-      return {new_cond, T(nz_a.value, nz_b.value)};
-    }
-  }
-
-  template <class T>
-  NonzeroConditionResult BinOpDivLike_(const T& op) {
-    auto nz_a = NonzeroCondition(op->a);
-
-    // For Div we simply use the condition of the numerator.
-
-    if (nz_a.value.same_as(op->a)) {
-      return {nz_a.cond, op};
-    } else {
-      return {nz_a.cond, T(nz_a.value, op->b)};
-    }
-  }
-
- private:
-  arith::Analyzer analyzer_;
-  const Op& op_if_then_else_ = Op::Get("tir.if_then_else");
-};
-
-inline NonzeroConditionResult NonzeronessCondition(const PrimExpr& expr) {
-  return NonzeroConditionFunctor().NonzeroCondition(expr);
-}
-
-struct FactorOutAtomicFormulasResult {
-  std::vector<PrimExpr> atomic_formulas;
-  PrimExpr rest;
-
-  PrimExpr to_expr() const {
-    PrimExpr res = rest;
-    for (const PrimExpr& e : atomic_formulas) {
-      res = And(e, res);
-    }
-    return res;
-  }
-
-  Array<PrimExpr> to_array() const {
-    Array<PrimExpr> res = atomic_formulas;
-    res.push_back(rest);
-    return res;
-  }
-};
-
-// The implementation of FactorOutAtomicFormulas
-class FactorOutAtomicFormulasFunctor
-    : public ExprFunctor<FactorOutAtomicFormulasResult(const PrimExpr&)> {
- public:
-  result_type Atomic_(const PrimExpr& e) {
-    // For atomic expressions the result is the expr itself with True as the residual
-    return {{e}, make_const(e.dtype(), 1)};
-  }
-
-  // This is basically the list of expression kinds that are considered atomic
-  result_type VisitExpr_(const VarNode* op) final { return Atomic_(GetRef<PrimExpr>(op)); }
-  result_type VisitExpr_(const CallNode* op) final { return Atomic_(GetRef<PrimExpr>(op)); }
-  result_type VisitExpr_(const IntImmNode* op) final { return Atomic_(GetRef<PrimExpr>(op)); }
-  result_type VisitExpr_(const EQNode* op) final { return Atomic_(GetRef<PrimExpr>(op)); }
-  result_type VisitExpr_(const NENode* op) final { return Atomic_(GetRef<PrimExpr>(op)); }
-  result_type VisitExpr_(const LENode* op) final { return Atomic_(GetRef<PrimExpr>(op)); }
-  result_type VisitExpr_(const LTNode* op) final { return Atomic_(GetRef<PrimExpr>(op)); }
-  result_type VisitExpr_(const GENode* op) final { return Atomic_(GetRef<PrimExpr>(op)); }
-  result_type VisitExpr_(const GTNode* op) final { return Atomic_(GetRef<PrimExpr>(op)); }
-
-  result_type VisitExpr_(const SelectNode* op) final {
-    // Select can be rewritten through other logical ops
-    PrimExpr expr = (op->condition && op->true_value) || (!op->condition && op->false_value);
-    return VisitExpr(expr);
-  }
-
-  result_type VisitExpr_(const NotNode* op) final {
-    // Not should be moved down
-    if (const OrNode* or_expr = op->a.as<OrNode>()) {
-      PrimExpr expr = !or_expr->a && !or_expr->b;
-      return VisitExpr(expr);
-    } else if (const AndNode* and_expr = op->a.as<AndNode>()) {
-      PrimExpr expr = !and_expr->a || !and_expr->b;
-      return VisitExpr(expr);
-    } else if (const SelectNode* sel_expr = op->a.as<SelectNode>()) {
-      PrimExpr expr = ((!sel_expr->condition || !sel_expr->true_value) &&
-                       (sel_expr->condition || !sel_expr->false_value));
-      return VisitExpr(expr);
-    }
-    return Atomic_(GetRef<PrimExpr>(op));
-  }
-
-  result_type VisitExpr_(const AndNode* op) final {
-    auto res_a = VisitExpr(op->a);
-    auto res_b = VisitExpr(op->b);
-
-    // For the And case we return the union of the sets of atomic formulas
-    std::unordered_set<PrimExpr, StructuralHash, StructuralEqual> res_a_set;
-    res_a_set.reserve(res_a.atomic_formulas.size());
-    std::copy(res_a.atomic_formulas.begin(), res_a.atomic_formulas.end(),
-              std::inserter(res_a_set, res_a_set.end()));
-
-    std::vector<PrimExpr> res = res_a.atomic_formulas;
-    for (const auto& e : res_b.atomic_formulas) {
-      if (res_a_set.find(e) == res_a_set.end()) {
-        res.emplace_back(e);
-      }
-    }
-    // And the residuals are combined with &&
-    return {res, res_a.rest && res_b.rest};
-  }
-
-  result_type VisitExpr_(const MulNode* op) final {
-    // Since we work with bools, for multiplication we do the same thing as for And
-    PrimExpr e_and = op->a && op->b;
-    return VisitExpr(e_and);
-  }
-
-  result_type VisitExpr_(const OrNode* op) final {
-    auto res_a = VisitExpr(op->a);
-    auto res_b = VisitExpr(op->b);
-
-    std::unordered_set<PrimExpr, StructuralHash, StructuralEqual> res_a_set{
-        res_a.atomic_formulas.begin(), res_a.atomic_formulas.end()};
-    std::unordered_set<PrimExpr, StructuralHash, StructuralEqual> res_b_set{
-        res_b.atomic_formulas.begin(), res_b.atomic_formulas.end()};
-
-    // For the Or case we intersect the sets of atomic formulas
-    std::unordered_set<PrimExpr, StructuralHash, StructuralEqual> res_set;
-    std::vector<PrimExpr> res;
-    res_set.reserve(std::min(res_a.atomic_formulas.size(), res_b.atomic_formulas.size()));
-    res.reserve(std::min(res_a.atomic_formulas.size(), res_b.atomic_formulas.size()));
-    for (const auto& res_b_formula : res_b.atomic_formulas) {
-      if (res_a_set.count(res_b_formula)) {
-        res_set.insert(res_b_formula);
-        res.push_back(res_b_formula);
-      }
-    }
-
-    // Computing the residual is more complex: we have to compute the sets of atomic formulas
-    // which are left behind, and then combine them with the residuals into the new residual.
-    std::vector<PrimExpr> new_cond_a;
-    new_cond_a.reserve(res_a.atomic_formulas.size() - res_set.size());
-    for (const auto& formula : res_a.atomic_formulas) {
-      if (!res_set.count(formula)) new_cond_a.emplace_back(formula);
-    }
-
-    std::vector<PrimExpr> new_cond_b;
-    new_cond_b.reserve(res_b.atomic_formulas.size() - res_set.size());
-    for (const auto& formula : res_b.atomic_formulas) {
-      if (!res_set.count(formula)) new_cond_b.emplace_back(formula);
-    }
-
-    res_a.atomic_formulas = std::move(new_cond_a);
-    res_b.atomic_formulas = std::move(new_cond_b);
-
-    PrimExpr new_rest = res_a.to_expr() || res_b.to_expr();
-
-    return {res, new_rest};
-  }
-};
-
-// Transform the given formula into a conjunction of atomic formulas (represented as an array)
-// and a non-atomic residual. Atomic formulas are consts, calls, variables and comparisons (a <= b,
-// etc), i.e. formulas which are not logical operators (||, &&, !) on the top level.
-FactorOutAtomicFormulasResult FactorOutAtomicFormulas(const PrimExpr& e) {
-  ICHECK(e.dtype().is_bool());
-  return FactorOutAtomicFormulasFunctor().VisitExpr(e);
-}
-
-struct EliminateDivModResult {
-  PrimExpr expr;
-  Map<Var, PrimExpr> substitution;
-  Array<Var> new_variables;
-  Array<PrimExpr> conditions;
-  Map<Var, Range> ranges;
-};
-
-inline PrimExpr ModImpl(PrimExpr a, PrimExpr b, DivMode mode) {
-  if (mode == kTruncDiv) {
-    return truncmod(a, b);
-  } else {
-    ICHECK_EQ(mode, kFloorDiv);
-    return floormod(a, b);
-  }
-}
-
-inline PrimExpr DivImpl(PrimExpr a, PrimExpr b, DivMode mode) {
-  if (mode == kTruncDiv) {
-    return truncdiv(a, b);
-  } else {
-    ICHECK_EQ(mode, kFloorDiv);
-    return floordiv(a, b);
-  }
-}
-
-class EliminateDivModMutator : public ExprMutator {
- public:
-  Map<Var, PrimExpr> substitution;
-  Array<Var> new_variables;
-  Array<PrimExpr> conditions;
-  Map<Var, Range> ranges;
-
-  explicit EliminateDivModMutator(Map<Var, Range> ranges) : ranges(std::move(ranges)) {}
-
-  virtual PrimExpr VisitExpr_(const DivNode* op) {
-    const IntImmNode* imm = op->b.as<IntImmNode>();
-    if (imm && imm->value != 0) {
-      if (imm->value < 0) {
-        // x / -c == -(x/c) for truncated division
-        return make_zero(op->dtype) -
-               VisitExpr(truncdiv(op->a, make_const(op->dtype, -imm->value)));
-      }
-
-      // Try to find the already existing variables for this expression
-      auto it = expr_to_vars_.find(std::make_tuple(kTruncDiv, op->a, imm->value));
-      if (it != expr_to_vars_.end()) {
-        return it->second.first;
-      }
-
-      // Otherwise recursively mutate the left hand side, and create new variables
-      PrimExpr mutated_a = VisitExpr(op->a);
-      if (auto var_pair_opt = AddNewVarPair(op->a, mutated_a, imm->value, kTruncDiv)) {
-        return var_pair_opt.value().first;
-      } else {
-        return truncdiv(mutated_a, op->b);
-      }
-    }
-
-    return div(VisitExpr(op->a), VisitExpr(op->b));
-  }
-
-  virtual PrimExpr VisitExpr_(const ModNode* op) {
-    const IntImmNode* imm = op->b.as<IntImmNode>();
-    if (imm && imm->value != 0) {
-      if (imm->value < 0) {
-        // x % -c == x % c for truncated division
-        return VisitExpr(truncmod(op->a, make_const(op->dtype, -imm->value)));
-      }
-
-      // Try to find the already existing variables for this expression
-      auto it = expr_to_vars_.find(std::make_tuple(kTruncDiv, op->a, imm->value));
-      if (it != expr_to_vars_.end()) {
-        return it->second.second;
-      }
-
-      // Otherwise recursively mutate the left hand side, and create new variables
-      PrimExpr mutated_a = VisitExpr(op->a);
-      if (auto var_pair_opt = AddNewVarPair(op->a, mutated_a, imm->value, kTruncDiv)) {
-        return var_pair_opt.value().second;
-      } else {
-        return truncmod(mutated_a, op->b);
-      }
-    }
-
-    return truncmod(VisitExpr(op->a), VisitExpr(op->b));
-  }
-
-  virtual PrimExpr VisitExpr_(const FloorDivNode* op) {
-    const IntImmNode* imm = op->b.as<IntImmNode>();
-    if (imm && imm->value != 0) {
-      if (imm->value < 0) {
-        // x / -c == (-x) / c for flooring division
-        return VisitExpr(
-            floordiv(make_zero(op->dtype) - op->a, make_const(op->dtype, -imm->value)));
-      }
-
-      // Try to find the already existing variables for this expression
-      auto it = expr_to_vars_.find(std::make_tuple(kFloorDiv, op->a, imm->value));
-      if (it != expr_to_vars_.end()) {
-        return it->second.first;
-      }
-
-      // Otherwise recursively mutate the left hand side, and create new variables
-      PrimExpr mutated_a = VisitExpr(op->a);
-      if (auto var_pair_opt = AddNewVarPair(op->a, mutated_a, imm->value, kFloorDiv)) {
-        return var_pair_opt.value().first;
-      } else {
-        return floordiv(mutated_a, op->b);
-      }
-    }
-
-    return floordiv(VisitExpr(op->a), VisitExpr(op->b));
-  }
-
-  virtual PrimExpr VisitExpr_(const FloorModNode* op) {
-    const IntImmNode* imm = op->b.as<IntImmNode>();
-    if (imm && imm->value != 0) {
-      if (imm->value < 0) {
-        // x % -c == -(-x % c) for flooring division
-        return VisitExpr(make_zero(op->dtype) - floormod(make_zero(op->dtype) - op->a,
-                                                         make_const(op->dtype, -imm->value)));
-      }
-
-      // Try to find the already existing variables for this expression
-      auto it = expr_to_vars_.find(std::make_tuple(kFloorDiv, op->a, imm->value));
-      if (it != expr_to_vars_.end()) {
-        return it->second.second;
-      }
-
-      // Otherwise recursively mutate the left hand side, and create new variables
-      PrimExpr mutated_a = VisitExpr(op->a);
-      if (auto var_pair_opt = AddNewVarPair(op->a, mutated_a, imm->value, kFloorDiv)) {
-        return var_pair_opt.value().second;
-      } else {
-        return floormod(mutated_a, op->b);
-      }
-    }
-
-    return floormod(VisitExpr(op->a), VisitExpr(op->b));
-  }
-
- private:
-  std::optional<std::pair<Var, Var>> AddNewVarPair(const PrimExpr& e, const PrimExpr& mut,
-                                                   int64_t val, DivMode mode) {
-    using tresult = std::optional<std::pair<Var, Var>>;
-
-    // Try to find the variables using the mutated expressions
-    if (!e.same_as(mut)) {
-      auto it = expr_to_vars_.find(std::make_tuple(mode, mut, val));
-      if (it != expr_to_vars_.end()) {
-        return tresult(it->second);
-      }
-    }
-
-    PrimExpr val_e = make_const(e.dtype(), val);
-    idx_ += 1;
-
-    // Convert `ranges` to IntSets
-    std::unordered_map<const VarNode*, IntSet> var_intsets;
-    for (const auto& p : ranges) {
-      var_intsets[p.first.get()] = IntSet::FromRange(p.second);
-    }
-
-    // Infer ranges for the expressions we want to replace with variables
-    Range div_range = EvalSet(DivImpl(mut, val_e, mode), var_intsets).CoverRange(Range());
-    Range mod_range = EvalSet(ModImpl(mut, val_e, mode), var_intsets).CoverRange(Range());
-
-    // We don't want to add unbounded variables
-    if (!div_range.get() || !mod_range.get()) {
-      LOG(WARNING) << "EliminateDivMod: won't eliminate " << DivImpl(e, val_e, mode)
-                   << "  because its bounds cannot be inferred";
-      return tresult();
-    }
-    if (!mod_range.get()) {
-      LOG(WARNING) << "EliminateDivMod: won't eliminate " << ModImpl(e, val_e, mode)
-                   << "  because its bounds cannot be inferred";
-      return tresult();
-    }
-
-    // Create new variables for the expressions
-    auto div = Var((mode == kTruncDiv ? "tdiv" : "fdiv") + std::to_string(idx_), e.dtype());
-    auto mod = Var((mode == kTruncDiv ? "tmod" : "fmod") + std::to_string(idx_), e.dtype());
-
-    new_variables.push_back(div);
-    new_variables.push_back(mod);
-
-    // Note that we have to perform substitution to mut because mut may contain new variables
-    substitution.Set(div, DivImpl(Substitute(mut, substitution), val_e, mode));
-    substitution.Set(mod, ModImpl(Substitute(mut, substitution), val_e, mode));
-
-    ranges.Set(div, div_range);
-    ranges.Set(mod, mod_range);
-
-    // This additional condition works as a definition for the new variables
-    conditions.push_back(mut == div * val_e + mod);
-
-    if (!analyzer_.CanProve(mod_range->extent <= val_e)) {
-      // If we use the C/C++ definition of mod, there may be multiple values of `mod`
-      // satisfying the added condition if the expr `e` may change its sign, so we
-      // have to add another condition.
-      LOG(WARNING) << "EliminateDivMod: cannot fully eliminate div or mod because "
-                   << ModImpl(e, val_e, mode) << "  probably may change its sign";
-      conditions.push_back(Select(e >= 0, mod >= 0, mod <= 0));
-    }
-
-    auto p = std::make_pair(div, mod);
-    expr_to_vars_[std::make_tuple(mode, e, val)] = p;
-    if (!e.same_as(mut)) {
-      expr_to_vars_[std::make_tuple(mode, mut, val)] = p;
-    }
-    return tresult(p);
-  }
-
-  class TupleEqual_ {
-   public:
-    bool operator()(const std::tuple<DivMode, PrimExpr, int64_t>& lhs,
-                    const std::tuple<DivMode, PrimExpr, int64_t>& rhs) const {
-      return std::get<0>(lhs) == std::get<0>(rhs) &&
-             tir::ExprDeepEqual()(std::get<1>(lhs), std::get<1>(rhs)) &&
-             std::get<2>(lhs) == std::get<2>(rhs);
-    }
-  };
-
-  class TupleHasher_ {
-   public:
-    size_t operator()(const std::tuple<DivMode, PrimExpr, int64_t>& key) const {
-      return ((std::hash<int>()(std::get<0>(key)) ^ (StructuralHash()(std::get<1>(key)) << 1)) >>
-              1) ^
-             (std::hash<int64_t>()(std::get<2>(key)) << 1);
-    }
-  };
-
-  // A counter for naming new variables
-  int idx_{0};
-  // A map from pairs of exprs and numbers (e, n) to pairs of new vars (div, mod)
-  // such that `div = e / n` and `mod = e % n`
-  std::unordered_map<std::tuple<DivMode, PrimExpr, int64_t>, std::pair<Var, Var>, TupleHasher_,
-                     TupleEqual_>
-      expr_to_vars_;
-  arith::Analyzer analyzer_;
-};
-
-// Replace every subexpr of the form e/const and e % const with a new variable.
-// Syntactically equal expressions will be mapped to the same variable.
-EliminateDivModResult EliminateDivMod(const PrimExpr& expr, Map<Var, Range> ranges) {
-  EliminateDivModResult res;
-  EliminateDivModMutator mutator(ranges);
-  res.expr = mutator(expr);
-  res.conditions = std::move(mutator.conditions);
-  res.new_variables = std::move(mutator.new_variables);
-  res.substitution = std::move(mutator.substitution);
-  res.ranges = std::move(mutator.ranges);
-  return res;
-}
-
-arith::IntConstraintsTransform EliminateDivModFromDomainConditions(
-    const arith::IntConstraints& domain) {
-  auto elim_res = EliminateDivMod(All(domain->relations), domain->ranges);
-
-  Map<Var, Range> new_vranges = elim_res.ranges;
-  Array<Var> new_axis = Concat(domain->variables, elim_res.new_variables);
-  PrimExpr new_cond = elim_res.expr && All(elim_res.conditions);
-
-  arith::IntConstraints new_domain(new_axis, new_vranges,
-                                   FactorOutAtomicFormulas(new_cond).to_array());
-
-  Map<Var, PrimExpr> src_to_dst;
-  Map<Var, PrimExpr> dst_to_src = elim_res.substitution;
-  for (const Var& v : domain->variables) {
-    src_to_dst.Set(v, v);
-    dst_to_src.Set(v, v);
-  }
-
-  return arith::IntConstraintsTransform(domain, new_domain, src_to_dst, dst_to_src);
-}
-
-inline arith::IntConstraintsTransform IdentityTransformation(const arith::IntConstraints& domain) {
-  Map<Var, PrimExpr> identity_map;
-  for (const Var& v : domain->variables) {
-    identity_map.Set(v, v);
-  }
-  return arith::IntConstraintsTransform(domain, domain, identity_map, identity_map);
-}
-
-// Simplify an iteration domain.
-arith::IntConstraintsTransform SimplifyDomain(const arith::IntConstraints& iter_domains,
-                                              bool eliminate_div_mod) {
-  arith::IntConstraintsTransform transf = IdentityTransformation(iter_domains);
-
-  if (eliminate_div_mod) {
-    transf = transf + EliminateDivModFromDomainConditions(transf->dst);
-  }
-
-  // TODO(sgrechanik-h): Repeating the following steps has a positive effect, however we probably
-  // should find a better terminating criterion (like stop when the domain volume stops decreasing)
-  // Also 2 steps seems to be slightly better than 3
-  for (size_t i = 0; i < 2; ++i) {
-    transf = transf + arith::SolveLinearEquations(transf->dst);
-    transf = transf + arith::SolveInequalitiesDeskewRange(transf->dst);
-  }
-
-  return transf;
-}
-
-// Use the condition of a reduction op to simplify its domain (axis)
-PrimExpr SimplifyReductionDomain(const PrimExpr& expr, const Map<Var, Range>& outer_vranges) {
-  if (const ReduceNode* red = expr.as<ReduceNode>()) {
-    Array<Var> vars = IterVarsToVars(red->axis);
-    Map<Var, Range> vranges = Merge(outer_vranges, IterVarsToMap(red->axis));
-    Array<PrimExpr> relations = FactorOutAtomicFormulas(red->condition).to_array();
-
-    arith::IntConstraints domain(vars, vranges, relations);
-    auto res = SimplifyDomain(domain);
-
-    Array<PrimExpr> new_source;
-    for (const PrimExpr& src : red->source) {
-      new_source.push_back(Substitute(src, res->src_to_dst));
-    }
-
-    Array<IterVar> new_axis = IterVarsFromMap(res->dst->variables, res->dst->ranges, kCommReduce);
-
-    // Perform simplification mainly to remove a possibly empty reduction.
-    arith::Analyzer analyzer;
-    return analyzer.Simplify(Reduce(red->combiner, new_source, new_axis, All(res->dst->relations),
-                                    red->value_index, red->init),
-                             kSimplifyRewriteCanonicalRewrite);
-  } else {
-    return expr;
-  }
-}
-
-// Extract from cond an implication of cond not containing vars
-std::pair<PrimExpr, PrimExpr> ImplicationNotContainingVars(
-    const PrimExpr& cond, const std::unordered_set<const VarNode*>& vars) {
-  ICHECK(cond.dtype().is_bool()) << "The type of cond must be bool";
-  // TODO(sgrechanik-h): NOTs could be pushed down using De Morgan laws
-  // before running this function but this case didn't seem to be important enough.
-  if (const AndNode* op = cond.as<AndNode>()) {
-    auto pair_a = ImplicationNotContainingVars(op->a, vars);
-    auto pair_b = ImplicationNotContainingVars(op->b, vars);
-    return {pair_a.first && pair_b.first, pair_a.second && pair_b.second};
-  } else if (const OrNode* op = cond.as<OrNode>()) {
-    auto pair_a = ImplicationNotContainingVars(op->a, vars);
-    auto pair_b = ImplicationNotContainingVars(op->b, vars);
-    return {pair_a.first || pair_b.first, (pair_a.first || pair_b.second) &&
-                                              (pair_b.first || pair_a.second) &&
-                                              (pair_a.second || pair_b.second)};
-  } else if (!tir::UsesVar(cond, [&vars](const VarNode* var) { return vars.count(var); })) {
-    return {cond, const_true()};
-  } else {
-    return {const_true(), cond};
-  }
-}
-
-// Factor conditions out of a reduction by applying Fourier-Motzkin elimination and moving out
-// (in)equalities which do not depend on the reduction variables.
-std::pair<PrimExpr, PrimExpr> LiftConditionsThroughReduction(const PrimExpr& cond,
-                                                             const Array<IterVar>& red_axis,
-                                                             const Array<IterVar>& outer_axis) {
-  // Factor out atomics so that we can consider this as a system of inequalities
-  auto factor_atomic_res = FactorOutAtomicFormulas(cond);
-  Array<PrimExpr> atomics = factor_atomic_res.atomic_formulas;
-  const PrimExpr& rest = factor_atomic_res.rest;
-
-  Array<Var> allvars;
-  for (const IterVar& v : red_axis) {
-    allvars.push_back(v->var);
-  }
-  for (const IterVar& v : outer_axis) {
-    allvars.push_back(v->var);
-  }
-
-  auto vranges = Merge(IterVarsToMap(red_axis), IterVarsToMap(outer_axis));
-  // start from reduction vars, so that input vars don't depend on them
-  arith::IntConstraints ineq_to_solve(allvars, vranges, atomics);
-  auto res_ineq = arith::SolveLinearInequalities(ineq_to_solve);
-  atomics = arith::AsConditions(allvars, res_ineq.first, res_ineq.second);
-
-  // Append the rest part
-  PrimExpr rewritten_cond = All(atomics) && rest;
-
-  std::unordered_set<const VarNode*> vset;
-  for (const IterVar& v : red_axis) {
-    vset.insert(v->var.get());
-  }
-
-  // The outer (first) condition does not contain reduction vars,
-  // the inner (second) condition is everything else
-  auto res = ImplicationNotContainingVars(rewritten_cond, vset);
-  return res;
-}
-
-// Convert an array of itervars to an array of inequalities
-Array<PrimExpr> IterVarsToInequalities(const Array<IterVar>& itervars) {
-  Array<PrimExpr> res;
-  for (const IterVar& v : itervars) {
-    res.push_back(GE(v->var, v->dom->min));
-    res.push_back(LT(v->var, v->dom->min + v->dom->extent));
-  }
-  return res;
-}
-
-class RemoveRedundantInequalitiesMutator : public ExprMutator {
- public:
-  explicit RemoveRedundantInequalitiesMutator(Array<PrimExpr> known) {
-    for (const PrimExpr& cond : known) {
-      known_.push_back(analyzer_.Simplify(cond, kSimplifyRewriteCanonicalRewrite));
-    }
-  }
-
-  virtual PrimExpr VisitExpr_(const SelectNode* op) {
-    bool has_side_effect = (SideEffect(GetRef<PrimExpr>(op)) > CallEffectKind::kReadState);
-    PrimExpr new_cond =
-        analyzer_.Simplify(VisitExpr(op->condition), kSimplifyRewriteCanonicalRewrite);
-    if (is_one(new_cond) && !has_side_effect) {
-      return VisitExpr(op->true_value);
-    } else if (is_zero(new_cond) && !has_side_effect) {
-      return VisitExpr(op->false_value);
-    } else {
-      Array<PrimExpr> new_known = known_;
-      for (const PrimExpr& atomic : FactorOutAtomicFormulas(new_cond).atomic_formulas) {
-        new_known.push_back(atomic);
-      }
-      RemoveRedundantInequalitiesMutator new_mutator(new_known);
-      // Note that we mutate only the true value with the new mutator
-      // TODO(sgrechanik-h): Update known conditions for the false value as well
-      return Select(new_cond, new_mutator(op->true_value), VisitExpr(op->false_value));
-    }
-  }
-
-  virtual PrimExpr VisitExpr_(const CallNode* op) {
-    if (op->op.same_as(op_if_then_else_)) {
-      PrimExpr new_cond =
-          analyzer_.Simplify(VisitExpr(op->args[0]), kSimplifyRewriteCanonicalRewrite);
-      if (is_one(new_cond)) {
-        return VisitExpr(op->args[1]);
-      } else if (is_zero(new_cond)) {
-        return VisitExpr(op->args[2]);
-      } else {
-        Array<PrimExpr> new_known = known_;
-        for (const PrimExpr& atomic : FactorOutAtomicFormulas(new_cond).atomic_formulas) {
-          new_known.push_back(atomic);
-        }
-        RemoveRedundantInequalitiesMutator new_mutator(new_known);
-        // Note that we mutate only the true value with the new mutator
-        // TODO(sgrechanik-h): Update known conditions for the false value as well
-        return if_then_else(new_cond, new_mutator(op->args[1]), VisitExpr(op->args[2]));
-      }
-    } else {
-      return ExprMutator::VisitExpr_(op);
-    }
-  }
-
-  virtual PrimExpr VisitExpr_(const ReduceNode* op) {
-    Array<PrimExpr> known_with_axes = known_;
-    ICHECK(op->init.empty()) << "Derivative of Reduction with initialization is not implemented";
-    for (const PrimExpr& axis_cond : IterVarsToInequalities(op->axis)) {
-      known_with_axes.push_back(axis_cond);
-    }
-    RemoveRedundantInequalitiesMutator mutator_with_axes(known_with_axes);
-
-    PrimExpr new_cond = mutator_with_axes(op->condition);
-
-    Array<PrimExpr> new_known = known_with_axes;
-    for (const PrimExpr& atomic : FactorOutAtomicFormulas(new_cond).atomic_formulas) {
-      new_known.push_back(atomic);
-    }
-    RemoveRedundantInequalitiesMutator new_mutator(new_known);
-
-    Array<PrimExpr> new_source;
-    for (const PrimExpr& src : op->source) {
-      new_source.push_back(new_mutator(src));
-    }
-
-    return Reduce(op->combiner, new_source, op->axis, new_cond, op->value_index, op->init);
-  }
-
-  virtual PrimExpr VisitExpr_(const EQNode* op) { return MutateAtomic_(GetRef<PrimExpr>(op)); }
-  virtual PrimExpr VisitExpr_(const NENode* op) { return MutateAtomic_(GetRef<PrimExpr>(op)); }
-  virtual PrimExpr VisitExpr_(const LTNode* op) { return MutateAtomic_(GetRef<PrimExpr>(op)); }
-  virtual PrimExpr VisitExpr_(const LENode* op) { return MutateAtomic_(GetRef<PrimExpr>(op)); }
-  virtual PrimExpr VisitExpr_(const GTNode* op) { return MutateAtomic_(GetRef<PrimExpr>(op)); }
-  virtual PrimExpr VisitExpr_(const GENode* op) { return MutateAtomic_(GetRef<PrimExpr>(op)); }
-
-  virtual PrimExpr VisitExpr_(const AndNode* op) { return VisitExpr(op->a) && VisitExpr(op->b); }
-
- private:
-  PrimExpr MutateAtomic_(const PrimExpr& e) {
-    PrimExpr simplified = analyzer_.Simplify(e, kSimplifyRewriteCanonicalRewrite);
-    for (const PrimExpr& other : known_) {
-      if (ExprDeepEqual()(simplified, other)) {
-        return const_true();
-      }
-    }
-    return simplified;
-  }
-
-  Array<PrimExpr> known_;
-  arith::Analyzer analyzer_;
-  const Op& op_if_then_else_ = Op::Get("tir.if_then_else");
-};
-
-// Propagate information from conditions and remove redundant inequalities
-inline PrimExpr RemoveRedundantInequalities(const PrimExpr& expr, const Array<PrimExpr>& known) {
-  return RemoveRedundantInequalitiesMutator(known)(expr);
-}
-
-// Extract the given expr under the given condition as a separate tensor if the volume of the
-// extracted tensor will be less than the volume of the outer_axis
-PrimExpr TrySimplifyCompute(const PrimExpr& expr, const PrimExpr& cond,
-                            const Array<Var>& outer_axis, const Map<Var, Range>& vranges) {
-  // solve cond, e.g., (jac_i0 == i) && (jac_i1 == j)
-  arith::IntConstraints domain_to_solve(outer_axis, vranges,
-                                        FactorOutAtomicFormulas(cond).to_array());
-  auto res = SimplifyDomain(domain_to_solve);
-
-  arith::Analyzer analyzer;
-  analyzer.Bind(res->dst->ranges);
-  PrimExpr new_expr =
-      analyzer.Simplify(Substitute(expr, res->src_to_dst), kSimplifyRewriteCanonicalRewrite);
-  // TODO(yzhliu): This is mostly done to simplify if_then_else
-  // which is not realized by the canonical simplifier
-  new_expr = RemoveRedundantInequalities(new_expr, res->dst->relations);
-
-  // Keep only those variables of the new vars which are used in the new_expr
-  Array<Var> used_res_variables;
-  for (const Var& var : res->dst->variables) {
-    if (tir::UsesVar(new_expr, [&var](const VarNode* var_) { return var_ == var.get(); })) {
-      ICHECK(res->dst->ranges.count(var)) << "Range of " << var << " cannot be inferred.";
-      used_res_variables.push_back(var);
-    }
-  }
-
-  // If the expression does not use vars then it is probably better to keep it inlined
-  if (used_res_variables.empty()) {
-    // We can return the new_expr here instead of the old expr because it doesn't use variables
-    // otherwise we would need to replace the new vars or create a let-expression
-    return new_expr;
-  }
-
-  // If it's already tensor[...] then it will probably be useless to further simplify it.
-  if (new_expr.as<ProducerLoadNode>()) {
-    return expr;
-  }
-
-  // Compute volumes before and after
-  PrimExpr old_volume = make_const(DataType::Int(64), 1);
-  for (const Var& var : outer_axis) {
-    ICHECK(vranges.count(var)) << "Range of " << var << " was not provided.";
-    old_volume = old_volume * vranges[var]->extent;
-  }
-
-  PrimExpr new_volume = make_const(DataType::Int(64), 1);
-  for (const Var& var : used_res_variables) {
-    new_volume = new_volume * res->dst->ranges[var]->extent;
-  }
-
-  // if we can prove that the old volume is not greater than the new volume then
-  // prefer the old expression.
-  arith::Analyzer ana_vranges;
-  ana_vranges.Bind(vranges);
-  if (ana_vranges.CanProve(old_volume <= new_volume)) {
-    return expr;
-  }
-
-  Tensor tensor = TensorFromExpr(new_expr, IterVarsFromMap(used_res_variables, res->dst->ranges),
-                                 "extracted_tensor");
-
-  Array<PrimExpr> args;
-  for (const Var& var : used_res_variables) {
-    args.push_back(res->dst_to_src[var]);
-  }
-
-  return ProducerLoad(tensor, args);
-}
-
-class ReductionAsTensorAccessMutator : public ExprMutator {
- public:
-  explicit ReductionAsTensorAccessMutator(const Array<Var>& outer_axis, Map<Var, Range> vranges,
-                                          std::string name = "extracted_reduction")
-      : outer_axis_(outer_axis), vranges_(std::move(vranges)), name_(std::move(name)) {}
-
-  PrimExpr VisitExpr_(const ReduceNode* op) final {
-    ReductionAsTensorAccessMutator new_mutator(Concat(IterVarsToVars(op->axis), outer_axis_),
-                                               Merge(vranges_, IterVarsToMap(op->axis)), name_);
-
-    ICHECK(op->init.empty()) << "Derivative of Reduction with initialization is not implemented";
-    Array<PrimExpr> new_source;
-    for (const PrimExpr& src : op->source) {
-      new_source.push_back(new_mutator(src));
-    }
-
-    PrimExpr new_reduce =
-        Reduce(op->combiner, new_source, op->axis, op->condition, op->value_index, op->init);
-
-    Array<Var> undefined_vars = UndefinedVars(new_reduce);
-    std::unordered_set<const VarNode*> undefined_var_set;
-    for (const Var& var : undefined_vars) {
-      undefined_var_set.insert(var.get());
-    }
-
-    // Vars of the tensor we are going to create for this reduction
-    Array<Var> vars;
-    for (const Var& v : outer_axis_) {
-      // We take variables from the outer_axis_ which are also present in the new reduction
-      if (undefined_var_set.count(v.get())) {
-        vars.push_back(v);
-      }
-    }
-
-    auto new_axis_vmap_pair = CloneIterVars(IterVarsFromMap(vars, vranges_));
-    Array<IterVar> new_axis = new_axis_vmap_pair.first;
-    arith::Analyzer analyzer;
-    analyzer.Bind(IterVarsToMap(new_axis));
-    new_reduce = analyzer.Simplify(Substitute(new_reduce, new_axis_vmap_pair.second),
-                                   kSimplifyRewriteCanonicalRewrite);
-
-    Tensor tensor = TensorFromExpr(new_reduce, new_axis, name_, tag_, attrs_);
-
-    Array<PrimExpr> args;
-    for (const Var& v : vars) {
-      args.push_back(v);
-    }
-
-    return ProducerLoad(tensor, args);
-  }
-
- private:
-  Array<Var> outer_axis_;
-  Map<Var, Range> vranges_;
-  std::string name_;
-  std::string tag_;
-  Map<String, ObjectRef> attrs_;
-};
-
-// Extract reductions as separate tensors.
-inline PrimExpr ReductionAsTensorAccess(const PrimExpr& expr, const Array<Var>& outer_axis,
-                                        const Map<Var, Range>& vranges) {
-  return ReductionAsTensorAccessMutator(outer_axis, vranges)(expr);
-}
-
-PrimExpr LiftReductions(const PrimExpr& expr, const Array<Var>& outer_axis,
-                        const Map<Var, Range>& vranges) {
-  if (const ReduceNode* red = expr.as<ReduceNode>()) {
-    Array<Var> new_outer_axis = Concat(IterVarsToVars(red->axis), outer_axis);
-    Map<Var, Range> new_vranges = Merge(vranges, IterVarsToMap(red->axis));
-    Array<PrimExpr> new_source;
-    for (const PrimExpr& src : red->source) {
-      new_source.push_back(ReductionAsTensorAccess(src, new_outer_axis, new_vranges));
-    }
-    PrimExpr new_condition = ReductionAsTensorAccess(red->condition, new_outer_axis, new_vranges);
-
-    return Reduce(red->combiner, new_source, red->axis, new_condition, red->value_index, red->init);
-  } else {
-    return ReductionAsTensorAccess(expr, outer_axis, vranges);
-  }
-}
-
-PrimExpr RemoveJacobianAndLiftNonzeroCondImpl(const PrimExpr& expr_orig, const Array<IterVar>& axis,
-                                              const Map<Var, Range>& vranges) {
-  PrimExpr result;
-  Map<Var, Range> combined_vranges = Merge(vranges, IterVarsToMap(axis));
-  arith::Analyzer analyzer;
-  analyzer.Bind(combined_vranges);
-
-  // Simplify the original expression first, mostly to simplify combiners
-  PrimExpr expr = analyzer.Simplify(expr_orig, kSimplifyRewriteCanonicalRewrite);
-
-  if (const ReduceNode* red = expr.as<ReduceNode>()) {
-    ICHECK(red->init.empty()) << "Derivative of Reduction with initialization is not implemented";
-    // TODO(sgrechanik-h): There are some other operations which behave like sum
-    bool is_sum = IsSumCombiner(red->combiner, vranges);
-    if (is_sum || CanFactorZeroFromCombiner(red->combiner, red->value_index, vranges)) {
-      PrimExpr new_red = expr;
-
-      // Here we simplify the reduction
-      PrimExpr cond = red->condition;
-      Array<PrimExpr> source = red->source;
-
-      // If it is a summation then we can lift nonzeroness conditions from the source
-      // and add them to the reduction conditions
-      if (is_sum) {
-        auto nz = NonzeronessCondition(red->source[red->value_index]);
-        cond = nz.cond && cond;
-        source.Set(0, nz.value);
-      }
-
-      new_red = Reduce(red->combiner, source, red->axis, cond, red->value_index, red->init);
-      new_red = SimplifyReductionDomain(new_red, combined_vranges);
-      // Update original red pointer for later use.
-      red = new_red.as<ReduceNode>();
-      // If the reduction disappears completely then transform the result as a non-reduction
-      if (!red) {
-        return RemoveJacobianAndLiftNonzeroCondImpl(new_red, axis, vranges);
-      }
-
-      Array<PrimExpr> new_source = red->source;
-
-      // Partially lift conditions from the reduce condition
-      auto [new_outer_cond, new_reduce_cond] =
-          LiftConditionsThroughReduction(red->condition, red->axis, axis);
-
-      // If it's not sum then we haven't yet lifted nonzeroness cond from the source
-      if (!is_sum) {
-        auto nz = NonzeronessCondition(red->source[red->value_index]);
-        // Append conditions from the reduction
-        PrimExpr nz_source = nz.value;
-        auto [outer_nz_cond, nz_cond] =
-            LiftConditionsThroughReduction(new_reduce_cond && nz.cond, red->axis, axis);
-        new_outer_cond = new_outer_cond && outer_nz_cond;
-        new_source.Set(red->value_index, Select(nz_cond, nz_source, make_zero(nz_source.dtype())));
-      }
-
-      PrimExpr new_reduce = Reduce(red->combiner, new_source, red->axis, new_reduce_cond,
-                                   red->value_index, red->init);
-      new_reduce =
-          TrySimplifyCompute(new_reduce, new_outer_cond, IterVarsToVars(axis), combined_vranges);
-      result = Select(new_outer_cond, new_reduce, make_zero(new_reduce.dtype()));
-    } else {
-      return SimplifyReductionDomain(expr, combined_vranges);
-    }
-  } else {
-    auto nz = NonzeronessCondition(expr);
-    PrimExpr new_expr =
-        TrySimplifyCompute(nz.value, nz.cond, IterVarsToVars(axis), combined_vranges);
-    result = Select(nz.cond, new_expr, make_zero(new_expr.dtype()));
-  }
-
-  // Note that RemoveRedundantInequalities can sometimes propagate equalities which
-  // other simplifiers cannot, like (i % 3) == 0.
-  Array<PrimExpr> axis_conds = IterVarsToInequalities(axis);
-  result = RemoveRedundantInequalities(result, axis_conds);
-
-  // Currently in TVM reductions are only allowed at the top level of compute,
-  // we need to extract intermediate inlined reduction as a separate stage (tensor).
-  // Sometimes TrySimplifyCompute doesn't perform lift / extraction,
-  // so there may be some non-top reductions left, take care of them.
-  result = LiftReductions(result, IterVarsToVars(axis), combined_vranges);
-  return analyzer.Simplify(result, kSimplifyRewriteCanonicalRewrite);
-}
-
-Tensor RemoveJacobianAndLiftNonzeroCond(const Tensor& tensor, const Map<Var, Range>& vranges) {
-  auto transform_func = [&vranges](const PrimExpr& expr, const Array<IterVar>& axis) {
-    return RemoveJacobianAndLiftNonzeroCondImpl(expr, axis, vranges);
-  };
-  return TransformTensorBody(tensor, transform_func);
-}
-
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/autodiff/ad_utils.cc b/src/te/autodiff/ad_utils.cc
deleted file mode 100644
index 0d1e4927cdfe..000000000000
--- a/src/te/autodiff/ad_utils.cc
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file ad_utils.cc
- * \brief Utility for tensor-level auto-differentiation.
- */
-#include "ad_utils.h"
-
-#include <tvm/tir/expr.h>
-#include <tvm/tir/stmt_functor.h>
-
-#include <set>
-#include <string>
-
-#include "../schedule/operation_inline.h"
-
-namespace tvm {
-namespace te {
-
-std::pair<Array<IterVar>, Map<Var, PrimExpr>> CloneIterVars(const Array<IterVar>& vars) {
-  Array<IterVar> new_vars;
-  Map<Var, PrimExpr> vmap;
-  for (const IterVar& iv : vars) {
-    IterVar new_v = IterVar(iv->dom, iv->var.copy_with_suffix(""), iv->iter_type, iv->thread_tag);
-    new_vars.push_back(new_v);
-    vmap.Set(iv->var, new_v->var);
-  }
-  return std::make_pair(std::move(new_vars), std::move(vmap));
-}
-
-PrimExpr CloneReduction(const PrimExpr& expr) {
-  if (const ReduceNode* red = expr.as<ReduceNode>()) {
-    auto [new_axis, vmap] = CloneIterVars(red->axis);
-
-    Array<PrimExpr> src_with_newaxis;
-    for (const auto& src : red->source) {
-      src_with_newaxis.push_back(tir::Substitute(src, vmap));
-    }
-    Array<PrimExpr> init_with_newaxis;
-    for (const auto& init : red->init) {
-      init_with_newaxis.push_back(tir::Substitute(init, vmap));
-    }
-
-    return Reduce(red->combiner, src_with_newaxis, new_axis, tir::Substitute(red->condition, vmap),
-                  red->value_index, init_with_newaxis);
-  } else {
-    return expr;
-  }
-}
-
-Operation ComputeOpFromExprs(const Array<PrimExpr>& exprs, const Array<IterVar>& axis,
-                             const std::string& name, const std::string& tag,
-                             const Map<String, ObjectRef>& attrs, bool clone_axis) {
-  if (clone_axis) {
-    auto [new_axis, vmap] = CloneIterVars(axis);
-    Array<PrimExpr> new_exprs;
-    for (const PrimExpr& e : exprs) {
-      new_exprs.push_back(Substitute(CloneReduction(e), vmap));
-    }
-    return ComputeOpFromExprs(new_exprs, new_axis, name, tag, attrs, false);
-  }
-
-  Array<PrimExpr> new_exprs;
-
-  // If this is a reduction then we have to replicate it
-  if (const ReduceNode* red = exprs[0].as<ReduceNode>()) {
-    for (size_t i = 0; i < red->source.size(); ++i) {
-      PrimExpr ith_red =
-          Reduce(red->combiner, red->source, red->axis, red->condition, i, red->init);
-      new_exprs.push_back(ith_red);
-    }
-  } else {
-    new_exprs = exprs;
-  }
-
-  return ComputeOp(name, tag, attrs, axis, new_exprs);
-}
-
-Tensor TensorFromExpr(const PrimExpr& expr, const Array<IterVar>& axis, const std::string& name,
-                      const std::string& tag, const Map<String, ObjectRef>& attrs,
-                      bool clone_axis) {
-  int new_value_index = 0;
-  if (const ReduceNode* red = expr.as<ReduceNode>()) {
-    new_value_index = red->value_index;
-  }
-  return ComputeOpFromExprs({expr}, axis, name, tag, attrs, clone_axis).output(new_value_index);
-}
-
-Tensor TransformTensorBody(
-    const Tensor& tensor,
-    const std::function<PrimExpr(const PrimExpr&, const Array<IterVar>&)>& func) {
-  if (const ComputeOpNode* op = tensor->op.as<ComputeOpNode>()) {
-    // Transform only one body
-    PrimExpr new_body = func(op->body[tensor->value_index], op->axis);
-
-    // If the body didn't change then we can return the same tensor
-    if (new_body.same_as(op->body[tensor->value_index])) {
-      return tensor;
-    }
-
-    return TensorFromExpr(new_body, op->axis, op->name, op->tag, op->attrs);
-  } else {
-    return tensor;
-  }
-}
-
-Tensor TransformTensorBody(const Tensor& tensor,
-                           const std::function<PrimExpr(const PrimExpr&)>& func) {
-  return TransformTensorBody(tensor,
-                             [func](const PrimExpr& e, const Array<IterVar>&) { return func(e); });
-}
-
-// If expr is a Tensor Access node, perform inlining, otherwise do nothing
-PrimExpr InlineImmediateTensorAccess(const PrimExpr& expr) {
-  if (const ProducerLoadNode* op = expr.as<ProducerLoadNode>()) {
-    auto tensor = Downcast<te::Tensor>(op->producer);
-    if (const ComputeOpNode* op_comp = tensor->op.as<ComputeOpNode>()) {
-      Array<Var> tensor_axes;
-      for (const auto& var : op_comp->axis) {
-        tensor_axes.push_back(var->var);
-      }
-
-      Stmt inlined =
-          Inline(Evaluate(expr), tensor->op, tensor_axes, op_comp->body[tensor->value_index]);
-      if (const EvaluateNode* ev = inlined.as<EvaluateNode>()) {
-        // If it is a reduction, clone it
-        return CloneReduction(ev->value);
-      }
-    }
-  }
-  return expr;
-}
-
-// Implements InlineTensors by trying to inline every Call of the given Expr
-class InlineTensorsMutator : public ExprMutator {
- public:
-  explicit InlineTensorsMutator(const Array<Tensor>& inlineable, bool inline_reductions = false)
-      : inline_reductions_(inline_reductions) {
-    for (const Tensor& tensor : inlineable) {
-      inlineable_.emplace(tensor->op.operator->(), tensor->value_index);
-    }
-  }
-
-  PrimExpr VisitExpr_(const ProducerLoadNode* op) final {
-    auto tensor = Downcast<te::Tensor>(op->producer);
-    if (const ComputeOpNode* op_comp = tensor->op.as<ComputeOpNode>()) {
-      // Inline only if the array of inlineable tensors is empty or contains this tensor
-      if (inlineable_.empty() || inlineable_.count({op_comp, tensor->value_index})) {
-        // Inline only compute nodes that are not reductions (unless inline reductions is allowed)
-        if (inline_reductions_ || !op_comp->body[0].as<ReduceNode>()) {
-          PrimExpr expr = GetRef<PrimExpr>(op);
-          // Inline this tensor access and then try to perform further inlining
-          return VisitExpr(InlineImmediateTensorAccess(expr));
-        }
-      }
-    }
-    // If we cannot inline this call, we should try to do inlining in its arguments
-    return ExprMutator::VisitExpr_(op);
-  }
-
- private:
-  // Tensors which are allowed to be inlined, represented as pairs (op_node, value_index)
-  std::set<std::pair<const OperationNode*, int>> inlineable_;
-  bool inline_reductions_;
-};
-
-Tensor InlineTensorAccess(const Tensor& tensor, const Array<Tensor>& inlineable,
-                          bool inline_reductions) {
-  auto transformation = [inlineable, inline_reductions](const PrimExpr& e) {
-    return InlineTensorsMutator(inlineable, inline_reductions)(e);
-  };
-  return TransformTensorBody(tensor, transformation);
-}
-
-Tensor InlineTailTensorAccess(const Tensor& tensor) {
-  return TransformTensorBody(tensor, InlineImmediateTensorAccess);
-}
-
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/autodiff/ad_utils.h b/src/te/autodiff/ad_utils.h
deleted file mode 100644
index 56070ef27267..000000000000
--- a/src/te/autodiff/ad_utils.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file ad_utils.h
- * \brief Helper utilities to implement auto-differentiation.
- */
-#ifndef TVM_TE_AUTODIFF_AD_UTILS_H_
-#define TVM_TE_AUTODIFF_AD_UTILS_H_
-
-#include <tvm/arith/int_solver.h>
-#include <tvm/te/operation.h>
-#include <tvm/tir/expr.h>
-
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-namespace tvm {
-namespace te {
-
-/*!
- * \brief Clone iter vars and return both the new vars and the substitution from old to new.
- *
- * \param vars The original iter vars.
- * \return A pair containing the array of new iter vars and the map from old vars to new ones.
- */
-std::pair<Array<IterVar>, Map<Var, PrimExpr>> CloneIterVars(const Array<IterVar>& vars);
-
-/*!
- * \brief Clone reduction by cloning the axis variables.
- * \param expr A reduction expr to clone. Non-reduction expressions are left intact.
- */
-PrimExpr CloneReduction(const PrimExpr& expr);
-
-/*!
- * \brief Create a tensor from an expression. The expression may be a reduction, in which
- *  case its body will be correctly duplicated if it is a multi-valued reduction.
- *
- * \param expr The expr which will be the tensor's body.
- * \param axis The input variables with ranges.
- * \param name The tensor's name.
- * \param tag The tensor's tag.
- * \param attrs The tensor's attrs.
- * \param clone_axis Whether to clone the given axis and perform substitution.
- * \return A tensor.
- */
-Tensor TensorFromExpr(const PrimExpr& expr, const Array<IterVar>& axis,
-                      const std::string& name = "tensor", const std::string& tag = "",
-                      const Map<String, ObjectRef>& attrs = {}, bool clone_axis = true);
-
-Tensor TransformTensorBody(
-    const Tensor& tensor,
-    const std::function<PrimExpr(const PrimExpr&, const Array<IterVar>&)>& func);
-
-Tensor TransformTensorBody(const Tensor& tensor,
-                           const std::function<PrimExpr(const PrimExpr&)>& func);
-
-/*!
- * \brief Inline tensors access recursively.
- *
- *  This function will inline tensors recursively until it reaches a tensor which is impossible to
- *  inline (a reduction if \p inline_reductions is false, a non-compute tensor, a tensor which is
- *  not from \p inlineable). It won't descend into non-inlinable tensors' bodies.
- *
- * \param tensor The tensor whose body to transform.
- * \param inlineable A list of tensors which are allowed to be inlined. If empty, try
- *  to inline all tensors.
- * \param inline_reductions Whether to inline reductions (this may result in top-level reduction
- *  nodes).
- *
- * \return An inlined tensor
- */
-TVM_DLL Tensor InlineTensorAccess(const Tensor& tensor,
-                                  const Array<Tensor>& inlineable = Array<Tensor>(),
-                                  bool inline_reductions = false);
-
-/*!
- * \brief Inline tensors access at the tail.
- * \param tensor The tensor whose body to transform.
- * \return An inlined tensor
- */
-TVM_DLL Tensor InlineTailTensorAccess(const Tensor& tensor);
-
-/*!
- * \brief Simplify an iteration domain.
- *
- *  An iteration domain is basically an array of variables and a condition. The function will do the
- *  following:
- *  - Replace div and mod operations with new variables (optional).
- *  - Extract (in)equalities from the condition.
- *  - Perform Fourier-Motzkin elimination.
- *  - Shear the domain of iteration (e.g. if `y <= x <= y + 2` then x will be replaced with `y + d`
- *    where `d` is a new variable such that `0 <= d <= 2`).
- *  - Remove redundant variables.
- *  - Infer new variable ranges (hopefully more precise).
- *
- * \param iter_domains The original domain.
- * \param eliminate_div_mod Whether to eliminate div and mod by introducing new variables.
- */
-TVM_DLL arith::IntConstraintsTransform SimplifyDomain(const arith::IntConstraints& iter_domains,
-                                                      bool eliminate_div_mod = true);
-
-/*!
- * \brief Perform lifting of conditions of being possible to be non-zero together with
- *  applying some transformations like simplifying the reduction domain. Works only with
- *  this particular tensor's body, i.e. doesn't perform inlining.
- *
- * \param tensor The original tensor;
- * \param vranges Optional map from free variables to their value ranges.
- * \return An optimized tensor.
- */
-TVM_DLL Tensor RemoveJacobianAndLiftNonzeroCond(const Tensor& tensor,
-                                                const Map<Var, Range>& vranges = Map<Var, Range>());
-
-}  // namespace te
-}  // namespace tvm
-#endif  // TVM_TE_AUTODIFF_AD_UTILS_H_
diff --git a/src/te/autodiff/adjoint.cc b/src/te/autodiff/adjoint.cc
deleted file mode 100644
index 34d38aa75882..000000000000
--- a/src/te/autodiff/adjoint.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file adjoint.cc
- * \brief Perform reverse-mode autodiff.
- *        Suppose we have f(x) = g(h1(x), h2(x), ..., hn(x)),
- *        df/dx = \sum_i df/dhi * dhi/dx
- *        We call df/dx as adjoint(x), df/dhi as adjoint(hi), dhi/dx is the Jacobian
- *        The idea is to first construct the reverse-dependency {input->outputs} between tensors,
- *        start from one input,
- *        (1) collect adjoints from all its dependencies (outputs),
- *        (2) multiply the Jacobian (PartialAdjoint),
- *        (3) and sum them together to get the adjoint of the input itself.
- *        The three steps are computed recursively.
- */
-#include <tvm/runtime/registry.h>
-#include <tvm/te/autodiff.h>
-#include <tvm/tir/stmt_functor.h>
-#include <tvm/topi/elemwise.h>
-#include <tvm/topi/transform.h>
-
-#include <memory>
-#include <vector>
-
-#include "ad_utils.h"
-
-namespace tvm {
-namespace te {
-
-Tensor Identity(const Tensor& output) {
-  Array<PrimExpr> shape = output->shape;
-  for (auto e : output->shape) {
-    // add extra dimension for Jacobian
-    shape.push_back(e);
-  }
-  auto func = [&output](const Array<Var>& input_indices) {
-    PrimExpr res = const_true();
-    for (size_t i = 0; i < output->shape.size(); ++i) {
-      res =
-          res && (PrimExpr(input_indices[i]) == PrimExpr(input_indices[output->shape.size() + i]));
-    }
-    return Cast(output->dtype, res);
-  };
-  return te::compute(shape, func, "identity");
-}
-
-Tensor VectorJacobianProduct(const Tensor& output, const Tensor& input, const Tensor& head) {
-  Tensor jac = Jacobian(output, input);
-  Tensor result = topi::tensordot(head, jac, /*axes=*/output->shape.size(),
-                                  output->op->name + "." + input->op->name + ".grad");
-  result = InlineTensorAccess(result, {jac}, false);
-  result = RemoveJacobianAndLiftNonzeroCond(result);
-  // inline tail call
-  result = InlineTailTensorAccess(result);
-  return result;
-}
-
-Array<Tensor> Gradient(const Tensor& output, const Array<Tensor>& inputs,
-                       const Tensor& head_or_null) {
-  // Diagonal identity tensor
-  Tensor head = head_or_null.get() ? head_or_null : Identity(output);
-
-  // This Map{input -> outputs} maps a tensor to the list of tensors
-  // immediately depending on it (using it in their bodies)
-  std::unordered_map<Tensor, std::vector<Tensor>> reverse_dependencies;
-  std::vector<Tensor> stack({output});
-  while (!stack.empty()) {
-    Tensor tensor = stack.back();
-    stack.pop_back();
-    for (const Tensor& input : tensor->op->InputTensors()) {
-      if (!reverse_dependencies.count(input)) {
-        stack.push_back(input);
-      }
-      reverse_dependencies[input].push_back(tensor);
-    }
-  }
-
-  // This map maps tensors to the corresponding adjoints (dLoss/dTensor)
-  std::unordered_map<Tensor, Tensor> adjoints;
-  // head is the adjoint of output by definition
-  adjoints[output] = head;
-
-  // This is a recursive function that does all the work. It computes the adjoint for a given
-  // tensor, adds it to the map, and returns it
-  std::function<Tensor(const Tensor&)> compute_adjoint;
-  compute_adjoint = [&compute_adjoint, &adjoints, &reverse_dependencies, &head,
-                     &output](const Tensor& tensor) {
-    if (!adjoints.count(tensor)) {
-      // Here the adjoint hasn't been computed yet
-      Tensor res_adjoint;
-      std::vector<Tensor> direct_consumers = reverse_dependencies[tensor];
-      if (direct_consumers.empty()) {
-        // No reverse dependencies means that the output does not depend on this tensor,
-        // return a zero tensor of the appropriate shape
-        // (i.e., output shape + tensor shape, aka shape of Jacobian)
-        Array<PrimExpr> result_shape(head->shape.begin(), head->shape.end() - output->shape.size());
-        for (auto e : tensor->shape) {
-          result_shape.push_back(e);
-        }
-        res_adjoint = topi::full(result_shape, output->dtype, make_zero(output->dtype));
-      } else {
-        // The new adjoint is computed as a sum of the reverse dependencies' adjoints multiplied
-        // by the corresponding "local" jacobians (dDep/dTensor). The computation of the jacobian
-        // and the multiplication is done in the function VectorJacobianProduct
-        for (const Tensor& direct_consumer : direct_consumers) {
-          // part = (adjoint of direct_consumer) * Jacobian(direct_consumer, tensor)
-          Tensor part =
-              VectorJacobianProduct(direct_consumer, tensor, compute_adjoint(direct_consumer));
-          res_adjoint = res_adjoint.get() ? topi::add(res_adjoint, part) : part;
-        }
-      }
-
-      adjoints[tensor] = res_adjoint;
-      return res_adjoint;
-    } else {
-      return adjoints[tensor];
-    }
-  };
-
-  // Adjoints corresponding to inputs
-  Array<Tensor> result;
-  // Compute an adjoint for each input
-  for (const Tensor& input : inputs) {
-    result.push_back(compute_adjoint(input));
-  }
-
-  return result;
-}
-
-TVM_REGISTER_GLOBAL("te.Gradient").set_body([](TVMArgs args, TVMRetValue* ret) {
-  LOG(WARNING) << "te.Gradient is an experimental feature.";
-  if (args.size() == 2) {
-    *ret = Gradient(args[0], args[1]);
-  } else if (args.size() == 3) {
-    *ret = Gradient(args[0], args[1], args[2]);
-  }
-});
-
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/autodiff/jacobian.cc b/src/te/autodiff/jacobian.cc
deleted file mode 100644
index 78788dbe1a0c..000000000000
--- a/src/te/autodiff/jacobian.cc
+++ /dev/null
@@ -1,365 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file jacobian.cc
- * \brief Calculate Jacobian of two tensors dY/dX.
- *        X must be direct input tensor of Y.
- *        The result Jacobian shape will be (Y.shape, X.shape)
- */
-#include <tvm/arith/analyzer.h>
-#include <tvm/runtime/registry.h>
-#include <tvm/te/autodiff.h>
-#include <tvm/tir/stmt_functor.h>
-
-#include <memory>
-
-#include "ad_utils.h"
-
-namespace tvm {
-namespace te {
-
-#define NOT_IMPLEMENTED                                                                   \
-  {                                                                                       \
-    LOG(FATAL) << "Derivative of this expr is not implemented: " << GetRef<PrimExpr>(op); \
-    throw;                                                                                \
-  }
-
-/*! \brief Differentiate an expression wrt a variable or a tensor element */
-class JacobianMutator : public ExprMutator {
- public:
-  /*!
-   * \brief Differentiate wrt `input(indices)`.
-   * \param input The input tensor.
-   * \param indices The indices of the element with respect to which to differentiate.
-   */
-  explicit JacobianMutator(Tensor input, Array<PrimExpr> indices)
-      : input_(input), indices_(indices) {}
-  /*!
-   * \brief Differentiate wrt the input variable.
-   * \param input The input variable.
-   */
-  explicit JacobianMutator(Var input) : input_var_(input) {}
-
-  PrimExpr Mutate(PrimExpr e) {
-    if (e.dtype().is_int() || e.dtype().is_uint()) {
-      LOG(WARNING) << "For now we assume that the derivative of any integer expression is always 0."
-                   << " e = " << e;
-      return make_zero(e.dtype());
-    } else {
-      return ExprMutator::VisitExpr(e);
-    }
-  }
-
-  PrimExpr VisitExpr_(const VarNode* op) {
-    if (input_var_.get() && input_var_.get() == op && op->dtype.is_float()) {
-      return FloatImm(op->dtype, 1.0);
-    } else {
-      return make_zero(op->dtype);
-    }
-  }
-
-  PrimExpr VisitExpr_(const LetNode* op) NOT_IMPLEMENTED;
-
-  PrimExpr VisitExpr_(const ProducerLoadNode* op) final {
-    auto tensor = Downcast<te::Tensor>(op->producer);
-    if (input_.get() && tensor == input_) {
-      // Tensor(indices)
-      ICHECK_EQ(indices_.size(), op->indices.size());
-      PrimExpr condition = const_true();
-      for (size_t i = 0; i < input_.ndim(); ++i) {
-        condition = And(condition, EQ(indices_[i], op->indices[i]));
-      }
-      return Cast(op->dtype, condition);
-    } else {
-      return make_zero(op->dtype);
-    }
-  }
-
-  PrimExpr VisitExpr_(const CallNode* op) {
-    PrimExpr expr = GetRef<PrimExpr>(op);
-    if (op->op.same_as(op_exp_)) {
-      return Mul(Mutate(op->args[0]), expr);
-    } else if (op->op.same_as(op_log_)) {
-      return Div(Mutate(op->args[0]), op->args[0]);
-    } else if (op->op.same_as(op_sigmoid_)) {
-      return Mul(Mutate(op->args[0]), Mul(expr, Sub(FloatImm(expr.dtype(), 1.0), expr)));
-    } else if (op->op.same_as(op_sqrt_)) {
-      return Div(Mutate(op->args[0]), Mul(expr, FloatImm(expr.dtype(), 2.0)));
-    } else if (op->op.same_as(op_tanh_)) {
-      return Mul(Mutate(op->args[0]), Sub(FloatImm(expr.dtype(), 1.0), Mul(expr, expr)));
-    } else if (op->op.same_as(op_pow_)) {
-      auto x = op->args[0], y = op->args[1];
-      return expr * (Mutate(y) * log(x) + Mutate(x) * y / x);
-    } else if (op->op.same_as(op_fabs_)) {
-      auto type = op->args[0].dtype();
-      return Mul(Mutate(op->args[0]), Select(GE(op->args[0], make_zero(type)), FloatImm(type, 1.0),
-                                             FloatImm(type, -1.0)));
-    } else if (op->op.same_as(op_if_then_else_)) {
-      Array<PrimExpr> new_args = {op->args[0], Mutate(op->args[1]), Mutate(op->args[2])};
-      return Call(op->dtype, op->op, new_args);
-    } else if (piecewise_const.count(op->op)) {
-      return FloatImm(expr.dtype(), 0.0);
-    } else {
-      LOG(FATAL) << "Derivative of this intrinsic is not implemented: " << op->op;
-    }
-  }
-
-  PrimExpr VisitExpr_(const AddNode* op) { return Add(Mutate(op->a), Mutate(op->b)); }
-
-  PrimExpr VisitExpr_(const SubNode* op) { return Sub(Mutate(op->a), Mutate(op->b)); }
-
-  PrimExpr VisitExpr_(const MulNode* op) {
-    return Add(Mul(Mutate(op->a), op->b), Mul(op->a, Mutate(op->b)));
-  }
-
-  PrimExpr VisitExpr_(const DivNode* op) {
-    return Div(Sub(Mul(Mutate(op->a), op->b), Mul(op->a, Mutate(op->b))), Mul(op->b, op->b));
-  }
-
-  PrimExpr VisitExpr_(const ModNode* op) NOT_IMPLEMENTED;
-
-  PrimExpr VisitExpr_(const FloorDivNode* op) {
-    return FloorDiv(Sub(Mul(Mutate(op->a), op->b), Mul(op->a, Mutate(op->b))), Mul(op->b, op->b));
-  }
-
-  PrimExpr VisitExpr_(const FloorModNode* op) NOT_IMPLEMENTED;
-
-  PrimExpr VisitExpr_(const MinNode* op) {
-    return Select(LE(op->a, op->b), Mutate(op->a), Mutate(op->b));
-  }
-
-  PrimExpr VisitExpr_(const MaxNode* op) {
-    return Select(GE(op->a, op->b), Mutate(op->a), Mutate(op->b));
-  }
-
-  PrimExpr VisitExpr_(const EQNode* op) NOT_IMPLEMENTED;
-  PrimExpr VisitExpr_(const NENode* op) NOT_IMPLEMENTED;
-  PrimExpr VisitExpr_(const LTNode* op) NOT_IMPLEMENTED;
-  PrimExpr VisitExpr_(const LENode* op) NOT_IMPLEMENTED;
-  PrimExpr VisitExpr_(const GTNode* op) NOT_IMPLEMENTED;
-  PrimExpr VisitExpr_(const GENode* op) NOT_IMPLEMENTED;
-  PrimExpr VisitExpr_(const AndNode* op) NOT_IMPLEMENTED;
-  PrimExpr VisitExpr_(const OrNode* op) NOT_IMPLEMENTED;
-
-  PrimExpr VisitExpr_(const ReduceNode* op) {
-    // This case is relatively difficult because a reduction expression
-    // may use an arbitrary combiner.
-    // The resulting reduction expression will return a tuple containing
-    // both derivatives and the original results (in exactly this order).
-    // The order matters when original init value is different from its derivative init value,
-    // and they depend on each other during gradient calculation,
-    // we must calculate derivatives first (using origin's init value),
-    // switching the order (original results first, then derivatives)
-    // makes the origin value be replaced before using,
-    // produces incorrect results.
-
-    // Example of a ReduceNode,
-    // reduce(combiner=comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0f]),
-    //   source=[A(k)], axis=[iter_var(k, range(min=0, ext=5))], where=(bool)1, value_index=0)
-
-    // We have to clone the reduction axes because otherwise the original expression
-    // cannot be used together with the derivative (it will lead to errors during lowering)
-    PrimExpr expr_with_new_axes = te::CloneReduction(GetRef<PrimExpr>(op));
-    const ReduceNode* new_op = expr_with_new_axes.as<ReduceNode>();
-
-    ICHECK(new_op->init.empty())
-        << "Derivative of Reduction with initialization is not implemented";
-
-    // New lhs and rhs variables of the new combiner consist of
-    // variables representing derivatives (which are later derived from new_op->source)
-    // followed by the original variables.
-    Array<Var> new_lhs;
-    for (const auto& var : new_op->combiner->lhs) {
-      new_lhs.push_back(var.copy_with_suffix(".jac"));
-    }
-    for (const auto& var : new_op->combiner->lhs) {
-      new_lhs.push_back(var);
-    }
-
-    Array<Var> new_rhs;
-    for (const auto& var : new_op->combiner->rhs) {
-      new_rhs.push_back(var.copy_with_suffix(".jac"));
-    }
-    for (const auto& var : new_op->combiner->rhs) {
-      new_rhs.push_back(var);
-    }
-
-    // The new combiner result also consists of the resulting derivatives
-    // followed by the original results.
-    Array<PrimExpr> new_result;
-    for (const auto& res : new_op->combiner->result) {
-      // Each resulting derivative is computed as a sum of derivatives
-      // wrt lhs and rhs multiplied by the derivatives of lhs and rhs
-      PrimExpr new_res = make_zero(res.dtype());
-      for (size_t i = 0; i < new_op->combiner->lhs.size(); ++i) {
-        PrimExpr res_di = Derivative(res, new_op->combiner->lhs[i]);
-        // new_lhs[i] is the derivative of lhs[i] (wrt our input tensor)
-        new_res = Add(new_res, Mul(new_lhs[i], res_di));
-      }
-      for (size_t i = 0; i < new_op->combiner->rhs.size(); ++i) {
-        PrimExpr res_di = Derivative(res, new_op->combiner->rhs[i]);
-        // new_rhs[i] is the derivative of rhs[i] (wrt our input tensor)
-        new_res = Add(new_res, Mul(new_rhs[i], res_di));
-      }
-      new_result.push_back(new_res);
-    }
-    // add original results
-    for (const auto& res : new_op->combiner->result) {
-      new_result.push_back(res);
-    }
-
-    // The identity is transformed in a similar way
-    Array<PrimExpr> new_identity;
-    for (const auto& id : new_op->combiner->identity_element) {
-      new_identity.push_back(Mutate(id));
-    }
-    for (const auto& id : new_op->combiner->identity_element) {
-      new_identity.push_back(id);
-    }
-
-    // Same as source
-    Array<PrimExpr> new_source;
-    for (const auto& src : new_op->source) {
-      new_source.push_back(Mutate(src));
-    }
-    for (const auto& src : new_op->source) {
-      new_source.push_back(src);
-    }
-
-    CommReducer new_combiner = CommReducer(new_lhs, new_rhs, new_result, new_identity);
-    // Also simplify the resulting combiner
-    // (mostly to get rid of unused components, e.g., the original expressions)
-    return analyzer_.Simplify(Reduce(new_combiner, new_source, new_op->axis, new_op->condition,
-                                     new_op->value_index, new_op->init));
-  }
-
-  PrimExpr VisitExpr_(const CastNode* op) {
-    if (op->dtype.is_float()) {
-      return Cast(op->dtype, Mutate(op->value));
-    } else {
-      return make_zero(op->dtype);
-    }
-  }
-
-  PrimExpr VisitExpr_(const NotNode* op) NOT_IMPLEMENTED;
-
-  PrimExpr VisitExpr_(const SelectNode* op) {
-    return Select(op->condition, Mutate(op->true_value), Mutate(op->false_value));
-  }
-
-  PrimExpr VisitExpr_(const RampNode* op) NOT_IMPLEMENTED;
-  PrimExpr VisitExpr_(const BroadcastNode* op) NOT_IMPLEMENTED;
-  PrimExpr VisitExpr_(const ShuffleNode* op) NOT_IMPLEMENTED;
-
-  PrimExpr VisitExpr_(const IntImmNode* op) { return IntImm(op->dtype, 0); }
-
-  PrimExpr VisitExpr_(const FloatImmNode* op) { return FloatImm(op->dtype, 0); }
-
-  PrimExpr VisitExpr_(const StringImmNode* op) NOT_IMPLEMENTED;
-
- private:
-  Tensor input_;
-  Array<PrimExpr> indices_;
-  Var input_var_;
-  arith::Analyzer analyzer_;
-
-  const Op& op_exp_ = Op::Get("tir.exp");
-  const Op& op_log_ = Op::Get("tir.log");
-  const Op& op_sigmoid_ = Op::Get("tir.sigmoid");
-  const Op& op_sqrt_ = Op::Get("tir.sqrt");
-  const Op& op_tanh_ = Op::Get("tir.tanh");
-  const Op& op_pow_ = Op::Get("tir.pow");
-  const Op& op_fabs_ = Op::Get("tir.fabs");
-  const Op& op_if_then_else_ = Op::Get("tir.if_then_else");
-  std::unordered_set<RelayExpr, ObjectPtrHash, ObjectPtrEqual> piecewise_const = {
-      Op::Get("tir.floor"), Op::Get("tir.ceil"), Op::Get("tir.trunc"), Op::Get("tir.round")};
-};
-
-PrimExpr Derivative(const PrimExpr& expr, const Var& var) {
-  return JacobianMutator(var).Mutate(expr);
-}
-
-PrimExpr Jacobian(const PrimExpr& expr, const Tensor& input, const Array<PrimExpr>& indices) {
-  return JacobianMutator(input, indices).Mutate(expr);
-}
-
-Tensor Jacobian(const Tensor& output, const Tensor& input) {
-  const ComputeOpNode* op = output->op.as<ComputeOpNode>();
-  ICHECK(op) << "Derivative of this operation is not implemented: " << output->op;
-  bool is_input_tensor = false;
-  for (const Tensor& child : op->InputTensors()) {
-    if (input == child) {
-      is_input_tensor = true;
-      break;
-    }
-  }
-  ICHECK(is_input_tensor) << "Jacobian is called on a pair of tensors such that the output "
-                          << "does not directly depend on the input.";
-
-  // We have to clone the iteration axes because otherwise the original expression
-  // cannot be used together with the derivative (it will lead to errors during lowering)
-  auto [new_axis, vmap] = te::CloneIterVars(op->axis);
-
-  Array<PrimExpr> input_indices;
-  size_t i = 0;
-  for (PrimExpr ext : input->shape) {
-    IterVar new_v =
-        IterVar(Range(0, ext), Var("jac_i" + std::to_string(i++)), IterVarType::kDataPar);
-    // Append jacobian iter to new_axis
-    new_axis.push_back(new_v);
-    // Differentiate wrt input[input_indices]
-    input_indices.push_back(new_v);
-  }
-  arith::Analyzer analzyer;
-  // Compute Jacobian
-  PrimExpr new_body =
-      Jacobian(Substitute(op->body[output->value_index], vmap), input, input_indices);
-  new_body = analzyer.Simplify(new_body);
-
-  int value_index = 0;
-  Array<PrimExpr> new_bodies;
-
-  // If this is a reduction then it may return a tuple and we have
-  // to repeat the body several times
-  if (const ReduceNode* red = new_body.as<ReduceNode>()) {
-    value_index = red->value_index;
-    for (size_t idx = 0; idx < red->source.size(); ++idx) {
-      new_bodies.push_back(
-          Reduce(red->combiner, red->source, red->axis, red->condition, idx, red->init));
-    }
-  } else {
-    new_bodies.push_back(new_body);
-  }
-
-  auto new_op = ComputeOp(op->name + ".jacobian", op->tag, op->attrs, new_axis, new_bodies);
-
-  // Jacobian shape = output.shape + input.shape
-  Array<PrimExpr> new_shape = output->shape;
-  for (const auto& e : input->shape) {
-    new_shape.push_back(e);
-  }
-
-  Tensor ret = Tensor(new_shape, output->dtype, new_op, value_index);
-  ret = RemoveJacobianAndLiftNonzeroCond(ret);
-  return ret;
-}
-
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/operation/compute_op.cc b/src/te/operation/compute_op.cc
index fb839c28da96..48d10e97a0ac 100644
--- a/src/te/operation/compute_op.cc
+++ b/src/te/operation/compute_op.cc
@@ -21,7 +21,6 @@
  * \brief Compute Op.
  * \file compute_op.cc
  */
-#include "compute_op.h"
 
 #include <tvm/arith/analyzer.h>
 #include <tvm/runtime/registry.h>
@@ -35,10 +34,6 @@
 #include <unordered_set>
 #include <utility>
 
-#include "../../arith/interval_set.h"
-#include "../schedule/message_passing.h"
-#include "op_utils.h"
-
 namespace tvm {
 namespace te {
 using namespace tir;
@@ -79,15 +74,6 @@ static inline void AssertReduceEqual(const tir::ReduceNode* a, const tir::Reduce
 
 int ComputeOpNode::num_outputs() const { return body.size(); }
 
-Array<IterVar> BaseComputeOpNode::root_iter_vars() const {
-  if (reduce_axis.size() == 0) return axis;
-  Array<IterVar> ret = axis;
-  for (IterVar iv : reduce_axis) {
-    ret.push_back(iv);
-  }
-  return ret;
-}
-
 DataType ComputeOpNode::output_dtype(size_t idx) const {
   ICHECK_LT(idx, num_outputs());
   return body[idx].dtype();
@@ -185,333 +171,8 @@ Array<Tensor> ComputeOpNode::InputTensors() const {
   return ret;
 }
 
-Operation ComputeOpNode::ReplaceInputs(const Operation& self,
-                                       const std::unordered_map<Tensor, Tensor>& rmap) const {
-  ICHECK_EQ(self.operator->(), this);
-  VerifyComputeOp(this);
-  Array<PrimExpr> arr;
-  if (this->body[0]->IsInstance<tir::ReduceNode>()) {
-    // Specially handle reduce so the replaced op
-    // still share all the components
-    PrimExpr new_reduce = te::ReplaceTensor(this->body[0], rmap);
-    if (!new_reduce.same_as(this->body[0])) {
-      const tir::ReduceNode* r = new_reduce.as<tir::ReduceNode>();
-      for (size_t k = 0; k < this->body.size(); ++k) {
-        auto n = make_object<tir::ReduceNode>(*r);
-        n->value_index = static_cast<int>(k);
-        n->dtype = r->source[k].dtype();
-        arr.push_back(PrimExpr(n));
-      }
-    } else {
-      arr = this->body;
-    }
-  } else {
-    arr =
-        UpdateArray(this->body, [&rmap](const PrimExpr& e) { return te::ReplaceTensor(e, rmap); });
-  }
-  if (!arr.same_as(this->body)) {
-    return ComputeOp(this->name, this->tag, this->attrs, this->axis, arr);
-  } else {
-    return self;
-  }
-}
-
-void ComputeOpNode::PropBoundToInputs(const Operation& self, arith::Analyzer* analyzer,
-                                      const std::unordered_map<const VarNode*, IntSet>& dom_map,
-                                      std::unordered_map<Tensor, TensorDom>* out_dom_map) const {
-  ICHECK_EQ(self.operator->(), this);
-  auto fvisit = [&dom_map, out_dom_map, analyzer](const ObjectRef& n) {
-    if (auto* pload = n.as<tir::ProducerLoadNode>()) {
-      Tensor t = Downcast<Tensor>(pload->producer);
-      if (t->op.defined() && out_dom_map->count(t)) {
-        TensorDom& dom = out_dom_map->at(t);
-        for (size_t i = 0; i < t.ndim(); ++i) {
-          // We assume that the value of the argument cannot be out of bounds (otherwise it is
-          // undefined behaviour), so we can intersect the estimated set of the argument with the
-          // range expected by the tensor. However, intersection may result in overly complex
-          // expressions, so we perform a more relaxed form of intersection.
-          IntSet arg_intset = analyzer->int_set(pload->indices[i], ConvertDomMap(dom_map));
-          const arith::IntervalSetNode* arg_interval = arg_intset.as<arith::IntervalSetNode>();
-          if (arg_interval) {
-            PrimExpr shape_i_min_value = make_zero(t->shape[i].dtype());
-            PrimExpr shape_i_max_value = t->shape[i] - 1;
-            PrimExpr min_value = arg_interval->min_value;
-            PrimExpr max_value = arg_interval->max_value;
-            // Prefer the shape bounds only when we can prove they are tighter.
-            // We must update bound's ends in pairs.  Here is an counter example: shape_i is
-            // [0, 0] and arg_interval is [threadIdx.y, threadIdx.y], where threadIdx.y's range is
-            // [0, 7]. If we allowed updating one end, the bound would become [threadIdx.y, 0],
-            // awkward for further analysis.
-            if ((arith::is_pos_inf(max_value) && arith::is_neg_inf(min_value)) ||
-                (analyzer->CanProve(shape_i_min_value >= min_value) &&
-                 analyzer->CanProve(shape_i_max_value <= max_value))) {
-              min_value = shape_i_min_value;
-              max_value = shape_i_max_value;
-            }
-            dom.data[i].push_back(IntSet::Interval(min_value, max_value));
-          } else {
-            dom.data[i].push_back(arg_intset);
-          }
-        }
-      }
-    }
-  };
-  for (auto& e : body) tir::PostOrderVisit(e, fvisit);
-}
-
-void BaseComputeOpNode::GatherBound(const Operation& self,
-                                    const std::unordered_map<Tensor, TensorDom>& tensor_dom,
-                                    std::unordered_map<IterVar, Range>* out_dom_map) const {
-  ICHECK_EQ(self.operator->(), this);
-  const TensorDom& tdom = tensor_dom.at(self.output(0));
-  for (size_t i = 0; i < this->axis.size(); ++i) {
-    Range r = arith::Union(tdom.data.at(i)).CoverRange(this->axis[i]->dom);
-    ICHECK(!out_dom_map->count(this->axis[i]));
-    (*out_dom_map)[this->axis[i]] = r;
-  }
-  for (size_t i = 0; i < this->reduce_axis.size(); ++i) {
-    ICHECK(!out_dom_map->count(this->reduce_axis[i]));
-    (*out_dom_map)[this->reduce_axis[i]] = this->reduce_axis[i]->dom;
-  }
-}
-
-Stmt BaseComputeOpNode::BuildRealize(const Stage& stage,
-                                     const std::unordered_map<IterVar, Range>& realize_map,
-                                     const Stmt& body, String storage_scope) const {
-  ICHECK_EQ(stage->op.get(), this);
-  Region bounds;
-  for (IterVar iv : this->axis) {
-    bounds.push_back(realize_map.at(iv));
-  }
-  Stmt realize = body;
-  for (int i = this->num_outputs(); i > 0; --i) {
-    Tensor t = stage->op.output(i - 1);
-    realize = tir::ProducerRealize(t, bounds, const_true(), realize, storage_scope);
-    // alignment requirement, only useful for compute
-    for (size_t i = 0; i < num_schedulable_dims(); ++i) {
-      auto it = stage->iter_var_attrs.find(this->axis[i]);
-      if (it != stage->iter_var_attrs.end()) {
-        IterVarAttr attr = (*it).second;
-        if (attr->dim_align_factor != 0) {
-          Array<PrimExpr> tuple = {static_cast<int>(i), attr->dim_align_factor,
-                                   attr->dim_align_offset};
-          realize =
-              tir::AttrStmt(t, tir::attr::buffer_dim_align,
-                            Call(DataType::Handle(), tir::builtin::tvm_tuple(), tuple), realize);
-        }
-      }
-    }
-  }
-  return realize;
-}
-
-size_t ComputeOpNode::num_schedulable_dims() const { return axis.size(); }
-
-// Build a reduction body.
-void MakeReduction(const ComputeOpNode* op, const Array<Tensor>& tensors, Stmt* init,
-                   Stmt* provide) {
-  Array<PrimExpr> args;
-  for (IterVar iv : op->axis) {
-    args.push_back(iv->var);
-  }
-  std::vector<Stmt> inits, provides;
-
-  size_t size = op->body.size();
-  const ReduceNode* reduce = op->body[0].as<ReduceNode>();
-  ICHECK(reduce);
-  const CommReducerNode* combiner = reduce->combiner.as<CommReducerNode>();
-  ICHECK(combiner);
-  Array<PrimExpr> lhs;
-  for (size_t i = 0; i < size; ++i) {
-    lhs.push_back(tensors[i](args));
-  }
-  Array<PrimExpr> init_value = combiner->identity_element;
-  Array<PrimExpr> update_value = (*combiner)(lhs, reduce->source);
-
-  // If an init was passed to ReduceNode, use that for initialization
-  // instead of combiner->identity_element
-  Array<PrimExpr> reduce_init = reduce->init;
-  if (!reduce_init.empty()) {
-    init_value = reduce_init;
-  }
-  for (size_t i = 0; i < size; ++i) {
-    Tensor t = tensors[i];
-    inits.emplace_back(ProducerStore(t, init_value[i], args));
-    provides.emplace_back(ProducerStore(t, update_value[i], args));
-  }
-  *init = SeqStmt::Flatten(inits);
-  *provide = SeqStmt::Flatten(provides);
-  if (!is_one(reduce->condition)) {
-    *provide = IfThenElse(reduce->condition, *provide);
-  }
-}
-
-// Normal computation.
-Stmt MakeProvide(const ComputeOpNode* op, const Tensor& t) {
-  Array<PrimExpr> args;
-  for (IterVar iv : op->axis) {
-    args.push_back(iv->var);
-  }
-  return ProducerStore(t, op->body[t->value_index], args);
-}
-
-Stmt MakeComputeStmt(const ComputeOpNode* self, const Stage& stage,
-                     const std::unordered_map<IterVar, Range>& dom_map,
-                     bool debug_keep_trivial_loop) {
-  // grab the nest structure
-  ComputeLoopNest n = ComputeLoopNest::Create(self, stage, dom_map, debug_keep_trivial_loop);
-  // Normal loop structure
-  n.init_nest.emplace_back(MakeIfNest(n.init_predicates));
-  n.main_nest.emplace_back(MakeIfNest(n.main_predicates));
-  if (self->reduce_axis.size() != 0) {
-    // make reduction.
-    Stmt init, provide;
-    Array<Tensor> source;
-    for (size_t i = 0; i < self->body.size(); ++i) {
-      source.push_back(stage->op.output(i));
-    }
-    MakeReduction(self, source, &init, &provide);
-    init = MergeNest(n.init_nest, init);
-    init = Substitute(init, n.init_vmap);
-    // common nest
-    std::vector<std::vector<Stmt>> common(n.main_nest.begin(),
-                                          n.main_nest.begin() + n.num_common_loop + 1);
-    std::vector<std::vector<Stmt>> reduce(n.main_nest.begin() + n.num_common_loop + 1,
-                                          n.main_nest.end());
-    provide = MergeNest(reduce, provide);
-    if (debug_keep_trivial_loop) {
-      provide = MergeNest(common, provide);
-    } else {
-      provide = MergeNest(common, SeqStmt::Flatten(init, provide));
-    }
-    // run substitution in the on the full nest, because  loop condition
-    // could depend on outer loops.
-    return Substitute(provide, n.main_vmap);
-  } else {
-    std::vector<Stmt> provides;
-    for (size_t i = 0; i < self->body.size(); ++i) {
-      provides.emplace_back(MakeProvide(self, stage->op.output(i)));
-    }
-    Stmt provide = SeqStmt::Flatten(provides);
-    provide = MergeNest(n.main_nest, provide);
-    // run substitution in the on the full nest, because  loop condition
-    // could depend on outer loops.
-    return Substitute(provide, n.main_vmap);
-  }
-}
-
 enum class ComputeType { kNormal, kCrossThreadReduction, kTensorize };
 
-ComputeType DetectComputeType(const ComputeOpNode* self, const Stage& stage) {
-  // Verify correctness of leaf nest.
-  int thread_red = 0, tensorize = 0;
-
-  for (IterVar iv : stage->leaf_iter_vars) {
-    IterVarAttr attr;
-    auto it = stage->iter_var_attrs.find(iv);
-    if (it != stage->iter_var_attrs.end()) {
-      attr = (*it).second;
-    }
-    if (attr.defined() && attr->iter_type == kTensorized) {
-      ++tensorize;
-    }
-    if (iv->iter_type == kCommReduce) {
-      if (attr.defined() && attr->bind_thread.defined()) {
-        ++thread_red;
-      }
-    } else {
-      ICHECK_EQ(thread_red, 0) << "Cross thread reduce cannot swap with normal data axis";
-    }
-  }
-  if (tensorize != 0) {
-    ICHECK(thread_red == 0) << "Cannot mix cross thread reduction with Tensorize";
-    return ComputeType::kTensorize;
-  }
-  if (thread_red != 0) {
-    return ComputeType::kCrossThreadReduction;
-  } else {
-    return ComputeType::kNormal;
-  }
-}
-
-// implement the provide utility.
-Stmt ComputeOpNode::BuildProvide(const Stage& stage,
-                                 const std::unordered_map<IterVar, Range>& dom_map,
-                                 bool debug_keep_trivial_loop) const {
-  ICHECK_EQ(stage->op.operator->(), this);
-  ComputeType ctype = DetectComputeType(this, stage);
-  if (ctype == ComputeType::kCrossThreadReduction) {
-    // specially handle cross thread reduction.
-    return MakeCrossThreadReduction(this, stage, dom_map, debug_keep_trivial_loop);
-  } else if (ctype == ComputeType::kTensorize) {
-    return MakeTensorize(this, stage, dom_map, debug_keep_trivial_loop);
-  } else {
-    return MakeComputeStmt(this, stage, dom_map, debug_keep_trivial_loop);
-  }
-}
-
-ComputeLoopNest ComputeLoopNest::Create(const BaseComputeOpNode* self, const Stage& stage,
-                                        const std::unordered_map<IterVar, Range>& dom_map,
-                                        bool debug_keep_trivial_loop) {
-  ICHECK_EQ(stage->op.operator->(), self);
-  ComputeLoopNest ret;
-  // make main loop nest
-  ret.main_nest = MakeLoopNest(stage, dom_map, 0, false, std::unordered_set<IterVar>(),
-                               &ret.main_vmap, debug_keep_trivial_loop);
-  ret.main_predicates =
-      MakeBoundCheck(stage, dom_map, ret.main_vmap, false, std::unordered_set<IterVar>());
-  for (auto& e : ret.main_predicates) {
-    e = likely(e);
-  }
-  if (stage->store_predicate.defined()) {
-    ret.main_predicates.push_back(stage->store_predicate);
-  }
-  if (self->reduce_axis.size() != 0) {
-    // try to find the location to insert the initialization.
-    // Fuse the initialization and provide loop when possible.
-    std::unordered_map<IterVar, int> update_state;
-    for (IterVar iv : self->reduce_axis) {
-      update_state[iv] = 2;
-    }
-    for (size_t i = 0; i < self->num_schedulable_dims(); ++i) {
-      update_state[self->axis[i]] = 1;
-    }
-    // find which iter var is related to reduction and which is related to axis.
-    te::PassDownBitMaskOr(stage, &update_state);
-    auto leaf_iter_vars = stage->leaf_iter_vars;
-    // first first loop that is related to reduction.
-    size_t begin_loop = leaf_iter_vars.size();
-    for (size_t i = 0; i < leaf_iter_vars.size(); ++i) {
-      auto iv = leaf_iter_vars[i];
-      int flag = update_state.at(iv);
-      if ((flag & 2) != 0) {
-        begin_loop = i;
-        break;
-      }
-      ret.init_vmap[iv] = ret.main_vmap.at(iv);
-    }
-    ret.num_common_loop = begin_loop;
-    // skip loops that are related to reduction and are unrelated to axis.
-    std::unordered_set<IterVar> skip_iter;
-    for (auto kv : update_state) {
-      int flag = kv.second;
-      if (flag == 2) skip_iter.insert(kv.first);
-    }
-    ret.init_nest = MakeLoopNest(stage, dom_map, begin_loop, true, skip_iter, &(ret.init_vmap),
-                                 debug_keep_trivial_loop);
-    ret.init_predicates =
-        MakeBoundCheck(stage, dom_map, ret.init_vmap, !stage->rolling_buffer, skip_iter);
-    for (auto& e : ret.init_predicates) {
-      e = likely(e);
-    }
-  } else {
-    ICHECK_EQ(ret.main_nest.size(), stage->leaf_iter_vars.size() + 1);
-    ret.num_common_loop = stage->leaf_iter_vars.size();
-  }
-  // copy elison here.
-  return ret;
-}
-
 namespace {
 /*!
  * \brief Verify if ComputeOp is valid with respect to Reduce operations.
@@ -581,41 +242,5 @@ static void VerifyComputeOp(const ComputeOpNode* op) {
   v.Run();
 }
 
-Stmt TransformUpdate(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                     const ComputeLoopNest& n, Stmt body, Stmt update) {
-  Array<PrimExpr> conds;
-  std::unordered_set<const VarNode*> banned;
-  for (size_t i = 0; i < stage->leaf_iter_vars.size(); ++i) {
-    IterVar iv = stage->leaf_iter_vars[i];
-    auto iit = stage->iter_var_attrs.find(iv);
-    if (iit != stage->iter_var_attrs.end()) {
-      const IterVarAttr& attr = (*iit).second;
-      if (attr->iter_type == kTensorized) {
-        break;
-      }
-    }
-    if (iv->iter_type == kCommReduce) {
-      auto vit = dom_map.find(iv);
-      ICHECK(vit != dom_map.end());
-      const Range& vrange = vit->second;
-      conds.push_back(likely(iv->var > vrange->min));
-      banned.insert(iv->var.get());
-    }
-  }
-
-  auto fbanned = [&](const VarNode* node) { return banned.count(node); };
-
-  for (const PrimExpr& pred : n.main_predicates) {
-    if (tir::UsesVar(pred, fbanned)) {
-      LOG(FATAL) << "Tensorize update transform failed, the condition " << pred
-                 << " has a conflict with the reset condition";
-    }
-  }
-
-  auto cond = foldl([](PrimExpr a, PrimExpr b, Span span) { return logical_or(a, b, span); },
-                    const_false(1), conds);
-  return IfThenElse(cond, update, body);
-}
-
 }  // namespace te
 }  // namespace tvm
diff --git a/src/te/operation/compute_op.h b/src/te/operation/compute_op.h
deleted file mode 100644
index 944334a41fdb..000000000000
--- a/src/te/operation/compute_op.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \brief Helper utilities to implement compute_op.
- * \file compute_op.h
- */
-#ifndef TVM_TE_OPERATION_COMPUTE_OP_H_
-#define TVM_TE_OPERATION_COMPUTE_OP_H_
-
-#include <tvm/te/operation.h>
-#include <tvm/tir/expr.h>
-
-#include <unordered_map>
-#include <vector>
-
-namespace tvm {
-namespace te {
-// loop nest structure for general compute
-// This the loop nest structured used in compute.
-// Does not include the loop body.
-struct ComputeLoopNest {
-  // The common number of loops between init and main
-  size_t num_common_loop;
-  // predicates for the initialize loop
-  std::vector<PrimExpr> init_predicates;
-  // Initialization nest involved.
-  std::vector<std::vector<Stmt>> init_nest;
-  // Value map for the init code
-  std::unordered_map<IterVar, PrimExpr> init_vmap;
-  // Predicates for the main update loop
-  std::vector<PrimExpr> main_predicates;
-  // The general loop nest
-  std::vector<std::vector<Stmt>> main_nest;
-  // Value map for the IterVar.
-  std::unordered_map<IterVar, PrimExpr> main_vmap;
-
-  /*!
-   * \brief constructor to build ComputeOpNest
-   * \param self The pointer to compute op.
-   * \param stage The scxhedule stage.
-   * \param dom_map The domain map.
-   * \param debug_keep_trivial_loop Whether keep trivial loops with extent of 1
-   * \return The constructed loop nest
-   */
-  static ComputeLoopNest Create(const BaseComputeOpNode* self, const Stage& stage,
-                                const std::unordered_map<IterVar, Range>& dom_map,
-                                bool debug_keep_trivial_loop);
-};
-
-/*!
- * \brief Build body of compute for cross thread reduction pattern.
- * \param self The pointer to ComputeOpNode
- * \param stage The schedule stage.
- * \param dom_map The domain map.
- * \param debug_keep_trivial_loop Whether keep trivial loops with extent of 1
- * \return The created statement.
- */
-Stmt MakeCrossThreadReduction(const ComputeOpNode* self, const Stage& stage,
-                              const std::unordered_map<IterVar, Range>& dom_map,
-                              bool debug_keep_trivial_loop);
-
-/*!
- * \brief Build body of compute for tensorization.
- * \param self The pointer to ComputeOpNode
- * \param stage The schedule stage.
- * \param dom_map The domain map.
- * \param debug_keep_trivial_loop Whether keep trivial loops with extent of 1
- * \return The created statement.
- */
-Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage,
-                   const std::unordered_map<IterVar, Range>& dom_map, bool debug_keep_trivial_loop);
-
-/*!
- * \brief Transform the update part when there is no init func in tensorizing
- * \param stage The stage for tensorizing.
- * \param dom_map The range of each iter var.
- * \param n The loop nest structured used in compute.
- * \param body The body func in tensorize intrin
- * \param update The update func in tensorize intrin
- * \return Transformed result.
- */
-Stmt TransformUpdate(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                     const ComputeLoopNest& n, Stmt body, Stmt update);
-}  // namespace te
-}  // namespace tvm
-
-#endif  // TVM_TE_OPERATION_COMPUTE_OP_H_
diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc
index 2709bd2f94ca..9a6d30de4a5a 100644
--- a/src/te/operation/create_primfunc.cc
+++ b/src/te/operation/create_primfunc.cc
@@ -22,6 +22,7 @@
 #include <tvm/arith/analyzer.h>
 #include <tvm/ir/name_supply.h>
 #include <tvm/runtime/registry.h>
+#include <tvm/te/operation.h>
 #include <tvm/tir/analysis.h>
 #include <tvm/tir/data_type_rewriter.h>
 #include <tvm/tir/function.h>
@@ -37,7 +38,7 @@
 #include "../../support/array.h"
 #include "../../tir/ir/functor_common.h"
 #include "../../tir/transforms/ir_utils.h"
-#include "../schedule/graph.h"
+#include "graph.h"
 
 namespace tvm {
 namespace tir {
diff --git a/src/te/operation/cross_thread_reduction.cc b/src/te/operation/cross_thread_reduction.cc
deleted file mode 100644
index 52e38c7ba2d8..000000000000
--- a/src/te/operation/cross_thread_reduction.cc
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \brief Logics related to cross thread reduction, used by ComputeOpNode.
- * \file cross_thread_reduction.cc
- */
-#include <tvm/tir/builtin.h>
-#include <tvm/tir/stmt_functor.h>
-
-#include "compute_op.h"
-#include "op_utils.h"
-
-namespace tvm {
-namespace te {
-using namespace tir;
-
-//
-// Cross thread reduction transformation.
-//
-// The input loop nest in generic form (single reduction/thread case)
-//
-// let m be the reduction extent
-// let N be the thread extent
-// let input_pred be the predicate on the reduction
-//
-// B[..] = 0
-// for (tid, 0, N)
-//   for (i, 0, floordiv(m+N-1, N))
-//     if (i + tid * floordiv(m+N-1, N) < m)
-//       if (input_pred)
-//         B[..] = op(B[..], A[i + tid  * floordiv(m+N-1,N)])
-//
-// The threaded reduction looks like
-//
-// (1) normal reductions (leaves)
-// for (i, 0, floordiv(m+N-1, N))
-//   if (i + tid * floordiv(m+N-1, N) < m)
-//     if (input_pred)
-//       B_temp[0] = op(B_temp[0], A[i + tid  * floordiv(m+N-1,N)])
-//
-// (2) threaded reduction does not require predicates as an identity
-//     element will be filled if out of bounds.
-//
-// tvm_thread_allreduce(size, B_temp, (bool)1, tid)
-//
-// The last step is to write the final reduction variable,
-// which should be predicated by the existing input_pred if any
-// The consequence is that input_pred should be independent of
-// the reduction axis. Otherwise, we need to separate it into
-// dependent part and independent one.
-//
-// (3) write back
-// if (input_pred)
-//    B[..] = B_temp[0]
-//
-// In summary, we are going to need two predicates
-//
-// * the original input_pred from reduction itself
-//
-// * the normal reduction axis predicate
-//     normal_pred = (i + tid * floordiv(m+N-1,N)) < m
-//   this predicate depends on the normal reduction variable.
-//
-// input_pred will be applied to both normal reduction and
-// the writeback step.
-//
-Stmt MakeCrossThreadReduction(const ComputeOpNode* self, const Stage& stage,
-                              const std::unordered_map<IterVar, Range>& dom_map,
-                              bool debug_keep_trivial_loop) {
-  Array<PrimExpr> args;
-  for (IterVar iv : self->axis) {
-    args.push_back(iv->var);
-  }
-  std::unordered_map<IterVar, PrimExpr> value_map;
-  auto nest = MakeLoopNest(stage, dom_map, 0, false, std::unordered_set<IterVar>(), &value_map,
-                           debug_keep_trivial_loop);
-
-  size_t size = self->body.size();
-  ICHECK_GT(size, 0);
-  std::vector<const ReduceNode*> reduces(size);
-  for (size_t i = 0; i < size; ++i) {
-    const ReduceNode* reduce = self->body[i].as<ReduceNode>();
-    ICHECK(reduce);
-    ICHECK(reduce->init.empty())
-        << "Cannot perform cross_thread_reduction for reductions with init";
-    reduces[i] = reduce;
-  }
-
-  // This computes the bound checking predicates in normal reduction.
-  auto normal_preds =
-      MakeBoundCheck(stage, dom_map, value_map, false, std::unordered_set<IterVar>());
-
-  // normal_pred = input_pred && normal_pred
-  PrimExpr input_pred = reduces[0]->condition;
-  normal_preds.push_back(input_pred);
-  normal_preds.erase(std::remove_if(normal_preds.begin(), normal_preds.end(),
-                                    [](const PrimExpr& e) { return !e.defined(); }),
-                     normal_preds.end());
-
-  std::vector<std::vector<Stmt>> common, normal_red;
-  for (size_t i = 0, n = stage->leaf_iter_vars.size(); i < n; ++i) {
-    IterVar iv = stage->leaf_iter_vars[i];
-    IterVarAttr attr;
-    auto it = stage->iter_var_attrs.find(iv);
-    if (it != stage->iter_var_attrs.end()) {
-      attr = (*it).second;
-    }
-    if (iv->iter_type == kCommReduce) {
-      if (attr.defined() && attr->bind_thread.defined()) {
-        common.emplace_back(nest[i + 1]);
-      } else {
-        normal_red.emplace_back(nest[i + 1]);
-      }
-    } else {
-      common.emplace_back(nest[i + 1]);
-    }
-  }
-
-  // If we load from and then store into the same res_handles in the thread_allreduce intrinsic,
-  // something goes wrong, so we use an extra variable here for normal reduction.
-  std::vector<Buffer> normal_res_buffers;
-  std::vector<Stmt> normal_init, normal_update;
-  if (!normal_red.empty()) {
-    normal_res_buffers.reserve(size);
-    normal_init.reserve(size);
-    normal_update.resize(size);
-    const CommReducerNode* combiner = reduces[0]->combiner.as<CommReducerNode>();
-    ICHECK(combiner);
-    Array<PrimExpr> lhs;
-    for (size_t i = 0; i < size; ++i) {
-      normal_res_buffers.push_back(
-          decl_buffer({1}, reduces[i]->dtype, "normal_reduce_temp" + std::to_string(i), "local"));
-      lhs.push_back(BufferLoad(normal_res_buffers[i], {0}));
-    }
-    Array<PrimExpr> init_value = combiner->identity_element;
-    Array<PrimExpr> update_value = (*combiner)(lhs, reduces[0]->source);
-    for (size_t i = 0; i < size; ++i) {
-      normal_init.emplace_back(BufferStore(normal_res_buffers[i], init_value[i], {0}));
-      normal_update.emplace_back(BufferStore(normal_res_buffers[i], update_value[i], {0}));
-    }
-  }
-
-  Array<PrimExpr> freduce_args;
-  freduce_args.push_back(make_const(DataType::UInt(32), static_cast<uint32_t>(size)));
-  for (size_t i = 0; i < size; ++i) {
-    if (!normal_red.empty()) {
-      freduce_args.push_back(BufferLoad(normal_res_buffers[i], {0}));
-    } else {
-      freduce_args.push_back(reduces[0]->source[i]);
-    }
-  }
-
-  // No constraints on the thread reduction step. It may have redundent
-  // computation for rare cases. TODO(tvm-team): revisit this.
-  freduce_args.push_back(const_true(1));
-  std::vector<Buffer> res_buffers(size);
-  for (size_t idx = 0; idx < size; ++idx) {
-    res_buffers[idx] =
-        decl_buffer({1}, reduces[idx]->dtype, "reduce_temp" + std::to_string(idx), "local");
-    // Make a BufferLoad object so that we can pass the entire Buffer
-    // object through to LowerThreadAllreduce.  The index here is
-    // unused.
-    PrimExpr dummy_load = BufferLoad(res_buffers[idx], {0});
-    freduce_args.push_back(dummy_load);
-  }
-
-  // Checks for the thread.
-  std::vector<PrimExpr> output_preds;
-  if (stage->store_predicate.defined()) {
-    output_preds.emplace_back(stage->store_predicate);
-  }
-
-  for (IterVar iv : stage->leaf_iter_vars) {
-    if (iv->iter_type == kCommReduce) {
-      auto it = stage->iter_var_attrs.find(iv);
-      if (it != stage->iter_var_attrs.end() && (*it).second->bind_thread.defined()) {
-        IterVar tv = (*it).second->bind_thread;
-        freduce_args.push_back(tv->var);
-        output_preds.push_back(tv->var == make_const(tv->var->dtype, 0));
-      }
-    }
-  }
-
-  // Apply the existing input predicate if any.
-  output_preds.push_back(input_pred);
-
-  Stmt reduce_body =
-      Evaluate(Call(DataType::Handle(), tir::builtin::tvm_thread_allreduce(), freduce_args));
-  reduce_body = AttrStmt(reduces[0]->combiner, tir::attr::reduce_scope,
-                         make_zero(DataType::Handle()), reduce_body);
-
-  if (!normal_red.empty()) {
-    Stmt init_body = SeqStmt::Flatten(normal_init);
-    Stmt update_body = SeqStmt::Flatten(normal_update);
-    update_body = MergeNest(MakeIfNest(normal_preds), update_body);
-    update_body = MergeNest(normal_red, update_body);
-    reduce_body = SeqStmt::Flatten(init_body, update_body, reduce_body);
-  }
-
-  std::vector<Stmt> assigns(size);
-  for (size_t idx = 0; idx < size; ++idx) {
-    assigns[idx] = ProducerStore(stage->op.output(idx), BufferLoad(res_buffers[idx], {0}), args);
-  }
-  Stmt assign_body = SeqStmt::Flatten(assigns);
-  assign_body = MergeNest(MakeIfNest(output_preds), assign_body);
-  Stmt body = SeqStmt::Flatten(reduce_body, assign_body);
-  for (size_t idx = size; idx != 0; --idx) {
-    const auto& res_buffer = res_buffers[idx - 1];
-    body = Allocate(res_buffer->data, res_buffer->dtype, res_buffer->shape, const_true(), body);
-    if (!normal_red.empty()) {
-      const auto& normal_res_buffer = normal_res_buffers[idx - 1];
-      body = Allocate(normal_res_buffer->data, normal_res_buffer->dtype, normal_res_buffer->shape,
-                      const_true(), body);
-    }
-  }
-  body = Substitute(body, value_map);
-  return MergeNest(common, body);
-}
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/operation/extern_op.cc b/src/te/operation/extern_op.cc
index b602efcfc28b..b2d2e91415f7 100644
--- a/src/te/operation/extern_op.cc
+++ b/src/te/operation/extern_op.cc
@@ -26,10 +26,6 @@
 #include <tvm/te/operation.h>
 #include <tvm/tir/expr.h>
 
-#include <unordered_set>
-
-#include "op_utils.h"
-
 namespace tvm {
 namespace te {
 using namespace tir;
@@ -44,8 +40,6 @@ TVM_REGISTER_NODE_TYPE(ExternOpNode);
 
 int ExternOpNode::num_outputs() const { return static_cast<int>(output_placeholders.size()); }
 
-Array<IterVar> ExternOpNode::root_iter_vars() const { return {}; }
-
 DataType ExternOpNode::output_dtype(size_t i) const { return output_placeholders[i]->dtype; }
 
 Array<PrimExpr> ExternOpNode::output_shape(size_t i) const { return output_placeholders[i]->shape; }
@@ -85,83 +79,5 @@ TVM_REGISTER_GLOBAL("te.ExternOp")
 
 Array<Tensor> ExternOpNode::InputTensors() const { return inputs; }
 
-Operation ExternOpNode::ReplaceInputs(const Operation& self,
-                                      const std::unordered_map<Tensor, Tensor>& rmap) const {
-  ICHECK_EQ(self.operator->(), this);
-  auto n = make_object<ExternOpNode>(*this);
-  n->body = ReplaceTensor(this->body, rmap);
-  for (size_t i = 0; i < n->inputs.size(); ++i) {
-    Tensor t = n->inputs[i];
-    if (rmap.count(t)) {
-      n->inputs.Set(i, rmap.at(t));
-    }
-  }
-
-  if (body.same_as(n->body) && inputs.same_as(n->inputs)) {
-    return self;
-  } else {
-    return Operation(n);
-  }
-}
-
-void ExternOpNode::PropBoundToInputs(const Operation& self, arith::Analyzer* analyzer,
-                                     const std::unordered_map<const VarNode*, IntSet>& dom_map,
-                                     std::unordered_map<Tensor, TensorDom>* out_dom_map) const {
-  for (Tensor t : this->inputs) {
-    auto it = out_dom_map->find(t);
-    if (it == out_dom_map->end()) continue;
-    TensorDom& dom = it->second;
-    for (size_t i = 0; i < t->shape.size(); ++i) {
-      dom.data[i].emplace_back(
-          IntSet::FromRange(Range::FromMinExtent(make_const(t->shape[i].dtype(), 0), t->shape[i])));
-    }
-  }
-}
-
-void ExternOpNode::GatherBound(const Operation& self,
-                               const std::unordered_map<Tensor, TensorDom>& tensor_dom,
-                               std::unordered_map<IterVar, Range>* out_dom_map) const {}
-
-Stmt ExternOpNode::BuildRealize(const Stage& stage,
-                                const std::unordered_map<IterVar, Range>& realize_map,
-                                const Stmt& body, String storage_scope) const {
-  ICHECK_EQ(stage->op.get(), this);
-  Stmt realize_body = body;
-  for (int k = 0; k < num_outputs(); ++k) {
-    Tensor t = stage->op.output(k);
-    Region bounds;
-    for (size_t i = 0; i < t->shape.size(); ++i) {
-      bounds.push_back(Range::FromMinExtent(make_const(t->shape[i].dtype(), 0), t->shape[i]));
-    }
-    realize_body = tir::ProducerRealize(t, bounds, const_true(), realize_body, storage_scope);
-  }
-  return realize_body;
-}
-
-Stmt ExternOpNode::BuildProvide(const Stage& stage,
-                                const std::unordered_map<IterVar, Range>& dom_map,
-                                bool debug_keep_trivial_loop) const {
-  ICHECK_EQ(stage->op.operator->(), this);
-  Stmt ret = AttrStmt(make_zero(DataType::Int(32)), tir::attr::extern_scope, 0, this->body);
-  auto f_push_bind = [&ret](Buffer buffer, Tensor tensor) {
-    Array<ObjectRef> bind_spec;
-    Array<PrimExpr> tuple;
-    bind_spec.push_back(buffer);
-    bind_spec.push_back(tensor);
-    for (size_t k = 0; k < buffer->shape.size(); ++k) {
-      tuple.push_back(make_const(buffer->shape[k].dtype(), 0));
-      tuple.push_back(buffer->shape[k]);
-    }
-    ret = AttrStmt(bind_spec, tir::attr::buffer_bind_scope,
-                   Call(DataType::Handle(), builtin::tvm_tuple(), tuple), ret);
-  };
-  for (size_t i = output_placeholders.size(); i != 0; --i) {
-    f_push_bind(output_placeholders[i - 1], stage->op.output(i - 1));
-  }
-  for (size_t i = inputs.size(); i != 0; --i) {
-    f_push_bind(input_placeholders[i - 1], inputs[i - 1]);
-  }
-  return ret;
-}
 }  // namespace te
 }  // namespace tvm
diff --git a/src/te/operation/graph.cc b/src/te/operation/graph.cc
new file mode 100644
index 000000000000..cddace2a8283
--- /dev/null
+++ b/src/te/operation/graph.cc
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file graph.cc
+ * \brief Utilities to get information about schedule graph.
+ */
+#include "graph.h"
+
+#include <tvm/runtime/registry.h>
+#include <tvm/te/operation.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/stmt_functor.h>
+
+#include <unordered_set>
+
+namespace tvm {
+namespace te {
+
+// Construct a read graph that maps every operation reachable from the
+// roots (the roots themselves included) to the tensors it reads.
+ReadGraph CreateReadGraph(const Array<Operation>& roots) {
+  ReadGraph rmap;
+  std::vector<Operation> stack;  // explicit worklist, so the traversal is iterative
+  std::unordered_set<const Object*> visited;  // raw node pointers of operations already queued
+  // seed the worklist with the roots
+  for (Operation op : roots) {
+    stack.push_back(op);
+    visited.insert(op.get());
+  }
+
+  while (!stack.empty()) {
+    Operation op = stack.back();
+    stack.pop_back();
+    Array<Tensor> deps = op->InputTensors();
+    rmap.Set(op, deps);  // record the tensors this operation reads
+    for (Tensor t : deps) {
+      if (t->op.defined() && visited.count(t->op.get()) == 0) {
+        visited.insert(t->op.get());  // mark on push to avoid re-queuing the same producer
+        stack.push_back(t->op);
+      }
+    }
+  }
+  return rmap;
+}
+
+void PostDFSOrder(const Operation& op, const ReadGraph& g, std::unordered_set<Operation>* visited,
+                  Array<Operation>* post_order) {
+  if (visited->count(op)) return;  // already handled via another path
+  visited->insert(op);
+  for (const auto& t : g.at(op)) {  // recurse into the producer of every tensor that op reads
+    PostDFSOrder(t->op, g, visited, post_order);
+  }
+  post_order->push_back(op);  // appended only after the recursion over its inputs returns
+}
+
+Array<Operation> PostDFSOrder(const Array<Operation>& roots, const ReadGraph& g) {
+  std::unordered_set<Operation> visited;  // shared across roots so no operation is emitted twice
+  Array<Operation> post_order;
+  for (Operation op : roots) {
+    PostDFSOrder(op, g, &visited, &post_order);  // recursive overload appends to post_order
+  }
+  return post_order;
+}
+
+TVM_REGISTER_GLOBAL("schedule.CreateReadGraph").set_body_typed(CreateReadGraph);
+
+TVM_REGISTER_GLOBAL("schedule.PostDFSOrder")
+    .set_body_typed([](const Array<Operation>& roots, const ReadGraph& g) {
+      return PostDFSOrder(roots, g);  // resolves to the (roots, graph) overload
+    });
+
+}  // namespace te
+}  // namespace tvm
diff --git a/src/te/operation/graph.h b/src/te/operation/graph.h
new file mode 100644
index 000000000000..fbb1241ad585
--- /dev/null
+++ b/src/te/operation/graph.h
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file graph.h
+ * \brief Utilities to get information about schedule graph.
+ */
+#ifndef TVM_TE_OPERATION_GRAPH_H_
+#define TVM_TE_OPERATION_GRAPH_H_
+
+#include <tvm/te/operation.h>
+#include <tvm/tir/expr.h>
+
+namespace tvm {
+namespace te {
+
+/*!
+ * \brief Data structure mapping each Operation to the Tensors it reads.
+ */
+using ReadGraph = Map<Operation, Array<Tensor>>;
+
+/*!
+ * \brief Get the read graph from each operation to all the
+ *  Tensors that it directly depends on.
+ *
+ *  The result map contains the Operations needed to finish the root Operations.
+ * \param roots The root operations.
+ * \return The result map.
+ */
+ReadGraph CreateReadGraph(const Array<Operation>& roots);
+
+/*!
+ * \brief Get a post-DFS ordering of the operations in the graph.
+ * \param roots The roots of the graph.
+ * \param g The read graph.
+ * \return Array of Operations in post-DFS order.
+ *
+ * \note PostDFSOrder is a special case of topological order,
+ *   and can be used when topological order is needed.
+ */
+Array<Operation> PostDFSOrder(const Array<Operation>& roots, const ReadGraph& g);
+
+}  // namespace te
+}  // namespace tvm
+
+#endif  // TVM_TE_OPERATION_GRAPH_H_
diff --git a/src/te/operation/hybrid_op.cc b/src/te/operation/hybrid_op.cc
deleted file mode 100644
index 85e764fea41b..000000000000
--- a/src/te/operation/hybrid_op.cc
+++ /dev/null
@@ -1,486 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \brief Hybrid computation rule.
- * \file hybrid_op.cc
- */
-#include "hybrid_op.h"
-
-#include <tvm/arith/analyzer.h>
-#include <tvm/runtime/registry.h>
-#include <tvm/te/operation.h>
-#include <tvm/tir/analysis.h>
-#include <tvm/tir/expr.h>
-#include <tvm/tir/op.h>
-#include <tvm/tir/stmt_functor.h>
-
-#include <string>
-#include <unordered_set>
-#include <utility>
-
-#include "op_utils.h"
-
-namespace tvm {
-namespace te {
-using namespace tir;
-// HybridOpNode
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<HybridOpNode>([](const ObjectRef& node, ReprPrinter* p) {
-      auto* op = static_cast<const HybridOpNode*>(node.get());
-      p->stream << "hybrid(" << op->name << ", " << op << ")";
-    });
-
-TVM_REGISTER_NODE_TYPE(HybridOpNode);
-
-int HybridOpNode::num_outputs() const { return static_cast<int>(outputs.size()); }
-
-Array<IterVar> HybridOpNode::root_iter_vars() const { return this->axis; }
-
-DataType HybridOpNode::output_dtype(size_t i) const { return outputs[i]->dtype; }
-
-Array<PrimExpr> HybridOpNode::output_shape(size_t i) const { return outputs[i]->shape; }
-
-HybridOp::HybridOp(std::string name, std::string tag, Map<String, ObjectRef> attrs,
-                   Array<Tensor> inputs, Array<Tensor> outputs, Stmt body) {
-  if (!attrs.defined()) {
-    attrs = Map<String, ObjectRef>();
-  }
-  auto n = make_object<HybridOpNode>();
-  n->name = std::move(name);
-  n->tag = std::move(tag);
-  n->attrs = std::move(attrs);
-  n->inputs = std::move(inputs);
-  n->outputs = std::move(outputs);
-  n->axis = te::GatherLoopVars(body);
-  n->body = std::move(body);
-  data_ = std::move(n);
-}
-
-TVM_REGISTER_GLOBAL("te.HybridOp")
-    .set_body_typed([](std::string name, std::string tag, Map<String, ObjectRef> attrs,
-                       Array<Tensor> inputs, Array<Tensor> outputs,
-                       Stmt body) { return HybridOp(name, tag, attrs, inputs, outputs, body); });
-
-Array<Tensor> HybridOpNode::InputTensors() const {
-  // Because input tensors could be potentially inlined into hybrid scripts,
-  // we need to check if all input tensors are used in the body.
-  std::unordered_set<Tensor> orig_inputs;
-  for (auto t : inputs) {
-    orig_inputs.insert(t);
-  }
-  std::unordered_set<Tensor> visited;
-  Array<Tensor> curr_inputs;
-  tir::PostOrderVisit(body, [&curr_inputs, &orig_inputs, &visited](const ObjectRef& n) {
-    if (auto* pload = n.as<tir::ProducerLoadNode>()) {
-      Tensor t = Downcast<Tensor>(pload->producer);
-      if (orig_inputs.count(t) && !visited.count(t)) {
-        curr_inputs.push_back(t);
-        visited.insert(t);
-      }
-    }
-  });
-  return curr_inputs;
-}
-
-Operation HybridOpNode::ReplaceInputs(const Operation& self,
-                                      const std::unordered_map<Tensor, Tensor>& rmap) const {
-  ICHECK_EQ(self.operator->(), this);
-  auto n = make_object<HybridOpNode>(*this);
-  n->body = te::ReplaceTensor(this->body, rmap);
-  for (size_t i = 0; i < n->inputs.size(); ++i) {
-    Tensor t = n->inputs[i];
-    if (rmap.count(t)) {
-      n->inputs.Set(i, rmap.at(t));
-    }
-  }
-
-  if (body.same_as(n->body) && inputs.same_as(n->inputs)) {
-    return self;
-  } else {
-    return Operation(n);
-  }
-}
-
-void HybridOpNode::PropBoundToInputs(const Operation& self, arith::Analyzer* analyzer,
-                                     const std::unordered_map<const VarNode*, IntSet>& dom_map,
-                                     std::unordered_map<Tensor, TensorDom>* out_dom_map) const {
-  auto curr_inputs = InputTensors();
-  for (Tensor t : curr_inputs) {
-    auto it = out_dom_map->find(t);
-    if (it == out_dom_map->end()) continue;
-    TensorDom& dom = it->second;
-    for (size_t i = 0; i < t->shape.size(); ++i) {
-      dom.data[i].emplace_back(
-          IntSet::FromRange(Range::FromMinExtent(make_const(t->shape[i].dtype(), 0), t->shape[i])));
-    }
-  }
-}
-
-void HybridOpNode::GatherBound(const Operation& self,
-                               const std::unordered_map<Tensor, TensorDom>& tensor_dom,
-                               std::unordered_map<IterVar, Range>* out_dom_map) const {
-  for (auto iter_var : axis) {
-    ICHECK(!out_dom_map->count(iter_var));
-    out_dom_map->operator[](iter_var) = iter_var->dom;
-  }
-}
-
-Stmt HybridOpNode::BuildRealize(const Stage& stage,
-                                const std::unordered_map<IterVar, Range>& realize_map,
-                                const Stmt& body, String storage_scope) const {
-  // TODO(@were): Add attribute inject here and remove it from hybrid parser.
-  ICHECK_EQ(stage->op.get(), this);
-  Stmt realize_body = body;
-  for (int k = 0; k < num_outputs(); ++k) {
-    Tensor t = stage->op.output(k);
-    Region bounds;
-    for (size_t i = 0; i < t->shape.size(); ++i) {
-      bounds.push_back(Range::FromMinExtent(make_const(t->shape[i].dtype(), 0), t->shape[i]));
-    }
-    realize_body = tir::ProducerRealize(t, bounds, const_true(), realize_body, storage_scope);
-  }
-  return realize_body;
-}
-
-Stmt HybridOpNode::BuildProvide(const Stage& stage,
-                                const std::unordered_map<IterVar, Range>& dom_map,
-                                bool debug_keep_trivial_loop) const {
-  ICHECK_EQ(stage->op.operator->(), this);
-  Stmt ret = AttrStmt(make_zero(DataType::Int(32)), tir::attr::extern_scope, 0, this->body);
-  std::unordered_map<Tensor, Tensor> rmap;
-  for (int i = 0; i < this->num_outputs(); ++i) {
-    rmap[outputs[i]] = stage->op.output(i);
-  }
-  auto n = make_object<HybridOpNode>(*this);
-  /* This is a story little bit complicated.
-   * The following two lines of codes replace output tensors' usage.
-   * This is the simplest way I (@were) can come up with to glue
-   * hybrid operation node to TVM op system.
-   * In hybrid script all the tensors, especially the output tensors,
-   * have their own names defined by the users. However, In TVM
-   * conventional ops:
-   *   1. Output tensors refer the corresponding op node so that the output
-   *      tensors have the same names as the operation produces them.
-   *   2. Once OpNode is wrapped up by an Operation node, it is finalized.
-   *      Later access will be from a const OpNode*.
-   * This is a chicken-egg paradox. It is impossible to put the output
-   * tensors into the function body without forming the op node. The
-   * function body is immutable after the node is formed.
-   *
-   * Finally, I decided to resolve this issue "lazily". During the
-   * pipeline of compilation, this stage is a very preliminary stage.
-   * Technically, it is before Phase 0. The actual tensors will be replaced
-   * here.
-   * Thus, the operation body is slightly different from the Phase 0 body.
-   * This is a major difference that HybridOpNode is NOT the same as
-   * ExternOpNode.
-   * */
-  ret = te::ReplaceTensor(ret, rmap);
-  ret = te::ReplaceProvideTensor(ret, rmap);
-
-  ret = te::ApplySchedule(stage, dom_map, ret);
-  return ret;
-}
-
-Stmt ApplyLoopShapes(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                     Stmt stmt) {
-  class LoopSpliter : public StmtExprMutator {
-    PrimExpr factor;
-    const VarNode* parent;
-    IterVar inner, outer;
-
-   public:
-    bool splitted;
-    LoopSpliter(const SplitNode* split, const std::unordered_map<IterVar, Range>& dom_map)
-        : factor(split->factor), splitted(false) {
-      parent = split->parent->var.get();
-
-      auto& inner_ = split->inner;
-      ICHECK(dom_map.count(inner_));
-      auto& inner_dom = dom_map.find(inner_)->second;
-      ICHECK(is_const_int(inner_dom->min, 0));
-
-      auto& outer_ = split->outer;
-      ICHECK(dom_map.count(outer_));
-      auto& outer_dom = dom_map.find(outer_)->second;
-      ICHECK(is_const_int(outer_dom->min, 0));
-
-      inner = IterVar(inner_dom, inner_->var, inner_->iter_type);
-      outer = IterVar(outer_dom, outer_->var, outer_->iter_type);
-    }
-
-    Stmt VisitStmt_(const ForNode* op) final {
-      if (op->loop_var.get() == parent) {
-        Stmt ret = tir::Substitute(op->body, {{op->loop_var, inner + outer * factor}});
-        PrimExpr cond = likely(outer * factor < (op->extent - inner));
-        ret = IfThenElse(cond, ret);
-        ret = For(inner->var, PrimExpr(0), inner->dom->extent,
-                  IterVarTypeToForKind(inner->iter_type), ret);
-        ret = For(outer->var, PrimExpr(0), outer->dom->extent,
-                  IterVarTypeToForKind(outer->iter_type), ret);
-        splitted = true;
-        return ret;
-      }
-      return StmtExprMutator::VisitStmt_(op);
-    }
-  };
-
-  class LoopFuser : public StmtExprMutator {
-    const IterVar& parent;
-    const VarNode* inner;
-    const VarNode* outer;
-    bool under_outer;
-    PrimExpr extent;
-
-   public:
-    bool fused;
-    explicit LoopFuser(const FuseNode* fuse_)
-        : parent(fuse_->fused),
-          inner(fuse_->inner->var.get()),
-          outer(fuse_->outer->var.get()),
-          under_outer(false),
-          extent(0),
-          fused(false) {}
-
-    // TODO(@were): Handle imperfect loops
-    Stmt VisitStmt_(const ForNode* op) final {
-      if (op->loop_var.get() == inner) {
-        ICHECK(under_outer);
-        std::unordered_map<const VarNode*, PrimExpr> rmap;
-        rmap[op->loop_var.get()] = indexmod(parent, op->extent);
-        extent = op->extent;
-        fused = true;
-        return tir::Substitute(op->body, rmap);
-      } else if (op->loop_var.get() == outer) {
-        under_outer = true;
-        Stmt body = this->VisitStmt(op->body);
-        std::unordered_map<const VarNode*, PrimExpr> rmap;
-        rmap[op->loop_var.get()] = indexdiv(parent, extent);
-        body = tir::Substitute(body, rmap);
-        under_outer = false;
-        return For(parent->var, PrimExpr(0), extent * op->extent, op->kind, body,
-                   op->thread_binding, op->annotations);
-      } else if (under_outer) {
-        Stmt body = this->VisitStmt(op->body);
-        std::unordered_map<const VarNode*, PrimExpr> rmap;
-        rmap[op->loop_var.get()] = indexmod(indexdiv(parent, extent), op->extent);
-        body = tir::Substitute(body, rmap);
-        extent = extent * op->extent;
-        return body;
-      }
-      return StmtExprMutator::VisitStmt_(op);
-    }
-  };
-
-  for (auto& rel : stage->relations) {
-    if (const SplitNode* split = rel.as<SplitNode>()) {
-      LoopSpliter Spliter(split, dom_map);
-      stmt = Spliter(stmt);
-      ICHECK(Spliter.splitted);
-    } else if (const FuseNode* fuse = rel.as<FuseNode>()) {
-      LoopFuser Fuser(fuse);
-      stmt = Fuser(stmt);
-      ICHECK(Fuser.fused);
-    }
-  }
-
-  return stmt;
-}
-
-Stmt ApplyLoopAnnotations(const Stage& stage, const std::unordered_map<IterVar, IterVar>& rebased,
-                          Stmt stmt) {
-  class LoopAnnotator : public StmtMutator {
-    const VarNode* var;
-    const IterVarAttr& attr;
-
-   public:
-    LoopAnnotator(const VarNode* var_, const IterVarAttr& attr_) : var(var_), attr(attr_) {}
-
-    Stmt VisitStmt_(const ForNode* op) final {
-      tir::ExprDeepEqual expr_equal;
-
-      if (op->loop_var.get() == var) {
-        if (attr->bind_thread.defined()) {
-          const auto& iter_var = attr->bind_thread;
-          if (iter_var->dom.defined()) {
-            ICHECK(is_const_int(iter_var->dom->min, 0));
-            ICHECK(expr_equal(iter_var->dom->extent, op->extent))
-                << "Thread extent and loop extent mismatch!\n";
-          }
-          std::unordered_map<const VarNode*, PrimExpr> rmap;
-          rmap[op->loop_var.get()] = iter_var;
-          Stmt body = tir::Substitute(op->body, rmap);
-          return AttrStmt(iter_var, "thread_extent", op->extent, body);
-        } else {
-          return For(op->loop_var, op->min, op->extent, IterVarTypeToForKind(attr->iter_type),
-                     op->body, op->thread_binding, op->annotations);
-        }
-      }
-      return StmtMutator::VisitStmt_(op);
-    }
-  };
-
-  for (auto& iter_var : stage->leaf_iter_vars) {
-    bool need_change = false;
-    int found = 0;
-
-    const IterVar& actual = rebased.count(iter_var) ? rebased.find(iter_var)->second : iter_var;
-    const VarNode* var = actual->var.get();
-    ForKind expected = IterVarTypeToForKind(iter_var->iter_type);
-    IterVarAttr attr;
-    if (stage->iter_var_attrs.count(iter_var)) {
-      attr = stage->iter_var_attrs[iter_var];
-      expected = IterVarTypeToForKind(attr->iter_type);
-    }
-
-    PostOrderVisit(stmt, [&found, &var, &attr, &expected, &need_change](const ObjectRef& node) {
-      if (const ForNode* op = node.as<ForNode>()) {
-        if (op->loop_var.get() == var) {
-          ++found;
-          need_change = expected != op->kind || (attr.defined() && attr->bind_thread.defined());
-        }
-      }
-    });
-
-    ICHECK_EQ(found, 1) << " iter var should be found exactly once!";
-    if (need_change) {
-      stmt = LoopAnnotator(var, attr)(std::move(stmt));
-    }
-  }
-  return stmt;
-}
-
-Stmt ApplyLoopOrder(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                    const std::unordered_map<IterVar, IterVar>& rebased, Stmt stmt) {
-  std::vector<const VarNode*> current_order;
-  PostOrderVisit(stmt, [&current_order](const ObjectRef& node) {
-    if (const ForNode* op = node.as<ForNode>()) current_order.push_back(op->loop_var.get());
-  });
-  std::reverse(current_order.begin(), current_order.end());
-  auto& required_ord = stage->leaf_iter_vars;
-  ICHECK_EQ(current_order.size(), required_ord.size()) << "Cannot reorder the loops!";
-  std::unordered_map<const VarNode*, IterVar> reorder;
-  bool need_reorder = false;
-  for (size_t i = 0; i < current_order.size(); ++i) {
-    auto& current = current_order[i];
-    const IterVar& iter_var = required_ord[i];
-    const IterVar& required = rebased.count(iter_var) ? rebased.find(iter_var)->second : iter_var;
-    ICHECK(required->dom.defined() || dom_map.count(required)) << required << "\n";
-    reorder[current] = required;
-    if (current != required->var.get()) {
-      need_reorder = true;
-    }
-  }
-
-  class LoopReorder : public StmtMutator {
-    const Stage& stage;
-    const std::unordered_map<IterVar, Range>& dom_map;
-    const std::unordered_map<const VarNode*, IterVar>& reorder;
-
-   public:
-    LoopReorder(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                const std::unordered_map<const VarNode*, IterVar>& reorder)
-        : stage(stage), dom_map(dom_map), reorder(reorder) {}
-
-    Stmt VisitStmt_(const ForNode* op) final {
-      // Reorder from in to out
-      Stmt body_ = this->VisitStmt(op->body);
-      ICHECK(reorder.count(op->loop_var.get()));
-      auto target = reorder.find(op->loop_var.get())->second;
-      if (body_.same_as(op->body) && op->loop_var.get() == target->var.get())
-        return GetRef<Stmt>(op);
-      const Stmt& body = op->body.same_as(body_) ? op->body : body_;
-      ForKind kind = IterVarTypeToForKind(target->iter_type);
-      if (stage->iter_var_attrs.count(target)) {
-        kind = IterVarTypeToForKind(stage->iter_var_attrs[target]->iter_type);
-      }
-      const Range& range = target->dom.defined() ? target->dom : dom_map.find(target)->second;
-      return For(target->var, range->min, range->extent, kind, body, op->thread_binding,
-                 op->annotations);
-    }
-  };
-
-  if (need_reorder) return LoopReorder(stage, dom_map, reorder)(stmt);
-
-  return stmt;
-}
-
-Stmt ApplySchedule(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                   Stmt stmt) {
-  // TODO(@were): Eliminate loop rebase in script parser and move the burden here
-  // Gather rebased variables
-  std::unordered_map<IterVar, IterVar> rebased;
-  for (auto rel : stage->relations) {
-    if (const auto* rebase = rel.as<RebaseNode>()) {
-      rebased[rebase->rebased] = rebase->parent;
-      ICHECK(rebase->parent->dom.defined());
-      ICHECK(dom_map.count(rebase->rebased));
-    }
-  }
-  stmt = ApplyLoopShapes(stage, dom_map, stmt);
-  stmt = ApplyLoopOrder(stage, dom_map, rebased, stmt);
-  stmt = ApplyLoopAnnotations(stage, rebased, stmt);
-  return stmt;
-}
-
-std::vector<IterVar> GatherLoopVars(Stmt stmt) {
-  // TODO(@were): Write a comprehensive pass to analyze iter var types
-  std::vector<IterVar> res_;
-  PostOrderVisit(stmt, [&res_](const ObjectRef& node) {
-    if (const ForNode* op = node.as<ForNode>()) {
-      Var loop_var(op->loop_var);
-      Range dom = Range::FromMinExtent(op->min, cast(loop_var.dtype(), op->extent));
-      res_.push_back(IterVar(dom, loop_var, ForKindToIterVarType(op->kind)));
-    }
-  });
-  std::reverse(res_.begin(), res_.end());
-  return res_;
-}
-
-// replacer to replace tensors' usage in Provide
-class ProviderReplacer : public tir::StmtMutator {
- public:
-  explicit ProviderReplacer(const std::unordered_map<Tensor, Tensor>& vmap) : vmap_(vmap) {}
-
-  Stmt VisitStmt_(const tir::ProducerStoreNode* op) final {
-    Tensor t = Downcast<Tensor>(op->producer);
-    auto it = vmap_.find(t);
-    if (it != vmap_.end()) {
-      Stmt ret = tir::ProducerStore(it->second, op->value, op->indices);
-      found = true;
-      return this->VisitStmt(ret);
-    }
-    return StmtMutator::VisitStmt_(op);
-  }
-
-  // whether it is found.
-  bool found{false};
-
- private:
-  const std::unordered_map<Tensor, Tensor>& vmap_;
-};
-
-Stmt ReplaceProvideTensor(Stmt stmt, const std::unordered_map<Tensor, Tensor>& replace) {
-  ProviderReplacer repl(replace);
-  Stmt ret = repl(stmt);
-  return repl.found ? ret : stmt;
-}
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/operation/hybrid_op.h b/src/te/operation/hybrid_op.h
deleted file mode 100644
index 705456850ce6..000000000000
--- a/src/te/operation/hybrid_op.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \brief Helper utilities to implement hybrid_op.
- * \file hybrid_op.h
- */
-#ifndef TVM_TE_OPERATION_HYBRID_OP_H_
-#define TVM_TE_OPERATION_HYBRID_OP_H_
-
-#include <tvm/te/schedule.h>
-#include <tvm/tir/expr.h>
-
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "../../tir/transforms/arg_binder.h"
-#include "../../tir/transforms/ir_utils.h"
-#include "../schedule/message_passing.h"
-
-namespace tvm {
-namespace te {
-
-/*!
- * \brief Find all the iteration variables in the given statement body.
- * \param stmt The body to be inspected.
- */
-std::vector<IterVar> GatherLoopVars(Stmt stmt);
-
-/*!
- * \brief Replace the tensor reference (especially in Provide's) in stmt by the replace map.
- * \param stmt The statement to be processed.
- * \param replace The replacement rule.
- */
-Stmt ReplaceProvideTensor(Stmt stmt, const std::unordered_map<Tensor, Tensor>& replace);
-
-/*!
- * \brief Apply the schedule manipulation on the function body.
- * \param stmt The statement to be processed.
- * \param dom_map The extents of the iterative variables may be used.
- * \param stage The schedule information to be applied.
- */
-Stmt ApplySchedule(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                   Stmt stmt);
-
-/*!
- * \brief Apply loop splits and fuses in the schedule on the function body.
- * \param stage The schedule information to be applied.
- * \param dom_map The extents of the iterative variables may be used.
- * \param stmt The statement to be processed.
- */
-Stmt ApplyLoopShapes(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                     Stmt stmt);
-
-/*!
- * \brief Apply loop annotation in the schedule on the function body.
- * \param stage The schedule information to be applied.
- * \param rebased The map specifies the rebase, a.k.a rename, relationship of these variables.
- * \param stmt The statement to be processed.
- */
-Stmt ApplyLoopAnnotations(const Stage& stage, const std::unordered_map<IterVar, IterVar>& rebased,
-                          Stmt stmt);
-
-/*!
- * \brief Apply loop order in the schedule on the function body.
- * \param stage The schedule information to be applied.
- * \param dom_map The extents of the iterative variables may be used.
- * \param rebased The map specifies the rebase, a.k.a rename, relationship of these variables.
- * \param stmt The statement to be processed.
- */
-Stmt ApplyLoopOrder(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                    const std::unordered_map<IterVar, IterVar>& rebased, Stmt stmt);
-
-}  // namespace te
-}  // namespace tvm
-
-#endif  // TVM_TE_OPERATION_HYBRID_OP_H_
diff --git a/src/te/operation/op_utils.cc b/src/te/operation/op_utils.cc
deleted file mode 100644
index 7168933a320c..000000000000
--- a/src/te/operation/op_utils.cc
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \brief Utility to make loop nest.
- * \file op_utils.cc
- */
-#include "op_utils.h"
-
-#include <tvm/te/operation.h>
-#include <tvm/tir/expr.h>
-#include <tvm/tir/stmt_functor.h>
-
-#include <string>
-
-#include "../../runtime/thread_storage_scope.h"
-#include "../schedule/message_passing.h"
-
-namespace tvm {
-namespace te {
-
-using namespace arith;
-using namespace tir;
-
-std::vector<std::vector<Stmt>> MakeLoopNest(const Stage& stage,
-                                            const std::unordered_map<IterVar, Range>& dom_map,
-                                            size_t begin_iter_pos, bool new_loop_var,
-                                            const std::unordered_set<IterVar>& skip_iter,
-                                            std::unordered_map<IterVar, PrimExpr>* p_value_map,
-                                            bool debug_keep_trivial_loop) {
-  auto leaf_iter_vars = stage->leaf_iter_vars;
-  Stmt no_op = Evaluate(0);
-  // create the loop nest
-  std::vector<std::vector<Stmt>> nest;
-  nest.resize(leaf_iter_vars.size() + 1);
-  std::unordered_map<IterVar, PrimExpr>& value_map = *p_value_map;
-
-  for (size_t i = begin_iter_pos; i < leaf_iter_vars.size(); ++i) {
-    auto iv = leaf_iter_vars[i];
-    if (skip_iter.count(iv) || iv->iter_type == kOpaque) {
-      // skip this iteration.
-      value_map[iv] = iv->var;
-      continue;
-    }
-    // Bind iv could be another thread.
-    IterVar bind_iv = iv;
-    if (stage->iter_var_attrs.count(iv)) {
-      IterVar bind_thread = stage->iter_var_attrs[iv]->bind_thread;
-      if (bind_thread.defined()) bind_iv = bind_thread;
-    }
-
-    Range dom = dom_map.at(iv);
-
-    ICHECK(iv->var.dtype() == dom->min.dtype() && iv->var.dtype() == dom->extent.dtype())
-        << "iter_var type " << iv->var.dtype() << " and domain types (min:" << dom->min.dtype()
-        << ", extent:" << dom->extent.dtype() << ") should all be the same";
-
-    // This is a hack to ensure that the replacing expression has the same
-    // dtype as the replacing expression. This happens when a thread/block
-    // itervar is bound to another itervar. Because the thread/block itervar
-    // has no way to know its correct dtype before it is bound, it defaults to
-    // int32. Then the itervar it is bound to may have a different dtype. The
-    // thread/block dtype really should be promoted to dtype of what it is
-    // bound to (in `bind`) but that would require inplace modification of the
-    // itervar.
-    // XXX: we will get integer overflow if the bound itervar is greater than int32::max.
-    auto promote_to_iv_dtype = [type = iv->var.dtype()](PrimExpr e) {
-      return type != e.dtype() ? cast(type, e) : e;
-    };
-
-    // initialize the offset and loop_level
-    Var var = bind_iv->var;
-
-    // Mark the iter var in the IR, to remember the point
-    if (bind_iv->thread_tag.length() == 0) {
-      // Only generate new loop if we're not bound to a thread.
-      if (new_loop_var) {
-        var = Var(iv->var->name_hint + ".init", bind_iv->var.dtype());
-      }
-
-      ForKind kind = ForKind::kSerial;
-      IterVarAttr it_attr;
-      if (stage->iter_var_attrs.count(iv)) {
-        it_attr = stage->iter_var_attrs[iv];
-      }
-      if (it_attr.defined()) {
-        switch (it_attr->iter_type) {
-          case kUnrolled:
-            kind = ForKind::kUnrolled;
-            break;
-          case kVectorized:
-            kind = ForKind::kVectorized;
-            break;
-          case kParallelized:
-            kind = ForKind::kParallel;
-            break;
-          case kDataPar:
-            break;
-          case kTensorized:
-            break;
-          default:
-            LOG(FATAL) << "Unknown iter type" << it_attr->iter_type << " in the iter_var_attrs";
-        }
-        ICHECK_EQ(it_attr->pragma_keys.size(), it_attr->pragma_values.size());
-        for (size_t k = 0; k < it_attr->pragma_keys.size(); ++k) {
-          const std::string& pkey = it_attr->pragma_keys[k].as<StringImmNode>()->value;
-          PrimExpr pvalue = it_attr->pragma_values[k];
-          if (!pvalue.defined()) {
-            pvalue = make_const(DataType::Int(32), 1);
-          }
-          nest[i + 1].emplace_back(
-              AttrStmt(iv, tir::attr::pragma_scope_prefix + pkey, pvalue, no_op));
-        }
-      }
-      if (!debug_keep_trivial_loop && is_one(dom->extent)) {
-        nest[i + 1].emplace_back(LetStmt(var, dom->min, no_op));
-        value_map[iv] = dom->min;
-      } else if (is_zero(dom->min)) {
-        nest[i + 1].emplace_back(For(var, 0, dom->extent, kind, no_op));
-        value_map[iv] = promote_to_iv_dtype(var);
-      } else {
-        Var idx(bind_iv->var->name_hint + ".idx", iv->var.dtype());
-        nest[i + 1].emplace_back(For(idx, 0, dom->extent, kind, no_op));
-        PrimExpr new_value = dom->min + idx;
-        value_map[iv] = new_value;
-        nest[i + 1].emplace_back(LetStmt(var, new_value, no_op));
-      }
-      if (it_attr.defined() && it_attr->prefetch_data.size() != 0) {
-        ICHECK(!is_one(dom->extent)) << "Cannot prefetch on trivial loop with extent=1";
-        ICHECK_EQ(it_attr->prefetch_data.size(), it_attr->prefetch_offset.size());
-        for (size_t j = 0; j < it_attr->prefetch_data.size(); ++j) {
-          nest[i + 1].emplace_back(AttrStmt(it_attr->prefetch_data[j], tir::attr::prefetch_scope,
-                                            it_attr->prefetch_offset[j], no_op));
-        }
-      }
-    } else if (bind_iv->thread_tag == "vthread" || bind_iv->thread_tag == "cthread") {
-      // virtual thread
-      // Always restrict threaded IterVar to starts from 0.
-      ICHECK(is_zero(dom->min));
-      ICHECK(is_positive_const(dom->extent));
-      // annotate the extent of the IterVar
-      nest[i + 1].emplace_back(AttrStmt(bind_iv, tir::attr::virtual_thread,
-                                        cast(bind_iv->var.dtype(), dom->extent), no_op));
-      value_map[iv] = promote_to_iv_dtype(var);
-    } else if (bind_iv->thread_tag == "pipeline") {
-      // pipeline marker.
-      ICHECK(is_zero(dom->min));
-      ICHECK(is_one(dom->extent));
-      // annotate the extent of the IterVar
-      nest[i + 1].emplace_back(AttrStmt(bind_iv, tir::attr::pipeline_exec_scope,
-                                        cast(bind_iv->var.dtype(), dom->extent), no_op));
-      value_map[iv] = dom->min;
-    } else {
-      // Always restrict threaded IterVar to starts from 0.
-      ICHECK(is_zero(dom->min)) << "Itervar " << iv << " must start at zero, but it starts at "
-                                << dom->min;
-      // annotate the extent of the IterVar
-      nest[i + 1].emplace_back(AttrStmt(bind_iv, tir::attr::thread_extent,
-                                        cast(bind_iv->var.dtype(), dom->extent), no_op));
-      if (!debug_keep_trivial_loop && is_one(dom->extent)) {
-        value_map[iv] = dom->min;
-      } else if (stage->scope == "") {
-        value_map[iv] = promote_to_iv_dtype(var);
-      } else {
-        runtime::ThreadScope ts = runtime::ThreadScope::Create(bind_iv->thread_tag);
-        runtime::StorageScope ss = runtime::StorageScope::Create(stage->scope);
-        if (static_cast<int>(ss.rank) <= ts.rank) {
-          value_map[iv] = promote_to_iv_dtype(var);
-        } else if (stage->scope == "warp" && ts.rank == 1) {
-          // To determine whether a thread index is inside or outside a warp, we need
-          // to know the thread extent. We leave a warning for now.
-          if (ts.dim_index == 0) {
-            value_map[iv] = promote_to_iv_dtype(var);
-          } else {
-            LOG(WARNING)
-                << "WARNING: threadIdx.y or threadIdx.z accessing warp-scope memory detected. "
-                << "TVM assumes only threadIdx.x indicates threads inside a warp, "
-                << "while threadIdx.y and threadIdx.z indicates different warps.";
-            value_map[iv] = dom->min;
-          }
-        } else {
-          value_map[iv] = dom->min;
-        }
-      }
-    }
-    // annotate the extent of the IterVar
-    if (!new_loop_var) {
-      nest[i + 1].emplace_back(AttrStmt(iv, tir::attr::loop_scope, iv->var, no_op));
-    }
-  }
-  // message passing to get offset of root iter vars.
-  te::PassUpIndex(stage, dom_map, &value_map);
-  return nest;
-}
-
-std::vector<Stmt> MakeIfNest(const std::vector<PrimExpr>& predicates) {
-  Stmt no_op = Evaluate(0);
-  std::vector<Stmt> nest;
-  for (const PrimExpr& cond : predicates) {
-    nest.emplace_back(IfThenElse(cond, no_op));
-  }
-  return nest;
-}
-
-// replacer to replace tensors
-class TensorReplacer : public tir::StmtExprMutator {
- public:
-  explicit TensorReplacer(const std::unordered_map<Tensor, Tensor>& vmap) : vmap_(vmap) {}
-
-  PrimExpr VisitExpr_(const tir::ProducerLoadNode* op) final {
-    PrimExpr expr = StmtExprMutator::VisitExpr_(op);
-    op = expr.as<tir::ProducerLoadNode>();
-    ICHECK(op != nullptr);
-
-    Tensor t = Downcast<Tensor>(op->producer);
-    auto it = vmap_.find(t);
-    if (it != vmap_.end()) {
-      found = true;
-      return tir::ProducerLoad(it->second, op->indices);
-    } else {
-      return expr;
-    }
-  }
-
-  // whether it is found.
-  bool found{false};
-
- private:
-  const std::unordered_map<Tensor, Tensor>& vmap_;
-};
-
-Stmt ReplaceTensor(Stmt stmt, const std::unordered_map<Tensor, Tensor>& replace) {
-  TensorReplacer repl(replace);
-  Stmt ret = repl(stmt);
-  return repl.found ? ret : stmt;
-}
-PrimExpr ReplaceTensor(PrimExpr expr, const std::unordered_map<Tensor, Tensor>& replace) {
-  TensorReplacer repl(replace);
-  PrimExpr ret = repl(expr);
-  return repl.found ? ret : expr;
-}
-
-IterVarType ForKindToIterVarType(tir::ForKind kind) {
-  switch (kind) {
-    case ForKind::kSerial:
-      return kDataPar;
-    case ForKind::kParallel:
-      return kParallelized;
-    case ForKind::kVectorized:
-      return kVectorized;
-    case ForKind::kUnrolled:
-      return kUnrolled;
-    default:
-      return kDataPar;
-  }
-}
-
-tir::ForKind IterVarTypeToForKind(IterVarType iter_type) {
-  switch (iter_type) {
-    case kDataPar:
-      return ForKind::kSerial;
-    case kParallelized:
-      return ForKind::kParallel;
-    case kVectorized:
-      return ForKind::kVectorized;
-    case kUnrolled:
-      return ForKind::kUnrolled;
-    default:
-      return ForKind::kSerial;
-  }
-}
-
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/operation/op_utils.h b/src/te/operation/op_utils.h
deleted file mode 100644
index 72438cb8dd3f..000000000000
--- a/src/te/operation/op_utils.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file op_utils.h
- * \brief Common utility used in operator construction.
- */
-#ifndef TVM_TE_OPERATION_OP_UTILS_H_
-#define TVM_TE_OPERATION_OP_UTILS_H_
-
-#include <tvm/te/schedule.h>
-#include <tvm/tir/expr.h>
-
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "../../tir/transforms/arg_binder.h"
-#include "../../tir/transforms/ir_utils.h"
-#include "../schedule/message_passing.h"
-
-namespace tvm {
-namespace te {
-
-using tir::MergeNest;
-
-/*!
- * \brief Build loop nest for stage.
- *
- * \param stage The stage to create a loop nest.
- * \param dom_map The range of each iter var.
- * \param begin_iter_pos The beginning position of leaf_iter_vars to generate loop.
- * \param new_loop_var Whether create new loop variable.
- * \param skip_iter Whether skip certain iteration.
- * \param p_value_map The result value of each IterVar.
- * \param debug_keep_trivial_loop Whether keep trivial loops with extent of 1
- */
-std::vector<std::vector<Stmt>> MakeLoopNest(const Stage& stage,
-                                            const std::unordered_map<IterVar, Range>& dom_map,
-                                            size_t begin_iter_pos, bool new_loop_var,
-                                            const std::unordered_set<IterVar>& skip_iter,
-                                            std::unordered_map<IterVar, PrimExpr>* p_value_map,
-                                            bool debug_keep_trivial_loop);
-
-/*!
- * \brief Create a nest of if checking the predicates.
- *
- * \param predicates The predicates to be checked.
- * \return List of If nest that checks the predicates.
- */
-std::vector<Stmt> MakeIfNest(const std::vector<PrimExpr>& predicates);
-
-/*!
- * \brief Replace the tensor reference (especially in Call's) in stmt by the replace map.
- * \param stmt The statement to be processed.
- * \param replace The replacement rule.
- */
-Stmt ReplaceTensor(Stmt stmt, const std::unordered_map<Tensor, Tensor>& replace);
-/*!
- * \brief Replace the tensor reference (especially in Call's) in primExpr by the replace map.
- * \param expr The expression to be processed.
- * \param replace The replacement rule.
- */
-PrimExpr ReplaceTensor(PrimExpr expr, const std::unordered_map<Tensor, Tensor>& replace);
-
-/*!
- * \brief Converts Halide ForKind to its corresponding IterVarType
- * \param kind The ForKind to be converted
- */
-IterVarType ForKindToIterVarType(tir::ForKind kind);
-
-/*!
- * \brief Converts IterVarType to its corresponding Halide ForKind
- * \param iter_type The IterVarType to be converted
- */
-tir::ForKind IterVarTypeToForKind(IterVarType iter_type);
-
-}  // namespace te
-}  // namespace tvm
-#endif  // TVM_TE_OPERATION_OP_UTILS_H_
diff --git a/src/te/operation/placeholder_op.cc b/src/te/operation/placeholder_op.cc
index 774a0f8f1f89..eaf1a8f854a4 100644
--- a/src/te/operation/placeholder_op.cc
+++ b/src/te/operation/placeholder_op.cc
@@ -38,8 +38,6 @@ TVM_REGISTER_NODE_TYPE(PlaceholderOpNode);
 
 int PlaceholderOpNode::num_outputs() const { return 1; }
 
-Array<IterVar> PlaceholderOpNode::root_iter_vars() const { return {}; }
-
 DataType PlaceholderOpNode::output_dtype(size_t i) const {
   ICHECK_EQ(i, 0U);
   return dtype;
@@ -79,30 +77,5 @@ TVM_REGISTER_GLOBAL("te.Placeholder")
 
 Array<Tensor> PlaceholderOpNode::InputTensors() const { return {}; }
 
-Operation PlaceholderOpNode::ReplaceInputs(const Operation& self,
-                                           const std::unordered_map<Tensor, Tensor>& rmap) const {
-  return self;
-}
-
-void PlaceholderOpNode::PropBoundToInputs(
-    const Operation& self, arith::Analyzer* analyzer,
-    const std::unordered_map<const VarNode*, IntSet>& dom_map,
-    std::unordered_map<Tensor, TensorDom>* out_dom_map) const {}
-
-void PlaceholderOpNode::GatherBound(const Operation& self,
-                                    const std::unordered_map<Tensor, TensorDom>& tensor_dom,
-                                    std::unordered_map<IterVar, Range>* out_dom_map) const {}
-
-Stmt PlaceholderOpNode::BuildRealize(const Stage& stage,
-                                     const std::unordered_map<IterVar, Range>& realize_map,
-                                     const Stmt& body, String storage_scope) const {
-  return body;
-}
-
-Stmt PlaceholderOpNode::BuildProvide(const Stage& stage,
-                                     const std::unordered_map<IterVar, Range>& dom_map,
-                                     bool debug_keep_trivial_loop) const {
-  return Stmt();
-}
 }  // namespace te
 }  // namespace tvm
diff --git a/src/te/operation/scan_op.cc b/src/te/operation/scan_op.cc
index 39689bd9654a..9f045a069289 100644
--- a/src/te/operation/scan_op.cc
+++ b/src/te/operation/scan_op.cc
@@ -25,9 +25,6 @@
 #include <tvm/te/operation.h>
 #include <tvm/tir/expr.h>
 
-#include "../schedule/graph.h"
-#include "op_utils.h"
-
 namespace tvm {
 namespace te {
 using namespace tir;
@@ -40,13 +37,6 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 TVM_REGISTER_NODE_TYPE(ScanOpNode);
 
 int ScanOpNode::num_outputs() const { return static_cast<int>(update.size()); }
-Array<IterVar> ScanOpNode::root_iter_vars() const {
-  Array<IterVar> ret{scan_axis};
-  for (IterVar iv : spatial_axis_) {
-    ret.push_back(iv);
-  }
-  return ret;
-}
 
 DataType ScanOpNode::output_dtype(size_t i) const { return update[i]->dtype; }
 
@@ -139,141 +129,5 @@ Array<Tensor> ScanOpNode::InputTensors() const {
   return ret;
 }
 
-Operation ScanOpNode::ReplaceInputs(const Operation& self,
-                                    const std::unordered_map<Tensor, Tensor>& rmap) const {
-  ICHECK_EQ(self.operator->(), this);
-  auto n = make_object<ScanOpNode>(*this);
-  for (size_t i = 0; i < n->init.size(); ++i) {
-    if (rmap.count(n->init[i])) {
-      n->init.Set(i, rmap.at(n->init[i]));
-    }
-    if (rmap.count(n->update[i])) {
-      n->update.Set(i, rmap.at(n->update[i]));
-    }
-  }
-  if (!n->init.same_as(init) || !n->update.same_as(update)) {
-    return Operation(n);
-  } else {
-    return self;
-  }
-}
-
-void ScanOpNode::PropBoundToInputs(const Operation& self, arith::Analyzer* analyzer,
-                                   const std::unordered_map<const VarNode*, IntSet>& dom_map,
-                                   std::unordered_map<Tensor, TensorDom>* out_dom_map) const {
-  ICHECK_EQ(self.operator->(), this);
-  for (size_t i = 0, sp_idx = 0; i < this->init.size(); ++i) {
-    TensorDom* init_dom = nullptr;
-    TensorDom* update_dom = nullptr;
-    if (out_dom_map->count(this->init[i])) {
-      init_dom = &out_dom_map->at(this->init[i]);
-    }
-    if (out_dom_map->count(this->update[i])) {
-      update_dom = &out_dom_map->at(this->update[i]);
-    }
-    // first dimension, always needed.
-    if (init_dom) {
-      init_dom->data[0].push_back(
-          IntSet::FromRange(Range::FromMinExtent(0, this->init[i]->shape[0])));
-    }
-    if (update_dom) {
-      update_dom->data[0].push_back(dom_map.at(this->scan_axis->var.get()));
-    }
-    // The update dimensions
-    for (size_t k = 1; k < this->update[i]->shape.size(); ++k, ++sp_idx) {
-      IterVar sp_ax = this->spatial_axis_[sp_idx];
-      if (init_dom) {
-        init_dom->data[k].push_back(dom_map.at(sp_ax->var.get()));
-      }
-      if (update_dom) {
-        update_dom->data[k].push_back(dom_map.at(sp_ax->var.get()));
-      }
-    }
-  }
-}
-
-void ScanOpNode::GatherBound(const Operation& self,
-                             const std::unordered_map<Tensor, TensorDom>& tensor_dom,
-                             std::unordered_map<IterVar, Range>* out_dom_map) const {
-  ICHECK_EQ(self.operator->(), this);
-  ICHECK(!out_dom_map->count(this->scan_axis));
-  std::vector<Tensor> output(this->num_outputs());
-  for (size_t i = 0; i < output.size(); ++i) {
-    output[i] = self.output(i);
-  }
-  // Update for time axis.
-  std::vector<IntSet> time_dom;
-  for (size_t i = 0; i < output.size(); ++i) {
-    const TensorDom& d = tensor_dom.at(output[i]);
-    time_dom.insert(time_dom.end(), d.data[0].begin(), d.data[0].end());
-  }
-  ICHECK(!out_dom_map->count(this->scan_axis));
-  arith::Analyzer analyzer;
-  Range sdom = this->scan_axis->dom;
-  Range r = arith::Union(time_dom).CoverRange(sdom);
-  (*out_dom_map)[this->scan_axis] =
-      Range::FromMinExtent(sdom->min, analyzer.Simplify(r->extent + r->min - sdom->min));
-  Map<IterVar, PrimExpr> fix_pt = ScanFixPointAnalysis(self);
-  // Update for spatial axis.
-  size_t sp_idx = 0;
-  for (size_t i = 0; i < output.size(); ++i) {
-    const TensorDom& d = tensor_dom.at(output[i]);
-    for (size_t k = 1; k < this->update[i]->shape.size(); ++k, ++sp_idx) {
-      IterVar sp_ax = this->spatial_axis_[sp_idx];
-      ICHECK(!out_dom_map->count(sp_ax));
-      ICHECK(fix_pt.count(sp_ax));
-      if (fix_pt[sp_ax].as<tir::IntImmNode>()->value) {
-        // fix point, we can slice it.
-        (*out_dom_map)[sp_ax] = arith::Union(d.data[k]).CoverRange(sp_ax->dom);
-      } else {
-        // not a fix point, need to include everything.
-        (*out_dom_map)[sp_ax] = sp_ax->dom;
-      }
-    }
-  }
-}
-
-Stmt ScanOpNode::BuildRealize(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                              const Stmt& body, String storage_scope) const {
-  arith::Analyzer analyzer;
-  ICHECK_EQ(stage->op.get(), this);
-  Range sdom = dom_map.at(this->scan_axis);
-  Range tdom = Range::FromMinExtent(0, analyzer.Simplify(sdom->extent + sdom->min));
-  Stmt ret = body;
-  size_t sp_idx = 0;
-  for (size_t i = 0; i < update.size(); ++i) {
-    Tensor t = stage->op.output(i);
-    ICHECK_EQ(static_cast<size_t>(t->value_index), i);
-    Region bounds;
-    bounds.push_back(tdom);
-    for (size_t k = 1; k < this->update[i]->shape.size(); ++k, ++sp_idx) {
-      IterVar sp_ax = this->spatial_axis_[sp_idx];
-      bounds.push_back(dom_map.at(sp_ax));
-    }
-    ret = tir::ProducerRealize(t, bounds, const_true(), ret, storage_scope);
-  }
-  return ret;
-}
-
-Stmt ScanOpNode::BuildProvide(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                              bool debug_keep_trivial_loop) const {
-  ICHECK_EQ(stage->op.operator->(), this);
-  Stmt provide =
-      AttrStmt(stage->op, tir::attr::scan_update_scope, this->scan_axis->var, Evaluate(0));
-  Stmt init = AttrStmt(stage->op, tir::attr::scan_init_scope, 0, Evaluate(0));
-  size_t begin_scan = 0;
-  for (size_t i = 0; i < stage->leaf_iter_vars.size(); ++i) {
-    if (stage->leaf_iter_vars[i]->iter_type == kThreadIndex) {
-      ICHECK_EQ(begin_scan, i);
-      begin_scan = i + 1;
-    }
-  }
-  std::unordered_map<IterVar, PrimExpr> vmap;
-  std::unordered_set<IterVar> empty;
-  auto nest = MakeLoopNest(stage, dom_map, 0, false, empty, &vmap, debug_keep_trivial_loop);
-  nest[begin_scan].push_back(init);
-  nest.push_back(MakeIfNest(MakeBoundCheck(stage, dom_map, vmap, false, empty)));
-  return MergeNest(nest, provide);
-}
 }  // namespace te
 }  // namespace tvm
diff --git a/src/te/operation/tensor_compute_op.cc b/src/te/operation/tensor_compute_op.cc
deleted file mode 100644
index 00f751c58a09..000000000000
--- a/src/te/operation/tensor_compute_op.cc
+++ /dev/null
@@ -1,259 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \brief Tensor Compute Op.
- * \file tensor_compute_op.cc
- */
-#include <tvm/arith/analyzer.h>
-#include <tvm/runtime/registry.h>
-#include <tvm/te/operation.h>
-#include <tvm/tir/builtin.h>
-#include <tvm/tir/expr.h>
-#include <tvm/tir/stmt_functor.h>
-
-#include <unordered_set>
-
-#include "./compute_op.h"
-#include "./op_utils.h"
-
-namespace tvm {
-namespace te {
-using namespace tir;
-// TensorComputeOpNode
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<TensorComputeOpNode>([](const ObjectRef& node, ReprPrinter* p) {
-      auto* op = static_cast<const TensorComputeOpNode*>(node.get());
-      p->stream << "tensor_compute_op(" << op->name << ", " << op << ")";
-    });
-
-TVM_REGISTER_NODE_TYPE(TensorComputeOpNode);
-
-int TensorComputeOpNode::num_outputs() const {
-  return static_cast<int>(this->intrin->buffers.size() - this->inputs.size());
-}
-
-DataType TensorComputeOpNode::output_dtype(size_t i) const {
-  return this->intrin->buffers[this->inputs.size() + i]->dtype;
-}
-
-TensorComputeOp::TensorComputeOp(std::string name, std::string tag, Array<IterVar> axis,
-                                 Array<IterVar> reduce_axis, int schedulable_ndim,
-                                 TensorIntrin intrin, Array<Tensor> tensors, Array<Region> regions,
-                                 Array<PrimExpr> scalar_inputs) {
-  auto n = make_object<TensorComputeOpNode>();
-  n->name = std::move(name);
-  n->tag = std::move(tag);
-  n->axis = std::move(axis);
-  n->reduce_axis = std::move(reduce_axis);
-  n->schedulable_ndim = std::move(schedulable_ndim);
-  n->intrin = std::move(intrin);
-  n->inputs = std::move(tensors);
-  n->input_regions = std::move(regions);
-  n->scalar_inputs = std::move(scalar_inputs);
-  data_ = std::move(n);
-}
-
-TVM_REGISTER_GLOBAL("te.TensorComputeOp")
-    .set_body_typed([](std::string name, std::string tag, Array<IterVar> axis,
-                       Array<IterVar> reduce_axis, int schedulable_ndim, TensorIntrin intrin,
-                       Array<Tensor> tensors, Array<Region> regions,
-                       Array<PrimExpr> scalar_inputs) {
-      return TensorComputeOp(name, tag, axis, reduce_axis, schedulable_ndim, intrin, tensors,
-                             regions, scalar_inputs);
-    });
-
-Array<Tensor> TensorComputeOpNode::InputTensors() const { return inputs; }
-
-Operation TensorComputeOpNode::ReplaceInputs(const Operation& self,
-                                             const std::unordered_map<Tensor, Tensor>& rmap) const {
-  ICHECK_EQ(self.operator->(), this);
-  auto n = make_object<TensorComputeOpNode>(*this);
-  auto intrin = make_object<TensorIntrinNode>(*(this->intrin.operator->()));
-  intrin->body = ReplaceTensor(this->intrin->body, rmap);
-  if (intrin->reduce_init.defined()) {
-    intrin->reduce_init = ReplaceTensor(this->intrin->reduce_init, rmap);
-  }
-  if (intrin->reduce_update.defined()) {
-    intrin->reduce_update = ReplaceTensor(this->intrin->reduce_update, rmap);
-  }
-  for (size_t i = 0; i < n->inputs.size(); ++i) {
-    Tensor t = n->inputs[i];
-    if (rmap.count(t)) {
-      n->inputs.Set(i, rmap.at(t));
-    }
-  }
-
-  if (intrin->body.same_as(n->intrin->body) &&
-      intrin->reduce_init.same_as(n->intrin->reduce_init) &&
-      intrin->reduce_update.same_as(n->intrin->reduce_update) && inputs.same_as(n->inputs)) {
-    return self;
-  } else {
-    n->intrin = TensorIntrin(intrin);
-    return Operation(n);
-  }
-}
-
-void TensorComputeOpNode::PropBoundToInputs(
-    const Operation& self, arith::Analyzer* analyzer,
-    const std::unordered_map<const VarNode*, IntSet>& dom_map,
-    std::unordered_map<Tensor, TensorDom>* out_dom_map) const {
-  for (size_t i = 0; i < this->inputs.size(); ++i) {
-    Tensor t = this->inputs[i];
-    Region region = input_regions[i];
-
-    auto it = out_dom_map->find(t);
-    if (it == out_dom_map->end()) continue;
-    TensorDom& dom = it->second;
-    for (size_t j = 0; j < t.ndim(); ++j) {
-      dom.data[j].emplace_back(EvalSet(region[j], dom_map));
-    }
-  }
-}
-
-size_t TensorComputeOpNode::num_schedulable_dims() const { return schedulable_ndim; }
-
-Stmt TensorComputeOpNode::BuildProvide(const Stage& stage,
-                                       const std::unordered_map<IterVar, Range>& dom_map,
-                                       bool debug_keep_trivial_loop) const {
-  ICHECK_EQ(stage->op.operator->(), this);
-
-  // Start bind data.
-  Stmt nop = Evaluate(0);
-  std::vector<Stmt> input_bind_nest, output_bind_nest;
-  Array<Tensor> inputs = this->InputTensors();
-
-  // input binding
-  size_t num_inputs = inputs.size();
-  for (size_t i = 0; i < num_inputs; ++i) {
-    Tensor tensor = inputs[i];
-    Region region = this->input_regions[i];
-    Buffer buffer = this->intrin->buffers[i];
-    Array<ObjectRef> bind_spec{buffer, tensor};
-
-    Array<PrimExpr> tuple;
-    for (size_t i = 0; i < region.size(); ++i) {
-      tuple.push_back(region[i]->min);
-      tuple.push_back(region[i]->extent);
-    }
-    input_bind_nest.emplace_back(
-        AttrStmt(bind_spec, tir::attr::buffer_bind_scope,
-                 Call(DataType::Handle(), tir::builtin::tvm_tuple(), tuple), nop));
-  }
-
-  // output binding
-  for (int i = 0; i < this->num_outputs(); ++i) {
-    Tensor tensor = stage->op.output(i);
-    Buffer buffer = this->intrin->buffers[num_inputs + i];
-    Array<ObjectRef> bind_spec{buffer, tensor};
-
-    Array<PrimExpr> tuple;
-    for (size_t i = 0; i < this->axis.size(); ++i) {
-      auto ivar = this->axis[i];
-      if (i < static_cast<size_t>(this->schedulable_ndim)) {
-        tuple.push_back(ivar->var);
-        tuple.push_back(1);
-      } else {
-        Range dom = ivar->dom;
-        tuple.push_back(dom->min);
-        tuple.push_back(dom->extent);
-      }
-    }
-
-    output_bind_nest.emplace_back(
-        AttrStmt(bind_spec, tir::attr::buffer_bind_scope,
-                 Call(DataType::Handle(), tir::builtin::tvm_tuple(), tuple), nop));
-  }
-
-  // Check variable remap
-  std::unordered_map<const VarNode*, PrimExpr> vmap;
-  tir::ArgBinder binder(&vmap);
-
-  // Map the expressions passed in the call to the TensorIntrin, to the placeholder
-  // variables
-  Array<PrimExpr> user_expr = this->scalar_inputs;
-  Array<Var> scalar_params = this->intrin->scalar_params;
-  Array<PrimExpr> sp_expr;
-  for (auto sp : scalar_params) {
-    PrimExpr esp = sp;
-    sp_expr.push_back(esp);
-  }
-  ICHECK_EQ(sp_expr.size(), user_expr.size());
-  // TODO(jdavies-huawei): what name should be used here?
-  binder.BindArray(sp_expr, user_expr, this->name);
-
-  size_t tloc = stage->leaf_iter_vars.size();
-  ComputeLoopNest n = ComputeLoopNest::Create(this, stage, dom_map, debug_keep_trivial_loop);
-
-  if (this->reduce_axis.size() == 0) {
-    std::vector<std::vector<Stmt>> nest(n.main_nest.begin(), n.main_nest.begin() + tloc + 1);
-    nest.emplace_back(MakeIfNest(n.main_predicates));
-    ICHECK_EQ(n.init_predicates.size(), 0U);
-    ICHECK(this->intrin->body.defined())
-        << "Normal store op for intrin " << this << " is not defined";
-    Stmt body = MergeNest(output_bind_nest, this->intrin->body);
-    body = MergeNest(input_bind_nest, body);
-    body = tir::Substitute(body, vmap);
-    body = MergeNest(binder.asserts(), body);
-    body = te::Substitute(body, n.main_vmap);
-    Stmt ret = MergeNest(nest, body);
-    return ret;
-  } else {
-    // Need to split reduction
-    ICHECK(this->intrin->reduce_update.defined()) << "Reduction update op is not defined";
-    // Need init and update steps
-    ICHECK_NE(this->reduce_axis.size(), 0U);
-    std::vector<std::vector<Stmt>> common(n.main_nest.begin(),
-                                          n.main_nest.begin() + n.num_common_loop + 1);
-    std::vector<std::vector<Stmt>> update_nest(n.main_nest.begin() + n.num_common_loop + 1,
-                                               n.main_nest.begin() + tloc + 1);
-    update_nest.emplace_back(MakeIfNest(n.main_predicates));
-
-    if (this->intrin->reduce_init.defined()) {
-      // init nest
-      std::vector<std::vector<Stmt>> init_nest(n.init_nest.begin(), n.init_nest.begin() + tloc + 1);
-      init_nest.emplace_back(MakeIfNest(n.init_predicates));
-      Stmt init = MergeNest(output_bind_nest, this->intrin->reduce_init);
-      init = te::Substitute(init, n.init_vmap);
-      init = MergeNest(init_nest, init);
-      // The update
-      Stmt update = MergeNest(output_bind_nest, this->intrin->reduce_update);
-      update = MergeNest(input_bind_nest, update);
-      update = tir::Substitute(update, vmap);
-      update = MergeNest(binder.asserts(), update);
-      update = te::Substitute(update, n.main_vmap);
-      update = MergeNest(update_nest, update);
-      return MergeNest(common, SeqStmt::Flatten(init, update));
-    } else {
-      // When init op is not available, use body op for reset in the first iter.
-      ICHECK(this->intrin->body.defined()) << "Normal body op is not defined";
-      Stmt update =
-          TransformUpdate(stage, dom_map, n, this->intrin->body, this->intrin->reduce_update);
-      update = MergeNest(output_bind_nest, update);
-      update = MergeNest(input_bind_nest, update);
-      update = tir::Substitute(update, vmap);
-      update = MergeNest(binder.asserts(), update);
-      update = te::Substitute(update, n.main_vmap);
-      update = MergeNest(update_nest, update);
-      return MergeNest(common, update);
-    }
-  }
-}
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/operation/tensorize.cc b/src/te/operation/tensorize.cc
deleted file mode 100644
index 138aeeb37f19..000000000000
--- a/src/te/operation/tensorize.cc
+++ /dev/null
@@ -1,496 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \brief Logics related to tensorize, used by ComputeOpNode.
- * \file tensorize.cc
- */
-#include <tvm/runtime/registry.h>
-#include <tvm/tir/analysis.h>
-#include <tvm/tir/expr.h>
-#include <tvm/tir/stmt_functor.h>
-
-#include "../schedule/message_passing.h"
-#include "compute_op.h"
-#include "op_utils.h"
-
-namespace tvm {
-namespace te {
-
-using namespace tir;
-
-// Detect the region of input and output to be tensorized.
-// out_dom: the domain of root iter vars in output op
-// in_region: region of each input tensor.
-// return The location of the tensorized scope start.
-size_t InferTensorizeRegion(const ComputeOpNode* self, const Stage& stage,
-                            const std::unordered_map<IterVar, Range>& dom_map,
-                            std::unordered_map<IterVar, Range>* out_dom,
-                            std::unordered_map<Tensor, Array<Range>>* in_region) {
-  // Get the bound of the tensorized scope.
-  bool found_point = false;
-  size_t loc_scope = 0;
-  std::unordered_map<IterVar, IntSet> up_state;
-  // Loop over the leafs
-  for (size_t i = stage->leaf_iter_vars.size(); i != 0; --i) {
-    IterVar iv = stage->leaf_iter_vars[i - 1];
-    ICHECK(iv->iter_type == kDataPar || iv->iter_type == kCommReduce);
-    auto vit = dom_map.find(iv);
-    ICHECK(vit != dom_map.end());
-    const Range& vrange = vit->second;
-    if (is_one(vrange->extent)) {
-      up_state[iv] = IntSet::SinglePoint(vrange->min);
-    } else if (found_point) {
-      ICHECK(is_zero(vrange->min));
-      up_state[iv] = IntSet::SinglePoint(iv->var);
-    } else {
-      up_state[iv] = IntSet::FromRange(vrange);
-    }
-    auto iit = stage->iter_var_attrs.find(iv);
-    if (iit != stage->iter_var_attrs.end()) {
-      const IterVarAttr& attr = (*iit).second;
-      if (!found_point) {
-        ICHECK(!attr->bind_thread.defined()) << "Do not allow thread in tensorize scope";
-      }
-      if (attr->iter_type == kTensorized) {
-        ICHECK(!found_point) << "Do not allow two tensorized point";
-        found_point = true;
-        loc_scope = i - 1;
-      }
-    }
-  }
-  ICHECK(found_point);
-  // Get domain of the tensorized scope.
-  te::PassUpDomain(stage, dom_map, &up_state);
-  // Get domains if inputs
-  std::unordered_map<Tensor, TensorDom> in_dom;
-  std::unordered_map<const VarNode*, IntSet> temp_dmap;
-  arith::Analyzer analyzer;
-  Array<Tensor> inputs = self->InputTensors();
-  for (Tensor t : inputs) {
-    in_dom.emplace(t, TensorDom(t.ndim()));
-  }
-  for (IterVar iv : self->root_iter_vars()) {
-    IntSet iset = up_state.at(iv);
-    Range iv_range = iset.CoverRange(dom_map.at(iv));
-    (*out_dom)[iv] = iv_range;
-    analyzer.Bind(iv->var, iv_range);
-    temp_dmap[iv->var.get()] = iset;
-  }
-  // Input domains
-  self->PropBoundToInputs(stage->op, &analyzer, temp_dmap, &in_dom);
-  Range none;
-  for (const auto& kv : in_dom) {
-    Array<Range> vec;
-    const Tensor& t = kv.first;
-    for (size_t i = 0; i < t.ndim(); ++i) {
-      Range r = arith::Union(kv.second.data.at(i)).CoverRange(none);
-      ICHECK(r.defined()) << "cannot deduce region of tensorized scope for input " << t;
-      vec.push_back(std::move(r));
-    }
-    (*in_region)[t] = std::move(vec);
-  }
-  return loc_scope;
-}
-
-void VerifyTensorizeLoopNest(const ComputeOpNode* self, const Stage& stage,
-                             const ComputeLoopNest& n, size_t tloc) {
-  // Veirfication step.
-  std::unordered_set<const VarNode*> banned;
-  ICHECK_EQ(n.main_nest.size(), stage->leaf_iter_vars.size() + 1);
-  ICHECK(n.init_nest.size() == stage->leaf_iter_vars.size() + 1 || n.init_nest.size() == 0);
-  auto f_push_banned = [&banned](const Stmt& s) {
-    if (const ForNode* op = s.as<ForNode>()) {
-      banned.insert(op->loop_var.get());
-    } else if (const AttrStmtNode* op = s.as<AttrStmtNode>()) {
-      if (const IterVarNode* iv = op->node.as<IterVarNode>()) {
-        banned.insert(iv->var.get());
-      }
-    } else if (const LetStmtNode* op = s.as<LetStmtNode>()) {
-      banned.insert(op->var.get());
-    }
-  };
-  for (size_t i = tloc; i < stage->leaf_iter_vars.size(); ++i) {
-    for (const Stmt& s : n.main_nest[i + 1]) {
-      f_push_banned(s);
-    }
-    if (n.init_nest.size() != 0) {
-      for (const Stmt& s : n.init_nest[i + 1]) {
-        f_push_banned(s);
-      }
-    }
-  }
-
-  auto fbanned = [&](const VarNode* node) { return banned.count(node); };
-
-  for (const PrimExpr& pred : n.main_predicates) {
-    if (tir::UsesVar(pred, fbanned)) {
-      LOG(FATAL) << "Tensorize failed, split condition " << pred
-                 << " relies on var defined inside tensorize scope";
-    }
-  }
-  for (const PrimExpr& pred : n.init_predicates) {
-    if (tir::UsesVar(pred, fbanned)) {
-      LOG(FATAL) << "Tensorize failed, split condition " << pred
-                 << " relies on var defined inside tensorize scope";
-    }
-  }
-}
-
-// Remap the tensor placeholder, index and inline things.
-class TensorIntrinMatcher final : public StmtExprMutator {
- public:
-  PrimExpr VisitExpr_(const ProducerLoadNode* op) final {
-    PrimExpr expr = StmtExprMutator::VisitExpr_(op);
-    op = expr.as<ProducerLoadNode>();
-    auto t = Downcast<Tensor>(op->producer);
-    auto it = in_remap_.find(t);
-    if (it != in_remap_.end()) {
-      const InputEntry& e = it->second;
-      ICHECK_EQ(op->indices.size(), e.region.size());
-      Array<PrimExpr> indices;
-      for (size_t i = e.start; i < e.region.size(); ++i) {
-        indices.push_back(op->indices[i] - e.region[i]->min);
-      }
-      return ProducerLoad(e.tensor, indices);
-    }
-    return expr;
-  }
-
-  PrimExpr VisitExpr_(const VarNode* op) final {
-    auto it = var_remap_.find(op);
-    if (it != var_remap_.end()) {
-      return it->second;
-    } else {
-      return GetRef<PrimExpr>(op);
-    }
-  }
-
-  PrimExpr VisitExpr_(const ReduceNode* op) final {
-    PrimExpr expr = StmtExprMutator::VisitExpr_(op);
-    op = expr.as<ReduceNode>();
-    Array<IterVar> axis;
-    for (size_t i = 0; i < op->axis.size(); ++i) {
-      auto it = axis_remap_.find(op->axis[i]);
-      if (it != axis_remap_.end()) {
-        axis.push_back(it->second);
-      }
-    }
-    return Reduce(op->combiner, op->source, axis, op->condition, op->value_index, op->init);
-  }
-
-  void Init(const ComputeOpNode* self, const Stage& stage,
-            const std::unordered_map<IterVar, Range>& dom_map,
-            const std::unordered_map<IterVar, Range>& out_dom,
-            const std::unordered_map<Tensor, Array<Range>>& in_region, const TensorIntrin& intrin,
-            Map<Var, Range>* compute_intrin_iter_space) {
-    ICHECK(self == stage->op.get());
-
-    for (size_t i = 0; i < stage->leaf_iter_vars.size(); ++i) {
-      IterVar iv = stage->leaf_iter_vars[i];
-      auto vit = dom_map.find(iv);
-      if (vit != dom_map.end()) {
-        const Range vrange = vit->second;
-        compute_intrin_iter_space->Set(iv->var, vrange);
-      }
-    }
-    analyzer_.Bind(*compute_intrin_iter_space);
-
-    // input remap.
-    Array<Tensor> inputs = self->InputTensors();
-    ICHECK_EQ(inputs.size(), intrin->inputs.size());
-    for (size_t i = 0; i < inputs.size(); ++i) {
-      InputEntry e;
-      e.tensor = intrin->inputs[i];
-      e.region = Array<Range>(in_region.at(inputs[i]));
-      ICHECK_GE(e.region.size(), e.tensor.ndim());
-      // Enable fuzzy matching, to match [1, n, m] to [n, m]
-      e.start = e.region.size() - e.tensor.ndim();
-      for (size_t j = 0; j < e.start; ++j) {
-        auto canonical_extent = analyzer_.Simplify(e.region[j]->extent);
-        ICHECK(is_one(canonical_extent))
-            << "Tensorize " << intrin->name << ":"
-            << " Input dimension mismatch with tensor intrin "
-            << " expected shape=" << e.tensor->shape << ", given region=" << e.region;
-      }
-      in_remap_[inputs[i]] = e;
-    }
-    // output remap
-    const ComputeOpNode* intrin_compute = intrin->op.as<ComputeOpNode>();
-    ICHECK(intrin_compute) << "Only support compute intrinsic for now";
-    ICHECK_GE(self->axis.size(), intrin_compute->axis.size())
-        << "Tensorize: Output mismatch with tensor intrin ";
-    // Enable fuzzy matching, to match [1, n, m] to [n, m]
-    size_t axis_start = self->axis.size() - intrin_compute->axis.size();
-    for (size_t i = 0; i < axis_start; ++i) {
-      Range r = out_dom.at(self->axis[i]);
-      ICHECK(is_one(r->extent)) << "Tensorize: Output mismatch with tensor intrin "
-                                << " intrin-dim=" << intrin_compute->axis.size()
-                                << ", tensorize-dim=" << self->axis.size();
-      var_remap_[self->axis[i]->var.get()] = r->min;
-    }
-    // Assume we tensorize at regin axis i [min, min + extent)
-    // The corresponding intrinsic axis is j [0, extent)
-    // Remap index i to j + min
-    for (size_t i = axis_start; i < self->axis.size(); ++i) {
-      IterVar iv = self->axis[i];
-      IterVar target_iv = intrin_compute->axis[i - axis_start];
-      Range r = out_dom.at(iv);
-      var_remap_[iv->var.get()] = target_iv->var + r->min;
-      axis_remap_[iv] = target_iv;
-      compute_intrin_iter_space->Set(target_iv->var, target_iv->dom);
-    }
-    // Remap reduction axis
-    ICHECK_GE(self->reduce_axis.size(), intrin_compute->reduce_axis.size())
-        << "Tensorize: Reduction dimension mismatch with tensor intrin";
-    axis_start = self->reduce_axis.size() - intrin_compute->reduce_axis.size();
-    for (size_t i = 0; i < axis_start; ++i) {
-      Range r = out_dom.at(self->reduce_axis[i]);
-      ICHECK(is_one(r->extent)) << "Tensorize: Reduction mismatch with tensor intrin "
-                                << " intrin-dim=" << intrin_compute->reduce_axis.size()
-                                << ", tensorize-dim=" << self->reduce_axis.size();
-      var_remap_[self->reduce_axis[i]->var.get()] = r->min;
-    }
-    for (size_t i = axis_start; i < self->reduce_axis.size(); ++i) {
-      IterVar iv = self->reduce_axis[i];
-      IterVar target_iv = intrin_compute->reduce_axis[i - axis_start];
-      Range r = out_dom.at(iv);
-      var_remap_[iv->var.get()] = target_iv->var + r->min;
-      axis_remap_[iv] = target_iv;
-      compute_intrin_iter_space->Set(target_iv->var, target_iv->dom);
-    }
-  }
-
- private:
-  // Input entry
-  struct InputEntry {
-    Tensor tensor;
-    size_t start;
-    Array<Range> region;
-  };
-  // input data remap
-  std::unordered_map<Tensor, InputEntry> in_remap_;
-  // variable remap.
-  std::unordered_map<const VarNode*, PrimExpr> var_remap_;
-  // IterVar remap.
-  std::unordered_map<IterVar, IterVar> axis_remap_;
-  // arith analyzer
-  arith::Analyzer analyzer_;
-};
-
-// Try to match tensor dataflow of the stage with the intrinsic
-Array<PrimExpr> MatchTensorizeBody(const ComputeOpNode* self, const Stage& stage,
-                                   const std::unordered_map<IterVar, Range>& dom_map,
-                                   const std::unordered_map<IterVar, Range>& out_dom,
-                                   const std::unordered_map<Tensor, Array<Range>>& in_region,
-                                   const TensorIntrin& intrin,
-                                   Map<Var, Range>* compute_intrin_iter_space) {
-  TensorIntrinMatcher matcher;
-  matcher.Init(self, stage, dom_map, out_dom, in_region, intrin, compute_intrin_iter_space);
-  Array<PrimExpr> ret;
-  for (PrimExpr expr : self->body) {
-    ret.push_back(matcher(expr));
-  }
-  return ret;
-}
-
-void VerifyTensorizeBody(const ComputeOpNode* self, const Stage& stage,
-                         const std::unordered_map<IterVar, PrimExpr>& value_map,
-                         const std::unordered_map<IterVar, Range>& dom_map,
-                         const std::unordered_map<IterVar, Range>& out_dom,
-                         const std::unordered_map<Tensor, Array<Range>>& in_region,
-                         const TensorIntrin& intrin) {
-  StructuralEqual expr_equal;
-  Map<Var, Range> compute_intrin_iter_space;
-  Array<PrimExpr> body = MatchTensorizeBody(self, stage, dom_map, out_dom, in_region, intrin,
-                                            &compute_intrin_iter_space);
-  const ComputeOpNode* intrin_compute = intrin->op.as<ComputeOpNode>();
-  ICHECK(intrin_compute) << "Only support compute intrinsic for now";
-  ICHECK_EQ(body.size(), intrin_compute->body.size()) << "Tensorize failed: body size mismatch";
-  arith::Analyzer ana;
-  ana.Bind(compute_intrin_iter_space);
-
-  for (size_t i = 0; i < body.size(); ++i) {
-    PrimExpr lhs = ana.Simplify(Substitute(body[i], value_map));
-    // run substitution because the intrin body could depend on outer loop vars.
-    PrimExpr rhs = ana.Simplify(Substitute(intrin_compute->body[i], value_map));
-    if (lhs.dtype() != rhs.dtype()) {
-      LOG(FATAL) << "Failed to match the data type with TensorIntrin " << intrin->name
-                 << "'s declaration "
-                 << " provided=" << lhs.dtype() << ", intrin=" << rhs.dtype();
-    }
-    ICHECK(expr_equal(lhs, rhs)) << "Failed to match the compute with TensorIntrin " << intrin->name
-                                 << "'s declaration "
-                                 << " provided= " << lhs << ", intrin=  " << rhs
-                                 << ", running this stage: " << stage;
-  }
-}
-
-Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage,
-                   const std::unordered_map<IterVar, Range>& dom_map,
-                   bool debug_keep_trivial_loop) {
-  std::unordered_map<IterVar, Range> out_dom;
-  std::unordered_map<Tensor, Array<Range>> in_region;
-  size_t tloc = InferTensorizeRegion(self, stage, dom_map, &out_dom, &in_region);
-  TensorIntrin intrin = stage->iter_var_attrs.at(stage->leaf_iter_vars[tloc])->tensor_intrin;
-  ICHECK(intrin.defined());
-  ComputeLoopNest n = ComputeLoopNest::Create(self, stage, dom_map, debug_keep_trivial_loop);
-  VerifyTensorizeLoopNest(self, stage, n, tloc);
-  VerifyTensorizeBody(self, stage, n.main_vmap, dom_map, out_dom, in_region, intrin);
-  // Start bind data.
-  Stmt nop = Evaluate(0);
-  std::vector<Stmt> input_bind_nest, output_bind_nest;
-  Array<Tensor> inputs = self->InputTensors();
-  ICHECK_EQ(inputs.size(), intrin->inputs.size()) << "Tensorize failed: input size mismatch ";
-  // input binding
-  for (size_t i = 0; i < intrin->inputs.size(); ++i) {
-    Tensor tensor = inputs[i];
-    Buffer buffer = intrin->buffers[i];
-    Array<ObjectRef> bind_spec{buffer, tensor};
-    auto it = in_region.find(tensor);
-    ICHECK(it != in_region.end());
-    const Array<Range>& region = it->second;
-    Array<PrimExpr> tuple;
-    for (const Range r : region) {
-      tuple.push_back(r->min);
-      tuple.push_back(r->extent);
-    }
-    input_bind_nest.emplace_back(
-        AttrStmt(bind_spec, tir::attr::buffer_bind_scope,
-                 Call(DataType::Handle(), tir::builtin::tvm_tuple(), tuple), nop));
-  }
-  // output binding
-  const ComputeOpNode* intrin_compute = intrin->op.as<ComputeOpNode>();
-  ICHECK(intrin_compute) << "Only support compute intrinsic for now";
-  ICHECK_EQ(intrin->inputs.size() + intrin_compute->body.size(), intrin->buffers.size());
-  ICHECK_EQ(intrin_compute->body.size(), self->body.size());
-  Array<PrimExpr> tuple;
-  for (IterVar iv : self->axis) {
-    auto it = out_dom.find(iv);
-    ICHECK(it != out_dom.end());
-    tuple.push_back(it->second->min);
-    tuple.push_back(it->second->extent);
-  }
-  for (size_t i = intrin->inputs.size(); i < intrin->buffers.size(); ++i) {
-    Tensor tensor = stage->op.output(i - intrin->inputs.size());
-    Buffer buffer = intrin->buffers[i];
-    Array<ObjectRef> bind_spec{buffer, tensor};
-    output_bind_nest.emplace_back(
-        AttrStmt(bind_spec, tir::attr::buffer_bind_scope,
-                 Call(DataType::Handle(), tir::builtin::tvm_tuple(), tuple), nop));
-  }
-  // Check variable remap
-  std::unordered_map<const VarNode*, PrimExpr> vmap;
-  tir::ArgBinder binder(&vmap);
-  ICHECK_GE(self->reduce_axis.size(), intrin_compute->reduce_axis.size())
-      << "Tensorization fail: reduction axis size do not match";
-  size_t start = self->reduce_axis.size() - intrin_compute->reduce_axis.size();
-  for (size_t i = 0; i < start; ++i) {
-    IterVar iv = self->reduce_axis[i];
-    auto it = out_dom.find(iv);
-    ICHECK(it != out_dom.end());
-    ICHECK(is_one(it->second->extent)) << "Tensorization fail: reduction axis size do not match";
-  }
-  for (size_t i = start; i < self->reduce_axis.size(); ++i) {
-    IterVar iv = self->reduce_axis[i];
-    IterVar target = intrin_compute->reduce_axis[i - start];
-    auto it = out_dom.find(iv);
-    ICHECK(it != out_dom.end());
-    binder.Bind(target->dom->min, make_const(iv->dom->min.dtype(), 0),
-                "tensir_intrin.reduction.min");
-    binder.Bind(target->dom->extent, it->second->extent, "tensir_intrin.reduction.extent");
-  }
-  if (tloc <= n.num_common_loop) {
-    // Do no need to split reduction
-    std::vector<std::vector<Stmt>> nest(n.main_nest.begin(), n.main_nest.begin() + tloc + 1);
-    nest.emplace_back(MakeIfNest(n.main_predicates));
-    ICHECK_EQ(n.init_predicates.size(), 0U);
-    ICHECK(intrin->body.defined()) << "Normal store op for intrin " << intrin << " is not defined";
-    Stmt body = MergeNest(output_bind_nest, intrin->body);
-    body = MergeNest(input_bind_nest, body);
-    body = tir::Substitute(body, vmap);
-    body = MergeNest(binder.asserts(), body);
-    body = te::Substitute(body, n.main_vmap);
-    return MergeNest(nest, body);
-  } else {
-    // Need to split reduction
-    ICHECK(intrin->reduce_update.defined())
-        << "Reduction update op for intrin " << intrin << " is not defined";
-    // Need init and update steps
-    ICHECK_NE(self->reduce_axis.size(), 0U);
-    std::vector<std::vector<Stmt>> common(n.main_nest.begin(),
-                                          n.main_nest.begin() + n.num_common_loop + 1);
-    std::vector<std::vector<Stmt>> update_nest(n.main_nest.begin() + n.num_common_loop + 1,
-                                               n.main_nest.begin() + tloc + 1);
-    update_nest.emplace_back(MakeIfNest(n.main_predicates));
-
-    if (intrin->reduce_init.defined()) {
-      // init nest
-      std::vector<std::vector<Stmt>> init_nest(n.init_nest.begin(), n.init_nest.begin() + tloc + 1);
-      init_nest.emplace_back(MakeIfNest(n.init_predicates));
-      Stmt init = MergeNest(output_bind_nest, intrin->reduce_init);
-      init = te::Substitute(init, n.init_vmap);
-      init = MergeNest(init_nest, init);
-      // The update
-      Stmt update = MergeNest(output_bind_nest, intrin->reduce_update);
-      update = MergeNest(input_bind_nest, update);
-      update = tir::Substitute(update, vmap);
-      update = MergeNest(binder.asserts(), update);
-      update = te::Substitute(update, n.main_vmap);
-      update = MergeNest(update_nest, update);
-      return MergeNest(common, SeqStmt::Flatten(init, update));
-    } else {
-      // When init op is not available, use body op for reset in the first iter.
-      ICHECK(intrin->body.defined()) << "Normal body op for intrin " << intrin << " is not defined";
-      Stmt update = TransformUpdate(stage, dom_map, n, intrin->body, intrin->reduce_update);
-      update = MergeNest(output_bind_nest, update);
-      update = MergeNest(input_bind_nest, update);
-      update = tir::Substitute(update, vmap);
-      update = MergeNest(binder.asserts(), update);
-      update = te::Substitute(update, n.main_vmap);
-      update = MergeNest(update_nest, update);
-      return MergeNest(common, update);
-    }
-  }
-}
-
-// Register functions for unittests
-TVM_REGISTER_GLOBAL("test.op.InferTensorizeRegion").set_body([](TVMArgs args, TVMRetValue* ret) {
-  Stage stage = args[0];
-  Map<IterVar, Range> dmap = args[1];
-  std::unordered_map<IterVar, Range> out_dom;
-  std::unordered_map<Tensor, Array<Range>> in_region;
-  ICHECK(stage->op.as<ComputeOpNode>());
-  InferTensorizeRegion(stage->op.as<ComputeOpNode>(), stage, as_unordered_map(dmap), &out_dom,
-                       &in_region);
-  *ret = Array<ObjectRef>{Map<IterVar, Range>(out_dom), Map<Tensor, Array<Range>>(in_region)};
-});
-
-TVM_REGISTER_GLOBAL("test.op.MatchTensorizeBody").set_body([](TVMArgs args, TVMRetValue* ret) {
-  Stage stage = args[0];
-  Map<IterVar, Range> out_dom = args[1];
-  Map<Tensor, Array<Range>> in_region = args[2];
-  TensorIntrin intrin = args[3];
-  Map<Var, Range> vrange;
-  ICHECK(stage->op.as<ComputeOpNode>());
-  *ret = MatchTensorizeBody(stage->op.as<ComputeOpNode>(), stage, {{}}, as_unordered_map(out_dom),
-                            as_unordered_map(in_region), intrin, &vrange);
-});
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/schedule/auto_inline_elem_wise.cc b/src/te/schedule/auto_inline_elem_wise.cc
deleted file mode 100644
index bf584df25825..000000000000
--- a/src/te/schedule/auto_inline_elem_wise.cc
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file auto_inline_elem_wise.cc
- */
-#include <tvm/runtime/registry.h>
-#include <tvm/te/operation.h>
-#include <tvm/te/schedule_pass.h>
-#include <tvm/tir/expr_functor.h>
-
-namespace tvm {
-namespace te {
-
-using namespace tir;
-
-class ElemWiseDetector : public tir::ExprVisitor {
- public:
-  explicit ElemWiseDetector(Array<IterVar> axis) : axis_(axis) {}
-
-  void VisitExpr(const PrimExpr& e) final {
-    if (!is_elem_wise_) return;
-    ExprVisitor::VisitExpr(e);
-  }
-
-  void VisitExpr_(const ProducerLoadNode* op) final {
-    Array<PrimExpr> indices = op->indices;
-    if (axis_.size() != indices.size()) {
-      is_elem_wise_ = false;
-      return;
-    }
-
-    for (size_t i = 0; i < axis_.size(); ++i) {
-      if (!indices[i].same_as(axis_[i]->var)) {
-        is_elem_wise_ = false;
-        return;
-      }
-    }
-    ExprVisitor::VisitExpr_(op);
-  }
-
-  bool is_elem_wise_{true};
-
- private:
-  Array<IterVar> axis_;
-};
-
-bool IsElemWise(const Operation& op) {
-  if (const ComputeOpNode* compute = op.as<ComputeOpNode>()) {
-    ElemWiseDetector v = ElemWiseDetector(compute->axis);
-    for (auto& e : compute->body) v(e);
-    return v.is_elem_wise_;
-  }
-  return false;
-}
-
-void AutoInlineElemWise(Schedule sch) {
-  for (Stage s : sch->stages) {
-    if (!s.is_scheduled() && IsElemWise(s->op) && !s->is_output) {
-      s.compute_inline();
-    }
-  }
-}
-
-bool IsBroadcast(const Operation& op) {
-  if (const ComputeOpNode* compute = op.as<ComputeOpNode>()) {
-    if (compute->reduce_axis.size()) {
-      return false;
-    }
-    constexpr auto kBroadcast = "broadcast";
-    // broadcast op in topi has tag `broadcast`
-    if (op->tag == kBroadcast) {
-      return true;
-    }
-  }
-  return false;
-}
-
-void AutoInlineBroadcast(Schedule sch) {
-  for (Stage s : sch->stages) {
-    if (!s.is_scheduled() && IsBroadcast(s->op) && !s->is_output) {
-      s.compute_inline();
-    }
-  }
-}
-
-bool IsInjective(const Operation& op) {
-  if (const ComputeOpNode* compute = op.as<ComputeOpNode>()) {
-    return compute->reduce_axis.size() == 0;
-  }
-  return false;
-}
-
-void AutoInlineInjective(Schedule sch) {
-  for (Stage s : sch->stages) {
-    if (!s.is_scheduled() && IsInjective(s->op) && !s->is_output) {
-      s.compute_inline();
-    }
-  }
-}
-
-TVM_REGISTER_GLOBAL("schedule.AutoInlineElemWise").set_body_typed(AutoInlineElemWise);
-
-TVM_REGISTER_GLOBAL("schedule.AutoInlineBroadcast").set_body_typed(AutoInlineBroadcast);
-
-TVM_REGISTER_GLOBAL("schedule.AutoInlineInjective").set_body_typed(AutoInlineInjective);
-
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/schedule/bound.cc b/src/te/schedule/bound.cc
deleted file mode 100644
index d8abffd6aa06..000000000000
--- a/src/te/schedule/bound.cc
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file bound.cc
- * \brief The bound inference logic.
- */
-#include <tvm/runtime/registry.h>
-#include <tvm/te/operation.h>
-#include <tvm/te/schedule_pass.h>
-
-#include <unordered_map>
-#include <unordered_set>
-
-#include "../../runtime/thread_storage_scope.h"
-#include "graph.h"
-#include "message_passing.h"
-
-namespace tvm {
-namespace te {
-
-using runtime::StorageRank;
-using runtime::StorageScope;
-using runtime::ThreadScope;
-
-/*! \brief The graph context used during bound inference. */
-struct GraphContext {
-  /*! \brief The feed graph */
-  FeedGraph feed_graph;
-  /*! \brief Attachment path */
-  AttachPath attach_path;
-  /*! \brief The bind map */
-  std::unordered_map<IterVar, IterVar> bind_map;
-  /*! \brief map from op to stage */
-  std::unordered_map<const Object*, Stage> op2stage_;
-};
-
-bool NeedRelax(const IterVar& iv, bool found_attach,
-               const std::unordered_map<IterVar, IterVar>& bind_map,
-               const runtime::StorageScope& scope) {
-  auto it = bind_map.find(iv);
-  const std::string& tag = (it != bind_map.end() ? it->second->thread_tag : iv->thread_tag);
-  if (tag.length() == 0 || tag == "pipeline") {
-    return !found_attach;
-  }
-  ThreadScope ts = ThreadScope::Create(tag);
-
-  // When there is warp memory
-  // threadIdx.x must be set to be warp index.
-  if (scope.rank == StorageRank::kWarp && ts.rank == 1 && ts.dim_index == 0) {
-    return true;
-  }
-  return static_cast<int>(scope.rank) <= ts.rank;
-}
-
-// infer storage scope, if not given
-StorageScope InferStorageScope(const Stage& stage, const GraphContext& ctx) {
-  if (stage->scope.length() != 0) {
-    return StorageScope::Create(stage->scope);
-  }
-  int max_rank = -1;
-  for (IterVar iv : ctx.attach_path.at(stage->op)) {
-    auto it = ctx.bind_map.find(iv);
-    const std::string& tag = (it != ctx.bind_map.end() ? it->second->thread_tag : iv->thread_tag);
-    if (tag != "pipeline" && tag.length() != 0) {
-      max_rank = std::max(max_rank, ThreadScope::Create(tag).rank);
-    }
-  }
-  StorageScope s;
-  s.rank = runtime::DefaultStorageRank(max_rank);
-  return s;
-}
-
-void InferRootBound(const Stage& stage, const GraphContext& ctx,
-                    std::unordered_map<IterVar, Range>* rmap) {
-  ICHECK_NE(stage->attach_type, kInline) << "call schedule.normalize before scheduleops";
-  if (stage->attach_type == kInlinedAlready) return;
-  if (stage->is_output) {
-    // verify correctness.
-    ICHECK_EQ(stage.GetAttachSpec()->attach_type, kGroupRoot) << "Output must be attached at root";
-  }
-  if (stage->is_output || stage->op.as<PlaceholderOpNode>()) {
-    for (auto iv : stage->op->root_iter_vars()) {
-      ICHECK(iv->dom.defined());
-      ICHECK(!rmap->count(iv));
-      (*rmap)[iv] = iv->dom;
-    }
-    return;
-  }
-  // The tensor domain.
-  std::unordered_map<Tensor, TensorDom> tmap;
-  // The consumers of the op.
-  std::unordered_set<Operation> consumers;
-  for (int i = 0; i < stage->op->num_outputs(); ++i) {
-    Tensor t = stage->op.output(i);
-    tmap.emplace(t, TensorDom(static_cast<int>(t.ndim())));
-    auto it = ctx.feed_graph.find(t);
-    if (it != ctx.feed_graph.end()) {
-      for (const Operation& op : it->second) {
-        consumers.insert(op);
-      }
-    } else {
-      LOG(INFO) << "not in feed graph consumer = " << stage->op;
-    }
-  }
-  // storage scope.
-  runtime::StorageScope scope = InferStorageScope(stage, ctx);
-  // Bound prop by other consumers.
-  // - Compute bound by relaxation rules: NeedRelax
-  //   - For normal index, use relative location of loop nest./
-  //   - For thread index, use the thread scope.
-  //
-  Array<IterVar> stage_attach = ctx.attach_path.at(stage->op);
-  // The parent set.
-  for (const Operation& op : consumers) {
-    Map<Var, IntSet> relax_set;
-    std::unordered_map<IterVar, IntSet> up_state;
-    bool found_attach = false;
-    ICHECK(ctx.op2stage_.count(op.get()));
-    const Stage& op_stage = ctx.op2stage_.at(op.get());
-    // Consumer nest
-    for (size_t i = op_stage->leaf_iter_vars.size(); i != 0; --i) {
-      IterVar iv = op_stage->leaf_iter_vars[i - 1];
-      if (stage_attach.size() != 0 && iv == stage_attach[0]) {
-        found_attach = true;
-      }
-      auto it = rmap->find(iv);
-      ICHECK(it != rmap->end());
-      const Range& vrange = it->second;
-      if (is_one(vrange->extent)) {
-        up_state[iv] = IntSet::SinglePoint(vrange->min);
-      } else if (!NeedRelax(iv, found_attach, ctx.bind_map, scope)) {
-        ICHECK(is_zero(vrange->min)) << "InferBound requires every leaf iter var's min equals 0, "
-                                     << " call schedule.normalize to achieve this. ";
-        if (ctx.bind_map.count(iv)) {
-          up_state[iv] = IntSet::SinglePoint(ctx.bind_map.at(iv)->var);
-        } else {
-          up_state[iv] = IntSet::SinglePoint(iv->var);
-        }
-      } else {
-        up_state[iv] = IntSet::FromRange(vrange);
-      }
-    }
-    // Consumer's attach nest
-    for (IterVar iv : ctx.attach_path.at(op)) {
-      if (stage_attach.size() != 0 && iv == stage_attach[0]) {
-        found_attach = true;
-      }
-      Range vrange = rmap->at(iv);
-      ICHECK(is_zero(vrange->min)) << "InferBound requires every leaf iter var's min equals 0, "
-                                   << "call schedule.normalize to achieve this.";
-      if (NeedRelax(iv, found_attach, ctx.bind_map, scope)) {
-        relax_set.Set(iv->var, IntSet::FromRange(vrange));
-        if (ctx.bind_map.count(iv)) {
-          relax_set.Set(ctx.bind_map.at(iv)->var, IntSet::FromRange(vrange));
-        }
-      }
-    }
-    ICHECK(found_attach || stage_attach.size() == 0)
-        << "Invalid Schedule, cannot find the producer " << stage->op
-        << " along the loop nest specified by compute_at of consumer " << op;
-    // Get the domain of the consumer
-    PassUpDomain(op_stage, *rmap, &up_state);
-    // Relax if needed.
-    std::unordered_map<const VarNode*, IntSet> dom_map;
-    arith::Analyzer analyzer;
-    for (auto entry : *rmap) {
-      analyzer.Bind(entry.first->var, entry.second);
-    }
-    for (auto iv : op->root_iter_vars()) {
-      Range r;
-      if (up_state.count(iv)) {
-        r = up_state.at(iv).CoverRange(iv->dom);
-      } else {
-        r = iv->dom;
-      }
-      if (relax_set.size() != 0) {
-        dom_map[iv->var.get()] =
-            IntSet::Interval(analyzer.int_set(r->min, relax_set).min(),
-                             analyzer.int_set(r->min + r->extent - 1, relax_set).max());
-      } else {
-        dom_map[iv->var.get()] = IntSet::FromRange(r);
-      }
-      analyzer.Bind(iv->var, r, true);
-    }
-    op->PropBoundToInputs(op, &analyzer, dom_map, &tmap);
-  }
-  stage->op->GatherBound(stage->op, tmap, rmap);
-}
-
-Map<IterVar, Range> InferBound(const Schedule& sch) {
-  // Prepare context
-  GraphContext ctx;
-  Array<Operation> roots;
-  arith::Analyzer analyzer;
-
-  for (Operation op : sch->outputs) {
-    roots.push_back(sch->stage_map[op]->op);
-  }
-  ctx.feed_graph = CreateFeedGraph(CreateReadGraph(roots));
-
-  for (Stage stage : sch->stages) {
-    for (auto kv : stage->iter_var_attrs) {
-      if (kv.second->bind_thread.defined()) {
-        ICHECK(!ctx.bind_map.count(kv.first));
-        ctx.bind_map[kv.first] = kv.second->bind_thread;
-      }
-    }
-    ctx.op2stage_[stage->op.get()] = stage;
-  }
-  ctx.attach_path = CreateAttachPath(sch);
-  // Run inference.
-  std::unordered_map<IterVar, Range> ret;
-  for (size_t i = sch->stages.size(); i != 0; --i) {
-    const Stage& stage = sch->stages[i - 1];
-    InferRootBound(stage, ctx, &ret);
-
-    // bind bound of root iter vars.
-    for (auto iv : stage->op->root_iter_vars()) {
-      auto it = ret.find(iv);
-      if (it != ret.end()) {
-        analyzer.Bind(iv->var, it->second);
-      }
-    }
-
-    // pass down to get bound of all iter vars.
-    PassDownDomain(stage, &ret, &analyzer);
-    for (IterVar iv : stage->env_threads) {
-      ICHECK(iv->dom.defined());
-      ret[iv] = iv->dom;
-    }
-  }
-  for (auto it = ret.begin(); it != ret.end(); it++) {
-    DataType var_type = it->first->var.dtype();
-    it->second = Range::FromMinExtent(
-        // The range associated with each itervar must have the same dtype as the var
-        analyzer.Simplify(cast(var_type, it->second->min)),
-        analyzer.Simplify(cast(var_type, it->second->extent)));
-  }
-  return Map<IterVar, Range>(ret.begin(), ret.end());
-}
-
-TVM_REGISTER_GLOBAL("schedule.InferBound").set_body_typed(InferBound);
-
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/schedule/graph.cc b/src/te/schedule/graph.cc
deleted file mode 100644
index 502753284da6..000000000000
--- a/src/te/schedule/graph.cc
+++ /dev/null
@@ -1,418 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file graph.cc
- * \brief Utilities to get information about schedule graph.
- */
-#include "graph.h"
-
-#include <tvm/runtime/registry.h>
-#include <tvm/te/operation.h>
-#include <tvm/tir/expr.h>
-#include <tvm/tir/stmt_functor.h>
-
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-
-namespace tvm {
-namespace te {
-// key to specific tensor dimension.
-struct TensorDimKey {
-  Operation op;
-  int value_index;
-  int dim;
-  TensorDimKey() {}
-  TensorDimKey(const Tensor& t, int dim) : op(t->op), value_index(t->value_index), dim(dim) {}
-  TensorDimKey(const Tensor& t, size_t dim)
-      : op(t->op), value_index(t->value_index), dim(static_cast<int>(dim)) {}
-  inline bool operator==(const TensorDimKey& other) const {
-    return op == other.op && value_index == other.value_index && dim == other.dim;
-  }
-  inline bool operator!=(const TensorDimKey& other) const { return !operator==(other); }
-};
-}  // namespace te
-}  // namespace tvm
-
-namespace std {
-template <>
-struct hash<::tvm::te::TensorDimKey> {
-  std::size_t operator()(const ::tvm::te::TensorDimKey& k) const {
-    size_t lhs = ::tvm::ObjectPtrHash()(k.op);
-    size_t rhs = static_cast<size_t>(k.value_index) << 16UL | static_cast<size_t>(k.dim);
-    lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2);
-    return lhs;
-  }
-};
-}  // namespace std
-
-namespace tvm {
-namespace te {
-
-// construct a read graph that gives readers of each operation
-// that the root depend on
-ReadGraph CreateReadGraph(const Array<Operation>& roots) {
-  ReadGraph rmap;
-  std::vector<Operation> stack;
-  std::unordered_set<const Object*> visited;
-  // initialize the roots
-  for (Operation op : roots) {
-    stack.push_back(op);
-    visited.insert(op.get());
-  }
-
-  while (!stack.empty()) {
-    Operation op = stack.back();
-    stack.pop_back();
-    Array<Tensor> deps = op->InputTensors();
-    rmap.Set(op, deps);
-    for (Tensor t : deps) {
-      if (t->op.defined() && visited.count(t->op.get()) == 0) {
-        visited.insert(t->op.get());
-        stack.push_back(t->op);
-      }
-    }
-  }
-  return rmap;
-}
-
-// Do DFS visit to get the subgraph.
-// Return if op is inside the subgraph.
-bool GetSubGraphByPostDFS_(const Operation& op, const std::unordered_set<const Object*>& boundary,
-                           bool include_bounary, std::unordered_map<const Object*, bool>* visited,
-                           Array<Operation>* result) {
-  if (visited->count(op.get())) {
-    return visited->at(op.get());
-  }
-  if (boundary.count(op.get())) {
-    (*visited)[op.get()] = true;
-    if (include_bounary) {
-      result->push_back(op);
-    }
-    return true;
-  }
-  // mark to avoid loop
-  // Not necessary for DAG.
-  (*visited)[op.get()] = false;
-  // check if we can reach boundary.
-  bool reach_boundary = false;
-  for (Tensor t : op->InputTensors()) {
-    if (GetSubGraphByPostDFS_(t->op, boundary, include_bounary, visited, result)) {
-      reach_boundary = true;
-    }
-  }
-  (*visited)[op.get()] = reach_boundary;
-  if (reach_boundary) {
-    result->push_back(op);
-  }
-  return reach_boundary;
-}
-
-Array<Operation> GetSubGraph(const Array<Tensor>& outputs, const Array<Tensor>& inputs,
-                             bool include_inputs) {
-  Array<Operation> result;
-  std::unordered_set<const Object*> boundary;
-  for (Tensor t : inputs) {
-    boundary.insert(t->op.get());
-  }
-  std::unordered_map<const Object*, bool> visited;
-  for (Tensor t : outputs) {
-    GetSubGraphByPostDFS_(t->op, boundary, include_inputs, &visited, &result);
-  }
-  return result;
-}
-
-void PostDFSOrder(const Operation& op, const ReadGraph& g, std::unordered_set<Operation>* visited,
-                  Array<Operation>* post_order) {
-  if (visited->count(op)) return;
-  visited->insert(op);
-  for (const auto& t : g.at(op)) {
-    PostDFSOrder(t->op, g, visited, post_order);
-  }
-  post_order->push_back(op);
-}
-
-Array<Operation> PostDFSOrder(const Array<Operation>& roots, const ReadGraph& g) {
-  std::unordered_set<Operation> visited;
-  Array<Operation> post_order;
-  for (Operation op : roots) {
-    PostDFSOrder(op, g, &visited, &post_order);
-  }
-  return post_order;
-}
-
-FeedGraph CreateFeedGraph(const ReadGraph& g) {
-  FeedGraph fg;
-  for (auto kv : g) {
-    for (Tensor t : kv.second) {
-      fg[t].push_back(kv.first);
-    }
-  }
-  return fg;
-}
-
-AttachPath CreateAttachPath(Schedule sch) {
-  AttachPath ret;
-  for (Stage stage : sch->stages) {
-    std::unordered_set<const Object*> visited;
-    Array<IterVar> path;
-    for (Stage s = stage; s.defined();) {
-      ICHECK(!visited.count(s.get())) << "Find loop in compute_at attach group";
-      visited.insert(s.get());
-      Stage spec = s.GetAttachSpec();
-      bool start_attach;
-      IterVar attach_ivar;
-      if (spec->attach_type == kScope) {
-        attach_ivar = spec->attach_ivar;
-        s = spec->attach_stage;
-        start_attach = false;
-        ICHECK(attach_ivar.defined());
-      } else if (spec->attach_type == kScanUpdate) {
-        s = spec->attach_stage;
-        start_attach = true;
-      } else {
-        break;
-      }
-      ICHECK(s.defined());
-      for (size_t i = s->leaf_iter_vars.size(); i != 0; --i) {
-        IterVar iv = s->leaf_iter_vars[i - 1];
-        if (!start_attach && iv.same_as(attach_ivar)) {
-          start_attach = true;
-        }
-        if (start_attach) path.push_back(iv);
-      }
-      ICHECK(start_attach) << "Invalid Schedule: cannot find attach point " << attach_ivar
-                           << " in the schedule of " << s->op;
-    }
-    if (!ret.count(stage->op)) {
-      ret.Set(stage->op, path);
-    }
-  }
-  return ret;
-}
-
-// graph of push reach relation of tensor dimensions
-using ReachGraph = std::unordered_map<TensorDimKey, std::vector<TensorDimKey>>;
-
-ReachGraph GetReachGraph(const Array<Operation>& ops) {
-  ReachGraph reach;
-  std::unordered_set<const Object*> bset;
-  for (size_t i = 0; i < ops.size(); ++i) {
-    bset.insert(ops[i].get());
-  }
-
-  for (Operation op : ops) {
-    if (const auto* scan_op = op.as<ScanOpNode>()) {
-      const auto& update = scan_op->update;
-      const auto& init = scan_op->init;
-      for (size_t i = 0; i < update.size(); ++i) {
-        Tensor t = op.output(i);
-        for (int k = 1; k < static_cast<int>(update[i]->shape.size()); ++k) {
-          reach[TensorDimKey(t, k)].emplace_back(TensorDimKey(update[i], k));
-          reach[TensorDimKey(t, k)].emplace_back(TensorDimKey(init[i], k));
-        }
-      }
-    } else if (const auto* compute_op = op.as<ComputeOpNode>()) {
-      std::unordered_map<const Object*, TensorDimKey> vmap;
-      const auto& axis = compute_op->axis;
-      Tensor t = op.output(0);
-      for (size_t i = 0; i < axis.size(); ++i) {
-        vmap[axis[i]->var.get()] = TensorDimKey(t, i);
-        reach[TensorDimKey(t, i)] = {};
-      }
-      auto fvisit = [&vmap, &reach, &bset](const ObjectRef& n) {
-        if (auto* pload = n.as<tir::ProducerLoadNode>()) {
-          Tensor t = Downcast<Tensor>(pload->producer);
-          if (!bset.count(t->op.get())) return;
-          for (size_t i = 0; i < pload->indices.size(); ++i) {
-            TensorDimKey dkey(t, static_cast<int>(i));
-            auto fpush = [&dkey, &vmap, &reach](const ObjectRef& node) {
-              const VarNode* v = node.as<VarNode>();
-              auto it = vmap.find(v);
-              if (it != vmap.end()) {
-                reach[it->second].push_back(dkey);
-              }
-            };
-            tir::PostOrderVisit(pload->indices[i], fpush);
-          }
-        }
-      };
-      for (auto& e : compute_op->body) {
-        tir::PostOrderVisit(e, fvisit);
-      }
-    }
-  }
-  return reach;
-}
-
-Array<Operation> ScanGetBody(const Operation& scan_op) {
-  const ScanOpNode* scan = scan_op.as<ScanOpNode>();
-  // Get the body.
-  Array<Tensor> inputs;
-  for (Tensor t : scan->state_placeholder) {
-    inputs.push_back(t);
-  }
-  for (Tensor t : scan->inputs) {
-    inputs.push_back(t);
-  }
-  return GetSubGraph(scan->update, inputs, false);
-}
-
-Map<IterVar, PrimExpr> ScanFixPointAnalysis(const Operation& scan_op) {
-  const ScanOpNode* scan = scan_op.as<ScanOpNode>();
-  Array<Operation> body = ScanGetBody(scan_op);
-
-  std::unordered_map<TensorDimKey, const Object*> exact_reach;
-  std::unordered_set<const Object*> fail_set;
-
-  for (size_t i = 0, sp_idx = 0; i < scan->update.size(); ++i) {
-    for (size_t k = 1; k < scan->update[i]->shape.size(); ++k, ++sp_idx) {
-      TensorDimKey key(scan->state_placeholder[i], k);
-      exact_reach[key] = scan->spatial_axis_[sp_idx].get();
-    }
-  }
-  // merge exact reach
-  auto f_merge_key = [&exact_reach, &fail_set](const TensorDimKey& dst, const TensorDimKey& src) {
-    auto sit = exact_reach.find(src);
-    if (sit == exact_reach.end()) return;
-    auto dit = exact_reach.find(dst);
-    if (dit == exact_reach.end()) {
-      exact_reach[dst] = sit->second;
-    } else {
-      if (dit->second != sit->second) {
-        fail_set.insert(dit->second);
-        fail_set.insert(sit->second);
-      }
-    }
-  };
-  // prop exact reach back.
-  for (size_t i = 0; i < body.size(); ++i) {
-    const Operation& op = body[i];
-    if (const auto* scan_op = op.as<ScanOpNode>()) {
-      const auto& update = scan_op->update;
-      const auto& init = scan_op->init;
-      for (size_t i = 0; i < update.size(); ++i) {
-        Tensor t = op.output(i);
-        for (size_t k = 1; k < update[i]->shape.size(); ++k) {
-          f_merge_key(TensorDimKey(t, k), TensorDimKey(update[i], k));
-          f_merge_key(TensorDimKey(t, k), TensorDimKey(init[i], k));
-        }
-      }
-    } else if (const auto* compute_op = op.as<ComputeOpNode>()) {
-      std::unordered_map<const Object*, std::vector<TensorDimKey>> vmap;
-      const auto& axis = compute_op->axis;
-      for (size_t i = 0; i < axis.size(); ++i) {
-        std::vector<TensorDimKey> keys;
-        for (int j = 0; j < op->num_outputs(); ++j) {
-          keys.emplace_back(op.output(j), i);
-        }
-        vmap[axis[i]->var.get()] = std::move(keys);
-      }
-      auto fvisit = [&vmap, &f_merge_key, &exact_reach, &fail_set](const ObjectRef& n) {
-        if (auto* pload = n.as<tir::ProducerLoadNode>()) {
-          Tensor t = Downcast<Tensor>(pload->producer);
-          for (size_t i = 0; i < pload->indices.size(); ++i) {
-            auto it = vmap.find(pload->indices[i].get());
-            TensorDimKey src(t, static_cast<int>(i));
-            if (it != vmap.end()) {
-              const std::vector<TensorDimKey>& keys = it->second;
-              for (const auto& key : keys) {
-                f_merge_key(key, src);
-              }
-            } else {
-              if (exact_reach.count(src)) {
-                fail_set.insert(exact_reach.at(src));
-              }
-            }
-          }
-        }
-      };
-      for (auto& e : compute_op->body) {
-        tir::PostOrderVisit(e, fvisit);
-      }
-    }
-  }
-  ReachGraph reach;
-  Map<IterVar, PrimExpr> ret;
-  std::unordered_set<TensorDimKey> place_holder_ref;
-  for (size_t i = 0; i < scan->state_placeholder.size(); ++i) {
-    for (size_t k = 0; k < scan->state_placeholder[i]->shape.size(); ++k) {
-      place_holder_ref.insert(TensorDimKey(scan->state_placeholder[i], k));
-    }
-  }
-
-  for (size_t i = 0, sp_idx = 0; i < scan->update.size(); ++i) {
-    for (size_t k = 1; k < scan->update[i]->shape.size(); ++k, ++sp_idx) {
-      TensorDimKey key(scan->update[i], k);
-      TensorDimKey target(scan->state_placeholder[i], k);
-      IterVar sp_iv = scan->spatial_axis_[sp_idx];
-      if (fail_set.count(sp_iv.get()) || !exact_reach.count(key) ||
-          exact_reach.at(key) != sp_iv.get()) {
-        ret.Set(sp_iv, make_const(DataType::Int(32), 0));
-      } else {
-        // now we proved exact match, need to prove no interference with other graph.
-        if (reach.size() == 0) reach = GetReachGraph(body);
-        // do a DFS
-        std::unordered_set<TensorDimKey> visited;
-        std::vector<TensorDimKey> stack{key};
-        visited.insert(key);
-        while (!stack.empty()) {
-          TensorDimKey k = stack.back();
-          if (k != target && place_holder_ref.count(k)) break;
-          stack.pop_back();
-          if (!reach.count(k)) {
-            LOG(FATAL) << "cannot find reach of " << k.op << "-" << k.dim;
-          }
-
-          for (TensorDimKey kk : reach.at(k)) {
-            if (visited.count(kk)) {
-              continue;
-            }
-            visited.insert(kk);
-            stack.push_back(kk);
-          }
-        }
-        if (!stack.empty()) {
-          // failed the prove.
-          ret.Set(sp_iv, make_const(DataType::Int(32), 0));
-        } else {
-          ret.Set(sp_iv, make_const(DataType::Int(32), 1));
-        }
-      }
-    }
-  }
-  return ret;
-}
-
-TVM_REGISTER_GLOBAL("schedule.CreateReadGraph").set_body_typed(CreateReadGraph);
-
-TVM_REGISTER_GLOBAL("schedule.PostDFSOrder")
-    .set_body_typed([](const Array<Operation>& roots, const ReadGraph& g) {
-      return PostDFSOrder(roots, g);
-    });
-
-TVM_REGISTER_GLOBAL("schedule.CreateAttachPath").set_body_typed(CreateAttachPath);
-
-TVM_REGISTER_GLOBAL("schedule.ScanGetBody").set_body_typed(ScanGetBody);
-
-TVM_REGISTER_GLOBAL("schedule.ScanFixPointAnalysis").set_body_typed(ScanFixPointAnalysis);
-
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/schedule/graph.h b/src/te/schedule/graph.h
deleted file mode 100644
index d31473d1b5a0..000000000000
--- a/src/te/schedule/graph.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file graph.h
- * \brief Utilities to get information about schedule graph.
- */
-#ifndef TVM_TE_SCHEDULE_GRAPH_H_
-#define TVM_TE_SCHEDULE_GRAPH_H_
-
-#include <tvm/te/operation.h>
-#include <tvm/te/schedule.h>
-#include <tvm/tir/expr.h>
-
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-namespace tvm {
-namespace te {
-
-/*!
- * \brief data structure of Operation->Tensors it reads
- */
-using ReadGraph = Map<Operation, Array<Tensor>>;
-
-/*!
- * \brief AttachPath maps op-> a list of IterVar
- */
-using AttachPath = Map<Operation, Array<IterVar>>;
-
-/*!
- * \brief The map between tensor and operation it feeds to.
- */
-using FeedGraph = std::unordered_map<Tensor, std::vector<Operation>>;
-
-/*!
- * \brief Get read graph of each operation to all the
- *  Tensors that it directly depends on.
- *
- *  The result map contains Operations needed to finish root Operation.
- * \param roots The root operation.
- * \return The result map.
- */
-ReadGraph CreateReadGraph(const Array<Operation>& roots);
-
-/*!
- * \brief Get minimum subgraph between outputs and inputs.
- *  The operations contains node which input-reachable from any inputs
- *  output reachable to any outputs.
- *
- *  The inputs won't be included in the subgraph, the outputs will be included.
- *
- * \param outputs The outputs of the subgraph
- * \param inputs The inputs to the subgraph.
- * \param include_inputs Whether to include inputs
- *
- * \return The subgraph.
- */
-Array<Operation> GetSubGraph(const Array<Tensor>& outputs, const Array<Tensor>& inputs,
-                             bool include_inputs);
-
-/*!
- * \brief Get a post DFS ordered of operations in the graph.
- * \param roots The root of the graph.
- * \param g The read graph.
- * \return vector order of Operations in PostDFS order.
- *
- * \note PostDFSOrder is a special case of Topoligical order,
- *   and can be used when topoligical order is needed.
- */
-Array<Operation> PostDFSOrder(const Array<Operation>& roots, const ReadGraph& g);
-
-/*!
- * \brief Create feedgraph for given Schedule
- * \param  g The read graph.
- * \return The created feedgraph.
- */
-FeedGraph CreateFeedGraph(const ReadGraph& g);
-
-/*!
- * \brief Create AttachPath that  maps op-> a list of IterVar
- *  That represents the loop nest op sits in from inner most to outermost
- *  Also inserts attach_stage for scan updates when needed.
- *
- * \param sch The schedule.
- * \return The attach path.
- */
-AttachPath CreateAttachPath(Schedule sch);
-
-/*!
- * \brief Get all operations inside the recursion of scan.
- * \param scan_op The scan node ops.
- * \return The body operations, in read dependency order.
- */
-Array<Operation> ScanGetBody(const Operation& scan_op);
-
-/*!
- * \brief Analyze each spatial dimension of scan's result.
- *  Give check on whether each dimension is fix point,
- *  An axis is a fixed point if it only refers back to itself in recursion
- *  and it is not used in axis of other recursion field.
- *
- *  next_state[t, ..., axis, ...] = f(prev_state[t-1, ...,axis,...]
- *
- * \param scan The scan node.
- * \return Map of spatial_axis -> IntImm
- */
-Map<IterVar, PrimExpr> ScanFixPointAnalysis(const Operation& scan);
-
-}  // namespace te
-}  // namespace tvm
-
-#endif  // TVM_TE_SCHEDULE_GRAPH_H_
diff --git a/src/te/schedule/message_passing.cc b/src/te/schedule/message_passing.cc
deleted file mode 100644
index e8f0d9332a16..000000000000
--- a/src/te/schedule/message_passing.cc
+++ /dev/null
@@ -1,744 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file message_passing.cc
- * \brief The message passing domain.
- */
-#include "message_passing.h"
-
-#include <tvm/arith/analyzer.h>
-#include <tvm/tir/expr.h>
-
-namespace tvm {
-namespace te {
-
-using namespace tir;
-
-void Update(std::unordered_map<IterVar, Range>* p_state, const IterVar& iv, Range r,
-            arith::Analyzer* analyzer) {
-  auto it = p_state->find(iv);
-  if (it == p_state->end()) {
-    (*p_state)[iv] = r;
-    analyzer->Bind(iv->var, r);
-  } else {
-    bool match =
-        is_zero(it->second->min) && analyzer->CanProve(r->extent - it->second->extent == 0);
-    ICHECK(match) << iv << " domain already inferred,"
-                  << " cannot prove their extents are the same " << it->second->extent << " vs "
-                  << r->extent;
-  }
-}
-
-/*!
- * \param Upward propagating whether an IterVar derives at least one leaf IterVar that binds to
- * a thread.
- *
- * \param stage The stage to operate on.
- * \param p_state The propagation result of each IterVar.
- */
-void PassUpThreadBinding(const Stage& stage, std::unordered_map<IterVar, bool>* p_state) {
-  auto bound_to_thread = [&stage](const IterVar& iv) {
-    bool bound = false;
-    auto it = stage->iter_var_attrs.find(iv);
-    if (it != stage->iter_var_attrs.end()) {
-      bound = (*it).second->bind_thread.defined();
-    }
-    return bound;
-  };
-
-  auto& state = *p_state;
-  // Fill p_state with leaf itervars
-  for (const IterVar& iv : stage->leaf_iter_vars) {
-    state[iv] = bound_to_thread(iv);
-  }
-  // Traverse the graph bottom-up to propagate thread binding information
-  for (size_t i = stage->relations.size(); i != 0; --i) {
-    IterVarRelation rel = stage->relations[i - 1];
-    if (const SplitNode* s = rel.as<SplitNode>()) {
-      state[s->parent] = state[s->inner] || state[s->outer];
-    } else if (const FuseNode* s = rel.as<FuseNode>()) {
-      state[s->inner] = state[s->fused];
-      state[s->outer] = state[s->fused];
-    } else if (const RebaseNode* s = rel.as<RebaseNode>()) {
-      state[s->parent] = state[s->rebased];
-    } else if (rel.as<SingletonNode>()) {
-    } else if (const TransformNode* s = rel.as<TransformNode>()) {
-      // Currently, this marks all original iter vars as deriving from
-      // a thread bind if any of the transformed variables are bound,
-      // even if the inverse expression for that iter var doesn't
-      // depend on the bound variable.
-
-      // TODO(Lunderberg): For each of original variable, check
-      // whether any variable in the inverse expression for it has a
-      // thread binding.
-      bool is_thread_binding = false;
-      for (const auto& iter_var : s->transformed_variables) {
-        is_thread_binding = is_thread_binding || state[iter_var];
-      }
-      for (const auto& iter_var : s->original_variables) {
-        state[iter_var] = is_thread_binding;
-      }
-    } else {
-      LOG(FATAL) << "unknown relation type";
-    }
-  }
-}
-
-void PassDownDomain(const Stage& stage, std::unordered_map<IterVar, Range>* p_state,
-                    arith::Analyzer* actx, bool allow_missing) {
-  auto ceil_div = [actx](const PrimExpr& a, const PrimExpr& b) {
-    if (actx->CanProve(indexmod(a, b) == 0)) {
-      return actx->Simplify(indexdiv(a, b));
-    }
-    return actx->Simplify(indexdiv(a + (b - 1), b));
-  };
-
-  auto minimum_or_later = [actx](const PrimExpr& a, const PrimExpr& b) {
-    if (actx->CanProve(a < b)) {
-      return actx->Simplify(a);
-    }
-    return actx->Simplify(b);
-  };
-
-  std::unordered_map<IterVar, bool> dominating_thread;
-  PassUpThreadBinding(stage, &dominating_thread);
-
-  auto& state = *p_state;
-  // forwar iteration on relations
-  for (IterVarRelation rel : stage->relations) {
-    if (const SplitNode* r = rel.as<SplitNode>()) {
-      if (!state.count(r->parent)) {
-        ICHECK(allow_missing);
-        continue;
-      }
-      ICHECK(!state.count(r->inner));
-      const Range& range_parent = state.at(r->parent);
-      // Tighten iv's extent to min(parent_extent, factor_or_nparts), only if all of the
-      // following conditions are met:
-      // 1. No leaf IterVar derived from iv binds to any thread.  People may use split
-      // to force an IterVar extent to match the number of allocated threads to fuse stages
-      // that require different number of threads.  We don't want to change these extents.
-      // 2. allow_missing is false, i.e. that PassDownDomain is called by the final InferBound,
-      // rather than by an early compiler phase, such as rfactor().  We don't want to tighten an
-      // IterVar in an early phase allowing missing IterVars, because it may bind to a thread later.
-      // 3. range_parent's extent is not 0.  At lest one Topi test has a case where a tensor has one
-      // zero-sized dimension.  Split creates iv with a positive extent to avoid zero-extent
-      // IterVar.  We don't touch it.
-      auto resolve_min_extent_for_split = [&](const IterVar& iv, const PrimExpr& factor_or_nparts) {
-        return dominating_thread[iv] || allow_missing || is_zero(range_parent->extent)
-                   ? factor_or_nparts
-                   : minimum_or_later(range_parent->extent, factor_or_nparts);
-      };
-      if (r->factor.defined()) {
-        Update(p_state, r->inner,
-               Range::FromMinExtent(0, cast(range_parent->extent.dtype(),
-                                            resolve_min_extent_for_split(r->inner, r->factor))),
-               actx);
-        Update(p_state, r->outer,
-               Range::FromMinExtent(0, ceil_div(range_parent->extent, r->factor)), actx);
-      } else {
-        Update(p_state, r->outer,
-               Range::FromMinExtent(0, cast(range_parent->extent.dtype(),
-                                            resolve_min_extent_for_split(r->outer, r->nparts))),
-               actx);
-        Update(p_state, r->inner,
-               Range::FromMinExtent(0, ceil_div(range_parent->extent, r->nparts)), actx);
-      }
-    } else if (const FuseNode* r = rel.as<FuseNode>()) {
-      if (!state.count(r->outer) || !state.count(r->inner)) {
-        ICHECK(allow_missing);
-        continue;
-      }
-      const Range& range_outer = state.at(r->outer);
-      const Range& range_inner = state.at(r->inner);
-      state[r->fused] = Range::FromMinExtent(0, range_outer->extent * range_inner->extent);
-    } else if (const RebaseNode* r = rel.as<RebaseNode>()) {
-      if (!state.count(r->parent)) {
-        ICHECK(allow_missing);
-        continue;
-      }
-      Update(p_state, r->rebased, Range::FromMinExtent(0, state.at(r->parent)->extent), actx);
-    } else if (const SingletonNode* s = rel.as<SingletonNode>()) {
-      Update(p_state, s->iter, Range::FromMinExtent(0, 1), actx);
-    } else if (const TransformNode* s = rel.as<TransformNode>()) {
-      bool missing_originals = false;
-      for (const auto& iter_var : s->original_variables) {
-        if (!state.count(iter_var)) {
-          ICHECK(allow_missing);
-          missing_originals = true;
-        }
-      }
-      if (missing_originals) {
-        continue;
-      }
-
-      Array<Range> original_ranges;
-      for (const auto& iter_var : s->original_variables) {
-        original_ranges.push_back(state[iter_var]);
-      }
-      Array<Range> updated_ranges = s->forward_transformation->MapRanges(original_ranges, actx);
-
-      ICHECK_EQ(updated_ranges.size(), s->transformed_variables.size());
-      for (size_t i = 0; i < updated_ranges.size(); i++) {
-        Update(p_state, s->transformed_variables[i], updated_ranges[i], actx);
-      }
-
-    } else {
-      LOG(FATAL) << "unknown relation type";
-    }
-  }
-  // update the extents of binded threads.
-  for (auto kv : stage->iter_var_attrs) {
-    if (kv.second->bind_thread.defined()) {
-      ICHECK(state.count(kv.first));
-      Update(p_state, kv.second->bind_thread, state.at(kv.first), actx);
-    }
-  }
-}
-
-void PassUpIndex(const Stage& stage, const Map<IterVar, Range>& dom_map,
-                 std::unordered_map<IterVar, PrimExpr>* p_state, bool allow_missing) {
-  auto& state = *p_state;
-  for (size_t i = stage->relations.size(); i != 0; --i) {
-    IterVarRelation rel = stage->relations[i - 1];
-    if (const SplitNode* s = rel.as<SplitNode>()) {
-      if (!state.count(s->outer) || !state.count(s->inner)) {
-        ICHECK(allow_missing);
-        continue;
-      }
-      PrimExpr outer = state.at(s->outer);
-      PrimExpr inner = state.at(s->inner);
-      PrimExpr factor = dom_map.at(s->inner)->extent;
-      PrimExpr parent_min = dom_map.at(s->parent)->min;
-      state[s->parent] = inner + outer * factor;
-      // add min if they exist
-      if (!is_zero(parent_min)) {
-        state[s->parent] = state[s->parent] + parent_min;
-      }
-    } else if (const FuseNode* s = rel.as<FuseNode>()) {
-      if (!state.count(s->fused)) {
-        ICHECK(allow_missing);
-        continue;
-      }
-      PrimExpr value = state.at(s->fused);
-      PrimExpr factor = dom_map.at(s->inner)->extent;
-      PrimExpr outer_min = dom_map.at(s->outer)->min;
-      PrimExpr inner_min = dom_map.at(s->inner)->min;
-      state[s->outer] = indexdiv(value, factor);
-      state[s->inner] = indexmod(value, factor);
-      // add min if they exist
-      if (!is_zero(outer_min)) {
-        state[s->outer] = state[s->outer] + outer_min;
-      }
-      if (!is_zero(inner_min)) {
-        state[s->inner] = state[s->inner] + inner_min;
-      }
-      // s->fused, s->outer and s->inner may be of different dtype,
-      // so we cast the `state` back to its original dtype
-      state[s->outer] = cast(s->outer->var.dtype(), state[s->outer]);
-      state[s->inner] = cast(s->inner->var.dtype(), state[s->inner]);
-    } else if (const RebaseNode* s = rel.as<RebaseNode>()) {
-      if (!state.count(s->rebased)) {
-        ICHECK(allow_missing);
-        continue;
-      }
-      PrimExpr value = state.at(s->rebased);
-      PrimExpr parent_min = dom_map.at(s->parent)->min;
-      // add min if they exist
-      if (!is_zero(parent_min)) {
-        state[s->parent] = value + parent_min;
-      } else {
-        state[s->parent] = value;
-      }
-    } else if (rel.as<SingletonNode>()) {
-    } else if (const TransformNode* s = rel.as<TransformNode>()) {
-      arith::Analyzer analyzer;
-      bool missing_transformed = false;
-      for (const auto& iter_var : s->transformed_variables) {
-        if (!state.count(iter_var)) {
-          ICHECK(allow_missing);
-          missing_transformed = true;
-        }
-      }
-      if (missing_transformed) {
-        continue;
-      }
-
-      Array<PrimExpr> transformed_indices;
-      for (const auto& iter_var : s->transformed_variables) {
-        transformed_indices.push_back(state[iter_var]);
-      }
-      Array<PrimExpr> original_indices =
-          s->inverse_transformation->MapIndices(transformed_indices, &analyzer);
-
-      ICHECK_EQ(original_indices.size(), s->original_variables.size());
-      for (size_t i = 0; i < original_indices.size(); i++) {
-        state[s->original_variables[i]] = original_indices[i];
-      }
-
-    } else {
-      LOG(FATAL) << "unknown relation type";
-    }
-  }
-}
-
-void PassDownIndex(const Stage& stage, const Map<IterVar, Range>& dom_map,
-                   std::unordered_map<IterVar, PrimExpr>* p_state, bool allow_missing) {
-  auto& state = *p_state;
-  for (IterVarRelation rel : stage->relations) {
-    if (const SplitNode* s = rel.as<SplitNode>()) {
-      if (!state.count(s->parent)) {
-        ICHECK(allow_missing);
-        continue;
-      }
-      Range r = dom_map.at(s->inner);
-      ICHECK(is_zero(r->min));
-      PrimExpr parent = state.at(s->parent);
-      PrimExpr factor = r->extent;
-      state[s->outer] = indexdiv(parent, factor);
-      state[s->inner] = indexmod(parent, factor);
-    } else if (const FuseNode* s = rel.as<FuseNode>()) {
-      if (!state.count(s->inner) && !state.count(s->outer)) {
-        ICHECK(allow_missing);
-        continue;
-      }
-      PrimExpr factor = dom_map.at(s->inner)->extent;
-      PrimExpr outer_min = dom_map.at(s->outer)->min;
-      PrimExpr inner_min = dom_map.at(s->inner)->min;
-      PrimExpr inner = state.at(s->inner);
-      PrimExpr outer = state.at(s->outer);
-      ICHECK(is_zero(outer_min));
-      ICHECK(is_zero(inner_min));
-      state[s->fused] = outer * factor + inner;
-    } else if (const RebaseNode* s = rel.as<RebaseNode>()) {
-      if (!state.count(s->rebased)) {
-        ICHECK(allow_missing);
-        continue;
-      }
-      PrimExpr value = state.at(s->parent);
-      PrimExpr parent_min = dom_map.at(s->parent)->min;
-      ICHECK(is_zero(parent_min));
-      state[s->rebased] = value;
-    } else if (const SingletonNode* s = rel.as<SingletonNode>()) {
-      state[s->iter] = make_zero(s->iter->var.dtype());
-    } else if (const TransformNode* s = rel.as<TransformNode>()) {
-      bool missing_originals = false;
-      for (const auto& iter_var : s->original_variables) {
-        if (!state.count(iter_var)) {
-          ICHECK(allow_missing);
-          missing_originals = true;
-        }
-      }
-      if (missing_originals) {
-        continue;
-      }
-
-      Array<PrimExpr> original_indices;
-      for (const auto& iter_var : s->original_variables) {
-        original_indices.push_back(state[iter_var]);
-      }
-      arith::Analyzer analyzer;
-      Array<PrimExpr> transformed_indices =
-          s->forward_transformation->MapIndices(original_indices, &analyzer);
-
-      ICHECK_EQ(transformed_indices.size(), s->transformed_variables.size());
-      for (size_t i = 0; i < transformed_indices.size(); i++) {
-        state[s->transformed_variables[i]] = transformed_indices[i];
-      }
-    } else {
-      LOG(FATAL) << "unknown relation type";
-    }
-  }
-}
-
-// Domain message passing.
-void PassUpDomain(const SplitNode* s, const std::unordered_map<IterVar, Range>& dom_map,
-                  const IntSet& outer, const IntSet& inner, IntSet* parent) {
-  if (dom_map.count(s->outer) && dom_map.count(s->inner) && dom_map.count(s->parent) &&
-      outer.MatchRange(dom_map.at(s->outer)) && inner.MatchRange(dom_map.at(s->inner))) {
-    *parent = IntSet::FromRange(dom_map.at(s->parent));
-    return;
-  }
-  PrimExpr factor = dom_map.at(s->inner)->extent;
-  PrimExpr parent_min = dom_map.at(s->parent)->min;
-  ICHECK(outer.defined());
-  ICHECK(inner.defined());
-  ICHECK(factor.defined());
-  *parent = arith::EvalSet(s->outer->var * factor + s->inner->var + parent_min,
-                           {{s->outer, outer}, {s->inner, inner}});
-}
-
-void PassUpDomain(const FuseNode* s, const std::unordered_map<IterVar, Range>& dom_map,
-                  const IntSet& fused, IntSet* outer, IntSet* inner) {
-  ICHECK(dom_map.count(s->outer));
-  ICHECK(dom_map.count(s->inner));
-  ICHECK(dom_map.count(s->fused));
-  arith::Analyzer ana;
-
-  if (fused.MatchRange(dom_map.at(s->fused))) {
-    *outer = IntSet::FromRange(dom_map.at(s->outer));
-    *inner = IntSet::FromRange(dom_map.at(s->inner));
-    return;
-  }
-  PrimExpr outer_min = dom_map.at(s->outer)->min;
-  PrimExpr inner_min = dom_map.at(s->inner)->min;
-
-  if (fused.IsSinglePoint()) {
-    PrimExpr value = fused.PointValue();
-    PrimExpr factor = dom_map.at(s->inner)->extent;
-    PrimExpr v_outer = indexdiv(value, factor);
-    PrimExpr v_inner = indexmod(value, factor);
-    if (!is_zero(outer_min)) v_outer = v_outer + outer_min;
-    if (!is_zero(inner_min)) v_inner = v_inner + inner_min;
-    *outer = IntSet::SinglePoint(v_outer);
-    *inner = IntSet::SinglePoint(v_inner);
-  } else {
-    PrimExpr fused_extent = (fused.max() - fused.min() + 1);
-    PrimExpr inner_extent = dom_map.at(s->inner)->extent;
-    *outer = IntSet::Interval(outer_min + indexdiv(fused.min(), inner_extent),
-                              outer_min + indexdiv(fused.max(), inner_extent));
-    if (is_zero(ana.Simplify(indexmod(inner_extent, fused_extent))) &&
-        is_zero(ana.Simplify(indexmod(fused.min(), fused_extent)))) {
-      // fused never spans multiple rows, make a tight bounding box
-      // there may be other cases when bounding box could be tightened
-      *inner = IntSet::Interval(inner_min + indexmod(fused.min(), inner_extent),
-                                inner_min + indexmod(fused.max(), inner_extent));
-    } else {  // fused may span multiple rows, use full row widths
-      if (!is_zero(ana.Simplify(indexmod(fused_extent, inner_extent))) ||
-          !is_zero(ana.Simplify(indexmod(fused.min(), inner_extent)))) {
-        LOG(WARNING)
-            << "fused and original axes are not aligned, this may cause redundant computations";
-      }
-      *inner = IntSet::FromRange(dom_map.at(s->inner));
-    }
-    return;
-  }
-}
-
-void PassUpDomain(const RebaseNode* s, const std::unordered_map<IterVar, Range>& dom_map,
-                  const IntSet& rebased, IntSet* parent) {
-  ICHECK(dom_map.count(s->parent));
-  if (rebased.MatchRange(dom_map.at(s->rebased))) {
-    *parent = IntSet::FromRange(dom_map.at(s->parent));
-    return;
-  }
-  PrimExpr parent_min = dom_map.at(s->parent)->min;
-  *parent = arith::EvalSet(s->rebased->var + parent_min, {{s->rebased, rebased}});
-}
-
-Array<IntSet> PassUpDomain(const TransformNode* s,
-                           const std::unordered_map<IterVar, Range>& dom_map,
-                           const Map<IterVar, IntSet>& transformed_domains) {
-  Array<IntSet> output;
-
-  Array<PrimExpr> transformed_indices;
-  for (const auto& iter_var : s->transformed_variables) {
-    transformed_indices.push_back(iter_var->var);
-  }
-
-  arith::Analyzer analyzer;
-  Array<PrimExpr> transformed_exprs =
-      s->inverse_transformation->MapIndices(transformed_indices, &analyzer);
-
-  ICHECK_EQ(transformed_exprs.size(), s->original_variables.size());
-  for (size_t i = 0; i < transformed_exprs.size(); i++) {
-    output.push_back(arith::EvalSet(transformed_exprs[i], transformed_domains));
-  }
-
-  return output;
-}
-
-void PassUpDomain(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                  std::unordered_map<IterVar, IntSet>* p_state) {
-  auto& state = *p_state;
-  for (size_t i = stage->relations.size(); i != 0; --i) {
-    IterVarRelation rel = stage->relations[i - 1];
-    if (const SplitNode* r = rel.as<SplitNode>()) {
-      IntSet parent;
-      PassUpDomain(r, dom_map, state.at(r->outer), state.at(r->inner), &parent);
-      state[r->parent] = parent;
-    } else if (const FuseNode* r = rel.as<FuseNode>()) {
-      IntSet outer, inner;
-      PassUpDomain(r, dom_map, state.at(r->fused), &outer, &inner);
-      state[r->outer] = outer;
-      state[r->inner] = inner;
-    } else if (const RebaseNode* r = rel.as<RebaseNode>()) {
-      IntSet parent;
-      PassUpDomain(r, dom_map, state.at(r->rebased), &parent);
-      state[r->parent] = parent;
-    } else if (rel.as<SingletonNode>()) {
-    } else if (const TransformNode* r = rel.as<TransformNode>()) {
-      Map<IterVar, IntSet> transformed_domains;
-      for (const auto& var : r->transformed_variables) {
-        transformed_domains.Set(var, state.at(var));
-      }
-      auto original_ranges = PassUpDomain(r, dom_map, transformed_domains);
-      ICHECK_EQ(original_ranges.size(), r->original_variables.size());
-      for (size_t i = 0; i < original_ranges.size(); i++) {
-        state[r->original_variables[i]] = original_ranges[i];
-      }
-    } else {
-      LOG(FATAL) << "unknown relation type";
-    }
-  }
-}
-
-// Pass up bit mask with or relation.
-void PassUpBitMaskOr(const Stage& stage, std::unordered_map<IterVar, int>* p_state,
-                     bool allow_missing) {
-  auto& state = *p_state;
-  for (size_t i = stage->relations.size(); i != 0; --i) {
-    IterVarRelation rel = stage->relations[i - 1];
-    if (const SplitNode* s = rel.as<SplitNode>()) {
-      if (!state.count(s->inner) && !state.count(s->outer)) {
-        ICHECK(allow_missing);
-        continue;
-      }
-      int res = 0;
-      if (state.count(s->parent)) res |= state[s->parent];
-      if (state.count(s->inner)) res |= state[s->inner];
-      if (state.count(s->outer)) res |= state[s->outer];
-      state[s->parent] = res;
-    } else if (const FuseNode* s = rel.as<FuseNode>()) {
-      if (!state.count(s->fused)) {
-        ICHECK(allow_missing);
-        continue;
-      }
-      if (!state.count(s->outer)) {
-        state[s->outer] = state[s->fused];
-      } else {
-        state[s->outer] |= state[s->fused];
-      }
-      if (!state.count(s->inner)) {
-        state[s->inner] = state[s->fused];
-      } else {
-        state[s->inner] |= state[s->fused];
-      }
-    } else if (const RebaseNode* s = rel.as<RebaseNode>()) {
-      if (!state.count(s->rebased)) {
-        ICHECK(allow_missing);
-        continue;
-      }
-      if (!state.count(s->parent)) {
-        state[s->parent] = state[s->rebased];
-      } else {
-        state[s->parent] |= state[s->rebased];
-      }
-    } else if (const TransformNode* s = rel.as<TransformNode>()) {
-      for (const auto& original_var : s->original_variables) {
-        for (const auto& transformed_var : s->transformed_variables) {
-          if (!state.count(transformed_var)) {
-            ICHECK(allow_missing);
-            continue;
-          }
-          state[original_var] |= state[transformed_var];
-        }
-      }
-
-    } else if (rel.as<SingletonNode>()) {
-    } else {
-      LOG(FATAL) << "unknown relation type";
-    }
-  }
-}
-
-void PassDownBitMaskOr(const Stage& stage, std::unordered_map<IterVar, int>* p_state,
-                       bool allow_missing) {
-  auto& state = *p_state;
-  for (IterVarRelation rel : stage->relations) {
-    if (const SplitNode* s = rel.as<SplitNode>()) {
-      if (!state.count(s->parent)) {
-        ICHECK(allow_missing);
-        continue;
-      }
-      if (!state.count(s->outer)) {
-        state[s->outer] = state.at(s->parent);
-      } else {
-        state[s->outer] |= state.at(s->parent);
-      }
-      if (!state.count(s->inner)) {
-        state[s->inner] = state.at(s->parent);
-      } else {
-        state[s->inner] |= state.at(s->parent);
-      }
-    } else if (const FuseNode* s = rel.as<FuseNode>()) {
-      if (!state.count(s->outer) && !state.count(s->inner)) {
-        ICHECK(allow_missing);
-        continue;
-      }
-      int res = 0;
-      if (state.count(s->outer)) res |= state.at(s->outer);
-      if (state.count(s->inner)) res |= state.at(s->inner);
-      if (state.count(s->fused)) res |= state.at(s->fused);
-      state[s->fused] = res;
-    } else if (const RebaseNode* s = rel.as<RebaseNode>()) {
-      if (!state.count(s->parent)) {
-        ICHECK(allow_missing);
-        continue;
-      }
-      if (!state.count(s->rebased)) {
-        state[s->rebased] = state.at(s->parent);
-      } else {
-        state[s->rebased] |= state.at(s->parent);
-      }
-    } else if (const TransformNode* s = rel.as<TransformNode>()) {
-      for (const auto& original_var : s->original_variables) {
-        for (const auto& transformed_var : s->transformed_variables) {
-          if (!state.count(original_var)) {
-            ICHECK(allow_missing);
-            continue;
-          }
-          state[transformed_var] |= state[original_var];
-        }
-      }
-    } else if (const SingletonNode* s = rel.as<SingletonNode>()) {
-      state[s->iter] = 0;
-    } else {
-      LOG(FATAL) << "unknown relation type";
-    }
-  }
-}
-
-/*!
- * \brief message passing to find if boundary checking on IterVar is needed.
- * \param s The stage to be used.
- * \param p_state The message passing state
- *     IterVar->flag
- */
-void PassUpBoundCheck(const Stage& s, const Map<IterVar, Range>& dom_map,
-                      std::unordered_map<IterVar, bool>* p_state, arith::Analyzer* analyzer) {
-  auto& state = *p_state;
-  for (size_t i = s->relations.size(); i != 0; --i) {
-    IterVarRelation rel = s->relations[i - 1];
-    if (const SplitNode* s = rel.as<SplitNode>()) {
-      bool outer = state.at(s->outer);
-      bool inner = state.at(s->inner);
-
-      if (dom_map.count(s->inner) && dom_map.count(s->outer)) {
-        PrimExpr factor = dom_map.at(s->inner)->extent;
-        PrimExpr step = dom_map.at(s->outer)->extent;
-        if (outer || inner) {
-          state[s->parent] = true;
-        } else {
-          if (analyzer->CanProve(dom_map.at(s->parent)->extent == factor * step) ||
-              s->disable_predication) {
-            state[s->parent] = false;
-          } else {
-            state[s->parent] = true;
-          }
-        }
-      } else {
-        state[s->parent] = true;
-      }
-    } else if (const FuseNode* s = rel.as<FuseNode>()) {
-      bool fused = state.at(s->fused);
-      state[s->outer] = fused;
-      state[s->inner] = fused;
-    } else if (const RebaseNode* s = rel.as<RebaseNode>()) {
-      state[s->parent] = state.at(s->rebased);
-    } else if (rel.as<SingletonNode>()) {
-      // nop
-    } else if (const TransformNode* s = rel.as<TransformNode>()) {
-      // Currently, this marks all original iter vars as requiring
-      // bounds checks if any of the transformed variables require
-      // bounds checks, even if the inverse expression for that iter
-      // var doesn't depend on the bound variable.
-
-      // TODO(Lunderberg): For each of original variable, check
-      // whether any variable in the inverse expression for it
-      // requires bounds checking.
-      bool needs_bounds_check = false;
-      for (const auto& iter_var : s->transformed_variables) {
-        needs_bounds_check = needs_bounds_check || state[iter_var];
-      }
-      for (const auto& iter_var : s->original_variables) {
-        state[iter_var] = needs_bounds_check;
-      }
-    } else {
-      LOG(FATAL) << "unknown relation type";
-    }
-  }
-}
-
-bool IsRangeSame(const Range input_1, const Range input_2) {
-  arith::Analyzer analyzer;
-  if (input_1.same_as(input_2)) return true;
-
-  return (analyzer.CanProve(input_1->min == input_2->min) &&
-          analyzer.CanProve(input_1->extent == input_2->extent));
-}
-
-std::vector<PrimExpr> MakeBoundCheck(const Stage& stage, const Map<IterVar, Range>& dom_map,
-                                     const std::unordered_map<IterVar, PrimExpr>& value_map,
-                                     bool skip_ivar_domain,
-                                     const std::unordered_set<IterVar>& skip_iter) {
-  arith::Analyzer analyzer;
-
-  std::unordered_map<IterVar, bool> bound_state;
-  for (IterVar iv : stage->leaf_iter_vars) {
-    bound_state[iv] = false;
-  }
-  PassUpBoundCheck(stage, dom_map, &bound_state, &analyzer);
-
-  std::vector<PrimExpr> preds;
-  Map<Var, IntSet> iset_dmap;
-
-  // setup domain map for set analysis
-  for (const auto& kv : dom_map) {
-    iset_dmap.Set(kv.first->var, IntSet::FromRange(kv.second));
-  }
-
-  for (auto entry : dom_map) {
-    analyzer.Bind(entry.first->var, entry.second);
-  }
-
-  for (const IterVar& iv : stage->all_iter_vars) {
-    if (skip_iter.count(iv) || iv->iter_type == kOpaque) continue;
-    if (bound_state.at(iv)) {
-      Range dom = dom_map.at(iv);
-      PrimExpr value = value_map.at(iv) - dom->min;
-      PrimExpr vmax = analyzer.int_set(value, iset_dmap).max();
-      if (vmax.dtype() != value.dtype() || !analyzer.CanProve(vmax < dom->extent)) {
-        preds.emplace_back(value < dom->extent);
-      }
-    }
-  }
-  for (const IterVar& iv : stage->op->root_iter_vars()) {
-    if (skip_iter.count(iv) || iv->iter_type == kOpaque) continue;
-    Range dom = dom_map.at(iv);
-    ICHECK(iv->dom.defined());
-    if (!skip_ivar_domain && !IsRangeSame(iv->dom, dom)) {
-      PrimExpr value = value_map.at(iv) - iv->dom->min;
-      IntSet s = analyzer.int_set(value, iset_dmap);
-      PrimExpr vmin = s.min();
-      PrimExpr vmax = s.max();
-      // The range of `value` resides in [vmin, vmax]
-      if (vmin.dtype() != value.dtype() || !analyzer.CanProve(vmin >= 0)) {
-        preds.emplace_back(value >= 0);
-      }
-      if (vmax.dtype() != value.dtype() || !analyzer.CanProve(vmax < iv->dom->extent)) {
-        preds.emplace_back(value < iv->dom->extent);
-      }
-    }
-  }
-  return preds;
-}
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/schedule/message_passing.h b/src/te/schedule/message_passing.h
deleted file mode 100644
index c382b90d630c..000000000000
--- a/src/te/schedule/message_passing.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file message_passing.h
- * \brief Common utilities to do message passing
- *  on the schedule hyper graph.
- */
-#ifndef TVM_TE_SCHEDULE_MESSAGE_PASSING_H_
-#define TVM_TE_SCHEDULE_MESSAGE_PASSING_H_
-
-#include <tvm/arith/analyzer.h>
-#include <tvm/te/operation.h>
-#include <tvm/te/schedule.h>
-#include <tvm/tir/expr.h>
-
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-namespace tvm {
-namespace te {
-/*!
- * \brief Downward inference of domain of each IterVar.
- *  Caller set the range of the root, then the function
- *  propagates it towards the leaves.
- *
- * \param stage The stage to operate on.
- * \param p_state The state of the message passing.
- * \param analyzer Analyzer context, storing information about bounds in p_state.
- * \param allow_missing Whether allow missing value.
- */
-void PassDownDomain(const Stage& stage, std::unordered_map<IterVar, Range>* p_state,
-                    arith::Analyzer* analyzer, bool allow_missing = false);
-
-/*!
- * \param Upward inference of index of each IterVar.
- *  given index assignement of the leaves,
- *
- * \param stage The stage to operate on.
- * \param dom_map The domain map of each iteration variable's domain.
- * \param p_state The index state of each IterVar.
- * \param allow_missing Whether allow missing value.
- */
-void PassUpIndex(const Stage& stage, const Map<IterVar, Range>& dom_map,
-                 std::unordered_map<IterVar, PrimExpr>* p_state, bool allow_missing = false);
-
-/*!
- * \param Downward inference of index of each IterVar.
- *  given index assignement of roots.
- *
- * \param stage The stage to operate on.
- * \param dom_map The domain map of each iteration variable's domain.
- * \param p_state The index state of each IterVar.
- * \param allow_missing Whether allow missing value.
- */
-void PassDownIndex(const Stage& stage, const Map<IterVar, Range>& dom_map,
-                   std::unordered_map<IterVar, PrimExpr>* p_state, bool allow_missing = false);
-
-/*!
- * \param Upward inference of domain set of each IterVar.
- *  given domain assignment of the leaves,
- *
- * \param stage The stage to operate on.
- * \param dom_map The domain map of each iteration variable's maximum domain.
- * \param p_state The index state of each IterVar.
- */
-void PassUpDomain(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
-                  std::unordered_map<IterVar, IntSet>* p_state);
-
-/*!
- * \brief Upward message passing of bitmask with or relation.
- * \param stage The stage to operate on.
- * \param p_state The index state of each IterVar.
- * \param allow_missing Whether allow missing value.
- */
-void PassUpBitMaskOr(const Stage& stage, std::unordered_map<IterVar, int>* p_state,
-                     bool allow_missing = false);
-
-/*!
- * \brief Downward message passing of bitmask with or relation.
- * \param stage The stage to operate on.
- * \param p_state The index state of each IterVar.
- * \param allow_missing Whether allow missing value.
- */
-void PassDownBitMaskOr(const Stage& stage, std::unordered_map<IterVar, int>* p_state,
-                       bool allow_missing = false);
-
-/*!
- * \brief Create boundary check predicates given remapped value of root
- * \param stage The stage we operate on
- * \param dom_map The domain map of each value.
- * \param value_map The value map of the root iter var.
- * \param skip_ivar_domain Whether we skip check for IterVar's original domain.
- * \param skip_iter The set of variables to skip bound condition.
- * \return List of predicates that we need to check.
- */
-std::vector<PrimExpr> MakeBoundCheck(const Stage& stage, const Map<IterVar, Range>& dom_map,
-                                     const std::unordered_map<IterVar, PrimExpr>& value_map,
-                                     bool skip_ivar_domain,
-                                     const std::unordered_set<IterVar>& skip_iter);
-
-}  // namespace te
-}  // namespace tvm
-#endif  // TVM_TE_SCHEDULE_MESSAGE_PASSING_H_
diff --git a/src/te/schedule/operation_inline.cc b/src/te/schedule/operation_inline.cc
deleted file mode 100644
index 8eed6e3f10fc..000000000000
--- a/src/te/schedule/operation_inline.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file operation_inline.cc
- */
-#include "operation_inline.h"
-
-#include <tvm/tir/analysis.h>
-#include <tvm/tir/expr.h>
-#include <tvm/tir/stmt.h>
-#include <tvm/tir/stmt_functor.h>
-
-#include <utility>
-
-#include "../../tir/transforms/ir_utils.h"
-
-namespace tvm {
-namespace te {
-
-// inliner to inline a function
-// the result may not be SSA,
-// ConvertSSA need to be applied after this pass
-class OperationInliner final : public StmtExprMutator {
- public:
-  OperationInliner(Operation op, Array<Var> args, PrimExpr body)
-      : operation_(op), args_(args), body_(body) {}
-
-  PrimExpr VisitExpr_(const ProducerLoadNode* op) final {
-    PrimExpr expr = StmtExprMutator::VisitExpr_(op);
-    op = expr.as<ProducerLoadNode>();
-    auto tensor = Downcast<Tensor>(op->producer);
-
-    if (tensor->op.same_as(operation_)) {
-      ICHECK_EQ(tensor->value_index, 0);
-      expr = body_;
-      ICHECK_EQ(args_.size(), op->indices.size());
-
-      bool has_side_effect = false;
-      for (size_t i = 0; i < op->indices.size(); ++i) {
-        if (SideEffect(op->indices[i]) > CallEffectKind::kReadState) has_side_effect = true;
-      }
-      if (has_side_effect) {
-        for (size_t i = 0; i < args_.size(); ++i) {
-          expr = Let(args_[i], op->indices[i], expr);
-        }
-      } else {
-        Map<Var, PrimExpr> vmap;
-        for (size_t i = 0; i < args_.size(); ++i) {
-          // cast indices to the type of the original indexing variable
-          vmap.Set(args_[i], cast(args_[i].dtype(), op->indices[i]));
-        }
-        expr = Substitute(Evaluate(expr), vmap).as<EvaluateNode>()->value;
-      }
-      return expr;
-    } else {
-      return expr;
-    }
-  }
-
- private:
-  Operation operation_;
-  Array<Var> args_;
-  PrimExpr body_;
-};
-
-Stmt Inline(Stmt stmt, Operation f, Array<Var> args, PrimExpr body) {
-  ICHECK_EQ(f->num_outputs(), 1) << "can only inline output single value operation";
-  Stmt ret = OperationInliner(f, args, body)(std::move(stmt));
-  if (ret.same_as(stmt)) return ret;
-  return ConvertSSA(ret);
-}
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/schedule/operation_inline.h b/src/te/schedule/operation_inline.h
deleted file mode 100644
index d475fbe3787e..000000000000
--- a/src/te/schedule/operation_inline.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*!
- * \file operation_inline.h
- */
-#ifndef TVM_TE_SCHEDULE_OPERATION_INLINE_H_
-#define TVM_TE_SCHEDULE_OPERATION_INLINE_H_
-
-#include <tvm/te/operation.h>
-#include <tvm/te/tensor.h>
-#include <tvm/tir/expr.h>
-#include <tvm/tir/stmt.h>
-
-namespace tvm {
-namespace te {
-
-/*!
- * \brief inline all calls of f in stmt.
- *
- * \param stmt The statement to apply inline optimization.
- * \param op The op to be inlined.
- * \param args The arguments variable of the function.
- * \param body The definition body of the function.
- * \return The result stmt
- *
- * \note All the passes in this file uses SSA form and outputs SSA form.
- */
-Stmt Inline(Stmt stmt, Operation op, Array<Var> args, PrimExpr body);
-
-}  // namespace te
-}  // namespace tvm
-#endif  // TVM_TE_SCHEDULE_OPERATION_INLINE_H_
diff --git a/src/te/schedule/schedule_dataflow_rewrite.cc b/src/te/schedule/schedule_dataflow_rewrite.cc
deleted file mode 100644
index 1ad8914e48cc..000000000000
--- a/src/te/schedule/schedule_dataflow_rewrite.cc
+++ /dev/null
@@ -1,978 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file schedule_dataflow_rewrite.cc
- */
-#include <tvm/te/operation.h>
-#include <tvm/te/schedule.h>
-#include <tvm/tir/op.h>
-#include <tvm/tir/stmt_functor.h>
-
-#include <unordered_set>
-
-#include "../../tir/transforms/ir_utils.h"
-#include "message_passing.h"
-#include "operation_inline.h"
-
-namespace tvm {
-namespace te {
-// Index of the first element of array_node referring to the same object as v, or size() if none.
-template <typename T>
-size_t FindNodeRef(ArrayNode* array_node, const T& v) {
-  const Object* target = v.get();
-  const size_t count = array_node->size();
-  size_t pos = 0;
-  while (pos < count && array_node->at(pos).get() != target) ++pos;
-  return pos;
-}
-
-// Substitute variables using a VarNode -> PrimExpr map; used by the cache_write rewrites below.
-class VarReplacer : public tir::StmtExprMutator {
- public:
-  explicit VarReplacer(const std::unordered_map<const VarNode*, PrimExpr>& vsub) : vsub_(vsub) {}
-  PrimExpr VisitExpr_(const VarNode* op) final {
-    auto it = vsub_.find(op);
-    if (it != vsub_.end()) return it->second;
-    return GetRef<PrimExpr>(op);  // variables without an entry are kept as they are
-  }
-  // Apply the substitution to a combiner; the input combiner is returned when nothing changed.
-  tir::CommReducer MutateCommReducer(tir::CommReducer combiner) {
-    // Replace free variables in combiner
-    auto new_identity = tir::UpdateArray(combiner->identity_element,
-                                         [this](const PrimExpr& e) { return this->VisitExpr(e); });
-    auto new_result = tir::UpdateArray(combiner->result,
-                                       [this](const PrimExpr& e) { return this->VisitExpr(e); });
-    // Compare each field with its own rewritten array so an unchanged combiner is reused as is.
-    if (combiner->identity_element.same_as(new_identity) &&
-        combiner->result.same_as(new_result)) {
-      return combiner;
-    } else {
-      return tir::CommReducer(combiner->lhs, combiner->rhs, new_result, new_identity);
-    }
-  }
-  // Visit the reduction as usual, then rebuild it only if the combiner itself was rewritten.
-  PrimExpr VisitExpr_(const tir::ReduceNode* op) final {
-    PrimExpr new_e = StmtExprMutator::VisitExpr_(op);
-    const tir::ReduceNode* new_reduce = new_e.as<tir::ReduceNode>();
-    tir::CommReducer new_combiner = MutateCommReducer(op->combiner);
-    if (op->combiner.same_as(new_combiner)) {
-      return new_e;
-    } else {
-      return tir::Reduce(new_combiner, new_reduce->source, new_reduce->axis, new_reduce->condition,
-                         new_reduce->value_index, new_reduce->init);
-    }
-  }
-
- private:
-  const std::unordered_map<const VarNode*, PrimExpr>& vsub_;  // held by reference, not copied
-};
-
-PrimExpr InjectPredicate(const Array<PrimExpr>& predicates, PrimExpr body) {
-  // Guard `body` with the conjunction of `predicates`; an empty list returns body unchanged.
-  if (predicates.size() == 0) return body;
-
-  auto conjoin = [](PrimExpr lhs, PrimExpr rhs, Span span) { return logical_and(lhs, rhs, span); };
-  if (const tir::ReduceNode* red = body.as<tir::ReduceNode>()) {
-    // A reduction carries its own condition: AND the predicates into a copy of the node.
-    auto guarded = make_object<tir::ReduceNode>(*red);
-    guarded->condition = foldl(conjoin, guarded->condition, predicates);
-    return PrimExpr(guarded);
-  }
-
-  // Any other body is selected against a zero of the same dtype.
-  PrimExpr all_hold = foldl(conjoin, const_true(1), predicates);
-  return Select(all_hold, body, make_zero(body.dtype()));
-}
-
-// Rewrite the inputs of every stage according to vmap, extending vmap as ops get replaced.
-// rvmap maps a replacement tensor back to its original, so vmap stays keyed by originals.
-void ReplaceDataFlow(const Array<Stage>& stages, std::unordered_map<Tensor, Tensor>* vmap,
-                     std::unordered_map<Tensor, Tensor>* rvmap) {
-  for (Stage stage : stages) {
-    Operation new_op = stage->op->ReplaceInputs(stage->op, *vmap);
-    if (new_op.same_as(stage->op)) continue;
-    for (int k = 0; k < new_op->num_outputs(); ++k) {
-      Tensor old_out = stage->op.output(k);
-      Tensor new_out = new_op.output(k);
-      auto origin = rvmap->find(old_out);
-      if (origin != rvmap->end()) {
-        (*vmap)[origin->second] = new_out;
-        continue;
-      }
-      (*vmap)[old_out] = new_out;
-      (*rvmap)[new_out] = old_out;
-    }
-    stage->op = new_op;
-  }
-}
-
-inline bool ReduceEqual(const tir::ReduceNode* a, const tir::ReduceNode* b) {
-  StructuralEqual same;  // value_index is not part of the comparison
-  if (!same(a->combiner, b->combiner) || !same(a->source, b->source)) return false;
-  if (!same(a->axis, b->axis) || !same(a->condition, b->condition)) return false;
-  return same(a->init, b->init);
-}
-
-Tensor Schedule::cache_read(const Tensor& tensor, const std::string& scope,
-                            const Array<Operation>& readers) {
-  (*this)->InvalidateCache();
-  // Create a stage that caches `tensor` in `scope` and make `readers` read from it. Name first:
-  std::ostringstream os;
-  os << tensor->op->name;
-  if (tensor->op->num_outputs() != 1) {
-    os << ".v" << tensor->value_index;  // tell the outputs of a multi-output op apart
-  }
-
-  // when a schedule has multiple cache_read on the same tensor,
-  // we make sure their op names are unique. e.g., w.shared, w.d.shared, w.d.d.shared
-  for (auto pair : (*this)->stage_map) {
-    auto stage = pair.second;
-    if (stage->op->name == os.str() + "." + scope) {
-      os << ".d";
-    }
-  }
-  os << "." << scope;
-
-  std::unordered_map<Tensor, Tensor> vsub;
-  Stage s = operator[](tensor->op);
-  Tensor sugar_tensor = s->op.output(tensor->value_index);
-  Tensor cache = compute(  // identity copy: cache(i...) = sugar_tensor(i...)
-      sugar_tensor->shape,
-      [&sugar_tensor](const Array<Var>& i) {
-        return sugar_tensor(Array<PrimExpr>(i.begin(), i.end()));
-      },
-      os.str());
-  vsub[sugar_tensor] = cache;
-  // Point every reader at the cache, then propagate the replaced ops to the other stages.
-  std::unordered_map<Tensor, Tensor> vmap;
-  std::unordered_map<Tensor, Tensor> rvmap;
-  for (Operation op : readers) {
-    Stage s = operator[](op);
-    Operation repl_op = s->op->ReplaceInputs(s->op, vsub);
-    ICHECK(!repl_op.same_as(s->op)) << "Cannot find " << tensor << " in the inputs of " << s->op;
-    vmap[s->op.output(0)] = repl_op.output(0);  // NOTE(review): only output 0 is mapped
-    rvmap[repl_op.output(0)] = s->op.output(0);
-    s->op = repl_op;
-  }
-  ReplaceDataFlow((*this)->stages, &vmap, &rvmap);
-  Array<Stage>& stages = (*this)->stages;
-  Stage op_stage = operator[](tensor->op);
-  size_t pos = FindNodeRef(stages.GetArrayNode(), op_stage);
-  Stage cache_stage = Stage(cache->op, this->operator->());
-  ICHECK_LT(pos, stages.size());
-  stages.insert(stages.begin() + pos + 1, cache_stage);  // right after the producer stage
-  // in order to obtain correct copy on schedule_record,
-  // make sure "set_scope" primitive is applied after stage being added
-  cache_stage.set_scope(scope);
-  (*this)->stage_map.Set(cache->op, cache_stage);
-  // Update group: the cache stage joins the group of the producer stage, if there is one
-  cache_stage->group = op_stage->group;
-  if (cache_stage->group.defined()) {
-    ++cache_stage->group->num_child_stages;
-  }
-  return cache;
-}
-
-template <typename OpType>
-void PrepareAxisMapping(Stage orig_stage, OpType* op, std::unordered_set<IterVar>* p_red_axis,
-                        Array<IterVar>* p_new_axis, std::unordered_map<IterVar, Range>* p_dom_map,
-                        std::unordered_map<const VarNode*, PrimExpr>* p_vsub,
-                        std::unordered_map<const VarNode*, PrimExpr>* p_vsub2newvar,
-                        std::vector<PrimExpr>* p_predicates) {
-  auto& red_axis = *p_red_axis;
-  auto& new_axis = *p_new_axis;
-  auto& dom_map = *p_dom_map;
-  auto& vsub = *p_vsub;
-  auto& vsub2newvar = *p_vsub2newvar;
-  auto& predicates = *p_predicates;
-  arith::Analyzer analyzer;
-  // Every p_* argument is an output. Start from the reduce axes and the domains of the root axes.
-  for (IterVar iv : op->reduce_axis) {
-    red_axis.insert(iv);
-  }
-  for (IterVar iv : op->axis) {
-    dom_map[iv] = iv->dom;
-    analyzer.Bind(iv->var, iv->dom);
-  }
-  te::PassDownDomain(orig_stage, &dom_map, &analyzer, true);  // presumably fills leaf domains
-  {
-    // The source->cache: one new ".c" axis per non-reduce leaf iter var
-    std::unordered_map<IterVar, PrimExpr> value_map;
-    for (IterVar iv : orig_stage->leaf_iter_vars) {
-      if (red_axis.count(iv)) continue;
-      ICHECK_EQ(iv->iter_type, kDataPar) << "Can only relayout with in data parallel dimensions";
-      Range dom = dom_map.at(iv);
-      IterVar new_iv = IterVar(dom, iv->var.copy_with_suffix(".c"), iv->iter_type);
-      new_axis.push_back(new_iv);
-      if (is_one(dom->min)) {  // NOTE(review): rfactor tests dom->extent here; confirm min
-        value_map[iv] = dom->min;
-      } else {
-        value_map[iv] = iv->var;
-        vsub2newvar[iv->var.get()] = new_iv->var;  // leaf var -> var of its new cache axis
-      }
-    }
-    // skip reduction iteration.
-    std::unordered_set<IterVar> skip_bound_check;
-    for (IterVar iv : op->reduce_axis) {
-      skip_bound_check.insert(iv);
-    }
-    PassUpIndex(orig_stage, dom_map, &value_map, true);
-    predicates = MakeBoundCheck(orig_stage, dom_map, value_map, true, skip_bound_check);
-    // The root axis: record the value computed for each root axis variable
-    for (IterVar iv : op->axis) {
-      if (value_map.count(iv)) {
-        vsub[iv->var.get()] = value_map.at(iv);
-      }  // to handle tensor axis
-    }
-  }
-}
-
-Array<Tensor> ReplaceOriginalOp(Schedule sch, Stage orig_stage, const std::string& scope,
-                                Operation cache_op, Operation orig_new_op, size_t tensor_size) {
-  // Install orig_new_op in place of the op of orig_stage, insert a stage for cache_op right
-  // before it, and return the first tensor_size outputs of cache_op.
-  Array<Tensor> cache_tensor_list;
-  for (size_t i = 0; i < tensor_size; i++) cache_tensor_list.push_back(cache_op.output(i));
-
-  // Redirect the consumers of the original outputs to orig_new_op.
-  std::unordered_map<Tensor, Tensor> vmap;
-  std::unordered_map<Tensor, Tensor> rvmap;
-  // Map every rewritten output, not just output 0, so that consumers of the other outputs of
-  // a multi-output op are redirected to the new op as well.
-  for (size_t i = 0; i < tensor_size; i++) {
-    vmap[orig_stage->op.output(i)] = orig_new_op.output(i);
-    rvmap[orig_new_op.output(i)] = orig_stage->op.output(i);
-  }
-  ReplaceDataFlow(sch->stages, &vmap, &rvmap);
-  // mutate orig stage: its iter vars are reset to the root iter vars of the new op
-  orig_stage->op = orig_new_op;
-  orig_stage->all_iter_vars = orig_stage->op->root_iter_vars();
-  orig_stage->leaf_iter_vars = orig_stage->all_iter_vars;
-  orig_stage->relations = Array<IterVarRelation>();
-  // create schedule for new cached stage.
-  Array<Stage>& stages = sch->stages;
-  size_t pos = FindNodeRef(stages.GetArrayNode(), orig_stage);
-  Stage cache_stage = Stage(cache_op, sch.operator->());
-  ICHECK_LT(pos, stages.size());
-  stages.insert(stages.begin() + pos, cache_stage);  // directly before the original stage
-  // in order to obtain correct copy on schedule_record,
-  // make sure "set_scope" primitive is applied after stage being added
-  cache_stage.set_scope(scope);
-  sch->stage_map.Set(cache_op, cache_stage);
-  // Update group: the cache stage joins the group of the original stage, if there is one
-  cache_stage->group = orig_stage->group;
-  if (cache_stage->group.defined()) {
-    ++cache_stage->group->num_child_stages;
-  }
-  return cache_tensor_list;
-}
-
-// Cache write a ComputeOp: the cache stage iterates in leaf-loop order, the original op reads it.
-Array<Tensor> CacheWriteWithReLayout(Schedule sch, const Array<Tensor>& tensor_array,
-                                     const std::string& scope) {
-  size_t tensor_size = tensor_array.size();
-  sch->InvalidateCache();
-  Tensor tensor = tensor_array[0];
-  Stage orig_stage = sch[tensor->op];
-  const ComputeOpNode* compute = orig_stage->op.as<ComputeOpNode>();
-  // NOTE(review): compute is not null-checked here; callers must pass a ComputeOp output.
-  std::unordered_set<IterVar> red_axis;
-  Array<IterVar> new_axis;
-  std::unordered_map<IterVar, Range> dom_map;
-
-  std::unordered_map<const VarNode*, PrimExpr> vsub;         // filled for the root axis vars
-  std::unordered_map<const VarNode*, PrimExpr> vsub2newvar;  // leaf var -> new cache axis var
-  std::vector<PrimExpr> predicates;
-
-  PrepareAxisMapping(orig_stage, compute, &red_axis, &new_axis, &dom_map, &vsub, &vsub2newvar,
-                     &predicates);
-  // Rewrite every output body of the original op into the body of the cache op.
-  PrimExpr body;
-  Array<PrimExpr> body_list;
-  const tir::ReduceNode* first_reduce = nullptr;
-  for (auto cbody : compute->body) {
-    body = VarReplacer(vsub)(cbody);
-    body = InjectPredicate(predicates, body);
-    body = VarReplacer(vsub2newvar)(body);
-    // Reduce nodes in ONE computeOp must be the same except value_index
-    // This is right only if the original body ensures Reduce nodes are the same
-    if (body->IsInstance<tir::ReduceNode>()) {
-      const tir::ReduceNode* reduce_body = body.as<tir::ReduceNode>();
-      if (first_reduce != nullptr) {
-        ICHECK(ReduceEqual(reduce_body, first_reduce));
-        body = tir::Reduce(first_reduce->combiner, first_reduce->source, first_reduce->axis,
-                           first_reduce->condition, reduce_body->value_index, reduce_body->init);
-      } else {
-        first_reduce = reduce_body;
-      }
-    } else {
-      ICHECK(first_reduce == nullptr) << "cannot mix reduce and other node in ONE compute bodys";
-    }
-    body_list.push_back(body);
-  }
-  // The reader args: one index per non-reduce leaf iter var, used to read the cache tensors
-  Array<PrimExpr> args;
-  {
-    // cache->compute
-    std::unordered_map<IterVar, PrimExpr> value_map;
-    for (IterVar iv : compute->axis) {
-      value_map[iv] = iv->var;
-    }
-    te::PassDownIndex(orig_stage, dom_map, &value_map, true);
-    for (IterVar iv : orig_stage->leaf_iter_vars) {
-      if (red_axis.count(iv)) continue;
-      args.push_back(value_map.at(iv));
-    }
-  }
-  Operation cache_op =
-      ComputeOp(compute->name + "." + scope, compute->tag, compute->attrs, new_axis, body_list);
-  // The rewritten original op keeps its name and axes and only reads the cache outputs.
-  Array<PrimExpr> cache_expr_list;
-  for (size_t i = 0; i < tensor_size; i++) {
-    Tensor cache_tensor = cache_op.output(i);
-    cache_expr_list.push_back(cache_tensor(args));
-  }
-  Operation orig_new_op =
-      ComputeOp(compute->name, compute->tag, compute->attrs, compute->axis, cache_expr_list);
-  return ReplaceOriginalOp(sch, orig_stage, scope, cache_op, orig_new_op, tensor_size);
-}
-
-// Cache write for a TensorComputeOp: the cache stage keeps the intrin, the original op reads it.
-Array<Tensor> CacheWriteWithReLayoutTensor(Schedule sch, const Array<Tensor>& tensor_array,
-                                           const std::string& scope) {
-  size_t tensor_size = tensor_array.size();
-  sch->InvalidateCache();
-  Tensor tensor = tensor_array[0];
-  Stage orig_stage = sch[tensor->op];
-  const TensorComputeOpNode* tensor_op = orig_stage->op.as<TensorComputeOpNode>();
-  ICHECK_EQ(tensor_op->num_outputs(), 1)
-      << "cache write only support single output tensor_compute_op";
-
-  std::unordered_set<IterVar> red_axis;
-  Array<IterVar> new_axis;
-  std::unordered_map<IterVar, Range> dom_map;
-
-  std::unordered_map<const VarNode*, PrimExpr> vsub;
-  std::unordered_map<const VarNode*, PrimExpr> vsub2newvar;  // leaf var -> new cache axis var
-  std::vector<PrimExpr> predicates;
-
-  PrepareAxisMapping(orig_stage, tensor_op, &red_axis, &new_axis, &dom_map, &vsub, &vsub2newvar,
-                     &predicates);
-  // Axes past schedulable_ndim are copied with their domain under a new ".c" variable.
-  for (int i = tensor_op->schedulable_ndim; i < static_cast<int>(tensor_op->axis.size()); ++i) {
-    IterVar iv = tensor_op->axis[i];
-    IterVar new_iv = IterVar(iv->dom, iv->var.copy_with_suffix(".c"), iv->iter_type);
-    new_axis.push_back(new_iv);
-  }
-  Array<Region> new_regions;
-  for (Region old_region : tensor_op->input_regions) {
-    Region region;
-    for (Range r : old_region) {
-      PrimExpr min = VarReplacer(vsub2newvar)(r->min);
-      PrimExpr extent = VarReplacer(vsub2newvar)(r->extent);
-      region.push_back(Range::FromMinExtent(min, extent));
-    }
-    new_regions.push_back(region);
-  }
-
-  Array<PrimExpr> new_scalar_inputs;
-  for (PrimExpr old_input : tensor_op->scalar_inputs) {
-    new_scalar_inputs.push_back(VarReplacer(vsub2newvar)(old_input));
-  }
-
-  Operation cache_op =
-      TensorComputeOp(tensor_op->name + "." + scope, tensor_op->tag, new_axis,
-                      tensor_op->reduce_axis, tensor_op->schedulable_ndim, tensor_op->intrin,
-                      tensor_op->inputs, new_regions, new_scalar_inputs);
-
-  // Axes of the rewritten original op; those past schedulable_ndim are re-created as kDataPar.
-  Array<IterVar> compute_axis = tensor_op->axis;
-  for (size_t i = tensor_op->schedulable_ndim; i < tensor_op->axis.size(); ++i) {
-    IterVar iv = tensor_op->axis[i];
-    IterVar aiv = IterVar(iv->dom, iv->var, kDataPar);
-    compute_axis.Set(i, aiv);
-  }
-
-  // The reader args: indices used by the rewritten original op to read the cache tensor
-  Array<PrimExpr> args;
-  {
-    // cache->compute
-    std::unordered_map<IterVar, PrimExpr> value_map;
-    for (IterVar iv : compute_axis) {
-      value_map[iv] = iv->var;
-    }
-    PassDownIndex(orig_stage, dom_map, &value_map, true);
-    for (IterVar iv : orig_stage->leaf_iter_vars) {
-      if (red_axis.count(iv)) continue;
-      args.push_back(value_map.at(iv));
-    }
-    // tensorized region axis
-    for (size_t i = tensor_op->schedulable_ndim; i < tensor_op->axis.size(); ++i) {
-      IterVar iv = compute_axis[i];
-      args.push_back(value_map.at(iv));
-    }
-  }
-
-  Array<PrimExpr> cache_expr_list;
-  for (size_t i = 0; i < tensor_size; i++) {
-    Tensor cache_tensor = cache_op.output(i);
-    cache_expr_list.push_back(cache_tensor(args));
-  }
-  Operation orig_new_op =  // a plain ComputeOp (empty attrs) that only reads the cache
-      ComputeOp(tensor_op->name, tensor_op->tag, {}, compute_axis, cache_expr_list);
-  return ReplaceOriginalOp(sch, orig_stage, scope, cache_op, orig_new_op, tensor_size);
-}
-
-Array<Tensor> Schedule::cache_write(const Array<Tensor>& tensor_array, const std::string& scope) {
-  (*this)->InvalidateCache();  // cache write of all the outputs of ONE ComputeOp at once
-  ICHECK(tensor_array.size() > 0) << "size of tensor_array must be greater than 0";
-  Stage orig_stage = operator[](tensor_array[0]->op);
-  const ComputeOpNode* compute = tensor_array[0]->op.as<ComputeOpNode>();  // null for other ops
-  ICHECK(compute != nullptr) << "cache write of a tensor list only take ComputeOp as writers";
-  ICHECK(static_cast<size_t>(compute->num_outputs()) == tensor_array.size())
-      << "size of input tensor list must be same as number of stage outputs";
-  for (size_t i = 1; i < tensor_array.size(); i++) {
-    ICHECK(orig_stage.same_as(operator[](tensor_array[i]->op)))
-        << "Input tensor list must be generated by ONE computeOp";
-  }
-  return CacheWriteWithReLayout(*this, tensor_array, scope);
-}
-
-Tensor Schedule::cache_write(const Tensor& tensor, const std::string& scope) {
-  // Dispatch on the producer kind: plain compute ops and tensor compute ops are both supported.
-  (*this)->InvalidateCache();
-  const bool is_compute = tensor->op.as<ComputeOpNode>() != nullptr;
-  if (!is_compute && tensor->op.as<TensorComputeOpNode>() == nullptr) {
-    LOG(FATAL) << "cache write only take ComputeOp or TensorComputeOp as writers";
-  }
-  Array<Tensor> cached = is_compute ? CacheWriteWithReLayout(*this, {tensor}, scope)
-                                    : CacheWriteWithReLayoutTensor(*this, {tensor}, scope);
-  return cached[0];
-}
-
-void RebaseNonZeroMinLoop(ScheduleNode* sch) {
-  std::unordered_map<IterVar, IterVar> rebase_map;  // root iter var -> its rebased replacement
-  for (Stage s : sch->stages) {
-    if (s->attach_type == kInlinedAlready) continue;
-    // Replace each root iter var that is still a leaf by a rebased copy, except bound ones.
-    auto root_iter_vars = s->op->root_iter_vars();
-    ArrayNode* leaf_vars = s->leaf_iter_vars.CopyOnWrite();
-    for (IterVar iv : root_iter_vars) {
-      size_t idx = FindNodeRef(leaf_vars, iv);
-      auto it = s->iter_var_attrs.find(iv);
-      // don't need to rebase paths that are bound to a thread.
-      if (it != s->iter_var_attrs.end() && (*it).second->bind_thread.defined()) {
-        continue;
-      }
-      if (idx < leaf_vars->size()) {
-        // insert rebase: a new IterVar with an undefined range and a copy of the variable
-        IterVar rebased = IterVar(Range(), iv->var.copy_with_suffix(""), iv->iter_type);
-        s->relations.push_back(te::Rebase(iv, rebased));
-        if (s->iter_var_attrs.count(iv)) {
-          s->iter_var_attrs.Set(rebased, s->iter_var_attrs.at(iv));
-        }
-        leaf_vars->SetItem(idx, rebased);
-        rebase_map[iv] = rebased;
-      }
-    }
-  }
-  // remap the parent relation: attach points that were rebased now refer to the new iter var
-  for (Stage s : sch->stages) {
-    if (s->attach_type != kScope) continue;
-    if (rebase_map.count(s->attach_ivar)) {
-      s->attach_ivar = rebase_map.at(s->attach_ivar);
-    }
-  }
-  for (Stage s : sch->groups) {  // same remapping for the group stages
-    if (s->attach_type != kScope) continue;
-    if (rebase_map.count(s->attach_ivar)) {
-      s->attach_ivar = rebase_map.at(s->attach_ivar);
-    }
-  }
-}
-
-void InjectInline(ScheduleNode* sch, bool feature_extraction_mode) {
-  sch->InvalidateCache();
-  // Fold the body of every stage marked kInline into later stages, then rewrite the dataflow.
-  std::vector<Array<PrimExpr>> new_body(sch->stages.size());  // pending ComputeOp bodies
-  std::vector<bool> changed(sch->stages.size(), false);
-  std::vector<Stmt> new_hybrid_body(sch->stages.size());  // pending HybridOp bodies
-  std::vector<bool> hybrid_changed(sch->stages.size(), false);
-  // (sshtin): this workaround allows to inline extern ops into their consumer.
-  // All inputs for extern op should not be inlined because inlining may happen
-  // before TE generation for particular extern op. That may lead to
-  // crash during lowering or building stages.
-  // The problem description:
-  // In case of operations fusing, arguments inlining
-  // prevents creation of ProducerNode for extern operation.
-  // Instead of the creation it is supposed to use operation argument as inlined buffer
-  // but extern_op TIR generation can be performed after inlining procedure so
-  // newly generated TIR does not have reference to input data at all.
-  std::unordered_map<Operation, Operation> ext_ops;  // input op -> first extern op reading it
-  for (size_t i = 0; i < sch->stages.size(); i++) {
-    Stage stage = sch->stages[i];
-    auto ext_op = stage->op.as<ExternOpNode>();
-    if (ext_op) {
-      auto inps = ext_op->InputTensors();
-      for (size_t ii = 0; ii < inps.size(); ++ii) {
-        if (ext_ops.find(inps[ii]->op) == ext_ops.end()) {
-          ext_ops[inps[ii]->op] = stage->op;
-        }
-      }
-    }
-  }
-  // inline all the ops, visiting the stages from last to first
-  for (size_t i = sch->stages.size(); i != 0; --i) {
-    Stage stage = sch->stages[i - 1];
-    if (stage->attach_type == kInline) {
-      stage->attach_type = kInlinedAlready;
-      Array<Var> args;
-      PrimExpr body;
-      {
-        // setup args
-        const ComputeOpNode* compute = stage->op.as<ComputeOpNode>();
-        ICHECK(compute) << "can only inline compute op";
-        for (auto iv : compute->axis) {
-          args.push_back(iv->var);
-        }
-        if (ext_ops.find(stage->op) != ext_ops.end()) {
-          // sshtin: The extern op can try to get access to the input tensors as a raw data,
-          // that can lead to error in IR builder.
-          stage->attach_type = kGroupRoot;  // the stage is not inlined after all
-          continue;
-        }
-        ICHECK_EQ(compute->body.size(), 1U) << "can only inline compute op with 1 output";
-        if (feature_extraction_mode && compute->attrs.count("const_matrix")) {
-          // Use constant value to replace access of const matrices.
-          // This produces wrong IR but is good enough for feature extraction purposes.
-          // This simplification can accelerate the feature extraction and evolutionary search.
-          body = make_const(compute->output_dtype(0), 1.0f);
-        } else {
-          body = compute->body[0];
-        }
-      }
-      for (size_t j = i; j < sch->stages.size(); ++j) {  // stages after the inlined one
-        Stage s = sch->stages[j];
-        const ComputeOpNode* compute = s->op.as<ComputeOpNode>();
-        const HybridOpNode* hybrid = s->op.as<HybridOpNode>();
-        if (compute) {
-          if (!new_body[j].size()) {
-            new_body[j] = compute->body;
-          }
-          if (new_body[j][0]->IsInstance<tir::ReduceNode>()) {
-            // specially handle reduction inline for multiple reductions.
-            const tir::ReduceNode* reduce = new_body[j][0].as<tir::ReduceNode>();
-            for (size_t k = 1; k < new_body[j].size(); ++k) {
-              const tir::ReduceNode* reduce_ = new_body[j][k].as<tir::ReduceNode>();
-              ICHECK(reduce_);
-              ICHECK(ReduceEqual(reduce_, reduce)) << "The Reduce inputs of ComputeOp should "
-                                                   << "have the same attribute except value_index";
-            }
-            PrimExpr new_value = Inline(tir::Evaluate(new_body[j][0]), stage->op, args, body)
-                                     .as<tir::EvaluateNode>()
-                                     ->value;
-            if (!new_value.same_as(new_body[j][0])) {
-              changed[j] = true;
-              const tir::ReduceNode* r = new_value.as<tir::ReduceNode>();
-              ICHECK(r != nullptr);
-              ICHECK_EQ(new_body[j].size(), r->source.size());
-              for (size_t k = 0; k < new_body[j].size(); ++k) {
-                auto n = make_object<tir::ReduceNode>(*r);  // one copy per output
-                n->value_index = static_cast<int>(k);
-                n->dtype = r->source[k].dtype();
-                new_body[j].Set(k, PrimExpr(n));
-              }
-            }
-          } else {
-            for (size_t k = 0; k < new_body[j].size(); ++k) {
-              PrimExpr new_value = Inline(tir::Evaluate(new_body[j][k]), stage->op, args, body)
-                                       .as<tir::EvaluateNode>()
-                                       ->value;
-              if (!new_value.same_as(new_body[j][k])) {
-                new_body[j].Set(k, new_value);
-                changed[j] = true;
-              }
-            }
-          }
-        } else if (hybrid) {
-          if (!new_hybrid_body[j].defined()) {
-            new_hybrid_body[j] = hybrid->body;
-          }
-          Stmt new_stmt = Inline(new_hybrid_body[j], stage->op, args, body);
-          if (!new_stmt.same_as(new_hybrid_body[j])) {
-            new_hybrid_body[j] = new_stmt;
-            hybrid_changed[j] = true;
-          }
-        }
-      }
-    }
-  }
-  std::unordered_map<Tensor, Tensor> repl;  // old output tensor -> output of the rebuilt op
-  // rewrite dataflow
-  for (size_t i = 0; i < sch->stages.size(); ++i) {
-    Stage s = sch->stages[i];
-    if (s->attach_type == kInlinedAlready) continue;
-    if (new_body[i].size()) {
-      // Logics from ReplaceDataFlow
-      const ComputeOpNode* compute = sch->stages[i]->op.as<ComputeOpNode>();
-      ICHECK(compute);
-      Operation op = s->op;
-      if (changed[i]) {
-        op = ComputeOp(compute->name, compute->tag, compute->attrs, compute->axis, new_body[i]);
-      }
-      op = op->ReplaceInputs(op, repl);
-      if (!op.same_as(s->op)) {
-        for (int idx = 0; idx < s->op->num_outputs(); ++idx) {
-          repl[s->op.output(idx)] = op.output(idx);
-        }
-        s->op = op;
-      }
-    } else if (hybrid_changed[i]) {
-      const HybridOpNode* hybrid = sch->stages[i]->op.as<HybridOpNode>();
-      ICHECK(hybrid);
-      Operation op = HybridOp(hybrid->name, hybrid->tag, hybrid->attrs, hybrid->inputs,
-                              hybrid->outputs, new_hybrid_body[i]);
-      op = op->ReplaceInputs(op, repl);
-      for (int idx = 0; idx < s->op->num_outputs(); ++idx) {
-        repl[s->op.output(idx)] = op.output(idx);
-      }
-      s->op = op;
-    } else {
-      Operation op = s->op->ReplaceInputs(s->op, repl);  // only the inputs may have changed
-      if (!op.same_as(s->op)) {
-        for (int j = 0; j < op->num_outputs(); ++j) {
-          repl[s->op.output(j)] = op.output(j);
-        }
-        s->op = op;
-      }
-    }
-  }
-}
-
-void LegalizeInvalidAttach(ScheduleNode* sch) {
-  // Legalize the compute_at location if the target iterator of compute_at is split or fused.
-  // Case 1: If the target of compute_at is split,
-  //         we will move the compute_at location to the inner iterator.
-  // Case 2: If the target of compute_at is fused,
-  //         we will move the compute_at location to the newly fused iterator.
-  // Note that case 2 can only happen if the target of compute_at
-  // is the innermost operand of fuse operation.
-
-  // Map an old invalid attach point to its new valid attach point
-  std::unordered_map<IterVar, IterVar> replace_map;
-
-  for (Stage stage : sch->stages) {
-    std::unordered_set<const Object*> visited;
-    for (Stage s = stage; s.defined();) {
-      // The following logic is similar to the `CreateAttachPath` in `src/te/schedule/graph.h`,
-      // because we follow the validation check in that function to legalize the attach.
-      ICHECK(!visited.count(s.get())) << "Find loop in compute_at attach group";
-      visited.insert(s.get());
-      Stage spec = s.GetAttachSpec();
-      if (spec->attach_type != kScope) {
-        break;
-      }
-      bool start_attach = false;
-      IterVar attach_ivar = spec->attach_ivar;
-      s = spec->attach_stage;  // continue the walk from the stage this one is attached to
-      ICHECK(attach_ivar.defined());
-      ICHECK(s.defined());
-      // The attach point is valid if it is still one of the leaf iter vars of that stage.
-      for (size_t i = s->leaf_iter_vars.size(); i != 0; --i) {
-        IterVar iv = s->leaf_iter_vars[i - 1];
-        if (!start_attach && iv.same_as(attach_ivar)) {
-          start_attach = true;
-          break;
-        }
-      }
-
-      if (!start_attach) {
-        IterVar new_attach_ivar = attach_ivar;
-        bool updated = true;
-        // recursively update the relations
-        while (updated) {
-          updated = false;
-          for (const auto& rel : s->relations) {
-            if (const FuseNode* r = rel.as<FuseNode>()) {
-              if (new_attach_ivar.same_as(r->inner)) {
-                new_attach_ivar = r->fused;
-                updated = true;
-              }
-            } else if (const SplitNode* r = rel.as<SplitNode>()) {
-              if (new_attach_ivar.same_as(r->parent)) {
-                new_attach_ivar = r->inner;
-                updated = true;
-              }
-            }
-          }
-          replace_map[attach_ivar] = new_attach_ivar;  // rewritten each pass; the last one stays
-        }
-      }
-    }
-  }
-
-  // remap the parent relation
-  for (Stage s : sch->stages) {
-    if (s->attach_type != kScope) continue;
-    if (replace_map.count(s->attach_ivar)) {
-      s->attach_ivar = replace_map.at(s->attach_ivar);
-    }
-  }
-  for (Stage s : sch->groups) {  // same remapping for the group stages
-    if (s->attach_type != kScope) continue;
-    if (replace_map.count(s->attach_ivar)) {
-      s->attach_ivar = replace_map.at(s->attach_ivar);
-    }
-  }
-}
-
-Schedule Schedule::normalize() {
-  Schedule normalized = copy();  // the passes below mutate this copy, which is then returned
-  InjectInline(normalized.operator->(), /*feature_extraction_mode=*/false);
-  RebaseNonZeroMinLoop(normalized.operator->());
-  LegalizeInvalidAttach(normalized.operator->());
-  return normalized;
-}
-
-Schedule Schedule::normalize_for_feature_extraction() {
-  Schedule normalized = copy();  // same passes as normalize(), with feature extraction mode on
-  InjectInline(normalized.operator->(), /*feature_extraction_mode=*/true);
-  RebaseNonZeroMinLoop(normalized.operator->());
-  LegalizeInvalidAttach(normalized.operator->());
-  return normalized;
-}
-
-// Handle reduction factor.
-Array<Tensor> Schedule::rfactor(const Tensor& tensor, const IterVar& axis, int factor_axis) {
-  (*this)->InvalidateCache();
-  using tir::ReduceNode;
-  ICHECK_EQ(axis->iter_type, kCommReduce) << "Can only factor reduction axis";
-  Stage reduce_stage = operator[](tensor->op);
-  const ComputeOpNode* compute_op = reduce_stage->op.as<ComputeOpNode>();
-  ICHECK(compute_op) << "Can only factor ComputeOp";
-  ArrayNode* leaf_vars = reduce_stage->leaf_iter_vars.CopyOnWrite();
-  {
-    size_t axis_pos = FindNodeRef(leaf_vars, axis);
-    ICHECK_NE(axis_pos, leaf_vars->size())
-        << "Cannot find IterVar " << axis << " in leaf iter vars";
-  }
-  // Find touched reduction axis.
-  std::unordered_map<IterVar, int> touch_map;
-  touch_map[axis] = 1;
-  te::PassUpBitMaskOr(reduce_stage, &touch_map, true);
-  te::PassDownBitMaskOr(reduce_stage, &touch_map, true);
-  // skip reduction iteration.
-  std::unordered_set<IterVar> skip_bound_check;
-  // Verify normal axis are not touched.
-  for (IterVar iv : compute_op->axis) {
-    ICHECK(!touch_map.count(iv)) << "Factor axis touches normal axis.";
-    skip_bound_check.insert(iv);
-  }
-  // get analyzer.
-  arith::Analyzer analyzer;
-  // Get the replace index
-  std::unordered_map<IterVar, Range> dom_map;
-  std::unordered_map<IterVar, PrimExpr> value_map;
-  for (IterVar iv : compute_op->reduce_axis) {
-    if (touch_map.count(iv)) {
-      dom_map[iv] = iv->dom;
-    } else {
-      skip_bound_check.insert(iv);
-    }
-    analyzer.Bind(iv->var, iv->dom);
-  }
-  te::PassDownDomain(reduce_stage, &dom_map, &analyzer, true);
-  for (IterVar iv : reduce_stage->leaf_iter_vars) {
-    if (touch_map.count(iv)) {
-      Range dom = dom_map.at(iv);
-      if (is_one(dom->extent)) {
-        value_map[iv] = dom->min;
-      } else {
-        value_map[iv] = iv->var;
-      }
-    }
-  }
-  te::PassUpIndex(reduce_stage, dom_map, &value_map, true);
-  std::vector<PrimExpr> predicates =
-      MakeBoundCheck(reduce_stage, dom_map, value_map, true, skip_bound_check);
-
-  // Get the factored op node.
-  const int factor_axis_pos =
-      factor_axis >= 0 ? factor_axis : static_cast<int>(compute_op->axis.size() + 1) + factor_axis;
-  ICHECK_LE(factor_axis_pos, compute_op->axis.size());
-  auto n = make_object<ComputeOpNode>();
-  n->name = compute_op->name + ".rf";
-  {
-    // axis relacement.
-    IterVar iv(dom_map.at(axis), axis->var, kDataPar);
-    ICHECK(is_zero(iv->dom->min)) << "Can only factor reduction domain starting from 0";
-
-    const int size = compute_op->axis.size();
-    for (int idx = 0; idx < size; ++idx) {
-      if (factor_axis_pos == idx) {
-        n->axis.push_back(iv);
-      }
-      n->axis.push_back(compute_op->axis[idx]);
-    }
-    if (factor_axis_pos == size) {
-      n->axis.push_back(iv);
-    }
-  }
-  // predicate generation, copy not touched axis.
-  int idx = tensor->value_index;
-  const ReduceNode* reduce = compute_op->body[idx].as<ReduceNode>();
-  ICHECK(reduce) << "Can only rfactor non-inline reductions";
-  predicates.push_back(reduce->condition);
-
-  PrimExpr predicate =
-      likely(foldl([](PrimExpr a, PrimExpr b, Span span) { return logical_and(a, b, span); },
-                   const_true(1), predicates));
-
-  std::unordered_map<const VarNode*, PrimExpr> vsub;
-
-  for (IterVar iv : compute_op->reduce_axis) {
-    if (!touch_map.count(iv)) {
-      n->reduce_axis.push_back(iv);
-    } else {
-      ICHECK(value_map.count(iv));
-      PrimExpr index = value_map.at(iv);
-      vsub[iv->var.get()] = index;
-    }
-  }
-
-  // Copy touched axis.
-  for (IterVar iv : reduce_stage->leaf_iter_vars) {
-    if (touch_map.count(iv) && !iv.same_as(axis)) {
-      ICHECK_EQ(iv->iter_type, kCommReduce);
-      IterVar ncpy(dom_map.at(iv), iv->var, iv->iter_type, iv->thread_tag, iv->span);
-      n->reduce_axis.push_back(ncpy);
-    }
-  }
-  VarReplacer replacer(vsub);
-  Array<PrimExpr> new_source =
-      tir::UpdateArray(reduce->source, [&replacer](const PrimExpr& e) { return replacer(e); });
-
-  PrimExpr new_pred = replacer(predicate);
-
-  std::vector<PrimExpr> body;
-  for (size_t idx = 0; idx < reduce->source.size(); ++idx) {
-    body.emplace_back(Reduce(reduce->combiner, new_source, n->reduce_axis, new_pred, idx, {}));
-  }
-  n->body = Array<PrimExpr>(body);
-  // refresh relations, keep the un-touched relations.
-  Array<IterVarRelation> rels;
-  for (IterVarRelation rel : reduce_stage->relations) {
-    bool touched = false;
-    if (const SplitNode* r = rel.as<SplitNode>()) {
-      if (touch_map.count(r->parent)) touched = true;
-    } else if (const FuseNode* r = rel.as<FuseNode>()) {
-      if (touch_map.count(r->fused)) touched = true;
-    } else if (const RebaseNode* r = rel.as<RebaseNode>()) {
-      if (touch_map.count(r->parent)) touched = true;
-    } else {
-      LOG(FATAL) << "unknown relation type";
-    }
-    if (!touched) {
-      rels.push_back(rel);
-    }
-  }
-  // initialize the factored stage.
-  Operation factor_op(n);
-  Array<Stage>& stages = (*this)->stages;
-  size_t stage_pos = FindNodeRef(stages.GetArrayNode(), reduce_stage);
-  Stage factor_stage = Stage(factor_op, this->operator->());
-  factor_stage->relations = rels;
-  ICHECK_LT(stage_pos, stages.size());
-  stages.insert(stages.begin() + stage_pos, factor_stage);
-  (*this)->stage_map.Set(factor_op, factor_stage);
-  factor_stage->group = reduce_stage->group;
-  if (factor_stage->group.defined()) {
-    ++factor_stage->group->num_child_stages;
-  }
-  // Replace the old reduction.
-  IterVar repl_red_axis = reduce_axis(dom_map.at(axis), axis->var->name_hint + ".v");
-  Array<Tensor> factor_tensors;
-  Array<Tensor> old_tensors;
-  int size = factor_op->num_outputs();
-  for (int idx = 0; idx < size; ++idx) {
-    factor_tensors.push_back(factor_op.output(idx));
-    old_tensors.push_back(reduce_stage->op.output(idx));
-  }
-  Array<Tensor> repl_tensors = compute(
-      old_tensors[0]->shape,
-      [&](const Array<Var>& i) {
-        Array<PrimExpr> indices;
-        const int idx_size = static_cast<int>(i.size());
-        for (int idx = 0; idx < idx_size; ++idx) {
-          if (factor_axis_pos == idx) {
-            indices.push_back(repl_red_axis->var);
-          }
-          indices.push_back(i[idx]);
-        }
-        Array<PrimExpr> new_init = reduce->init;
-        if (!reduce->init.empty()) {
-          std::unordered_map<const VarNode*, PrimExpr> init_vsub;
-          for (const auto& init : reduce->init) {
-            if (init->IsInstance<ProducerLoadNode>()) {
-              ICHECK_EQ(compute_op->axis.size(), idx_size)
-                  << "'init' should have the number of dimensions as output when using with "
-                     "rfactor";
-              for (int idx = 0; idx < idx_size; idx++) {
-                init_vsub[compute_op->axis[idx]->var.get()] = i[idx];
-              }
-            }
-          }
-          VarReplacer init_replacer(init_vsub);
-          new_init = tir::UpdateArray(
-              reduce->init, [&init_replacer](const PrimExpr& e) { return init_replacer(e); });
-        }
-        if (factor_axis_pos == idx_size) {
-          indices.push_back(repl_red_axis->var);
-        }
-        Array<PrimExpr> factor_exprs;
-        for (int idx = 0; idx < size; ++idx) {
-          factor_exprs.push_back(factor_tensors[idx](indices));
-        }
-        Array<PrimExpr> reductions;
-        Array<IterVar> axis = {repl_red_axis};
-        PrimExpr cond = const_true();
-        for (int idx = 0; idx < size; ++idx) {
-          reductions.push_back(Reduce(reduce->combiner, factor_exprs, axis, cond, idx, new_init));
-        }
-        return reductions;
-      },
-      reduce_stage->op->name + ".repl");
-
-  std::unordered_map<Tensor, Tensor> vmap;
-  std::unordered_map<Tensor, Tensor> rvmap;
-  for (int idx = 0; idx < size; ++idx) {
-    vmap[old_tensors[idx]] = repl_tensors[idx];
-    rvmap[repl_tensors[idx]] = old_tensors[idx];
-  }
-  ReplaceDataFlow((*this)->stages, &vmap, &rvmap);
-  // revamp the reduction stage.
-  reduce_stage->op = repl_tensors[0]->op;
-  reduce_stage->all_iter_vars = repl_tensors[0]->op->root_iter_vars();
-  reduce_stage->leaf_iter_vars = reduce_stage->all_iter_vars;
-  reduce_stage->relations = Array<IterVarRelation>();
-  return factor_tensors;
-}
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/schedule/schedule_lang.cc b/src/te/schedule/schedule_lang.cc
deleted file mode 100644
index 9e142b1bf76c..000000000000
--- a/src/te/schedule/schedule_lang.cc
+++ /dev/null
@@ -1,1078 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file schedule_lang.cc
- */
-#include <dmlc/thread_local.h>
-#include <tvm/arith/analyzer.h>
-#include <tvm/ir/transform.h>
-#include <tvm/runtime/registry.h>
-#include <tvm/te/operation.h>
-#include <tvm/te/schedule.h>
-
-#include <algorithm>
-#include <stack>
-#include <unordered_set>
-#include <vector>
-
-#include "graph.h"
-
-namespace tvm {
-namespace te {
-
-// find first occurance location in leaf
-template <typename T>
-size_t FindNodeRef(ArrayNode* array_node, const T& v) {
-  const Object* n = v.get();
-  for (size_t i = 0; i < array_node->size(); ++i) {
-    if (array_node->at(i).get() == n) return i;
-  }
-  return array_node->size();
-}
-
-size_t FindLeafVar(ArrayNode* all_vars, ArrayNode* leaf_vars, const IterVar& v) {
-  size_t pos = FindNodeRef(leaf_vars, v);
-  if (pos < leaf_vars->size()) return pos;
-
-  if (FindNodeRef(all_vars, v) < all_vars->size()) {
-    LOG(FATAL) << "Operate on iter var " << v << "that has already been split";
-  } else {
-    LOG(FATAL) << "Operate on iter var " << v << "that is not part of the schedule";
-  }
-  return 0;
-}
-
-DataType MatchDataType(std::vector<DataType> dtypes) {
-  int max_bits = -1;
-  for (const auto& dtype : dtypes) {
-    ICHECK(dtype.is_int());
-    ICHECK(dtype.is_scalar());
-    max_bits = std::max(max_bits, dtype.bits());
-  }
-  return DataType::Int(max_bits);
-}
-
-void SplitHelper(StageNode* self, IterVar parent, PrimExpr factor, PrimExpr nparts,
-                 IterVar* p_outer, IterVar* p_inner, bool disable_predication) {
-  // Check if split is valid.
-  ICHECK(parent->iter_type == kDataPar || parent->iter_type == kCommReduce ||
-         parent->iter_type == kOrdered)
-      << "Cannot split on " << IterVarType2String(parent->iter_type);
-  IterVar outer = IterVar(Range(), parent->var.copy_with_suffix(".outer"), parent->iter_type);
-  IterVar inner = IterVar(Range(), parent->var.copy_with_suffix(".inner"), parent->iter_type);
-  *p_outer = outer;
-  *p_inner = inner;
-  // The splits
-  Array<IterVar>& all_vars = self->all_iter_vars;
-  Array<IterVar>& leaf_vars = self->leaf_iter_vars;
-  size_t pos = FindLeafVar(all_vars.GetArrayNode(), leaf_vars.GetArrayNode(), parent);
-  self->relations.push_back(Split(parent, outer, inner, factor, nparts, disable_predication));
-  // add vars to all vars
-  all_vars.push_back(outer);
-  all_vars.push_back(inner);
-  // replace the position.
-  leaf_vars.erase(leaf_vars.begin() + pos);
-  leaf_vars.insert(leaf_vars.begin() + pos, inner);
-  leaf_vars.insert(leaf_vars.begin() + pos, outer);
-}
-
-Stage::Stage(Operation op, const ScheduleNode* sch) {
-  auto n = make_object<StageNode>();
-  n->op = op;
-  n->origin_op = op;
-  n->all_iter_vars = op->root_iter_vars();
-  // remove opaque var from leaf.
-  Array<IterVar> clean;
-  for (IterVar iv : n->all_iter_vars) {
-    if (iv->iter_type != kOpaque) clean.push_back(iv);
-  }
-  if (clean.size() == n->all_iter_vars.size()) {
-    n->leaf_iter_vars = n->all_iter_vars;
-  } else {
-    n->leaf_iter_vars = clean;
-  }
-  n->attach_sch = sch;
-  data_ = std::move(n);
-}
-
-bool Stage::is_scheduled() const {
-  const StageNode* n = operator->();
-  return !(n->relations.empty() && n->attach_type == kGroupRoot &&
-           n->all_iter_vars.same_as(n->leaf_iter_vars));
-}
-
-Stage Stage::GetAttachSpec() const {
-  Stage attach_spec = *this;
-  while (attach_spec->attach_type == kGroupRoot && attach_spec->group.defined()) {
-    attach_spec = attach_spec->group;
-  }
-  return attach_spec;
-}
-
-Stage& Stage::set_scope(std::string scope) {  // NOLINT(*)
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  (*this)->scope = scope;
-  return *this;
-}
-
-Stage& Stage::compute_at(Stage parent, IterVar scope) {  // NOLINT(*)
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  ICHECK_NE((*this)->attach_type, kScanUpdate) << "Cannot specify compute_at for scan updates";
-  // Group constraint checking.
-  Stage group = (*this)->group;
-  if (group.defined()) {
-    Stage pg = parent->group;
-    while (pg.defined() && !pg.same_as(group)) {
-      pg = pg->group;
-    }
-    ICHECK(pg.same_as(group)) << "Can only assign compute_at to stages within the same group";
-  }
-
-  (*this)->attach_type = kScope;
-  (*this)->attach_ivar = scope;
-  (*this)->attach_stage = parent;
-  bool found = false;
-  for (size_t i = 0; i < parent->leaf_iter_vars.size(); ++i) {
-    if (scope == parent->leaf_iter_vars[i]) {
-      found = true;
-      break;
-    }
-  }
-  ICHECK(found) << "Cannot find the axis " << scope << " in parent's leaf_iter_vars"
-                << " parent=" << parent;
-  return *this;
-}
-
-Stage& Stage::compute_inline() {  // NOLINT(*)
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  ICHECK_NE((*this)->attach_type, kScanUpdate) << "Cannot specify compute_at for scan updates";
-  (*this)->attach_type = kInline;
-  return *this;
-}
-
-Stage& Stage::compute_root() {  // NOLINT(*)
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  ICHECK_NE((*this)->attach_type, kScanUpdate) << "Cannot specify compute_at for scan updates";
-  (*this)->attach_type = kGroupRoot;
-  return *this;
-}
-
-Stage& Stage::bind(IterVar ivar, IterVar thread_ivar) {  // NOLINT(*)
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  StageNode* self = operator->();
-  ICHECK(ivar->iter_type == kDataPar || ivar->iter_type == kCommReduce)
-      << "Cannot bind " << IterVarType2String(ivar->iter_type) << " to thread";
-  ICHECK(thread_ivar->iter_type == kThreadIndex)
-      << "Cannot rebase by " << IterVarType2String(ivar->iter_type)
-      << ", only thread axis is allowed so far";
-  ArrayNode* all_vars = self->all_iter_vars.CopyOnWrite();
-  ArrayNode* leaf_vars = self->leaf_iter_vars.CopyOnWrite();
-  FindLeafVar(all_vars, leaf_vars, ivar);
-
-  auto it = self->iter_var_attrs.find(ivar);
-  ObjectPtr<IterVarAttrNode> n;
-  if (it != self->iter_var_attrs.end()) {
-    n = make_object<IterVarAttrNode>(*(*it).second.operator->());
-    if (n->bind_thread.defined() && !n->bind_thread.same_as(thread_ivar)) {
-      LOG(WARNING) << "Axis " << ivar << " is already bind to another thread " << n->bind_thread;
-    }
-  } else {
-    n = make_object<IterVarAttrNode>();
-  }
-  n->bind_thread = thread_ivar;
-  self->iter_var_attrs.Set(ivar, IterVarAttr(n));
-  return *this;
-}
-
-Stage& Stage::env_threads(Array<IterVar> threads) {
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  StageNode* self = operator->();
-  ICHECK(self->op.defined() && self->op.as<ScanOpNode>())
-      << "env_threads is only valid for composite ops such as ScanOp";
-  ICHECK_EQ(self->env_threads.size(), 0U) << "Already set env_threads";
-  Array<IterVar>& leaf_vars = self->leaf_iter_vars;
-  Array<IterVar>& all_vars = self->all_iter_vars;
-  std::vector<IterVar> temp;
-  for (IterVar iv : threads) {
-    temp.push_back(iv);
-  }
-  leaf_vars.insert(leaf_vars.begin(), temp.begin(), temp.end());
-  all_vars.insert(all_vars.end(), temp.begin(), temp.end());
-  self->env_threads = threads;
-  return *this;
-}
-
-Stage& Stage::set_store_predicate(PrimExpr predicate) {
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  StageNode* self = operator->();
-  self->store_predicate = predicate;
-  return *this;
-}
-
-Stage& Stage::split(IterVar parent, PrimExpr factor, IterVar* p_outer, IterVar* p_inner,
-                    bool disable_predication) {  // NOLINT(*)
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  SplitHelper(operator->(), parent, factor, PrimExpr(), p_outer, p_inner, disable_predication);
-  return *this;
-}
-
-Stage& Stage::split_by_nparts(IterVar parent, PrimExpr nparts, IterVar* p_outer, IterVar* p_inner,
-                              bool disable_predication) {  // NOLINT(*)
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  SplitHelper(operator->(), parent, PrimExpr(), nparts, p_outer, p_inner, disable_predication);
-  return *this;
-}
-
-Stage& Stage::fuse(IterVar outer, IterVar inner, IterVar* p_target) {  // NOLINT(*)
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  StageNode* self = operator->();
-  ICHECK(outer->iter_type == kDataPar || outer->iter_type == kCommReduce ||
-         outer->iter_type == kOrdered)
-      << "Cannot fuse " << IterVarType2String(outer->iter_type);
-  ICHECK(inner->iter_type == kDataPar || inner->iter_type == kCommReduce ||
-         inner->iter_type == kOrdered)
-      << "Cannot fuse " << IterVarType2String(inner->iter_type);
-
-  IterVarType iter_type = outer->iter_type;
-  if (inner->iter_type > iter_type) iter_type = inner->iter_type;
-  std::string fused_name = outer->var->name_hint + "." + inner->var->name_hint + ".fused";
-  DataType iter_dtype = MatchDataType({inner->var.dtype(), outer->var.dtype()});
-
-  IterVar fused = IterVar(Range(), Var(fused_name, iter_dtype), iter_type);
-
-  Array<IterVar>& all_vars = self->all_iter_vars;
-  Array<IterVar>& leaf_vars = self->leaf_iter_vars;
-
-  size_t pos_inner = FindLeafVar(all_vars.GetArrayNode(), leaf_vars.GetArrayNode(), inner);
-  size_t pos_outer = FindLeafVar(all_vars.GetArrayNode(), leaf_vars.GetArrayNode(), outer);
-  if (pos_inner + 1 == pos_outer) {
-    std::swap(outer, inner);
-    std::swap(pos_inner, pos_outer);
-  }
-  ICHECK_EQ(pos_inner, pos_outer + 1)
-      << "Can only fuse iterations that are consecutive between each other";
-  self->relations.push_back(Fuse(outer, inner, fused));
-  all_vars.push_back(fused);
-  leaf_vars.erase(leaf_vars.begin() + pos_outer, leaf_vars.begin() + pos_inner + 1);
-  leaf_vars.insert(leaf_vars.begin() + pos_outer, fused);
-  *p_target = fused;
-  return *this;
-}
-
-Stage& Stage::fuse(const Array<IterVar>& axes, IterVar* p_target) {  // NOLINT(*)
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  if (axes.size() != 0) {
-    IterVar fused = axes[0];
-    for (size_t i = 1; i < axes.size(); ++i) {
-      this->fuse(fused, axes[i], &fused);
-    }
-    *p_target = std::move(fused);
-  } else {
-    StageNode* self = operator->();
-    // special handle fuse empty array.
-    // insert at the outer most loop
-    IterVar singleton =
-        IterVar(Range::FromMinExtent(0, 1), Var("singleton", DataType::Int(32)), kDataPar);
-    self->relations.push_back(Singleton(singleton));
-    Array<IterVar>& all_vars = self->all_iter_vars;
-    Array<IterVar>& leaf_vars = self->leaf_iter_vars;
-    all_vars.push_back(singleton);
-    leaf_vars.insert(leaf_vars.begin(), singleton);
-    *p_target = singleton;
-  }
-  return *this;
-}
-
-Stage& Stage::reorder(const Array<IterVar>& order) {  // NOLINT(*)
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  std::unordered_set<IterVar> seen_var;
-  StageNode* self = operator->();
-  for (IterVar iv : order) {
-    ICHECK(iv->iter_type == kDataPar || iv->iter_type == kCommReduce ||
-           iv->iter_type == kThreadIndex)
-        << "Cannot reorder IterVar(" << IterVarType2String(iv->iter_type) << ")";
-
-    ICHECK_EQ(seen_var.count(iv), 0) << "Same axis can not appear more than once " << iv;
-    seen_var.insert(iv);
-  }
-  ArrayNode* all_vars = self->all_iter_vars.CopyOnWrite();
-  ArrayNode* leaf_vars = self->leaf_iter_vars.CopyOnWrite();
-  std::vector<size_t> pos;
-
-  for (size_t i = 0; i < order.size(); ++i) {
-    pos.push_back(FindLeafVar(all_vars, leaf_vars, order[i]));
-  }
-  std::vector<ObjectRef> temp;
-  for (size_t i = 0; i < pos.size(); ++i) {
-    temp.emplace_back(leaf_vars->at(pos[i]));
-  }
-  std::sort(pos.begin(), pos.end());
-  for (size_t i = 0; i < pos.size(); ++i) {
-    leaf_vars->SetItem(pos[i], temp[i]);
-  }
-  return *this;
-}
-
-Stage& Stage::tile(IterVar x_parent, IterVar y_parent, PrimExpr x_factor, PrimExpr y_factor,
-                   IterVar* p_x_outer, IterVar* p_y_outer, IterVar* p_x_inner, IterVar* p_y_inner) {
-  split(x_parent, x_factor, p_x_outer, p_x_inner);
-  split(y_parent, y_factor, p_y_outer, p_y_inner);
-  reorder(Array<IterVar>({*p_x_outer, *p_y_outer, *p_x_inner, *p_y_inner}));
-  return *this;
-}
-
-template <typename FUpdate>
-inline void UpdateIterVarAttr(StageNode* self, IterVar var, FUpdate fupdate,
-                              bool need_leaf = true) {
-  if (need_leaf) {
-    ArrayNode* all_vars = self->all_iter_vars.CopyOnWrite();
-    ArrayNode* leaf_vars = self->leaf_iter_vars.CopyOnWrite();
-    FindLeafVar(all_vars, leaf_vars, var);
-  }
-  auto it = self->iter_var_attrs.find(var);
-  ObjectPtr<IterVarAttrNode> n;
-  if (it != self->iter_var_attrs.end()) {
-    n = make_object<IterVarAttrNode>(*(*it).second.operator->());
-  } else {
-    n = make_object<IterVarAttrNode>();
-  }
-  fupdate(n.get());
-  self->iter_var_attrs.Set(var, IterVarAttr(n));
-}
-
-inline void SetAttrIterType(StageNode* self, IterVar var, IterVarType iter_type) {
-  UpdateIterVarAttr(self, var, [iter_type](IterVarAttrNode* n) { n->iter_type = iter_type; });
-}
-
-Stage& Stage::vectorize(IterVar var) {  // NOLINT(*)
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  ICHECK(var->iter_type == kDataPar || var->iter_type == kOpaque || var->iter_type == kUnrolled ||
-         var->iter_type == kVectorized || var->iter_type == kTensorized ||
-         var->iter_type == kParallelized)
-      << "Cannot vectorize on " << IterVarType2String(var->iter_type);
-  SetAttrIterType(operator->(), var, kVectorized);
-  return *this;
-}
-
-Stage& Stage::tensorize(IterVar var, TensorIntrin f) {  // NOLINT(*)
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  UpdateIterVarAttr(operator->(), var, [f](IterVarAttrNode* n) {
-    n->iter_type = kTensorized;
-    n->tensor_intrin = f;
-  });
-  return *this;
-}
-
-Stage& Stage::unroll(IterVar var) {  // NOLINT(*)
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  SetAttrIterType(operator->(), var, kUnrolled);
-  return *this;
-}
-
-Stage& Stage::parallel(IterVar var) {  // NOLINT(*)
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  SetAttrIterType(operator->(), var, kParallelized);
-  return *this;
-}
-
-Stage& Stage::pragma(IterVar var, const std::string& pragma_type,
-                     const PrimExpr& pragma_value) {  // NOLINT(*)
-  if (pragma_type == "unroll") {
-    this->unroll(var);
-  } else if (pragma_type == "vectorize") {
-    this->vectorize(var);
-  } else {
-    With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-    UpdateIterVarAttr(operator->(), var, [pragma_type, pragma_value](IterVarAttrNode* n) {
-      n->pragma_keys.push_back(tir::StringImm(pragma_type));
-      n->pragma_values.push_back(pragma_value);
-    });
-  }
-  return *this;
-}
-
-Stage& Stage::prefetch(const Tensor& tensor, IterVar var, PrimExpr offset) {
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  StageNode* self = operator->();
-  ArrayNode* all_vars = self->all_iter_vars.CopyOnWrite();
-  ArrayNode* leaf_vars = self->leaf_iter_vars.CopyOnWrite();
-  FindLeafVar(all_vars, leaf_vars, var);
-  auto it = self->iter_var_attrs.find(var);
-  ObjectPtr<IterVarAttrNode> n;
-  if (it != self->iter_var_attrs.end()) {
-    n = make_object<IterVarAttrNode>(*(*it).second.operator->());
-  } else {
-    n = make_object<IterVarAttrNode>();
-  }
-  n->prefetch_data.push_back(tensor);
-  n->prefetch_offset.push_back(offset);
-  self->iter_var_attrs.Set(var, IterVarAttr(n));
-  return *this;
-}
-
-Stage& Stage::storage_align(IterVar axis, int factor, int offset) {
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  StageNode* self = operator->();
-  UpdateIterVarAttr(
-      self, axis,
-      [factor, offset](IterVarAttrNode* n) {
-        n->dim_align_factor = factor;
-        n->dim_align_offset = offset;
-      },
-      false);
-  return *this;
-}
-
-Stage& Stage::double_buffer() {
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  StageNode* self = operator->();
-  ICHECK(!self->is_output) << "Cannot apply double buffer on output";
-  self->double_buffer = true;
-  return *this;
-}
-
-Stage& Stage::rolling_buffer() {
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  StageNode* self = operator->();
-  ICHECK(!self->is_output) << "Cannot apply rolling buffer on output";
-  self->rolling_buffer = true;
-  return *this;
-}
-Stage& Stage::transform_layout(const Array<Var>& initial_indices,
-                               const Array<PrimExpr>& final_indices,
-                               Array<IterVar>* out_iter_vars) {
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  StageNode* self = operator->();
-  IndexMap map(initial_indices, final_indices);
-  self->layout_transforms.push_back(map);
-
-  auto* compute = self->op.as<ComputeOpNode>();
-
-  // Can only rewrite the indices of compute op nodes.
-  if (!compute) {
-    return *this;
-  }
-
-  CHECK_EQ(initial_indices.size(), compute->axis.size())
-      << "Expected number of initial indices in transformation to match the dimension of "
-      << self->op->name;
-
-  // Locate the IterVar objects for the data axes.
-  auto leaf_iter_range = [&]() -> std::pair<size_t, size_t> {
-    std::vector<size_t> leaf_var_indices;
-    for (const auto& axis : compute->axis) {
-      leaf_var_indices.push_back(
-          FindLeafVar(self->all_iter_vars.CopyOnWrite(), self->leaf_iter_vars.CopyOnWrite(), axis));
-    }
-    auto minmax_element = std::minmax_element(leaf_var_indices.begin(), leaf_var_indices.end());
-    return {*minmax_element.first, *minmax_element.second + 1};
-  }();
-  CHECK_EQ(leaf_iter_range.first + compute->axis.size(), leaf_iter_range.second)
-      << "Cannot transform indices if they have already been reordered";
-
-  // Determine the updated ranges of iteration.
-  Array<Range> initial_ranges;
-  for (const auto& iter_var : compute->axis) {
-    initial_ranges.push_back(iter_var->dom);
-  }
-  arith::Analyzer analyzer;
-  Array<Range> final_ranges = map->MapRanges(initial_ranges, &analyzer);
-
-  // Make IterVar objects to represent the new iterations.
-  auto inverse = map.Inverse(initial_ranges, &analyzer);
-  Array<IterVar> final_indices_iter;
-  ICHECK_EQ(inverse->initial_indices.size(), final_ranges.size());
-  for (size_t i = 0; i < inverse->initial_indices.size(); i++) {
-    final_indices_iter.push_back(IterVar(final_ranges[i], inverse->initial_indices[i], kDataPar));
-  }
-
-  // Append the new IterVar objects to all_iter_vars
-  for (const auto& iter_var : final_indices_iter) {
-    self->all_iter_vars.push_back(iter_var);
-  }
-
-  // Replace the existing IterVar objects in leaf_iter_vars with the
-  // new IterVar objects.
-  self->leaf_iter_vars.erase(self->leaf_iter_vars.begin() + leaf_iter_range.first,
-                             self->leaf_iter_vars.begin() + leaf_iter_range.second);
-  self->leaf_iter_vars.insert(self->leaf_iter_vars.begin() + leaf_iter_range.first,
-                              final_indices_iter.begin(), final_indices_iter.end());
-
-  // Define a relationship for each new axis
-  self->relations.push_back(Transform(compute->axis, final_indices_iter, map, inverse));
-
-  // Return the iteration variables as an output.
-  if (out_iter_vars) {
-    *out_iter_vars = final_indices_iter;
-  }
-
-  return *this;
-}
-
-Stage& Stage::set_axis_separators(const Array<IntImm>& axis_separators) {
-  With<ScheduleContext> ctx(operator->()->attach_sch, __func__);
-  StageNode* self = operator->();
-  self->axis_separators = axis_separators;
-  return *this;
-}
-
-Stage CopyStage(const Stage& s) {
-  ObjectPtr<StageNode> n = make_object<StageNode>(*s.operator->());
-  return Stage(n);
-}
-
-Schedule Schedule::copy() const {
-  // map of stages.
-  const ScheduleNode* self = operator->();
-  std::unordered_map<Stage, Stage, ObjectPtrHash, ObjectPtrEqual> smap;
-  ObjectPtr<ScheduleNode> n = make_object<ScheduleNode>();
-  n->outputs = self->outputs;
-  // Copy the stages.
-  for (Stage s : self->stages) {
-    Stage scopy = CopyStage(s);
-    smap[s] = scopy;
-    n->stages.push_back(scopy);
-  }
-  for (Stage g : self->groups) {
-    Stage gcopy = CopyStage(g);
-    smap[g] = gcopy;
-    n->groups.push_back(gcopy);
-  }
-  // Remaps the reference relations.
-  for (auto kv : self->stage_map) {
-    n->stage_map.Set(kv.first, smap.at(kv.second));
-  }
-  for (Stage s : n->stages) {
-    if (s->attach_stage.defined()) {
-      ICHECK(smap.find(s->attach_stage) != smap.end())
-          << s->attach_stage << " not found in " << (*this);
-      s->attach_stage = smap.at(s->attach_stage);
-    }
-    if (s->group.defined()) {
-      ICHECK(smap.find(s->group) != smap.end()) << s->group << " not found in " << (*this);
-      s->group = smap.at(s->group);
-    }
-  }
-  for (Stage s : n->groups) {
-    if (s->attach_stage.defined()) {
-      ICHECK(smap.find(s->attach_stage) != smap.end())
-          << s->attach_stage << " not found in " << (*this);
-      s->attach_stage = smap.at(s->attach_stage);
-    }
-    if (s->group.defined()) {
-      ICHECK(smap.find(s->group) != smap.end()) << s->group << " not found in " << (*this);
-      s->group = smap.at(s->group);
-    }
-  }
-  return Schedule(n);
-}
-
-Stage Schedule::operator[](const Operation& op) {
-  auto it = (*this)->stage_map.find(op);
-  ICHECK(it != (*this)->stage_map.end())
-      << "Cannot find Stage for operator " << op << " in the schedule";
-  return (*it).second;
-}
-
-Stage LeastCommonAncestor(Stage g1, Stage g2) {
-  if (!g1.defined()) return g1;
-  if (!g2.defined()) return g2;
-  if (g1.same_as(g2)) return g1;
-  Stage g = g1;
-  while (g.defined()) {
-    if (g.same_as(g2)) return g2;
-    g = g->group;
-  }
-  g = g2;
-  while (g.defined()) {
-    if (g.same_as(g1)) return g1;
-    g = g->group;
-  }
-  return g;
-}
-
-Array<Tensor> RemapTensor(ScheduleNode* self, const Array<Tensor>& arr) {
-  self->InitCache();
-  const auto& op2stage_cache = self->op2stage_cache_;
-  Array<Tensor> ret;
-  for (Tensor t : arr) {
-    if (!op2stage_cache.count(t->op.get())) {
-      ICHECK(self->stage_map.count(t->op)) << "Given tensor is not in the schedule plan";
-      t = self->stage_map[t->op]->op.output(t->value_index);
-    }
-    ret.push_back(t);
-  }
-  return ret;
-}
-
-// Group the schedule stages.
-Stage Schedule::create_group(const Array<Tensor>& outputs, const Array<Tensor>& inputs,
-                             bool include_inputs) {
-  ScheduleNode* self = operator->();
-  self->InitCache();
-  const auto& op2stage_cache = self->op2stage_cache_;
-  // Get the ops.
-  Array<Operation> ops =
-      te::GetSubGraph(RemapTensor(self, outputs), RemapTensor(self, inputs), include_inputs);
-  // local counter entry
-  // Automatically initialize to 0 during creation.
-  struct Entry {
-    int count{0};
-  };
-  // Map of group->touched counter
-  std::unordered_map<Stage, Entry, ObjectPtrHash, ObjectPtrEqual> counter;
-  // The parent group;
-  Stage parent_group;
-  // Detect common parent and child.
-  for (size_t i = 0; i < ops.size(); ++i) {
-    Operation op = ops[i];
-    auto it = op2stage_cache.find(op.get());
-    ICHECK(it != op2stage_cache.end());
-    Stage op_group = it->second->group;
-    if (i == 0) {
-      parent_group = op_group;
-    } else {
-      parent_group = LeastCommonAncestor(parent_group, op_group);
-    }
-    if (op_group.defined()) {
-      ++counter[op_group].count;
-    }
-  }
-  // Create the new group stage.
-  Stage gstage(make_object<StageNode>());
-  gstage->attach_sch = this->operator->();
-  gstage->group = parent_group;
-  if (parent_group.defined()) {
-    ++parent_group->num_child_stages;
-  }
-  // Propagate the counter statistics from by checking if subgroup
-  // Is full and propagate.
-  std::vector<Stage> stack;
-  for (auto& kv : counter) {
-    if (!kv.first.same_as(parent_group)) {
-      if (kv.first->num_child_stages == kv.second.count) {
-        stack.push_back(kv.first);
-      }
-    }
-  }
-  while (!stack.empty()) {
-    Stage g = stack.back();
-    stack.pop_back();
-    if (g->group.defined() && !g->group.same_as(parent_group)) {
-      Entry& e = counter[g->group];
-      ++e.count;
-      if (e.count == g->group->num_child_stages) {
-        stack.push_back(g->group);
-      }
-    }
-  }
-  // Verification and remappig the subgroups.
-  for (auto& kv : counter) {
-    if (kv.first.same_as(parent_group)) continue;
-    ICHECK_EQ(kv.first->num_child_stages, kv.second.count)
-        << "Trying to group region that intersect with an already existed group";
-    if (kv.first->group.same_as(parent_group)) {
-      Stage s = kv.first;
-      s->group = gstage;
-      ++gstage->num_child_stages;
-      if (parent_group.defined()) {
-        --parent_group->num_child_stages;
-      }
-    }
-  }
-  // Remap the group of op stages.
-  for (Operation op : ops) {
-    auto it = op2stage_cache.find(op.get());
-    ICHECK(it != op2stage_cache.end());
-    Stage s = it->second;
-    if (s->group.same_as(parent_group)) {
-      s->group = gstage;
-      ++gstage->num_child_stages;
-      if (parent_group.defined()) {
-        --parent_group->num_child_stages;
-      }
-    }
-  }
-  // Correct the attach to keep everything in group.
-  for (Operation op : ops) {
-    auto it = op2stage_cache.find(op.get());
-    ICHECK(it != op2stage_cache.end());
-    Stage s = it->second;
-    if (s->attach_type == kScope) {
-      Stage cg = LeastCommonAncestor(s->attach_stage->group, gstage);
-      if (!cg.same_as(gstage)) {
-        LOG(WARNING) << "group invalidates some previous compute_at relation "
-                     << " and keeps things to be computed inside the group";
-        s.compute_root();
-      }
-    }
-  }
-
-  self->groups.push_back(gstage);
-  return gstage;
-}
-
-void ScheduleNode::InvalidateCache() { op2stage_cache_.clear(); }
-
-void ScheduleNode::InitCache() {
-  if (op2stage_cache_.size() == stages.size()) return;
-  InvalidateCache();
-  for (Stage s : stages) {
-    if (s->op.defined()) {
-      op2stage_cache_[s->op.get()] = s;
-    }
-  }
-  ICHECK_EQ(op2stage_cache_.size(), stages.size());
-}
-
-bool ScheduleNode::Contain(const Operation& op) const {
-  return stage_map.find(op) != stage_map.end();
-}
-
-TVM_REGISTER_PASS_CONFIG_OPTION("te.keep_schedule_record", Bool);
-
-Schedule::Schedule(Array<Operation> ops) {
-  auto n = make_object<ScheduleNode>();
-  data_ = n;
-  n->outputs = ops;
-  auto g = te::CreateReadGraph(n->outputs);
-  Array<Operation> post_order = te::PostDFSOrder(n->outputs, g);
-  // output set.
-  std::unordered_set<Operation> output_set;
-  for (Operation x : ops) {
-    output_set.insert(x);
-  }
-  for (Operation op : post_order) {
-    Stage stage(op, this->operator->());
-    stage->is_output = output_set.count(op) != 0;
-    n->stages.push_back(stage);
-    n->stage_map.Set(op, stage);
-    // mark scan updates.
-    if (const ScanOpNode* scan = op.as<ScanOpNode>()) {
-      Array<Tensor> inputs;
-      for (Tensor t : scan->state_placeholder) {
-        inputs.push_back(t);
-      }
-      for (Tensor t : scan->inputs) {
-        inputs.push_back(t);
-      }
-      // Create the scan group.
-      Stage scan_group = this->create_group(scan->update, inputs, false);
-      scan_group->attach_type = kScanUpdate;
-      scan_group->attach_stage = stage;
-
-      for (size_t i = 0; i < scan->update.size(); ++i) {
-        Stage s = n->stage_map[scan->update[i]->op];
-        ICHECK(scan_group.same_as(s->group));
-      }
-    }
-  }
-  transform::PassContext pass_ctx = transform::PassContext::Current();
-  n->keep_schedule_record = pass_ctx->GetConfig<Bool>("te.keep_schedule_record", Bool(false));
-  if (n->keep_schedule_record.value()) {
-    // push plain schedule as the very first one
-    n->schedule_record.push_back(copy());
-    n->primitive_record.push_back("vanilla");
-  }
-}
-
-ScheduleContext::ScheduleContext(const ScheduleNode* sch_node, String current_primitive_name)
-    : sch_(GetRef<Schedule>(sch_node)), current_primitive_name_(current_primitive_name) {}
-
-void ScheduleContext::EnterWithScope() {}
-
-void ScheduleContext::ExitWithScope() {
-  if (sch_.defined() && sch_->keep_schedule_record.value()) {
-    sch_->schedule_record.push_back(sch_.copy());
-    sch_->primitive_record.push_back(current_primitive_name_);
-  }
-}
-
-Split::Split(IterVar parent, IterVar outer, IterVar inner, PrimExpr factor, PrimExpr nparts,
-             bool disable_predication) {
-  auto n = make_object<SplitNode>();
-  n->parent = parent;
-  n->outer = outer;
-  n->inner = inner;
-  n->factor = factor;
-  n->nparts = nparts;
-  n->disable_predication = disable_predication;
-  data_ = std::move(n);
-}
-
-Fuse::Fuse(IterVar outer, IterVar inner, IterVar fused) {
-  auto n = make_object<FuseNode>();
-  n->outer = outer;
-  n->inner = inner;
-  n->fused = fused;
-  data_ = std::move(n);
-}
-
-Rebase::Rebase(IterVar parent, IterVar rebased) {
-  auto n = make_object<RebaseNode>();
-  n->parent = parent;
-  n->rebased = rebased;
-  data_ = std::move(n);
-}
-
-Singleton::Singleton(IterVar iter) {
-  auto n = make_object<SingletonNode>();
-  n->iter = iter;
-  data_ = std::move(n);
-}
-
-Transform::Transform(Array<IterVar> original_variables, Array<IterVar> transformed_variables,
-                     IndexMap forward_transformation, IndexMap inverse_transformation) {
-  auto n = make_object<TransformNode>();
-  n->original_variables = original_variables;
-  n->transformed_variables = transformed_variables;
-  n->forward_transformation = forward_transformation;
-  n->inverse_transformation = inverse_transformation;
-  data_ = std::move(n);
-}
-
-SpecializedCondition::SpecializedCondition(Array<PrimExpr> conditions) {
-  ObjectPtr<SpecializedConditionNode> n = make_object<SpecializedConditionNode>();
-  n->clauses = std::move(conditions);
-  data_ = std::move(n);
-}
-
-/*! \brief Entry to hold the SpecializedCondition context stack. */
-struct TVMSpecializationThreadLocalEntry {
-  /*! \brief The current specialized condition */
-  std::stack<SpecializedCondition> condition_stack;
-};
-
-/*! \brief Thread local store to hold the Target context stack. */
-typedef dmlc::ThreadLocalStore<TVMSpecializationThreadLocalEntry> TVMSpecializationThreadLocalStore;
-
-void SpecializedCondition::EnterWithScope() {
-  TVMSpecializationThreadLocalEntry* entry = TVMSpecializationThreadLocalStore::Get();
-  entry->condition_stack.push(*this);
-}
-
-void SpecializedCondition::ExitWithScope() {
-  TVMSpecializationThreadLocalEntry* entry = TVMSpecializationThreadLocalStore::Get();
-  ICHECK(!entry->condition_stack.empty());
-  ICHECK(entry->condition_stack.top().same_as(*this));
-  entry->condition_stack.pop();
-}
-
-SpecializedCondition SpecializedCondition::Current() {
-  TVMSpecializationThreadLocalEntry* entry = TVMSpecializationThreadLocalStore::Get();
-  SpecializedCondition cond;
-  if (entry->condition_stack.size() > 0) {
-    cond = entry->condition_stack.top();
-  }
-  return cond;
-}
-
-class SpecializedCondition::Internal {
- public:
-  static void EnterScope(SpecializedCondition cond) { cond.EnterWithScope(); }
-
-  static void ExitScope(SpecializedCondition cond) { cond.ExitWithScope(); }
-};
-
-TVM_REGISTER_NODE_TYPE(StageNode);
-TVM_REGISTER_NODE_TYPE(IterVarAttrNode);
-TVM_REGISTER_NODE_TYPE(SplitNode);
-TVM_REGISTER_NODE_TYPE(FuseNode);
-TVM_REGISTER_NODE_TYPE(RebaseNode);
-TVM_REGISTER_NODE_TYPE(SingletonNode);
-TVM_REGISTER_NODE_TYPE(ScheduleNode);
-TVM_REGISTER_NODE_TYPE(SpecializedConditionNode);
-
-// Printer
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<StageNode>([](const ObjectRef& node, ReprPrinter* p) {
-      auto* op = static_cast<const StageNode*>(node.get());
-      if (op->op.defined()) {
-        p->stream << "stage(" << op->origin_op->name << ", " << op->op << ")";
-      } else {
-        p->stream << "group-stage(" << op << ")";
-      }
-    })
-    .set_dispatch<IterVarAttrNode>([](const ObjectRef& node, ReprPrinter* p) {
-      auto* op = static_cast<const IterVarAttrNode*>(node.get());
-      p->stream << IterVarType2String(op->iter_type);
-    })
-    .set_dispatch<SplitNode>([](const ObjectRef& node, ReprPrinter* p) {
-      auto* op = static_cast<const SplitNode*>(node.get());
-      p->stream << "split(parent=";
-      p->Print(op->parent);
-      p->stream << ", outer=";
-      p->Print(op->outer);
-      p->stream << ", inner=";
-      p->Print(op->inner);
-      if (op->factor.defined()) {
-        p->stream << ", factor=";
-        p->Print(op->factor);
-      } else {
-        p->stream << ", nparts=";
-        p->Print(op->nparts);
-      }
-      p->stream << ", disable_predication=";
-      p->stream << op->disable_predication;
-      p->stream << ')';
-    })
-    .set_dispatch<FuseNode>([](const ObjectRef& node, ReprPrinter* p) {
-      auto* op = static_cast<const FuseNode*>(node.get());
-      p->stream << "fuse(";
-      p->stream << "outer=";
-      p->Print(op->outer);
-      p->stream << ", inner=";
-      p->Print(op->inner);
-      p->stream << ", fused=";
-      p->Print(op->fused);
-      p->stream << ')';
-    })
-    .set_dispatch<RebaseNode>([](const ObjectRef& node, ReprPrinter* p) {
-      auto* op = static_cast<const RebaseNode*>(node.get());
-      p->stream << "rebase(";
-      p->stream << "parent=";
-      p->Print(op->parent);
-      p->stream << ", rebased=";
-      p->Print(op->rebased);
-      p->stream << ')';
-    })
-    .set_dispatch<SingletonNode>([](const ObjectRef& node, ReprPrinter* p) {
-      auto* op = static_cast<const SingletonNode*>(node.get());
-      p->stream << "singleton(";
-      p->Print(op->iter);
-      p->stream << ')';
-    })
-    .set_dispatch<ScheduleNode>([](const ObjectRef& node, ReprPrinter* p) {
-      auto* op = static_cast<const ScheduleNode*>(node.get());
-      p->stream << "schedule(" << op << ")";
-    })
-    .set_dispatch<SpecializedConditionNode>([](const ObjectRef& node, ReprPrinter* p) {
-      auto* op = static_cast<const SpecializedConditionNode*>(node.get());
-      p->stream << "specialized_condition(";
-      p->Print(op->clauses);
-      p->stream << ')';
-    });
-
-TVM_REGISTER_GLOBAL("te.CreateSchedule").set_body_typed(create_schedule);
-
-TVM_REGISTER_GLOBAL("te.StageSetScope").set_body_method(&Stage::set_scope);
-
-TVM_REGISTER_GLOBAL("te.StageBind").set_body_method(&Stage::bind);
-
-TVM_REGISTER_GLOBAL("te.StageSplitByFactor")
-    .set_body_typed([](Stage stage, IterVar parent, PrimExpr factor, bool disable_predication) {
-      IterVar outer, inner;
-      stage.split(parent, factor, &outer, &inner, disable_predication);
-      return Array<IterVar>({outer, inner});
-    });
-
-TVM_REGISTER_GLOBAL("te.StageSplitByNParts")
-    .set_body_typed([](Stage stage, IterVar parent, PrimExpr nparts, bool disable_predication) {
-      IterVar outer, inner;
-      stage.split_by_nparts(parent, nparts, &outer, &inner, disable_predication);
-      return Array<IterVar>({outer, inner});
-    });
-
-TVM_REGISTER_GLOBAL("te.StageFuse").set_body_typed([](Stage stage, Array<IterVar> axes) {
-  IterVar fused;
-  stage.fuse(axes, &fused);
-  return fused;
-});
-
-TVM_REGISTER_GLOBAL("te.StageComputeAt").set_body_method(&Stage::compute_at);
-
-TVM_REGISTER_GLOBAL("te.StageComputeInline").set_body_method(&Stage::compute_inline);
-
-TVM_REGISTER_GLOBAL("te.StageComputeRoot").set_body_method(&Stage::compute_root);
-
-TVM_REGISTER_GLOBAL("te.StageReorder").set_body_method(&Stage::reorder);
-
-TVM_REGISTER_GLOBAL("te.StageTile")
-    .set_body_typed([](Stage stage, IterVar x_parent, IterVar y_parent, PrimExpr x_factor,
-                       PrimExpr y_factor) {
-      IterVar x_outer, y_outer, x_inner, y_inner;
-      stage.tile(x_parent, y_parent, x_factor, y_factor, &x_outer, &y_outer, &x_inner, &y_inner);
-      return Array<IterVar>({x_outer, y_outer, x_inner, y_inner});
-    });
-
-TVM_REGISTER_GLOBAL("te.StageEnvThreads").set_body_method(&Stage::env_threads);
-
-TVM_REGISTER_GLOBAL("te.StageSetStorePredicate").set_body_method(&Stage::set_store_predicate);
-
-TVM_REGISTER_GLOBAL("te.StageUnroll").set_body_method(&Stage::unroll);
-
-TVM_REGISTER_GLOBAL("te.StageVectorize").set_body_method(&Stage::vectorize);
-
-TVM_REGISTER_GLOBAL("te.StageTensorize").set_body_method(&Stage::tensorize);
-
-TVM_REGISTER_GLOBAL("te.StageParallel").set_body_method(&Stage::parallel);
-
-TVM_REGISTER_GLOBAL("te.StagePragma").set_body_method(&Stage::pragma);
-
-TVM_REGISTER_GLOBAL("te.StagePrefetch").set_body_method(&Stage::prefetch);
-
-TVM_REGISTER_GLOBAL("te.StageStorageAlign").set_body_method(&Stage::storage_align);
-
-TVM_REGISTER_GLOBAL("te.StageDoubleBuffer").set_body_method(&Stage::double_buffer);
-
-TVM_REGISTER_GLOBAL("te.StageRollingBuffer").set_body_method(&Stage::rolling_buffer);
-
-TVM_REGISTER_GLOBAL("te.StageTransformLayout")
-    .set_body_typed([](Stage stage, const Array<Var>& initial_indices,
-                       const Array<PrimExpr>& final_indices) {
-      Array<IterVar> new_iter_vars;
-      stage.transform_layout(initial_indices, final_indices, &new_iter_vars);
-      return new_iter_vars;
-    });
-
-TVM_REGISTER_GLOBAL("te.StageSetAxisSeparators").set_body_method(&Stage::set_axis_separators);
-
-TVM_REGISTER_GLOBAL("te.ScheduleNormalize").set_body_method(&Schedule::normalize);
-
-TVM_REGISTER_GLOBAL("te.ScheduleCreateGroup").set_body_method(&Schedule::create_group);
-
-TVM_REGISTER_GLOBAL("te.ScheduleCacheRead").set_body_method(&Schedule::cache_read);
-
-TVM_REGISTER_GLOBAL("te.ScheduleCacheWrite").set_body([](TVMArgs args, TVMRetValue* ret) {
-  if (args[1].IsObjectRef<Tensor>()) {
-    *ret = args[0].operator Schedule().cache_write(args[1].operator Tensor(), args[2]);
-  } else {
-    *ret = args[0].operator Schedule().cache_write(args[1].operator Array<Tensor>(), args[2]);
-  }
-});
-
-TVM_REGISTER_GLOBAL("te.ScheduleRFactor").set_body_method(&Schedule::rfactor);
-
-TVM_REGISTER_GLOBAL("te.CreateSpecializedCondition").set_body_typed([](Array<PrimExpr> condition) {
-  return SpecializedCondition(condition);
-});
-
-TVM_REGISTER_GLOBAL("te.GetCurrentSpecialization").set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = SpecializedCondition::Current();
-});
-
-TVM_REGISTER_GLOBAL("te.EnterSpecializationScope")
-    .set_body_typed(SpecializedCondition::Internal::EnterScope);
-
-TVM_REGISTER_GLOBAL("te.ExitSpecializationScope")
-    .set_body_typed(SpecializedCondition::Internal::ExitScope);
-
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/schedule/schedule_ops.cc b/src/te/schedule/schedule_ops.cc
deleted file mode 100644
index d9818309c2d6..000000000000
--- a/src/te/schedule/schedule_ops.cc
+++ /dev/null
@@ -1,437 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file schedule_ops.cc
- */
-#include <tvm/runtime/registry.h>
-#include <tvm/te/operation.h>
-#include <tvm/te/schedule_pass.h>
-#include <tvm/tir/analysis.h>
-#include <tvm/tir/expr.h>
-#include <tvm/tir/stmt_functor.h>
-
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-
-#include "../../tir/transforms/ir_utils.h"
-#include "../operation/op_utils.h"
-#include "graph.h"
-
-namespace tvm {
-namespace te {
-
-using namespace tir;
-
-// Annotate the statement with the layout transforms and axis
-// separators of the stage.  These annotations are removed during
-// SchedulePostProcToPrimFunc.  Afterwards, layout transforms are
-// specified in the PrimFunc attrs, and the axis_separators are
-// specified in the BufferNode.
-Stmt WrapLayoutTransformationAttrs(const Stage& stage, Stmt body) {
-  if (stage->layout_transforms.size()) {
-    for (int i = 0; i < stage->op->num_outputs(); i++) {
-      body = AttrStmt(Array<ObjectRef>{stage->op.output(i), stage->layout_transforms},
-                      tir::attr::layout_transforms, 1, body);
-    }
-  }
-
-  if (stage->axis_separators.size()) {
-    for (int i = 0; i < stage->op->num_outputs(); i++) {
-      body = AttrStmt(Array<ObjectRef>{stage->op.output(i), stage->axis_separators},
-                      tir::attr::axis_separators, 1, body);
-    }
-  }
-
-  return body;
-}
-
-Stmt MakePipeline(const Stage& s, const std::unordered_map<IterVar, Range>& dom_map, Stmt consumer,
-                  bool debug_keep_trivial_loop) {
-  Stmt producer = s->op->BuildProvide(s, dom_map, debug_keep_trivial_loop);
-  if (s->double_buffer) {
-    producer = AttrStmt(s->op, tir::attr::double_buffer_scope, 1, producer);
-  }
-  producer = WrapLayoutTransformationAttrs(s, producer);
-  Stmt pipeline = producer;
-
-  if (consumer.defined() && !is_no_op(consumer)) {
-    pipeline = SeqStmt({producer, consumer});
-  }
-
-  if (s->rolling_buffer) {
-    pipeline = AttrStmt(s->op, tir::attr::rolling_buffer_scope, Bool(true), pipeline);
-  }
-
-  return s->op->BuildRealize(s, dom_map, pipeline, s->scope);
-}
-
-// inject the operator's realization on the stmt.
-class InjectAttach : public StmtMutator {
- public:
-  InjectAttach(const Stage& stage, const Stage& attach_spec,
-               const std::unordered_map<IterVar, Range>& dom_map, bool debug_keep_trivial_loop)
-      : stage_(stage),
-        attach_spec_(attach_spec),
-        dom_map_(dom_map),
-        debug_keep_trivial_loop_(debug_keep_trivial_loop) {}
-
-  Stmt VisitStmt(const Stmt& input_stmt) final {
-    ICHECK(input_stmt.defined());
-    auto stmt = StmtMutator::VisitStmt(input_stmt);
-    const AttrStmtNode* op = stmt.as<AttrStmtNode>();
-    if (op != nullptr && op->attr_key == tir::attr::loop_scope) {
-      if (attach_spec_->attach_type == kScope && op->node == attach_spec_->attach_ivar) {
-        ICHECK(!found_attach) << "Find IterVar" << attach_spec_->attach_ivar
-                              << " in multiple places in the IR";
-        found_attach = true;
-        stmt = AttrStmt(op->node, op->attr_key, op->value,
-                        MakePipeline(stage_, dom_map_, op->body, debug_keep_trivial_loop_));
-      }
-    }
-    return stmt;
-  }
-  // whether attach point is found
-  bool found_attach{false};
-
- private:
-  // The stage.
-  const Stage& stage_;
-  // The attach spec, may not contain op.
-  const Stage& attach_spec_;
-  // domain map
-  const std::unordered_map<IterVar, Range>& dom_map_;
-  // Whether keep trivial loops with extent of 1 during lowering.
-  // This is a debug feature for dataflow/axis analysis
-  bool debug_keep_trivial_loop_;
-};
-
-// inject the operator's realization on the stmt.
-class InjectScanStep : public StmtMutator {
- public:
-  InjectScanStep(const Stage& stage, const Operation& scan_op,
-                 const std::unordered_map<IterVar, Range>& dom_map, bool is_init,
-                 bool debug_keep_trivial_loop)
-      : stage_(stage),
-        scan_op_(scan_op),
-        dom_map_(dom_map),
-        is_init_(is_init),
-        debug_keep_trivial_loop_(debug_keep_trivial_loop) {}
-
-  Stmt VisitStmt(const Stmt& input_stmt) final {
-    ICHECK(input_stmt.defined());
-    auto stmt = StmtMutator::VisitStmt(input_stmt);
-    // update
-    const AttrStmtNode* op = stmt.as<AttrStmtNode>();
-    if (op != nullptr && ((op->attr_key == tir::attr::scan_update_scope && !is_init_) ||
-                          (op->attr_key == tir::attr::scan_init_scope && is_init_))) {
-      if (op->node.same_as(scan_op_)) {
-        found_attach = true;
-        stmt = AttrStmt(op->node, op->attr_key, op->value,
-                        MakePipeline(stage_, dom_map_, op->body, debug_keep_trivial_loop_));
-      }
-    }
-    return stmt;
-  }
-
-  // whether attach point is found
-  bool found_attach{false};
-
- private:
-  // the operations to be carried
-  const Stage& stage_;
-  const Operation& scan_op_;
-  // domain map
-  const std::unordered_map<IterVar, Range>& dom_map_;
-  // whether it is init.
-  bool is_init_;
-  // Whether keep trivial loops with extent of 1 during lowering.
-  // This is a debug feature for dataflow/axis analysis
-  bool debug_keep_trivial_loop_;
-};
-
-// Postprocessing of schedule op
-// Replace the init and update's expression by scan's buffer.
-class SchedulePostProc : public StmtExprMutator {
- public:
-  Stmt VisitStmt_(const LetStmtNode* op) final {
-    if (SideEffect(op->value) <= CallEffectKind::kPure) {
-      var_value_[op->var.get()] = this->VisitExpr(op->value);
-      return this->VisitStmt(op->body);
-    } else {
-      return StmtExprMutator::VisitStmt_(op);
-    }
-  }
-
-  Stmt VisitStmt_(const AttrStmtNode* op) final {
-    if (op->attr_key == tir::attr::loop_scope || op->attr_key == tir::attr::scan_init_scope) {
-      return this->VisitStmt(op->body);
-    } else if (op->attr_key == tir::attr::scan_update_scope) {
-      const ScanOpNode* scan = op->node.as<ScanOpNode>();
-      ICHECK(scan);
-      var_value_[scan->scan_axis->var.get()] = op->value;
-      return this->VisitStmt(op->body);
-    } else if (op->attr_key == tir::attr::thread_extent) {
-      // delete duplicated thread extent attr
-      auto it = thread_extent_scope_.find(op->node.get());
-      if (it != thread_extent_scope_.end()) {
-        ICHECK(is_zero(analyzer_.Simplify(it->second - op->value)));
-        return this->VisitStmt(op->body);
-      } else {
-        thread_extent_scope_[op->node.get()] = op->value;
-        Stmt ret = StmtExprMutator::VisitStmt_(op);
-        thread_extent_scope_.erase(op->node.get());
-        return ret;
-      }
-    } else if (op->attr_key == tir::attr::double_buffer_scope) {
-      auto it = replace_op_.find(op->node.get());
-      if (it != replace_op_.end()) {
-        if (it->second.defined()) {
-          Stmt ret = AttrStmt(it->second, op->attr_key, op->value, op->body);
-          return this->VisitStmt(ret);
-        } else {
-          return this->VisitStmt(op->body);
-        }
-      }
-    } else if (op->attr_key == tir::attr::buffer_bind_scope) {
-      Array<ObjectRef> tuple = Downcast<Array<ObjectRef>>(op->node);
-      Tensor tensor = Downcast<Tensor>(tuple[1]);
-      auto it = replace_op_.find(tensor->op.get());
-      if (it != replace_op_.end()) {
-        if (it->second.defined()) {
-          return AttrStmt(Array<ObjectRef>{tuple[0], it->second.output(tensor->value_index)},
-                          op->attr_key, op->value, this->VisitStmt(op->body));
-        } else {
-          return this->VisitStmt(op->body);
-        }
-      }
-    } else if (op->attr_key == tir::attr::buffer_dim_align) {
-      Tensor tensor = Downcast<Tensor>(op->node);
-      auto it = replace_op_.find(tensor->op.get());
-      if (it != replace_op_.end()) {
-        if (it->second.defined()) {
-          return AttrStmt(it->second.output(tensor->value_index), op->attr_key, op->value,
-                          this->VisitStmt(op->body));
-        } else {
-          return this->VisitStmt(op->body);
-        }
-      }
-    } else if (op->attr_key == tir::attr::layout_transforms ||
-               op->attr_key == tir::attr::axis_separators) {
-      auto arr = Downcast<Array<ObjectRef>>(op->node);
-      ICHECK_EQ(arr.size(), 2);
-
-      Stmt body = op->body;
-
-      Tensor tensor = Downcast<Tensor>(arr[0]);
-      auto it = replace_op_.find(tensor->op.get());
-      if (it != replace_op_.end()) {
-        if (it->second.defined()) {
-          return AttrStmt(Array<ObjectRef>{it->second.output(tensor->value_index), arr[1]},
-                          op->attr_key, op->value, this->VisitStmt(op->body));
-        } else {
-          return this->VisitStmt(op->body);
-        }
-      }
-    }
-    return StmtExprMutator::VisitStmt_(op);
-  }
-
-  Stmt VisitStmt_(const ProducerRealizeNode* op) final {
-    auto key = Downcast<Tensor>(op->producer);
-    auto it = replace_realize_.find(key);
-    if (it != replace_realize_.end()) {
-      if (it->second.defined()) {
-        Stmt ret =
-            ProducerRealize(it->second, op->bounds, op->condition, op->body, op->storage_scope);
-        return this->VisitStmt(ret);
-      } else {
-        return this->VisitStmt(op->body);
-      }
-    } else {
-      return StmtExprMutator::VisitStmt_(op);
-    }
-  }
-
-  Stmt VisitStmt_(const ProducerStoreNode* op) final {
-    auto key = Downcast<Tensor>(op->producer);
-    auto it = replace_buffer_.find(key);
-    if (it != replace_buffer_.end()) {
-      const Tensor& dst = it->second;
-      Stmt ret = ProducerStore(dst, op->value, op->indices);
-      return this->VisitStmt(ret);
-    } else {
-      return StmtExprMutator::VisitStmt_(op);
-    }
-  }
-
-  PrimExpr VisitExpr_(const ProducerLoadNode* op) final {
-    PrimExpr expr = StmtExprMutator::VisitExpr_(op);
-    op = expr.as<ProducerLoadNode>();
-    ICHECK(op != nullptr);
-
-    auto key = Downcast<Tensor>(op->producer);
-    auto it = replace_buffer_.find(key);
-    if (it != replace_buffer_.end()) {
-      const Tensor& dst = it->second;
-      return ProducerLoad(dst, op->indices);
-    } else {
-      return expr;
-    }
-  }
-
-  PrimExpr VisitExpr_(const VarNode* op) final {
-    auto it = var_value_.find(op);
-    if (it != var_value_.end()) {
-      return it->second;
-    } else {
-      return GetRef<PrimExpr>(op);
-    }
-  }
-
-  void Init(const Schedule& sch) {
-    for (Stage s : sch->stages) {
-      for (auto kv : s->iter_var_attrs) {
-        // Update bind thread information.
-        if (kv.second->bind_thread.defined()) {
-          const Var& from = kv.first->var;
-          const Var& to = kv.second->bind_thread->var;
-          ICHECK(!var_value_.count(from.get()));
-          var_value_[from.get()] = to;
-        }
-      }
-      // This must be checked for all ops, including scan.
-      if (!s->op.same_as(s->origin_op)) {
-        for (int i = 0; i < s->op->num_outputs(); ++i) {
-          Tensor target = s->origin_op.output(i);
-          AddReplace(s->op.output(i), target, target, s->origin_op);
-        }
-      }
-      // Specially add replacements for scan op.
-      if (const ScanOpNode* scan = s->op.as<ScanOpNode>()) {
-        for (size_t i = 0; i < scan->update.size(); ++i) {
-          Tensor t = s->origin_op.output(i);
-          AddReplace(scan->init[i], t);
-          AddReplace(scan->update[i], t);
-          AddReplace(scan->state_placeholder[i], t);
-        }
-      }
-    }
-  }
-
- private:
-  void AddReplace(Tensor src, Tensor dst, Tensor repl_realize = Tensor(),
-                  Operation repl_op = Operation()) {
-    replace_buffer_[src] = dst;
-    replace_realize_[src] = repl_realize;
-    replace_op_[src->op.get()] = repl_op;
-  }
-  // The thread extent scope.
-  std::unordered_map<const Object*, PrimExpr> thread_extent_scope_;
-  // The scan value
-  std::unordered_map<const VarNode*, PrimExpr> var_value_;
-  // buffer replacement
-  std::unordered_map<Tensor, Tensor> replace_buffer_;
-  // buffere realization to be replaced
-  std::unordered_map<Tensor, Tensor> replace_realize_;
-  // replace producer consumer.
-  std::unordered_map<const Object*, Operation> replace_op_;
-  // integer analyzer
-  arith::Analyzer analyzer_;
-};
-
-Stmt ScheduleOps(Schedule sch, Map<IterVar, Range> dom_map_, bool debug_keep_trivial_loop) {
-  Stmt body = Stmt();
-  std::unordered_map<IterVar, Range> dom_map = as_unordered_map(dom_map_);
-  // scan init and scan updates
-  std::unordered_map<Operation, Operation> scan_init;
-  for (Stage s : sch->stages) {
-    const ScanOpNode* scan = s->op.as<ScanOpNode>();
-    if (!scan) continue;
-    for (Tensor t : scan->init) {
-      if (scan_init.count(t->op)) {
-        ICHECK(scan_init.at(t->op).same_as(s->op))
-            << "Scan init tensor can only belong to one scan";
-      } else {
-        scan_init[t->op] = s->op;
-      }
-    }
-  }
-  // verify correctness of group.
-  for (Stage g : sch->groups) {
-    ICHECK(!g->op.defined());
-    ICHECK_EQ(g->leaf_iter_vars.size(), 0U);
-  }
-  // reverse the post DFS order.
-  for (size_t i = sch->stages.size(); i != 0; --i) {
-    Stage s = sch->stages[i - 1];
-    ICHECK_NE(s->attach_type, kInline) << "call schedule.normalize before scheduleops";
-    ICHECK(s->op.defined());
-    // Remove grouping sugar, get the real attach spec.
-    Stage attach_spec = s.GetAttachSpec();
-
-    if (s->op.as<PlaceholderOpNode>()) {
-      // Placeholders don't need any realize/provide statements, but
-      // may be annotated with set_physical_layout to indicate the
-      // physical layout of an input, and must still have the
-      // attribute given.
-      body = WrapLayoutTransformationAttrs(s, std::move(body));
-    } else if (scan_init.count(s->op)) {
-      ICHECK(body.defined());
-      InjectScanStep mu(s, scan_init.at(s->op), dom_map, true, debug_keep_trivial_loop);
-      body = mu(std::move(body));
-      ICHECK(mu.found_attach) << "did not find attachment point for scan.init";
-    } else if (attach_spec->attach_type == kScanUpdate) {
-      // Handle scan update
-      ICHECK(body.defined());
-      InjectScanStep mu(s, attach_spec->attach_stage->op, dom_map, false, debug_keep_trivial_loop);
-      body = mu(std::move(body));
-      ICHECK(mu.found_attach) << "did not find attachment point for scan.update";
-    } else if (attach_spec->attach_type == kInlinedAlready) {
-      // do nothing
-    } else if (attach_spec->attach_type == kGroupRoot) {
-      ICHECK(!s->group.defined());
-      body = MakePipeline(s, dom_map, body, debug_keep_trivial_loop);
-    } else {
-      ICHECK_EQ(attach_spec->attach_type, kScope);
-      ICHECK(body.defined());
-      InjectAttach mutator(s, attach_spec, dom_map, debug_keep_trivial_loop);
-      body = mutator(std::move(body));
-      ICHECK(mutator.found_attach)
-          << "did not find attachment point for " << s << " in " << attach_spec->attach_stage->op
-          << " x " << attach_spec->attach_ivar << ", body:\n"
-          << body;
-    }
-  }
-
-  SchedulePostProc post_proc;
-  post_proc.Init(sch);
-  return post_proc(std::move(body));
-}
-
-TVM_REGISTER_GLOBAL("schedule.ScheduleOps").set_body([](TVMArgs args, TVMRetValue* ret) {
-  if (args.size() == 2)
-    *ret = ScheduleOps(args[0], args[1], false);
-  else
-    *ret = ScheduleOps(args[0], args[1], args[2]);
-});
-
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/schedule/schedule_postproc_to_primfunc.cc b/src/te/schedule/schedule_postproc_to_primfunc.cc
deleted file mode 100644
index fc32af21cc31..000000000000
--- a/src/te/schedule/schedule_postproc_to_primfunc.cc
+++ /dev/null
@@ -1,447 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file schedule_postproc_to_primfunc.cc
- *
- * \brief Translate the function body generated by ScheduleOps
- *  with te related dialects that incorporates Tensor
- *  into the Stmts to a PrimFunc.
- *
- *  Perform this translation before running any TIR optimizations.
- *
- *  Rationale: The body generated by ScheduleOps is not
- *  a formal PrimFunc and cannot be used for further optimization.
- *  This function canonicalize that body and creates a formal PrimFunc.
- *
- *  List of actions taken by the function:
- *  - Remove occurrences of te::Tensor, te::Operation in the IR
- *    and replace them by corresponding IR nodes via tir::Buffer.
- *  - Add annotation of extern buffers using the buffer_map field
- *    in the PrimFunc type.
- */
-#include <tvm/runtime/registry.h>
-#include <tvm/te/operation.h>
-#include <tvm/tir/expr.h>
-#include <tvm/tir/function.h>
-#include <tvm/tir/stmt_functor.h>
-
-#include <functional>
-#include <unordered_map>
-#include <utility>
-
-namespace tvm {
-namespace te {
-
-// create a buffer for tensor.
-Buffer CreateBufferFor(const Tensor& tensor, String storage_scope = "") {
-  std::string name = tensor->op->name;
-  if (tensor->op->num_outputs() != 1) {
-    name += ".v" + std::to_string(tensor->value_index);
-  }
-  Buffer buffer = decl_buffer(tensor->shape, tensor->dtype, name, storage_scope);
-
-  return buffer;
-}
-
-// A remapper that maps tensor to buffer
-class TensorToBufferMapper : public StmtExprMutator {
- public:
-  explicit TensorToBufferMapper(std::unordered_map<Tensor, Buffer> buffer_map)
-      : buffer_map_(buffer_map) {}
-
-  Stmt VisitStmt_(const AttrStmtNode* op) final {
-    auto ret = StmtExprMutator::VisitStmt_(op);
-    op = ret.as<AttrStmtNode>();
-    if (op->attr_key == tir::attr::double_buffer_scope ||
-        op->attr_key == tir::attr::rolling_buffer_scope) {
-      Stmt body = op->body;
-      Operation operation = Downcast<Operation>(op->node);
-      for (int i = operation->num_outputs(); i != 0; --i) {
-        Buffer buffer = GetOrAllocBuffer(operation.output(i - 1));
-        body = AttrStmt(buffer, op->attr_key, op->value, body);
-      }
-      return body;
-    } else if (op->attr_key == tir::attr::buffer_bind_scope) {
-      Array<ObjectRef> tuple = Downcast<Array<ObjectRef>>(op->node);
-      Tensor tensor = Downcast<Tensor>(tuple[1]);
-      return AttrStmt(Array<ObjectRef>{tuple[0], GetOrAllocBuffer(tensor)}, op->attr_key, op->value,
-                      op->body);
-    } else if (op->attr_key == tir::attr::buffer_dim_align ||
-               op->attr_key == tir::attr::prefetch_scope) {
-      Tensor tensor = Downcast<Tensor>(op->node);
-      Buffer buffer = GetOrAllocBuffer(tensor);
-      return AttrStmt(buffer, op->attr_key, op->value, op->body);
-    } else if (op->attr_key == tir::attr::layout_transforms ||
-               op->attr_key == tir::attr::axis_separators) {
-      auto arr = Downcast<Array<ObjectRef>>(op->node);
-      ICHECK_EQ(arr.size(), 2);
-
-      Stmt body = op->body;
-
-      Tensor tensor = Downcast<Tensor>(arr[0]);
-      Buffer buffer = GetBuffer(tensor);
-
-      return AttrStmt(Array<ObjectRef>{buffer, arr[1]}, op->attr_key, 1, body);
-    } else {
-      return ret;
-    }
-  }
-
-  Stmt VisitStmt_(const ProducerRealizeNode* op) final {
-    Tensor tensor = Downcast<Tensor>(op->producer);
-    Buffer buffer = GetOrAllocBuffer(tensor, op->storage_scope);
-
-    auto ret = StmtExprMutator::VisitStmt_(op);
-    op = ret.as<ProducerRealizeNode>();
-
-    return BufferRealize(buffer, op->bounds, op->condition, op->body);
-  }
-
-  Stmt VisitStmt_(const ProducerStoreNode* op) final {
-    Tensor tensor = Downcast<Tensor>(op->producer);
-    Buffer buffer = GetBuffer(tensor);
-
-    auto ret = StmtExprMutator::VisitStmt_(op);
-    op = ret.as<ProducerStoreNode>();
-
-    return BufferStore(buffer, op->value, GetIndices(op->indices, buffer->shape));
-  }
-
-  PrimExpr VisitExpr_(const ProducerLoadNode* op) final {
-    auto ret = StmtExprMutator::VisitExpr_(op);
-    op = ret.as<ProducerLoadNode>();
-    Tensor tensor = Downcast<Tensor>(op->producer);
-    Buffer buffer = GetBuffer(tensor);
-    return tir::BufferLoad(buffer, GetIndices(op->indices, buffer->shape));
-  }
-
- private:
-  Buffer GetOrAllocBuffer(const Tensor& tensor, String storage_scope = "") {
-    return GetBuffer(tensor, storage_scope, true);
-  }
-
-  Buffer GetBuffer(const Tensor& tensor, String storage_scope = "", bool allow_alloc = false) {
-    auto it = buffer_map_.find(tensor);
-    if (it != buffer_map_.end()) return it->second;
-    ICHECK(allow_alloc) << "Cannot find the Realization point of tensor " << tensor;
-
-    auto buffer = CreateBufferFor(tensor, storage_scope);
-    buffer_map_[tensor] = buffer;
-    return buffer;
-  }
-
-  Array<PrimExpr> GetIndices(const Array<PrimExpr>& tensor_indices,
-                             const Array<PrimExpr>& buffer_shape) {
-    if (tensor_indices.size() == buffer_shape.size()) {
-      return tensor_indices;
-    } else if (tensor_indices.size() == 1) {
-      // Workaround to support previous behavior of tensor indexing by
-      // a single index, treating the tensor as if were already
-      // flattened by a row-major traversal.
-      PrimExpr unravel = tensor_indices[0];
-      Array<PrimExpr> rev_indices;
-      for (size_t i = buffer_shape.size(); i > 0; i--) {
-        PrimExpr dim = buffer_shape[i - 1];
-        rev_indices.push_back(indexmod(unravel, dim));
-        unravel = indexdiv(unravel, dim);
-      }
-      return Array<PrimExpr>(rev_indices.rbegin(), rev_indices.rend());
-    } else {
-      LOG(FATAL) << "Cannot produce indices for " << buffer_shape.size()
-                 << "-dimensional TIR buffer using " << tensor_indices.size()
-                 << "-dimensional tensor indices.";
-      return {};
-    }
-  }
-
-  // Maps tensor to buffer.
-  std::unordered_map<Tensor, Buffer> buffer_map_;
-};
-
-/*! Collect the physical layout map of all tensors in the statement. */
-class LayoutTransformAttrUnwrapper : StmtExprMutator {
- public:
-  static tir::PrimFunc Apply(tir::PrimFunc func) {
-    // Collect the physical layout annotations in the body, which may
-    // refer to input arguments.
-    auto layout_map = Collector::Collect(func->body);
-
-    if (layout_map.size()) {
-      func = WithAttr(std::move(func), "layout_transform_map", layout_map);
-
-      auto write_ptr = func.CopyOnWrite();
-      write_ptr->body = LayoutTransformAttrUnwrapper()(func->body);
-    }
-
-    return func;
-  }
-
-  LayoutTransformAttrUnwrapper() {}
-
-  Stmt VisitStmt_(const AttrStmtNode* op) final {
-    auto ret = StmtExprMutator::VisitStmt_(op);
-    op = ret.as<AttrStmtNode>();
-
-    if (op->attr_key == tir::attr::layout_transforms) {
-      return op->body;
-    } else {
-      return ret;
-    }
-  }
-
- private:
-  /*! Collect the physical layout information of all tensors in the statement.
-   *
-   * Must be done before constructing the buffers, since the
-   * attributes could either apply to the external buffers or to
-   * internal allocations.
-   */
-  class Collector : StmtExprVisitor {
-   public:
-    static Map<Buffer, Array<IndexMap>> Collect(Stmt stmt) {
-      Collector collector;
-      collector(std::move(stmt));
-      return std::move(collector.layout_map_);
-    }
-
-    Collector() {}
-
-    void VisitStmt_(const AttrStmtNode* op) final {
-      if (op->attr_key == tir::attr::layout_transforms) {
-        auto arr = Downcast<Array<ObjectRef>>(op->node);
-        ICHECK_EQ(arr.size(), 2);
-
-        auto buffer = Downcast<Buffer>(arr[0]);
-        auto layout_transforms = Downcast<Array<IndexMap>>(arr[1]);
-        layout_map_.Set(buffer, layout_transforms);
-      }
-      StmtExprVisitor::VisitStmt_(op);
-    }
-
-    Map<Buffer, Array<IndexMap>> layout_map_;
-  };
-
-  std::unordered_map<const BufferNode*, Buffer> buffer_remap_;
-
-  Map<Buffer, Array<IndexMap>> layout_map_;
-};
-
-/*! Move axis_separators from an attribute to a buffer property. */
-class AxisSeparatorsAttrUnwrapper : StmtExprMutator {
- public:
-  static tir::PrimFunc Apply(tir::PrimFunc func) {
-    // Collect the physical layout annotations in the body, which may
-    // refer to input arguments.
-    auto axis_separators_map = Collector::Collect(func->body);
-
-    if (axis_separators_map.size()) {
-      auto write_ptr = func.CopyOnWrite();
-      auto pass = AxisSeparatorsAttrUnwrapper(axis_separators_map);
-      write_ptr->buffer_map = pass.UpdateExternBufferMap(func->buffer_map);
-      write_ptr->body = pass(func->body);
-      if (auto map = func->attrs.GetAttr<Map<Buffer, Array<IndexMap>>>("layout_transform_map")) {
-        func = WithAttr(std::move(func), "layout_transform_map", pass.UpdateIndexMap(map.value()));
-      }
-    }
-
-    return func;
-  }
-
-  explicit AxisSeparatorsAttrUnwrapper(Map<Buffer, Array<IntImm>> axis_separators_map)
-      : axis_separators_map_(axis_separators_map) {}
-
-  Map<Var, Buffer> UpdateExternBufferMap(const Map<Var, Buffer>& orig) {
-    Map<Var, Buffer> output;
-    for (const auto& kv : orig) {
-      output.Set(kv.first, GetRemappedBuffer(kv.second));
-    }
-    return output;
-  }
-
-  Map<Buffer, Array<IndexMap>> UpdateIndexMap(const Map<Buffer, Array<IndexMap>>& orig) {
-    Map<Buffer, Array<IndexMap>> output;
-    for (const auto& kv : orig) {
-      output.Set(GetRemappedBuffer(kv.first), kv.second);
-    }
-    return output;
-  }
-
-  Stmt VisitStmt_(const AttrStmtNode* op) final {
-    auto ret = StmtExprMutator::VisitStmt_(op);
-    op = ret.as<AttrStmtNode>();
-
-    if (op->attr_key == tir::attr::axis_separators) {
-      return op->body;
-    } else if (op->attr_key == tir::attr::buffer_bind_scope) {
-      Array<ObjectRef> tuple = Downcast<Array<ObjectRef>>(op->node);
-      Buffer view_buffer = Downcast<Buffer>(tuple[0]);
-      Buffer source_buffer = Downcast<Buffer>(tuple[1]);
-      return AttrStmt(
-          Array<ObjectRef>{GetRemappedBuffer(view_buffer), GetRemappedBuffer(source_buffer)},
-          op->attr_key, op->value, op->body);
-    } else {
-      return ret;
-    }
-  }
-
-  Stmt VisitStmt_(const BufferRealizeNode* op) final {
-    auto node = Downcast<BufferRealize>(StmtExprMutator::VisitStmt_(op));
-    return VisitBufferAccess(std::move(node));
-  }
-
-  Stmt VisitStmt_(const BufferStoreNode* op) final {
-    auto node = Downcast<BufferStore>(StmtExprMutator::VisitStmt_(op));
-    return VisitBufferAccess(std::move(node));
-  }
-
-  PrimExpr VisitExpr_(const BufferLoadNode* op) final {
-    auto node = Downcast<BufferLoad>(StmtExprMutator::VisitExpr_(op));
-    return VisitBufferAccess(std::move(node));
-  }
-
- private:
-  template <typename Node>
-  Node VisitBufferAccess(Node node) {
-    Buffer new_buf = GetRemappedBuffer(node->buffer);
-    if (!node->buffer.same_as(new_buf)) {
-      auto writer = node.CopyOnWrite();
-      writer->buffer = new_buf;
-    }
-    return node;
-  }
-
-  Buffer GetRemappedBuffer(Buffer buf) {
-    // If this buffer has already been remapped, then return the
-    // previous value.
-    auto key = buf.get();
-    {
-      auto it = buffer_remap_.find(key);
-      if (it != buffer_remap_.end()) {
-        return it->second;
-      }
-    }
-
-    // Otherwise, check if we need to add axis_separators to this
-    // buffer.
-    auto lookup = axis_separators_map_.Get(buf);
-    if (lookup) {
-      Array<IntImm> axis_separators = lookup.value();
-      if (axis_separators.size()) {
-        auto write_ptr = buf.CopyOnWrite();
-        write_ptr->axis_separators = axis_separators;
-      }
-    }
-
-    // And cache the result for next time.
-    buffer_remap_[key] = buf;
-
-    return buf;
-  }
-
-  /*! Collect the axis separator information of all tensors in the statement.
-   *
-   * Must be done before constructing the buffers, since the
-   * attributes could either apply to the external buffers or to
-   * internal allocations.
-   */
-  class Collector : StmtExprVisitor {
-   public:
-    static Map<Buffer, Array<IntImm>> Collect(Stmt stmt) {
-      Collector collector;
-      collector(std::move(stmt));
-      return std::move(collector.axis_separators_map_);
-    }
-
-    Collector() {}
-
-    void VisitStmt_(const AttrStmtNode* op) final {
-      if (op->attr_key == tir::attr::axis_separators) {
-        auto arr = Downcast<Array<ObjectRef>>(op->node);
-        ICHECK_EQ(arr.size(), 2);
-
-        auto buffer = Downcast<Buffer>(arr[0]);
-        auto axis_separators = Downcast<Array<IntImm>>(arr[1]);
-        axis_separators_map_.Set(buffer, axis_separators);
-      }
-      StmtExprVisitor::VisitStmt_(op);
-    }
-
-    Map<Buffer, Array<IntImm>> axis_separators_map_;
-  };
-
-  std::unordered_map<const BufferNode*, Buffer> buffer_remap_;
-
-  Map<Buffer, Array<IntImm>> axis_separators_map_;
-};
-
-PrimFunc SchedulePostProcToPrimFunc(Array<ObjectRef> arg_list, Stmt body,
-                                    Optional<Map<Tensor, Buffer>> extern_buffer_opt) {
-  std::unordered_map<Tensor, Buffer> extern_tensor_map;
-
-  if (extern_buffer_opt.defined()) {
-    auto v = extern_buffer_opt.value();
-    extern_tensor_map = std::unordered_map<Tensor, Buffer>(v.begin(), v.end());
-  }
-
-  Array<tir::Var> params;
-  Map<tir::Var, tir::Buffer> buffer_map;
-
-  for (auto arg : arg_list) {
-    if (auto* n = arg.as<tir::VarNode>()) {
-      tir::Var var = GetRef<tir::Var>(n);
-      params.push_back(GetRef<tir::Var>(n));
-    } else if (auto* n = arg.as<te::TensorNode>()) {
-      te::Tensor tensor = GetRef<te::Tensor>(n);
-      ICHECK(!extern_tensor_map.count(tensor));
-
-      tir::Buffer buffer = CreateBufferFor(tensor);
-      tir::Var bptr(buffer->name, PrimType(DataType::Handle()));
-      params.push_back(bptr);
-      buffer_map.Set(bptr, buffer);
-      extern_tensor_map[tensor] = buffer;
-    } else if (auto* n = arg.as<tir::BufferNode>()) {
-      tir::Buffer buffer = GetRef<tir::Buffer>(n);
-      tir::Var bptr(buffer->name, PrimType(DataType::Handle()));
-      params.push_back(bptr);
-      buffer_map.Set(bptr, buffer);
-    } else {
-      LOG(FATAL) << "Expected argument to be Var, Tensor, or Buffer, but received "
-                 << arg->GetTypeKey();
-    }
-  }
-
-  body = TensorToBufferMapper(std::move(extern_tensor_map))(std::move(body));
-
-  PrimFunc func = tir::PrimFunc(params, body, VoidType(), buffer_map);
-
-  func = LayoutTransformAttrUnwrapper::Apply(std::move(func));
-  func = AxisSeparatorsAttrUnwrapper::Apply(std::move(func));
-
-  // We mark this PrimFunc as coming from a TE schedule
-  func = WithAttr(func, "from_legacy_te_schedule", Bool(true));
-
-  return func;
-}
-
-TVM_REGISTER_GLOBAL("schedule.SchedulePostProcToPrimFunc")
-    .set_body_typed(SchedulePostProcToPrimFunc);
-
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/schedule/verify_compact_buffer.cc b/src/te/schedule/verify_compact_buffer.cc
deleted file mode 100644
index 0089c36dc607..000000000000
--- a/src/te/schedule/verify_compact_buffer.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file verify_compact_buffer.cc
- * \brief Verify if there was any compact buffer bound to a statement.
- */
-#include <tvm/runtime/registry.h>
-#include <tvm/te/schedule_pass.h>
-#include <tvm/te/tensor.h>
-#include <tvm/tir/buffer.h>
-#include <tvm/tir/expr.h>
-#include <tvm/tir/stmt.h>
-#include <tvm/tir/stmt_functor.h>
-
-#include <unordered_map>
-
-namespace tvm {
-namespace te {
-
-class VerifyBuffer : public StmtVisitor {
- public:
-  bool Verify(const Stmt& stmt) {
-    this->VisitStmt(stmt);
-    return is_compact_;
-  }
-
-  void VisitStmt_(const AttrStmtNode* op) final {
-    StmtVisitor::VisitStmt_(op);
-    if (op->attr_key == tir::attr::buffer_bind_scope) {
-      is_compact_ = true;
-    }
-  }
-
- private:
-  bool is_compact_{false};
-};
-
-bool VerifyCompactBuffer(const Stmt& stmt) {
-  VerifyBuffer verifier;
-  return verifier.Verify(stmt);
-}
-
-TVM_REGISTER_GLOBAL("schedule.VerifyCompactBuffer").set_body_typed(VerifyCompactBuffer);
-
-}  // namespace te
-}  // namespace tvm
diff --git a/src/te/tensor.cc b/src/te/tensor.cc
index dc6dd88fc0d4..ef9e0e13f171 100644
--- a/src/te/tensor.cc
+++ b/src/te/tensor.cc
@@ -23,9 +23,6 @@
 #include <tvm/runtime/registry.h>
 #include <tvm/te/operation.h>
 #include <tvm/te/tensor.h>
-#include <tvm/te/tensor_intrin.h>
-
-#include <memory>
 
 namespace tvm {
 namespace te {
@@ -112,64 +109,6 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
       p->stream << "Tensor(shape=" << t->shape << ", op.name=" << t->op->name << ')';
     });
 
-// TensorIntrin
-TensorIntrin::TensorIntrin(std::string name, Operation op, Array<Tensor> inputs,
-                           Array<Buffer> buffers, Array<Var> scalar_params, Stmt body,
-                           Stmt reduce_init, Stmt reduce_update) {
-  auto n = make_object<TensorIntrinNode>();
-  n->name = std::move(name);
-  n->op = std::move(op);
-  n->inputs = std::move(inputs);
-  n->buffers = std::move(buffers);
-  n->scalar_params = std::move(scalar_params);
-  n->body = std::move(body);
-  n->reduce_init = std::move(reduce_init);
-  n->reduce_update = std::move(reduce_update);
-  data_ = std::move(n);
-}
-
-TVM_REGISTER_GLOBAL("te.TensorIntrin")
-    .set_body_typed([](std::string name, Operation op, Array<Tensor> inputs, Array<Buffer> buffers,
-                       Array<Var> scalar_params, Stmt body, Stmt reduce_init, Stmt reduce_update) {
-      return TensorIntrin(name, op, inputs, buffers, scalar_params, body, reduce_init,
-                          reduce_update);
-    });
-
-TVM_REGISTER_NODE_TYPE(TensorIntrinNode);
-
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<TensorIntrinNode>([](const ObjectRef& node, ReprPrinter* p) {
-      auto* op = static_cast<const TensorIntrinNode*>(node.get());
-      p->stream << "TensorIntrin(name=" << op->name << ", " << op << ")";
-    });
-
-// TensorIntrinCall
-TensorIntrinCall::TensorIntrinCall(TensorIntrin intrin, Array<Tensor> tensors,
-                                   Array<Region> regions, Array<IterVar> reduce_axis,
-                                   Array<PrimExpr> scalar_inputs) {
-  auto n = make_object<TensorIntrinCallNode>();
-  n->intrin = std::move(intrin);
-  n->tensors = std::move(tensors);
-  n->regions = std::move(regions);
-  n->reduce_axis = std::move(reduce_axis);
-  n->scalar_inputs = std::move(scalar_inputs);
-  data_ = std::move(n);
-}
-
-TVM_REGISTER_GLOBAL("te.TensorIntrinCall")
-    .set_body_typed([](TensorIntrin intrin, Array<Tensor> tensors, Array<Region> regions,
-                       Array<IterVar> reduce_axis, Array<PrimExpr> scalar_inputs) {
-      return TensorIntrinCall(intrin, tensors, regions, reduce_axis, scalar_inputs);
-    });
-
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<TensorIntrinCallNode>([](const ObjectRef& node, ReprPrinter* p) {
-      auto* n = static_cast<const TensorIntrinCallNode*>(node.get());
-      p->stream << "TensorIntrinCall(intrin=" << n->intrin << ", " << n << ")";
-    });
-
-TVM_REGISTER_NODE_TYPE(TensorIntrinCallNode);
-
 // Other tensor ops.
 TVM_REGISTER_GLOBAL("te.TensorEqual").set_body_method(&Tensor::operator==);
 
diff --git a/src/tir/transforms/storage_flatten.cc b/src/tir/transforms/storage_flatten.cc
index 06554f5f1dd1..87c6d48639f0 100644
--- a/src/tir/transforms/storage_flatten.cc
+++ b/src/tir/transforms/storage_flatten.cc
@@ -31,6 +31,7 @@
 #include <tvm/tir/buffer.h>
 #include <tvm/tir/builtin.h>
 #include <tvm/tir/expr.h>
+#include <tvm/tir/index_map.h>
 #include <tvm/tir/op.h>
 #include <tvm/tir/stmt.h>
 #include <tvm/tir/stmt_functor.h>
diff --git a/src/topi/schedule.cc b/src/topi/schedule.cc
deleted file mode 100644
index 0999f00ffd11..000000000000
--- a/src/topi/schedule.cc
+++ /dev/null
@@ -1,319 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \brief Registration of TVM schedules
- * \file schedule.cc
- */
-
-#include <tvm/ir/expr.h>
-#include <tvm/runtime/module.h>
-#include <tvm/runtime/packed_func.h>
-#include <tvm/runtime/registry.h>
-#include <tvm/target/generic_func.h>
-#include <tvm/topi/cuda/dense.h>
-#include <tvm/topi/cuda/injective.h>
-#include <tvm/topi/cuda/pooling.h>
-#include <tvm/topi/cuda/reduction.h>
-#include <tvm/topi/cuda/softmax.h>
-#include <tvm/topi/detail/tensor_utils.h>
-#include <tvm/topi/generic/default.h>
-#include <tvm/topi/generic/extern.h>
-#include <tvm/topi/generic/injective.h>
-#include <tvm/topi/rocm/dense.h>
-#include <tvm/topi/rocm/injective.h>
-#include <tvm/topi/rocm/pooling.h>
-#include <tvm/topi/rocm/reduction.h>
-#include <tvm/topi/rocm/softmax.h>
-#include <tvm/topi/x86/bnn.h>
-#include <tvm/topi/x86/default.h>
-#include <tvm/topi/x86/injective.h>
-
-namespace tvm {
-namespace topi {
-
-using namespace tvm;
-using namespace tvm::runtime;
-
-TVM_REGISTER_GLOBAL("topi.TEST_create_target").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = tvm::Target(args[0].operator String());
-});
-
-/* Generic schedules */
-TVM_REGISTER_GLOBAL("topi.generic.default_schedule").set_body([](TVMArgs args, TVMRetValue* rv) {
-  if (args[2]) {
-    *rv = topi::generic::default_schedule_auto_inline(args[0], args[1]);
-  } else {
-    *rv = topi::generic::default_schedule(args[0], args[1]);
-  }
-});
-
-TVM_REGISTER_GLOBAL("topi.generic.schedule_extern").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::generic::schedule_extern(args[0], args[1]);
-});
-
-TVM_REGISTER_GLOBAL("topi.generic.schedule_injective").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::generic::schedule_injective(args[0], args[1]);
-});
-
-TVM_REGISTER_GLOBAL("topi.generic.schedule_injective_from_existing")
-    .set_body([](TVMArgs args, TVMRetValue* rv) {
-      *rv = topi::generic::schedule_injective_from_existing(args[0], args[1]);
-    });
-
-/* x86 schedules */
-TVM_REGISTER_GLOBAL("topi.x86.schedule_binarize_pack").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::x86::schedule_binarize_pack(args[0], args[1]);
-});
-
-TVM_REGISTER_GLOBAL("topi.x86.schedule_binary_dense").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::x86::schedule_binary_dense(args[0], args[1]);
-});
-
-TVM_REGISTER_GLOBAL("topi.x86.default_schedule").set_body([](TVMArgs args, TVMRetValue* rv) {
-  if (args[2]) {
-    *rv = topi::x86::default_schedule_auto_inline(args[0], args[1]);
-  } else {
-    *rv = topi::x86::default_schedule(args[0], args[1]);
-  }
-});
-
-TVM_REGISTER_GLOBAL("topi.x86.schedule_injective").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::x86::schedule_injective(args[0], args[1]);
-});
-
-TVM_REGISTER_GLOBAL("topi.x86.schedule_injective_from_existing")
-    .set_body([](TVMArgs args, TVMRetValue* rv) {
-      *rv = topi::x86::schedule_injective_from_existing(args[0], args[1]);
-    });
-
-/* ROCm schedules */
-TVM_REGISTER_GLOBAL("topi.rocm.dense_cuda").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = rocm::dense_rocm(args[0], args[1], args[2], args[3], args[4]);
-});
-
-TVM_REGISTER_GLOBAL("topi.rocm.schedule_dense").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::rocm::schedule_dense(args[0], args[1]);
-});
-
-TVM_REGISTER_GLOBAL("topi.rocm.schedule_injective").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::rocm::schedule_injective(args[0], args[1]);
-});
-
-TVM_REGISTER_GLOBAL("topi.rocm.schedule_injective_from_existing")
-    .set_body([](TVMArgs args, TVMRetValue* rv) {
-      *rv = topi::rocm::schedule_injective_from_existing(args[0], args[1]);
-    });
-
-TVM_REGISTER_GLOBAL("topi.rocm.schedule_pool").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::rocm::schedule_pool(args[0], args[1]);
-});
-
-TVM_REGISTER_GLOBAL("topi.rocm.schedule_global_pool").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::rocm::schedule_global_pool(args[0], args[1]);
-});
-
-TVM_REGISTER_GLOBAL("topi.rocm.schedule_reduce").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::rocm::schedule_reduce(args[0], args[1]);
-});
-
-TVM_REGISTER_GLOBAL("topi.rocm.schedule_softmax").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::rocm::schedule_softmax(args[0], args[1]);
-});
-
-/* CUDA schedules */
-TVM_REGISTER_GLOBAL("topi.cuda.dense_cuda").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = cuda::dense_cuda(args[0], args[1], args[2], args[3], args[4]);
-});
-
-TVM_REGISTER_GLOBAL("topi.cuda.schedule_dense").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::cuda::schedule_dense(args[0], args[1]);
-});
-
-TVM_REGISTER_GLOBAL("topi.cuda.schedule_injective").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::cuda::schedule_injective(args[0], args[1]);
-});
-
-TVM_REGISTER_GLOBAL("topi.cuda.schedule_injective_from_existing")
-    .set_body([](TVMArgs args, TVMRetValue* rv) {
-      *rv = topi::cuda::schedule_injective_from_existing(args[0], args[1]);
-    });
-
-TVM_REGISTER_GLOBAL("topi.cuda.schedule_pool").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::cuda::schedule_pool(args[0], args[1]);
-});
-
-TVM_REGISTER_GLOBAL("topi.cuda.schedule_global_pool").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::cuda::schedule_global_pool(args[0], args[1]);
-});
-
-TVM_REGISTER_GLOBAL("topi.cuda.schedule_reduce").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::cuda::schedule_reduce(args[0], args[1]);
-});
-
-TVM_REGISTER_GLOBAL("topi.cuda.schedule_softmax").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::cuda::schedule_softmax(args[0], args[1]);
-});
-
-/* Utility functions */
-TVM_REGISTER_GLOBAL("topi.utils.is_empty_shape").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = topi::detail::is_empty_shape(args[0]);
-});
-
-TVM_REGISTER_GLOBAL("topi.utils.bilinear_sample_nchw").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = detail::bilinear_sample_nchw(args[0], args[1], args[2], args[3]);
-});
-
-TVM_REGISTER_GLOBAL("topi.utils.bilinear_sample_nhwc").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = detail::bilinear_sample_nhwc(args[0], args[1], args[2], args[3]);
-});
-
-/*! \brief Builder function for instantiating schedules. */
-using FTVMScheduleBuilder = std::function<tvm::te::Schedule(
-    const tvm::Target& target, const tvm::Array<tvm::te::Tensor>& outs)>;
-
-/*!
- * \brief Helper function for registering generic functions matching the
- * FTVMScheduleBuilder signature. The schedule builder function is wrapped
- * with a PackedFunc suitable for passing to a tvm::GenericFunc.
- *
- * \param builder The schedule builder to wrap.
- *
- * \return The wrapped schedule builder
- */
-inline PackedFunc WrapSchedule(FTVMScheduleBuilder builder) {
-  return PackedFunc([builder](TVMArgs args, TVMRetValue* ret) {
-    auto target = Target::Current(false);
-    Array<Tensor> outs;
-    ObjectRef argNodeRef = args[0];
-    if (argNodeRef->type_index() == outs->type_index()) {
-      outs = args[0];
-    } else {
-      outs = Array<Tensor>{args[0]};
-    }
-
-    *ret = builder(target, outs);
-  });
-}
-
-TVM_REGISTER_GENERIC_FUNC(schedule_injective)
-    .set_default(WrapSchedule(topi::generic::schedule_injective))
-    .register_func({"cpu"}, WrapSchedule(topi::x86::schedule_injective))
-    .register_func({"cuda", "gpu"}, WrapSchedule(topi::cuda::schedule_injective));
-
-TVM_REGISTER_GENERIC_FUNC(schedule_softmax)
-    .set_default(WrapSchedule(topi::generic::default_schedule))
-    .register_func({"cpu"}, WrapSchedule(topi::x86::default_schedule))
-    .register_func({"cuda", "gpu"}, WrapSchedule(topi::cuda::schedule_softmax));
-
-TVM_REGISTER_GENERIC_FUNC(schedule_dense)
-    .set_default(WrapSchedule(topi::generic::default_schedule))
-    .register_func({"cuda", "gpu"}, WrapSchedule(topi::cuda::schedule_dense))
-    .register_func({"rocm"}, WrapSchedule(topi::rocm::schedule_dense));
-
-TVM_REGISTER_GENERIC_FUNC(schedule_batch_matmul)
-    .set_default(WrapSchedule(topi::generic::default_schedule));
-
-TVM_REGISTER_GENERIC_FUNC(schedule_batch_norm)
-    .set_default(WrapSchedule(topi::generic::default_schedule));
-
-TVM_REGISTER_GENERIC_FUNC(schedule_pool)
-    .set_default(WrapSchedule(topi::generic::default_schedule))
-    .register_func({"cpu"}, WrapSchedule(topi::x86::default_schedule))
-    .register_func({"cuda", "gpu"}, WrapSchedule(topi::cuda::schedule_pool));
-
-TVM_REGISTER_GENERIC_FUNC(schedule_global_pool)
-    .set_default(WrapSchedule(topi::generic::default_schedule))
-    .register_func({"cpu"}, WrapSchedule(topi::x86::default_schedule))
-    .register_func({"cuda", "gpu"}, WrapSchedule(topi::cuda::schedule_global_pool));
-
-TVM_REGISTER_GENERIC_FUNC(schedule_reduce)
-    .set_default(WrapSchedule(topi::generic::default_schedule_auto_inline))
-    .register_func({"cpu"}, WrapSchedule(topi::x86::default_schedule_auto_inline))
-    .register_func({"cuda", "gpu"}, WrapSchedule(topi::cuda::schedule_reduce));
-
-TVM_REGISTER_GENERIC_FUNC(schedule_binarize_pack)
-    .set_default(WrapSchedule(topi::generic::default_schedule))
-    .register_func({"cpu"}, WrapSchedule(topi::x86::schedule_binarize_pack));
-
-TVM_REGISTER_GENERIC_FUNC(schedule_binary_dense)
-    .set_default(WrapSchedule(topi::generic::default_schedule))
-    .register_func({"cpu"}, WrapSchedule(topi::x86::schedule_binary_dense));
-
-/*! \brief Builder function for instantiating schedules from existing schedules. */
-using FTVMScheduleFromExistingBuilder =
-    std::function<tvm::te::Schedule(tvm::te::Schedule sch, const tvm::te::Tensor& out)>;
-
-/*!
- * \brief Helper function for registering generic functions matching the
- * FTVMScheduleFromExistingBuilder signature. The schedule builder function is wrapped
- * with a PackedFunc suitable for passing to a tvm::GenericFunc.
- *
- * \param builder The schedule builder to wrap.
- *
- * \return The wrapped schedule builder
- */
-inline PackedFunc WrapScheduleFromExisting(FTVMScheduleFromExistingBuilder builder) {
-  return PackedFunc(
-      [builder](TVMArgs args, TVMRetValue* ret) { *ret = builder(args[0], args[1]); });
-}
-
-TVM_REGISTER_GENERIC_FUNC(schedule_injective_from_existing)
-    .set_default(WrapScheduleFromExisting(topi::generic::schedule_injective_from_existing))
-    .register_func({"cpu"}, WrapScheduleFromExisting(topi::x86::schedule_injective_from_existing))
-    .register_func({"cuda", "gpu"},
-                   WrapScheduleFromExisting(topi::cuda::schedule_injective_from_existing));
-
-/*! \brief Builder function for instantiating dense ops. */
-using FTVMDenseOpBuilder = std::function<tvm::te::Tensor(
-    const Target& target, const tvm::te::Tensor& data, const tvm::te::Tensor& weight,
-    const tvm::te::Tensor& bias, const DataType& out_dtype)>;
-
-/*!
- * \brief Helper function for registering dense ops matching the
- * FTVMDenseOpBuilder signature. The op builder function is wrapped
- * with a PackedFunc suitable for passing to a tvm::GenericFunc.
- *
- * \param builder The op builder to wrap.
- *
- * \return The wrapped op builder
- */
-inline PackedFunc WrapDenseOp(FTVMDenseOpBuilder builder) {
-  return PackedFunc([builder](TVMArgs args, TVMRetValue* ret) {
-    auto target = Target::Current(false);
-    Tensor data = args[0];
-    Tensor weight = args[1];
-    Tensor bias = args[2];
-    DataType out_dtype = args[3];
-
-    *ret = builder(target, data, weight, bias, out_dtype);
-  });
-}
-
-TVM_REGISTER_GENERIC_FUNC(dense)
-    .set_default(WrapDenseOp([](const Target& target, const tvm::te::Tensor& data,
-                                const tvm::te::Tensor& weight, const tvm::te::Tensor& bias,
-                                const DataType& out_dtype) {
-      return topi::nn::dense(data, weight, bias, out_dtype);
-    }))
-    .register_func({"cuda", "gpu"}, WrapDenseOp(topi::cuda::dense_cuda))
-    .register_func({"rocm"}, WrapDenseOp(topi::rocm::dense_rocm));
-
-}  // namespace topi
-}  // namespace tvm
diff --git a/include/tvm/topi/rocm/reduction.h b/src/topi/utils.cc
similarity index 54%
rename from include/tvm/topi/rocm/reduction.h
rename to src/topi/utils.cc
index 7beda177ace8..94a5bf2dac5b 100644
--- a/include/tvm/topi/rocm/reduction.h
+++ b/src/topi/utils.cc
@@ -18,37 +18,27 @@
  */
 
 /*!
- * \file rocm/reduction.h
- * \brief rocm schedule for reduction operations
+ * \brief Registration of utils operators
+ * \file utils.cc
  */
-#ifndef TVM_TOPI_ROCM_REDUCTION_H_
-#define TVM_TOPI_ROCM_REDUCTION_H_
 
-#include <tvm/target/generic_func.h>
-#include <tvm/te/operation.h>
-#include <tvm/topi/cuda/reduction.h>
-#include <tvm/topi/detail/fuse.h>
-#include <tvm/topi/tags.h>
+#include <tvm/runtime/packed_func.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/topi/detail/tensor_utils.h>
 
 namespace tvm {
 namespace topi {
+TVM_REGISTER_GLOBAL("topi.utils.is_empty_shape").set_body([](TVMArgs args, TVMRetValue* rv) {
+  *rv = topi::detail::is_empty_shape(args[0]);
+});
 
-using namespace tvm::te;
+TVM_REGISTER_GLOBAL("topi.utils.bilinear_sample_nchw").set_body([](TVMArgs args, TVMRetValue* rv) {
+  *rv = detail::bilinear_sample_nchw(args[0], args[1], args[2], args[3]);
+});
 
-namespace rocm {
-/*!
- * \brief Create a rocm schedule for a reduce operation.
- *
- * \param target The target to generate a schedule for.
- * \param outs The output tensors.
- *
- * \return A schedule for the given ops.
- */
-Schedule schedule_reduce(const Target& target, Array<Tensor> outs) {
-  return topi::cuda::schedule_reduce(target, outs);
-}
+TVM_REGISTER_GLOBAL("topi.utils.bilinear_sample_nhwc").set_body([](TVMArgs args, TVMRetValue* rv) {
+  *rv = detail::bilinear_sample_nhwc(args[0], args[1], args[2], args[3]);
+});
 
-}  // namespace rocm
 }  // namespace topi
 }  // namespace tvm
-#endif  // TVM_TOPI_ROCM_REDUCTION_H_
diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc
deleted file mode 100644
index cedc9b62701d..000000000000
--- a/tests/cpp/build_module_test.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <dmlc/logging.h>
-#include <gtest/gtest.h>
-#include <tvm/driver/driver_api.h>
-#include <tvm/runtime/registry.h>
-#include <tvm/te/operation.h>
-#include <tvm/topi/cuda/injective.h>
-
-#include <cmath>
-#include <string>
-
-TEST(BuildModule, Basic) {
-  using namespace tvm;
-  using namespace tvm::te;
-  auto n = var("n");
-  Array<PrimExpr> shape;
-  shape.push_back(n);
-
-  auto A = placeholder(shape, DataType::Float(32), "A");
-  auto B = placeholder(shape, DataType::Float(32), "B");
-
-  auto C = compute(
-      A->shape, [&A, &B](PrimExpr i) { return A[i] + B[i]; }, "C");
-
-  auto s = create_schedule({C->op});
-
-  auto cAxis = C->op.as<ComputeOpNode>()->axis;
-
-  IterVar bx, tx;
-  s[C].split(cAxis[0], 64, &bx, &tx);
-
-  auto args = Array<Tensor>({A, B, C});
-  std::unordered_map<Tensor, Buffer> binds;
-
-  auto target = Target("llvm");
-
-  auto lowered = LowerSchedule(s, args, "func", binds, GlobalVarSupply());
-  auto module = build(lowered, target, Target());
-
-  auto mali_target = Target("opencl -model=Mali-T860MP4@800Mhz -device=mali");
-  ICHECK_EQ(mali_target->kind->name, "opencl");
-  ICHECK_EQ(mali_target->keys.size(), 3);
-  ICHECK_EQ(mali_target->keys[0], "mali");
-  ICHECK_EQ(mali_target->keys[1], "opencl");
-  ICHECK_EQ(mali_target->keys[2], "gpu");
-  ICHECK_EQ(mali_target->GetAttr<String>("device").value(), "mali");
-  ICHECK_EQ(mali_target->GetAttr<String>("model").value(), "Mali-T860MP4@800Mhz");
-  ICHECK_EQ(mali_target->GetAttr<Integer>("max_num_threads").value(), 256);
-}

From 93d086f69410a13b850d0c9761f6bb9ec9abdc89 Mon Sep 17 00:00:00 2001
From: Siyuan Feng <hzfengsy@sjtu.edu.cn>
Date: Mon, 17 Feb 2025 14:21:44 +0800
Subject: [PATCH 02/10] remove vitis ai

---
 CMakeLists.txt                                |   2 -
 cmake/config.cmake                            |   9 -
 cmake/modules/LibInfo.cmake                   |   1 -
 cmake/modules/contrib/VitisAI.cmake           |  47 ----
 docker/Dockerfile.ci_cpu                      |   4 -
 docker/Dockerfile.demo_vitis_ai               |  59 -----
 docker/bash.sh                                |  21 --
 .../install/ubuntu_install_vitis_ai_core.sh   |  39 ----
 python/gen_requirements.py                    |  11 -
 python/tvm/contrib/target/vitis_ai.py         | 220 ------------------
 python/tvm/testing/utils.py                   |   4 -
 rust/tvm-rt/Cargo.toml                        |   1 -
 rust/tvm-sys/Cargo.toml                       |   1 -
 rust/tvm-sys/build.rs                         |   3 -
 .../contrib/vitis_ai/vitis_ai_runtime.cc      | 195 ----------------
 .../contrib/vitis_ai/vitis_ai_runtime.h       | 120 ----------
 src/support/libinfo.cc                        |   1 -
 tests/scripts/task_config_build_cpu.sh        |   1 -
 18 files changed, 739 deletions(-)
 delete mode 100644 cmake/modules/contrib/VitisAI.cmake
 delete mode 100644 docker/Dockerfile.demo_vitis_ai
 delete mode 100755 docker/install/ubuntu_install_vitis_ai_core.sh
 delete mode 100644 python/tvm/contrib/target/vitis_ai.py
 delete mode 100755 src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc
 delete mode 100755 src/runtime/contrib/vitis_ai/vitis_ai_runtime.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cb8039a26414..5a7e1241d67f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -116,7 +116,6 @@ tvm_option(USE_TENSORRT_RUNTIME "Build with TensorRT runtime" OFF)
 tvm_option(USE_NNAPI_CODEGEN "Build with NNAPI Codegen support" OFF)
 tvm_option(USE_NNAPI_RUNTIME "Build with NNAPI runtime" OFF)
 tvm_option(USE_RUST_EXT "Build with Rust based compiler extensions, STATIC, DYNAMIC, or OFF" OFF)
-tvm_option(USE_VITIS_AI "Build with VITIS-AI Codegen support" OFF)
 tvm_option(SUMMARIZE "Print CMake option summary after configuring" OFF)
 tvm_option(USE_CLML "Build with CLML Codegen support" OFF)
 tvm_option(USE_CLML_GRAPH_EXECUTOR "Build with CLML graph runtime" OFF)
@@ -500,7 +499,6 @@ include(cmake/modules/contrib/ONNX.cmake)
 include(cmake/modules/contrib/ArmComputeLib.cmake)
 include(cmake/modules/contrib/TensorRT.cmake)
 include(cmake/modules/contrib/NNAPI.cmake)
-include(cmake/modules/contrib/VitisAI.cmake)
 include(cmake/modules/contrib/Verilator.cmake)
 include(cmake/modules/contrib/UMA.cmake)
 include(cmake/modules/contrib/MSC.cmake)
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 791751ac9885..e5f85380d29e 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -138,12 +138,6 @@ set(USE_IOS_RPC OFF)
 # Whether embed stackvm into the runtime
 set(USE_STACKVM_RUNTIME OFF)
 
-# Whether enable tiny embedded graph executor.
-set(USE_GRAPH_EXECUTOR ON)
-
-# Whether enable tiny graph executor with CUDA Graph
-set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF)
-
 # Whether enable pipeline executor.
 set(USE_PIPELINE_EXECUTOR OFF)
 
@@ -285,9 +279,6 @@ set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF)
 set(USE_TENSORRT_CODEGEN OFF)
 set(USE_TENSORRT_RUNTIME OFF)
 
-# Whether use VITIS-AI codegen
-set(USE_VITIS_AI OFF)
-
 # Build Verilator codegen and runtime
 set(USE_VERILATOR OFF)
 
diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake
index cfea9b20d84a..70ae86fc7925 100644
--- a/cmake/modules/LibInfo.cmake
+++ b/cmake/modules/LibInfo.cmake
@@ -123,7 +123,6 @@ function(add_lib_info src_file)
     TVM_INFO_USE_THREADS="${USE_THREADS}"
     TVM_INFO_USE_THRUST="${USE_THRUST}"
     TVM_INFO_USE_CURAND="${USE_CURAND}"
-    TVM_INFO_USE_VITIS_AI="${USE_VITIS_AI}"
     TVM_INFO_USE_VULKAN="${USE_VULKAN}"
     TVM_INFO_USE_CLML="${USE_CLML}"
     TVM_INFO_USE_CLML_GRAPH_EXECUTOR="${USE_CLML_GRAPH_EXECUTOR}"
diff --git a/cmake/modules/contrib/VitisAI.cmake b/cmake/modules/contrib/VitisAI.cmake
deleted file mode 100644
index 5fd07dc2e97d..000000000000
--- a/cmake/modules/contrib/VitisAI.cmake
+++ /dev/null
@@ -1,47 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-if(USE_VITIS_AI)
-  set(PYXIR_SHARED_LIB libpyxir.so)
-  find_package(PythonInterp 3.7 REQUIRED)
-  if(NOT PYTHON)
-    find_program(PYTHON NAMES python3 python3.8)
-  endif()
-  execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
-    "import pyxir as px; print(px.get_include_dir()); print(px.get_lib_dir());"
-    RESULT_VARIABLE __result
-    OUTPUT_VARIABLE __output
-    OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-  if(__result MATCHES 0)
-    string(REGEX REPLACE ";" "\\\\;" __values ${__output})
-    string(REGEX REPLACE "\r?\n" ";"    __values ${__values})
-    list(GET __values 0 PYXIR_INCLUDE_DIR)
-    list(GET __values 1 PYXIR_LIB_DIR)
-  else()
-    message(FATAL_ERROR "Can't build TVM with Vitis-AI because PyXIR can't be found")
-  endif()
-  message(STATUS "Build with contrib.vitisai")
-  include_directories(${PYXIR_INCLUDE_DIR})
-  tvm_file_glob(GLOB VAI_CONTRIB_SRC src/runtime/contrib/vitis_ai/*.cc)
-  tvm_file_glob(GLOB COMPILER_VITIS_AI_SRCS
-                src/relay/backend/contrib/vitis_ai/*)
-  list(APPEND COMPILER_SRCS ${COMPILER_VITIS_AI_SRCS})
-  link_directories(${PYXIR_LIB_DIR})
-  list(APPEND TVM_RUNTIME_LINKER_LIBS "pyxir")
-  list(APPEND RUNTIME_SRCS ${VAI_CONTRIB_SRC})
-endif(USE_VITIS_AI)
diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index d41a7e6e1b13..5ba1dd721435 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -101,10 +101,6 @@ RUN bash /install/ubuntu_install_jax.sh "cpu"
 COPY install/ubuntu_download_arm_compute_lib_binaries.sh /install/ubuntu_download_arm_compute_lib_binaries.sh
 RUN bash /install/ubuntu_download_arm_compute_lib_binaries.sh
 
-# Vitis-AI PyXIR CI deps
-COPY install/ubuntu_install_vitis_ai_packages_ci.sh /install/ubuntu_install_vitis_ai_packages_ci.sh
-RUN bash /install/ubuntu_install_vitis_ai_packages_ci.sh
-
 # PaddlePaddle deps
 COPY install/ubuntu_install_paddle.sh /install/ubuntu_install_paddle.sh
 RUN bash /install/ubuntu_install_paddle.sh
diff --git a/docker/Dockerfile.demo_vitis_ai b/docker/Dockerfile.demo_vitis_ai
deleted file mode 100644
index 8cafc653fb6e..000000000000
--- a/docker/Dockerfile.demo_vitis_ai
+++ /dev/null
@@ -1,59 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Main Vitis AI docker env
-FROM xilinx/vitis-ai:1.4.916
-
-COPY utils/apt-install-and-clear.sh /usr/local/bin/apt-install-and-clear
-
-RUN apt-get update --fix-missing
-
-COPY install/ubuntu_setup_tz.sh /install/ubuntu_setup_tz.sh
-RUN bash /install/ubuntu_setup_tz.sh
-
-COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
-RUN bash /install/ubuntu_install_core.sh
-
-# Install Vitis-AI ubuntu dependencies
-COPY install/ubuntu_install_vitis_ai_core.sh /install/ubuntu_install_vitis_ai_core.sh
-RUN bash /install/ubuntu_install_vitis_ai_core.sh
-
-ENV TVM_VENV /venv/apache-tvm-py3.9
-COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles
-COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
-RUN bash /install/ubuntu_install_python.sh 3.9
-ENV PATH ${TVM_VENV}/bin:$PATH
-ENV PYTHONNOUSERSITE 1  # Disable .local directory from affecting CI.
-
-COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
-RUN bash /install/ubuntu_install_python_package.sh
-
-COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh
-RUN bash /install/ubuntu_install_llvm.sh
-
-ENV PATH $PATH:$CARGO_HOME/bin:/usr/lib/go-1.10/bin
-
-# ANTLR deps
-COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh
-RUN bash /install/ubuntu_install_java.sh
-
-# Install dependencies inside vitis-ai-tensorflow conda
-RUN . $VAI_ROOT/conda/etc/profile.d/conda.sh && \
-    conda activate vitis-ai-tensorflow && \
-    pip install --no-cache-dir antlr4-python3-runtime && bash /install/ubuntu_install_python_package.sh
-
-ENV USER="root"
diff --git a/docker/bash.sh b/docker/bash.sh
index f83becbf6148..5303aadb944d 100755
--- a/docker/bash.sh
+++ b/docker/bash.sh
@@ -460,27 +460,6 @@ if [[ "${DOCKER_IMAGE_NAME}" == *"ci"* ]]; then
     DOCKER_ENV+=( --env PYTHONPATH="${REPO_MOUNT_POINT}"/python )
 fi
 
-
-
-# If the Vitis-AI docker image is selected, expose the Xilinx FPGA
-# devices and required volumes containing e.g. DSA's and overlays
-if [[ "${DOCKER_IMAGE_NAME}" == *"demo_vitis_ai"* && -d "/dev/shm" && -d "/opt/xilinx/dsa" && -d "/opt/xilinx/overlaybins" ]]; then
-    DOCKER_MOUNT+=( --volume /dev/shm:/dev/shm
-                    --volume /opt/xilinx/dsa:/opt/xilinx/dsa
-                    --volume /opt/xilinx/overlaybins:/opt/xilinx/overlaybins
-                  )
-
-    XCLMGMT_DRIVER="$(find /dev -name xclmgmt\*)"
-    for DRIVER in "${XCLMGMT_DRIVER}"; do
-       DOCKER_DEVICES+=( --device="${DRIVER}" )
-    done
-
-    RENDER_DRIVER="$(find /dev/dri -name renderD\*)"
-    for DRIVER in "${RENDER_DRIVER}"; do
-        DOCKER_DEVICES+=( --device="${DRIVER}" )
-    done
-fi
-
 # Add ROCm devices and set ROCM_ENABLED=1 which is used in the with_the_same_user script
 # to add the user to the video group
 if [[ "${DOCKER_IMAGE_NAME}" == *"rocm"* && -d "/dev/dri" ]]; then
diff --git a/docker/install/ubuntu_install_vitis_ai_core.sh b/docker/install/ubuntu_install_vitis_ai_core.sh
deleted file mode 100755
index 2e395b45daaa..000000000000
--- a/docker/install/ubuntu_install_vitis_ai_core.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-set -u
-set -o pipefail
-
-export PYXIR_HOME=/opt/pyxir
-mkdir "$PYXIR_HOME"
-
-# install libraries for building Vitis-AI on ubuntu
-apt-get update && apt-install-and-clear -y \
-    graphviz \
-    gnupg2 \
-    gpg-agent \
-    gcc-aarch64-linux-gnu
-
-
-. $VAI_ROOT/conda/etc/profile.d/conda.sh
-conda activate vitis-ai-tensorflow
-pip3 install progressbar h5py==2.10.0
-
-git clone --recursive --branch rel-v0.3.1 --depth 1 https://github.com/Xilinx/pyxir.git "${PYXIR_HOME}"
-cd "${PYXIR_HOME}" && python3 setup.py install --use_vart_cloud_dpu --use_dpuczdx8g_vart
diff --git a/python/gen_requirements.py b/python/gen_requirements.py
index d22544db484b..aafa35d08fe0 100644
--- a/python/gen_requirements.py
+++ b/python/gen_requirements.py
@@ -170,17 +170,6 @@
             ],
         ),
     ),
-    # Vitis AI requirements
-    (
-        "vitis-ai",
-        (
-            "Requirements for the Vitis AI codegen",
-            [
-                "h5py",
-                "progressbar",
-            ],
-        ),
-    ),
     # XGBoost, useful for autotuning on some targets.
     (
         "xgboost",
diff --git a/python/tvm/contrib/target/vitis_ai.py b/python/tvm/contrib/target/vitis_ai.py
deleted file mode 100644
index 1ab52ed724b9..000000000000
--- a/python/tvm/contrib/target/vitis_ai.py
+++ /dev/null
@@ -1,220 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, import-outside-toplevel
-
-"""Utility to offload (sub-)models to Vitis-AI"""
-
-import warnings
-import importlib
-
-from tvm.relay.expr import Tuple, Call, TupleGetItem
-import tvm._ffi
-
-# Placeholder for PyXIR module
-pyxir = None
-
-
-def vitis_ai_available():
-    """Return whether Vitis AI tools are available"""
-    pyxir_spec = importlib.util.find_spec("pyxir")
-    if not tvm.get_global_func("tvm.vitis_ai_runtime.from_xgraph", True) or pyxir_spec is None:
-        return False
-    return True
-
-
-class CodegenVitisAI:
-
-    """Traverse Relay expression and convert into PyXIR XGraph format
-
-    Parameters
-    ----------
-    function : Function
-        The Relay function
-    dpu_target : str
-        The Vitis AI DPU target identifier
-    """
-
-    def __init__(self, function, dpu_target):
-        global pyxir
-        try:
-            if pyxir is None:
-                pyxir = __import__("pyxir")
-                __import__("pyxir.frontend.tvm")
-        except ImportError:
-            # add "from None" to silence
-            # "During handling of the above exception, another exception occurred"
-            raise ImportError(
-                "The pyxir package is required for the Vitis AI backend. "
-                "Please install it first. "
-                "Help: (https://tvm.apache.org/docs/deploy/vitis_ai.html) "
-            ) from None
-
-        self.function = function
-        self.dpu_target = dpu_target
-        self.params = {}
-
-    def build(self):
-        """ "Convert the Relay expression to a PyXIR XGraph to instantiate
-        the Vitis AI runtime
-
-        Returns
-        -------
-        xgraph_str : str
-            Serialized XGraph
-        """
-        xgraph = pyxir.frontend.tvm.from_relay(
-            self.function, params=self.params, postprocessing=None
-        )
-        xgraph = pyxir.partition(xgraph, targets=[self.dpu_target])
-        output_relay_ids = self.get_output_names()
-        layers = xgraph.get_layers()
-
-        # Get the output tensor names using XGraph and output Relay ids
-        out_tensor_names = ["unknown_name"] * len(output_relay_ids)
-        for layer in layers:
-            if not layer.internal:
-                for relay_id in layer.attrs["relay_id"]:
-                    if relay_id in output_relay_ids:
-                        out_tensor_names[output_relay_ids.index(relay_id)] = layer.name
-                        break
-        if any([name == "unkown_name" for name in out_tensor_names]):
-            raise ValueError(
-                "During codegeneration the loading of subexpression"
-                " failed due to output tensor name mismatch in Relay PyXIR interface."
-            )
-        xgraph.meta_attrs["tvm_out_tensors"] = out_tensor_names
-        xgraph_str = pyxir.get_xgraph_str(xgraph)
-        return xgraph_str
-
-    def get_output_names(self):
-        """Get output names from Relay expression"""
-        func = self.function
-        output_relay_ids = []
-        expr = func.body
-        if isinstance(expr, Tuple):
-            for field in expr.fields:
-                output_relay_ids.append(hash(field))
-        elif isinstance(expr, Call):
-            output_relay_ids.append(hash(expr))
-        elif isinstance(expr, TupleGetItem):
-            output_relay_ids.append(hash(expr.tuple_value))
-        else:
-            raise ValueError(f"Vitis-AI codegen does not support {type(expr)} as output")
-        return output_relay_ids
-
-
-@tvm._ffi.register_func("relay.ext.vitis_ai")
-def vitis_ai_compiler(ref):
-    """Create a Vitis-AI runtime from the provided Relay expression"""
-    assert isinstance(ref, tvm.relay.function.Function)
-
-    name = str(ref.attrs.global_symbol)
-
-    pass_context = tvm.get_global_func("transform.GetCurrentPassContext")()
-
-    cfg = (
-        pass_context.config["relay.ext.vitis_ai.options"]
-        if "relay.ext.vitis_ai.options" in pass_context.config
-        else None
-    )
-
-    # Backward compatibility with old pass context configs
-    if cfg is None:
-        warnings.warn(
-            "You are using a deprecated way of passing build configs (e.g."
-            " `relay.ext.vitis_ai.options.target`). Check out the Vitis AI "
-            " documentation here: https://tvm.apache.org/docs/deploy/vitis_ai.html"
-            " to switch to recommended way for passing build configs."
-        )
-
-        # The target Vitis-AI accelerator device
-        dpu_target = (
-            str(pass_context.config["relay.ext.vitis_ai.options.target"])
-            if "relay.ext.vitis_ai.options.target" in pass_context.config
-            else None
-        )
-
-        # (Optional configs) The build and work directories to be used by Vitis-AI
-        vai_build_dir = (
-            str(pass_context.config["relay.ext.vitis_ai.options.build_dir"])
-            if "relay.ext.vitis_ai.options.build_dir" in pass_context.config
-            else tvm.contrib.utils.tempdir().relpath("")
-        )
-        vai_work_dir = (
-            str(pass_context.config["relay.ext.vitis_ai.options.work_dir"])
-            if "relay.ext.vitis_ai.options.work_dir" in pass_context.config
-            else tvm.contrib.utils.tempdir().relpath("")
-        )
-
-        # (Optional configs) Export and load PyXIR runtime module to file if provided. This is
-        #   used to compile and quantize a model on the host and deploy it at the edge
-        export_runtime_module = (
-            str(pass_context.config["relay.ext.vitis_ai.options.export_runtime_module"])
-            if "relay.ext.vitis_ai.options.export_runtime_module" in pass_context.config
-            else ""
-        )
-        load_runtime_module = (
-            str(pass_context.config["relay.ext.vitis_ai.options.load_runtime_module"])
-            if "relay.ext.vitis_ai.options.load_runtime_module" in pass_context.config
-            else ""
-        )
-    else:
-        dpu_target = cfg.dpu if cfg.dpu else None
-        # (Optional configs) The build and work directories to be used by Vitis AI
-        vai_build_dir = cfg.build_dir if cfg.build_dir else tvm.contrib.utils.tempdir().relpath("")
-
-        # (Optional configs) Export and load PyXIR runtime module to file if provided. This is
-        #   used to compile and quantize a model on the host and deploy it at the edge
-        vai_work_dir = cfg.work_dir if cfg.work_dir else tvm.contrib.utils.tempdir().relpath("")
-        export_runtime_module = cfg.export_runtime_module
-        load_runtime_module = cfg.load_runtime_module
-
-    # Config checks
-    if load_runtime_module and dpu_target is not None:
-        warnings.warn(
-            "Both `load_runtime_module` and `dpu` configs were specified."
-            " The `load_runtime_module` points to a prebuilt runtime module with"
-            " an internal DPU target so the `dpu` config will be ignored"
-        )
-    if load_runtime_module and "relay.ext.vitis_ai.options.build_dir" in pass_context.config:
-        warnings.warn(
-            "Both `load_runtime_module` and `build_dir` configs were specified."
-            " The `load_runtime_module` points to a prebuilt runtime module with"
-            " an internal build directory so the `build_dir` config will be ignored"
-        )
-    if load_runtime_module and "relay.ext.vitis_ai.options.work_dir" in pass_context.config:
-        warnings.warn(
-            "Both `load_runtime_module` and `work_dir` configs were specified."
-            " The `load_runtime_module` points to a prebuilt runtime module with"
-            " an internal work directory so the `work_dir` config will be ignored"
-        )
-
-    # If load_runtime_module is not set, we will build the PyXIR runtime module from scratch
-    if load_runtime_module == "":
-        # Convert Relay expression into XGraph and do partitioning inside PyXIR
-        codegen = CodegenVitisAI(ref, dpu_target)
-        xgraph_str = codegen.build()
-
-        runtime_func = "tvm.vitis_ai_runtime.from_xgraph"
-        fcreate = tvm._ffi.get_global_func(runtime_func)
-        return fcreate(
-            name, xgraph_str, dpu_target, vai_build_dir, vai_work_dir, export_runtime_module
-        )
-
-    runtime_func = "tvm.vitis_ai_runtime.from_rt_mod"
-    fcreate = tvm._ffi.get_global_func(runtime_func)
-    return fcreate(name, load_runtime_module, export_runtime_module)
diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index b3123a20d3e9..b01f7bf5dc89 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -1022,10 +1022,6 @@ def _aprofile_aem_fvp_compile_time_check():
     compile_time_check=_aprofile_aem_fvp_compile_time_check,
 )
 
-# Mark a test as requiring Vitis AI to run
-requires_vitis_ai = Feature("vitis_ai", "Vitis AI", cmake_flag="USE_VITIS_AI")
-
-
 # check cpu features
 def _has_cpu_feat(features):
     cpu = codegen.llvm_get_system_cpu()
diff --git a/rust/tvm-rt/Cargo.toml b/rust/tvm-rt/Cargo.toml
index 789c15a6be80..cb8c560c3efa 100644
--- a/rust/tvm-rt/Cargo.toml
+++ b/rust/tvm-rt/Cargo.toml
@@ -74,7 +74,6 @@ use-arm-compute-lib = ["tvm-sys/use-arm-compute-lib"]
 use-arm-compute-lib-graph-runtime = ["tvm-sys/use-arm-compute-lib-graph-runtime"]
 use-tensorrt-codegen = ["tvm-sys/use-tensorrt-codegen"]
 use-tensorrt-runtime = ["tvm-sys/use-tensorrt-runtime"]
-use-vitis-ai = ["tvm-sys/use-vitis-ai"]
 build-static-runtime = ["tvm-sys/build-static-runtime"]
 
 [dependencies]
diff --git a/rust/tvm-sys/Cargo.toml b/rust/tvm-sys/Cargo.toml
index 70daf3b388e4..03e1d4e13d55 100644
--- a/rust/tvm-sys/Cargo.toml
+++ b/rust/tvm-sys/Cargo.toml
@@ -67,7 +67,6 @@ use-arm-compute-lib = []
 use-arm-compute-lib-graph-runtime = []
 use-tensorrt-codegen = []
 use-tensorrt-runtime = []
-use-vitis-ai = []
 build-static-runtime = []
 
 [dependencies]
diff --git a/rust/tvm-sys/build.rs b/rust/tvm-sys/build.rs
index eb2c1ee3a21b..3b19f56fb1d2 100644
--- a/rust/tvm-sys/build.rs
+++ b/rust/tvm-sys/build.rs
@@ -186,9 +186,6 @@ fn find_using_tvm_build() -> Result<TVMInstall> {
     if cfg!(feature = "use-tensorrt-runtime") {
         build_config.settings.use_tensorrt_runtime = CMakeSetting::from_str("on").ok();
     }
-    if cfg!(feature = "use-vitis-ai") {
-        build_config.settings.use_vitis_ai = Some(true);
-    }
     if cfg!(any(
         feature = "static-linking",
         feature = "build-static-runtime"
diff --git a/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc b/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc
deleted file mode 100755
index 46246b0295b7..000000000000
--- a/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file vitis_ai_runtime.cc
- */
-
-#include "vitis_ai_runtime.h"
-
-#include <tvm/runtime/registry.h>
-
-#include <cassert>
-#include <fstream>
-#include <streambuf>
-#include <string>
-#include <vector>
-
-using namespace pyxir::runtime;
-
-namespace tvm {
-namespace runtime {
-
-VitisAIRuntime::VitisAIRuntime(const std::string& symbol_name, const Array<String> const_names,
-                               const std::string& serialized_rt_mod,
-                               const std::string& export_rt_mod_path)
-    : symbol_name_(symbol_name),
-      const_names_(const_names),
-      export_rt_mod_path_(export_rt_mod_path) {
-  std::istringstream sstream(serialized_rt_mod);
-  rt_mod_.reset(new RuntimeModule());
-  rt_mod_->deserialize(sstream);
-  in_tensor_names_ = rt_mod_->get_in_tensor_names();
-  out_tensor_names_ = rt_mod_->get_out_tensor_names();
-}
-
-VitisAIRuntime::VitisAIRuntime(const std::string& symbol_name, const std::string& xgraph_str,
-                               const Array<String> const_names, const std::string& dpu_target,
-                               const std::string& build_dir, const std::string& work_dir,
-                               const std::string& export_rt_mod_path)
-    : symbol_name_(symbol_name),
-      const_names_(const_names),
-      export_rt_mod_path_(export_rt_mod_path) {
-  std::istringstream xgraph_sstream(xgraph_str);
-  pyxir::XGraphHolder xgraph = std::make_shared<pyxir::graph::XGraph>("");
-  pyxir::read(xgraph, xgraph_sstream);
-  in_tensor_names_ = xgraph->get_input_names();
-  out_tensor_names_ = xgraph->get_meta_attr("tvm_out_tensors").get_strings();
-
-  pyxir::partition(xgraph, std::vector<std::string>{dpu_target}, "");
-
-  pyxir::RunOptionsHolder run_options(new pyxir::runtime::RunOptions());
-  run_options->on_the_fly_quantization = true;
-  run_options->build_dir = build_dir;
-  run_options->export_runtime_module_path = export_rt_mod_path_;
-  if (!work_dir.empty()) run_options->work_dir = work_dir;
-  rt_mod_ =
-      pyxir::build_rt(xgraph, dpu_target, in_tensor_names_, out_tensor_names_, "vai", run_options);
-}
-
-Module VitisAIRuntimeCreate(const std::string& name, const std::string& xgraph_str,
-                            const std::string& dpu_target, const std::string& build_dir,
-                            const std::string& work_dir, const std::string& export_rt_mod_path) {
-  Array<String> const_vars;
-  auto exec = make_object<VitisAIRuntime>(name, xgraph_str, const_vars, dpu_target, build_dir,
-                                          work_dir, export_rt_mod_path);
-  return Module(exec);
-}
-
-TVM_REGISTER_GLOBAL("tvm.vitis_ai_runtime.from_xgraph").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = VitisAIRuntimeCreate(args[0], args[1], args[2], args[3], args[4], args[5]);
-});
-
-Module VitisAIRuntimeCreate(const std::string& name, const std::string& serialized_rt_mod,
-                            const std::string& export_rt_mod_path) {
-  Array<String> const_vars;
-  auto exec = make_object<VitisAIRuntime>(name, const_vars, serialized_rt_mod, export_rt_mod_path);
-  return Module(exec);
-}
-
-TVM_REGISTER_GLOBAL("tvm.vitis_ai_runtime.from_rt_mod").set_body([](TVMArgs args, TVMRetValue* rv) {
-  std::string load_rt_mod_path = args[1];
-  assert(!load_rt_mod_path.empty());
-  std::ifstream in_file(load_rt_mod_path);
-  std::stringstream buffer;
-  buffer << in_file.rdbuf();
-  std::string serialized_rt_mod = buffer.str();
-  in_file.close();
-  *rv = VitisAIRuntimeCreate(args[0], serialized_rt_mod, args[2]);
-});
-
-Module VitisAIRuntimeLoadFromBinary(void* strm) {
-  dmlc::Stream* stream = static_cast<dmlc::Stream*>(strm);
-  std::string symbol_name;
-  std::vector<std::string> const_vars;
-  std::string serialized_rt_mod;
-  std::string export_rt_mod_path;
-  stream->Read(&serialized_rt_mod);
-  stream->Read(&export_rt_mod_path);
-  stream->Read(&symbol_name);
-  stream->Read(&const_vars);
-  Array<String> const_names;
-  for (const auto& it : const_vars) {
-    const_names.push_back(it);
-  }
-  auto exec =
-      make_object<VitisAIRuntime>(symbol_name, const_names, serialized_rt_mod, export_rt_mod_path);
-  return Module(exec);
-}
-
-TVM_REGISTER_GLOBAL("runtime.module.loadbinary_VitisAIRuntime")
-    .set_body_typed(VitisAIRuntimeLoadFromBinary);
-
-void VitisAIRuntime::SaveToBinary(dmlc::Stream* stream) {
-  std::ostringstream sstream;
-  rt_mod_->serialize(sstream);
-  stream->Write(sstream.str());
-  stream->Write(export_rt_mod_path_);
-  stream->Write(symbol_name_);
-  std::vector<std::string> consts;
-  for (const auto& it : const_names_) {
-    consts.push_back(it);
-  }
-  stream->Write(consts);
-
-  // If export_rt_mod_path_ member variable is set, we will additionally export the PyXIR
-  //  runtime_module to the specified file
-  if (!export_rt_mod_path_.empty()) {
-    std::ofstream out_file(export_rt_mod_path_);
-    out_file << sstream.str();
-    out_file.close();
-  }
-}
-
-PackedFunc VitisAIRuntime::GetFunction(const String& name, const ObjectPtr<Object>& sptr_to_self) {
-  if (name == "get_symbol") {
-    return PackedFunc(
-        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->symbol_name_; });
-  } else if (name == "get_const_vars") {
-    return PackedFunc(
-        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->const_names_; });
-  } else if ("__init_" + this->symbol_name_ == name) {
-    // The function to initialize constant tensors.
-    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
-      CHECK_EQ(args.size(), 1U);
-      this->initialized_ = true;
-      *rv = 0;
-    });
-  } else if (this->symbol_name_ == name) {
-    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
-      // Initialize input tensors
-      DLTensor* inputs = args[0];
-      std::vector<pyxir::XBufferHolder> in_tensors;
-      std::vector<ssize_t> in_shape;
-      for (int i = 0; i < inputs->ndim; ++i) in_shape.push_back(inputs->shape[i]);
-      in_tensors.push_back(std::shared_ptr<pyxir::XBuffer>(
-          new pyxir::XBuffer(reinterpret_cast<void*>(static_cast<float*>(inputs->data)), 4, "f",
-                             in_shape.size(), in_shape, false, false)));
-
-      // Initialize output tensors
-      std::vector<pyxir::XBufferHolder> out_tensors;
-      for (unsigned i = 0; i < out_tensor_names_.size(); ++i) {
-        DLTensor* output_tensor = args[args.size() - out_tensor_names_.size() + i];
-        std::vector<ssize_t> out_shape;
-        for (int i = 0; i < output_tensor->ndim; ++i) out_shape.push_back(output_tensor->shape[i]);
-        void* output_data = reinterpret_cast<void*>(static_cast<float*>(output_tensor->data));
-        out_tensors.push_back(std::shared_ptr<pyxir::XBuffer>(
-            new pyxir::XBuffer(output_data, 4, "f", out_shape.size(), out_shape, false, false)));
-      }
-
-      // Execute the subgraph.
-      rt_mod_->execute(in_tensors, out_tensors);
-    });
-  } else {
-    return PackedFunc();
-  }
-}
-
-}  // namespace runtime
-}  // namespace tvm
diff --git a/src/runtime/contrib/vitis_ai/vitis_ai_runtime.h b/src/runtime/contrib/vitis_ai/vitis_ai_runtime.h
deleted file mode 100755
index 2cc5918c8f52..000000000000
--- a/src/runtime/contrib/vitis_ai/vitis_ai_runtime.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \brief Vitis-AI runtime that can run model
- *        containing only tvm PackedFunc.
- * \file vitis_ai_runtime.h
- */
-#ifndef TVM_RUNTIME_CONTRIB_VITIS_AI_VITIS_AI_RUNTIME_H_
-#define TVM_RUNTIME_CONTRIB_VITIS_AI_VITIS_AI_RUNTIME_H_
-#include <dlpack/dlpack.h>
-#include <tvm/runtime/ndarray.h>
-#include <tvm/runtime/packed_func.h>
-// clang-format off
-#include <memory>
-#include <string>
-#include <vector>
-// clang-format on
-#include <pyxir/pyxir.hpp>
-#include <pyxir/runtime/run_options.hpp>
-
-namespace tvm {
-namespace runtime {
-
-/*!
- * \brief VAI runtime.
- *
- *  This runtime can be accessed in various language via
- *  TVM runtime PackedFunc API.
- */
-class VitisAIRuntime : public ModuleNode {
- public:
-  /*!
-   * \brief Create VitisAI runtime from serialized XGraph
-   * \param symbol_name The name of the function.
-   * \param const_names The names of each constant in the sub-graph.
-   * \param serialized_rt_mod The serialized runtime module.
-   * \param export_rt_mod_path The path to the file to be used for exporting the
-   *        PyXIR runtime module.
-   */
-  VitisAIRuntime(const std::string& symbol_name, const Array<String> const_names,
-                 const std::string& serialized_rt_mod, const std::string& export_rt_mod);
-
-  /*!
-   * \brief Create VitisAI runtime from serialized XGraph
-   * \param symbol_name The name of the function.
-   * \param xgraph_str serialized XGraph representation
-   * \param const_names The names of each constant in the sub-graph.
-   * \param dpu_target The Vitis-AI DPU target identifier (e.g. DPUCADX8G, DPUCZDX8G-zcu104).
-   * \param build_dir The directory to be used for Vitis-AI build files.
-   * \param work_dir The directory to be used for Vitis-AI work files.
-   * \param export_rt_mod_path The path to the file to be used for exporting the
-   *        PyXIR runtime module.
-   */
-  VitisAIRuntime(const std::string& symbol_name, const std::string& xgraph_str,
-                 const Array<String> const_names, const std::string& dpu_target,
-                 const std::string& build_dir, const std::string& work_dir,
-                 const std::string& export_runtime_module_path);
-
-  /*!
-   * \brief Get member function to front-end.
-   * \param name The name of the function.
-   * \param sptr_to_self The pointer to the module node.
-   * \return The corresponding member function.
-   */
-  virtual PackedFunc GetFunction(const String& name, const ObjectPtr<Object>& sptr_to_self);
-
-  /*!
-   * \return The type key of the executor.
-   */
-  const char* type_key() const { return "VitisAIRuntime"; }
-
-  /*! \brief Get the property of the runtime module .*/
-  int GetPropertyMask() const final {
-    return ModulePropertyMask::kBinarySerializable | ModulePropertyMask::kRunnable;
-  };
-
-  /*!
-   * \brief Serialize the content of the pyxir directory and save it to
-   *        binary stream.
-   * \param stream The binary stream to save to.
-   */
-  void SaveToBinary(dmlc::Stream* stream) final;
-
- private:
-  /*! \brief The only subgraph name for this module */
-  std::string symbol_name_;
-  /*! \brief The required constant names */
-  Array<String> const_names_;
-  /*! \brief The runtime module */
-  pyxir::RtModHolder rt_mod_;
-  /*! \brief The XGraph input tensor names in the order as provided by TVM */
-  std::vector<std::string> in_tensor_names_;
-  /*! \brief The XGraph output tensor names in the order as provided by TVM */
-  std::vector<std::string> out_tensor_names_;
-  /*! \brief The file path for exporting the runtime module if set */
-  std::string export_rt_mod_path_;
-  /*! \brief Whether constant tensors have been initialized */
-  bool initialized_{false};
-};
-
-}  // namespace runtime
-}  // namespace tvm
-#endif  // TVM_RUNTIME_CONTRIB_VITIS_AI_VITIS_AI_RUNTIME_H_
diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc
index d55181aab746..1af5e6e095aa 100644
--- a/src/support/libinfo.cc
+++ b/src/support/libinfo.cc
@@ -364,7 +364,6 @@ TVM_DLL Map<String, String> GetLibInfo() {
       {"USE_THREADS", TVM_INFO_USE_THREADS},
       {"USE_THRUST", TVM_INFO_USE_THRUST},
       {"USE_CURAND", TVM_INFO_USE_CURAND},
-      {"USE_VITIS_AI", TVM_INFO_USE_VITIS_AI},
       {"USE_VULKAN", TVM_INFO_USE_VULKAN},
       {"USE_CLML", TVM_INFO_USE_CLML},
       {"TVM_CLML_VERSION", TVM_INFO_USE_TVM_CLML_VERSION},
diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh
index f9065ece6e5f..42d6c06f7a68 100755
--- a/tests/scripts/task_config_build_cpu.sh
+++ b/tests/scripts/task_config_build_cpu.sh
@@ -42,7 +42,6 @@ fi
 
 echo set\(USE_TENSORFLOW_PATH \"/tensorflow\"\) >> config.cmake
 echo set\(USE_FLATBUFFERS_PATH \"/flatbuffers\"\) >> config.cmake
-echo set\(USE_VITIS_AI ON\) >> config.cmake
 echo set\(USE_LIBBACKTRACE COMPILE\) >> config.cmake
 echo set\(BACKTRACE_ON_SEGFAULT ON\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake

From d5191ac0b074b0a086a9629c4ef39f1effb93ac8 Mon Sep 17 00:00:00 2001
From: Siyuan Feng <hzfengsy@sjtu.edu.cn>
Date: Mon, 17 Feb 2025 14:24:39 +0800
Subject: [PATCH 03/10] remove VERILATOR

---
 CMakeLists.txt                                |   2 -
 cmake/config.cmake                            |   3 -
 cmake/modules/LibInfo.cmake                   |   1 -
 cmake/modules/contrib/Verilator.cmake         |  23 ---
 docker/Dockerfile.ci_i386                     |   4 -
 docker/install/ubuntu_install_verilator.sh    |  37 ----
 .../contrib/verilator/verilator_device.h      |  84 --------
 .../contrib/verilator/verilator_kernel.h      |  45 -----
 .../contrib/verilator/verilator_runtime.cc    | 180 ------------------
 .../contrib/verilator/verilator_runtime.h     | 138 --------------
 src/support/libinfo.cc                        |   1 -
 11 files changed, 518 deletions(-)
 delete mode 100644 cmake/modules/contrib/Verilator.cmake
 delete mode 100755 docker/install/ubuntu_install_verilator.sh
 delete mode 100644 src/runtime/contrib/verilator/verilator_device.h
 delete mode 100644 src/runtime/contrib/verilator/verilator_kernel.h
 delete mode 100644 src/runtime/contrib/verilator/verilator_runtime.cc
 delete mode 100644 src/runtime/contrib/verilator/verilator_runtime.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5a7e1241d67f..ab01cd5f478f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -120,7 +120,6 @@ tvm_option(SUMMARIZE "Print CMake option summary after configuring" OFF)
 tvm_option(USE_CLML "Build with CLML Codegen support" OFF)
 tvm_option(USE_CLML_GRAPH_EXECUTOR "Build with CLML graph runtime" OFF)
 tvm_option(USE_UMA "Build with UMA support" OFF)
-tvm_option(USE_VERILATOR "Build with Verilator support" OFF)
 tvm_option(USE_MSC "Enable Multi-System Compiler" OFF)
 tvm_option(USE_MRVL "Build with MRVL TVM support" OFF)
 tvm_option(USE_NVSHMEM "Build with NVSHMEM support" OFF)
@@ -499,7 +498,6 @@ include(cmake/modules/contrib/ONNX.cmake)
 include(cmake/modules/contrib/ArmComputeLib.cmake)
 include(cmake/modules/contrib/TensorRT.cmake)
 include(cmake/modules/contrib/NNAPI.cmake)
-include(cmake/modules/contrib/Verilator.cmake)
 include(cmake/modules/contrib/UMA.cmake)
 include(cmake/modules/contrib/MSC.cmake)
 include(cmake/modules/contrib/vllm.cmake)
diff --git a/cmake/config.cmake b/cmake/config.cmake
index e5f85380d29e..d420fe54ab2f 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -279,9 +279,6 @@ set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF)
 set(USE_TENSORRT_CODEGEN OFF)
 set(USE_TENSORRT_RUNTIME OFF)
 
-# Build Verilator codegen and runtime
-set(USE_VERILATOR OFF)
-
 # Whether to use the Multi-System Compiler
 set(USE_MSC OFF)
 
diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake
index 70ae86fc7925..201004390994 100644
--- a/cmake/modules/LibInfo.cmake
+++ b/cmake/modules/LibInfo.cmake
@@ -128,7 +128,6 @@ function(add_lib_info src_file)
     TVM_INFO_USE_CLML_GRAPH_EXECUTOR="${USE_CLML_GRAPH_EXECUTOR}"
     TVM_INFO_USE_TVM_CLML_VERSION="${CLML_VERSION_MAJOR}"
     TVM_INFO_USE_UMA="${USE_UMA}"
-    TVM_INFO_USE_VERILATOR="${USE_VERILATOR}"
     TVM_INFO_USE_MSC="${USE_MSC}"
     TVM_INFO_USE_CCACHE="${USE_CCACHE}"
     TVM_INFO_USE_NVSHMEM="${USE_NVSHMEM}"
diff --git a/cmake/modules/contrib/Verilator.cmake b/cmake/modules/contrib/Verilator.cmake
deleted file mode 100644
index 61a2e309d06e..000000000000
--- a/cmake/modules/contrib/Verilator.cmake
+++ /dev/null
@@ -1,23 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-if(USE_VERILATOR STREQUAL "ON")
-  tvm_file_glob(GLOB VERILATOR_RELAY_CONTRIB_SRC src/relay/backend/contrib/verilator/codegen.cc)
-  tvm_file_glob(GLOB VERILATOR_CONTRIB_SRC src/runtime/contrib/verilator/verilator_runtime.cc)
-  list(APPEND COMPILER_SRCS ${VERILATOR_RELAY_CONTRIB_SRC})
-  list(APPEND RUNTIME_SRCS ${VERILATOR_CONTRIB_SRC})
-endif()
diff --git a/docker/Dockerfile.ci_i386 b/docker/Dockerfile.ci_i386
index b96e4a33b459..b4aa2b6bffe6 100644
--- a/docker/Dockerfile.ci_i386
+++ b/docker/Dockerfile.ci_i386
@@ -66,10 +66,6 @@ RUN bash /install/ubuntu2004_install_redis.sh
 COPY install/ubuntu_install_sbt.sh /install/ubuntu_install_sbt.sh
 RUN bash /install/ubuntu_install_sbt.sh
 
-# Verilator deps
-COPY install/ubuntu_install_verilator.sh /install/ubuntu_install_verilator.sh
-RUN bash /install/ubuntu_install_verilator.sh
-
 # sccache
 COPY install/ubuntu_install_sccache.sh /install/ubuntu_install_sccache.sh
 RUN bash /install/ubuntu_install_sccache.sh
diff --git a/docker/install/ubuntu_install_verilator.sh b/docker/install/ubuntu_install_verilator.sh
deleted file mode 100755
index 630746bd2162..000000000000
--- a/docker/install/ubuntu_install_verilator.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-set -u
-set -o pipefail
-
-# Verilator version
-VERILATOR_VERSION="5.002"
-
-# Install dependencies
-apt-get update && apt-install-and-clear -y autoconf g++ flex bison
-
-# Install Verilator
-git clone --depth 1 --branch v${VERILATOR_VERSION} https://github.com/verilator/verilator
-pushd verilator
-  autoconf
-  ./configure
-  make -j$(nproc)
-  make install
-popd
-rm -rf verilator
diff --git a/src/runtime/contrib/verilator/verilator_device.h b/src/runtime/contrib/verilator/verilator_device.h
deleted file mode 100644
index 298e41c06daf..000000000000
--- a/src/runtime/contrib/verilator/verilator_device.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file src/runtime/contrib/verilator/verilator_device.h
- * \brief Use external verilator device.
- */
-
-#ifndef TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_DEVICE_H_
-#define TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_DEVICE_H_
-
-#include <tvm/runtime/c_runtime_api.h>
-
-namespace tvm {
-namespace runtime {
-namespace contrib {
-
-/*! \brief Verilator device resource context  */
-typedef void* VerilatorHandle;
-
-/*!
- * \brief Allocate a verilator device resource handle
- * \return The verilator device handle.
- */
-extern "C" TVM_DLL VerilatorHandle VerilatorAlloc();
-
-/*!
- * \brief Free a verilator device handle
- * \param handle The verilator device handle to be freed.
- */
-extern "C" TVM_DLL void VerilatorDealloc(VerilatorHandle handle);
-
-/*!
- * \brief Read verilator register or memory
- * \param handle The verilator device handle.
- * \param id The register or memory identifier.
- * \param addr The register or memory address (word-level).
- * \return The value of register or memory.
- */
-extern "C" TVM_DLL int VerilatorRead(VerilatorHandle handle, int id, int addr);
-
-/*!
- * \brief Write verilator register or memory
- * \param handle The verilator device handle.
- * \param id The register or memory identifier.
- * \param addr The register or memory address (word-level).
- * \param value The value of register or memory.
- */
-extern "C" TVM_DLL void VerilatorWrite(VerilatorHandle handle, int id, int addr, int value);
-
-/*!
- * \brief Reset Verilator for n clock cycles
- * \param handle The verilator device handle.
- * \param n The number of reset cycles.
- */
-extern "C" TVM_DLL void VerilatorReset(VerilatorHandle handle, int n);
-
-/*!
- * \brief Run Verilator for n clock cycles
- * \param handle The verilator device handle.
- * \param n The number of run cycles.
- */
-extern "C" TVM_DLL void VerilatorRun(VerilatorHandle handle, int n);
-
-}  // namespace contrib
-}  // namespace runtime
-}  // namespace tvm
-#endif  // TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_DEVICE_H_
diff --git a/src/runtime/contrib/verilator/verilator_kernel.h b/src/runtime/contrib/verilator/verilator_kernel.h
deleted file mode 100644
index 57353297db8d..000000000000
--- a/src/runtime/contrib/verilator/verilator_kernel.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file src/runtime/contrib/verilator/verilator_kernel.h
- * \brief Use external verilator library kernels.
- */
-
-#ifndef TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_KERNEL_H_
-#define TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_KERNEL_H_
-
-#include <tvm/runtime/c_runtime_api.h>
-
-#include "verilator_device.h"
-
-namespace tvm {
-namespace runtime {
-namespace contrib {
-
-extern "C" TVM_DLL void verilator_add(VerilatorHandle handle, int* left, int* right, int* out,
-                                      int p_h_, int p_w_);
-
-extern "C" TVM_DLL void verilator_bias_add(VerilatorHandle handle, int* data, int* bias, int* out,
-                                           int p_n_, int p_c_, int p_h_, int p_w_);
-
-}  // namespace contrib
-}  // namespace runtime
-}  // namespace tvm
-#endif  // TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_KERNEL_H_
diff --git a/src/runtime/contrib/verilator/verilator_runtime.cc b/src/runtime/contrib/verilator/verilator_runtime.cc
deleted file mode 100644
index 81ecf91da41e..000000000000
--- a/src/runtime/contrib/verilator/verilator_runtime.cc
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file src/runtime/contrib/verilator/verilator_runtime.cc
- * \brief A runtime for Verilator.
- */
-
-#include "verilator_runtime.h"
-
-#include <dlfcn.h>
-#include <tvm/runtime/ndarray.h>
-#include <tvm/runtime/registry.h>
-
-#include <cstddef>
-#include <string>
-#include <vector>
-
-#include "../../library_module.h"
-#include "../json/json_node.h"
-#include "../json/json_runtime.h"
-#include "verilator_device.h"
-#include "verilator_kernel.h"
-
-namespace tvm {
-namespace runtime {
-namespace contrib {
-
-using namespace tvm::runtime;
-using namespace tvm::runtime::contrib;
-using namespace tvm::runtime::json;
-
-VerilatorLibrary::~VerilatorLibrary() {
-  if (lib_handle_) {
-    dlclose(lib_handle_);
-    lib_handle_ = nullptr;
-  }
-}
-
-void VerilatorLibrary::Load(const std::string& name) {
-  lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
-  ICHECK(lib_handle_ != nullptr) << "Failed to load dynamic shared library " << name << " "
-                                 << dlerror();
-}
-
-void* VerilatorLibrary::GetSymbol(const char* name) { return dlsym(lib_handle_, name); }
-
-void VerilatorProfiler::Clear() { cycle_counter = 0; }
-
-std::string VerilatorProfiler::AsJSON() {
-  std::ostringstream os;
-  os << "{\n"
-     << " \"cycle_counter\":" << cycle_counter << "\n"
-     << "}\n";
-  return os.str();
-}
-
-VerilatorProfiler* VerilatorProfiler::ThreadLocal() {
-  static thread_local VerilatorProfiler inst;
-  return &inst;
-}
-
-VerilatorRuntime::~VerilatorRuntime() {
-  VLOG(0) << "destroying verilator runtime";
-  if (lib_ == nullptr) {
-    // Never initialized. This can happen if the runtime was created during compilation of
-    // a BYOC function but the resulting runtime module was never invoked.
-    return;
-  }
-  auto dealloc = reinterpret_cast<VerilatorDeallocFunc>(lib_->GetSymbol("VerilatorDealloc"));
-  ICHECK(dealloc != nullptr);
-  ICHECK(device_ != nullptr);
-  dealloc(device_);
-  device_ = nullptr;
-  lib_->~VerilatorLibrary();
-  lib_ = nullptr;
-}
-
-void VerilatorRuntime::SetLibrary(const std::string& lib_path) { lib_path_ = lib_path; }
-
-void VerilatorRuntime::SetResetCycles(const int cycles) { reset_cycles_ = cycles; }
-
-void VerilatorRuntime::EnableProfiler() { prof_enable_ = true; }
-
-void VerilatorRuntime::SetProfilerCycleCounterId(const int id) { prof_cycle_counter_id_ = id; }
-
-void VerilatorRuntime::Init(const Array<NDArray>& consts) {
-  VLOG(0) << "initializing verilator runtime";
-  lib_ = new VerilatorLibrary();
-  lib_->Load(lib_path_);
-  auto alloc = reinterpret_cast<VerilatorAllocFunc>(lib_->GetSymbol("VerilatorAlloc"));
-  ICHECK(alloc != nullptr);
-  auto reset = reinterpret_cast<VerilatorResetFunc>(lib_->GetSymbol("VerilatorReset"));
-  ICHECK(reset != nullptr);
-  read_ = reinterpret_cast<VerilatorReadFunc>(lib_->GetSymbol("VerilatorRead"));
-  ICHECK(read_ != nullptr);
-
-  // alloc verilator device
-  device_ = alloc();
-
-  // enable profiler
-  if (prof_enable_) prof_ = VerilatorProfiler::ThreadLocal();
-
-  // reset verilator device
-  reset(device_, reset_cycles_);
-
-  CHECK_EQ(consts.size(), const_idx_.size())
-      << "The number of input constants must match the number of required.";
-
-  // Setup constants entries for weights.
-  SetupConstants(consts);
-}
-
-void VerilatorRuntime::Run() {
-  std::vector<int*> in_ptr;
-  std::vector<int*> out_ptr;
-  for (size_t i = 0; i < input_nodes_.size(); ++i) {
-    uint32_t eid = EntryID(input_nodes_[i], 0);
-    int* data = static_cast<int*>(data_entry_[eid]->data);
-    in_ptr.push_back(data);
-  }
-  for (size_t i = 0; i < outputs_.size(); ++i) {
-    uint32_t eid = EntryID(outputs_[i]);
-    int* data = static_cast<int*>(data_entry_[eid]->data);
-    out_ptr.push_back(data);
-  }
-  for (size_t nid = 0; nid < nodes_.size(); ++nid) {
-    const auto& node = nodes_[nid];
-    if (node.GetOpType() == "kernel") {
-      CHECK_EQ(node.GetOpType(), "kernel");
-      auto op_name = node.GetOpName();
-      auto entry = node.GetInputs()[0];
-      auto shape = node.GetOpShape()[entry.index_];
-      if ("add" == op_name) {
-        auto add = reinterpret_cast<VerilatorAddFunc>(lib_->GetSymbol("verilator_add"));
-        ICHECK(add != nullptr);
-        add(device_, in_ptr[0], in_ptr[1], out_ptr[0], shape[0], shape[1]);
-      } else if ("nn.bias_add" == op_name) {
-        auto bias_add =
-            reinterpret_cast<VerilatorBiasAddFunc>(lib_->GetSymbol("verilator_bias_add"));
-        ICHECK(bias_add != nullptr);
-        bias_add(device_, in_ptr[0], in_ptr[1], out_ptr[0], shape[0], shape[3], shape[1], shape[2]);
-      } else {
-        LOG(FATAL) << "Unsupported op: " << op_name;
-      }
-    }
-  }
-  if (prof_enable_) {
-    int cycles = read_(device_, prof_cycle_counter_id_, 0);
-    prof_->cycle_counter += cycles;
-  }
-}
-
-TVM_REGISTER_GLOBAL("verilator.profiler_clear").set_body([](TVMArgs args, TVMRetValue* rv) {
-  VerilatorProfiler::ThreadLocal()->Clear();
-});
-
-TVM_REGISTER_GLOBAL("verilator.profiler_status").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = VerilatorProfiler::ThreadLocal()->AsJSON();
-});
-
-}  // namespace contrib
-}  // namespace runtime
-}  // namespace tvm
diff --git a/src/runtime/contrib/verilator/verilator_runtime.h b/src/runtime/contrib/verilator/verilator_runtime.h
deleted file mode 100644
index 14bf0bcdfc9b..000000000000
--- a/src/runtime/contrib/verilator/verilator_runtime.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file src/runtime/contrib/verilator/verilator_runtime.h
- * \brief A runtime for Verilator.
- */
-
-#ifndef TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_RUNTIME_H_
-#define TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_RUNTIME_H_
-
-#include <dlfcn.h>
-#include <tvm/runtime/ndarray.h>
-#include <tvm/runtime/registry.h>
-
-#include <cstddef>
-#include <string>
-#include <vector>
-
-#include "../../library_module.h"
-#include "../json/json_node.h"
-#include "../json/json_runtime.h"
-#include "verilator_device.h"
-#include "verilator_kernel.h"
-
-namespace tvm {
-namespace runtime {
-namespace contrib {
-
-using namespace tvm::runtime::contrib;
-using namespace tvm::runtime::json;
-
-typedef VerilatorHandle (*VerilatorAllocFunc)();
-typedef void (*VerilatorDeallocFunc)(VerilatorHandle);
-typedef void (*VerilatorResetFunc)(VerilatorHandle, int);
-typedef int (*VerilatorReadFunc)(VerilatorHandle, int, int);
-typedef void (*VerilatorAddFunc)(VerilatorHandle, int*, int*, int*, int, int);
-typedef void (*VerilatorBiasAddFunc)(VerilatorHandle, int*, int*, int*, int, int, int, int);
-
-class VerilatorLibrary : public Library {
- public:
-  ~VerilatorLibrary();
-
-  /*! \brief load library */
-  void Load(const std::string& name);
-
-  /*! \brief get symbol from libray */
-  void* GetSymbol(const char* name) final;
-
- private:
-  /*! \brief the library handle */
-  void* lib_handle_{nullptr};
-};
-
-class VerilatorProfiler {
- public:
-  /*! \brief the number of cycle counter */
-  uint32_t cycle_counter{0};
-
-  /*! \brief clear the profiler */
-  void Clear();
-
-  /*! \brief get profiler data */
-  std::string AsJSON();
-
-  /*! \brief profiler constructor */
-  static VerilatorProfiler* ThreadLocal();
-};
-
-class VerilatorRuntime : public JSONRuntimeBase {
- public:
-  VerilatorRuntime(const std::string& symbol_name, const std::string& graph_json,
-                   const Array<String> const_names)
-      : JSONRuntimeBase(symbol_name, graph_json, const_names) {
-    VLOG(0) << "creating verilator runtime";
-  }
-
-  ~VerilatorRuntime();
-
-  const char* type_key() const final { return "verilator"; }
-
-  /*! \brief set verilator library */
-  void SetLibrary(const std::string& lib_name);
-
-  /*! \brief set the number of reset cycles */
-  void SetResetCycles(const int cycles);
-
-  /*! \brief enable profiler */
-  void EnableProfiler();
-
-  /*! \brief set cycle counter register id */
-  void SetProfilerCycleCounterId(const int id);
-
-  /*! \brief init verilator runtime */
-  void Init(const Array<NDArray>& consts) override;
-
-  /*! \brief run verilator runtime */
-  void Run() override;
-
- private:
-  /*! \brief the verilator library path */
-  String lib_path_;
-  /*! \brief the verilator device */
-  VerilatorHandle device_{nullptr};
-  /*! \brief the verilator library */
-  VerilatorLibrary* lib_{nullptr};
-  /*! \brief the verilator profiler */
-  VerilatorProfiler* prof_{nullptr};
-  /*! \brief the verilator read function */
-  VerilatorReadFunc read_{nullptr};
-  /*! \brief the verilator reset cycles */
-  int reset_cycles_{1};
-  /*! \brief the verilator profiler status */
-  bool prof_enable_{false};
-  /*! \brief the verilator profiler cycle counter id */
-  int prof_cycle_counter_id_{0};
-};
-
-}  // namespace contrib
-}  // namespace runtime
-}  // namespace tvm
-#endif  // TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_RUNTIME_H_
diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc
index 1af5e6e095aa..065eaf503f6b 100644
--- a/src/support/libinfo.cc
+++ b/src/support/libinfo.cc
@@ -369,7 +369,6 @@ TVM_DLL Map<String, String> GetLibInfo() {
       {"TVM_CLML_VERSION", TVM_INFO_USE_TVM_CLML_VERSION},
       {"USE_CLML_GRAPH_EXECUTOR", TVM_INFO_USE_CLML_GRAPH_EXECUTOR},
       {"USE_UMA", TVM_INFO_USE_UMA},
-      {"USE_VERILATOR", TVM_INFO_USE_VERILATOR},
       {"USE_MSC", TVM_INFO_USE_MSC},
       {"USE_CCACHE", TVM_INFO_USE_CCACHE},
       {"USE_NVSHMEM", TVM_INFO_USE_NVSHMEM},

From bb3bd391f79cd94237c7248381abade954d6711e Mon Sep 17 00:00:00 2001
From: Siyuan Feng <hzfengsy@sjtu.edu.cn>
Date: Mon, 17 Feb 2025 14:38:07 +0800
Subject: [PATCH 04/10] remove aocl and sdaccel

---
 CMakeLists.txt                                |  21 +-
 cmake/config.cmake                            |   9 -
 cmake/modules/LibInfo.cmake                   |   1 -
 cmake/modules/OpenCL.cmake                    |  24 ---
 .../ubuntu_install_vitis_ai_packages_ci.sh    |  29 ---
 golang/src/device.go                          |   7 -
 include/tvm/runtime/c_runtime_api.h           |   6 +-
 include/tvm/runtime/packed_func.h             |   4 -
 .../src/main/java/org/apache/tvm/Device.java  |   2 +-
 python/tvm/__init__.py                        |   2 +-
 python/tvm/_ffi/runtime_ctypes.py             |   7 -
 python/tvm/contrib/sdaccel.py                 |  99 ---------
 python/tvm/testing/utils.py                   |   2 +-
 rust/tvm-sys/build.rs                         |   3 -
 src/runtime/module.cc                         |   2 +-
 src/runtime/opencl/aocl/aocl_common.h         |  61 ------
 src/runtime/opencl/aocl/aocl_device_api.cc    |  58 -----
 src/runtime/opencl/aocl/aocl_module.cc        |  67 ------
 src/runtime/opencl/aocl/aocl_module.h         |  49 -----
 src/runtime/opencl/sdaccel/sdaccel_common.h   |  61 ------
 .../opencl/sdaccel/sdaccel_device_api.cc      |  56 -----
 src/runtime/opencl/sdaccel/sdaccel_module.cc  |  81 -------
 src/runtime/opencl/sdaccel/sdaccel_module.h   |  49 -----
 src/support/libinfo.cc                        |   5 -
 src/target/opt/build_aocl_off.cc              |  36 ----
 src/target/opt/build_sdaccel_off.cc           |  36 ----
 src/target/source/codegen_aocl.cc             | 102 ---------
 src/target/source/codegen_vhls.cc             | 198 ------------------
 src/target/source/codegen_vhls.h              |  50 -----
 src/target/source/intrin_rule_aocl.cc         | 107 ----------
 src/target/source/intrin_rule_vhls.cc         |  95 ---------
 src/target/target_kind.cc                     |   9 -
 src/tir/analysis/verify_memory.cc             |   4 +-
 .../task_python_integration_gpuonly.sh        |   2 +-
 tests/scripts/task_python_unittest_gpuonly.sh |   2 +-
 35 files changed, 9 insertions(+), 1337 deletions(-)
 delete mode 100755 docker/install/ubuntu_install_vitis_ai_packages_ci.sh
 delete mode 100644 python/tvm/contrib/sdaccel.py
 delete mode 100644 src/runtime/opencl/aocl/aocl_common.h
 delete mode 100644 src/runtime/opencl/aocl/aocl_device_api.cc
 delete mode 100644 src/runtime/opencl/aocl/aocl_module.cc
 delete mode 100644 src/runtime/opencl/aocl/aocl_module.h
 delete mode 100644 src/runtime/opencl/sdaccel/sdaccel_common.h
 delete mode 100644 src/runtime/opencl/sdaccel/sdaccel_device_api.cc
 delete mode 100644 src/runtime/opencl/sdaccel/sdaccel_module.cc
 delete mode 100644 src/runtime/opencl/sdaccel/sdaccel_module.h
 delete mode 100644 src/target/opt/build_aocl_off.cc
 delete mode 100644 src/target/opt/build_sdaccel_off.cc
 delete mode 100644 src/target/source/codegen_aocl.cc
 delete mode 100644 src/target/source/codegen_vhls.cc
 delete mode 100644 src/target/source/codegen_vhls.h
 delete mode 100644 src/target/source/intrin_rule_aocl.cc
 delete mode 100644 src/target/source/intrin_rule_vhls.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ab01cd5f478f..1edb82108685 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -583,20 +583,6 @@ if(USE_IOS_RPC)
   add_subdirectory("apps/ios_rpc")
 endif()
 
-if(USE_RELAY_DEBUG)
-  message(STATUS "Building Relay in debug mode...")
-  target_compile_definitions(tvm_objs PRIVATE "USE_RELAY_DEBUG")
-  target_compile_definitions(tvm_objs PRIVATE "TVM_LOG_DEBUG")
-  target_compile_definitions(tvm_runtime_objs PRIVATE "USE_RELAY_DEBUG")
-  target_compile_definitions(tvm_runtime_objs PRIVATE "TVM_LOG_DEBUG")
-  target_compile_definitions(tvm_libinfo_objs PRIVATE "USE_RELAY_DEBUG")
-  target_compile_definitions(tvm_libinfo_objs PRIVATE "TVM_LOG_DEBUG")
-else()
-  target_compile_definitions(tvm_objs PRIVATE "NDEBUG")
-  target_compile_definitions(tvm_runtime_objs PRIVATE "NDEBUG")
-  target_compile_definitions(tvm_libinfo_objs PRIVATE "NDEBUG")
-endif(USE_RELAY_DEBUG)
-
 if(TVM_DEBUG_WITH_ABI_CHANGE)
   message(STATUS "Building with debug code that may cause ABI changes...")
   target_compile_definitions(tvm_objs PRIVATE "TVM_DEBUG_WITH_ABI_CHANGE")
@@ -699,12 +685,7 @@ if(GTEST_FOUND)
   endif()
   set_target_properties(cpptest PROPERTIES EXCLUDE_FROM_ALL 1)
   set_target_properties(cpptest PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD 1)
-  if(USE_RELAY_DEBUG)
-    target_compile_definitions(cpptest PRIVATE "USE_RELAY_DEBUG")
-    target_compile_definitions(cpptest PRIVATE "TVM_LOG_DEBUG")
-  else()
-    target_compile_definitions(cpptest PRIVATE "NDEBUG")
-  endif()
+  target_compile_definitions(cpptest PRIVATE "NDEBUG")
   if(TVM_DEBUG_WITH_ABI_CHANGE)
     target_compile_definitions(cpptest PRIVATE "TVM_DEBUG_WITH_ABI_CHANGE")
   endif(TVM_DEBUG_WITH_ABI_CHANGE)
diff --git a/cmake/config.cmake b/cmake/config.cmake
index d420fe54ab2f..856803898a02 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -78,12 +78,6 @@ set(USE_ROCM OFF)
 # - /path/to/rccl: use specific path to rccl
 set(USE_RCCL OFF)
 
-# Whether enable SDAccel runtime
-set(USE_SDACCEL OFF)
-
-# Whether enable Intel FPGA SDK for OpenCL (AOCL) runtime
-set(USE_AOCL OFF)
-
 # Whether enable OpenCL runtime
 #
 # Possible values:
@@ -294,9 +288,6 @@ set(USE_CLML_GRAPH_EXECUTOR OFF)
 # - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file
 set(USE_ANTLR OFF)
 
-# Whether use Relay debug mode
-set(USE_RELAY_DEBUG OFF)
-
 # Whether to enable debug code that may cause ABI changes
 set(TVM_DEBUG_WITH_ABI_CHANGE OFF)
 
diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake
index 201004390994..3881247ae106 100644
--- a/cmake/modules/LibInfo.cmake
+++ b/cmake/modules/LibInfo.cmake
@@ -102,7 +102,6 @@ function(add_lib_info src_file)
     TVM_INFO_USE_OPENMP="${USE_OPENMP}"
     TVM_INFO_USE_PAPI="${USE_PAPI}"
     TVM_INFO_USE_RANDOM="${USE_RANDOM}"
-    TVM_INFO_USE_RELAY_DEBUG="${USE_RELAY_DEBUG}"
     TVM_INFO_TVM_DEBUG_WITH_ABI_CHANGE="${TVM_DEBUG_WITH_ABI_CHANGE}"
     TVM_INFO_TVM_LOG_BEFORE_THROW="${TVM_LOG_BEFORE_THROW}"
     TVM_INFO_USE_ROCBLAS="${USE_ROCBLAS}"
diff --git a/cmake/modules/OpenCL.cmake b/cmake/modules/OpenCL.cmake
index 67d739bb63a0..c5c8eae721fa 100644
--- a/cmake/modules/OpenCL.cmake
+++ b/cmake/modules/OpenCL.cmake
@@ -15,30 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 
-if(USE_SDACCEL)
-  message(STATUS "Build with SDAccel support")
-  tvm_file_glob(GLOB RUNTIME_SDACCEL_SRCS src/runtime/opencl/sdaccel/*.cc)
-  list(APPEND RUNTIME_SRCS ${RUNTIME_SDACCEL_SRCS})
-  if(NOT USE_OPENCL)
-    message(STATUS "Enable OpenCL support required for SDAccel")
-    set(USE_OPENCL ON)
-  endif()
-else()
-  list(APPEND COMPILER_SRCS src/target/opt/build_sdaccel_off.cc)
-endif(USE_SDACCEL)
-
-if(USE_AOCL)
-  message(STATUS "Build with Intel FPGA SDK for OpenCL support")
-  tvm_file_glob(GLOB RUNTIME_AOCL_SRCS src/runtime/opencl/aocl/*.cc)
-  list(APPEND RUNTIME_SRCS ${RUNTIME_AOCL_SRCS})
-  if(NOT USE_OPENCL)
-    message(STATUS "Enable OpenCL support required for Intel FPGA SDK for OpenCL")
-    set(USE_OPENCL ON)
-  endif()
-else()
-  list(APPEND COMPILER_SRCS src/target/opt/build_aocl_off.cc)
-endif(USE_AOCL)
-
 if(USE_OPENCL)
   tvm_file_glob(GLOB RUNTIME_OPENCL_SRCS src/runtime/opencl/*.cc)
   list(APPEND COMPILER_SRCS src/target/spirv/spirv_utils.cc)
diff --git a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh b/docker/install/ubuntu_install_vitis_ai_packages_ci.sh
deleted file mode 100755
index 569df12a37df..000000000000
--- a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-set -u
-set -o pipefail
-
-export PYXIR_HOME=/opt/pyxir
-mkdir "$PYXIR_HOME"
-
-pip3 install progressbar
-
-git clone --recursive --branch v0.3.5 --depth 1 https://github.com/Xilinx/pyxir.git "${PYXIR_HOME}"
-cd "${PYXIR_HOME}" && python3 setup.py install
diff --git a/golang/src/device.go b/golang/src/device.go
index b2203a38d86d..1a40b77f491a 100644
--- a/golang/src/device.go
+++ b/golang/src/device.go
@@ -41,8 +41,6 @@ var KDLMetal                = int32(C.kDLMetal)
 var KDLVPI                  = int32(C.kDLVPI)
 // KDLROCM is golang enum correspond to TVM device type kDLROCM.
 var KDLROCM                 = int32(C.kDLROCM)
-// KDLSDAccel is golang enum correspond to TVM device type kDLSDAccel.
-var KDLSDAccel              = int32(C.kDLSDAccel)
 // KDLVulkan is golang enum correspond to TVM device type kDLVulkan.
 var KDLVulkan               = int32(C.kDLVulkan)
 // KOpenGL is golang enum correspond to TVM device type kOpenGL.
@@ -91,11 +89,6 @@ func ROCM(index int32) Device {
     return Device{KDLROCM, index}
 }
 
-// SDAccel returns the Device object for SDAccel target on given index
-func SDAccel(index int32) Device {
-    return Device{KDLSDAccel, index}
-}
-
 // Vulkan returns the Device object for Vulkan target on given index
 func Vulkan(index int32) Device {
     return Device{KDLVulkan, index}
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 24ff61811ae1..00a7964230d8 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -119,9 +119,7 @@ typedef enum {
   // To help avoid accidental conflicts between `DLDeviceType`
   // and this enumeration, start numbering the new enumerators
   // a little higher than (currently) seems necessary.
-  kDLAOCL = 32,
-  kDLSDAccel,
-  kOpenGL,
+  kOpenGL = 34,
   kDLMicroDev,
   TVMDeviceExtType_End,  // sentinel value
 } TVMDeviceExtType;
@@ -149,8 +147,6 @@ static_assert(kDLOneAPI == 14, TVM_HARCODED_INTEGER_CHANGED_MSG);
 static_assert(kDLWebGPU == 15, TVM_HARCODED_INTEGER_CHANGED_MSG);
 static_assert(kDLHexagon == 16, TVM_HARCODED_INTEGER_CHANGED_MSG);
 
-static_assert(kDLAOCL == 32, TVM_HARCODED_INTEGER_CHANGED_MSG);
-static_assert(kDLSDAccel == 33, TVM_HARCODED_INTEGER_CHANGED_MSG);
 static_assert(kOpenGL == 34, TVM_HARCODED_INTEGER_CHANGED_MSG);
 static_assert(kDLMicroDev == 35, TVM_HARCODED_INTEGER_CHANGED_MSG);
 #undef TVM_HARCODED_INTEGER_CHANGED_MSG
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 7c1b08e49002..b7eca8be1e5a 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -1460,10 +1460,6 @@ inline const char* DLDeviceType2Str(int type) {
       return "cuda_managed";
     case kDLOpenCL:
       return "opencl";
-    case kDLSDAccel:
-      return "sdaccel";
-    case kDLAOCL:
-      return "aocl";
     case kDLVulkan:
       return "vulkan";
     case kDLMetal:
diff --git a/jvm/core/src/main/java/org/apache/tvm/Device.java b/jvm/core/src/main/java/org/apache/tvm/Device.java
index a5f341a69055..edea0b75df88 100644
--- a/jvm/core/src/main/java/org/apache/tvm/Device.java
+++ b/jvm/core/src/main/java/org/apache/tvm/Device.java
@@ -29,7 +29,7 @@ public class Device {
   static final int kDLCPU = 1, kDLCUDA = 2, kDLCUDAHost = 3, kDLOpenCL = 4, kDLVulkan = 7,
                    kDLMetal = 8, kDLVPI = 9, kDLROCM = 10, kDLROCMHost = 11, kDLExtDev = 12,
                    kDLCUDAManaged = 13, kDLOneAPI = 14, kDLWebGPU = 15, kDLHexagon = 16,
-                   kDLAOCL = 32, kDLSDAccel = 33, kOpenGL = 34, kDLMicroDev = 35;
+                   kOpenGL = 34, kDLMicroDev = 35;
 
   private static final Map<Integer, String> MASK2STR = new HashMap<Integer, String>();
   private static final Map<String, Integer> STR2MASK = new HashMap<String, Integer>();
diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index a622df496959..abbab3ad6d39 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -64,7 +64,7 @@
 from . import support
 
 # Contrib initializers
-from .contrib import rocm as _rocm, nvcc as _nvcc, sdaccel as _sdaccel
+from .contrib import rocm as _rocm, nvcc as _nvcc
 
 # Relay and Relax contain modules that are only available in compiler package
 # Do not import them if TVM is built with runtime only
diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py
index 03dc18ea6e0b..f3b02ed56939 100644
--- a/python/tvm/_ffi/runtime_ctypes.py
+++ b/python/tvm/_ffi/runtime_ctypes.py
@@ -264,8 +264,6 @@ class Device(ctypes.Structure):
     kDLOneAPI = 14
     kDLWebGPU = 15
     kDLHexagon = 16
-    kDLAOCL = 32
-    kDLSDAccel = 33
     kOpenGL = 34
     kDLMicroDev = 35
 
@@ -285,8 +283,6 @@ class Device(ctypes.Structure):
         kDLOneAPI: "oneapi",
         kDLWebGPU: "webgpu",
         kDLHexagon: "hexagon",
-        kDLAOCL: "aocl",
-        kDLSDAccel: "sdaccel",
         kOpenGL: "opengl",
         kDLMicroDev: "microdev",
     }
@@ -303,9 +299,6 @@ class Device(ctypes.Structure):
         "nvptx": kDLCUDA,
         "cl": kDLOpenCL,
         "opencl": kDLOpenCL,
-        "sdaccel": kDLOpenCL,
-        "aocl": kDLAOCL,
-        "aocl_sw_emu": kDLAOCL,
         "vulkan": kDLVulkan,
         "metal": kDLMetal,
         "vpi": kDLVPI,
diff --git a/python/tvm/contrib/sdaccel.py b/python/tvm/contrib/sdaccel.py
deleted file mode 100644
index 478436e3d5c7..000000000000
--- a/python/tvm/contrib/sdaccel.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utility for Interacting with SDAccel Tools"""
-import os
-import subprocess
-
-import tvm._ffi
-
-from . import utils
-
-
-@tvm._ffi.register_func("tvm_callback_sdaccel_compile")
-def compile_vhls(kernel_info, target):
-    """Compile Vivado HLS code for SDAccel.
-
-    Parameters
-    ----------
-    kernel_info : list of (str, str)
-        List of kernel information.  The kernel information is a tuple of
-        function name and source code.
-
-    target : tvm.target.Target
-        The compilation target
-
-    Return
-    ------
-    xclbin : bytearray
-        The bytearray of the xclbin
-    """
-    device_name = target.attrs.get("device", "")
-    tmp_dir = utils.tempdir()
-
-    sdk = os.environ.get("XILINX_SDX", None)
-    xocc = os.path.join(sdk, "bin/xocc") if sdk else "xocc"
-    target = os.environ.get(
-        "XCL_TARGET", "sw_emu" if os.environ.get("XCL_EMULATION_MODE") else "hw"
-    )
-    advanced_params = [
-        "--xp",
-        "param:compiler.preserveHlsOutput=1",
-        "--xp",
-        "param:compiler.generateExtraRunData=true",
-    ]
-    platform = device_name
-    if not platform:
-        platform = os.environ.get("XCL_PLATFORM", os.environ.get("AWS_PLATFORM"))
-
-    if platform is None:
-        raise RuntimeError("No Xilinx device specified.")
-
-    tmp_xo_files = []
-    for funcname, code in kernel_info:
-        funcname = funcname.value
-        code = code.value
-
-        tmp_cpp = tmp_dir.relpath(funcname + ".cpp")
-        tmp_xo = tmp_dir.relpath(funcname + ".xo")
-
-        with open(tmp_cpp, "wb") as out_file:
-            out_file.write(bytes(code))
-
-        # build xo
-        args = (
-            [xocc, "-c", "-t", target, "--platform", platform, "-o", tmp_xo, "-k", funcname]
-            + advanced_params
-            + [tmp_cpp]
-        )
-        returncode = subprocess.call(args)
-        if returncode != 0:
-            raise RuntimeError("Compile error")
-
-        tmp_xo_files.append(tmp_xo)
-
-    # build xclbin
-    tmp_xclbin = tmp_dir.relpath("output.xclbin")
-    args = (
-        [xocc, "-l", "-t", target, "--platform", platform, "-o", tmp_xclbin]
-        + tmp_xo_files
-        + advanced_params
-    )
-    returncode = subprocess.call(args)
-    if returncode != 0:
-        raise RuntimeError("Link error")
-
-    return bytearray(open(tmp_xclbin, "rb").read())
diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index b01f7bf5dc89..9eb0c23db439 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -456,7 +456,7 @@ def _get_targets(target_names=None):
     "nvptx",
     "vulkan -from_device=0",
     "opencl",
-    "opencl -device=mali,aocl_sw_emu",
+    "opencl -device=mali",
     "opencl -device=intel_graphics",
     "metal",
     "rocm",
diff --git a/rust/tvm-sys/build.rs b/rust/tvm-sys/build.rs
index 3b19f56fb1d2..2f30afb4b0ab 100644
--- a/rust/tvm-sys/build.rs
+++ b/rust/tvm-sys/build.rs
@@ -102,9 +102,6 @@ fn find_using_tvm_build() -> Result<TVMInstall> {
     if cfg!(feature = "use-openmp") {
         build_config.settings.use_openmp = Some(true);
     }
-    if cfg!(feature = "use-relay-debug") {
-        build_config.settings.use_relay_debug = Some(true);
-    }
     if cfg!(feature = "use-rtti") {
         build_config.settings.use_rtti = Some(true);
     }
diff --git a/src/runtime/module.cc b/src/runtime/module.cc
index de372e5de053..4e60a0d0a285 100644
--- a/src/runtime/module.cc
+++ b/src/runtime/module.cc
@@ -143,7 +143,7 @@ bool RuntimeEnabled(const String& target_str) {
     return true;
   } else if (target == "cuda" || target == "gpu") {
     f_name = "device_api.cuda";
-  } else if (target == "cl" || target == "opencl" || target == "sdaccel") {
+  } else if (target == "cl" || target == "opencl") {
     f_name = "device_api.opencl";
   } else if (target == "mtl" || target == "metal") {
     f_name = "device_api.metal";
diff --git a/src/runtime/opencl/aocl/aocl_common.h b/src/runtime/opencl/aocl/aocl_common.h
deleted file mode 100644
index 448f5d0ac6d7..000000000000
--- a/src/runtime/opencl/aocl/aocl_common.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file aocl_common.h
- * \brief AOCL common header
- */
-#ifndef TVM_RUNTIME_OPENCL_AOCL_AOCL_COMMON_H_
-#define TVM_RUNTIME_OPENCL_AOCL_AOCL_COMMON_H_
-
-#include <memory>
-
-#include "../opencl_common.h"
-
-namespace tvm {
-namespace runtime {
-namespace cl {
-
-/*!
- * \brief Process global AOCL workspace.
- */
-class AOCLWorkspace final : public OpenCLWorkspace {
- public:
-  // override OpenCL device API
-  void Init() final;
-  bool IsOpenCLDevice(Device dev) final;
-  OpenCLThreadEntry* GetThreadEntry() final;
-  // get the global workspace
-  static OpenCLWorkspace* Global();
-};
-
-/*! \brief Thread local workspace for AOCL */
-class AOCLThreadEntry : public OpenCLThreadEntry {
- public:
-  // constructor
-  AOCLThreadEntry()
-      : OpenCLThreadEntry(static_cast<DLDeviceType>(kDLAOCL), AOCLWorkspace::Global()) {}
-
-  // get the global workspace
-  static AOCLThreadEntry* ThreadLocal();
-};
-}  // namespace cl
-}  // namespace runtime
-}  // namespace tvm
-#endif  // TVM_RUNTIME_OPENCL_AOCL_AOCL_COMMON_H_
diff --git a/src/runtime/opencl/aocl/aocl_device_api.cc b/src/runtime/opencl/aocl/aocl_device_api.cc
deleted file mode 100644
index e407837f6a7f..000000000000
--- a/src/runtime/opencl/aocl/aocl_device_api.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file aocl_device_api.cc
- */
-#include <dmlc/thread_local.h>
-#include <tvm/runtime/registry.h>
-
-#include "aocl_common.h"
-
-namespace tvm {
-namespace runtime {
-namespace cl {
-
-OpenCLThreadEntry* AOCLWorkspace::GetThreadEntry() { return AOCLThreadEntry::ThreadLocal(); }
-
-OpenCLWorkspace* AOCLWorkspace::Global() {
-  static OpenCLWorkspace* inst = new AOCLWorkspace();
-  return inst;
-}
-
-void AOCLWorkspace::Init() {
-  OpenCLWorkspace::Init("aocl", "accelerator", "Intel(R) FPGA SDK for OpenCL(TM)");
-}
-
-bool AOCLWorkspace::IsOpenCLDevice(Device dev) {
-  return dev.device_type == static_cast<DLDeviceType>(kDLAOCL);
-}
-
-typedef dmlc::ThreadLocalStore<AOCLThreadEntry> AOCLThreadStore;
-
-AOCLThreadEntry* AOCLThreadEntry::ThreadLocal() { return AOCLThreadStore::Get(); }
-
-TVM_REGISTER_GLOBAL("device_api.aocl").set_body([](TVMArgs args, TVMRetValue* rv) {
-  DeviceAPI* ptr = AOCLWorkspace::Global();
-  *rv = static_cast<void*>(ptr);
-});
-
-}  // namespace cl
-}  // namespace runtime
-}  // namespace tvm
diff --git a/src/runtime/opencl/aocl/aocl_module.cc b/src/runtime/opencl/aocl/aocl_module.cc
deleted file mode 100644
index cb8653356169..000000000000
--- a/src/runtime/opencl/aocl/aocl_module.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file aocl_module.cc
- */
-#include "aocl_module.h"
-
-#include <dmlc/memory_io.h>
-#include <tvm/runtime/registry.h>
-
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "aocl_common.h"
-
-namespace tvm {
-namespace runtime {
-
-class AOCLModuleNode : public OpenCLModuleNode {
- public:
-  explicit AOCLModuleNode(std::string data, std::string fmt,
-                          std::unordered_map<std::string, FunctionInfo> fmap, std::string source)
-      : OpenCLModuleNode(data, fmt, fmap, source) {}
-  cl::OpenCLWorkspace* GetGlobalWorkspace() final;
-};
-
-cl::OpenCLWorkspace* AOCLModuleNode::GetGlobalWorkspace() { return cl::AOCLWorkspace::Global(); }
-
-Module AOCLModuleCreate(std::string data, std::string fmt,
-                        std::unordered_map<std::string, FunctionInfo> fmap, std::string source) {
-  auto n = make_object<AOCLModuleNode>(data, fmt, fmap, source);
-  n->Init();
-  return Module(n);
-}
-
-Module AOCLModuleLoadFile(const std::string& file_name, const std::string& format) {
-  std::string data;
-  std::unordered_map<std::string, FunctionInfo> fmap;
-  std::string fmt = GetFileFormat(file_name, format);
-  std::string meta_file = GetMetaFilePath(file_name);
-  LoadBinaryFromFile(file_name, &data);
-  LoadMetaDataFromFile(meta_file, &fmap);
-  return AOCLModuleCreate(data, fmt, fmap, std::string());
-}
-
-TVM_REGISTER_GLOBAL("runtime.module.loadfile_aocx").set_body_typed(AOCLModuleLoadFile);
-
-}  // namespace runtime
-}  // namespace tvm
diff --git a/src/runtime/opencl/aocl/aocl_module.h b/src/runtime/opencl/aocl/aocl_module.h
deleted file mode 100644
index 199a94decdd8..000000000000
--- a/src/runtime/opencl/aocl/aocl_module.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file aocl_module.h
- * \brief Execution handling of OpenCL kernels for AOCL
- */
-#ifndef TVM_RUNTIME_OPENCL_AOCL_AOCL_MODULE_H_
-#define TVM_RUNTIME_OPENCL_AOCL_AOCL_MODULE_H_
-
-#include <tvm/runtime/packed_func.h>
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "../../meta_data.h"
-
-namespace tvm {
-namespace runtime {
-/*!
- * \brief create a opencl module for AOCL from data.
- *
- * \param data The module data.
- * \param fmt The format of the data, can be "aocx"
- * \param fmap The map function information map of each function.
- */
-Module AOCLModuleCreate(std::string data, std::string fmt,
-                        std::unordered_map<std::string, FunctionInfo> fmap, std::string source);
-}  // namespace runtime
-}  // namespace tvm
-#endif  // TVM_RUNTIME_OPENCL_AOCL_AOCL_MODULE_H_
diff --git a/src/runtime/opencl/sdaccel/sdaccel_common.h b/src/runtime/opencl/sdaccel/sdaccel_common.h
deleted file mode 100644
index 80bc770cc0a4..000000000000
--- a/src/runtime/opencl/sdaccel/sdaccel_common.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file sdaccel_common.h
- * \brief SDAccel common header
- */
-#ifndef TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_COMMON_H_
-#define TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_COMMON_H_
-
-#include <memory>
-
-#include "../opencl_common.h"
-
-namespace tvm {
-namespace runtime {
-namespace cl {
-
-/*!
- * \brief Process global SDAccel workspace.
- */
-class SDAccelWorkspace final : public OpenCLWorkspace {
- public:
-  // override OpenCL device API
-  void Init() final;
-  bool IsOpenCLDevice(Device dev) final;
-  OpenCLThreadEntry* GetThreadEntry() final;
-  // get the global workspace
-  static OpenCLWorkspace* Global();
-};
-
-/*! \brief Thread local workspace for SDAccel*/
-class SDAccelThreadEntry : public OpenCLThreadEntry {
- public:
-  // constructor
-  SDAccelThreadEntry()
-      : OpenCLThreadEntry(static_cast<DLDeviceType>(kDLSDAccel), SDAccelWorkspace::Global()) {}
-
-  // get the global workspace
-  static SDAccelThreadEntry* ThreadLocal();
-};
-}  // namespace cl
-}  // namespace runtime
-}  // namespace tvm
-#endif  // TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_COMMON_H_
diff --git a/src/runtime/opencl/sdaccel/sdaccel_device_api.cc b/src/runtime/opencl/sdaccel/sdaccel_device_api.cc
deleted file mode 100644
index 7d4b673324a0..000000000000
--- a/src/runtime/opencl/sdaccel/sdaccel_device_api.cc
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file sdaccel_device_api.cc
- */
-#include <dmlc/thread_local.h>
-#include <tvm/runtime/registry.h>
-
-#include "sdaccel_common.h"
-
-namespace tvm {
-namespace runtime {
-namespace cl {
-
-OpenCLThreadEntry* SDAccelWorkspace::GetThreadEntry() { return SDAccelThreadEntry::ThreadLocal(); }
-
-OpenCLWorkspace* SDAccelWorkspace::Global() {
-  static OpenCLWorkspace* inst = new SDAccelWorkspace();
-  return inst;
-}
-
-void SDAccelWorkspace::Init() { OpenCLWorkspace::Init("sdaccel", "accelerator", "Xilinx"); }
-
-bool SDAccelWorkspace::IsOpenCLDevice(Device dev) {
-  return dev.device_type == static_cast<DLDeviceType>(kDLSDAccel);
-}
-
-typedef dmlc::ThreadLocalStore<SDAccelThreadEntry> SDAccelThreadStore;
-
-SDAccelThreadEntry* SDAccelThreadEntry::ThreadLocal() { return SDAccelThreadStore::Get(); }
-
-TVM_REGISTER_GLOBAL("device_api.sdaccel").set_body([](TVMArgs args, TVMRetValue* rv) {
-  DeviceAPI* ptr = SDAccelWorkspace::Global();
-  *rv = static_cast<void*>(ptr);
-});
-
-}  // namespace cl
-}  // namespace runtime
-}  // namespace tvm
diff --git a/src/runtime/opencl/sdaccel/sdaccel_module.cc b/src/runtime/opencl/sdaccel/sdaccel_module.cc
deleted file mode 100644
index 4736e1ef3597..000000000000
--- a/src/runtime/opencl/sdaccel/sdaccel_module.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file sdaccel_module.cc
- */
-#include "sdaccel_module.h"
-
-#include <dmlc/memory_io.h>
-#include <tvm/runtime/registry.h>
-
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "sdaccel_common.h"
-
-namespace tvm {
-namespace runtime {
-
-class SDAccelModuleNode : public OpenCLModuleNode {
- public:
-  explicit SDAccelModuleNode(std::string data, std::string fmt,
-                             std::unordered_map<std::string, FunctionInfo> fmap, std::string source)
-      : OpenCLModuleNode(data, fmt, fmap, source) {}
-  cl::OpenCLWorkspace* GetGlobalWorkspace() final;
-};
-
-cl::OpenCLWorkspace* SDAccelModuleNode::GetGlobalWorkspace() {
-  return cl::SDAccelWorkspace::Global();
-}
-
-Module SDAccelModuleCreate(std::string data, std::string fmt,
-                           std::unordered_map<std::string, FunctionInfo> fmap, std::string source) {
-  auto n = make_object<SDAccelModuleNode>(data, fmt, fmap, source);
-  n->Init();
-  return Module(n);
-}
-
-Module SDAccelModuleLoadFile(const std::string& file_name, const String& format) {
-  std::string data;
-  std::unordered_map<std::string, FunctionInfo> fmap;
-  std::string fmt = GetFileFormat(file_name, format);
-  std::string meta_file = GetMetaFilePath(file_name);
-  LoadBinaryFromFile(file_name, &data);
-  LoadMetaDataFromFile(meta_file, &fmap);
-  return SDAccelModuleCreate(data, fmt, fmap, std::string());
-}
-
-Module SDAccelModuleLoadBinary(void* strm) {
-  dmlc::Stream* stream = static_cast<dmlc::Stream*>(strm);
-  std::string data;
-  std::unordered_map<std::string, FunctionInfo> fmap;
-  std::string fmt;
-  stream->Read(&fmt);
-  stream->Read(&fmap);
-  stream->Read(&data);
-  return SDAccelModuleCreate(data, fmt, fmap, std::string());
-}
-
-TVM_REGISTER_GLOBAL("runtime.module.loadfile_xclbin").set_body_typed(SDAccelModuleLoadFile);
-
-TVM_REGISTER_GLOBAL("runtime.module.loadfile_awsxclbin").set_body_typed(SDAccelModuleLoadFile);
-}  // namespace runtime
-}  // namespace tvm
diff --git a/src/runtime/opencl/sdaccel/sdaccel_module.h b/src/runtime/opencl/sdaccel/sdaccel_module.h
deleted file mode 100644
index 322decc4460c..000000000000
--- a/src/runtime/opencl/sdaccel/sdaccel_module.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file sdaccel_module.h
- * \brief Execution handling of OPENCL kernels for SDAccel FPGAs
- */
-#ifndef TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_MODULE_H_
-#define TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_MODULE_H_
-
-#include <tvm/runtime/packed_func.h>
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "../../meta_data.h"
-
-namespace tvm {
-namespace runtime {
-/*!
- * \brief create a opencl module for SDAccel from data.
- *
- * \param data The module data.
- * \param fmt The format of the data, can be "xclbin", "awsxclbin"
- * \param fmap The map function information map of each function.
- */
-Module SDAccelModuleCreate(std::string data, std::string fmt,
-                           std::unordered_map<std::string, FunctionInfo> fmap, std::string source);
-}  // namespace runtime
-}  // namespace tvm
-#endif  // TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_MODULE_H_
diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc
index 065eaf503f6b..fd77427c70b5 100644
--- a/src/support/libinfo.cc
+++ b/src/support/libinfo.cc
@@ -123,10 +123,6 @@
 #define TVM_INFO_USE_OPENMP "NOT-FOUND"
 #endif
 
-#ifndef TVM_INFO_USE_RELAY_DEBUG
-#define TVM_INFO_USE_RELAY_DEBUG "NOT-FOUND"
-#endif
-
 #ifndef TVM_INFO_DEBUG_WITH_ABI_CHANGE
 #define TVM_INFO_DEBUG_WITH_ABI_CHANGE "NOT-FOUND"
 #endif
@@ -343,7 +339,6 @@ TVM_DLL Map<String, String> GetLibInfo() {
       {"USE_OPENMP", TVM_INFO_USE_OPENMP},
       {"USE_PAPI", TVM_INFO_USE_PAPI},
       {"USE_RANDOM", TVM_INFO_USE_RANDOM},
-      {"USE_RELAY_DEBUG", TVM_INFO_USE_RELAY_DEBUG},
       {"TVM_DEBUG_WITH_ABI_CHANGE", TVM_INFO_TVM_DEBUG_WITH_ABI_CHANGE},
       {"TVM_LOG_BEFORE_THROW", TVM_INFO_TVM_LOG_BEFORE_THROW},
       {"USE_ROCBLAS", TVM_INFO_USE_ROCBLAS},
diff --git a/src/target/opt/build_aocl_off.cc b/src/target/opt/build_aocl_off.cc
deleted file mode 100644
index 9f9d098b7a97..000000000000
--- a/src/target/opt/build_aocl_off.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- *  Optional module when build aocl is switched to off
- */
-#include "../../runtime/opencl/opencl_module.h"
-#include "../source/codegen_source_base.h"
-
-namespace tvm {
-namespace runtime {
-
-Module AOCLModuleCreate(std::string data, std::string fmt,
-                        std::unordered_map<std::string, FunctionInfo> fmap, std::string source) {
-  LOG(WARNING) << "AOCL runtime not enabled, return a source module...";
-  return codegen::DeviceSourceModuleCreate(data, fmt, fmap, "aocl");
-}
-
-}  // namespace runtime
-}  // namespace tvm
diff --git a/src/target/opt/build_sdaccel_off.cc b/src/target/opt/build_sdaccel_off.cc
deleted file mode 100644
index 0de305c2a37c..000000000000
--- a/src/target/opt/build_sdaccel_off.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- *  Optional module when build opencl is switched to off
- */
-#include "../../runtime/opencl/opencl_module.h"
-#include "../source/codegen_source_base.h"
-
-namespace tvm {
-namespace runtime {
-
-Module SDAccelModuleCreate(std::string data, std::string fmt,
-                           std::unordered_map<std::string, FunctionInfo> fmap, std::string source) {
-  LOG(WARNING) << "OpenCL runtime not enabled, return a source module...";
-  return codegen::DeviceSourceModuleCreate(data, fmt, fmap, "sdaccel");
-}
-
-}  // namespace runtime
-}  // namespace tvm
diff --git a/src/target/source/codegen_aocl.cc b/src/target/source/codegen_aocl.cc
deleted file mode 100644
index dc3ba0875161..000000000000
--- a/src/target/source/codegen_aocl.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file codegen_aocl.cc
- */
-#include <tvm/target/target.h>
-
-#include <string>
-#include <vector>
-
-#include "../../runtime/file_utils.h"
-#include "../../runtime/opencl/aocl/aocl_module.h"
-#include "../build_common.h"
-#include "codegen_opencl.h"
-
-namespace tvm {
-namespace codegen {
-
-runtime::Module BuildAOCL(IRModule mod, Target target, bool emulation) {
-  // Get code.
-  using tvm::runtime::Registry;
-  bool output_ssa = false;
-  CodeGenOpenCL cg;
-  cg.Init(output_ssa);
-
-  Map<GlobalVar, PrimFunc> functions;
-  for (auto [gvar, base_func] : mod->functions) {
-    ICHECK(base_func->IsInstance<PrimFuncNode>()) << "CodegenOpenCL: Can only take PrimFunc";
-    auto prim_func = Downcast<PrimFunc>(base_func);
-    auto calling_conv = prim_func->GetAttr<Integer>(tvm::attr::kCallingConv);
-    ICHECK(calling_conv == CallingConv::kDeviceKernelLaunch)
-        << "CodegenOpenCL: expect calling_conv equals CallingConv::kDeviceKernelLaunch";
-    functions.Set(gvar, prim_func);
-  }
-
-  for (auto [gvar, prim_func] : functions) {
-    cg.DeclareFunction(gvar, prim_func);
-  }
-
-  for (auto [gvar, prim_func] : functions) {
-    cg.AddFunction(gvar, prim_func);
-  }
-
-  std::string code = cg.Finish();
-  if (const auto* f = Registry::Get("tvm_callback_opencl_postproc")) {
-    code = (*f)(code, target).operator std::string();
-  }
-
-  // Write a .cl file.
-  runtime::SaveBinaryToFile("aocl.cl", code.c_str());
-
-  // Compile the .cl file.
-  std::string cmd = "aoc aocl.cl";
-  // AOCL supports fp64.
-  cmd += " -Dcl_khr_fp64";
-  Optional<String> device = target->GetAttr<String>("device");
-  if (device.defined()) {
-    cmd += " -board=" + device.value();
-  }
-  if (emulation) {
-    cmd += " -march=emulator";
-  }
-  if (system(cmd.c_str()) != 0) {
-    LOG(FATAL) << "OpenCL offline compilation error.";
-  }
-
-  // Read .aocx file
-  std::string aocxbin;
-  runtime::LoadBinaryFromFile("aocl.aocx", &aocxbin);
-
-  return AOCLModuleCreate(aocxbin, "aocx", ExtractFuncInfo(mod), code);
-}
-
-TVM_REGISTER_GLOBAL("target.build.aocl")
-    .set_body_typed([](IRModule mod, Target target) -> runtime::Module {
-      return BuildAOCL(mod, target, false);
-    });
-
-TVM_REGISTER_GLOBAL("target.build.aocl_sw_emu")
-    .set_body_typed([](IRModule mod, Target target) -> runtime::Module {
-      return BuildAOCL(mod, target, true);
-    });
-
-}  // namespace codegen
-}  // namespace tvm
diff --git a/src/target/source/codegen_vhls.cc b/src/target/source/codegen_vhls.cc
deleted file mode 100644
index e4ea1db347cc..000000000000
--- a/src/target/source/codegen_vhls.cc
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file codegen_vhls.cc
- */
-#include "codegen_vhls.h"
-
-#include <string>
-
-#include "../../runtime/opencl/sdaccel/sdaccel_module.h"
-#include "../build_common.h"
-
-namespace tvm {
-namespace codegen {
-
-void CodeGenVivadoHLS::Init(bool output_ssa) {
-  CodeGenC::Init(output_ssa);
-
-  this->stream << "#include <ap_int.h>\n\n";
-  this->stream << "#include <algorithm>\n\n";
-}
-
-void CodeGenVivadoHLS::PrintType(DataType t, std::ostream& os) {
-  if (t.is_uint()) {
-    switch (t.bits()) {
-      case 8:
-        os << "unsigned char";
-        break;
-      case 16:
-        os << "unsigned short";
-        break;
-      case 32:
-        os << "unsigned int";
-        break;
-      case 64:
-        os << "unsigned long long";
-        break;
-      default:
-        os << "ap_uint<" << t.bits() << ">";
-        break;
-    }
-  } else if (t.is_int()) {
-    switch (t.bits()) {
-      case 8:
-        os << "char";
-        break;
-      case 16:
-        os << "short";
-        break;
-      case 32:
-        os << "int";
-        break;
-      case 64:
-        os << "long long";
-        break;
-      default:
-        os << "ap_int<" << t.bits() << ">";
-        break;
-    }
-  } else {
-    CodeGenC::PrintType(t, os);
-  }
-}
-
-void CodeGenVivadoHLS::PrintFuncPrefix(std::ostream& os) { os << "extern \"C\" "; }
-
-void CodeGenVivadoHLS::PreFunctionBody(const PrimFunc& f) {
-  for (size_t i = 0; i < f->params.size(); ++i) {
-    Var v = f->params[i];
-    std::string vid = GetVarID(v.get());
-    if (v.dtype().is_handle()) {
-      this->stream << "#pragma HLS INTERFACE m_axi port=" << vid << "  offset=slave bundle=gmem\n";
-    }
-    this->stream << "#pragma HLS INTERFACE s_axilite port=" << vid << " bundle=control\n";
-  }
-  this->stream << "#pragma HLS INTERFACE s_axilite port=return bundle=control\n\n";
-}
-
-template <typename T>
-inline void PrintBinaryExpr(const T* op, const char* opstr,
-                            std::ostream& os,  // NOLINT(*)
-                            CodeGenVivadoHLS* p) {
-  os << opstr << '(';
-  p->PrintExpr(op->a, os);
-  os << ", ";
-  p->PrintExpr(op->b, os);
-  os << ')';
-}
-
-void CodeGenVivadoHLS::VisitExpr_(const MinNode* op, std::ostream& os) {  // NOLINT(*)
-  const char* opstr = "std::min";
-  if (op->dtype.is_float()) {
-    switch (op->dtype.bits()) {
-      case 32:
-        opstr = "fminf";
-        break;
-      case 64:
-        opstr = "fmin";
-        break;
-    }
-  }
-
-  PrintBinaryExpr(op, opstr, os, this);
-}
-
-void CodeGenVivadoHLS::VisitExpr_(const MaxNode* op, std::ostream& os) {  // NOLINT(*)
-  const char* opstr = "std::max";
-  if (op->dtype.is_float()) {
-    switch (op->dtype.bits()) {
-      case 32:
-        opstr = "fmaxf";
-        break;
-      case 64:
-        opstr = "fmax";
-        break;
-    }
-  }
-
-  PrintBinaryExpr(op, opstr, os, this);
-}
-
-runtime::Module BuildSDAccel(IRModule mod, Target target) {
-  using tvm::runtime::Registry;
-  bool output_ssa = false;
-  CodeGenVivadoHLS cg;
-
-  // Generate source code for get_source().
-  cg.Init(output_ssa);
-
-  Map<GlobalVar, PrimFunc> functions;
-  for (auto [gvar, base_func] : mod->functions) {
-    ICHECK(base_func->IsInstance<PrimFuncNode>()) << "CodeGenVHLS: Can only take PrimFunc";
-    auto prim_func = Downcast<PrimFunc>(base_func);
-    auto calling_conv = prim_func->GetAttr<Integer>(tvm::attr::kCallingConv);
-    ICHECK(calling_conv == CallingConv::kDeviceKernelLaunch)
-        << "CodeGenVLHS: expect calling_conv equals CallingConv::kDeviceKernelLaunch";
-    functions.Set(gvar, prim_func);
-  }
-
-  for (auto [gvar, prim_func] : functions) {
-    cg.DeclareFunction(gvar, prim_func);
-  }
-  for (auto [gvar, prim_func] : functions) {
-    cg.AddFunction(gvar, prim_func);
-  }
-
-  std::string whole_code = cg.Finish();
-
-  // Generate source code for compilation.
-  Array<Array<runtime::String>> kernel_info;
-
-  for (auto [gvar, prim_func] : functions) {
-    CodeGenVivadoHLS cg;
-    cg.Init(output_ssa);
-
-    for (auto [other_gvar, other_prim_func] : functions) {
-      cg.DeclareFunction(other_gvar, other_prim_func);
-    }
-    cg.AddFunction(gvar, prim_func);
-    std::string code = cg.Finish();
-    if (const auto* f = runtime::Registry::Get("tvm_callback_vhls_postproc")) {
-      code = (*f)(code, target).operator std::string();
-    }
-
-    auto function_name = cg.GetFunctionName(gvar);
-    kernel_info.push_back({function_name, code});
-  }
-
-  std::string xclbin;
-  if (const auto* f = Registry::Get("tvm_callback_sdaccel_compile")) {
-    xclbin = (*f)(kernel_info, target).operator std::string();
-  } else {
-    LOG(FATAL) << "Cannot compile Vivado HLS code.";
-  }
-  return SDAccelModuleCreate(xclbin, "xclbin", ExtractFuncInfo(mod), whole_code);
-}
-
-TVM_REGISTER_GLOBAL("target.build.sdaccel").set_body_typed(BuildSDAccel);
-
-}  // namespace codegen
-}  // namespace tvm
diff --git a/src/target/source/codegen_vhls.h b/src/target/source/codegen_vhls.h
deleted file mode 100644
index d8ba2b687496..000000000000
--- a/src/target/source/codegen_vhls.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file codegen_vhls.h
- * \brief Utility to generate vhls code
- */
-#ifndef TVM_TARGET_SOURCE_CODEGEN_VHLS_H_
-#define TVM_TARGET_SOURCE_CODEGEN_VHLS_H_
-
-#include <tvm/target/codegen.h>
-#include <tvm/target/target.h>
-#include <tvm/tir/expr.h>
-
-#include "codegen_c.h"
-
-namespace tvm {
-namespace codegen {
-
-class CodeGenVivadoHLS final : public CodeGenC {
- public:
-  void Init(bool output_ssa);
-  void PrintType(DataType t, std::ostream& os);
-
-  void PrintFuncPrefix(std::ostream& os) final;
-  void PreFunctionBody(const PrimFunc& f) final;
-  void VisitExpr_(const MinNode* op, std::ostream& os) final;
-  void VisitExpr_(const MaxNode* op, std::ostream& os) final;
-};
-
-}  // namespace codegen
-}  // namespace tvm
-
-#endif  // TVM_TARGET_SOURCE_CODEGEN_VHLS_H_
diff --git a/src/target/source/intrin_rule_aocl.cc b/src/target/source/intrin_rule_aocl.cc
deleted file mode 100644
index 599e62f3f31c..000000000000
--- a/src/target/source/intrin_rule_aocl.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file intrin_rule_aocl.cc
- * \brief AOCL intrinsic rules.
- */
-#include <tvm/tir/op_attr_types.h>
-
-#include "../intrin_rule.h"
-
-namespace tvm {
-namespace codegen {
-namespace intrin {
-using tir::FLowerIntrinsic;
-
-TVM_REGISTER_OP("tir.floor")
-    .set_attr<FLowerIntrinsic>("aocl.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.ceil")
-    .set_attr<FLowerIntrinsic>("aocl.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.trunc")
-    .set_attr<FLowerIntrinsic>("aocl.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.fabs")
-    .set_attr<FLowerIntrinsic>("aocl.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.round")
-    .set_attr<FLowerIntrinsic>("aocl.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.nearbyint")
-    .set_attr<FLowerIntrinsic>("aocl.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.exp").set_attr<FLowerIntrinsic>("aocl.FLowerIntrinsic",
-                                                     DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.log").set_attr<FLowerIntrinsic>("aocl.FLowerIntrinsic",
-                                                     DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.tanh")
-    .set_attr<FLowerIntrinsic>("aocl.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.sqrt")
-    .set_attr<FLowerIntrinsic>("aocl.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.pow").set_attr<FLowerIntrinsic>("aocl.FLowerIntrinsic",
-                                                     DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.popcount")
-    .set_attr<FLowerIntrinsic>("aocl.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.floor")
-    .set_attr<FLowerIntrinsic>("aocl_sw_emu.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.ceil")
-    .set_attr<FLowerIntrinsic>("aocl_sw_emu.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.trunc")
-    .set_attr<FLowerIntrinsic>("aocl_sw_emu.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.fabs")
-    .set_attr<FLowerIntrinsic>("aocl_sw_emu.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.round")
-    .set_attr<FLowerIntrinsic>("aocl_sw_emu.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.nearbyint")
-    .set_attr<FLowerIntrinsic>("aocl_sw_emu.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.exp").set_attr<FLowerIntrinsic>("aocl_sw_emu.FLowerIntrinsic",
-                                                     DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.log").set_attr<FLowerIntrinsic>("aocl_sw_emu.FLowerIntrinsic",
-                                                     DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.tanh")
-    .set_attr<FLowerIntrinsic>("aocl_sw_emu.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.sqrt")
-    .set_attr<FLowerIntrinsic>("aocl_sw_emu.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.pow").set_attr<FLowerIntrinsic>("aocl_sw_emu.FLowerIntrinsic",
-                                                     DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.popcount")
-    .set_attr<FLowerIntrinsic>("aocl_sw_emu.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-}  // namespace intrin
-}  // namespace codegen
-}  // namespace tvm
diff --git a/src/target/source/intrin_rule_vhls.cc b/src/target/source/intrin_rule_vhls.cc
deleted file mode 100644
index 7bfd7cd13659..000000000000
--- a/src/target/source/intrin_rule_vhls.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file intrin_rule_vhls.cc
- * \brief VHLS intrinsic rules.
- */
-#include <tvm/tir/op_attr_types.h>
-
-#include "../intrin_rule.h"
-
-namespace tvm {
-namespace codegen {
-namespace intrin {
-using tir::FLowerIntrinsic;
-
-TVM_REGISTER_OP("tir.floor")
-    .set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.ceil")
-    .set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.trunc")
-    .set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.fabs")
-    .set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.round")
-    .set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.nearbyint")
-    .set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.exp").set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic",
-                                                     DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.exp2")
-    .set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.exp10")
-    .set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.log").set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic",
-                                                     DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.log2")
-    .set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.log10")
-    .set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.tanh")
-    .set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.sqrt")
-    .set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.pow").set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic",
-                                                     DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.popcount")
-    .set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.sin").set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic",
-                                                     DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.sinh")
-    .set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.cos").set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic",
-                                                     DispatchPureExtern<Direct>);
-
-TVM_REGISTER_OP("tir.cosh")
-    .set_attr<FLowerIntrinsic>("sdaccel.FLowerIntrinsic", DispatchPureExtern<Direct>);
-
-}  // namespace intrin
-}  // namespace codegen
-}  // namespace tvm
diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index 3605831da1c7..62ba2787a367 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -425,15 +425,6 @@ TVM_REGISTER_TARGET_KIND("webgpu", kDLWebGPU)
     .add_attr_option<runtime::Int>("max_num_threads", runtime::Int(256))
     .set_default_keys({"webgpu", "gpu"});
 
-TVM_REGISTER_TARGET_KIND("sdaccel", kDLOpenCL)  // line break
-    .set_default_keys({"sdaccel", "hls"});
-
-TVM_REGISTER_TARGET_KIND("aocl", kDLAOCL)  // line break
-    .set_default_keys({"aocl", "hls"});
-
-TVM_REGISTER_TARGET_KIND("aocl_sw_emu", kDLAOCL)  // line break
-    .set_default_keys({"aocl", "hls"});
-
 TVM_REGISTER_TARGET_KIND("hexagon", kDLHexagon)
     .add_attr_option<Array<String>>("mattr")
     .add_attr_option<String>("mcpu")
diff --git a/src/tir/analysis/verify_memory.cc b/src/tir/analysis/verify_memory.cc
index a990230e043a..4f9c3e5edd8f 100644
--- a/src/tir/analysis/verify_memory.cc
+++ b/src/tir/analysis/verify_memory.cc
@@ -57,7 +57,7 @@ class MemoryAccessVerifier final : protected StmtExprVisitor {
 
   /// Interface to perform memory access verification
   void Run() {
-    if (!IsGPUDevice(dev_type_) && !IsFPGADevice(dev_type_)) return;
+    if (!IsGPUDevice(dev_type_)) return;
     StmtExprVisitor::VisitStmt(func_->body);
   }
 
@@ -152,8 +152,6 @@ class MemoryAccessVerifier final : protected StmtExprVisitor {
     return kDLCUDA == dev_type || kDLOpenCL == dev_type || kDLVulkan == dev_type ||
            kDLMetal == dev_type || kDLROCM == dev_type || kOpenGL == dev_type;
   }
-  /// Check if a given DLDeviceType/TVMDeviceExtType value denotes FPGA device.
-  static bool IsFPGADevice(int dev_type) { return kDLSDAccel == dev_type || kDLAOCL == dev_type; }
 
  private:
   /// Status of visitor
diff --git a/tests/scripts/task_python_integration_gpuonly.sh b/tests/scripts/task_python_integration_gpuonly.sh
index 432984c95561..b01320db1f36 100755
--- a/tests/scripts/task_python_integration_gpuonly.sh
+++ b/tests/scripts/task_python_integration_gpuonly.sh
@@ -18,7 +18,7 @@
 
 set -exo pipefail
 
-export TVM_TEST_TARGETS="cuda;opencl;metal;rocm;nvptx;opencl -device=mali,aocl_sw_emu,adreno"
+export TVM_TEST_TARGETS="cuda;opencl;metal;rocm;nvptx;opencl -device=mali,adreno"
 export PYTEST_ADDOPTS="-m gpu $PYTEST_ADDOPTS"
 export TVM_RELAY_TEST_TARGETS="cuda"
 export TVM_RELAY_OPENCL_TEXTURE_TARGETS="opencl -device=adreno"
diff --git a/tests/scripts/task_python_unittest_gpuonly.sh b/tests/scripts/task_python_unittest_gpuonly.sh
index 6c4e642b0c6b..6d28d8f30f8b 100755
--- a/tests/scripts/task_python_unittest_gpuonly.sh
+++ b/tests/scripts/task_python_unittest_gpuonly.sh
@@ -21,7 +21,7 @@ set -euxo pipefail
 export PYTEST_ADDOPTS="-m gpu ${PYTEST_ADDOPTS:-}"
 
 # Test most of the enabled runtimes here.
-export TVM_TEST_TARGETS="cuda;opencl;metal;rocm;nvptx;opencl -device=mali,aocl_sw_emu"
+export TVM_TEST_TARGETS="cuda;opencl;metal;rocm;nvptx;opencl -device=mali"
 export TVM_UNITTEST_TESTSUITE_NAME=python-unittest-gpu
 
 ./tests/scripts/task_python_unittest.sh

From 24a7f8a4c989acc37a2d3d2c236b50be64bca2c7 Mon Sep 17 00:00:00 2001
From: Siyuan Feng <hzfengsy@sjtu.edu.cn>
Date: Mon, 17 Feb 2025 14:40:51 +0800
Subject: [PATCH 05/10] remove opengl

---
 cmake/config.cmake                                | 3 ---
 docker/Dockerfile.demo_opencl                     | 8 ++------
 golang/src/device.go                              | 7 -------
 include/tvm/runtime/c_runtime_api.h               | 4 +---
 include/tvm/runtime/packed_func.h                 | 2 --
 jvm/core/src/main/java/org/apache/tvm/Device.java | 2 +-
 python/tvm/_ffi/runtime_ctypes.py                 | 2 --
 src/tir/analysis/verify_memory.cc                 | 2 +-
 tests/scripts/task_config_build_gpu.sh            | 1 -
 9 files changed, 5 insertions(+), 26 deletions(-)

diff --git a/cmake/config.cmake b/cmake/config.cmake
index 856803898a02..6345c0c06199 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -114,9 +114,6 @@ set(USE_KHRONOS_SPIRV OFF)
 # whether enable SPIRV_KHR_DOT_PRODUCT
 set(USE_SPIRV_KHR_INTEGER_DOT_PRODUCT OFF)
 
-# Whether enable OpenGL runtime
-set(USE_OPENGL OFF)
-
 # Whether enable RPC runtime
 set(USE_RPC ON)
 
diff --git a/docker/Dockerfile.demo_opencl b/docker/Dockerfile.demo_opencl
index 52995496fdfe..f7f1ebe20046 100644
--- a/docker/Dockerfile.demo_opencl
+++ b/docker/Dockerfile.demo_opencl
@@ -24,7 +24,7 @@ FROM ubuntu:22.04
 COPY utils/apt-install-and-clear.sh /usr/local/bin/apt-install-and-clear
 
 RUN echo "Labelling this image"
-LABEL Description="Docker image for TVM built with OpenCL & OpenGL support"
+LABEL Description="Docker image for TVM built with OpenCL support"
 
 RUN echo "Preparing to install dependencies"
 RUN apt-get update
@@ -49,10 +49,6 @@ RUN echo "Installing OpenCL libraries"
 RUN apt-install-and-clear -y libviennacl-dev mesa-opencl-icd ocl-icd-opencl-dev clinfo
 RUN apt-install-and-clear -y libclblas-dev libclfft-dev libclsparse-dev
 
-RUN echo "Installing OpenGL libraries"
-RUN apt-install-and-clear -y libcogl-dev libegl1 libgles1 libglfw3-dev
-# libglew-dev
-
 RUN echo "Upgrading dependencies"
 RUN apt-get upgrade -y
 
@@ -70,7 +66,7 @@ ENV TVM_HOME="/usr/tvm"
 ENV TVM_BUILD_DIR="${TVM_HOME}/build"
 RUN mkdir -p ${TVM_BUILD_DIR} && \
 	cd ${TVM_BUILD_DIR} && \
-	cmake .. -DUSE_BLAS=openblas -DUSE_LLVM=ON -DUSE_OPENCL=ON -DUSE_OPENGL=ON && \
+	cmake .. -DUSE_BLAS=openblas -DUSE_LLVM=ON -DUSE_OPENCL=ON && \
 	make -j6
 
 RUN echo "Building Python package"
diff --git a/golang/src/device.go b/golang/src/device.go
index 1a40b77f491a..2918cf6a0f0f 100644
--- a/golang/src/device.go
+++ b/golang/src/device.go
@@ -43,8 +43,6 @@ var KDLVPI                  = int32(C.kDLVPI)
 var KDLROCM                 = int32(C.kDLROCM)
 // KDLVulkan is golang enum correspond to TVM device type kDLVulkan.
 var KDLVulkan               = int32(C.kDLVulkan)
-// KOpenGL is golang enum correspond to TVM device type kOpenGL.
-var KOpenGL                 = int32(C.kOpenGL)
 // KExtDev is golang enum correspond to TVM device type kDLExtDev.
 var KExtDev                 = int32(C.kDLExtDev)
 
@@ -93,8 +91,3 @@ func ROCM(index int32) Device {
 func Vulkan(index int32) Device {
     return Device{KDLVulkan, index}
 }
-
-// OpenGL returns the Device object for OpenGL target on given index
-func OpenGL(index int32) Device {
-    return Device{KOpenGL, index}
-}
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 00a7964230d8..277ac048acf5 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -119,8 +119,7 @@ typedef enum {
   // To help avoid accidental conflicts between `DLDeviceType`
   // and this enumeration, start numbering the new enumerators
   // a little higher than (currently) seems necessary.
-  kOpenGL = 34,
-  kDLMicroDev,
+  kDLMicroDev=35,
   TVMDeviceExtType_End,  // sentinel value
 } TVMDeviceExtType;
 
@@ -147,7 +146,6 @@ static_assert(kDLOneAPI == 14, TVM_HARCODED_INTEGER_CHANGED_MSG);
 static_assert(kDLWebGPU == 15, TVM_HARCODED_INTEGER_CHANGED_MSG);
 static_assert(kDLHexagon == 16, TVM_HARCODED_INTEGER_CHANGED_MSG);
 
-static_assert(kOpenGL == 34, TVM_HARCODED_INTEGER_CHANGED_MSG);
 static_assert(kDLMicroDev == 35, TVM_HARCODED_INTEGER_CHANGED_MSG);
 #undef TVM_HARCODED_INTEGER_CHANGED_MSG
 #endif
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index b7eca8be1e5a..9b31481b9d9e 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -1478,8 +1478,6 @@ inline const char* DLDeviceType2Str(int type) {
       return "webgpu";
     case kDLHexagon:
       return "hexagon";
-    case kOpenGL:
-      return "opengl";
     case kDLMicroDev:
       return "microdev";
     default:
diff --git a/jvm/core/src/main/java/org/apache/tvm/Device.java b/jvm/core/src/main/java/org/apache/tvm/Device.java
index edea0b75df88..8390d88cb912 100644
--- a/jvm/core/src/main/java/org/apache/tvm/Device.java
+++ b/jvm/core/src/main/java/org/apache/tvm/Device.java
@@ -29,7 +29,7 @@ public class Device {
   static final int kDLCPU = 1, kDLCUDA = 2, kDLCUDAHost = 3, kDLOpenCL = 4, kDLVulkan = 7,
                    kDLMetal = 8, kDLVPI = 9, kDLROCM = 10, kDLROCMHost = 11, kDLExtDev = 12,
                    kDLCUDAManaged = 13, kDLOneAPI = 14, kDLWebGPU = 15, kDLHexagon = 16,
-                   kOpenGL = 34, kDLMicroDev = 35;
+                   kDLMicroDev = 35;
 
   private static final Map<Integer, String> MASK2STR = new HashMap<Integer, String>();
   private static final Map<String, Integer> STR2MASK = new HashMap<String, Integer>();
diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py
index f3b02ed56939..3b7e82430b5a 100644
--- a/python/tvm/_ffi/runtime_ctypes.py
+++ b/python/tvm/_ffi/runtime_ctypes.py
@@ -264,7 +264,6 @@ class Device(ctypes.Structure):
     kDLOneAPI = 14
     kDLWebGPU = 15
     kDLHexagon = 16
-    kOpenGL = 34
     kDLMicroDev = 35
 
     _fields_ = [("device_type", ctypes.c_int), ("device_id", ctypes.c_int)]
@@ -283,7 +282,6 @@ class Device(ctypes.Structure):
         kDLOneAPI: "oneapi",
         kDLWebGPU: "webgpu",
         kDLHexagon: "hexagon",
-        kOpenGL: "opengl",
         kDLMicroDev: "microdev",
     }
 
diff --git a/src/tir/analysis/verify_memory.cc b/src/tir/analysis/verify_memory.cc
index 4f9c3e5edd8f..e891824d79b5 100644
--- a/src/tir/analysis/verify_memory.cc
+++ b/src/tir/analysis/verify_memory.cc
@@ -150,7 +150,7 @@ class MemoryAccessVerifier final : protected StmtExprVisitor {
   /// Check if a given DLDeviceType/TVMDeviceExtType value denotes GPU device.
   static bool IsGPUDevice(int dev_type) {
     return kDLCUDA == dev_type || kDLOpenCL == dev_type || kDLVulkan == dev_type ||
-           kDLMetal == dev_type || kDLROCM == dev_type || kOpenGL == dev_type;
+           kDLMetal == dev_type || kDLROCM == dev_type;
   }
 
  private:
diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh
index 2d5600c51369..41a42676f139 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -27,7 +27,6 @@ echo set\(USE_CUBLAS ON\) >> config.cmake
 echo set\(USE_CUDNN ON\) >> config.cmake
 echo set\(USE_CUDA ON\) >> config.cmake
 echo set\(USE_VULKAN ON\) >> config.cmake
-echo set\(USE_OPENGL ON\) >> config.cmake
 echo set\(USE_OPENCL ON\) >> config.cmake
 echo set\(USE_OPENCL_GTEST \"/googletest\"\) >> config.cmake
 echo set\(USE_LLVM \"/usr/bin/llvm-config-15 --link-static\"\) >> config.cmake

From 78337e089c988c18cbe913a4621ddf146ea92149 Mon Sep 17 00:00:00 2001
From: Siyuan Feng <hzfengsy@sjtu.edu.cn>
Date: Mon, 17 Feb 2025 14:43:38 +0800
Subject: [PATCH 06/10] remove microdev and antlr

---
 cmake/config.cmake                                | 7 -------
 docker/Dockerfile.ci_cpu                          | 4 ----
 docker/Dockerfile.ci_hexagon                      | 4 ----
 include/tvm/runtime/c_runtime_api.h               | 4 +---
 include/tvm/runtime/device_api.h                  | 4 +---
 include/tvm/runtime/packed_func.h                 | 2 --
 jvm/core/src/main/java/org/apache/tvm/Device.java | 3 +--
 python/tvm/_ffi/runtime_ctypes.py                 | 2 --
 src/driver/driver_api.cc                          | 4 +---
 src/runtime/profiling.cc                          | 6 ------
 tests/scripts/task_config_build_cpu.sh            | 1 -
 tests/scripts/task_config_build_gpu.sh            | 1 -
 tests/scripts/task_config_build_wasm.sh           | 1 -
 13 files changed, 4 insertions(+), 39 deletions(-)

diff --git a/cmake/config.cmake b/cmake/config.cmake
index 6345c0c06199..17f587efd4c7 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -278,13 +278,6 @@ set(USE_CLML OFF)
 # USE_CLML_GRAPH_EXECUTOR - CLML SDK PATH or ON or OFF
 set(USE_CLML_GRAPH_EXECUTOR OFF)
 
-# Build ANTLR parser for Relay text format
-# Possible values:
-# - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar)
-# - OFF: disable ANTLR
-# - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file
-set(USE_ANTLR OFF)
-
 # Whether to enable debug code that may cause ABI changes
 set(TVM_DEBUG_WITH_ABI_CHANGE OFF)
 
diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index 5ba1dd721435..18b4d7baab67 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -77,10 +77,6 @@ COPY install/ubuntu_install_golang.sh /install/ubuntu_install_golang.sh
 RUN bash /install/ubuntu_install_golang.sh
 ENV PATH $PATH:/usr/lib/go-1.18/bin
 
-# ANTLR deps
-COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh
-RUN bash /install/ubuntu_install_java.sh
-
 # BYODT deps
 COPY install/ubuntu_install_universal.sh /install/ubuntu_install_universal.sh
 RUN bash /install/ubuntu_install_universal.sh
diff --git a/docker/Dockerfile.ci_hexagon b/docker/Dockerfile.ci_hexagon
index 489894d252ae..4410bfacb35b 100644
--- a/docker/Dockerfile.ci_hexagon
+++ b/docker/Dockerfile.ci_hexagon
@@ -54,10 +54,6 @@ ENV PATH $PATH:$CARGO_HOME/bin
 COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
 RUN bash /install/ubuntu_install_python_package.sh
 
-# ANTLR deps
-COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh
-RUN bash /install/ubuntu_install_java.sh
-
 # Hexagon
 COPY install/ubuntu_install_hexagon.sh /install/ubuntu_install_hexagon.sh
 RUN bash /install/ubuntu_install_hexagon.sh
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 277ac048acf5..ef25cbe9f71f 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -119,8 +119,7 @@ typedef enum {
   // To help avoid accidental conflicts between `DLDeviceType`
   // and this enumeration, start numbering the new enumerators
   // a little higher than (currently) seems necessary.
-  kDLMicroDev=35,
-  TVMDeviceExtType_End,  // sentinel value
+  TVMDeviceExtType_End = 36,  // sentinel value
 } TVMDeviceExtType;
 
 #ifdef __cplusplus
@@ -146,7 +145,6 @@ static_assert(kDLOneAPI == 14, TVM_HARCODED_INTEGER_CHANGED_MSG);
 static_assert(kDLWebGPU == 15, TVM_HARCODED_INTEGER_CHANGED_MSG);
 static_assert(kDLHexagon == 16, TVM_HARCODED_INTEGER_CHANGED_MSG);
 
-static_assert(kDLMicroDev == 35, TVM_HARCODED_INTEGER_CHANGED_MSG);
 #undef TVM_HARCODED_INTEGER_CHANGED_MSG
 #endif
 
diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index f27bfdacb570..2564b73d1e94 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -237,9 +237,7 @@ class TVM_DLL DeviceAPI {
    *        before launching the kernel function.
    * \param device_type The device type.
    */
-  static bool NeedSetDevice(int device_type) {
-    return device_type != kDLCPU && device_type != kDLMicroDev;
-  }
+  static bool NeedSetDevice(int device_type) { return device_type != kDLCPU; }
 
   /*!
    * \brief Whether pointer arithmetics on a device owned pointer may be performed on the host.
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 9b31481b9d9e..07a6848bfeed 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -1478,8 +1478,6 @@ inline const char* DLDeviceType2Str(int type) {
       return "webgpu";
     case kDLHexagon:
       return "hexagon";
-    case kDLMicroDev:
-      return "microdev";
     default:
       LOG(FATAL) << "unknown type = " << type;
   }
diff --git a/jvm/core/src/main/java/org/apache/tvm/Device.java b/jvm/core/src/main/java/org/apache/tvm/Device.java
index 8390d88cb912..337f62b74c4e 100644
--- a/jvm/core/src/main/java/org/apache/tvm/Device.java
+++ b/jvm/core/src/main/java/org/apache/tvm/Device.java
@@ -28,8 +28,7 @@ public class Device {
    */
   static final int kDLCPU = 1, kDLCUDA = 2, kDLCUDAHost = 3, kDLOpenCL = 4, kDLVulkan = 7,
                    kDLMetal = 8, kDLVPI = 9, kDLROCM = 10, kDLROCMHost = 11, kDLExtDev = 12,
-                   kDLCUDAManaged = 13, kDLOneAPI = 14, kDLWebGPU = 15, kDLHexagon = 16,
-                   kDLMicroDev = 35;
+                   kDLCUDAManaged = 13, kDLOneAPI = 14, kDLWebGPU = 15, kDLHexagon = 16;
 
   private static final Map<Integer, String> MASK2STR = new HashMap<Integer, String>();
   private static final Map<String, Integer> STR2MASK = new HashMap<String, Integer>();
diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py
index 3b7e82430b5a..f79df1644e28 100644
--- a/python/tvm/_ffi/runtime_ctypes.py
+++ b/python/tvm/_ffi/runtime_ctypes.py
@@ -264,7 +264,6 @@ class Device(ctypes.Structure):
     kDLOneAPI = 14
     kDLWebGPU = 15
     kDLHexagon = 16
-    kDLMicroDev = 35
 
     _fields_ = [("device_type", ctypes.c_int), ("device_id", ctypes.c_int)]
     MASK2STR = {
@@ -282,7 +281,6 @@ class Device(ctypes.Structure):
         kDLOneAPI: "oneapi",
         kDLWebGPU: "webgpu",
         kDLHexagon: "hexagon",
-        kDLMicroDev: "microdev",
     }
 
     STR2MASK = {
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index 83ed8d261d2f..5b12f13d96a6 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -288,7 +288,6 @@ IRModule ApplyPasses(IRModule mod, transform::Sequential seq) {
   return mod;
 }
 
-
 IRModule LowerModule(IRModule mod, bool simple_mode) {
   Array<transform::Pass> pass_list = CreatePassList(simple_mode);
   return LowerWithPassList(std::move(mod), pass_list);
@@ -379,8 +378,7 @@ runtime::Module TIRToRuntime(const Map<Target, IRModule>& inputs_arg,
 
   if (!target_host.defined()) {
     for (const auto& it : inputs) {
-      if (it.first->GetTargetDeviceType() == kDLCPU ||
-          it.first->GetTargetDeviceType() == kDLMicroDev) {
+      if (it.first->GetTargetDeviceType() == kDLCPU) {
         target_host = it.first;
         break;
       }
diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc
index 83be98556a9e..44790d7e7451 100644
--- a/src/runtime/profiling.cc
+++ b/src/runtime/profiling.cc
@@ -863,12 +863,6 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat,
                              int repeats_to_cooldown, int cache_flush_bytes, PackedFunc f_preproc) {
   ICHECK(pf != nullptr);
 
-  if (static_cast<int>(dev.device_type) == static_cast<int>(kDLMicroDev)) {
-    auto get_micro_time_evaluator = runtime::Registry::Get("micro._GetMicroTimeEvaluator");
-    ICHECK(get_micro_time_evaluator != nullptr) << "micro backend not enabled";
-    return (*get_micro_time_evaluator)(pf, dev, number, repeat);
-  }
-
   auto ftimer = [pf, dev, number, repeat, min_repeat_ms, limit_zero_time_iterations,
                  cooldown_interval_ms, repeats_to_cooldown, cache_flush_bytes,
                  f_preproc](TVMArgs args, TVMRetValue* rv) mutable {
diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh
index 42d6c06f7a68..20bf6100f4a1 100755
--- a/tests/scripts/task_config_build_cpu.sh
+++ b/tests/scripts/task_config_build_cpu.sh
@@ -27,7 +27,6 @@ echo set\(USE_SORT ON\) >> config.cmake
 echo set\(USE_DNNL ON\) >> config.cmake
 echo set\(USE_ARM_COMPUTE_LIB ON\) >> config.cmake
 echo set\(USE_LLVM \"/usr/bin/llvm-config-17 --link-static\"\) >> config.cmake
-echo set\(USE_ANTLR ON\) >> config.cmake
 echo set\(CMAKE_CXX_FLAGS \"-Wno-error=range-loop-construct -Wno-error=comment\"\) >> config.cmake
 echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake
 
diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh
index 41a42676f139..244ce4b8a504 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -33,7 +33,6 @@ echo set\(USE_LLVM \"/usr/bin/llvm-config-15 --link-static\"\) >> config.cmake
 echo set\(USE_RPC ON\) >> config.cmake
 echo set\(USE_SORT ON\) >> config.cmake
 echo set\(USE_STACKVM_RUNTIME ON\) >> config.cmake
-echo set\(USE_ANTLR ON\) >> config.cmake
 echo set\(USE_BLAS openblas\) >> config.cmake
 echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake
 echo set\(USE_TENSORRT_CODEGEN ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_wasm.sh b/tests/scripts/task_config_build_wasm.sh
index daac6c0a0c34..11526082e481 100755
--- a/tests/scripts/task_config_build_wasm.sh
+++ b/tests/scripts/task_config_build_wasm.sh
@@ -25,7 +25,6 @@ cp ../cmake/config.cmake .
 
 echo set\(USE_SORT ON\) >> config.cmake
 echo set\(USE_LLVM llvm-config-15\) >> config.cmake
-echo set\(USE_ANTLR ON\) >> config.cmake
 echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake
 echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake

From e298f3c41bab81a80252faa938921ba1b8c5811b Mon Sep 17 00:00:00 2001
From: Siyuan Feng <hzfengsy@sjtu.edu.cn>
Date: Mon, 17 Feb 2025 14:54:10 +0800
Subject: [PATCH 07/10] remove frontends

---
 docker/Dockerfile.ci_arm                  |  4 --
 docker/Dockerfile.ci_cpu                  | 12 ----
 docker/Dockerfile.ci_gpu                  | 16 -----
 docker/Dockerfile.ci_hexagon              |  4 --
 docker/Dockerfile.demo_cpu                |  2 +-
 docker/install/ubuntu_install_caffe.sh    | 69 --------------------
 docker/install/ubuntu_install_caffe2.sh   | 39 ------------
 docker/install/ubuntu_install_cmsis.sh    | 49 --------------
 docker/install/ubuntu_install_darknet.sh  | 29 ---------
 docker/install/ubuntu_install_mxnet.sh    | 23 -------
 docker/install/ubuntu_install_nnef.sh     | 25 --------
 docker/install/ubuntu_install_oneflow.sh  | 25 --------
 docker/install/ubuntu_install_paddle.sh   | 23 -------
 docker/python/ci-constraints.txt          | 10 ---
 python/gen_requirements.py                | 39 ------------
 python/tvm/contrib/mxnet.py               | 78 -----------------------
 tests/python/conftest.py                  |  9 ---
 tests/python/contrib/test_mxnet_bridge.py | 63 ------------------
 tests/scripts/release/make_notes.py       |  5 --
 19 files changed, 1 insertion(+), 523 deletions(-)
 delete mode 100755 docker/install/ubuntu_install_caffe.sh
 delete mode 100755 docker/install/ubuntu_install_caffe2.sh
 delete mode 100755 docker/install/ubuntu_install_cmsis.sh
 delete mode 100755 docker/install/ubuntu_install_darknet.sh
 delete mode 100755 docker/install/ubuntu_install_mxnet.sh
 delete mode 100644 docker/install/ubuntu_install_nnef.sh
 delete mode 100755 docker/install/ubuntu_install_oneflow.sh
 delete mode 100755 docker/install/ubuntu_install_paddle.sh
 delete mode 100644 python/tvm/contrib/mxnet.py
 delete mode 100644 tests/python/contrib/test_mxnet_bridge.py

diff --git a/docker/Dockerfile.ci_arm b/docker/Dockerfile.ci_arm
index 16ffecb315e9..2be887079e34 100644
--- a/docker/Dockerfile.ci_arm
+++ b/docker/Dockerfile.ci_arm
@@ -75,10 +75,6 @@ RUN bash /install/ubuntu_install_tflite.sh
 COPY install/ubuntu_install_onnx.sh /install/ubuntu_install_onnx.sh
 RUN bash /install/ubuntu_install_onnx.sh
 
-# NNEF
-COPY install/ubuntu_install_nnef.sh /install/ubuntu_install_nnef.sh
-RUN bash /install/ubuntu_install_nnef.sh
-
 # AutoTVM deps
 COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
 RUN bash /install/ubuntu_install_redis.sh
diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index 18b4d7baab67..7a95ebff8e5c 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -53,10 +53,6 @@ RUN bash /install/ubuntu_install_dnnl.sh
 COPY install/ubuntu_install_papi.sh /install/ubuntu_install_papi.sh
 RUN bash /install/ubuntu_install_papi.sh ""
 
-# Install MxNet for access to Gluon Model Zoo.
-COPY install/ubuntu_install_mxnet.sh /install/ubuntu_install_mxnet.sh
-RUN bash /install/ubuntu_install_mxnet.sh
-
 # Rust env (build early; takes a while)
 COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh
 RUN bash /install/ubuntu_install_rust.sh
@@ -97,10 +93,6 @@ RUN bash /install/ubuntu_install_jax.sh "cpu"
 COPY install/ubuntu_download_arm_compute_lib_binaries.sh /install/ubuntu_download_arm_compute_lib_binaries.sh
 RUN bash /install/ubuntu_download_arm_compute_lib_binaries.sh
 
-# PaddlePaddle deps
-COPY install/ubuntu_install_paddle.sh /install/ubuntu_install_paddle.sh
-RUN bash /install/ubuntu_install_paddle.sh
-
 # sccache
 COPY install/ubuntu_install_sccache.sh /install/ubuntu_install_sccache.sh
 RUN bash /install/ubuntu_install_sccache.sh
@@ -114,10 +106,6 @@ RUN bash /install/ubuntu_install_libxsmm.sh
 COPY install/ubuntu_install_onnx.sh /install/ubuntu_install_onnx.sh
 RUN bash /install/ubuntu_install_onnx.sh
 
-# NNEF
-COPY install/ubuntu_install_nnef.sh /install/ubuntu_install_nnef.sh
-RUN bash /install/ubuntu_install_nnef.sh
-
 # AArch64 Architecture Envelope Model (AEM)
 COPY install/ubuntu_install_aprofile_aem.sh /install
 RUN bash /install/ubuntu_install_aprofile_aem.sh
diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
index cc9e48831776..5fa0df923116 100644
--- a/docker/Dockerfile.ci_gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -76,9 +76,6 @@ COPY install/ubuntu_install_rocm.sh /install/ubuntu_install_rocm.sh
 RUN bash /install/ubuntu_install_rocm.sh
 
 # DL Frameworks
-COPY install/ubuntu_install_mxnet.sh /install/ubuntu_install_mxnet.sh
-RUN bash /install/ubuntu_install_mxnet.sh
-
 COPY install/ubuntu_install_gluoncv.sh /install/ubuntu_install_gluoncv.sh
 RUN bash /install/ubuntu_install_gluoncv.sh
 
@@ -91,9 +88,6 @@ RUN bash /install/ubuntu_install_tensorflow.sh
 COPY install/ubuntu_install_jax.sh /install/ubuntu_install_jax.sh
 RUN bash /install/ubuntu_install_jax.sh "cuda"
 
-COPY install/ubuntu_install_darknet.sh /install/ubuntu_install_darknet.sh
-RUN bash /install/ubuntu_install_darknet.sh
-
 COPY install/ubuntu_install_onnx.sh /install/ubuntu_install_onnx.sh
 RUN bash /install/ubuntu_install_onnx.sh
 
@@ -103,9 +97,6 @@ RUN bash /install/ubuntu_install_libtorch.sh
 COPY install/ubuntu_install_tflite.sh /install/ubuntu_install_tflite.sh
 RUN bash /install/ubuntu_install_tflite.sh
 
-COPY install/ubuntu_install_nnef.sh /install/ubuntu_install_nnef.sh
-RUN bash /install/ubuntu_install_nnef.sh
-
 COPY install/ubuntu_install_dgl.sh /install/ubuntu_install_dgl.sh
 RUN bash /install/ubuntu_install_dgl.sh
 
@@ -113,13 +104,6 @@ ENV NVIDIA_DRIVER_CAPABILITIES compute,graphics,utility
 COPY install/ubuntu_install_vulkan.sh /install/ubuntu_install_vulkan.sh
 RUN bash /install/ubuntu_install_vulkan.sh
 
-COPY install/ubuntu_install_paddle.sh /install/ubuntu_install_paddle.sh
-RUN bash /install/ubuntu_install_paddle.sh
-
-# OneFlow deps
-COPY install/ubuntu_install_oneflow.sh /install/ubuntu_install_oneflow.sh
-RUN bash /install/ubuntu_install_oneflow.sh
-
 # Rust env (build early; takes a while)
 COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh
 RUN bash /install/ubuntu_install_rust.sh
diff --git a/docker/Dockerfile.ci_hexagon b/docker/Dockerfile.ci_hexagon
index 4410bfacb35b..0d0c6c034f84 100644
--- a/docker/Dockerfile.ci_hexagon
+++ b/docker/Dockerfile.ci_hexagon
@@ -80,10 +80,6 @@ RUN bash /install/ubuntu_install_tflite.sh
 COPY install/ubuntu_install_onnx.sh /install/ubuntu_install_onnx.sh
 RUN bash /install/ubuntu_install_onnx.sh
 
-# NNEF
-COPY install/ubuntu_install_nnef.sh /install/ubuntu_install_nnef.sh
-RUN bash /install/ubuntu_install_nnef.sh
-
 # xgboost (for tuning)
 COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
 RUN bash /install/ubuntu_install_redis.sh
diff --git a/docker/Dockerfile.demo_cpu b/docker/Dockerfile.demo_cpu
index d081f26423c1..778d21ea781b 100644
--- a/docker/Dockerfile.demo_cpu
+++ b/docker/Dockerfile.demo_cpu
@@ -25,7 +25,7 @@ COPY utils/apt-install-and-clear.sh /usr/local/bin/apt-install-and-clear
 RUN pip3 install matplotlib Image Pillow jupyter[notebook]
 
 # Deep learning frameworks
-RUN pip3 install mxnet tensorflow keras gluoncv dgl
+RUN pip3 install tensorflow keras gluoncv dgl
 
 # Build TVM
 COPY install/install_tvm_cpu.sh /install/install_tvm_cpu.sh
diff --git a/docker/install/ubuntu_install_caffe.sh b/docker/install/ubuntu_install_caffe.sh
deleted file mode 100755
index 1e42270e267a..000000000000
--- a/docker/install/ubuntu_install_caffe.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -euxo pipefail
-
-if [ -z "${TVM_VENV+x}" ]; then
-    echo "ERROR: expect TVM_VENV env var to be set"
-    exit 2
-fi
-
-apt-get update --fix-missing
-
-# # Install dependencies
-apt-install-and-clear -y --no-install-recommends protobuf-compiler \
-    libprotobuf-dev libhdf5-serial-dev libopenblas-dev libgflags-dev libgoogle-glog-dev
-
-
-# install python packages
-pip install "numpy" "protobuf" "scikit-image" "six"
-
-# Build the Caffe and the python wrapper
-echo "Downloading Caffe"
-CAFFE_HOME="/opt/caffe"
-git clone --branch=ssd --depth 1 https://github.com/weiliu89/caffe /caffe_src
-cd /caffe_src
-
-
-echo "Building Caffe"
-mkdir /caffe_src/build && cd /caffe_src/build
-cmake -DCMAKE_INSTALL_PREFIX=${CAFFE_HOME}\
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCPU_ONLY=1 \
-    -Dpython_version=3 \
-    -DUSE_OPENCV=OFF \
-    -DUSE_LEVELDB=OFF \
-    -DUSE_LMDB=OFF \
-    -DBUILD_docs=OFF \
-    -DBLAS=open \
-    ..
-
-make all -j$(expr $(nproc) - 1)
-make pycaffe -j$(expr $(nproc) - 1)
-make test -j$(expr $(nproc) - 1)
-
-echo "Installing Caffe to /opt/caffe"
-make install
-
-echo "Removing build directory"
-cd / && rm -rf /caffe_src
-
-PYCAFFE_ROOT=${CAFFE_HOME}/python
-echo "${CAFFE_HOME}/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig
-site_packages=$("${TVM_VENV}/bin/python3" -c 'import site; print(site.getsitepackages()[0])')
-ln -s ${PYCAFFE_ROOT}/caffe "${site_packages}/caffe"
diff --git a/docker/install/ubuntu_install_caffe2.sh b/docker/install/ubuntu_install_caffe2.sh
deleted file mode 100755
index 1a27bc44ad10..000000000000
--- a/docker/install/ubuntu_install_caffe2.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-set -u
-set -o pipefail
-
-# caffe2.python.module.download generates a progress bar. in non
-# interactive use this results in huge progress debris in the log
-# files.  There is no option to disable the progress bar so work
-# around it by stripping the progress bar output
-
-filter_progress_bar()
-{
-  # Progress bars are the 'goto start of line' escape sequence
-  # ESC[1000D[ repeated, the end of the progress bar is the end of
-  # line.  We can selectively remove progress bars by dropping lines
-  # that beging with the escape sequence.
-  sed "/^\x1b\[1000D/d"
-}
-
-python3 -m caffe2.python.models.download -i -f squeezenet | filter_progress_bar
-python3 -m caffe2.python.models.download -i -f resnet50 | filter_progress_bar
-python3 -m caffe2.python.models.download -i -f vgg19 | filter_progress_bar
diff --git a/docker/install/ubuntu_install_cmsis.sh b/docker/install/ubuntu_install_cmsis.sh
deleted file mode 100755
index 91e0a94197a9..000000000000
--- a/docker/install/ubuntu_install_cmsis.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-set -u
-set -o pipefail
-
-function show_usage() {
-    cat <<EOF
-Usage: docker/install/ubuntu_install_cmsis.sh <INSTALLATION_PATH>
-INSTALLATION_PATH is the installation path for the CMSIS.
-EOF
-}
-
-if [ "$#" -lt 1 -o "$1" == "--help" -o "$1" == "-h" ]; then
-    show_usage
-    exit -1
-fi
-
-INSTALLATION_PATH=$1
-shift
-
-CMSIS_TAG="5.9.0"
-CMSIS_NN_TAG="v4.1.0"
-
-CMSIS_URL="https://github.com/ARM-software/CMSIS_5.git"
-git clone ${CMSIS_URL} --branch ${CMSIS_TAG} --single-branch ${INSTALLATION_PATH}
-
-CMSIS_NN_URL="https://github.com/ARM-software/CMSIS-NN.git"
-git clone ${CMSIS_NN_URL} --branch ${CMSIS_NN_TAG} --single-branch ${INSTALLATION_PATH}/CMSIS-NN
-
-touch "${INSTALLATION_PATH}"/"CMSIS_${CMSIS_TAG}".sha
-touch "${INSTALLATION_PATH}"/"CMSIS_NN_${CMSIS_NN_TAG}".sha
-echo "SUCCESS"
diff --git a/docker/install/ubuntu_install_darknet.sh b/docker/install/ubuntu_install_darknet.sh
deleted file mode 100755
index 8020899f8bf1..000000000000
--- a/docker/install/ubuntu_install_darknet.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-set -u
-set -o pipefail
-
-#install the necessary dependancies, cffi, opencv
-wget -q 'https://github.com/siju-samuel/darknet/blob/master/lib/libdarknet.so?raw=true' -O libdarknet.so
-debian_version=`cat /etc/debian_version`
-
-pip3 install \
-    cffi \
-    opencv-python
diff --git a/docker/install/ubuntu_install_mxnet.sh b/docker/install/ubuntu_install_mxnet.sh
deleted file mode 100755
index aa04d4c19177..000000000000
--- a/docker/install/ubuntu_install_mxnet.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-set -u
-set -o pipefail
-
-pip3 install mxnet==1.6.0
diff --git a/docker/install/ubuntu_install_nnef.sh b/docker/install/ubuntu_install_nnef.sh
deleted file mode 100644
index 6cd4761787c5..000000000000
--- a/docker/install/ubuntu_install_nnef.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-set -u
-set -o pipefail
-
-pip3 install \
-  nnef_tools==1.0.6 \
-  nnef==1.0.7
diff --git a/docker/install/ubuntu_install_oneflow.sh b/docker/install/ubuntu_install_oneflow.sh
deleted file mode 100755
index da2943e2f916..000000000000
--- a/docker/install/ubuntu_install_oneflow.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-set -u
-set -o pipefail
-
-pip3 install \
-    oneflow==0.9.0 \
-    flowvision==0.1.0
diff --git a/docker/install/ubuntu_install_paddle.sh b/docker/install/ubuntu_install_paddle.sh
deleted file mode 100755
index 6cbd6289a16b..000000000000
--- a/docker/install/ubuntu_install_paddle.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-set -u
-set -o pipefail
-
-pip install paddlepaddle==2.4.2
diff --git a/docker/python/ci-constraints.txt b/docker/python/ci-constraints.txt
index 5b0a4da23fbf..d28e81599775 100644
--- a/docker/python/ci-constraints.txt
+++ b/docker/python/ci-constraints.txt
@@ -4,25 +4,18 @@
 # migration tasks if a new version of these packages were to be released. Holding packages back
 # here allows us to decide when to tackle such migration work.
 #keras = "^2.6.0"
-#mxnet = "^1.6.0"
 
 #black = "<21.8b0"  # Breaks tensorflow-gpu. Revisit when tensorflow is upgraded.
 blocklint = "==0.2.3"
-#commonmark = ">=0.7.3"
 cpplint = "==1.6.0"
-#docutils = ">=0.11,<0.17"
 flake8 = "==3.9.2"
 flowvision = "==0.1.0"
-#h5py = "==3.1.0"
 keras = "==2.7"
 jinja2 = "==3.0.3"
-mxnet = "==1.6.0"
 mypy = "==0.902"
-oneflow = "==0.7.0"
 onnx = "==1.10.2"
 onnxruntime = "==1.9.0"
 numpy = "==1.19.3"
-paddlepaddle = "==2.4.1"
 pillow = "==9.1.0"
 pylint = "==2.4.4"
 scipy = "==1.7.3"
@@ -35,6 +28,3 @@ tensorflow-gpu = "==2.7.2"
 tflite = "==2.4.0"
 torch = "==1.11.0"
 torchvision = "==0.12.0+cpu"
-#xgboost = "==1.4.2"
-nnef = "==1.0.7"
-nnef_tools = "==1.0.6"
diff --git a/python/gen_requirements.py b/python/gen_requirements.py
index aafa35d08fe0..dc776324bf90 100644
--- a/python/gen_requirements.py
+++ b/python/gen_requirements.py
@@ -75,41 +75,8 @@
             ],
         ),
     ),
-    # Provide support for Arm(R) Ethos(TM)-U NPU.
-    (
-        "ethosu",
-        (
-            "Requirements for using Arm(R) Ethos(TM)-U NPU",
-            [
-                "ethos-u-vela",
-            ],
-        ),
-    ),
     # Relay frontends.
-    (
-        "importer-caffe",
-        (
-            "Requirements for the Caffe importer",
-            [
-                "numpy",
-                "protobuf",
-                "scikit-image",
-                "six",
-            ],
-        ),
-    ),
-    (
-        "importer-caffe2",
-        (
-            "Requirements for the Caffe2 importer",
-            [
-                "future",  # Hidden dependency of torch.
-                "torch",
-            ],
-        ),
-    ),
     ("importer-coreml", ("Requirements for the CoreML importer", ["coremltools"])),
-    ("importer-darknet", ("Requirements for the DarkNet importer", ["opencv-python"])),
     (
         "importer-keras",
         ("Requirements for the Keras importer", ["tensorflow", "tensorflow-estimator"]),
@@ -128,10 +95,6 @@
             ],
         ),
     ),
-    (
-        "importer-paddle",
-        ("Requirements for the PaddlePaddle importer", ["paddlepaddle"]),
-    ),
     (
         "importer-pytorch",
         (
@@ -161,7 +124,6 @@
                 "onnx",
                 "onnxoptimizer",
                 "onnxruntime",
-                "paddlepaddle",
                 "tensorflow",
                 "tflite",
                 "torch",
@@ -245,7 +207,6 @@
     ("onnxoptimizer", None),
     ("onnxruntime", None),
     ("opencv-python", None),
-    ("paddlepaddle", None),
     ("pillow", None),
     ("progressbar", None),
     ("protobuf", None),
diff --git a/python/tvm/contrib/mxnet.py b/python/tvm/contrib/mxnet.py
deleted file mode 100644
index 6e551dfe46e3..000000000000
--- a/python/tvm/contrib/mxnet.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""MXNet bridge wrap Function MXNet's async function."""
-from __future__ import absolute_import as _abs
-
-import tvm._ffi.registry
-import tvm.runtime._ffi_api
-from tvm.runtime import Module
-
-# pylint: disable=invalid-name
-_wrap_async = None
-
-
-def to_mxnet_func(func, const_loc=None):
-    """Wrap a TVM function as MXNet function
-
-    MXNet function runs asynchrously via its engine.
-
-    Parameters
-    ----------
-    func : Function
-        A TVM function that can take positional arguments
-
-    const_loc : list of int
-        List of integers indicating the argument position
-        of read only NDArray argument.
-        The NDArray argument location that are not annotated
-        will be viewed as mutable arrays in MXNet's engine.
-
-    Returns
-    -------
-    async_func : Function
-        A function that can take MXNet NDArray as argument
-        in places that used to expect TVM NDArray.
-        Run asynchrously in MXNet's async engine.
-    """
-    # only import mxnet when wrap get called.
-    # pylint: disable=import-self, import-outside-toplevel
-    import mxnet
-
-    if isinstance(func, Module):
-        func = func.entry_func
-
-    def _get_bridge_func():
-        """Get MXNet bridge function"""
-        if not mxnet.base._LIB.MXTVMBridge:
-            raise RuntimeError(
-                "MXTVMBridge not exist in mxnet package," " please update to latest version"
-            )
-
-        fdict = tvm._ffi.registry.extract_ext_funcs(mxnet.base._LIB.MXTVMBridge)
-        ret = fdict["WrapAsyncCall"]
-        ret.is_global = True
-        return ret
-
-    global _wrap_async
-
-    if _wrap_async is None:
-        # Register extension type in first time
-        _wrap_async = _get_bridge_func()
-        tvm._ffi.registry.register_extension(mxnet.nd.NDArray)
-
-    const_loc = const_loc if const_loc else []
-    return _wrap_async(func, tvm.runtime._ffi_api.TVMSetStream, len(const_loc), *const_loc)
diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index 9c5cd8ac22ca..530a9fce3c1d 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -19,20 +19,11 @@
 
 COLLECT_IGNORE = []
 if sys.platform.startswith("win"):
-    COLLECT_IGNORE.append("frontend/caffe")
-    COLLECT_IGNORE.append("frontend/caffe2")
     COLLECT_IGNORE.append("frontend/coreml")
-    COLLECT_IGNORE.append("frontend/darknet")
     COLLECT_IGNORE.append("frontend/keras")
-    COLLECT_IGNORE.append("frontend/mxnet")
     COLLECT_IGNORE.append("frontend/pytorch")
     COLLECT_IGNORE.append("frontend/tensorflow")
     COLLECT_IGNORE.append("frontend/tflite")
     COLLECT_IGNORE.append("frontend/onnx")
-    COLLECT_IGNORE.append("driver/tvmc/test_autoscheduler.py")
-    COLLECT_IGNORE.append("auto_scheduler/test_auto_scheduler_cost_model.py")  # stack overflow
-    # COLLECT_IGNORE.append("auto_scheduler/test_auto_scheduler_measure.py") # exception ignored
-    COLLECT_IGNORE.append("auto_scheduler/test_auto_scheduler_search_policy.py")  # stack overflow
-    # COLLECT_IGNORE.append("auto_scheduler/test_auto_scheduler_measure.py") # exception ignored
 
     COLLECT_IGNORE.append("tir_base/test_tir_intrin.py")
diff --git a/tests/python/contrib/test_mxnet_bridge.py b/tests/python/contrib/test_mxnet_bridge.py
deleted file mode 100644
index 920e3649f370..000000000000
--- a/tests/python/contrib/test_mxnet_bridge.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-def mxnet_check():
-    """This is a simple test function for MXNet bridge
-
-    It is not included as pytests, because of its dependency on mxnet
-
-    User can directly run this script to verify correctness.
-    """
-    import mxnet as mx
-    from tvm import topi
-    import tvm
-    from tvm import te
-    import numpy as np
-    from tvm.contrib.mxnet import to_mxnet_func
-
-    # build a TVM function through topi
-    n = 20
-    shape = (20,)
-    scale = te.var("scale", dtype="float32")
-    x = te.placeholder(shape)
-    y = te.placeholder(shape)
-    z = topi.broadcast_add(x, y)
-    zz = te.compute(shape, lambda *i: z(*i) * scale)
-
-    target = tvm.target.cuda()
-
-    # build the function
-    with target:
-        s = topi.generic.schedule_injective(zz)
-        f = tvm.build(s, [x, y, zz, scale])
-
-    # get a mxnet version
-    mxf = to_mxnet_func(f, const_loc=[0, 1])
-
-    dev = mx.gpu(0)
-    xx = mx.nd.uniform(shape=shape, device=dev)
-    yy = mx.nd.uniform(shape=shape, device=dev)
-    zz = mx.nd.empty(shape=shape, device=dev)
-
-    # invoke myf: this runs in mxnet engine
-    mxf(xx, yy, zz, 10.0)
-    mxf(xx, yy, zz, 10.0)
-
-    tvm.testing.assert_allclose(zz.numpy(), (xx.numpy() + yy.numpy()) * 10)
-
-
-if __name__ == "__main__":
-    mxnet_check()
diff --git a/tests/scripts/release/make_notes.py b/tests/scripts/release/make_notes.py
index 704e2eedbd24..9045accaceea 100644
--- a/tests/scripts/release/make_notes.py
+++ b/tests/scripts/release/make_notes.py
@@ -38,7 +38,6 @@
     "hexagon": "Hexagon",
     "metal": "Metal",
     "vulkan": "Vulkan",
-    "cmsis-nn": "CMSIS-NN",
     "clml": "OpenCL & CLML",
     "opencl": "OpenCL & CLML",
     "openclml": "OpenCL & CLML",
@@ -46,8 +45,6 @@
     "acl": "ArmComputeLibrary",
     "rocm": "ROCm",
     "crt": "CRT",
-    "micronpu": "micoNPU",
-    "microtvm": "microTVM",
     "web": "web",
     "wasm": "web",
     "runtime": "Runtime",
@@ -59,8 +56,6 @@
     "tir": "TIR",
     "tensorflow": "Frontend",
     "tflite": "Frontend",
-    "paddle": "Frontend",
-    "oneflow": "Frontend",
     "pytorch": "Frontend",
     "torch": "Frontend",
     "keras": "Frontend",

From 3fed092f47c42f2abb34b68349a01c076d21949c Mon Sep 17 00:00:00 2001
From: Siyuan Feng <hzfengsy@sjtu.edu.cn>
Date: Mon, 17 Feb 2025 15:33:37 +0800
Subject: [PATCH 08/10] fix

---
 docs/reference/api/python/contrib.rst |  6 ++----
 include/tvm/driver/driver_api.h       | 12 ------------
 python/gen_requirements.py            |  7 -------
 src/target/codegen.cc                 |  2 --
 src/te/operation/graph.cc             |  1 +
 src/te/operation/graph.h              |  6 +++---
 6 files changed, 6 insertions(+), 28 deletions(-)

diff --git a/docs/reference/api/python/contrib.rst b/docs/reference/api/python/contrib.rst
index e85d3bec5caf..d0cf12b91c41 100644
--- a/docs/reference/api/python/contrib.rst
+++ b/docs/reference/api/python/contrib.rst
@@ -48,20 +48,18 @@ tvm.contrib.dlpack
 .. automodule:: tvm.contrib.dlpack
     :members:
 
+
 tvm.contrib.emcc
 ~~~~~~~~~~~~~~~~
 .. automodule:: tvm.contrib.emcc
     :members:
 
+
 tvm.contrib.miopen
 ~~~~~~~~~~~~~~~~~~
 .. automodule:: tvm.contrib.miopen
     :members:
 
-tvm.contrib.mxnet
-~~~~~~~~~~~~~~~~~
-.. automodule:: tvm.contrib.mxnet
-    :members:
 
 tvm.contrib.ndk
 ~~~~~~~~~~~~~~~
diff --git a/include/tvm/driver/driver_api.h b/include/tvm/driver/driver_api.h
index eaf737088bf4..39444d1629fe 100644
--- a/include/tvm/driver/driver_api.h
+++ b/include/tvm/driver/driver_api.h
@@ -88,18 +88,6 @@ TVM_DLL IRModule LowerModule(IRModule mod, bool simple_mode = false);
 TVM_DLL IRModule LowerPrimFunc(tvm::tir::PrimFunc func, const std::string& name,
                                bool simple_mode = false);
 
-/*!
- * \brief Build an IRModule given a TE schedule, args and binds. This function also applies
- * the lowering passes defined in CreatePassList.
- * \param sch The TE schedule to lower.
- * \param args The arguments to the function.
- * \param name The name of the lowered function.
- * \param binds Buffer assignments.
- * \param global_var_supply The GlobalVarSupply to be used in the module.
- * \param simple_mode Disables the loop partition pass. Defaults to false.
- * \return The result module.
- */
-
 /*!
  * \brief Build a device and host module for a specific target from an IRModule.
  * \param funcs The functions to be built.
diff --git a/python/gen_requirements.py b/python/gen_requirements.py
index dc776324bf90..b5d8a56657e5 100644
--- a/python/gen_requirements.py
+++ b/python/gen_requirements.py
@@ -197,24 +197,17 @@
         "docutils",
         "<0.17",
     ),  # Work around https://github.com/readthedocs/sphinx_rtd_theme/issues/1115
-    ("ethos-u-vela", "==3.8.0"),
     ("future", None),
-    ("h5py", "==2.10.0"),
     ("image", None),
     ("matplotlib", None),
     ("numpy", None),
     ("onnx", None),
     ("onnxoptimizer", None),
     ("onnxruntime", None),
-    ("opencv-python", None),
     ("pillow", None),
-    ("progressbar", None),
-    ("protobuf", None),
     ("psutil", None),
     ("pylint", None),
-    ("scikit-image", None),
     ("scipy", None),
-    ("six", None),
     ("sphinx", None),
     ("sphinx_autodoc_annotation", None),
     ("sphinx_gallery", None),
diff --git a/src/target/codegen.cc b/src/target/codegen.cc
index e5e2c4297c8e..b2fc88ed8260 100644
--- a/src/target/codegen.cc
+++ b/src/target/codegen.cc
@@ -38,12 +38,10 @@
 #include <vector>
 
 #include "../runtime/library_module.h"
-#include "../support/base64.h"
 
 namespace tvm {
 namespace codegen {
 
-
 runtime::Module Build(IRModule mod, Target target) {
   if (transform::PassContext::Current()
           ->GetConfig<Bool>("tir.disable_assert", Bool(false))
diff --git a/src/te/operation/graph.cc b/src/te/operation/graph.cc
index cddace2a8283..aee7f2afb188 100644
--- a/src/te/operation/graph.cc
+++ b/src/te/operation/graph.cc
@@ -29,6 +29,7 @@
 #include <tvm/tir/stmt_functor.h>
 
 #include <unordered_set>
+#include <vector>
 
 namespace tvm {
 namespace te {
diff --git a/src/te/operation/graph.h b/src/te/operation/graph.h
index fbb1241ad585..51ab8e1aa7bb 100644
--- a/src/te/operation/graph.h
+++ b/src/te/operation/graph.h
@@ -21,8 +21,8 @@
  * \file graph.h
  * \brief Utilities to get information about schedule graph.
  */
-#ifndef TVM_TE_SCHEDULE_GRAPH_H_
-#define TVM_TE_SCHEDULE_GRAPH_H_
+#ifndef TVM_TE_OPERATION_GRAPH_H_
+#define TVM_TE_OPERATION_GRAPH_H_
 
 #include <tvm/te/operation.h>
 #include <tvm/tir/expr.h>
@@ -59,4 +59,4 @@ Array<Operation> PostDFSOrder(const Array<Operation>& roots, const ReadGraph& g)
 }  // namespace te
 }  // namespace tvm
 
-#endif  // TVM_TE_SCHEDULE_GRAPH_H_
+#endif  // TVM_TE_OPERATION_GRAPH_H_

From b47b25ec37f1bffdc4bd31cf21dc9a9e33532164 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 17 Feb 2025 11:00:19 -0500
Subject: [PATCH 09/10] Cleanup relay related legacy components

---
 CMakeLists.txt                                |    2 -
 NEWS.md                                       | 2772 -----------------
 cmake/config.cmake                            |    3 -
 cmake/modules/LibInfo.cmake                   |    1 -
 cmake/modules/contrib/ArmComputeLib.cmake     |    2 -
 cmake/modules/contrib/ONNX.cmake              |   22 -
 conftest.py                                   |   17 +-
 docs/arch/index.rst                           |    4 +-
 docs/arch/pass_infra.rst                      |  126 +-
 include/tvm/ir/affine_type.h                  |  150 -
 include/tvm/ir/expr.h                         |   74 +-
 include/tvm/ir/function.h                     |    8 +-
 include/tvm/ir/memory_pools.h                 |  361 ---
 include/tvm/ir/module.h                       |   38 +-
 include/tvm/ir/op.h                           |   12 +-
 include/tvm/ir/tensor_type.h                  |  127 -
 include/tvm/ir/type_functor.h                 |    6 +-
 include/tvm/node/structural_equal.h           |    6 +-
 include/tvm/relax/dataflow_pattern.h          |    2 +-
 include/tvm/relax/expr.h                      |    4 +-
 include/tvm/relax/type.h                      |   67 +-
 include/tvm/runtime/logging.h                 |    4 +-
 include/tvm/target/generic_func.h             |  171 -
 include/tvm/target/virtual_device.h           |    6 +-
 include/tvm/tir/expr.h                        |    4 +-
 python/setup.py                               |    2 +-
 python/tvm/contrib/cutlass/build.py           |    2 +-
 python/tvm/contrib/hexagon/session.py         |  186 +-
 .../contrib/msc/core/transform/transform.py   |    4 +-
 python/tvm/contrib/target/__init__.py         |   16 -
 python/tvm/contrib/target/coreml.py           |  233 --
 python/tvm/ir/__init__.py                     |    4 +-
 python/tvm/ir/affine_type.py                  |   79 -
 python/tvm/ir/attrs.py                        |    4 +-
 python/tvm/ir/base.py                         |    6 +-
 python/tvm/ir/expr.py                         |   20 +-
 python/tvm/ir/function.py                     |    4 +-
 python/tvm/ir/instrument.py                   |    5 +-
 python/tvm/ir/json_compact.py                 |  146 -
 python/tvm/ir/module.py                       |    6 +-
 python/tvm/ir/op.py                           |    4 +-
 python/tvm/ir/tensor_type.py                  |   61 -
 python/tvm/ir/transform.py                    |   16 +-
 python/tvm/ir/type.py                         |    4 +-
 .../testing/custom_builder_runner.py          |  119 +-
 .../testing/dataset_collect_models.py         |   85 -
 .../testing/dataset_extract_tasks.py          |  104 -
 .../testing/dataset_sample_candidates.py      |  187 --
 python/tvm/meta_schedule/testing/tune_onnx.py |  196 --
 .../tvm/meta_schedule/testing/tune_relay.py   |  213 --
 .../tvm/meta_schedule/testing/tune_utils.py   |  129 +-
 python/tvm/relax/__init__.py                  |    2 +-
 python/tvm/relax/analysis/analysis.py         |    2 +-
 python/tvm/relax/expr.py                      |   10 +-
 .../tvm/relax/frontend/onnx/onnx_frontend.py  |    2 +-
 .../tvm/relax/frontend/torch/fx_translator.py |    1 -
 python/tvm/relax/op/qdq.py                    |    4 +-
 python/tvm/relax/testing/ast_printer.py       |    4 +-
 python/tvm/relax/ty.py                        |    8 +-
 python/tvm/testing/utils.py                   |   23 +-
 python/tvm/tir/analysis/analysis.py           |   31 +-
 python/tvm/topi/signal.py                     |   16 +-
 python/tvm/topi/sparse_reshape.py             |   10 +-
 python/tvm/topi/testing/one_hot.py            |    2 +-
 python/tvm/topi/transform.py                  |   26 +-
 src/contrib/msc/core/ir/graph_builder.cc      |   12 +-
 src/contrib/msc/core/ir/graph_builder.h       |   12 +-
 src/contrib/msc/core/transform/layout_utils.h |    2 +-
 .../msc/core/transform/rewrite_utils.h        |    2 +-
 .../msc/core/transform/set_expr_name.cc       |   32 +-
 src/contrib/msc/core/utils.h                  |    2 +-
 src/ir/affine_type.cc                         |   82 -
 src/ir/diagnostic.cc                          |    2 -
 src/ir/function.cc                            |    5 -
 src/ir/memory_pools.cc                        |  242 --
 src/ir/module.cc                              |    7 +-
 src/ir/source_map.cc                          |    2 -
 src/ir/tensor_type.cc                         |   66 -
 src/ir/transform.cc                           |    7 -
 src/ir/type_functor.cc                        |    7 -
 src/relax/analysis/graph_partitioner.cc       |   20 -
 src/relax/analysis/graph_partitioner.h        |    6 +-
 src/relax/analysis/struct_info_analysis.cc    |    4 +-
 src/relax/ir/dataflow_matcher.cc              |    2 +-
 src/relax/ir/expr.cc                          |    2 +-
 src/relax/ir/type.cc                          |   16 +-
 src/relax/transform/fold_constant.cc          |    2 +-
 src/relax/transform/fuse_tir.cc               |    4 +-
 src/runtime/contrib/bnns/bnns_wrp.h           |    2 +-
 src/runtime/contrib/cblas/gemm_common.h       |    2 +-
 src/runtime/contrib/onnx/onnx_module.cc       |   80 -
 .../contrib/tensorrt/tensorrt_calibrator.h    |    2 +-
 src/runtime/contrib/tensorrt/tensorrt_ops.h   |    2 +-
 src/script/printer/ir/ir.cc                   |   14 +-
 src/script/printer/relax/type.cc              |    8 +-
 src/support/libinfo.cc                        |    5 -
 src/target/generic_func.cc                    |  182 --
 src/te/operation/create_primfunc.h            |    2 -
 src/tir/ir/expr.cc                            |    4 +-
 tests/lint/check_file_type.py                 |    2 -
 .../contrib/test_hexagon/infrastructure.py    |   16 +-
 .../test_hexagon/test_relax_integration.py    |  130 +-
 tests/python/ir/test_ir_type.py               |   11 +-
 .../nightly/test_nnapi/infrastructure.py      |    2 +-
 .../test_analysis_struct_info_analysis.py     |   10 +-
 ...test_analysis_suggest_layout_transforms.py |    2 +-
 tests/python/relax/test_ast_printer.py        |    8 +-
 tests/python/relax/test_blockbuilder_core.py  |    2 +-
 tests/python/relax/test_dataflow_pattern.py   |    4 +-
 tests/python/relax/test_expr.py               |    6 +-
 tests/python/relax/test_struct_info.py        |    4 +-
 ...est_transform_merge_composite_functions.py |   47 +-
 .../python/relax/test_transform_normalize.py  |    2 +-
 tests/scripts/task_mypy.sh                    |   10 -
 114 files changed, 283 insertions(+), 6779 deletions(-)
 delete mode 100644 NEWS.md
 delete mode 100644 cmake/modules/contrib/ONNX.cmake
 delete mode 100644 include/tvm/ir/affine_type.h
 delete mode 100644 include/tvm/ir/memory_pools.h
 delete mode 100644 include/tvm/ir/tensor_type.h
 delete mode 100644 include/tvm/target/generic_func.h
 delete mode 100644 python/tvm/contrib/target/__init__.py
 delete mode 100644 python/tvm/contrib/target/coreml.py
 delete mode 100644 python/tvm/ir/affine_type.py
 delete mode 100644 python/tvm/ir/tensor_type.py
 delete mode 100644 python/tvm/meta_schedule/testing/dataset_collect_models.py
 delete mode 100644 python/tvm/meta_schedule/testing/dataset_extract_tasks.py
 delete mode 100644 python/tvm/meta_schedule/testing/dataset_sample_candidates.py
 delete mode 100644 python/tvm/meta_schedule/testing/tune_onnx.py
 delete mode 100644 python/tvm/meta_schedule/testing/tune_relay.py
 delete mode 100644 src/ir/affine_type.cc
 delete mode 100644 src/ir/memory_pools.cc
 delete mode 100644 src/ir/tensor_type.cc
 delete mode 100644 src/runtime/contrib/onnx/onnx_module.cc
 delete mode 100644 src/target/generic_func.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1edb82108685..24504047d8ea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -108,7 +108,6 @@ tvm_option(USE_TFLITE "Build with tflite support" OFF)
 tvm_option(USE_TENSORFLOW_PATH "TensorFlow root path when use TFLite" none)
 tvm_option(USE_COREML "Build with coreml support" OFF)
 tvm_option(USE_BNNS "Build with BNNS support" OFF)
-tvm_option(USE_TARGET_ONNX "Build with ONNX Codegen support" OFF)
 tvm_option(USE_ARM_COMPUTE_LIB "Build with Arm Compute Library" OFF)
 tvm_option(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR "Build with Arm Compute Library graph executor" OFF)
 tvm_option(USE_TENSORRT_CODEGEN "Build with TensorRT Codegen support" OFF)
@@ -494,7 +493,6 @@ include(cmake/modules/contrib/LibTorch.cmake)
 include(cmake/modules/contrib/TFLite.cmake)
 include(cmake/modules/contrib/CoreML.cmake)
 include(cmake/modules/contrib/BNNS.cmake)
-include(cmake/modules/contrib/ONNX.cmake)
 include(cmake/modules/contrib/ArmComputeLib.cmake)
 include(cmake/modules/contrib/TensorRT.cmake)
 include(cmake/modules/contrib/NNAPI.cmake)
diff --git a/NEWS.md b/NEWS.md
deleted file mode 100644
index 2b575f7aa214..000000000000
--- a/NEWS.md
+++ /dev/null
@@ -1,2772 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-TVM Change Log
-==============
-
-  - [On-going version](#on-going-version)
-  - [0.8](#08)
-    - [Accepted RFCs](#accepted-rfcs)
-    - [Features and Improvements](#features-and-improvements)
-      - [TE, TIR, TVMScript](#te-tir-tvmscript)
-      - [AutoTVM, AutoScheduler, Meta Schedule](#autotvm-autoscheduler-meta-schedule)
-      - [Operator Coverage](#operator-coverage)
-      - [Training](#training)
-      - [Relay](#relay)
-      - [MicroTVM, AOT, Graph Executor and VM](#microtvm-aot-graph-executor-and-vm)
-      - [Arithmetic Analysis](#arithmetic-analysis)
-      - [Frontends](#frontends)
-      - [Codegen Backends and Runtime](#codegen-backends-and-runtime)
-      - [BYOC Integration with Vendor Libraries: TensorRT, ACL, VitisAI](#byoc-integration-with-vendor-libraries-tensorrt-acl-vitisai)
-      - [TVMC](#tvmc)
-      - [Rust Binding](#rust-binding)
-      - [Misc](#misc)
-  - [0.7](#07)
-    - [New Features](#new-features)
-      - [Automatic Scheduling (Experimental)](#automatic-scheduling-experimental)
-      - [BYOC](#byoc)
-      - [Operator Coverage](#operator-coverage-1)
-      - [Quantization](#quantization)
-      - [Relay](#relay-1)
-      - [Runtime and Backend](#runtime-and-backend)
-      - [Rust Support](#rust-support)
-      - [TIR](#tir)
-      - [TE](#te)
-      - [TVMC(Experimental)](#tvmcexperimental)
-    - [Feature Improvement](#feature-improvement)
-      - [Accelerator and Microcontroller Support](#accelerator-and-microcontroller-support)
-      - [Arithmetic Analysis](#arithmetic-analysis-1)
-      - [AutoTVM and Graph Tuner](#autotvm-and-graph-tuner)
-      - [BYOC](#byoc-1)
-      - [Codegen](#codegen)
-      - [Dynamism Support](#dynamism-support)
-      - [Frontend and User Interface](#frontend-and-user-interface)
-      - [Relay](#relay-2)
-      - [Operator Coverage](#operator-coverage-2)
-      - [Runtime and Backend](#runtime-and-backend-1)
-      - [Quantization](#quantization-1)
-      - [TE](#te-1)
-      - [TIR](#tir-1)
-      - [Performance Improvements](#performance-improvements)
-      - [Documentation](#documentation)
-      - [Bug Fixes](#bug-fixes)
-    - [API Changes](#api-changes)
-    - [Deprecation](#deprecation)
-  - [0.6](#06)
-    - [Relay in Production](#relay-in-production)
-    - [Relay Virtual Machine](#relay-virtual-machine)
-    - [Training](#training-1)
-    - [Quantization](#quantization-2)
-    - [Accelerator and Microcontroller Support](#accelerator-and-microcontroller-support-1)
-    - [Rust Support](#rust-support-1)
-    - [Operator Support](#operator-support)
-    - [Frontend and User Interface](#frontend-and-user-interface-1)
-    - [Runtime and Backend Support](#runtime-and-backend-support)
-    - [Language and Architecture](#language-and-architecture)
-    - [Symbolic shape enhancement](#symbolic-shape-enhancement)
-    - [Language and Architecture](#language-and-architecture-1)
-    - [Arithmetic Analysis](#arithmetic-analysis-2)
-    - [Runtime and Backend Support](#runtime-and-backend-support-1)
-    - [Frontend and User Interface](#frontend-and-user-interface-2)
-    - [AutoTVM](#autotvm)
-    - [Performance Improvements](#performance-improvements-1)
-    - [Documentation](#documentation-1)
-    - [Build and Test](#build-and-test)
-    - [Bug Fixes](#bug-fixes-1)
-    - [Known Issues](#known-issues)
-    - [Depreciations](#depreciations)
-  - [0.5](#05)
-  - [0.4](#04)
-  - [0.3](#03)
-  - [0.2](#02)
-  - [0.1](#01)
-  - [Initial version](#initial-version)
-
-This file records the changes in TVM library in reverse chronological order.
-
-## On-going version
-
-Refer to the Roadmap issue for complete list on on-going version features.
-If you check in something that is not reflected in Roadmap issue, please reply
-to that issue so it can get added.
-
-## 0.8
-
-Apache TVM v0.8 brings several major exciting experimental features, including:
-- PaddlePaddle frontend
-- TVMScript: round-trippable python-based syntax for TIR
-- TorchScript integration
-- TensorIR scheduling language
-- TensorRT and CUTLASS integration via BYOC
-- Int4 TensorCore support in AutoTVM
-- MicroTVM Project API and Zephyr, Arduino support
-- AOT executor
-- Robost Windows support
-- Affine analysis infra: iter-affine-map
-- Improved Vulkan backend
-- CUDA graph support in TVM runtime
-
-Besides, The community has been working together to refactor and evolve the existing infrastructure, including but not limited to:
-- Relay compilation engine
-- Relay pattern language
-- CI and build process
-- Refactoring documentation and tutorials
-- Stablizing AutoScheduler
-- Stablizing TVMC command line driver interface
-- Stablizing target system
-- Frontend coverage, quantization, dynamic shape, training
-
-Full changelog: https://gist.github.com/junrushao1994/c669905dbc41edc2e691316df49d8562.
-
-### Accepted RFCs
-
-The community has adopted a [formal RFC process](https://github.com/apache/tvm-rfcs). Below is a list of the formal RFCs accepted by the community since then:
-- [RFC-0005] Meta schedule (AutoTIR)
-- [RFC-0006] Automatic mixed-precision pass and support
-- [RFC-0007] Parametrized unit tests
-- [RFC-0008] MicroTVM Project API
-- [RFC-0009] Unified static memory planner
-- [RFC-0010] Target-registered compiler flow customisation
-- [RFC-0011] Arm® Ethos-U integration
-- [RFC-0014] Pipeline executor
-- [RFC-0015] Use CMSIS-NN with TVM
-- [RFC-0019] Add PaddlePaddle frontend
-- [RFC-0020] Extend metadata in project option
-- [RFC-0022] TIR non-scalar constants
-- [RFC-0023] Adding annotation field to `tir.allocate` nodes
-- [RFC-0025] PyTorchTVM
-- [RFC-0027] Formalize TVM documentation organization
-- [RFC-0028] Command line composition from internal registry
-- [RFC-0029] Migrating target attributes to IRModule
-- [RFC-0030] Command line configuration files
-- [RFC-0031] C Device API
-- [RFC-0036] TVMScript namespace
-- [RFC-0041] Update TVMScript block syntax
-
-### Features and Improvements
-#### TE, TIR, TVMScript
-
-- TVMScript parser and printer [#7630](https://github.com/apache/tvm/pull/7630) [#9115](https://github.com/apache/tvm/pull/9115) [#9286](https://github.com/apache/tvm/pull/9286)
-- Scheduleable TIR (S-TIR) infrastructure, analysis and lowering passes [#7553](https://github.com/apache/tvm/pull/7553) [#7765](https://github.com/apache/tvm/pull/7765) [#7847](https://github.com/apache/tvm/pull/7847) [#8114](https://github.com/apache/tvm/pull/8114) [#8121](https://github.com/apache/tvm/pull/8121) [#7873](https://github.com/apache/tvm/pull/7873) [#7923](https://github.com/apache/tvm/pull/7923) [#7962](https://github.com/apache/tvm/pull/7962) [#7848](https://github.com/apache/tvm/pull/7848) [#8044](https://github.com/apache/tvm/pull/8044) [#7806](https://github.com/apache/tvm/pull/7806)
-- S-TIR schedule primitives: `compute-inline`, `reverse-compute-inline`, `fuse`, `split`, `rfactor`, `storage-align`, `vectorize`, `unroll`, `bind`, `reorder`, `cache-read`, `cache-write`, `compute-at`, `reverse-compute-at`, `decompose-reduction` [#8170](https://github.com/apache/tvm/pull/8170) [#8467](https://github.com/apache/tvm/pull/8467) [#8544](https://github.com/apache/tvm/pull/8544) [#8693](https://github.com/apache/tvm/pull/8693) [#8716](https://github.com/apache/tvm/pull/8716) [#8767](https://github.com/apache/tvm/pull/8767) [#8863](https://github.com/apache/tvm/pull/8863) [#8943](https://github.com/apache/tvm/pull/8943) [#9041](https://github.com/apache/tvm/pull/9041)
-- While loop in TIR [#7425](https://github.com/apache/tvm/pull/7425) [#9004](https://github.com/apache/tvm/pull/9004)
-- Metaprogramming in S-TIR via `specialize` [#8354](https://github.com/apache/tvm/pull/8354)
-- Support Return value in TIR [#7084](https://github.com/apache/tvm/pull/7084) [#7932](https://github.com/apache/tvm/pull/7932)
-- Storage scope support in `PointerType` [#8017](https://github.com/apache/tvm/pull/8017) [#8366](https://github.com/apache/tvm/pull/8366) [#8463](https://github.com/apache/tvm/pull/8463)
-- Creation of S-TIR via TE compute [#7987](https://github.com/apache/tvm/pull/7987)
-
-#### AutoTVM, AutoScheduler, Meta Schedule
-
-- PopenPoolExecutor is used to replace python native library to provide better multiprocessing support as well as enable auto-tuning in Jupyter notebooks for AutoTVM and AutoScheduler [#6959](https://github.com/apache/tvm/pull/6959) [#8492](https://github.com/apache/tvm/pull/8492) [#8913](https://github.com/apache/tvm/pull/8913) [#8820](https://github.com/apache/tvm/pull/8820) [#8851](https://github.com/apache/tvm/pull/8851)
-- AutoScheduler improvement and stabilization: task scheduler, layout rewrite, early stopping, dispatching [#6945](https://github.com/apache/tvm/pull/6945) [#6750](https://github.com/apache/tvm/pull/6750) [#6987](https://github.com/apache/tvm/pull/6987) [#7156](https://github.com/apache/tvm/pull/7156) [#8862](https://github.com/apache/tvm/pull/8862) [#8995](https://github.com/apache/tvm/pull/8995) [#7571](https://github.com/apache/tvm/pull/7571) [#7376](https://github.com/apache/tvm/pull/7376) [#7377](https://github.com/apache/tvm/pull/7377) [#7344](https://github.com/apache/tvm/pull/7344) [#7185](https://github.com/apache/tvm/pull/7185)
-- AutoScheduler support for sparse workloads [#7313](https://github.com/apache/tvm/pull/7313) [#7635](https://github.com/apache/tvm/pull/7635) [#8065](https://github.com/apache/tvm/pull/8065)
-- AutoScheduler support for Vulkan, ROCm, Mali [#7626](https://github.com/apache/tvm/pull/7626) [#7038](https://github.com/apache/tvm/pull/7038) [#7132](https://github.com/apache/tvm/pull/7132)
-- AutoTVM support for int4 TensorCore [#7831](https://github.com/apache/tvm/pull/7831) [#8402](https://github.com/apache/tvm/pull/8402)
-- Meta Schedule core infrastructure, builder runner and database [#8615](https://github.com/apache/tvm/pull/8615) [#8623](https://github.com/apache/tvm/pull/8623) [#8642](https://github.com/apache/tvm/pull/8642) [#8817](https://github.com/apache/tvm/pull/8817) [#9079](https://github.com/apache/tvm/pull/9079) [#9132](https://github.com/apache/tvm/pull/9132) [#9154](https://github.com/apache/tvm/pull/9154) [#9053](https://github.com/apache/tvm/pull/9053) [#9059](https://github.com/apache/tvm/pull/9059) [#9044](https://github.com/apache/tvm/pull/9044) [#9111](https://github.com/apache/tvm/pull/9111) [#9061](https://github.com/apache/tvm/pull/9061) [#9153](https://github.com/apache/tvm/pull/9153)
-
-#### Operator Coverage
-- Operators for Int-8 vision transformer on GPU [#7814](https://github.com/apache/tvm/pull/7814)
-- Optimizing NMS and ROI-related kernel on GPU [#7257](https://github.com/apache/tvm/pull/7257) [#7172](https://github.com/apache/tvm/pull/7172) [#7136](https://github.com/apache/tvm/pull/7136) [#7796](https://github.com/apache/tvm/pull/7796) [#7463](https://github.com/apache/tvm/pull/7463) [#6516](https://github.com/apache/tvm/pull/6516) [#7440](https://github.com/apache/tvm/pull/7440) [#7666](https://github.com/apache/tvm/pull/7666) [#8174](https://github.com/apache/tvm/pull/8174)
-- Support and optimize sparse operators [#8605](https://github.com/apache/tvm/pull/8605) [#7477](https://github.com/apache/tvm/pull/7477) [#7435](https://github.com/apache/tvm/pull/7435) [#6889](https://github.com/apache/tvm/pull/6889) [#6580](https://github.com/apache/tvm/pull/6580) [#8437](https://github.com/apache/tvm/pull/8437)
-- Sort-related operators and optimization [#9184](https://github.com/apache/tvm/pull/9184) [#7669](https://github.com/apache/tvm/pull/7669) [#8672](https://github.com/apache/tvm/pull/8672) [#7611](https://github.com/apache/tvm/pull/7611) [#7195](https://github.com/apache/tvm/pull/7195) [#7056](https://github.com/apache/tvm/pull/7056) [#6978](https://github.com/apache/tvm/pull/6978)
-- Support for einsum operator [#6370](https://github.com/apache/tvm/pull/6370)
-- Matmul, dense operators and their optimization [#8921](https://github.com/apache/tvm/pull/8921) [#8527](https://github.com/apache/tvm/pull/8527) [#8234](https://github.com/apache/tvm/pull/8234) [#8250](https://github.com/apache/tvm/pull/8250) [#6616](https://github.com/apache/tvm/pull/6616) [#8229](https://github.com/apache/tvm/pull/8229) [#8401](https://github.com/apache/tvm/pull/8401) [#7404](https://github.com/apache/tvm/pull/7404) [#8669](https://github.com/apache/tvm/pull/8669)
-- Convolution and pooling operators and their optimization [#8620](https://github.com/apache/tvm/pull/8620) [#8936](https://github.com/apache/tvm/pull/8936) [#8584](https://github.com/apache/tvm/pull/8584) [#7075](https://github.com/apache/tvm/pull/7075) [#7142](https://github.com/apache/tvm/pull/7142) [#7515](https://github.com/apache/tvm/pull/7515) [#6999](https://github.com/apache/tvm/pull/6999) [#6899](https://github.com/apache/tvm/pull/6899) [#6840](https://github.com/apache/tvm/pull/6840) [#6137](https://github.com/apache/tvm/pull/6137) [#6802](https://github.com/apache/tvm/pull/6802) [#6445](https://github.com/apache/tvm/pull/6445) [#6711](https://github.com/apache/tvm/pull/6711) [#6714](https://github.com/apache/tvm/pull/6714) [#8167](https://github.com/apache/tvm/pull/8167) [#8222](https://github.com/apache/tvm/pull/8222) [#8275](https://github.com/apache/tvm/pull/8275) [#8276](https://github.com/apache/tvm/pull/8276) [#8422](https://github.com/apache/tvm/pull/8422) [#8430](https://github.com/apache/tvm/pull/8430) [#6687](https://github.com/apache/tvm/pull/6687) [#7928](https://github.com/apache/tvm/pull/7928) [#8897](https://github.com/apache/tvm/pull/8897)
-- Scatter and gather operators and their optimization [#8479](https://github.com/apache/tvm/pull/8479) [#7600](https://github.com/apache/tvm/pull/7600) [#7044](https://github.com/apache/tvm/pull/7044) [#7464](https://github.com/apache/tvm/pull/7464) [#7233](https://github.com/apache/tvm/pull/7233) [#6533](https://github.com/apache/tvm/pull/6533) [#6856](https://github.com/apache/tvm/pull/6856) [#6854](https://github.com/apache/tvm/pull/6854) [#7927](https://github.com/apache/tvm/pull/7927) [#8105](https://github.com/apache/tvm/pull/8105)
-- Prefix scan, cumsum and cumprod [#7722](https://github.com/apache/tvm/pull/7722) [#7303](https://github.com/apache/tvm/pull/7303) [#7314](https://github.com/apache/tvm/pull/7314) [#7334](https://github.com/apache/tvm/pull/7334) [#7123](https://github.com/apache/tvm/pull/7123) [#6868](https://github.com/apache/tvm/pull/6868)
-- Dynamic shape and shape functions [#7414](https://github.com/apache/tvm/pull/7414) [#6979](https://github.com/apache/tvm/pull/6979) [#6912](https://github.com/apache/tvm/pull/6912) [#6898](https://github.com/apache/tvm/pull/6898) [#6373](https://github.com/apache/tvm/pull/6373) [#8068](https://github.com/apache/tvm/pull/8068) [#7490](https://github.com/apache/tvm/pull/7490) [#7487](https://github.com/apache/tvm/pull/7487)
-- Miscellaneous improvement. Operators including: reshape, resize, pad, PRNG, transpose, where, softmax, concat, nll_loss, space_to_batch_nd, batch_to_space_nd, slice_like; Libraries including thrust, cuDNN, cuBLAS, MIOpen; Improving schedules for generic reduction and softmax. [#8592](https://github.com/apache/tvm/pull/8592) [#7375](https://github.com/apache/tvm/pull/7375) [#7287](https://github.com/apache/tvm/pull/7287) [#7184](https://github.com/apache/tvm/pull/7184) [#7131](https://github.com/apache/tvm/pull/7131) [#7086](https://github.com/apache/tvm/pull/7086) [#7083](https://github.com/apache/tvm/pull/7083) [#8030](https://github.com/apache/tvm/pull/8030) [#6851](https://github.com/apache/tvm/pull/6851) [#6477](https://github.com/apache/tvm/pull/6477) [#8346](https://github.com/apache/tvm/pull/8346) [#6759](https://github.com/apache/tvm/pull/6759) [#8028](https://github.com/apache/tvm/pull/8028) [#8056](https://github.com/apache/tvm/pull/8056) [#8369](https://github.com/apache/tvm/pull/8369) [#7468](https://github.com/apache/tvm/pull/7468) [#7458](https://github.com/apache/tvm/pull/7458) [#7194](https://github.com/apache/tvm/pull/7194) [#8138](https://github.com/apache/tvm/pull/8138) [#8543](https://github.com/apache/tvm/pull/8543)
-
-#### Training
-
-- Relay AutoDiff [#7677](https://github.com/apache/tvm/pull/7677) [#8318](https://github.com/apache/tvm/pull/8318)
-- TE AutoDiff [#7321](https://github.com/apache/tvm/pull/7321)
-- Gradient operators [#7685](https://github.com/apache/tvm/pull/7685) [#7340](https://github.com/apache/tvm/pull/7340) [#6767](https://github.com/apache/tvm/pull/6767) [#8307](https://github.com/apache/tvm/pull/8307) [#7357](https://github.com/apache/tvm/pull/7357) [#6827](https://github.com/apache/tvm/pull/6827)
-
-#### Relay
-
-- Pattern language and mixed-mode visitor: matching more IR constructs, fuzzy matching; converting more passes to non-recursive.  [#8843](https://github.com/apache/tvm/pull/8843) [#7754](https://github.com/apache/tvm/pull/7754) [#7355](https://github.com/apache/tvm/pull/7355) [#7332](https://github.com/apache/tvm/pull/7332) [#7282](https://github.com/apache/tvm/pull/7282) [#7151](https://github.com/apache/tvm/pull/7151) [#7120](https://github.com/apache/tvm/pull/7120) [#6958](https://github.com/apache/tvm/pull/6958) [#7507](https://github.com/apache/tvm/pull/7507) [#8325](https://github.com/apache/tvm/pull/8325) [#8774](https://github.com/apache/tvm/pull/8774) [#7817](https://github.com/apache/tvm/pull/7817) [#7374](https://github.com/apache/tvm/pull/7374) [#6695](https://github.com/apache/tvm/pull/6695) [#6704](https://github.com/apache/tvm/pull/6704)
-- Improving or adding passes including ExtractOperators, SimplifyExpr, DynamicToStatic, DefuseOps, ConvertLayout, FoldConstant. Added a set of utilities that allows a model to be run efficiently on TensorCores [#9253](https://github.com/apache/tvm/pull/9253) [#9245](https://github.com/apache/tvm/pull/9245) [#8996](https://github.com/apache/tvm/pull/8996) [#7827](https://github.com/apache/tvm/pull/7827) [#9034](https://github.com/apache/tvm/pull/9034) [#7807](https://github.com/apache/tvm/pull/7807) [#8755](https://github.com/apache/tvm/pull/8755) [#7731](https://github.com/apache/tvm/pull/7731) [#7368](https://github.com/apache/tvm/pull/7368) [#7603](https://github.com/apache/tvm/pull/7603) [#7656](https://github.com/apache/tvm/pull/7656) [#7423](https://github.com/apache/tvm/pull/7423) [#7354](https://github.com/apache/tvm/pull/7354) [#6946](https://github.com/apache/tvm/pull/6946) [#6748](https://github.com/apache/tvm/pull/6748) [#6720](https://github.com/apache/tvm/pull/6720) [#6776](https://github.com/apache/tvm/pull/6776) [#7835](https://github.com/apache/tvm/pull/7835) [#7895](https://github.com/apache/tvm/pull/7895) [#8205](https://github.com/apache/tvm/pull/8205)
-- TECompiler and refactoring of compilation workflow [#9103](https://github.com/apache/tvm/pull/9103) [#8974](https://github.com/apache/tvm/pull/8974) [#8886](https://github.com/apache/tvm/pull/8886) [#8802](https://github.com/apache/tvm/pull/8802) [#8501](https://github.com/apache/tvm/pull/8501) [#8526](https://github.com/apache/tvm/pull/8526) [#8486](https://github.com/apache/tvm/pull/8486) [#8597](https://github.com/apache/tvm/pull/8597) [#7518](https://github.com/apache/tvm/pull/7518) [#7552](https://github.com/apache/tvm/pull/7552) [#8914](https://github.com/apache/tvm/pull/8914) [#9130](https://github.com/apache/tvm/pull/9130)
-- Quantization and automatic-mixed precision [#8883](https://github.com/apache/tvm/pull/8883) [#8810](https://github.com/apache/tvm/pull/8810) [#8644](https://github.com/apache/tvm/pull/8644) [#7613](https://github.com/apache/tvm/pull/7613) [#8069](https://github.com/apache/tvm/pull/8069) [#8341](https://github.com/apache/tvm/pull/8341) [#8126](https://github.com/apache/tvm/pull/8126) [#8460](https://github.com/apache/tvm/pull/8460)
-- Parser, printer and diagnostic [#7347](https://github.com/apache/tvm/pull/7347) [#6274](https://github.com/apache/tvm/pull/6274) [#6692](https://github.com/apache/tvm/pull/6692) [#8352](https://github.com/apache/tvm/pull/8352) [#8000](https://github.com/apache/tvm/pull/8000)
-
-#### MicroTVM, AOT, Graph Executor and VM
-
-- Pipeline Executor [#8702](https://github.com/apache/tvm/pull/8702) [#9108](https://github.com/apache/tvm/pull/9108)
-- CUDA graph integration in graph executor [#7616](https://github.com/apache/tvm/pull/7616)
-- Enable add `set_output_zero_copy` in graph executor [#8497](https://github.com/apache/tvm/pull/8497)
-- VM: memory allocation improvement, shape function improvement and misc [#7746](https://github.com/apache/tvm/pull/7746) [#7451](https://github.com/apache/tvm/pull/7451) [#7413](https://github.com/apache/tvm/pull/7413) [#7210](https://github.com/apache/tvm/pull/7210) [#8040](https://github.com/apache/tvm/pull/8040) [#6938](https://github.com/apache/tvm/pull/6938) [#8661](https://github.com/apache/tvm/pull/8661) [#7676](https://github.com/apache/tvm/pull/7676) [#8285](https://github.com/apache/tvm/pull/8285)
-- AOT compilation and execution [#8697](https://github.com/apache/tvm/pull/8697) [#7785](https://github.com/apache/tvm/pull/7785) [#8014](https://github.com/apache/tvm/pull/8014) [#8023](https://github.com/apache/tvm/pull/8023) [#8096](https://github.com/apache/tvm/pull/8096) [#8075](https://github.com/apache/tvm/pull/8075)
-- Project API infrastructure: [#8380](https://github.com/apache/tvm/pull/8380) [#8963](https://github.com/apache/tvm/pull/8963) [#8708](https://github.com/apache/tvm/pull/8708) [#8019](https://github.com/apache/tvm/pull/8019)
-- MicroTVM, Zephyr, Arduino RVM, AutoTVM support [#9320](https://github.com/apache/tvm/pull/9320) [#8941](https://github.com/apache/tvm/pull/8941) [#7804](https://github.com/apache/tvm/pull/7804) [#7786](https://github.com/apache/tvm/pull/7786) [#7449](https://github.com/apache/tvm/pull/7449) [#7891](https://github.com/apache/tvm/pull/7891) [#7915](https://github.com/apache/tvm/pull/7915) [#8055](https://github.com/apache/tvm/pull/8055) [#8037](https://github.com/apache/tvm/pull/8037) [#8386](https://github.com/apache/tvm/pull/8386) [#8519](https://github.com/apache/tvm/pull/8519) [#8748](https://github.com/apache/tvm/pull/8748) [8154](https://github.com/apache/tvm/pull/8154) [#8945](https://github.com/apache/tvm/pull/8945) [#8624](https://github.com/apache/tvm/pull/8624) [#8701](https://github.com/apache/tvm/pull/8701) [#7723](https://github.com/apache/tvm/pull/7723) [#8715](https://github.com/apache/tvm/pull/8715) [#7225](https://github.com/apache/tvm/pull/7225) [#6964](https://github.com/apache/tvm/pull/6964) [#7813](https://github.com/apache/tvm/pull/7813) [#7528](https://github.com/apache/tvm/pull/7528)
-- The pure C runtime (CRT) [#7398](https://github.com/apache/tvm/pull/7398) [#7333](https://github.com/apache/tvm/pull/7333) [#7095](https://github.com/apache/tvm/pull/7095) [#7225](https://github.com/apache/tvm/pull/7225)
-- Model library format [#8270](https://github.com/apache/tvm/pull/8270) [#8072](https://github.com/apache/tvm/pull/8072) [#7938](https://github.com/apache/tvm/pull/7938)
-
-#### Arithmetic Analysis
-
-- Tighter bounds and more simplification on cast [#6771](https://github.com/apache/tvm/pull/6771) [#7045](https://github.com/apache/tvm/pull/7045)
-- Introducing iterator (quasi-) affine map detection [#6667](https://github.com/apache/tvm/pull/6667) [#7752](https://github.com/apache/tvm/pull/7752) [#7759](https://github.com/apache/tvm/pull/7759)
-- Inverse of iterator affine map [#8384](https://github.com/apache/tvm/pull/8384) [#8427](https://github.com/apache/tvm/pull/8427)
-- Subspace division in iterator affine map [#7760](https://github.com/apache/tvm/pull/7760)
-
-#### Frontends
-
-- PaddlePaddle initial support [#8645](https://github.com/apache/tvm/pull/8645)  [#9124](https://github.com/apache/tvm/pull/9124) [#9126](https://github.com/apache/tvm/pull/9126) [#9295](https://github.com/apache/tvm/pull/9295) [#9370](https://github.com/apache/tvm/pull/9370) [#9236](https://github.com/apache/tvm/pull/9236) [#9283](https://github.com/apache/tvm/pull/9283)
-- ONNX support, including better handling of control flow, coverage of more operators, better dynamic shape support, more tests. [#9265](https://github.com/apache/tvm/pull/9265) [#9178](https://github.com/apache/tvm/pull/9178) [#9146](https://github.com/apache/tvm/pull/9146) [#8894](https://github.com/apache/tvm/pull/8894) [#8966](https://github.com/apache/tvm/pull/8966) [#8967](https://github.com/apache/tvm/pull/8967) [#7818](https://github.com/apache/tvm/pull/7818) [#9000](https://github.com/apache/tvm/pull/9000) [#9001](https://github.com/apache/tvm/pull/9001) [#9066](https://github.com/apache/tvm/pull/9066) [#9028](https://github.com/apache/tvm/pull/9028) [#9002](https://github.com/apache/tvm/pull/9002) [#8985](https://github.com/apache/tvm/pull/8985) [#9019](https://github.com/apache/tvm/pull/9019) [#9017](https://github.com/apache/tvm/pull/9017) [#8972](https://github.com/apache/tvm/pull/8972) [#7802](https://github.com/apache/tvm/pull/7802) [#7800](https://github.com/apache/tvm/pull/7800) [#7781](https://github.com/apache/tvm/pull/7781) [#8919](https://github.com/apache/tvm/pull/8919) [#9054](https://github.com/apache/tvm/pull/9054) [#8906](https://github.com/apache/tvm/pull/8906) [#8933](https://github.com/apache/tvm/pull/8933) [#8959](https://github.com/apache/tvm/pull/8959) [#8907](https://github.com/apache/tvm/pull/8907) [#7771](https://github.com/apache/tvm/pull/7771) [#8923](https://github.com/apache/tvm/pull/8923) [#8924](https://github.com/apache/tvm/pull/8924) [#7755](https://github.com/apache/tvm/pull/7755) [#7720](https://github.com/apache/tvm/pull/7720) [#8773](https://github.com/apache/tvm/pull/8773) [#8872](https://github.com/apache/tvm/pull/8872) [#7655](https://github.com/apache/tvm/pull/7655) [#8741](https://github.com/apache/tvm/pull/8741) [#7633](https://github.com/apache/tvm/pull/7633) [#8781](https://github.com/apache/tvm/pull/8781) [#8866](https://github.com/apache/tvm/pull/8866) [#8867](https://github.com/apache/tvm/pull/8867) 
[#7522](https://github.com/apache/tvm/pull/7522) [#7519](https://github.com/apache/tvm/pull/7519) [#7489](https://github.com/apache/tvm/pull/7489) [#7438](https://github.com/apache/tvm/pull/7438) [#7429](https://github.com/apache/tvm/pull/7429) [#7364](https://github.com/apache/tvm/pull/7364) [#7300](https://github.com/apache/tvm/pull/7300) [#7259](https://github.com/apache/tvm/pull/7259) [#7243](https://github.com/apache/tvm/pull/7243) [#7237](https://github.com/apache/tvm/pull/7237) [#7208](https://github.com/apache/tvm/pull/7208) [#7189](https://github.com/apache/tvm/pull/7189) [#7115](https://github.com/apache/tvm/pull/7115) [#7109](https://github.com/apache/tvm/pull/7109) [#7089](https://github.com/apache/tvm/pull/7089) [#7036](https://github.com/apache/tvm/pull/7036) [#7031](https://github.com/apache/tvm/pull/7031) [#6839](https://github.com/apache/tvm/pull/6839) [#6351](https://github.com/apache/tvm/pull/6351) [#7842](https://github.com/apache/tvm/pull/7842) [#7844](https://github.com/apache/tvm/pull/7844) [#6646](https://github.com/apache/tvm/pull/6646) [#6647](https://github.com/apache/tvm/pull/6647) [#6681](https://github.com/apache/tvm/pull/6681) [#6700](https://github.com/apache/tvm/pull/6700) [#7883](https://github.com/apache/tvm/pull/7883) [#6726](https://github.com/apache/tvm/pull/6726) [#6730](https://github.com/apache/tvm/pull/6730) [#7899](https://github.com/apache/tvm/pull/7899) [#7900](https://github.com/apache/tvm/pull/7900) [#7906](https://github.com/apache/tvm/pull/7906) [#7934](https://github.com/apache/tvm/pull/7934) [#7956](https://github.com/apache/tvm/pull/7956) [#8007](https://github.com/apache/tvm/pull/8007) [#8011](https://github.com/apache/tvm/pull/8011) [#8084](https://github.com/apache/tvm/pull/8084) [#8099](https://github.com/apache/tvm/pull/8099) [#8189](https://github.com/apache/tvm/pull/8189) [#8191](https://github.com/apache/tvm/pull/8191) [#8304](https://github.com/apache/tvm/pull/8304) 
[#8321](https://github.com/apache/tvm/pull/8321) [#8337](https://github.com/apache/tvm/pull/8337) [#8356](https://github.com/apache/tvm/pull/8356) [#8385](https://github.com/apache/tvm/pull/8385) [#8502](https://github.com/apache/tvm/pull/8502) [#8426](https://github.com/apache/tvm/pull/8426) [#8440](https://github.com/apache/tvm/pull/8440) [#8456](https://github.com/apache/tvm/pull/8456) [#8475](https://github.com/apache/tvm/pull/8475) [#7391](https://github.com/apache/tvm/pull/7391) [#7394](https://github.com/apache/tvm/pull/7394) [#8621](https://github.com/apache/tvm/pull/8621) [#8322](https://github.com/apache/tvm/pull/8322) [#8323](https://github.com/apache/tvm/pull/8323) [#8435](https://github.com/apache/tvm/pull/8435) [#8436](https://github.com/apache/tvm/pull/8436) [#8455](https://github.com/apache/tvm/pull/8455) [#7353](https://github.com/apache/tvm/pull/7353) [#7215](https://github.com/apache/tvm/pull/7215)
-- TensorFlow and TFLite, including more operators, better TensorArray support and quantization [#9404](https://github.com/apache/tvm/pull/9404) [#9256](https://github.com/apache/tvm/pull/9256) [#8689](https://github.com/apache/tvm/pull/8689) [#7789](https://github.com/apache/tvm/pull/7789) [#7736](https://github.com/apache/tvm/pull/7736) [#8763](https://github.com/apache/tvm/pull/8763) [#8647](https://github.com/apache/tvm/pull/8647) [#8648](https://github.com/apache/tvm/pull/8648) [#8558](https://github.com/apache/tvm/pull/8558) [#8780](https://github.com/apache/tvm/pull/8780) [#8538](https://github.com/apache/tvm/pull/8538) [#7659](https://github.com/apache/tvm/pull/7659) [#7639](https://github.com/apache/tvm/pull/7639) [#7531](https://github.com/apache/tvm/pull/7531) [#7520](https://github.com/apache/tvm/pull/7520) [#7502](https://github.com/apache/tvm/pull/7502) [#7496](https://github.com/apache/tvm/pull/7496) [#7473](https://github.com/apache/tvm/pull/7473) [#7452](https://github.com/apache/tvm/pull/7452) [#7442](https://github.com/apache/tvm/pull/7442) [#7441](https://github.com/apache/tvm/pull/7441) [#7400](https://github.com/apache/tvm/pull/7400) [#7320](https://github.com/apache/tvm/pull/7320) [#7293](https://github.com/apache/tvm/pull/7293) [#7267](https://github.com/apache/tvm/pull/7267) [#7159](https://github.com/apache/tvm/pull/7159) [#7148](https://github.com/apache/tvm/pull/7148) [#7114](https://github.com/apache/tvm/pull/7114) [#7113](https://github.com/apache/tvm/pull/7113) [#7093](https://github.com/apache/tvm/pull/7093) [#7074](https://github.com/apache/tvm/pull/7074) [#7048](https://github.com/apache/tvm/pull/7048) [#7030](https://github.com/apache/tvm/pull/7030) [#6998](https://github.com/apache/tvm/pull/6998) [#6984](https://github.com/apache/tvm/pull/6984) [#6970](https://github.com/apache/tvm/pull/6970) [#6949](https://github.com/apache/tvm/pull/6949) [#6933](https://github.com/apache/tvm/pull/6933) 
[#6918](https://github.com/apache/tvm/pull/6918) [#6901](https://github.com/apache/tvm/pull/6901) [#6885](https://github.com/apache/tvm/pull/6885) [#6849](https://github.com/apache/tvm/pull/6849) [#5767](https://github.com/apache/tvm/pull/5767) [#6589](https://github.com/apache/tvm/pull/6589) [#6670](https://github.com/apache/tvm/pull/6670) [#6674](https://github.com/apache/tvm/pull/6674) [#6675](https://github.com/apache/tvm/pull/6675) [#7866](https://github.com/apache/tvm/pull/7866) [#6685](https://github.com/apache/tvm/pull/6685) [#7885](https://github.com/apache/tvm/pull/7885) [#6729](https://github.com/apache/tvm/pull/6729) [#7901](https://github.com/apache/tvm/pull/7901) [#6774](https://github.com/apache/tvm/pull/6774) [#6783](https://github.com/apache/tvm/pull/6783) [#6799](https://github.com/apache/tvm/pull/6799) [#7951](https://github.com/apache/tvm/pull/7951) [#8024](https://github.com/apache/tvm/pull/8024) [#8051](https://github.com/apache/tvm/pull/8051) [#8060](https://github.com/apache/tvm/pull/8060) [#8074](https://github.com/apache/tvm/pull/8074) [#8142](https://github.com/apache/tvm/pull/8142) [#8179](https://github.com/apache/tvm/pull/8179) [#8251](https://github.com/apache/tvm/pull/8251) [#8277](https://github.com/apache/tvm/pull/8277) [#8335](https://github.com/apache/tvm/pull/8335) [#8364](https://github.com/apache/tvm/pull/8364) [#8375](https://github.com/apache/tvm/pull/8375) [#8431](https://github.com/apache/tvm/pull/8431) [#8454](https://github.com/apache/tvm/pull/8454) [#6818](https://github.com/apache/tvm/pull/6818) [#8483](https://github.com/apache/tvm/pull/8483) [#9099](https://github.com/apache/tvm/pull/9099) [#9165](https://github.com/apache/tvm/pull/9165)
-- PyTorch: more operators including activations, inplace operators, RNNs, NMS [#9371](https://github.com/apache/tvm/pull/9371) [#9204](https://github.com/apache/tvm/pull/9204) [#9185](https://github.com/apache/tvm/pull/9185) [#9135](https://github.com/apache/tvm/pull/9135) [#9133](https://github.com/apache/tvm/pull/9133) [#9015](https://github.com/apache/tvm/pull/9015) [#8839](https://github.com/apache/tvm/pull/8839) [#8718](https://github.com/apache/tvm/pull/8718) [#8699](https://github.com/apache/tvm/pull/8699) [#8692](https://github.com/apache/tvm/pull/8692) [#7712](https://github.com/apache/tvm/pull/7712) [#8753](https://github.com/apache/tvm/pull/8753) [#7694](https://github.com/apache/tvm/pull/7694) [#8583](https://github.com/apache/tvm/pull/8583) [#7675](https://github.com/apache/tvm/pull/7675) [#7646](https://github.com/apache/tvm/pull/7646) [#7606](https://github.com/apache/tvm/pull/7606) [#7592](https://github.com/apache/tvm/pull/7592) [#7569](https://github.com/apache/tvm/pull/7569) [#7544](https://github.com/apache/tvm/pull/7544) [#7549](https://github.com/apache/tvm/pull/7549) [#7535](https://github.com/apache/tvm/pull/7535) [#7517](https://github.com/apache/tvm/pull/7517) [#7465](https://github.com/apache/tvm/pull/7465) [#7397](https://github.com/apache/tvm/pull/7397) [#7371](https://github.com/apache/tvm/pull/7371) [#7348](https://github.com/apache/tvm/pull/7348) [#7346](https://github.com/apache/tvm/pull/7346) [#7325](https://github.com/apache/tvm/pull/7325) [#7231](https://github.com/apache/tvm/pull/7231) [#7174](https://github.com/apache/tvm/pull/7174) [#7154](https://github.com/apache/tvm/pull/7154) [#7137](https://github.com/apache/tvm/pull/7137) [#7134](https://github.com/apache/tvm/pull/7134) [#7133](https://github.com/apache/tvm/pull/7133) [#7128](https://github.com/apache/tvm/pull/7128) [#7088](https://github.com/apache/tvm/pull/7088) [#7023](https://github.com/apache/tvm/pull/7023) [#6900](https://github.com/apache/tvm/pull/6900) 
[#6602](https://github.com/apache/tvm/pull/6602) [#7845](https://github.com/apache/tvm/pull/7845) [#6659](https://github.com/apache/tvm/pull/6659) [#6740](https://github.com/apache/tvm/pull/6740) [#6782](https://github.com/apache/tvm/pull/6782) [#6784](https://github.com/apache/tvm/pull/6784) [#7958](https://github.com/apache/tvm/pull/7958) [#8192](https://github.com/apache/tvm/pull/8192) [#8397](https://github.com/apache/tvm/pull/8397) [#8398](https://github.com/apache/tvm/pull/8398) [#8403](https://github.com/apache/tvm/pull/8403) [#8447](https://github.com/apache/tvm/pull/8447) [#6829](https://github.com/apache/tvm/pull/6829)
-- MXNet support. More operators and NLP model coverage in GluonNLP [#7568](https://github.com/apache/tvm/pull/7568) [#7409](https://github.com/apache/tvm/pull/7409) [#7209](https://github.com/apache/tvm/pull/7209) [#7191](https://github.com/apache/tvm/pull/7191) [#7062](https://github.com/apache/tvm/pull/7062) [#6561](https://github.com/apache/tvm/pull/6561) [#6699](https://github.com/apache/tvm/pull/6699)
-- Misc: CoreML, Keras, DarkNet, etc. [#7667](https://github.com/apache/tvm/pull/7667) [#6676](https://github.com/apache/tvm/pull/6676) [#6651](https://github.com/apache/tvm/pull/6651) [#6963](https://github.com/apache/tvm/pull/6963) [#7949](https://github.com/apache/tvm/pull/7949) [#7035](https://github.com/apache/tvm/pull/7035) [#7446](https://github.com/apache/tvm/pull/7446) [#8562](https://github.com/apache/tvm/pull/8562) [#8599](https://github.com/apache/tvm/pull/8599)
-
-#### Codegen Backends and Runtime
-
-- LLVM backend: recover LLVM support on windows; support target feature strings in function attributes; atomic support in NVPTX, ROCm; LLVM compatibility to LLVM 12+ [#9305](https://github.com/apache/tvm/pull/9305) [#9223](https://github.com/apache/tvm/pull/9223) [#9138](https://github.com/apache/tvm/pull/9138) [#8860](https://github.com/apache/tvm/pull/8860) [#8958](https://github.com/apache/tvm/pull/8958) [#6763](https://github.com/apache/tvm/pull/6763) [#6698](https://github.com/apache/tvm/pull/6698) [#6717](https://github.com/apache/tvm/pull/6717) [#6738](https://github.com/apache/tvm/pull/6738) [#8293](https://github.com/apache/tvm/pull/8293) [#6907](https://github.com/apache/tvm/pull/6907) [#7051](https://github.com/apache/tvm/pull/7051)
-- ROCm 3.9 bitcode files search [#6865](https://github.com/apache/tvm/pull/6865)
-- Vulkan and SPIR-V refactoring and major improvement in codegen and runtime. [A critical bug fix in SPIRV codegen](https://github.com/apache/tvm/pull/8102) allows the Vulkan backend to produce correct outputs on more hardwares and drivers. Added support for querying device specific hardware parameters and capabilities, dynamic shapes, irregular ops such as sorting and NMS, UBO, fp16, and vectorization. We can now run complicated models like MaskRCNN on Vulkan end to end. [#8904](https://github.com/apache/tvm/pull/8904) [#7833](https://github.com/apache/tvm/pull/7833) [#7717](https://github.com/apache/tvm/pull/7717) [#7681](https://github.com/apache/tvm/pull/7681) [#8746](https://github.com/apache/tvm/pull/8746) [#8813](https://github.com/apache/tvm/pull/8813) [#7609](https://github.com/apache/tvm/pull/7609) [#8882](https://github.com/apache/tvm/pull/8882) [#7607](https://github.com/apache/tvm/pull/7607) [#7591](https://github.com/apache/tvm/pull/7591) [#7574](https://github.com/apache/tvm/pull/7574) [#7572](https://github.com/apache/tvm/pull/7572) [#7833](https://github.com/apache/tvm/pull/7833) [#6662](https://github.com/apache/tvm/pull/6662) [#7969](https://github.com/apache/tvm/pull/7969) [#8013](https://github.com/apache/tvm/pull/8013) [#8048](https://github.com/apache/tvm/pull/8048) [#8098](https://github.com/apache/tvm/pull/8098) [#8102](https://github.com/apache/tvm/pull/8102) [#8107](https://github.com/apache/tvm/pull/8107) [#8127](https://github.com/apache/tvm/pull/8127) [#8151](https://github.com/apache/tvm/pull/8151) [#8196](https://github.com/apache/tvm/pull/8196) [#8320](https://github.com/apache/tvm/pull/8320) [#8588](https://github.com/apache/tvm/pull/8588) [#8332](https://github.com/apache/tvm/pull/8332) [#8333](https://github.com/apache/tvm/pull/8333) [#8348](https://github.com/apache/tvm/pull/8348) [#8528](https://github.com/apache/tvm/pull/8528)
-- Metal language version upgrade (`MTLLanguageVersion2_3`), better codegen support, int64 support, various bug fixes [#7830](https://github.com/apache/tvm/pull/7830) [#7819](https://github.com/apache/tvm/pull/7819) [#7714](https://github.com/apache/tvm/pull/7714) [#7118](https://github.com/apache/tvm/pull/7118) [#7116](https://github.com/apache/tvm/pull/7116) [#7105](https://github.com/apache/tvm/pull/7105) [#7980](https://github.com/apache/tvm/pull/7980) [#8054](https://github.com/apache/tvm/pull/8054) [#8175](https://github.com/apache/tvm/pull/8175) [#8202](https://github.com/apache/tvm/pull/8202) [#8206](https://github.com/apache/tvm/pull/8206) [#8313](https://github.com/apache/tvm/pull/8313)
-- OpenCL, VTA, Verilator: refactored code generator, better error messages, various bug fixes [#7834](https://github.com/apache/tvm/pull/7834) [#7777](https://github.com/apache/tvm/pull/7777) [#7761](https://github.com/apache/tvm/pull/7761) [#7100](https://github.com/apache/tvm/pull/7100) [#6125](https://github.com/apache/tvm/pull/6125) [#6126](https://github.com/apache/tvm/pull/6126) [#6191](https://github.com/apache/tvm/pull/6191) [#7834](https://github.com/apache/tvm/pull/7834) [#8256](https://github.com/apache/tvm/pull/8256) [#8257](https://github.com/apache/tvm/pull/8257) [#8731](https://github.com/apache/tvm/pull/8731) [#8756](https://github.com/apache/tvm/pull/8756) [#8973](https://github.com/apache/tvm/pull/8973)
-- CUDA: enable `__launch_bounds__`, dynamic shared memory, TensorCore, BF16, half2, NVCC version upgrade [#9341](https://github.com/apache/tvm/pull/9341) [#8678](https://github.com/apache/tvm/pull/8678) [#7561](https://github.com/apache/tvm/pull/7561) [#7273](https://github.com/apache/tvm/pull/7273) [#7146](https://github.com/apache/tvm/pull/7146) [#7147](https://github.com/apache/tvm/pull/7147) [#7099](https://github.com/apache/tvm/pull/7099) [#7065](https://github.com/apache/tvm/pull/7065) [#7033](https://github.com/apache/tvm/pull/7033) [#7014](https://github.com/apache/tvm/pull/7014) [#7907](https://github.com/apache/tvm/pull/7907) [#7964](https://github.com/apache/tvm/pull/7964) [#9087](https://github.com/apache/tvm/pull/9087) [#8135](https://github.com/apache/tvm/pull/8135) [#8137](https://github.com/apache/tvm/pull/8137) [#8457](https://github.com/apache/tvm/pull/8457) [#8466](https://github.com/apache/tvm/pull/8466) [#8571](https://github.com/apache/tvm/pull/8571)
-- ARM: CMSIS-NN, Ethos-N [#8653](https://github.com/apache/tvm/pull/8653) [#7628](https://github.com/apache/tvm/pull/7628) [#8951](https://github.com/apache/tvm/pull/8951) [#7506](https://github.com/apache/tvm/pull/7506) [#7443](https://github.com/apache/tvm/pull/7443) [#7858](https://github.com/apache/tvm/pull/7858) [#6982](https://github.com/apache/tvm/pull/6982) [#8795](https://github.com/apache/tvm/pull/8795) [#8806](https://github.com/apache/tvm/pull/8806) [#8833](https://github.com/apache/tvm/pull/8833) [#9147](https://github.com/apache/tvm/pull/9147) [#9159](https://github.com/apache/tvm/pull/9159) [#9160](https://github.com/apache/tvm/pull/9160) [#9162](https://github.com/apache/tvm/pull/9162) [#9163](https://github.com/apache/tvm/pull/9163) [#9167](https://github.com/apache/tvm/pull/9167) [#9209](https://github.com/apache/tvm/pull/9209) [#9386](https://github.com/apache/tvm/pull/9386) [#9387](https://github.com/apache/tvm/pull/9387)
-- Hexagon: build, compilation, model launcher, more target options and better runtime [#7784](https://github.com/apache/tvm/pull/7784) [#6718](https://github.com/apache/tvm/pull/6718) [#8821](https://github.com/apache/tvm/pull/8821) [#8822](https://github.com/apache/tvm/pull/8822) [#9033](https://github.com/apache/tvm/pull/9033) [#8823](https://github.com/apache/tvm/pull/8823) [#8859](https://github.com/apache/tvm/pull/8859) [#8865](https://github.com/apache/tvm/pull/8865) [#8915](https://github.com/apache/tvm/pull/8915) [#8954](https://github.com/apache/tvm/pull/8954) [#9024](https://github.com/apache/tvm/pull/9024) [#9025](https://github.com/apache/tvm/pull/9025) [#8960](https://github.com/apache/tvm/pull/8960) [#8986](https://github.com/apache/tvm/pull/8986) [#9010](https://github.com/apache/tvm/pull/9010) [#9011](https://github.com/apache/tvm/pull/9011) [#9189](https://github.com/apache/tvm/pull/9189) [#9220](https://github.com/apache/tvm/pull/9220) [#9355](https://github.com/apache/tvm/pull/9355) [#9356](https://github.com/apache/tvm/pull/9356)
-
-
-- WASM: Update support for latest emcc, add ffi test. [#6751](https://github.com/apache/tvm/pull/6751)
-
-#### BYOC Integration with Vendor Libraries: TensorRT, ACL, VitisAI
-
-- TensorRT initial integration, stabilization, int8 calibration, dynamism support  [#6395](https://github.com/apache/tvm/pull/6395) [#7702](https://github.com/apache/tvm/pull/7702) [#7595](https://github.com/apache/tvm/pull/7595) [#7581](https://github.com/apache/tvm/pull/7581) [#7412](https://github.com/apache/tvm/pull/7412) [#7372](https://github.com/apache/tvm/pull/7372) [#9047](https://github.com/apache/tvm/pull/9047) [#8073](https://github.com/apache/tvm/pull/8073) [#8808](https://github.com/apache/tvm/pull/8808) [#6905](https://github.com/apache/tvm/pull/6905) [#7967](https://github.com/apache/tvm/pull/7967) [#8005](https://github.com/apache/tvm/pull/8005) [#8172](https://github.com/apache/tvm/pull/8172) [#8461](https://github.com/apache/tvm/pull/8461) [#8506](https://github.com/apache/tvm/pull/8506) [#8607](https://github.com/apache/tvm/pull/8607) [#7205](https://github.com/apache/tvm/pull/7205) [#7026](https://github.com/apache/tvm/pull/7026) [#7016](https://github.com/apache/tvm/pull/7016) [#7011](https://github.com/apache/tvm/pull/7011) [#6955](https://github.com/apache/tvm/pull/6955) [#6872](https://github.com/apache/tvm/pull/6872) [#7253](https://github.com/apache/tvm/pull/7253) [#6805](https://github.com/apache/tvm/pull/6805) [#9324](https://github.com/apache/tvm/pull/9324)
-- Arm Compute Library (ACL) integration [#7649](https://github.com/apache/tvm/pull/7649) [#7206](https://github.com/apache/tvm/pull/7206) [#6532](https://github.com/apache/tvm/pull/6532) [#7121](https://github.com/apache/tvm/pull/7121) [#6724](https://github.com/apache/tvm/pull/6724) [#8149](https://github.com/apache/tvm/pull/8149) [#7251](https://github.com/apache/tvm/pull/7251) [#9396](https://github.com/apache/tvm/pull/9396)
-- Verilator integration [#7406](https://github.com/apache/tvm/pull/7406) [#7351](https://github.com/apache/tvm/pull/7351) [#7286](https://github.com/apache/tvm/pull/7286) [#8094](https://github.com/apache/tvm/pull/8094)
-- VitisAI integration [#6343](https://github.com/apache/tvm/pull/6343) [#7350](https://github.com/apache/tvm/pull/7350)
-- BYOC infrastructure enhancement: improving control flow, AnnotateTarget, custom codegen [#6641](https://github.com/apache/tvm/pull/6641) [#6655](https://github.com/apache/tvm/pull/6655) [#6697](https://github.com/apache/tvm/pull/6697) [#6786](https://github.com/apache/tvm/pull/6786) [#7977](https://github.com/apache/tvm/pull/7977) [#8464](https://github.com/apache/tvm/pull/8464)
-
-
-#### TVMC
-
-- MacOS support [#8396](https://github.com/apache/tvm/pull/8396)
-- AutoScheduler support [#7070](https://github.com/apache/tvm/pull/7070)
-- Support cross compiler options [#7922](https://github.com/apache/tvm/pull/7922)
-- Python scripting [#7823](https://github.com/apache/tvm/pull/7823) [#7698](https://github.com/apache/tvm/pull/7698)
-- More flexible input specification [#7366](https://github.com/apache/tvm/pull/7366) [#7788](https://github.com/apache/tvm/pull/7788)
-- More options, `--disable-pass` and `--config` [#7816](https://github.com/apache/tvm/pull/7816) [#8253](https://github.com/apache/tvm/pull/8253)
-- Allow passing optional arguments to importers [#7674](https://github.com/apache/tvm/pull/7674)
-- Model library format (MLF) support [#8086](https://github.com/apache/tvm/pull/8086) [#8331](https://github.com/apache/tvm/pull/8331)
-- More backend and library support: metal, ACL, Vulkan, OpenCL, ROCm, Vitis AI [#8282](https://github.com/apache/tvm/pull/8282) [#7508](https://github.com/apache/tvm/pull/7508) [#8359](https://github.com/apache/tvm/pull/8359) [#6831](https://github.com/apache/tvm/pull/6831) [#8896](https://github.com/apache/tvm/pull/8896) [#7577](https://github.com/apache/tvm/pull/7577)
-- Support for the new target system [#7651](https://github.com/apache/tvm/pull/7651) [#7654](https://github.com/apache/tvm/pull/7654) [#6788](https://github.com/apache/tvm/pull/6788) [#7304](https://github.com/apache/tvm/pull/7304) [#6855](https://github.com/apache/tvm/pull/6855)
-
-#### Rust Binding
-
-- Rust bindings installable via Cargo [#7503](https://github.com/apache/tvm/pull/7503) [#6678](https://github.com/apache/tvm/pull/6678) [#8631](https://github.com/apache/tvm/pull/8631) [#8665](https://github.com/apache/tvm/pull/8665)
-- Initial support for diagnostic interface [#6656](https://github.com/apache/tvm/pull/6656)
-- Fixes for using Python APIs from Rust [#7085](https://github.com/apache/tvm/pull/7085)
-- Improve NDArray, GraphRt, Relay, IRModule, Array, Attrs bindings [#6563](https://github.com/apache/tvm/pull/6563) [#6741](https://github.com/apache/tvm/pull/6741) [#7138](https://github.com/apache/tvm/pull/7138) [#8353](https://github.com/apache/tvm/pull/8353) [#7082](https://github.com/apache/tvm/pull/7082)
-- Improve error handling, error messages and fix memory leaks [#8289](https://github.com/apache/tvm/pull/8289) [#6815](https://github.com/apache/tvm/pull/6815) [#8714](https://github.com/apache/tvm/pull/8714) [#8725](https://github.com/apache/tvm/pull/8725)
-
-#### Misc
-
-- Enhanced CPP-RPC implementation: allow user supplied work dir, support of CPP-RPC server for Apple, support adb-shell style CPP-RPC [#7670](https://github.com/apache/tvm/pull/7670) [#8224](https://github.com/apache/tvm/pull/8224) [#8223](https://github.com/apache/tvm/pull/8223) [#7766](https://github.com/apache/tvm/pull/7766) [#7013](https://github.com/apache/tvm/pull/7013)
-- Use PopenWorker to handle RPC system: [#7889](https://github.com/apache/tvm/pull/7889) [#7757](https://github.com/apache/tvm/pull/7757) [#7961](https://github.com/apache/tvm/pull/7961)
-- Fold target host into target [#7462](https://github.com/apache/tvm/pull/7462) [#7791](https://github.com/apache/tvm/pull/7791) [#7534](https://github.com/apache/tvm/pull/7534) [#8835](https://github.com/apache/tvm/pull/8835)
-- Target-based intrinsic lowering and legalization [#7936](https://github.com/apache/tvm/pull/7936) [#7809](https://github.com/apache/tvm/pull/7809)
-- Add target tags for all existing CUDA GPU models [#7410](https://github.com/apache/tvm/pull/7410)
-- Linear Congruential Random Engine [#8642](https://github.com/apache/tvm/pull/8642)
-
-## 0.7
-v0.7 brings many major features. The community works together to refactor the internal code base to bring a unified IR code structure with a unified IRModule, type system and pass infrastructure. We have also brought many exciting new features, some highlights include:
-
-* Initial automatic scheduling support
-* Initial command line driver interface
-* WebGPU and webassembly support
-* Better first class rust support in the codebase
-* Initial Hexagon support
-* Bring your own codegen (BYOC) support
-
-The community also continues to bring high quality improvements to the existing modules including, but not limited to: better frontend coverage, performance, quantization, microTVM and dynamic shape support.
-
-### New Features
-#### Automatic Scheduling (Experimental)
-* Phase 0: Ansor minimum system for auto schedule generating #5962
-* Phase 1: Access Analyzer #6103
-* Phase 1: Add `follow_split` and `follow_fused_split` steps #6142
-* Phase 1: Add `pragma`/`storage_align`/`rfactor` steps #6141
-* Phase 1: Add RPC Runner #6077
-* Phase 1: Add `annotation`/`compute_at`/`compute_root`/`compute_inline` steps #6073
-* Phase 1: Add `cache_read`/`cache_write` steps #6107
-* Phase 1: Rename namespace from `auto_schedule` to `auto_scheduler` #6059
-* Phase 1: The base class for cost models #6187
-* Phase 1: feature extraction for cost models #6190
-* Phase 1: XGBoost Cost Model #6270
-* Phase 2: Basic GPU Sketch Search Policy #6269
-* Phase 2: Evolutionary Search #6310
-* Phase 2: Update heavy operations with `parallel_for` #6348
-* Parallel the InitPopulation (#6512)
-* Tutorial: Using the template-free auto-scheduler on CPU (#6488)
-
-#### BYOC
-* External codegen support in Relay (#4482), (#4544)
-* Bring Your Own Codegen Guide -- Part 1 #4602
-* Bring Your Own Codegen Guide -- Part 2 #4718
-* Relay annotation and partitioning for external compilers #4570
-* JSON Runtime with DNNL End-to-End Flow #5919
-* Handle one symbol for each runtime #5989
-* Run accelerator specific optimizations #6068
-* Arm Compute Library integration #5915
-* Retire the example json runtime #6177
-* `json_node.h` should include `data_type.h` #6224
-* Improve installation tutorial #6170
-* Add support for dense (fully connected) layer #6254
-* Introduce the Ethos-N BYOC integration #6222
-* Enable remote device via environment variables #6279
-* Improved pooling support #6248
-* Add support for quantized convolution #6335
-* CoreML codegen #5634
-
-#### Operator Coverage
-* Add `strided_set` operation (#4303)
-* Add support for conv3d (#4400), pool3d (#4478), 3d upsampling ops (#4584)
-* Add group convolution for VTA (#4421)
-* Add 1d deconvolution op (#4476)
-* Allow batch matmul to be fused into injective ops (#4537)
-* Add native depthtospace and spacetodepth operators (#4566)
-* Add CUDNN conv3d support (#4418)
-* Dilation2D operator support #5033
-* Isfinite operator #4981
-* Unravel Index operator #5082
-* Add thrust support for nms #5116
-* Resize3d, Upsample3d op support #5633
-* Add operator Correlation #5628
-* `affine_grid` and `grid_sample` #5657
-* Sparse to dense operator #5447
-* `Conv3d_transpose` op support added #5737
-* add op `crop_and_resize` #4417
-* Add bitwise ops #4815
-* Sparse to dense operator #5447
-* support dynamic NMS(Non Maximum Suppression), symbolic begin, end, and strides for strided_slice #4312
-* `Conv3d_transpose` op support added #5737
-* ReverseSequence operator #5495
-* Conv1D #4639
-* 1D Pooling #4663
-
-#### Quantization
-* Channel wise quantization - Quantize & Requantize #4629
-* Support QNN ops. #5066
-* Adding support for QNN subtract op #5153
-* TFLite QNN Tutorial #5595
-* Tutorial: Deploy Quantized Model on CUDA #4667
-* Support asymmetric per-layer quantized operators #6109
-
-#### Relay
-* Add convertlayout pass in Relay (#4335, #4600)
-* Added Merge Composite pass #4771
-* Call graph for relay #4922
-* Add inline pass #4927
-* Target annotation for external codegen #4933
-* GradientCell Relay Pass #5039
-* Add MergeCompilerRegions pass #5134
-* Non-recursive Graph Visitor and Rewriter (#4886)
-* [Blocksparse] Pipeline for lowering dense model to sparse-dense (#5377)
-* Relay op strategy #4644
-* Static Tensor Array (#5103)
-* Memory planner (part 1) #5144
-* ONNX codegen #5052
-* Add Parser 2.0 #5932, part 2 #6162
-* Basic block normal form #6152
-* Convert Layout pass. #4664
-* Pattern Language, Matcher, Rewriter, and Function Partitioner #5231
-
-#### Runtime and Backend
-* Add ADTObject POD container type (#4346)
-* TFLite RPC runtime (#4439)
-* Standardized graph runtime export (#4532)
-* MISRA-C compliant TVM runtime #3934
-* Add String container #4628
-* Introduce Virtual Memory Allocator to CRT (#5124)
-* Initial implementation of Hexagon runtime support (#5252)
-* FastRPC interface for Hexagon runtime (#5353)
-* CoreML Runtime (#5283)
-* AutoTVM + uTVM for Cortex-M7 (#5417)
-* Windows Support for cpp_rpc (#4857)
-* Implement TVMDSOOp(TensorFlow custom op) for TVM runtime (#4459)
-* WebGPU support #5545
-* TVM WebAssembly JS Runtime #5506
-* Hexagon driver for offloading kernels to simulator #5492
-* Introduce runtime::Array #5585
-* Allow non-nullable ObjectRef, introduce Optional. (#5314)
-* Introduce static slots for common objects. (#5423)
-* Introduce RValue reference(move) support to TypedPackedFunc (#5271)
-* Introduce MetadataModule to separate code compilation/interpretation and weight initialization #5770
-* Support module based interface runtime #5753
-* Add TVM application extension with WASM runtime #5892
-* Provide guide to users who have difficulty registering SEqualReduce (#5300)
-
-#### Rust Support
-* Revive the Rust + SGX refactor #4976
-* Improve Rust bindings: Map, Array, String, various IR nodes #6339
-* Rust Refactor Stage 4: Rewrite Rust graph runtime to use new APIs #5830
-* Second stage of Rust Refactor #5527
-* tvm crate stage 3 of Rust refactor #5769
-* Add first stage of updating and rewriting Rust bindings. #5526
-
-#### TIR
-* Introduce StructuralHash for the Unified IR. #5160
-* Introduce StructuralEqual Infra for the unified IR. #5154
-* Introduce ExprDeepEqual, Remove IRDeepCompare #5206
-* [TIR] Introduce BufferLoad/Store (#5205)
-* Improved massive build times caused by tir.floormod and tir.floordiv. Fixed Topi testcase. #5666
-* Buffer logger assert removed #6147
-* Enhance VerifyGPUCode #6194
-* HoistIfThenElse added #6066
-* Hybrid Script Support for TIR #6227
-* Migrate Low-level Passes to Pass Manager #5198
-* HoistIfThenElse added #6066
-* Hybrid Script Support for TIR #6227
-* Block scope hoisting added #6238
-
-#### TE
-* reverse-mode autodiff without any optimization #5121
-* Tensor Expression Debug Display (TEDD) #4651
-* Optimize and eliminate the Jacobian tensor for te.autodiff #6078
-
-#### TVMC(Experimental)
-* TVMC - A command line driver for TVM (Part 1) #6112
-* TVMC - Linting error on onnx command line driver frontend #6536
-* TVMC - Command line driver 'compile' (part 2/4) #6302
-* TVMC - Introduce 'tune' subcommand (part 3/4) #6537
-* TVMC - Introduce 'run' subcommand (part 4/4) #6578
-* TVMC - Getting started tutorial for TVMC #6597
-
-
-### Feature Improvement
-#### Accelerator and Microcontroller Support
-- Cleanup legacy verilog code (#4576)
-- uTVM support for ARM STM32F746XX boards (#4274)
-- Add --runtime=c, remove `micro_dev` target, enable LLVM backend #6145
-
-#### Arithmetic Analysis
-* Linear system and equation solver (#5171)
-* Inequalities solver #5618
-* Improve IntervalSet's floormod (#5367)
-* Remove legacy const pattern functions (#5387)
-* Handle likely in IRMutatorWithAnalyzer #5665
-* ExtendedEuclidean merge impl to int_operator #5625
-* Rewrite simplify fix for Vectorized Cooperative Fetching #5924
-
-#### AutoTVM and Graph Tuner
-* Adding ROCM schedules for TOPI (#4507)
-* NHWC conv2d schedule templates for ARM (#3859)
-* Use VM compile to extract autotvm tasks #4328
-* Download fallback schedule file if it does not exist #4671
-* Ignore error when removing tmpdir #4781
-* Fix a bug in generating the search space #4779
-* Minor bug fixes in AutoTVM for QNN graphs #4797
-* Fix autotvm customized template #5034
-* Add opt out operator for `has_multiple_inputs` for graph tuner #5000
-* Customize SI prefix in logging (#5411)
-* Update XGBoost verbosity option #5649
-* Support range in index based tuners #4870
-* Enable random fill and CPU cache flush for AutoTVM and Ansor (#6391)
-* Auto-scheduler tutorial for GPU and necessary refactor/fix (#6512)
-
-#### BYOC
-* [BYOC] Bind constant tuples in graph partitioner (#5476)
-* [BYOC] Add support for composite functions in BYOC (#5261)
-* [BYOC] Register pattern tables from external codegens (#5262)
-* [BYOC] Enhance partitioning and external codegen (#5310)
-* [BYOC] Refine AnnotateTarget and MergeCompilerRegion Passes (#5277)
-* [BYOC] Use Non-Recursive Visitor/Mutator (#5410)
-* [BYOC] Refine DNNL Codegen (#5288)
-* [BYOC] Add example of Composite + Annotate for DNNL fused op (#5272)
-* [BYOC] Prevent duplicate outputs in subgraph Tuple (#5320)
-* [BYOC] Introduce further operator support (#6355)
-* [BYOC] Support input nodes with multiple entries (#6368)
-* [BYOC] Add maximum support for float32 (#6506)
-
-#### Codegen
-* Intrinsic dispatching with OCML instead of LLVM for ROCm (#4499)
-* Make target codegen take IRModule and PrimFunc. #5107
-* Enhance CUDA codegen for SelectNode #4983
-* Vectorization for intrinsics #5101
-* [LLVM] Do not use `x86_vcvtph2ps_256` intrinsic with LLVM 11+ (#5267)
-* [LLVM] Use llvm::ElementCount with LLVM 11+ when creating vectors (#5265)
-* [LLVM] Use llvm::FunctionCallee in IRBuilder::CreateCall with LLVM 11+ (#5338)
-* [LLVM] Include Support/Host.h for declaration of getDefaultTargetTriple (#5268)
-* [LLVM] Replace calls to Type::getVectorNumElements (#5398)
-* [LLVM] Use ArrayRef in calls to CreateShuffleVector (#5399)
-* [LLVM] Use llvm::Align with LLVM 11+ to avoid warnings (#5264)
-* [CodeGen] Cleanup generated code (#5424)
-* Rename `target_id` => `target_kind` #6199
-* 64-bit RPi4b target #6211
-* Creating Target from JSON-like Configuration #6218
-* Add python binding to new JSON target construction #6315
-* Use target class in all codegens #6347
-* Initial support for Hexagon codegen #6261
-* Add --runtime=c, remove `micro_dev` target, enable LLVM backend #6145
-* Add tvm::support::hexdump() debug utility #6154
-* Adding AMD codegen unit tests (#4509)
-* Support cuda tensorcore subbyte int data type in auto tensorcore #4546
-* Handle empty LLVMModule in GetFunction #5146
-* Support int4/int8 conv2d tensor core with HWNC layout #6121
-
-#### Dynamism Support
-* Add shape function for `zero`, `zeros_like`, `ones`, `ones_like` (#4448), `tile` (#4441)
-* Support symbolic newshape for Reshape #5429
-* Support symbolic TopK, Ones, Zeros and Full #5459
-* Add `shape_of` instruction #5855
-* symbolic `max_output_size` #5844
-* Dynamic TopK Op #6008
-* Dynamic `broadcast_to`, `zeros`, `ones` #6007
-* Add dynamic reshape grad #6080
-* Keep fixed dim when unifying dynamic shape #5795
-* OneHot operation #6209
-* Add Dynamic Resize Op #6198
-* Dynamic full operator #6260
-* Dynamic upsampling relay op #6273
-* Dynamic Tile Op #5983
-
-#### Frontend and User Interface
-* TFLite parser support for `transpose_conv` (#4440), `unpack` (#4447)
-* LLDB pretty printers for relay (#4453)
-* ONNX to Relay converter op support: expand op (#4483)
-* ONNX `auto_pad` in conv and convtranspose (#4563)
-* TF to Relay converter op support (#4504) (#4551) (#4484)
-* Remove unnecessary cast of constants in ONNX converter (#4573)
-* Add support for tf.Keras networks in Relay Keras frontend #4630
-* Add conv3d #4604
-* Fix incorrect calculations in tf SLICE #4518
-* Dynamically calculate `input_stats` of any `fake_quant` range #4789
-* LSTM Support #4825
-* Add `MIRROR_PAD` operator #4822
-* use qnn helper function in softmax #4840
-* Add Resize op converter #4838
-* Add support for `TFLite_Detection_PostProcess` #4543
-* Fix tests for tflite unary elemwise operations #4913
-* GaussianDropout/Noise parsing support #4928
-* Add parser support for 'square' operator #4915
-* `make_loss` operator support #4930
-* Add parser support for `l2_normalization` #4966
-* ReadVariableOp operator support #4952
-* Check graph inputs match expected #4992
-* support multiply outputs #4980
-* TFLite: Using real image for QNN testing. #4816
-* TFLite: `FLOOR_MOD` & `FLOOR_DIV` support #4971
-* PyTorch: Upsampling op support and enable registering a user defined op conversion map #4961
-* PyTorch: fix unordered dictionary problem for python version under 3.6 #4982
-* Operator support NonZero #5073
-* Upsampling op support and enable registering a user defined op conversion map #4961
-* Check graph inputs match expected #4992
-* Add support for quantized models via QNN #4977
-* Add initial control flow support #4964
-* Remove FP32 piggy back and use QNN add/mul/concatenate #5061
-* Add missing upcast to uint8 `avg_pool` conversion #5089
-* Add initial 3D op support and test on Resnet 3D #5075
-* Fix conv2d conversion for group conv (group > 1 but != in channels) #5132
-* Add support for `max_pool1d` #5142
-* Add support for split #5174
-* `FLOOR_MOD` & `FLOOR_DIV` support #4971
-* Activation functions support #4978
-* Round op parsing support added #5022
-* DepthToSpace and SpaceToDepth support #5041
-* `TOP_K` op parser support #5051
-* ReadVariableOp operator support #4952
-* Support multiply outputs #4980
-* `reduce_any` op parsing support #4926
-* TensorFlow Parser Control Flow Enhancement #5020
-* TensorFlow Frontend support with shared params #5042
-* Support for AddV2 in Relay Tensorflow frontend converter. #5046
-* conv3d frontend operator support #5080
-* `max_pool3d` and Averagepool3d operator support #5085
-* Support for Atan/Atan2 in Relay Tensorflow frontend converter. #5104
-* Use leaky by default for LeakyReLU #5192
-* Conv3D ONNX support and `conv3D_ncdhw` x86 schedules #4949
-* Add support for FusedBatchNormV3 #5065
-* Activations for pytorch #5194
-* Dropouts And InstanceNorm support added #5203
-* [Frontend] Asymmetric padding of convolution support (#4803)
-* [ONNX]Pool3d & upsample3d op support (#5135)
-* Add TopK to ONNX Frontend (#5441)
-* Add RoiAlign to Onnx frontend (#5454)
-* [PYTORCH]AvgPool3d, MaxPool3d and Squeeze op support (#5220)
-* [PYTORCH]celu, gelu, selu activations (#5263)
-* [Pytorch]layernorm bug fix and testcase updated (#5257)
-* [PYTORCH]LayerNorm support added (#5249)
-* [PYTORCH]GroupNorm op support added (#5358)
-* [PYTORCH]Logical & Bitwise operator support (#5341)
-* [PYTORCH]Tensor creation ops support (#5347)
-* [PYTORCH]cosh,sinh,log2,log10,log1p op support (#5395)
-* [PYTORCH]Rsub, Embedded, OneHot ops support (#5434)
-* [PYTORCH]Abs, Arange, Softplus ops (#5295)
-* [PYTORCH]isNan, isinf, isfinite, ceil, clamp, round ops (#5316)
-* [PYTORCH]Activations for pytorch (#5194)
-* [PYTORCH]Repeat, Reciprocal & Reshape Op support (#5280)
-* [PYTORCH]`Reduce_ops` support added (#5308)
-* [PYTORCH]Take, Topk op support (#5332)
-* [PYTORCH]Dropouts And InstanceNorm support added (#5203)
-* [PYTORCH]Unary Ops frontend support. (#5378)
-* [Torch] Support Python list, more realistic recurrent networks (#5306)
-* [PYTORCH]where, addcdiv, addcmul op support (#5383)
-* [Torch] Add support for split (#5174)
-* [Torch] Fix up graph input handling (#5204)
-* [TFLITE]Logical not op support (#5475)
-* [TFLITE]Hard Swish & MobilnetV3 model testing (#5239)
-* [TFLITE]Gather, StridedSlice op support added (#4788)
-* [TFLITE] Match TFLite shape for SSD custom op (#5473)
-* Factor out import of common tflite.Operator in tflite frontend. (#5355)
-* [TFLite] support for FILL and `SPLIT_V` operators (#5330)
-* [TFLite] `L2_POOL_2D` operator (#5452)
-* [TFLite] Add config option to specify FlatBuffers location (#5425)
-* [TFLITE]Logical not op support (#5475)
-* [TENSORFLOW]reduce ops updated (#5180)
-* [TENSORFLOW] Fix `gather_nd` indices (#5279)
-* [TensorFlow]Improve TensorFlow Static Shape Tensor Array (#5243)
-* [KERAS]Minimum & AlphaDropout op support (#5380)
-* [KERAS]Embedding layer (#5444)
-* [KERAS]`Max_pool3d` and Averagepool3d operator support (#5085)
-* [CAFFE2]add Mul and ConvTranspose operator (#5302)
-* [MXNET]DepthToSpace & SpaceToDepth Operator (#5408)
-* [MXNET]broadcast and logical op support (#5461)
-* [MXNET] Use leaky by default for LeakyReLU (#5192)
-* [MXNET] support elemwise logic ops (#5361)
-* [Frontend|MXNet] SwapAxis operator support (#5246)
-* [RELAY] Move frontend utils (#5345)
-* [Pytorch] Fix translation of transpose when axis argument is as a list (#5451)
-* LpPool Support added #5696
-* Skip ADD inside Gemm op when vector is zero #5697
-* ReduceL1, ReduceL2, ReduceSumSquare, ReduceLogSum ops added #5721
-* MaxRoiPool, Mod & Xor op support added #5729
-* Skip multiply with 1.0f constant for GEMM import #5800
-* StatefulPartitionedCall/PartitionedCall Ops support added #5617
-* Don't add cast for batch norm when type isn't changing #5731
-* Conv3d Transpose OP added #5775
-* expand bug fix #5576
-* Support `max_pool2d_with_indices` #5549
-* Add prim::device op #5584
-* ImplicitTensorToNum support added #5603
-* Matmul fix for `batch_matmul` #5604
-* ReflectionPad2d op #5624
-* Padding op support #5638
-* Minor bug fixes #5683
-* `floor_divide` support for squeezenet #5702
-* ReplicationPad support added #5708
-* aten::norm support added #5776
-* broadcast and logical op support #5461
-* MaxPool3d and AvgPool3d Ops support added #5614
-* Softmin, trunc op support added #5715
-* conv3d and `conv3d_transpose` addedx #5814
-* Model importer to be compatible with tflite 2.1.0 #5497
-* Nit: Function names made consistent #5515
-* Select op support for tflite frontend #5486
-* `GATHER_ND` #5508
-* Quantize & Dequantize op #5394
-* Fully connected op conversion made in sync with TFLite #5510
-* `ADD_N` operator #5474
-* onnx, mxnet, pytorch mathops added #5561
-* abs, round, reciprocal, sign, softsign, `hard_sigmoid` ops support #5587
-* Gather nd bug fix for one dim support in tensorflow #5588
-* Add parser support for shape and range #5329
-* Darknet support batch size for yolo #5688
-* Improve Control Flow and TensorArray #5699
-* MXNet: Softmin, trunc op support added #5715
-* MXNet: conv3d and `conv3d_transpose` addedx #5814
-* MXNet: Add parser for `contrib.box_decode` #5967
-* Onnx: ReduceL1, ReduceL2, ReduceSumSquare, ReduceLogSum ops added #5721
-* Onnx: MaxRoiPool, Mod & Xor op support added #5729
-* Onnx: Skip multiply with 1.0f constant for GEMM import #5800
-* Onnx: Fix an issue with #5755 and add Batch norm unit tests. #5845
-* TensorFlow: StatefulPartitionedCall/PartitionedCall Ops support added #5617
-* TensorFlow: Don’t add cast for batch norm when type isn’t changing #5731
-* TensorFlow: Conv3d Transpose OP added #5775
-* Add parser support for shape and range #5329
-* Darknet support batch size for yolo #5688
-* Improve Control Flow and TensorArray #5699
-* Improve TF Parser to keep output nodes for `saved_model` #5794
-* Add parser support for `relu6`, `leaky_relu`, `relu_n1_to_1`, `log_softmax` #4805
-* Fix TF Dynamic input shape #5825
-* Support a few contrib ops in mxnet #5819
-* Improve TF Parser to keep output nodes for `saved_model` #5794
-* Add parser support for `relu6`, `leaky_relu`, `relu_n1_to_1`, `log_softmax` #4805
-* Check all unsupported ops before raising an exception #5929
-* Add Pytorch advanced indexing #6318
-* Support `index_select` #6295
-* Fix cast to long #6301
-* Fix dtype handling for modules with integer parameters #6311
-* pytorch frontend support conv1d #6203
-* Add cast to double, fix flatten conversion #6357
-* Fix aten::max and aten::min conversion #6372
-* Match pytorch 1.6 googlenet pretrained model (#6201) #6212Add unbiased variance op and corresponding support in pytorch frontend #6232
-* Implemented PADV2 Operator for TFLite and added support for constant values in PAD. #6167
-* Implemented `ONE_HOT` Operator for TFLite. #6223
-* Implemented `EXPAND_DIMS` Operator for TFLite. #6243
-* Implemented `REVERSE_V2` Operator for TFLite. #6304
-* Implemented `MATRIX_SET_DIAG` Operator for Relay/TOPI and TFLite Frontend. #6303
-* RESHAPE with dynamic shape arg in TFLite frontend #6208
-* Constant input attr added to fully connected operation in TFLite frontend #6228
-* Gather operation with indices as tensor expr in TFLite frontend #6168
-* Added support for tflite quantized maximum and minimum #6018
-* Unary ops support added in frontend #6196
-* Introduce caffe frontend for tvm #6206
-* Keras softmax and prelu fix under NHWC #6278
-* add support for MXNET numpy operators #6054
-* Refine tensorflow frontend 1.x & 2.x compatibility #6240
-* Reduceops support added to frontend #6252
-* Update precision in the ONNX `strided_slice`, update precision of ToScalar #6272
-* NHWC import support. #4899
-* Refine tensorflow frontend 1.x & 2.x compatibility #6240
-* Fix node indices attribute error for tensorflow 2.3 #6288
-* Support NMSv4 #6085
-* Support for PyTorch Non-Maximum Suppression #6314
-* ReplicationPad support added #5708
-* MXNet pre-quantized BERT #6039
-* Keep parameter names from PyTorch #5887
-* Refine LSTMBlockCell to support dynamic rnn #5963
-
-#### Relay
-* Add function attributes to IR hash (#4479)
-* Relay passes lookup overhead optimization (#4594)
-* Add `half_pixel` option to Resize op #4610
-* Skip example json runtime test when config is not set #4614
-* Test `tensor_array` in vm #4608
-* Improve `memory_allocation` pass to support multiple i/o dynamic kernels #4595
-* Add unit test for `tensor_array_split` #4619
-* Add parses support for unary elemwise ops #4634
-* Add parses support for SLICE #4502
-* Added pool autopadding and simplified converters. #4672
-* Fix meaning of `conv2d_transpose` `output_padding` parameter #4318
-* Use packed func macro for external codegen #4710
-* Fix `_parse_param` bug #4711
-* Add constant input support for elemwise ops #4666
-* Add parser support for squared difference #4652
-* Add type check to dense #4724
-* Invoke tvm::build from relay `compile_engine` and interpreter #4723
-* Broadcast condition, x, and y for Where op #4774
-* Add parser support for relational ops #4695
-* Remove duplicated BindParamByName function in VM compiler #4793
-* Use SimplifyInference for L2 Normalization. #4795
-* Expose vm OptimizeModule to Python #4800
-* Add parser support for logical operators #4642
-* Conv2D padding representation #4787
-* Add support for quantized LOGISTIC #4696
-* Fix VM compiler for while loop with free vars #4889
-* Fix bug in re-processing call node in MergeComposite pass #4879
-* Expose FunctionGetAttr to Python #4905
-* Add a PyTorch to Relay Parser #4497
-* Support data types for CSourceModuleCodegen args and output #4934
-* Clean up and refactor PyTorch frontend #4944
-* Relay pass to use fast exp/tanh #4873
-* BatchNorm support with run-time mean and variance calculation #4990
-* Reduce plevel of conv2d winograd implementation on cuda #4987
-* Add operation tan to TVM #4938
-* Outline and inline lifted functions for external codegen #4996
-* Remove primitive attribute from composite function #5014
-* Refactor Relay Python to use new FFI #5077
-* Fix relay node registration after refactor #5083
-* `Codegen_c.h` should include relay.function #5093
-* Move expr.Function to function.py #5087
-* Propagate constant to subgraphs #5094
-* Adjust strategy plevel to achieve expected performance by default #5118
-* Added a AnnotatedRegion utility class #5030
-* Support TupleGetItem in body of pattern #5106
-* Partition graph codestyle fixes #5202
-* Re-wrote the Graph Partitioner to support multiple outputs #5143
-* Fixes to MergeCompilerRegions #5195
-* Refactor build module to take IRModule #4988
-* Separate analysis and transform passes #5035
-* Relay Node::make to constructor #5128
-* relay::StructuralHash to tvm::StructuralHash #5166
-* Conditions updated to cover better user scenarios #5043
-* Replace UseDefaultCompiler with GetAttr #5088
-* Return empty CSourceModule when no `lowered_funcs` exists in Relay mod #4847
-* Clean up for memory pass to enable heterogenous execution support. (#5324)
-* Remove re-exports of tvm.transform (#5337)
-* [Refactor] Add memoized expr translator for use by backend codegen (#5325)
-* Legalize - Use Non-recursive Rewriter. (#5296)
-* Add additional check before re-using the cached match #5552
-* Remove kCompiler attr from external functions #5615
-* Pattern Language MergeComposite #5656
-* Support Tuple Output in C/DNNL Codegen #5701
-* Infer types in MergeComposite #5766
-* Convert PatternGrouper to do pre-order, non-recursive analysis #5653
-* Remove constants from partitioned functions #5663
-* Add a check for null function attributes #5674
-* Add ConstantPattern #5689
-* Conditionally Embedding Constants in Partitioned Functions #5693
-* Simplify Pattern API Implementations #5703
-* Add ShapePattern and DataTypePattern #5760
-* Remove unnecessary print #5642
-* Improve Shape Func handling for Tuple inputs #5467
-* Relay updated with String #5578
-* Fix the creation of tuple of tuples in PartitionGraph #5616
-* Preserve type information in Merge Composite #5640
-* Move `compiler_begin`/`end_op` to local static objects #5622
-* Fix `dataflow_pattern`.rewrite() hang if Match in IR #5680
-* Fix segfault in pretty print when ObjectRef is null #5681
-* Move `fallback_device` to config #5690
-* Replace `build_config` with PassContext #5698
-* Clear compile engine after task extraction #5724
-* Add `storage_order` ignore in pooling layer. #5781
-* Tweak cublas/cudnn priority level #5820
-* Skip Unknown Function Symbols #5888
-* Allow every runtime module to handle constants #5885
-* handle Tuple/TupleGetItem in first order gradient #5946
-* Add resnet-3d & Update network definitions for NHWC layout #5945
-* Use TargetNode::attrs for Target serialization #5993
-* each option of target str should only contain one ‘=’ #5988
-* Rename `target_id` => `target_kind` #6199
-* 64-bit RPi4b target #6211
-* Add resnet-3d & Update network definitions for NHWC layout #5945
-* Small bug fix for Conv1D imports. #5995
-* Move `invoke_tvm_op` and `shape_func` to vm dialect #5958
-* GRU Layer Support #6020
-* Add pass for getting calibration data from a relay module #5997
-* Merge two consecutive reshape ops #6052
-* Add operation `scatter_add` to relay, based on scatter implementation. #6030
-* i64 indices #5235
-* Port `eliminate_common_subexpr` to non-recursive form #6134
-* Fix interpreter for dyanmic shape input of `ndarray_size` #6086
-* Allow to config allocator type and refactor vm code structure #6105
-* Handle `ndarray_size` in FoldConstant #6156
-* when converting constant nodes with types of int64 or float64 #6159
-* Add ReshapeTensor instruction in the VM to replace the reshape op #6089
-* Support combine multiple dense op just into dense #6062
-* Add unbiased variance op and corresponding support in pytorch frontend #6232
-* Specify additional layouts in convert layout pass #5422
-* Safe check added for Merge Composite Call Node #5562
-* Non recursive partitioning #5493
-* Support combine multiple dense op just into dense #6062
-* Make the max number of fused ops configurable #6327
-* Implementation of the dynamic pad operator #6284
-* change device annotation from post DFS to recursive #6124
-* Make check stricter: disallow inserting function with free vars into module #6313
-* Make check stricter by using Feature. Fixed multiple bugs #6326
-* Resize support for NCHW-convertible layouts #6293
-* Make AutoDiff thread through global function #6336
-* Create Interpreter for each constant subgraph #6195
-* Add Dynamic reshape to a dynamic namespace and add DynamicToStatic Pass #5826
-* Expose relay BindParamsByName to Python #4751
-* Implement pass manager tracing API #4782
-* Move Ops in relay.op.contrib #4942
-* Conditions updated to cover better user scenarios #4951
-* [External codegen] Add test cases for fused ops with manual annotation (#4741)
-* Multiple output support, reshape, split ops added #6296
-
-#### Operator Coverage
-* Allow empty tensor for `reshape`, `tile` and `strided_slice` #4618
-* Fix meaning of `conv2d_transpose` `output_padding` parameter"; #4708
-* Remove cpp upsampling and resize op #4769
-* upsample operator 'NCHWinic' format support. #4791
-* Injective schedule improvement #4786
-* Enable vectorization on fp16 type #4867
-* Support for Int8 schedules - CUDA/x86 #5031
-* New PR to re-add tan to TVM #5025
-* Register topi schedule for Relay `fast_exp` and `fast_tanh` #5131
-* Move Dilation2d from nn to image namespace #5110
-* Use Thrust sort for argsort and topk #5097
-* Conv2d and Dense ops support on Tensor Core #5099
-* Setting workload correctly for Depthwise Spatial conv ARM. #5182
-* Adding a few missing math intrin #5011
-* Missing vectorize for depthwise conv2d. #5196
-* [TOPI] Using x86 schedules for ARM conv2d (#5334)
-* [TOPI-ARM] Do not alter layout if layout is NHWC (#5350)
-* [TOPI] Setting workload correctly for Depthwise Spatial conv ARM. (#5182)
-* [OP] Add `fast_erf` implementation (#5241)
-* [Topi] Tensorcore support for Conv3D (#5284)
-* [intrin] a few more math functions (#5468)
-* [Intrinsic] Add log1p, ldexp, atan2, hypot, nextafter, copysign (#5312)
-* [topi] Add operation relay.nn.dilate() which calls topi.nn.dilate() (#5331)
-* [Topi x86] Missing vectorize for depthwise conv2d. (#5196)
-* [TOPI x86] Adding `unroll_kw` config option for depthwise conv2d. (#5197)
-* [Topi] Breakdown topi.cc into smaller files (#5253)
-* ReduceLogSumExp Operator support #5453
-* Math ops added #5502
-* Enable blocking format in x86 conv2d and fold scale axis #5357
-* Add operation gather to relay. #5716
-* Add `storage_order` ignore in pooling layer. #5781
-* Fix bifrost spatial packing conv2d auto tune #5684
-* Fix reshape usage in ARM schedule #5732
-* Block sparse dense on cuda #5746
-* Improve CUDA softmax scheduling #5600
-* block sparse dense on cuda #5746
-* pass-by-value -> pass-by-const-reference #5783
-* Using MKL blas for quantized dense #6115
-* topi -> tvm/topi #6186
-* Use auto-tuner to improve `conv2d_gemm` performance #6117
-* Improve CUDA `conv2d_transpose_nchw` #4762
-* Add CUDA conv2d for NHWC layout #4737
-* `conv3d_ndhwc` schedule #4775
-* Fast exponent #4790
-* Add Scatter to Topi/Relay/ONNX via hybrid script #5619
-* Split MKL from BLAS. #6182
-* Change the meaning of `conv3d_transpose` `output_padding` to match `conv{1,2}d_transpose` #6065
-* Gather op support added #6013
-
-#### Runtime and Backend
-* Cythonize NDArray.copyto (#4549)
-* Unified Object System runtime refactor (#4578, #4581, #4603)
-* VM profiler: sort VM stats by time (#4601)
-* Update RPC runtime to allow remote module as arg (#4462)
-* Refactorying system lib and dso lib into library module (#4481)
-* Improve TSIM virtual memory mapping (#4545)
-* make adt tag signed #4605
-* Improve TVMBackendPackedCFunc to allow return val #4637
-* EdgeTPU runtime for Coral Boards #4698
-* Fix memory leak when using openMP #4811
-* Fix memory leakage of TVMByteArray #4856
-* Fix `TVM_DLL_EXPORT_TYPED_FUNC` to work on Windows #4955
-* Fix memory leak when using openMP #4811
-* Export GraphRuntime in `tvm_runtime.dll` #5002
-* MISRA-C compliant TVM runtime #3934
-* Update the `type_keys` to reflect the code-org #5074
-* Fix AttrEqual for Array and StrMap, double #5054
-* Export GraphRuntime in `tvm_runtime.dll` #5002
-* Fix unused-value warning #5140
-* crt error handling #5147
-* Bundle deployment with static linking #5158
-* Implemented kDLCPUPinned (cudaMallocHost) #4985
-* Explicitly cast min/max operands #5090
-* `ref_counter` -> `ref_counter_` #5184
-* Expose runtime::String to Python (#5212)
-* [FFI] Refactor runtime.String to subclass str (#5426)
-* [RUNTIME] Auto conversion from str to runtime::String in PackedFUnc (#5251)
-* [RUNTIME] Improved Packed FFI for optional. (#5478)
-* [Hexagon] Add `hexagon_posix.cc` to TVM/RT sources in the right place (#5346)
-* [FFI] Refactor runtime.String to subclass str (#5426)
-* Fix workspace #5503
-* Store nullptr PackedFunc as nullptr for better error propagation #5540
-* Improve PackedFunc robustness #5517
-* Seg fault in WorkspacePool's destructor (#5632) #5636
-* Resolve constexpr issue in debug mode. #5651
-* Add `compile_shared` option to linux compile utility fn #5751
-* Call sync in CopyFromRemote and CopyToRemote #5512
-* Fix the multihop cpu case #5522
-* Improve RPCServer AsyncIO support. #5544
-* Modularize the RPC infra #5484
-* Add `compile_shared` option to linux compile utility fn #5751
-* Overload string operators #5806
-* Only initialize required module #5926
-* if a param not in input, we should still consume it’s data #5990
-* init TVMPackedFunc’s name #6044
-* Enable auto conversion `String->DLDataType` #6214
-* Support random fill #5913
-* Use new to avoid exit-time de-allocation order #6292
-* Add `parallel_for` support to run a loop in parallel #6275
-* Solve ARM BIG.LITTLE heterogeneous multicores #4747
-* [RUNTIME] Quick fix PackedFunc String passing (#5266)
-* Introduce runtime::String::CanConvertFrom #5718
-* Restore the StrMap behavior in JSON/SHash/SEqual #5719
-* Support overriding RPCWatchdog termination behavior on Android and other platforms #6216
-* Set `NDArray::Container.shape_` in NDArray::FromDLPack (#5301)
-* Enable x86 cpu cache flush #5914
-
-#### Quantization
-* Conv2D type checking for kernel per-channel scales. #4732
-* Add missing nullptr check #4773
-* Doc fix on convolution and dequantize #4799
-* Conv2D with dilation support. #4796
-* Making `scale`/`zero_points` as expr instead of attrs. #4611
-* Make calibration faster and more memory usage friendly #4589
-* Doc fix on convolution and dequantize #4799
-* Conv2D with dilation support. #4796
-* Optimize lowering for requantize and FixedPointMultiply. #4798
-* More doc fix on quantize and convolution #4874
-* Add support for per channel weight scale in dense op #4880
-* Add support for quantized models via QNN #4977 #5013
-* Support 4D padding. #5036
-* [Requantize] Cleanup and Optimize Lowering (#5286)
-* [Topi, ARM] Disbale Winograd for quantized tensors. (#5363)
-* Adding support for TFLite QnnSubtract operator. (#5230)
-* Remove developer facing api from frontend exports. (#5375)
-* Add Quantize/Dequantize Partitioning #5940
-* Add support for quantized models via QNN #5016
-* Quanitze operation expanded to take const argument #6127
-* FP32 and Quantized Object Detection Model #5479
-* Support CallNode inputs in qnn.concatenate #5360
-* QNN support for TFLite 2.1.0 quantized models #5848
-
-#### TE
-* Tighten split's extent #4931
-* Set split node's range to minimum of ext and split factor or split np… #5044
-* Support mixing normal and cross-thread reduction (#5193)
-* Inline -> `te/schedule/operation_inline.h` (#5386)
-* Create loops according to storage scope and thread hierarchies (#5190)
-* Fix import in dump pass ir (#5327)
-* Scalar support for te.extern #6079
-
-#### TIR
-* IR readability enhancement (#4501)
-* Introduce tir::PrimFunc #5070
-* Introduce PrimFuncPass. #5139
-* [TIR] Enhance Substitute, python bindings for Substitute/PostOrderVisit (#5400)
-* [TIR] Remove ProducerConsumer and `AllocateNode::new_expr` (#5333)
-* [TRANSFORM] Enable CopyOnWrite for TIR passes. (#5309)
-* [REFACTOR] Migrate LowerTVMBuiltin, InferFragment, LowerThreadAllreduce, ThreadSync to Pass Manager (#5213)
-* [REFACTOR] Remove te::Tensor dependencies from TIR passes. (#5372)
-* [TIR] Refactor MakePackedAPI to target dependent stage. (#5326)
-* [REFACTOR] tvm.hybrid -> te.hybrid (#5223)
-* [REFACTOR] Migrate most of low-level build to use the Pass Manager. (#5225)
-* [REFACTOR] Migrate low-level passes in tvm.lower to the Pass Manager (#5364)
-* [TIR] Migrate VTA TIR passes to the new pass manager. (#5397)
-* [REFACTOR] Migrate all low-level passes to the Pass Manager. (#5233)
-* [REFACTOR] Introduce ExprDeepEqual, Remove IRDeepCompare (#5206)
-* [REFACTOR] RewriteForTensorCore -> te/schedule (#5379)
-* [REFACTOR] Remove `ir_pass` in favor of analysis/transform. (#5415)
-* text format printer considering future parsing use #5483
-* Remove buffer params from pass config. #5652
-* std::string -> String Migration in TIR nodes #5596
-* Remove `CallNode.call_type` in favor of attribute. #5937
-* Remove legacy HoistIfThenElse #5944
-* Improve Let/LetStmt support. #5949
-* Refine side effect analysis. #5954
-* `Provide->ProducerStore`, `Realize->ProducerRealize`. #5750
-* Migrate the tvm/tir/expr.h to constructor #5773
-* Migrate tir/stmt.h to use constructor. #5778
-* Cleanup unused classes #5789
-* Add tir prefix to type keys #5802
-* Enhance VerifyGPUCode #6194
-* Enforce buffer pointer var type to be consistent with dtype. #6317
-* Create a StringImm reference type #4806
-* Add init member to ReduceNode #6138
-* Add dump and print for debugging (NFC) #5207
-* Streamline Function Attr interface. #5045
-* `alpha_equal` to `structural_equal` #5161
-* Remove AttrsEqual and AttrsHash related code #5169
-* [NODE] General serialzation of leaf objects into bytes. (#5299)
-* [POC] Initial stab at `std::string->String` upgrade (#5438)
-* [TIR] Make `lower_warp_memory` support `extent(threadIdx.x) < warp_size` (#5307)
-* [PASS] dtype rewrite for indexing variables (#5092)
-* [PYTHON] Enhance `with_attr` API, cleanup MakeAPILegacy in testcases (#5335)
-* [PYTHON] Make IntImm more like an integer (#5232)
-* [IR] Move to runtime::String (#5276)
-* [IR] kExternalSymbol -> kGlobalSymbol (#5211)
-* [IR] Remove PrimExpr from String (#5311)
-* IRModule is updated with String #5523
-* IR is updated with String #5547
-* Streamline ir/op Registry #5609
-* Migrate IRModule ObjectRef to not-null #5654
-* Migrate BuildConfig to PassContext. #5668
-* relay.op.Op -> tvm.ir.Op #5705
-* Separate ArgTypeCode from DLDataTypeCode #5730
-* Remove legacy `compute_expr.h` #5738
-* Call::Halide => ProducerLoad, DSL/TIR decouple. #5743
-* `Provide->ProducerStore`, `Realize->ProducerRealize`. #5750
-* Migrate the tvm/tir/expr.h to constructor #5773
-* Migrate tir/stmt.h to use constructor. #5778
-* Migrate all Object construction to constructor. #5784
-* Cleanup unused classes #5789
-* Finish `std::string->String` updates #5793
-* Add tir prefix to type keys #5802
-* Change Call.name to Call.op(RelayExpr) #5863
-* Range/IntSet API style consistency. #5953
-* Separate ArgTypeCode from DLDataTypeCode #5730
-* Migrate all Object construction to constructor. #5784
-* Finish `std::string->String` updates #5793
-* Unify StrMapNode and MapNode #5687
-
-#### Performance Improvements
-* Int8 GEMM performance enhancement using Cublas (#4550)
-* Speedup TSIM with multi-threading (#4491)
-* Support cudnn softmax (#5214)
-* Add cuDNN grouped convolution support (#5319)
-* Winograd support for Conv3D (#5186)
-* Improve `get_valid_count` and nms performance for CUDA (#5339)
-* Optimizations of `global_ave_pool` for NHWC layout (#5450)
-* Optimization of Conv2d Winograd algorithm on Tensor #5485
-* Some performance improvement to VM #5901
-* Optimize x86 `conv3d_ndhwc` using data packing approach. #4866
-* Improve NHWC depthwise convolution for AArch64 #6095
-* Improve quantized convolution performance for armv8 architectures #5754
-
-#### Documentation
-* Adding benchmark log format doc (#4366)
-* Add Ninja build system to installation docs (#4554)
-* Doc/comment fixes (#4452, #4463, #4469, #4493, #4397, #4580, #4585, #4591)
-* Fix doc after moving to unified IR #4835
-* Introduction to module serialization #4564
-* ConvertLayout - Call RemoveUnunsedFunctions. #4834
-* Fix bugs that override `n_trials` #4842
-* Update the vm doc #4868
-* Refine the example description of `max/min/sum/tag_scope` #4974
-* Fix vta tutorial #4809
-* Introduce how to add hardware backend to FAQ #4898
-* Update API docs to reflect the status after the refactor. #4907
-* Fix sphinx warnings #4917
-* Fix Sphinx Warnings (RST indent, cross-ref, and image scale) #4920
-* Fix Sphinx Warning: the target found for cross-reference #4925
-* Sphinx -- Introduce alias detection. #4954
-* Fix Warnings from #4942 #4959
-* Fix sphinx precheck #4967
-* Move `git_howto` to rst, add Stage documents to te #5055
-* Add doc for Relay op strategy #5078
-* Update relay docs #5112
-* Include a tarball of docs, add a security faq #5119
-* Cleanup docs before rebuild #5127
-* Minimize necessary doc change #5129
-* Various sphinx related fix. #5168
-* Point docs to the ASF site. #5178
-* Use https link #5183
-* Reduce artifcats generated by sphinx gallery #5208
-* Refine the example description of `max/min/sum/tag_scope` #4974
-* Description updated for pooling attributes #5091
-* [DOCS] Migrate some markdowns to rst, fix sphinx3 warnings (#5416)
-* [DOCS] Misc docs improvements (#5222)
-* [DOCS] Bring relay docs to the top-level flat view (#5343)
-* [DOCS] Reduce artifcats generated by sphinx gallery (#5208)
-* [DOCS] Use https link (#5183)
-* [DOCSTRING]missing function parameters updated (#5228)
-* [DOCS] Migrate HLS documents from md to rst (#5419)
-* [Tutorial, QNN] Add tutorial for loading quantized PyTorch model (#5321)
-* [Docs] VTA install doc migration from md to rst (#5442)
-* [Docs] compiler version in docs (#5281)
-* Remove legacy `compute_expr.h` #5738
-* `TVM_REGISTER_API` -> `TVM_REGISTER_GLOBAL` #4768
-
-#### Bug Fixes
-* Add bfloat16 typeflag support (#4525)
-* MSVC / Windows fixes (#4455, #4569)
-* Fix Makefile for `howto_deploy` (#4457)
-* Fix GCC 4.8 compact (#4461)
-* Fix search path to build `libtvm_topi.so` (#4467)
-* Fix for `conv2d_transpose` CUDA compilation (#4472)
-* Fix for LLVM 10.0 codegen (#4480, #4515)
-* Fix alter op layout when calling global var (#4454)
-* Fix `float2half_rn` support for cuda compute capabilities < 53 (#4489)
-* Fix compile errors for OpenCL backends (#4492)
-* Fix serialization precision loss (#4503)
-* Fix hybrid script to support array of tensors (#4494)
-* Fix annotation for multiply op (#4458)
-* Fix Dockerfile for linter CI (#4506)
-* Fix TF resize for dynamic size models (#4510)
-* Fix `bias_add` gradient (#4516)
-* Fix tanH unit test function call (#4517)
-* Fix extra reshape parameter for ONNX (#4524)
-* Fix crash caused by empty TOPI config (#4520)
-* Fix ONNX shape op type to use int64 (#4528)
-* Fix crash in TSIM virtual memory driver (#4527)
-* Replace deprecated python library in setup script (#4533)
-* Fix NMS `max_output_size` loop (#4541)
-* Fix style in IR mutator and IR visitor (#4561)
-* Fix compiler warning (#4559)
-* Fix to get end to end inference on Chisel VTA (#4574)
-* Fix LLVM build by adding missing intrinsics headers (#4575)
-* Fix context creation in quantization (#4582)
-* Fix NDArray SaveDLTensor signature (#4586)
-* Fix dense pack schedule for x86 (#4539)
-* Fix for broadcast tensor of scalar type (#4577)
-* Datatype refactor (#4513, #4560)
-* Add const qualifiers for NDArray container (#4590)
-* Fix TF <= 1.12 compatibility (#4593)
-* Fix for graph debug runtime (#4598)
-* Disable copy constructor for external codegen (#4597)
-* Make ADT tag signed (#4605)
-* Added declare of aluBits for TensorAlu #4624
-* Get around limitation of g++-4.8 #4626
-* Bugfix StmtMutator IfThenElse #4609
-* Remove unecessary rdynamic #4613
-* Resolve constexpr related link error in debug mode #4641
-* Asymmetric padding #4511
-* Reduce data size of asymmetric padding testcase #4658
-* Fix Base64OutStream portability issue #4668
-* Fix `topi.nn.global_pool` layout="NHWC" #4656
-* Also package core.rly #4679
-* fskip of EliminateCommonSubexpr cannot always return false #4620
-* Fix Python syntax error in `start_rpc_server_to_tracker.py` #4682
-* os.path --> osp to match the import #4681
-* GitHub actions/checkout@v1 --> v2 #4680
-* Fix Python syntax error AGAIN in `start_rpc_server_to_tracker.py` #4685
-* Use ==/!= to compare str, bytes, and int literals #4686
-* Rename `start_rpc_server_to_tracker.py` to `start_rpc_server_to_tracker.sh` #4689
-* GitHub Action lint Python code for syntax errors #4688
-* Generate blob use LLVM directly #4657
-* Reduce input size to fix oom #4653
-* Fix RemoveUnusedFunctions pass #4700
-* Link the math library by default #4713
-* Update mainline version to 0.7.dev0 #4720
-* Add SizeVar representing non-neg valued variable in a tensor shape #4684
-* Fix the compile problem of `cpp_rpc` #4725
-* JSON upgrader to upgrade serialized json. #4730
-* Fallback schedule for Int8 depthwise. #4733
-* Fix dense x86 schedule #4728
-* Fix demo dockerfile build failed #4744
-* Improve CUDA vectorizer #4736
-* Add .asf.yaml for github info #4761
-* Fix padding in pooling op #4738
-* Remove `run_infer_type` duplicates #4766
-* pooling.cc improvements #4767
-* Export `builtin_fp16` on Windows #4731
-* Fix Tensorflow conv3d pad bug, add non-cubic data and kernel tests #4772
-* Bump prebuilt-image version in demo dockerfile #4770
-* Update `tune_simple_template.py` #4778
-* Explicitly link to cublasLt if it exists #4776
-* Fix hasattr by extracting Python error type from Windows error message #4780
-* Replace os.path.exists with try...except...else #4784
-* Make sure to visit the arguments of inlined functions #4783
-* Parse additional exception strings #4785
-* Fix #4670: add bias for fc layer #4801
-* Change color channel from BGR to RGB for darknet preprocessing #4794
-* Fix -Wextra #4804
-* Fix vta tutorial #4809
-* Minor bug fixes in AutoTVM for QNN graphs #4797
-* Fixed subprocess creation under windows #4820
-* Improve tol to resolve flaky case #4836
-* Fixed process termination routine in windows #4844
-* `test_cuddn` flaky #4846
-* Mxnet parser for Qnn dialect #4714
-* Enhance `cc.cross_compiler` #4817
-* Fixed crash caused by reversing bitwise operations #4852
-* Reverse some changes made for `intel_graphics/conv2d.py` in PR #4849 #4853
-* const auto p -> const auto& p #4861
-* Fix onnx import bugs #4750
-* Explicit llvm::StringRef to std::string conversion #4859
-* Update the runtime PackedFunc for module #4871
-* Improve antlr import error message #4888
-* Fix `alpha_equal` bug for attribute check #4897
-* Fix issues in cuda codegen #4876
-* Fixed: Bitwise ops on floats causing wrong code generation and crashes. #4892
-* Fix `tvm.target.generic_func` runtime detection #4910
-* `topi/tests/python/test_topi_sort.py::test_argsort` #4891
-* Use opencv reisze method for preprocessing of image in darknet #4883
-* Fix build breaks with StringRef changes #4923
-* Remove unnecessary spliting in the cached chunk #4935
-* Fixing an Infinite Loop case in UnmatchedChecker. #4881
-* Remove SGX toolchain installation from CI Dockerfile #4948
-* Fix tedd tutorial after strategy change #4947
-* Allow customize MKLDNN library location #4814
-* Added CopyFromBytes and CopyToBytes convenience methods to NDArray. Fixed typos. #4970
-* Fix gcn tutorial failure #4994
-* Fix stride default value None in torch.nn.functional.avg_pool #4984
-* Fix ROCm strategy for winograd conv selection #5001
-* Fix `get_valid_count` flaky test for cuda #4901
-* Change Scala Linter scalafmt => scalastyle #4998
-* Kill from tvm import te #5007
-* Chisel fixes and de10nano support #4986
-* Fix gpu not found when running TVM docker #4975
-* Fixes for pylint==2.4.4 #4849
-* Fix unordered dictionary problem for python version under 3.6 #4982
-* Fix gcn tutorial failure #4994
-* Fix stride default value None in `torch.nn.functional.avg_pool` #4984
-* Fix ROCm strategy for winograd conv selection #5001
-* Early checking added and new test cases added for schedule fuse #5010
-* Fixed div by zero core dump. Fixed rounding intrinsics on int crash #5026
-* Test case modified for int type #5012
-* Bug Fix for ARM CPUs. Lower strict assumption. #5063
-* Triage the testcases to fit the new namespaces #5071
-* Add colors to `compute_at` edges and thread/block indices. #5111
-* Temporary fix to the stack overflow issue in autotvm task extraction #5019
-* Fix compilation of If-Elses #5040
-* Fix CompilerAttrs #5109
-* Fix the existing test cases before refactoring. #5122
-* Fixed bug where shifting by out-of-bounds value results in no compute code being emitted. #5115
-* Fix for issue #4831. The `data_min_idx` and `data_max_idx` were flipped. #5136
-* Duplicate likely nodes added when loop axis split unevenly #5084
-* Fix incorrect name of calibration mode #5150
-* Remove contrib spatial pack schedule of depthwise convolution #5148
-* Fix annotate pass static variable #5023
-* Fixed ConvTranspose2D parsing #5157
-* Nullptr check #5176
-* rocm: fix miopen convolutions #5179
-* rocm: fix `dense_rocblas` in strategy, topi #5191
-* Fix CRT static test bug (#5293)
-* Fix perf regression of tir refactor (#5258)
-* Bugfix in tensorflow `space_to_batch_nd` (#5175)
-* Compilation warnings fixed for 32bit and 64bit compilation (#5349)
-* Fix hang in MergeCompilerRegions (#5227)
-* Fixes to MergeCompilerRegions (#5195)
-* Fix generation of LLVM intrinsics (#5282)
-* Fix setting up hints for getaddrinfo (#2872)
-* Add ConstantNode to IsAtomic (#5457)
-* Fix String SEqual (#5275)
-* Fix fuse over functions that are handled by external codegen (#5365)
-* Fix memory leak when accessing NDArray (#5413)
-* Remove the duplicate PrintIR pass in Relay (#5403)
-* Fix `lower_warp_memory` (#5247)
-* Fix `lower_warp_memory` when there are >1 warp buffers (#5368)
-* Fix intel conv2d auto tune (#5200)
-* Fix FuseBatchNorm output cast error if `need_cast` is True #4894
-* Fix an assertion exposed by loop vectorizer #4916
-* Fix error message #4945
-* Fix for recursive let #5757
-* Fix Calibration Pass to Support Modules with Multiple Functions #5768
-* Fix what looks like bizarre copy-paste issue #6010
-* Fix bug in `transpose_shape_func` #6180
-* Fix bugs in CUDA codegen (#5209)
-* Don’t remove() TemporaryFile in del. (#5414)
-* Fix `test_ir_type`. (#5390)
-* Fix multiple identical inputs bug (#5389)
-* Add cuda target check to dense tensorcore schedule. (#5376)
-* T2 test fixups (#5391)
-* Fix miopen padding (#5433)
-* Misc fixes for ROCm (#5431)
-* Fix copy constructor (#5237)
-* Corrected TVM autotuning on GPU (#5432)
-* Fix vector load (#5226)
-* Minor bugfix in `message_passing.cc` (#5254)
-* Fix a bug when vectorized load&store was involved for… (#5428)
-* Fix to skip node not in graph. (#5238)
-* Fix #5388 [VULKAN] vkBuffer released before memory copy command se… (#5418)
-* Fix a minor error in `device_annotation` (#5291)
-* Fix scalar’s ndim is 0 (#5344)
-* Fix the runtime raise error #5586
-* Fixed bug in attribute parsing for pool layers. #5582
-* AutoTVM incorrect measurement #5511
-* fix a min/max simplify bug #5761
-* Rename `tvm_dso_op` to `libtvm_dso_op` #5714
-* Fix generating types like float44 and float88 #5722
-* Avoid downloading when `TOPHUB_LOCATION` is NONE #5720
-* codegen llvm: move nvptx-specific intrinsic handling into `codegen_nvptx` #5726
-* ROCm warp shuffles and reductions #5727
-* fix small bug about `dense_grad` #5695
-* Clarify downstream consistency of TVMArgTypeCode #5742
-* Fix gelu in PyTorch frontend, tighten numerical checks #5763
-* Make batch matrix multiplication on GPU tunable #5752
-* update vulkan build rule #5777
-* aten::norm support added #5776
-* Edit onnx parser to infer values in post order #5755
-* Support symbolic inputs of Fill #5762
-* support `aten::type_as` in the pytorch frontend #5787
-* Temporary disable fp16 `type_as` test for PyTorch Frontend #5799
-* Add config switch for nn.dense layer type. #5801
-* Move cpu-only frontend tests to a CPU stage #5807
-* Pin hand landmark network to version 0.7.4. #5813
-* Limit number of threads in all jobs #5815
-* Error msg update #5818
-* fix relay.build to not change the module argument in place #5822
-* Fix InferType when module contains Prelude #5797
-* Add a combine `batch_matmul` pass #5791
-* RepeatVector, Conv3DTranspose op support added #5833
-* Fix converting serialized quantized models #5839
-* ffi (Object): make class dict visible in instances #5843
-* Additional canonicalization added for AddNode #5846
-* Suppress the warning messages when compile engine selects impls #5821
-* fix #5849 #5851
-* Introduce POD-C Compliant tvm::Map #5740
-* Add bfloat16 #5601
-* Add Python Classes for all Attrs #5853
-* Fix map assign issue in CI test #5854
-* Introduce Target Id Registry #5838
-* Update `has_dtype/has_shape` to pattern lang doc #5847
-* Add `nn.batch_flatten` as quantizable. #5805
-* Fail early before running invalid dynamic graphs #5856
-* Improve type handling in PyTorch frontend #5834
-* HotFix the python intrin rule #5895
-* add a few gradients #5899
-* Add Binary Intrinsic ops to TIR Ops in C++ #5900
-* Allow implicit conversion in TVM FFI to tvm::Bool #5907
-* PyTorch frontend: fix handling of duplicate use of a model weight #5897
-* Don’t multiply by constant 1 uselessly in dense #5911
-* Support any index matching for TupleGetItem #5909
-* Add MicroTVM tutorial using the STM32F746 discovery board #5655
-* Fix serialization of inf float value #5912
-* Fix CPU Thread Binding for Multiple Sockets #5918
-* CUDA device API & VerifyGPUCode pass update #5898
-* Update install.rst #5858
-* Two small fixes to AMDCPU codegen for LLVM 10+ and ROCm 3.5+ #5920
-* Add LegalizeInvalidAttach to legalize the `compute_at` location after split or fuse #591
-* Don’t rewrite expressions used outside of the pattern #5930
-* Add TupleGetItem to CSE #5931
-* Various update for CoreML codegen #5934
-* Update date in the NOTICE #5943
-* Raise right error in tensorflow split op #5951
-* Add rm xla attributes in tf docs #5950
-* Fix OpenCL `get_valid_counts` errors due to intrinsic `atomic_add` #5857
-* Amendments for gradients #5941
-* Fix the meaning of `conv{1,2}d_transpose` `output_padding` parameter. #5758
-* Make first order gradient graphs more efficient #5959
-* Raise an exception when extern function does not return Stmt #5964
-* Improve docker/bash.sh to handle git worktrees #5970
-* Install DNNL (OneDNN) to CI Environment #5936
-* Add Dynamic reshape to a dynamic namespace and add DynamicToStatic Pass #5826
-* Add meshgrid op in Relay, TOPI, Pytorch frontend #5961
-* Print right number of parentheses for LoadNode #5965
-* Migrate data structure of TargetNode #5960
-* Remove redundant function CreateBufferVecPtr #5982
-* Fix string argument mismatch in GraphRuntimeCodegen #5933
-* VectorType::get with two parameters is deprecated in LLVM 11+ #5984
-* Fix Compilation Error in CRT #5713
-* Fix runtime::String backward compatibility in JSON #5725
-* Allow RPCWrappedFunc to rewrite runtime::String as std::string #5796
-* Fix reshape #5739
-* Fix building with LLVM-10 on macOS #5859
-* Add cuda 11 to `contrib.nvcc.find_libdevice_path()` #5902
-* Fix sequential cpp test #5745
-* Infer types in MergeComposite #5766
-* Fix recursive let for well formed check #5780
-* Recover global state after `test_util.py` #5824
-* Fix bug in rpc ring buffer shrink #5516
-* Fix remote device sync #5538
-* Fix bug in rpc ring buffer shrink (#5516) #5537
-* RPC Server error fix on Pynq FPGA #5607
-* Fix FloorMod Simplifier #5509
-* Fix Python debugger segfaults with TVM built with LLVM #5685
-* Fix Compilation Error in CRT #5713
-* Fix runtime::String backward compatibility in JSON #5725
-* Allow RPCWrappedFunc to rewrite runtime::String as std::string #5796
-* Fix reshape #5739
-* Make "none" DataType explicit #5491
-* Change "scalar" and "stack" in IDL from "inrout" to "in" #5487
-* Link necessary libraries when building runtime for Android #5496
-* Fixes for wasm32 target #5489
-* Reset target and wait for runtime initialization on connect. #5499
-* Bump tophub rocm version #5504
-* Improve commentary for RingBuffer #5518
-* Add unit tests for ONNX PRelu and fix importer to pass them. #5521
-* LRN only supports 4D tensors, remove it from `alter_op_layout` #5520
-* Fix an issue with ONNX Upsample #5530
-* Cache PrimExpr instead of raw pointers in bound analyzer #5533
-* fix a few bugs with shape inference and types in the ONNX importer #5534
-* Add Onnx Pad v11 #5539
-* Changes to `cpp_rpc` to make it work on Android (+ Hexagon offloading) #5535
-* Fix to reduce RAM size during loading model #5507
-* Fix MakeLoopNest for warp memory #5382
-* Load platform specific lib for tvmdsoop instead of the hard-coded tvm_dso_op.so #5542
-* Add tests for running micro on native arm hardware #5546
-* Apparently, ONNX Conv with no 'pads' defaults to zero padding #5548
-* clang-format the h,cc,m files. #5557
-* Fix conv2d alter op for arm cpu #5532
-* Fix topi test for non tensorcore CI. #5563
-* Add clang-format and nodejs to ci-lint #5567
-* Enable clang-format. #5572
-* Allow `ubuntu_install_darknet.sh` to work in both 18.04 and 16.04 #5574
-* Add a quantized conv2 unit test for the tflite front-end #5558
-* Fix JSON graph dumping. #5591
-* Warp level reduction support for CUDA #5498
-* One more fix for concurrency count #5589
-* Improve robustness of the docs build #5583
-* Phase out WebGL #5570
-* Fix vulkansdk in the ci-gpu and upgrade to 1.2.135 #5566
-* Update ci-cpu to bionic #5554
-* Overestimate binary size for microTVM compiled binaries. #5590
-* Fix bug and re-enable RPC execution test #5436
-* Add ostream formatters for TargetPtr/TargetVal. #5592
-* Fix cross thread reduction #5551
-* Fix TVMArray layout on device #5599
-* Add debug mode to tempdir() #5581
-* Represent alignment information in LLVM IR #5598
-* Fix codegen for warp shuffle intrinsics #5606
-* Fix Topological Order calculation for DFPattern Language #5612
-* Global MaxPool3d and AvgPool3d support #5098
-* Fix build error of iOS RPC #5621
-* isn't a CallNode sometimes #5623
-* Introduce config to PassContext. #5631
-* CMAKE fix #5630
-* Label Pattern Partitions #5627
-* Extend AttrPattern to support CallNode and FunctionNode attributes #5637
-* Increase bss section size. #5660
-* Add buffer name when creating tensor bindings #5670
-* µtvm debug improvements #5648
-* enable `amd_apu` device on vulkan target #5659
-* Support TupleWrapper as direct ancestor of control flow ops #5639
-* add tvm.micro pydoc to sphinx #5661
-* Add a regression testcase for #5674 #5677
-* Fix C++ RPC build problem on Linux #5671
-* Add a check Callback to the Pattern Partitioner #5646
-* Call previous excepthook in `tvm_excepthook`. #5675
-* Fix the shift column for `scale_shift_nchw` and `scale_shift_nhwc` in C topi #5679
-* Support more dtypes for TVMDSOOp #5694
-* In `memory_plan`, check if value is not None, instead of just checking value as boolean. #5700
-* Fix flaky `test_topi_pooling.py:test_adaptive_pool` #5736
-* Fix the values for `test_fmod` since it fails way too often otherwise #5723
-* fix small bug about `dense_grad` #5695
-* Fix sequential cpp test #5745
-* Add Scatter to Topi/Relay/ONNX via hybrid script #5619
-* Clean WASM environment before build #5759
-* Fix gelu in PyTorch frontend, tighten numerical checks #5763
-* fix #5686: remove a overstrict assert in MakeAllreduce (#5686) #5785
-* Improve Pattern Language Docs #5676
-* Add missing expr visitor for any #6082
-* Remove the tvm web from version update #6122
-* Clear relay cache after every build & Clear warning message cache after autotvm task extraction #6131
-* avoid unexpected throw in AttrInitEntry #6128
-* Verify that tensor reshape is valid. #6215
-* Use LocalRunner by default in the tutorial tune_relay_cuda.py #6001
-* Undefined names: import os for line 324 & import re for line 308 #6003
-* GitHub Actions upgrade to actions/setup-python@v2 #6002
-* Only pass pythonpath for ci images #6005
-* Auto-convert shuffle with single index to “extract element” #6006
-* Cache object refs in loop partitioner instead of object pointers #6004
-* Fix `test_arith_solve_linear_inequality.py::test_multi_equal` #6014
-* MXNet frontend support for AMP cast op #5976
-* Demo showing how to run a pruned model. #5975
-* Move compiler related registry items to `vta/build_module.py` #6012
-* Pin keras version #6032
-* Fix in `arm_cpu/conv2d_alter_op` for NHWC quantized #6027
-* Add creation of Hexagon device in RPC client #6035
-* Terminate basic block after “ret” instruction #6036
-* µTVM CRT modifications for on-device RPC server #5921
-* Create TBAA information based on the underlying buffer type #6046
-* Add support for tflite `arg_min` and `arg_max` #5992
-* Fix `fully_connected` converter when batch size is not 1 #6038
-* Fix a primitive check error #5991
-* Refactor to expose MakeOp functions to C++ #6047
-* Fix `conv2_gemm` after target structure update #6037
-* Remove use of designated initializers from `hexagon_module.cc` #6055
-* Build crttest and cpptest separately. #6057
-* Fix pytorch frontend prim::Constant issue #6051
-* update frontend tutorials to new model based runtime interface #6063
-* Remove unnecessary std::cout #6072
-* Fix error message in Buffer::vstore, NFC #6056
-* Fix FSIM Compile Error. #6070
-* Improve vector simplification for float operands #6043
-* Fix LocalBuilder on macOS with python 3.8. #6083
-* Add missing test for fast erf #6058
-* Fixed point multiplication improvements for AArch64 #5980
-* Fix code generation bugs for C/CUDA & Improve VerifyGPUCode pass #6041
-* Delete declaration of unused `op_node` #6102
-* Load configs even it has no entity #6100
-* Update SGX example Cargo.toml #6067
-* Add default value for option `USE_DNNL_CODEGEN` in the cmake #6099
-* Update installation doc with minor improvements #6104
-* lint: add opencl .cl file type #6092
-* Clean up conversions between TVM and Rust functions #6114
-* Improve reduction schedule on arm CPUs #6110
-* Register Shape Func for Some Operators to Handle Dynamic Shapes #5955
-* Fix variable name conflict with OpenCL keyword #6048
-* Some rust cleanups #6116
-* Option to specify alternate directory to output build to #6016
-* Add `get_num_inputs` to GraphRuntime #6118
-* TFLite quantized conv test #6084
-* Fix autotvm on the `conv2d_nchw_winograd.mali` operator #6130
-* add attr option mfloat-abi for arm32 #6123
-* Fix CUDA Library Tuning #6132
-* Add missing RPC sources after refactor #6113
-* Correct `runtime.load_module` #6161
-* Improve error messages in graph tuner, graph runtime, and module loader. #6148
-* Fix some shape mismatches between TF and Relay #6166
-* Improve doc string #6176
-* Fix incorrect function signature in header #6172
-* Fix alignment of note #6181
-* Implemented PADV2 Operator for TFLite and added support for constant values in PAD. #6167
-* Unary ops support added in frontend #6196
-* Change the meaning of `conv3d_transpose` `output_padding` to match `conv{1,2}d_transpose` #6065
-* Fix compile warnings. #6204
-* Fix -mfloat-abi=soft compilation for ARM with OpenCL target #6150
-* Match pytorch 1.6 googlenet pretrained model (#6201) #6212
-* Mod operator, bug fix #6160
-* RESHAPE with dynamic shape arg in TFLite frontend #6208
-* Fix compilation error with cuda 11 #6213
-* Fix `port_end` wrong default value 9199 to 9099 for keeping same with source code #6220
-* Std op without specified dimensions support #6226
-* fix crt building and running error #6231
-* Implemented `ONE_HOT` Operator for TFLite. #6223
-* Avoid unexpected throw in AttrInitEntry #6128
-* Added casting to hybrid script doc and fixed pass infra doc #6174
-* Fix compile warnings. #6204
-* Fix -mfloat-abi=soft compilation for ARM with OpenCL target #6150
-* Mod operator, bug fix #6160
-* Fix compilation error with cuda 11 #6213
-* Fix `port_end` wrong default value 9199 to 9099 for keeping same with source code #6220
-* Std op without specified dimensions support #6226
-* Verify that tensor reshape is valid. #6215
-* Fix crt building and running error #6231
-* Fix `conv2d_transpose` output padding #6236
-* Fix cuda half math function is undefined: hpow, htanh #6225
-* Fix division range estimation error in simplifier #6244
-* Fix newer GCC compiler warnings. #6257
-* Support `_contrib_SyncBatchNorm` #6245
-* Fix reduction #6250
-* Add apt repository for clang-11 and llvm-11 #6256
-* Update tutorial to new TARGET as `micro_dev` is no more #6262
-* Fix clang-format #6264
-* Trivial fix, up the rodata section for the discovery board to 512 bytes. #6259
-* Fix cuda half math function is undefined: hpow, htanh #6253
-* Add dilation in x86 NCHWc depthwise conv support #6267
-* Decrease test times by introducing testing model #6235
-* Add support for parsing the any dimension. #6277
-* Improve error messages for memory verifier and gpu memory verifier #6281
-* Reflect Compile-Time CMake Options into libtvm.so #6280
-* Add cmake options into libinfo #6286
-* Update slice to infer attributes when not graph inputs #6276
-* Use rpc.LocalSession for simple tests #6294
-* Fix random fail #6312
-* Fix resize test #6298
-* Fix cython FFI compact with np.int64 #6321
-* Fix relay vm optimize #6322
-* Changed TVMCTVMContext to TVMContext #6306
-* Make able to compile with MSVC #6341
-* ROCm changed name of library and removed the old one in ROCm 3.7 release. #6345
-* Compatible for ROCm before 3.7 #6359
-* Use clear name that is separate from ASF brand for cache #6360
-* Fix `Dockerfile.demo_android` #6361
-* Fix sparse dense schedule on cuda #5803
-* Fix strategy for sparse dense cuda #5782
-* Fix x86 conv2d template when tuning with unpacked layout #5938
-* Fix the filter width parameter in `depthwise_conv2d` #6081
-* Fix reshape usage in ARM schedule #5732
-* Missing header #4865
-* Fix `conv2d_transpose` output padding #6236
-* Simplify reduce expression in te.gradient #6611
-
-### API Changes
-* `tvm.module` -> `tvm.runtime.module`
-* `tvm.module.load` -> `tvm.runtime.load_module`
-* `tvm.module.enabled` -> `tvm.runtime.enabled`
-* `tvm.module.system_lib` -> `tvm.runtime.system_lib`
-* `tvm.relay.Module` -> `tvm.IRModule`
-* `tvm.create_schedule` -> `tvm.te.create_schedule`
-* `tvm.placeholder` -> `tvm.te.placeholder`
-* `tvm.compute` -> `tvm.te.compute`
-
-### Deprecation
-* Deprecate NNVM (#4535, #4562, #4565, #4571)
-* Deprecate FreeStmt #5890
-* Remove legacy `compute_expr.h` #5738
-* Deprecate OpenGL #5711, #5712
-
-## 0.6
-
-### Relay in Production
-Relay is a functional, differentiable programming language designed to be an expressive intermediate representation for machine learning systems. Relay supports algebraic data types, closures, control flow, and recursion, allowing it to directly represent more complex models than computation graph-based IRs (e.g., NNVM) can. In TVM v0.6, Relay is in stable phase and is ready for production.
-
-* Algebraic Data Types (ADT) support (#2442, #2575). ADT provides an expressive, efficient, and safe way to realize recursive computation (e.g., RNN). Refer to https://tvm.apache.org/docs/langref/relay_adt.html for more information.
-* Pass manager for Relay (#2546, #3226, #3234, #3191)
-* Most frameworks have been supported in Relay, including ONNX, Keras, Tensorflow, Caffe2, CoreML, NNVMv1, MXNet (#2246).
-* Explicitly manifest memory and tensor allocations in Relay. (#3560)
-
-### Relay Virtual Machine
-The Relay Virtual Machine (Relay VM) is the new generation of runtime to strike a balance between performance and flexibility when deploying and executing Relay programs. Previously, the graph runtime is able to utilize the fully static nature of the input graphs to perform aggressive optimization such as fully static allocation, and optimal memory reuse. When we introduce models which make use of control-flow, recursion, dynamic shapes, dynamic allocation we must change how execution works.
-
-Relay VM is now usable and is able to achieve decent performance for a variety of models and targets.
-
-* Design (#2810 #2915) and a first version of implementation (#2889),
-* Add VM runtime for Relay and compiler support (#3120, #3121, #2889, #3139)
-* Relay VM (pattern matching #3470, port to python #3391, serialization #3647)
-* Relay VM Profiler (#3727)
-* Support execution on devices for Relay VM (#3678)
-* [Relay][VM] Add more passes to VMCompiler (#4058)
-* [relay][vm] Separate VM runtime with executable (#4100)
-* Port VM, VM compiler, and Object into Python (#3391)
-* VM: Add AllocTensor instruction and better instruction printer (#3306)
-* [Relay][VM][Interpreter] Enable first-class constructors in VM and interpreter via eta expansion. (#4218)
-* [Relay][VM] Clean up the VM and VM profiler code (#4391)
-
-### Training
-Relay is designed to natively support first-order and higher-order differentiation. The automatic differentiation infrastructure is now usable, and a number of operators with gradient support are available in the v0.6 release.
-
-* Higher order reverse mode automatic differentiation that work with control flow (#2496)
-* Higher order continuation passing style (#3456, #3485 )
-* Relay gradient registration (clip #3509, `max_pool2d` and `avg_pool2d` #3601)
-* Relay AD algorithm (#3585)
-* Relay Training - allow gradient to return a tuple (#3600), numerical gradient check (#3630)
-* Improve AD for concatenate (#3729)
-* [Relay][Training] Add missing gradient check to gradient pass (#4169)
-* As a part of Relay's automatic differentiation system, we are adding primal gradients for Relay operators. Please refer to #2562 for tracking the progress.
-* Gradient for Conv2d (#3636)
-* Add gradient operators (#3857, #3894, #3901, #3915)
-* Add gradient for log-softmax (#4069)
-* [Relay][Training] Add gradient for Crossentropy (#3925)
-* [Relay][Training] Add and fix gradients (#4126)
-
-### Quantization
-
-Low-bit inference is getting more and more popular as it benefits both the performance and storage usage. TVM now supports two types of quantization. 1. Automatic quantization takes a floating-point precision model, does per-layer calibration, and generates a low-bit model. 2. TVM also imports pre-quantized models from Tensorflow and MXNet; a new dialect, QNN, is introduced to handle further lowering to normal operators.
-
-* Automatic Quantization
-  - Low-bit automatic quantization supported. (#2116). The workflow includes annotation, calibration and transformation.
-  - Refactor quantization codebase and fix model accuracy. (#3543)
-  - KL-divergence-based per-layer calibration. (#3538)
-  - Add option to select which convolution layers are quantized. (#3173)
-  - [Relay][Quantize] Integrate data-aware calibration into quantization. (#4295)
-* Pre-quantized model support (QNN operators and legalize pass).
-  - Add a legalize pass to Relay (#3672)
-  - Qnn Concatenate, quantize, dequantize and requantize operators (#3819,  #3730, #3745, #3531)
-  - QNNtoRelay & QNNLegalize Pass utility (#3838, #3782)
-  - Requantize: Optimize lowering for some corner cases. (#3864)
-  - New quantized operator support: conv2d, add, dense (#3580, #3736, #3896, #3910)
-  - Do type checking for the input and kernel in the qnn conv2d (#3904)
-  - Legalize and AlterOpLayout for Intel int8. (#3961)
-  - Renaming tests to follow the Relay nomenclature. (#3975)
-  - Fix padding changes due to #3739 (#3989)
-  - Memorizing quantize node mapping to avoid duplicated simulated quantization (#3233)
-  - Infrastructure to support pre-quantized models (QNN) (#3971).
-  - [Relay][AlterOp] NHWC to NCHWc support for Pool, concatenate, sum. (#4059)
-  - [TOPI][x86] Cascade lake support. (#4123)
-  - [TOPI][x86] Legalize - Support int8xint8 convolution to use VNNI inst (#4196)
-  - Qnn dequantize with min max using Mxnet flavor to support Mxnet prequantized models. (#3945)
-  - Improve the lowering of Qnn Dense (#4213)
-  - Adding support for dequantizing from int32 to float32. (#4130)
-  - [QNN] Refactor fixed point multiplication in requantize (#4073)
-  - [Relay][Quantize] Use fixed point multiplications (#4160)
-  - Add support for quantized multiply to Relay (#4141)
-  - Use legalize to handle NHWC layout for `arm_cpu` (#3754)
-  - [QNN][Legalize] Specialize for Platforms w/o fast Int8 support (#4307)
-  - [QNN] Use Int16 upcast in Fallback Conv2D. (#4329)
-  - Retain input kernel scales in QNN dialect (#4292)
-  - [QNN] Lowering for Depthwise Convolution. (#4351)
-  - [QNN][TFLite] Parsing QNN Add op. Adding MobilenetV2. (#4142)
-  - [QNN][TFLite] Parsing TFLite quantized models. (#3900)
-  - Added tflite frontend support for quantized mean. (#4339)
-  - [Relay][Legalize] Legalize `conv2d_transpose` for NHWC (#4399)
-
-### Accelerator and Microcontroller Support
-
-TSIM is introduced to improve software and hardware integration and simulation accuracy. It integrates the hardware development process into the software stack. TSIM enables VTA to provide more accurate performance feedback, i.e. clock cycles, compared to the traditional functional model of a hardware accelerator. Moreover, a Chisel implementation for VTA is available and it runs on top of TSIM.
-
-There has been a proliferation of resource-constrained and embedded devices that do not have operating systems or a mature software stack. MicroTVM is intended to support TVM on such bare-metal devices.
-
-* [TSIM] Enabling Cycle-Accurate Hardware Simulation for VTA (#3010, #3206, #3242)
-* Chisel implementation for VTA and runs on top of TSIM (#3258, #3347)
-* MicroTVM (#3227)
-* Relay Compilation + AutoTVM compatible operator libraries for VTA (#3135)
-* ChangeBatch pass for batched VTA compilation (#3656, #3660)
-* VTA fast simulator statistics (#3481)
-* TSIM improvements and fixes (#3505)
-* Chisel VTA enhancements and fixes (32bit support #3558, alu instruction generation #3592, coherence support #3593, separate types #3605, tensor issue/commit #3637, uop load request #3643, uop dma requests #3654)
-* VTA Runtime refactor for non-shared memory FPGAs (#3590)
-* VTA HLS codebase refactor for Ultra96 (#3496)
-* VTA support for batched inference (#3661)
-* VTA bitstream compilation for Intel FPGA (#3494)
-* TSIM: Introduce Virtual Memory for TSIM Driver (#3686)
-* Parallel TSIM hardware compilation with macOS and debug support (#3797)
-* Chisel: scale dram base address in hardware instead of runtime (#3772)
-* Chisel: run all unittests by default (#3766)
-* Chisel: improved Data Gen, Added ALU Test (#3743)
-* Chisel dependencies for TSIM CI (#3721)
-* Chisel: Added Module Unit Test Infrastructure (#3698)
-* Add ISA BitPat generation (#3891)
-* de10-nano driver (#3394)
-* Extending Vision model coverage compilation for VTA (#3740)
-* Conv2d transpose (deconvolution) operator support (#3777)
-* Support TLPP in function simulator. (#3555)
-* [VTA][Chisel] TSIM VTA Source Refactor (#4163)
-* [VTA][TSIM] Serial GEMM Application Added (#4082)
-
-### Rust Support
-Rust language support in TVM includes two parts. 1. The frontend wraps the current C API and exposes a Rust programming model. 2. The backend serves as an alternative to the C++ runtime. It provides a standalone WASM module and security support, e.g., SGX.
-
-* Rust frontend (#2292).
-* Unify types between bindings and pure Rust impl (#2616)
-* Rust: load syslib modules at compile time (#3274)
-* Rustify PackedFunc & Friends (#2969)
-* Rust DSO module (#2976)
-
-### Operator Support
-* A special operator `annotation.stop_fusion` to prevent it being fused with previous expressions (#2624).
-* `batch_matmul`  supported (#2561).
-* `reverse_reshape` supported (#2503).
-* Faster-RCNN proposal operator for CUDA (#2420).
-* Vision operator for YOLO `yolo_reorg` (#1941).
-* `slice` operator for MXNet (#2662).
-* `arange` supported (#2621).
-* Vision operator `roi_align` (#2618).
-* `where` operator for MXNet (#2647).
-* Deformable conv2d (#2908)
-* Faster-RCNN Proposal OP (#2725)
-* ROI Pool operator (#2811)
-* Gluoncv SSD support on CPU (#2353)
-* shape, reverse, and sign op (#2749, #2800, #2775)
-* tile and repeat op (#2720)
-* logical operators (#2743, #2453)
-* stack op (#2729)
-* NCHWc upsampling (#2806)
-* clip and wrap mode support in take (#2858)
-* AlterLayout support for `intel_graphics` conv2d , depthwise conv2d (#2729, #2806)
-* Add foldr1 operator (#2928)
-* Add rsqrt operator (#2949)
-* Add clip and wrap mode support in take (#2858)
-* `Gather_nd` exposed to relay (#2945)
-* `bitserial_conv2d` move to autotvm template and updates (#2819)
-* Port x86 NCHWc to AutoTVM for Task Extraction (#2664)
-* Implement relay `nn.bias_add` compute in C++ (#3027)
-* Rename output tensors for better readability (#3006)
-* int8 dense on CUDA & Dense op quantization (#2877)
-* Bitserial dense operators for CPU (#3051)
-* Enhance upsample operator to adapt onnx opset v9 (#2968)
-* Add adaptive pooling operator (#3085)
-* Add all operator (#3124)
-* Add cblas `batch_matmul` (#3210)
-* Add packing for int8 1x1 convolution and support the int8 group convolution on X86 (#2991)
-* Add op size (#3094)
-* x86 TOPI (`roi_align` #3475, `conv2d_transpose` #3491)
-* Intel INT8 (dilation in conv2d #3510, type checking #3516)
-* Reinterpretation of tensor elements (#3599)
-* Sparse-Dense for block-sparse multiplication (#3566)
-* Winograd matrix computation (#3553)
-* CUDA schedule for `pool_grad` (#3622), `group_conv2d` (#3663)
-* Bitserial operations conv2d, dense and bitpack (#3844)
-* Improve numeric gradient check (#3856)
-* Resize rework ([3788](#3788))
-* Improve `conv2d_transpose` CUDA schedule template (#3796)
-* SpaceToDepth and MirrorPad Operators (#3718)
-* Add variance and layer norm op (#3700)
-* Add `sparse_transpose` for Square CSR matrices (#3707)
-* TOPI: Memoize winograd matrix (#3687)
-* New TOPI operators: `erf`, `logical_and`, `logical_or`, `logical_not`, `isnan` (#3702, #3929, #3979)
-* Improve `ceil_divide` in tile/split (#3842)
-* [Relay][Frontend][TF] Add tensor array ops (#3798, #4309)
-* [TF][Op] Op where (#4045)
-* [TOPI]Add op argwhere (#3994)
-* [Relay] `crossentropy_with_logits` and its gradient (#4075)
-* [Relay][Op] Enhance Upsample Operator to support float scales (#4206)
-* [Relay][Op] Add instance norm op (#4004)
-
-### Frontend and User Interface
-* Frontend darknet (#2773)
-* Support tf.gather (#2935)
-* Support tf.where (#2936)
-* Adding ADD operator to tflite frontend for compiling the MobileNetV2 (#2919)
-* Support SpaceToBatchND/BatchToSpaceND in Tensorflow frontend (#2943)
-* Simplify TF `get_output_names` (#3025)
-* TF Tile Round Sign Pow Exp Reverse (#2960)
-* Gluoncv SSD support on the GPU (#2784)
-* Allow an op as loop var in Tensorflow (#3056)
-* Add `FULLY_CONNECTED` op into tflite frontend (#3019)
-* Add MXNet converter for RNN layer ops (#3125)
-* Add log op in tf frontend (#3111)
-* Add SoftPlus Sqrt in Tensorflow frontend (#3187)
-* Add onnx elemwise greater/less (#3186)
-* Add PlaceholderWithDefault (limited) implementation in TensorFlow (#3184)
-* Support `tf.math.reduce_prod` (#3166)
-* Better shape inference in TensorFlow Frontend (#3176)
-* Get list of unsupported ONNX operators (#2995)
-* Implement ONNX MaxPool-v8 and MaxPool-v10 (#3114)
-* Convert TFLite NCHW to NHWC (#3141)
-* Add Crop op converter (#3241)
-* TFLite frontend operator support: PAD, RESIZE, MUL, Reduce (min, max, mean, prod), LOGISTIC, elemwise operators (Sub, Divide, Power, Max, Min) (#3310, #3370, #3304, #3421, #3313, #3357)
-* Tensorflow frontend operator support: Abs, FloorDiv, GatherND, LeftShift, LogSoftmax, Max, Min, Mod, RightShift, ZerosLike, TruncateMod, Neg, ClipByValue, ResizeNearestNeighbor (#3270, #3211, #3393)
-* TFLite: Add `fused_activation_function` for ADD, SUB, MUL, DIV (#3372)
-* Support bidirectional RNN layer for MXNet (#3397)
-* TFLite operator support (pack #3521, split #3520 )
-* Keras operator support (permute, softmax #3618)
-* TF operator support (BatchMatMul #3634)
-* TFLite frontend operator support: tile, transpose (#3814, #3705)
-* ONNX frontend operator support: PReLU for NNVM, Not, Sign, Equal (#3813, #3836, #3760)
-* Keras frontend operator support: Dot (#3668)
-* Add more cases to Keras `_convert_reshape` (#3846)
-* TensorFlow frontend operator support: OneHot, log1p, cos, sin (#3781, #3614)
-* Support BatchMatMul with input dimensions larger than 3 for TensorFlow (#3732)
-* ONNX new operator support: And, Tile, Erf (#3878, #3941, #3988)
-* MXNet new operator support: pad, conv1d, deconv1d (#3739)
-* TFLite new operator support: `batch_to_space_nd`, `space_to_batch_nd`, tanh, greater, relu (#3850, #3996, #3963, #4022)
-* TFLite: Support depthwise convolution multiplier greater than 1 (#3922)
-* Keras: Fix ReLU in Keras Converter missed the case (#3917)
-* Keras: frontend upsample and 1 channel conv2d fixes (#3937)
-* Tensorflow: Convert scalar Const into tvm.relay.const (#3885)
-* TensorFlow: Add support for SquaredDifference (#3930)
-* [relay][frontend] clean up tf frontend (#3710)
-* [Relay][Topi][TensorFlow][ONNX][Lang] Add support for Any op (#4205)
-* [Relay][Frontend][ONNX] Add support for op Where (#4184)
-* [Relay][TopHub] Add switch to disable TopHub download (#4015)
-* Add parser support for CAST tflite operator (#4096)
-* Add parser support for `zeros_like` tflite operator (#4042)
-* Add parser support for SUM tflite operator (#4182)
-* Add support for tf.assert (as no-op) and `tf.no_op` to TF Relay frontend. (#4172)
-* [Relay][Frontend][ONNX] New Operators and Opsets to Support BERT (#4197)
-* [Relay][Params] Add APIs for storing and retrieving parameters from individual functions. (#4194)
-* Add `build_create_shared_func` to tvm/contrib/cc.py (#3840)
-* Tensorflow saved model for NNVM ([#2493](#2493/)) and Relay ([#2586](#2586/)).
-* Introduced `HybridModule` (#2477) so that normal TVM schedule can be compiled to hybrid target, run and dumped to Hybrid Script.
-* [Relay][Frontend][Tensorflow] add operator `add_n` (#4181)
-* [Relay][Frontend][Tensorflow] StopGradient (#4238)
-* [Relay][Frontend][ONNX] Add support for broadcasting to Where and MatMul (#4267)
-* [TFLite] Support PRelu (#4298)
-* [Frontend][MxNet] support mxnet cond op (#4311)
-* Add support for `quant.mul` operator in tflite frontend (#4283)
-* [Relay][Frontend][ONNX] operator support: DepthToSpace, SpaceToDepth (#4271)
-* [Relay][Frontend][Tensorflow]Add `conv2d_transpose`. (#4300)
-* [Frontend]Add TensorFlow FloorMod (#4308)
-
-### Runtime and Backend Support
-* Make external library extend TVM's NDArray more easily (#2613).
-* Improvements for NNPACK integration, includes ci test, winograd (#2846, #2868, #2856, #2721)
-* Improvements for OpenCL runtime (#2741, #2737)
-* GraphRuntime: Enable sharing parameters of a model among multiple threads (#3384)
-* Android runtime argsort support (#3472)
-* GraphRuntime enhancements (`set_input_zero_copy` #3416)
-* A new minimal runtime implementation (~12kb .text on ARMv7/x86) for TVM.
-* Add AVX512VNNI support for TVM (#3388)
-* Enable miopen Group Convolution (#3987)
-* Minimal runtime (~12kb .text on ARMv7/x86) for subset of TVM models (#3567)
-* [RUNTIME] Separate runtime related contrib into runtime/contrib (#4207)
-* [topi] add ARM v8.2 udot (uint8) support (#3978)
-* [codegen] Add multiple operands and function support when using fp16 compilation (#4056)
-* [TOPI] Added support for Mali Bifrost target (#4047)
-* [topi] enable fp16 sort for arm (#4084)
-* Add OpenOCD Low-Level Device (RISC-V Support) (#3756)
-* Add wave 32 bc for AMD ROCm backend (#3984)
-* [RUNTIME] Support C++ RPC (#4281)
-* [TOPI][OP] Support Faster-RCNN Proposal OP on CPU (#4297)
-* [TVM][RUNTIME] A minimum example to generate external library wrappers for DSOModule (#4280)
-
-### Language and Architecture
-* Support custom datatypes (#2900)
-* Add the acc16 intrinsic support (#3081)
-* Handle float16 constants & fix BatchNorm (#3260)
-* Structural hash - incorporate the var type into its hash (#3267)
-* Relay C++ Build Module (#3082, #3144, #3174)
-* Enable decorating python class to be a Relay Pass (#3364)
-* Make Partial Eval support interprocedural optimization and termination check. (#3033)
-* Introduce feature manager to Relay. (#3236)
-* Use Relay parser to define the Relay prelude (#3043)
-* Mechanism to detect incomplete expression match in Relay (#3203)
-* EQ/NE operators support for StringImm expressions (#3283)
-* Mechanism to detect incomplete expression match in Relay (#3203)
-* Introduce CanonicalizeCast pass to formally reduce memory overhead introduced by fused cast operations (#3280)
-* Support overloading comparison operations in Relay (#3168)
-* Mac count: provide a pass to calculate the number of multiply-accumulate operations in a network (#2609).
-  - support for `conv_2d_transpose` (#3469)
-  - [Relay][Pass] Count MAC for BatchMatMul (#4157)
-  - Detect depthwise conv2d in `mac_count` pass (#3083)
-* Add Tuple pattern (#3596)
-* Text format support for ADTs and prelude (#3863, #3939)
-* Add new IR pass CombineParallelDense (#3862)
-* Add support for `EQ` op in the deduce bound and the loop partition (#3775)
-* Introduce base-class IRMutatorWithAnalyzer (#3969)
-* Define more standard global functions in the prelude of relay program, includes foldr1, hd, tl, nth, list update (#2928, #2917, #2771, #2866)
-* Add SkipVectorize pass (#3222, #3228)
-* [Relay][Pass] Add pass to remove unused functions in relay module (#4334)
-
-### Symbolic shape enhancement
-* Add shape function for symbolic shape. It enables certain cases for broadcast with symbolic shapes. (#3606)
-* [tvm][any] broadcast with values other than one (#3967)
-* Symbolic shape support (broadcast op #3389)
-* Support reshape for dynamic shape in tf converter (#4185)
-* Runtime Shape Functions (#4179)
-
-### Language and Architecture
-* An optimization pass to eliminate expressions which have the same functionality and same inputs (#2639).
-* Refactor text printer to add stream-like API and FunctionType support (#2605, #2882)
-* Build a scaffold for structured error handling (#2838). The new mechanism detects and rewrites error messages so that C++ and Python stack traces are unified and not redundant. Guidelines and conventions for error handling are also discussed.
-* Higher order reverse mode automatic differentiation that works with control flow (#2496)
-* Integer arithmetic analyzers, includes modular set analysis, const integer bound analysis and rewrite simplifier (#2904, #2851, #2768, #2722, #2668, #2860)
-* Improve operator fusion for TupleGetItem in relay (#2914, #2929)
-* Compute FLOP of autotvm template for int8 models (#2776)
-* Common subexpression elimination pass in Relay (#2639)
-* Improve quantization in Relay (#2723)
-* Refactor `build_func` in measure module of autotvm to better support cross compiler (#2927)
-* Quantize all fields of concatenate (#2913)
-* Remove stale verilog generator (#2964)
-* Improve Relay printing (#2984, #2881, #3030, #3041)
-* Add `min_num_branches` option in CombineParallelConv2D (#2961)
-* Add `expr_visitor`, fix `expr_functor` exponential blowup problem (#2988)
-* Support Deriving channels when it is not provided in AlterLayout. (#2972)
-* Enhance BoundDeduce algorithm (#2795)
-* Enhance loop partition algorithm (#2956)
-* Better tuple fusion implementation (#3092)
-* Enhance fusion rule that starts from elemwise and broadcast (#2932)
-* Remove `on_device` op after annotation in heterogeneous pass (#3204)
-* Improve canonical and rewrite simplifier (#3132, #3149)
-* Capture constant external python variables in hybrid script (#3157)
-* Remove Peano nats from the prelude (#3045)
-* Macro to define NodeRef methods, constructor style example (#3224)
-* Consistent RAII scoping API (#3231)
-* Register all operators' attributes in Python (#3175)
-* Add module support in relay.build (#3424)
-* Relay pass infrastructure improvement (#3319, #3336, #3430, #3353)
-* Migrate Relay passes to pass manager (#3323, #3289, #3251, #3406)
-* Improve heterogeneous annotation by using visitor (#3261)
-* Support export ADT value in Python (#3299)
-* Extend TensorComputeOp to allow scalar inputs (#3300)
-* Transitioning low-level IR away from HalideIR (#3533, #3535)
-* Tags for ADT constructors (#3369)
-* IR dumping for debugging (#3493)
-* Pretty printer and parser roundtrip (#3460, #3536)
-* Relay type checking (conv2d weight dimension #3511, any shape #3221)
-* Relay Module enhancements (remove free variables #3476)
-* LLVM DWARF debug information (#3420)
-* Printer for Layout/BijectiveLayout (#3582)
-* Type inference escape hatch (#3571)
-* Making iterators compatible with constructors of STL containers (#3624)
-* Moving Conv, Dense, Concatenate InferTypes to header (#3783)
-* Simplify casts of constants 0 and 1 (#3758)
-* Conditionally replace reduction init axis. (#3408)
-* Improve Partial Evaluator (#3749, #3703)
-* Strict mode in Relay pattern matching (#3620)
-* Quit and clean when TVM is interrupted (#3640)
-* Make Type Relation catch more errors (#3899, #3699)
-* Refactor the way we interface between different modules of Relay (#3906)
-* Introduce `schedule_injective_from_existing` and unify external schedules for all targets (#3983)
-* [NODE][REFACTOR] Refactor reflection system in node. (#4189)
-* Unify node system and object (#4161, #4115, #4128)
-* [Relay][Refactor] Rename Datatype to ADT (#4156)
-* [Relay] fix exponential blowup in interpreter (#3559)
-* [Relay] Fix memory leak in the interpreter (#4155)
-* [rpc] use callback func to do send & recv (#4147)
-* Add `lift_if_then_else` pass to improve loop partitioning (#3865)
-* Decrease the complexity of CalcDep from exponential to linear (#4053)
-* [IR] Make iterators compatible with constructors of STL containers (#3624)
-* [Relay][Pass] Avoid FoldConstant folding some ops (#4245)
-* [Relay][Prelude] More dtypes support in `tensor_t` (#4233)
-* [NODE][REFACTOR] Rename IRFunctor->NodeFunctor, use func pointer (#4247)
-* [RUNTIME][REFACTOR] Use object protocol to support runtime::Module (#4289)
-* [CodeGen] Add build config option `disable_assert` to control whether to generate assert. (#4340)
-
-### Arithmetic Analysis
-* Formalize Integer Arithmetic Analysis (RFC: #2588). It is aiming to perform better context-dependent analysis, bound analysis, centralized arithmetic logic and arithmetic simplification. (#3272, #3463, #3464, #3368, #3503, #3504 , #3502, #3479 , #3568)
-* Introduce FloorDiv/Mod, TruncDiv/Mod, and IndexDiv/Mod for better arithmetic simplification (#3976, #3986, #4000, #4014, #4008, #4028)
-* [ARITH] Use floordiv for the deduce bound (#4025)
-* [Simplifier] Rewrite simplification rule to eliminate unnecessary conditionals. (#4076)
-
-### Runtime and Backend Support
-* Provide error msg for failure function call in tvm4j (#2967)
-* Expose backtrace symbols in Debug mode (#3001)
-* C++ GraphRuntimeCodegen, Deprecate Python2 (#2986)
-* Ensure interpreted functions can take values that are not TensorValues (#3015)
-* Make OpenCL runtime Compatible with OpenCL2.0 (#2897)
-* Handle INF and NAN in CUDA and OpenCL (#3194)
-* Update debug graph runtime for more precise layerwise timing (#3232)
-* ROCM support (llvm printing #3662, ld.lld finding #3664, save to file #3665)
-* Threadpool: make `spin_count` configurable (#3577)
-* RPC worker children termination (#3669)
-* Vulkan runtime reimplementation (stream approach) (#3849)
-* Vulkan backend supports Call::reinterpret and vectorized comparison (#3795)
-* Support MKL on Windows (#3837)
-* Vulkan IR builder (bool to float #3513)
-* Force `code_object_v2` for amd gpu backend (#4099)
-* [Codegen][cuda-fp16] fallback to fp32 simulation when cuda arch < sm53 (#4268)
-* Fix and refactoring for AMD gpu backend (#4305, #4321, #4341, #4342)
-* [Debugger] Sorting op-time breakdown for quicker analysis. (#4352)
-* [nvcc] enable multiple arch in one fatbin (#4377)
-* [RUNTIME] Move module export to the function level. (#4405)
-
-
-### Frontend and User Interface
-* Relay now supports saving and loading parameter dictionaries. (#2620)
-* Add `max_num_threads` to Hybrid Script, which allows users to get max number of threads for GPU targets ([#2672](#2672/)).
-* Improvements for tensorflow frontend (#2830, #2757, #2586), includes decompiling tf control flow (#2830)
-* Improvements for mxnet frontend (#2844, #2777, #2772, #2706, #2704, #2709, #2739)
-* Improvements for keras frontend (#2842, #2854)
-* Improvements for DarkNet frontend (#2673)
-* Improvements for ONNX frontend (#2843, #2840)
-* Better profile result dump in Chrome Tracing format (#2922, #2863)
-* Unified error handling in NNVM and Relay frontends (#2828)
-* Improve NNVM to Relay conversion (#2734)
-* Remove `input_0d_mismatch` special handling for TF Frontend(#3087)
-* Bumped ONNX version from 1.1.0 to 1.4.1 (#3286)
-* Simplify parameter handling in Tensorflow frontend (#2993)
-* CoreML improvement for image scaler and padding (#3800)
-* Clean up TensorFlow frontend (#3710)
-* Darknet: Solve tvm parsing darknet resnext failure bug (#3778)
-* Frontend changes `get_workload` - (#3483)
-* [TF][Relay][Op] Pass module when infer shape (#4287)
-
-### AutoTVM
-* Support override in `register_topi_compute` and `register_topi_schedule`. (#3292)
-* Improve graph tuner dealing with Tuple. (#3649)
-* Add AutoTVM template for conv2d Intel int8. (#3955)
-* Add AutoTVM template for dense on CUDA. (#3923)
-* Add AutoTVM template for conv2d on Intel graphics. (#3839)
-* Optimizing autotvm task extraction speed. (#4138)
-* [AutoTVM] Add `batch_matmul` to tunable operations. (#4242)
-* Selecting tuning templates when extracting task. (#4338)
-
-### Performance Improvements
-* Enable AlterOpLayout pass for x86 on Relay (#2585). It is essential to get decent performance for CNN-based model on Intel CPUs.
-* Better intrinsic matching for x86 CPU and ARM CPU, includes variants of vcvtph2ps and vmlal.s16 (#2925, #2748).
-* Improve injective schedule for ARM CPU(#2801)
-* Core functionality for Graph tuner (#2184)
-* Fast tanh implementation (#3255)
-* Improve multi-batch conv2d on x86 (#3308)
-* Improve `non_max_suppression` and `get_valid_counts` for CPU (#3305)
-* Improve `roi_align` performance for CPU (#3296)
-* Improve `nms` and `get_valid_count` performance (#3282)
-* Graph tuner for multiple subgraph (#3490)
-* For sparsity, fast transpose for square CSR matrices has now been merged, which is a good starting point for more general sparse type support.
-* Reduce `set_input` and `set_input_zero_copy` overhead (#3805)
-* Parallelize batch axis for ARM (#3931)
-* Support cuBLAS BatchMatMul (#3936)
-* Add AVX512VNNI support for TVM (#3388)
-* Enhance tuning space of split (#3949)
-* Enable miopen transpose convolution and fp16 support (#3952)
-* Improve `conv2d_transpose` schedule on X86 and CUDA (#3948)
-* Expose llvm.nearbyint intrinsic (#4001)
-* [TOPI][X86] Pool operator parallel support. (#4090)
-* Improve layout for several operators (#4103, #4040, #4080)
-* [Relay][VM] Fix constant folding issue in VM compiler (#4077)
-* [relay][vm] Reuse allocated device memory (#4170)
-* [Runtime] Enable option to use OpenMP thread pool (#4089)
-* [PERF] Parallelize reduction for CPU (#4158)
-* [TOPI] Tunable Template for Conv2D HWCN on CUDA (#4168)
-* [TOPI] Add valid auto tvm for Intel Graphics (#4078)
-* [TOPI] FIFO buffer op, to accelerate sequence modeling with dilated convolutions (#4039)
-* TensorCore Support using Intrinsic (#4136)
-* Auto TensorCore CodeGen (#4234)
-* Use cblas for dense and `batch_matmul` (#3787)
-* Update TOPI softmax compute and CPU schedule (#3680)
-* [VTA] Performance optimization, remove unnecessary contiguous memory use. (#4246)
-* [TOPI][AlterOpLayout][ARM] Enabling NHWC to NCHW layout transformation. (#4249)
-* [PERF] Parallelize reduction for CPU (#4158)
-* [ThreadPool] Solve thread transitions issue (#4344)
-
-### Documentation
-* Tutorials for deep learning frameworks support in Relay.
-* Tutorial for running AutoTVM with Relay (#2594).
-* Document for Algebraic Data Types (#2575).
-* Move NNVM tutorials to Relay (#2783, #2785, #2766, #2693)
-* Documentation on operators (#2761)
-* Add gradient operator tutorial docs (#2751)
-* Add compiler pass tutorial docs (#2746)
-* Add Android Tutorial (#2977)
-* Developer documentation for InferBound pass (#3126)
-* Add missing targets to `target_name` documentation (#3128)
-* Various documentation improvements (#3133)
-* Add VM doc (#3188)
-* Update documents for TSim (#3409, #3318, #3302, #3343, #3206)
-* Improve tvm4j document describing LLVM support (#3404)
-* Tutorial migration to Python3 (#3498)
-* Android RPC README (#3500)
-* Documentation for Relay opcode (#3522)
-* Tutorial for pass manager (#3515)
-* Minimum version of Python in docs (#3588)
-* Relay pass infra (#3583)
-* X86 Autotune tutorial improvements (#3609)
-* YOLOv3 tiny Darknet tutorial (#3674)
-* SSD doc to avoid confusion (#3677)
-* Tutorial: Build a Graph Convolutional Network on TVM (#3681)
-* Add docs for analysis namespace (#3985)
-* [tutorial] Relay pass infra tutorial (#4083)
-* [DOCS] Add TensorFlow frontend docs (#4154)
-* Tutorial: update Building a Graph Convolutional Network tutorial (#4060)
-* [Docs] Add dependency of compilation with LLVM (#4117)
-* [Documentation]Fix example code in comment of `tvm.build_module.build()` (#4195)
-* TSIM: add virtual memory support to examples (#3868)
-* Relay pass infra tutorial (#4083)
-* Fix the TF tutorial to run against TF2.0 and TF1.x (#4104)
-* Add `topi.nn.fifo_buffer` to TVM doc (#4343)
-* License statement (#4345, #4359, #4401, #4402, #4408, #4409, #4410, #4414, #4431)
-
-### Build and Test
-* Increase the robustness of CI tests (#2841, #2798, #2793, #2788, #2781, #2727, #2710, #2711, #2923)
-* Improve conda build (#2742)
-* Add caffe2 nnvm frontend to CI (#3018)
-* Use bridge network and expose port on macOS when launch docker image (#3086)
-* Run DarkNet tests (#2673)
-* Add file type check (#3116)
-* Always run cpptest during build to ensure library correctness (#3147)
-* Handle more file types in ASF header (#3235)
-* Add `test_forward_ssd_mobilenet_v1` to `tflite/test_forward` (#3350)
-* Add Azure build pipeline (#3458, #3459)
-* Update ci-gpu to v0.52 (#3374)
-* Enable more visible symbols by default (#3365)
-* Separate out legacy as a stage in CI (#3337)
-* Simplify build script, remove python 2 support  (#3419)
-* Ignore rust cargo lock files in rat (#3314)
-* Improve CUDA Conda package build (#3281)
-* Update CMakeLists.txt to be more flexible to find the third parties libraries (#3354)
-* Docker update conda package (#3344), requests and pillow (#3495), Android demo (#3499), rat install (#3527), ARM support (#3546), LLVM (#3590)
-* Relay-to-Python testing (#3156)
-* Code refactoring/remove (#3523, #3667)
-* Zero-rank testing (#3612)
-* CMake compilation (#3611, #3650, google test #3628)
-* Standalone wheel build for TOPI (#3657)
-* Fixing performance issues in PassUpDomain when fusing and splitting axes (#3073)
-* conda recipe (#3791)
-* Allow users to specify download directory (#3803)
-* Update docs for installation for CUDA (#3832)
-* Update `hybrid_script.rst` (#3799)
-* Acknowledge Halide attributions (#3824)
-* Add psutil dependency (#3780)
-* Temporary disable rust test (#3809)
-* Solve occasional CI issue when pad value is all 0 (#3801)
-* Towards TSIM CI testing (#3704)
-* Use pip3 for python3 (#3742)
-* Update docker image `ci_cpu,i386` to include verilator (#3738)
-* Remove sccache from Rust install (#3728)
-* Update dmlc-core to the latest commit (#3716)
-* Update GPU docker (#3709)
-* Add an option to build with -pthread (#3671)
-* Add DGL to `{ci_gpu, demo_cpu, demo_gpu}` docker images (#3692)
-* Use pytest instead of nosetest (#3524)
-* Enable NHWC of `relay.testing.mobilenet` (#3886)
-* Add .hsaco save/load for `tesnor_expr` Tutorial (#3852)
-* Support LLVM trunk (#3907)
-* Remove GTest cmake flag from install docs (#3953)
-* Allow `USE_LLVM` to take extra arguments (#3954)
-* [CI] Pin NNPack pthreadtools version (#4152)
-* [TOPI] Fix flaky testcase for check round (#4211)
-* [CI] Move gpu docker binary to cuda10 (#4229)
-* [CI] use llvm9 for the gpu tests (#4224)
-* [CI] Update GPU docker to cuda10 (#4228)
-* [Relay] Install Relay Prelude program in package install (#4227)
-* [relay] use `time_evaluator` for measurement (#4191)
-* [Relay] Improve build error when no lowered funcs are produced (#4132)
-* [llvm] switch to use Align for llvm trunk (#4051)
-* [CUDA] Update `have_int8` condition to run on compute capability 7.x devices (#4214)
-* [DOCKER] Pin torchvision==0.4.1 (#4140)
-* [DOCKER] torch install depends on future package (#4098)
-* [CodeGen] Disable -mfloat-abi hard option for LLVM < 6.0 (#4071)
-* Add a python how to example of deploying tvm module with tvm runtime only (#4094)
-* Hide symbols from dependent libraries if `HIDE_PRIVATE_SYMBOLS` is ON. (#4041)
-* [BUILD] Disable utvm standalone runtime by default (#4240)
-* Fix TSIM compile error in Linux (add missing -fPIC flag) (#3876)
-* Add scalafmt and format existing scala codebase (#3880)
-* Update TFLite wheel version to 1.13.1 (#3435)
-* Remove PEP498 f-string new feature for support python3.5 (#4250)
-* Require LLVM >= 9 for AMDGPU backend (#4253)
-* Rename ml.dmlc.tvm to org.apache.tvm (#4290)
-* [Test][TF][Relay] Fix argument preparation for vm test mode (#4296)
-* Add test for the `qnn_add` operator (#4282)
-* [CI][DOCKER] Add ONNX runtime dep (#4314)
-* [CI][DOCKER] Upgrade image to include onnx runtime (#4313)
-* [CI] Set workspace to be per executor (#4336)
-* [Build][Windows] Fix Windows build by including cctype (#4319)
-* [Contrib] Add MKL DNN option (#4323)
-* [Test][Relay][Pass] Add test case for lambda lift (#4317)
-* Remove Python imp module as it is deprecated (#4275)
-* Bump up CUDA log version in tophub.py (#4347)
-* Add rule for clean in APPs (#4364)
-* [Relay tests] Temporary Attr Update for Order-Independent Testing (#4357)
-* [CI] Avoid content-length request in test data download (#4375)
-* Compare all outputs in TFLite `test_forward_ssd_mobilenet_v1` (#4373)
-
-### Bug Fixes
-* [RELAY] Fix `get_int_tuple`. (#2691)
-* [ARITH] Select support for integer set analysis. (#2687)
-* [Relay] Fix error in ANF (too aggressively inline atomic expression and create free variable). (#2665)
-* [Hybrid Script] Fix name conflict and attached scope problem. (#2649)
-* [Relay] Fix ANF for reference and pattern matching. (#2637)
-* [Relay] Fix fusion bug when call symbol that is not an operator. (#2630)
-* Fix missing <sstream> header file. (#2629)
-* [Relay]Fix the bug in heterogeneous annotation which mistakenly steps into the fused op. (#2622)
-* [AutoTVM] Fix incorrect localhost usage in RPC mode. (#2619)
-* [NNVM] Fix incorrectly getting layout attribute as a tuple. (#2610)
-* [Relay] Fix mutating IF expression. (#2601)
-* [Tutorial] Fix downloaded file path. (#2590)
-* [Storage] Fix int32 overflow bug when input is big. (#2580)
-* [NNVM] Fix non-identity problem for FInplaceIdentity. (#2572)
-* [Golang] Fix compilation error. (#2558)
-* [Tensor Expression] Fix missing reduction init predicates. (#2495)
-* [Relay] Fix missing argument for NCHWc in Relay. (#2627)
-* [TOPI] Fix `Nms_ir` data race. (#2600)
-* Fix `compute_inline` with multiple outputs (#2934)
-* [TEXPR][PASS] Fix thread all reduce to avoid write after read hazard (#2937)
-* [FRONTEND][TENSORFLOW] bug fix for tensorflow official slim models. (#2864)
-* [FRONTEND][ONNX] Some bug fixes and Shape operator fixed for relay. (#2850)
-* Turn on `USE_SORT` by default (#2916)
-* [DOCKER] Upgrade ci-cpu to latest v0.50 (#2901)
-* [TESTS] Import script robustness (set -u) (#2896)
-* [Relay] Fix name of bias in testing.mlp (#2892)
-* [TESTS] Improve script robustness (#2893)
-* Add dense schedules to `__init__` for cpu (#2855)
-* [Apps] [howto_deploy] fix cxx-flags order and build directory (#2888)
-* [Relay] Add TVM_DLL for ANF/GNF conversion #2883
-* [Relay] Fix Relay ARM CPU depthwise spatial pack schedule alter op layout issue. (#2861)
-* Fix setting up hints for getaddrinfo (#2872)
-* Add missing sgx includes (#2878)
-* Fix error reporting for missing axis (#2835)
-* Fix an OrderDict initialization bug. (#2862)
-* Fix Xcode 10 metal compile error (#2836)
-* tvmrpc: Fix includes (#2825)
-* Fix `init_proj.py`: Team ID expected (#2824)
-* [DOCKER] Fix git clone failure. (#2816)
-* upgrade java style-check due to CVE-2019-9658 (#2817)
-* [Relay][Quantization] Fix duplicated simulated quantization (#2803)
-* [Bugfix] Repeat and tile bug fixed, relay tests added (#2804)
-* Fix caffe2 relay frontend (#2733)
-* Fix a bug in nnvm to relay converter. (#2756)
-* Ensure loop count is a constant before trying to unroll. (#2797)
-* xcode.py: Decode bytes before output #2833
-* [WIN] Fix a bug in `find_llvm` when specify llvm-config (#2758)
-* [DLPACK] fix flaky ctypes support (#2759)
-* [Bugfix][Relay][Frontend] Fix bug in mxnet converter for `slick_like` (#2744)
-* [DOCS] Fix tutorial (#2724)
-* [TOPI][Relay] Fix default `out_dtype` for `conv2d_NCHWc` and Relay (#2702)
-* [Relay] fix checkwellform (#2705)
-* fix prelu, now can use on 2d input and add one test (#2875)
-* [CODEGEN][OPENCL] Fix compile error about ternary expression. (#2821)
-* Fix Placeholder issue (#2834)
-* Fix makedirs() condition in contrib (#2942)
-* Add missing #!/bin/bash directive (#2951)
-* Bilinear resize bug fix from PR #2777 (#2857)
-* Fix `bias_add` default axis (#2829)
-* Remove empty ty.rs (#2958)
-* fix undefined reference to dlopen, etc (#2957)
-* Removed deprecated `std::unary_function` (#2962)
-* Add output format to ndk build func (#2999)
-* Fix java checkstyle version (#2998)
-* Fix relay invariant error message (#3011)
-* Fix for caffe2 nnvm frontend (#2996)
-* Fix rust resnet example (#3000)
-* Fix x||!x for comparisons in rewrite simplifier (#3029)
-* Fix BatchMatMulRel typerelation (#3032)
-* Update dmlc-core, fix default ctors of NodeEntry (#3017)
-* Fix Fuse (#3035)
-* Fix PostOrderVisit signature (#3048)
-* Fix winograd nnpack fp16 (#3046)
-* Fix some typos (#3063, #3112)
-* Fix `group_conv2d` unit test (#3113)
-* Fix bug in ONNX importer (#3084)
-* Fixing a doc nit (#3123)
-* Fix type code error for StringImm (#3050)
-* Fix bug of wrongly generated `device_map` (#2990)
-* use `unordered_map` instead of map in ANF (#3024)
-* Fix PRelu layout in Relay (#3013)
-* Minor addition to graph runtime debug (#3129)
-* Fix mali conv2d performance regression (#3131)
-* Fix dense autotvm template registration in ROCm (#3136)
-* Fix `conv2d_transpose` (#3138)
-* Fix python lint warnings (#3145)
-* Some fixes for golang latest version compiler #3119 (#3182)
-* Add more syncs to fix flaky test caused by `get_valid_counts` (#3151)
-* Fix AlterLayout Pass (#3155)
-* Fix a multithreaded bug in llvm LazyInitJIT (#3158)
-* Fix a tensorflow test bug. (#3165)
-* Fix concat for ARM (#3061)
-* Handle vectorize for LE statement (#3137)
-* Raise exception `group_conv2d_nchw` not supported (#3195)
-* Quick fix of VTA FPGA Toolchain Installation documentation (#3196)
-* Check file exists before removing it (#3178)
-* Fix a bug of flatten in ONNX to Relay converter (#3180)
-* Fix converter where initializers were not registered as nodes (#3143)
-* Fix bug in cast to bool (#3207)
-* Hotfix `build_module` creation (#3198)
-* Fix sort changing original input data issue (#3212)
-* Fix bug in vta runtime DepPop function (#3208)
-* Fix resize nearest with fractional scaling (#3244)
-* Fix `vta_conv2d` crash issue after change `vta_config.json` (#3213)
-* Fix a memory leak in OpManager (#3263)
-* PkgConfig cause crash in PYNQ board due to link library (#3257)
-* Fix Error messages in tflite.py (#3320)
-* Fix typos in docs and comments (#3309, #3376)
-* Bugfix min/max const canonicalize rule (#3386)
-* Return module from frontend for autotvm (#3401)
-* Fix constant and reshape in ONNX (#3387)
-* Default verilator location fix (#3324)
-* Fix autodiff for conditional expression (#3453)
-* Grammatical improvements to `tensor_expr_get_started` (#3330)
-* Fix AutoTVM data structure bug (#3462)
-* Fix MXNet RNN without providing state initialization as input (#3326)
-* Fix flaky test on topk and quantize pass (#3362)
-* Add VTA PYNQ `metal_test` bitstream program logic and fix compilation issue. (#3400)
-* Fix VTA function Vivado Compile Error. (#3375)
-* Fix VTA DRAM functionality issue. (#3278)
-* Fix reshape precompute and type error in ONNX frontend (#3230)
-* Fix interpreter argument conversion for tuples. (#3349)
-* Fix code generation for packed functions + tuples in VM (#3287)
-* Fix memory leak in Relay interpreter (#3448)
-* Fix x86 depthwise conv2d `alter_op_layout` (#3264)
-* Create closure object for GlobalVar (#3411)
-* Fix getting global var in prelude (#3405)
-* Fix rfactor bugs which related to predicate and loop partition (#3382, #3444)
-* Fix the bug in AutoTVM where SimulatedAnnealingOptimizer sometimes finds useless candidate (#3413)
-* Fix name conflict in PartialEval (#3402)
-* Fix int bound analysis bug for modular (#3288)
-* Check arg positiveness for modular rules (#3279)
-* Fixes failure of `sum` and `all` on `axis=0` (#3422)
-* Fix package path in tflite test (#3427)
-* Fix Windows build (#3429)
-* Fix `LSTMBlockCell` in Tensorflow frontend (#3410)
-* TF fix where output index is ignored (#3622)
-* Runtime fix for custom datatypes (#3471)
-* Relay build module warnings (#3452)
-* Relay partial evaluator (#3482)
-* Pynq AutoTVM tracker (#3497, #3578)
-* A normal form test (#3525)
-* Lint issue (#3519, #3615 )
-* Any shape testing (#3528)
-* Android `posix_memalign` (#3532)
-* Quantization `add_rewrite` and UnifyDTypeScale (#3534)
-* Bound inference fix (#3526)
-* Tensorflow NCHW data format (#3514)
-* First order gradient (#3550)
-* JS load module example (#3556)
-* Build error (#3552)
-* Relay VM debug statements (#3565)
-* C++ lambda expr (#3570)
-* Handling of tempdir if subprocess is killed (#3574)
-* Remove tabs in Chisel source (#3603)
-* Relay VM DataTypeObject (#3604)
-* Removing prints (#3616)
-* Average Pool2D Bug (#3607)
-* Missing header in `cuda_device_api.cc` (#3621)
-* Tensorflow frontend fix where `output_shape` is None (#3632)
-* Winograd accuracy fix (#3644)
-* Fix comment (#3646)
-* Zero-input op fix for recursive traversals (#3623)
-* Python 3.5 compatibility (#3675)
-* Fix infinite recursive `device_api.ext_dev` call in VTA. (#3843)
-* Fix `depth_mult` for TensorFlow frontend (#3676)
-* Fix database APIs for AutoTVM (#3821)
-* Fix axis of softmax in Keras (#3834)
-* Fix VTA TensorLoad module (#3841)
-* Fix inconsistent python/cpp API behavior for `if_then_else`, power (#3829)
-* Fix code comment of operators in ONNX frontend (#3830)
-* Added repo for llvm-9 to fix missing dependency issue (#3826)
-* Fix typo in Relay text parser (#3785)
-* Fix tvm const warnings (#3817)
-* Add gfx906 bc (#3808)
-* Fixed onnx test failures when run on a cpu backend (#3764)
-* Fix ArgBinder assert order (#3794)
-* Fix for NoneType Target for quantization (#3792)
-* Fix out-of-date quantization realize (#3790)
-* Fix Qnn concatenate InferType (#3779)
-* Fix dense tuning (#3768)
-* Fix `visit_pattern` in ExprMutator (#3769)
-* Fix Chisel Scala style (#3765)
-* Fix some pass docs (#3767)
-* Fix mistype in rpc tutorial (#3763)
-* Fix tvm.scan follow by tvm.compute segfault (#3723)
-* Fix the potential index overflow in where operator (#3751)
-* Revert `compile_cmd` kwarg name change (#3746)
-* Update tophub (#3752)
-* Fix typo in `ir_pass.h` (#3741)
-* Bug fix for VME Shell (#3737)
-* Fix missing apt https transport support (#3735)
-* Take zero extent loops as NoOp and remove it (#3724)
-* Fix mxnet converter for hybridblock and add `div_sqrt_dim` (#3701)
-* Fix partial eval unit test name (#3719)
-* Fix conv2d schedule code (#3648, #3717)
-* Remove thread related headers (#3713)
-* Fix FunctionPass (#3712)
-* Export tvm::relay::OpRegistry::OpRegistry (#3711)
-* Fix Metal reinterpret (#3706)
-* Fix `gather_nd` in Relay (#3442)
-* Fix error in partial evaluator (#3693)
-* Align the naming rule for OpAttributeUnImplemented (#3695)
-* Enable the sparse schedule (#3651)
-* Fix typo names in Caffe2 frontend (#3685)
-* Make tests multi-process friendly. (#3683)
-* Fix typo in README.md (#3684)
-* Fix doc rendering  (#3897)
-* Add test script starter command to document (#3993)
-* Add type solver unit tests for unifying quantified funcs (#3947)
-* Change Vivado install instructions to version 2018.3 (#4003)
-* Add a link to the defining network description of auto-tuning tutorial (#4023)
-* Additional MXNet Convolution and Deconvolution tests (#4026)
-* Adding support to check if an attribute is present or not without having to get the value (#3957)
-* Fix parser for cast. (#3873)
-* Fix operator fusion for multiple output (#3871)
-* Remove extern C warpper for cuBLAS (#3877)
-* Fix int32 range overflow by using int64 (#3870)
-* Remove duplicate resize (#3902)
-* Fix blas cmake for mac os (#3898)
-* Add another MKL name alias for MKL installed through pypi (#3853)
-* Numpy compatible dtype inference for `tvm.convert` and `tvm.const` (#3861)
-* Remove incorrect check for LLVM in C codegen test (#3921)
-* Fix exponential blowup in interpreter (#3559)
-* Fix CUDA int8x4 vectorize (#3928)
-* Make buffer auto broadcast independent to the order of input args (#3956)
-* Fix benchmark layout in graph tuner (#3926)
-* Fix Android Demo LLVM version (#3962)
-* Cast filepath arguments to string (#3968)
-* Fixes "common" sub crate using nightly and main (#3965)
-* Changes to make tensorize work. These changes also fix the previously broken test. (#3981)
-* Remove FLOP computation when calling 3rd party library (#4005)
-* Use a more intuitive way to limit the #ops in a group (#4018)
-* Add more `pad_mode` support for onnx converter (#4029)
-* Impose a max op limit to the op fusion pass (#4002)
-* Fixes issue with CPP enums (#4019)
-* Int64 shape handling for outputs. (#4031)
-* [PYTHON] Fix installation for generated grammar (#4223)
-* [Bugfix] Fix target host for vm compiler (#4057)
-* [Fix][VM] Fix VM invoke with `set_params` (#4079)
-* [Fix] Fix a few bugs when dtype is fp16 (#4088)
-* [Relay][Frontend][TF] Fix Size operator (#4175)
-* [cmake][ANTLR] Support setting path to ANTLR jar (#4176)
-* Fix infer type of kernel in dense. (#4125)
-* [Relay] Fix match case in Python-side expr functor (#4037)
-* Split `adaptive_pool2d_avg` into sum and div (#4186)
-* [AutoTVM] Fix Split Factors when `no_tail` is off (#4044)
-* Fix extent one for the `post_stmt` in loop partition (#3734)
-* [TOPI] Fix bug in intel graphics auto tune (#4093)
-* [ARITH] Fix lowering of `floormod(x, y) != 0` (#4127)
-* [ARITH] Fix the rule `y < x && x <= y` (#4220)
-* [Bugfix][TF] reset graph after getting tag of savedmodel (#4055)
-* [Fix] Fix the logic of the number of nodes checking in op fusion (#4074)
-* [VTA] hotfix for de10-nano driver (#4081)
-* Fixing tensor not found issue in bitserial operator (#4095)
-* Fix wrong `n_trial` number in autotvm tutorials' progress bar if `n_trial` is larger then config space. (#4070)
-* [PATCH] Fix undefined `__floatdihf` in libtvmruntime.so on aarch64. (#4119)
-* [ARITH] Fix lowering of FloorMod (#4236)
-* [Relay][Frontend][Tensorflow] Fix GatherV2 (#4238)
-* Fix typing.Deque import error for Python 3.5 (#4254)
-* [VTA] Hotfix for padded load test in Chisel VTA (#4264)
-* [Contrib] Fix error message at `callback_get_section_size()` (#4221)
-* [TOPI] Fix bug in Winograd on CUDA (#4260)
-* AutoTVM: Fix hang/crash issues on feature extraction (#3689)
-* [TOPI][CUDA] Fix Winograd Kernel Size Support (#4276)
-* [Relay][Frontend][Tensorflow] Fix type assignment for 'tf.range' operator (#4294)
-* Fix incorrect call to Unicode Win32 InetPton (#4306)
-* [Relay][Frontend][Keras] handle `batch_norm` op params well (#4310)
-* [VTA] fix error when `memory_id` is `VTA_MEM_ID_OUT` (#4330)
-* [Doc][fix] fix sphinx parsing for pass infra tutorial (#4337)
-* [Codegen] remove fp16 function override for cuda (#4331)
-* [TFLite] Fix Prelu unified shape error (#4326)
-* [Relay][Frontend][TF] Fix transpose when axes is not a param (#4327)
-* [VTA] Bug fix for padded load with large inputs (#4293)
-* Fix inconsistent operator tag name (#4134)
-* Fix for a specific case when loop partitioning with indivisble. (#4243)
-* Send list as argument to `schedule_conv2d` (#4358)
-* [Docker] Fix TVM folder name for installing on Android and OpenCL. (#4363)
-* Fix TFLite Reshape assert (#4320)
-* [Relay][Frontend][TF] Fix slice when begin or size is not Const (#4372)
-* Fix compilaton of bfloat16 on Windows (#4415)
-
-### Known Issues
-
-* The performance of Relay VM is not good enough on GPU, due to memeory allocation overhead which will be resolved later.
-* TFlite rounding vs tvm rounding causing differences in accuracy and potentially off by 1 errors. For reference #3900
-* TFlite pre-quantized network support is still a work in progress and the project would welcome further contributions.
-* TSIM build requires `python` command exist on the host. See [forum discussion](https://discuss.tvm.ai/t/vta-build-failure/4790) for details.
-* Tensorflow control flow has not been fully supported in the frontend converter.
-* `topi.floor_div` is inconsistent with floor division semantic when result number is close to an integer.
-
-
-### Depreciations
-* Deprecating python2 support and following release (v0.6). (#2994, #2986)
-* NNVM is deprecated and will be removed in a future version. (#4333, #4368)
-
-
-## 0.5
-This release features several major improvements. Some of the highlights are: Arbitrary bits quantization algorithm; High-level auto-differentiable programming IR -- Relay.
-
-- Fully featured 8-bit network support
-  - 8bit quantizer
-  - Arbitrary bits quantization algorithm
-  - Intel cpu support
-  - ARM cpu support
-- NVidia GPU 8-bit kernel
-  - int8 gemm recipe
-  - int8 conv2d
-  - Autotvm integration
-- Automated tuning and scheduling
-  - AutoTVM optimizations for mobile GPUs
-  - AutoTVM optimizations for CUDA
-  - AutoTVM optimizations for x86
-- Initial release of the differentiable programming IR, Relay
-  - Generic & informative Relay error reporting #2408
-  - Relay IR text format support #1781
-  - Support control flows
-  - A Normal Form Canonicalization #2251
-  - Type system support
-  - End to end compilation
-     * Frontend support: Caffe2 #2507 , CoreML #2476 , Keras #2376 , MXNet #2163 , ONNX, TFLite #2365
-     * Operator coverage #1799 #2051
-  - FoldScaleAxis #2020
-  - SimplifyInference #2033
-  - CombineParallelConv2D #2089
-  - InstrumentBoundCheckers pass #2079
-  - Bind & FoldConstant #2100
-  - Alter Op Layout #2150
-  - General OpFusion #2090
-- CodeGen
-  - Gcc / g++ compatible C code generator for TVM #2161
-  - Device type annotation for heterogeneous compilation #2361
-  - Cache packed func ptr, lift alloca #2070
-  - Generalize compute to tensor region #1476
-- Runtime
-  - Relay interpreter and compiler #1954
-  - Heterogeneous runtime #1695
-  - Language bindings: Golang runtime #1470 , Rust runtime #1597
-  - Add min_repeat_ms to time_evaluator #2200
-  - Bundled interpreter demonstration #2297
-  - Enable PlanMemory in the graph runtime #2120
-- Language Binding
-  - Rust frontend #2292
-- VTA
-  - Improved RPC for VTA #2043
-- Hybrid python programming model
-  - Support for scheduling #2416
-  - Support for Inter-function call  #2287
-  - Backend support  #2477
-- TOPI
-  - Initial support for sparse tensor computation
-  - Improve ARM CPU depthwise convolution performance #2345
-  - Port winograd ops to relay #2356
-  - Add faster-rcnn proposal op #2420
-- Tutorials and docs
-  - Relay language docs #2232
-  - Tutorials on how to use SGX backend
-  - How to write a pass in python
-  - General lowering flow of TVM
-  - How to do tensorize
-  - TFLite frontend tutorial #2508
-  - Keras seq2seq model for translation tutorial #1815
-  - Committer guide and tips #2468
-  - Code review guideline on API designs #2459
-
-
-
-## 0.4
-
-This release features several major improvements. The high-level graph optimizer is now part of TVM repo. Some of the highlights are: Initial support of AutoTVM for automated optimization; customized accelerator backend VTA.
-
-- Tensor operator primitives
-  - Introduce attrs field to operator primitives(e.g. compute) to store additional metadata, the attrs can be used as hint for scheduling
-- Enable embedding of asm micro-kernels
-- Hybrid python programming model
-   - python AST based IR builder interface
-   - support GPU programs
-- AutoTVM, Automated tuning, and scheduling
-   - basic autotvm infra
-    - GPU IR verifier
-   - basic autotuning tutorial
-   - topi integration
-- ARM support
-    - winograd support
-   - initial support of ARM autotuning records
-- TOPI Vision
-   - Generic GPU sort support(useful for vision)
-   - SSD operator support
-- TOPI numpy consistency
-   - Rename all binary operators for numpy consistecy: broadcast_add-> add, broadcast_sub -> substract, broadcast_mul -> multiply, broadcast_div->divide
-   - New operators: slice, LRN, equal, not_equal, less, greater
-   - tutorials on topi
-- Initial low-bit operator support support
-    - Optimized popcount generation on ARM
-    - general bit-serial convolution and GEMM
-    - optimized low bit kernels
-    - parallel optimization
-- New topi backend optimization for intel graphics
-- Adapt AVX schedules for SSE target
-- VTA: customized accelerator backend
-  - custom hardware backend example
-  - tutorials on how to use customized accelerator
-- Initial experimental support for  HLS backend
-- Bugfix in SPIRV code generator for vulkan
-- libdevice support, enable NVPTX backend
-- Introduce NDArrayContainer for managed NDarray
-- RPC and Device API
-   - Support communication between big/small endian machines.
-   - RPC and device API protocol upgrade (this is a non-backward compatible change) to support big-small endian communication. This is a non-backward compatible change, need to use the latest version of TVM runtime with the RPC
-   - graduate rpc from contrib, tvm.contrib.rpc->tvm.rpc
-   -Support tracker in Android RPC, add fault tolerance for AutoTVM
-- BIG.LITTLE aware threadpool
-- tvm4j graph runtime that runs end to end workload in java
-- DLPack support
-   - Support from_dlpack and to_dlpack
-   - Enables bridges to pytorch
-- Enable link of stackvm in runtime
-- Tensorflow graphdef frontend
-- Keras frontend
-   - improved to support reuse layers, add activations
-- ONNX
-   - gather,  LRN
-- CoreML frontend
-   - Support C-RNN and activation functions
-- Fix grads for sum and expand_like
-- Enhanced operator fusion for multiple elemwise branches
-- Separate nnvm fusion and compilation pass
-- Unified build system to cmake, customizable cmake path for vulkan, rocm, cuda
-
-
-## 0.3
-
-This release features numerous improvements in TOPI and backends. We make the first step toward object detection support in TOPI, featuring operators necessary for YOLO and SSDs. The topi now supports numpy-style API and operator overloading. RPC is significantly improved to support resource allocation and using a pool of devices. We are adding two new backends: WebGL for running GPUs on the browser, and Vulkan for running on next-generation graphics API.
-
-- TOPI Vision operators
-   - SSD support
-   - YOLO support
-   - NMS operator support in vision
-- TOPI general numpy-style operators
-   - numpy style operator overload in topi
-   - more operators: flip, take
-   - dilation support on conv2d and depthwise
-- 8bit support
-    - ARM 8bit gemm
-    - ARM 8bit conv
-- Low bit operator support
-    - popcount intrinsics
-    - 1-bit fully connected
-- Contrib: MPSDNN fully-connected and conv2d support
-- Better RPC support
-   - RPC Tracker support to allow centralized resource management
-   - RPC protocol upgrade (this is a non-backward compatible change) to support timeout in the proxy
-     - This is a breaking change, need to use the latest version of TVM runtime with the RPC
-   - Fault-tolerant to early server termination with correct exception propagated
-   - RPC support enabled for ROCm AMDGPUs
-- Tutorials and docs
-  - How to deploy to android devices.
-- Optimizations for hardware backends
-  - intel CPU (AVX and AVX512)
-- Schedule Primitives
-   - rfactor now support factor_axis to specify the factored dimension in the result
-   - cache_write now support multiple output operators
-   - enable warp memory which generates shuffle instructions
-- Framework bridge
-  - MXNet bridge supported
-- C++ compiler API support
-   - build migration
-   - topi migration to c++
-   - Target system in c++
-- WebGL backend
-   - runtime and codegen
-   - topi integration
-   - end to end pipeline on the browser
-- Vulkan backend
-   - vulkan runtime
-   - spirv code generator
-- Security
-    - intel SGX runtime support
-    - multi-threaded SGX runtime
-- LLVM 7.0 support
-- Robustness
-   - VerifyMemory to verify incorrect GPU schedules that writes into GPU memory from cpu
-   - Verify compute formulas
-- Better CPU parallel runtime
-
-## 0.2
-
-This release comes with a complete set of TOPI support for NNVM compiler, which allows compilation of end to end workloads.
-We also make major improvements in supporting new backends: ROCm for AMDGPUs and ARM GPU.
-
-- Backend support
-   - Support LLVM mainline(4.0, 5.0, 6.0)
-   - Support ROCM stack for AMD GPUs
-   - More robust OpenCL support for ARM GPUs
-- Android RPC runtime
-- Multi-threading optimization for ARM
-   - multi-threaded depthwise
-   - multi-threaded conv2d
-- New schedule primitives
-   - storage_align for shared memory alignment
-   - double_buffer
-- UnrollLoop : more robust version of unroll loop, count maximum steps that can be unrolled.
-- Full set of TOPI operators
-   - Introduce tvm.target to specify target options for compilation better.
-   - broadcast/ reduction operators
-   - pooling and global pooling
-   - Generic target support for topi
-   - schedule with external libraries
-- End to end deep learning pipelines for CPU, GPU, ARM GPU
-- Tutorials
-  - How to load compiled module in any language runtime
-  -  How to use java runtime
-- Contrib library: MIOpen, CuDNN
-- Ongoing items that contains functioning pieces
-  - WebGL backend
-  - C++ compiler support
-  - MPS DNN
-  - low bit support, introduced popcount
-
-
-## 0.1
-
-- Language runtime
-    - python
-    - javascript
-    - java
-    - c++
-- Backend
-    - arm, x86
-    - javascript, wasm
-    - CUDA
-    - opencl
-    - Metal
-- DNN Library integration
-- RPC  runtime
-- TOPI operator pipeline python
-- TOPI operator pipeline in C++
-- Rough perf of the TOPI GPU pipeline
-- Rough pref of TOPI CPU pipeline
-- End to end graph executors
-
-
-## Initial version
-
-- Pack libary into shared library.
-- External function and contrib libraries
-- DLPack integration support
-- AOT and module system
-- Basic code structure ready.
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 17f587efd4c7..b552cd1187d2 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -329,9 +329,6 @@ set(USE_MRVL OFF)
 # Whether to use QHL library
 set(USE_HEXAGON_QHL OFF)
 
-# Whether to use ONNX codegen
-set(USE_TARGET_ONNX OFF)
-
 # Whether enable BNNS runtime
 set(USE_BNNS OFF)
 
diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake
index 3881247ae106..b94bb42b2550 100644
--- a/cmake/modules/LibInfo.cmake
+++ b/cmake/modules/LibInfo.cmake
@@ -114,7 +114,6 @@ function(add_lib_info src_file)
     TVM_INFO_USE_SORT="${USE_SORT}"
     TVM_INFO_USE_SPIRV_KHR_INTEGER_DOT_PRODUCT="${USE_SPIRV_KHR_INTEGER_DOT_PRODUCT}"
     TVM_INFO_USE_STACKVM_RUNTIME="${USE_STACKVM_RUNTIME}"
-    TVM_INFO_USE_TARGET_ONNX="${USE_TARGET_ONNX}"
     TVM_INFO_USE_TENSORFLOW_PATH="${USE_TENSORFLOW_PATH}"
     TVM_INFO_USE_TENSORRT_CODEGEN="${USE_TENSORRT_CODEGEN}"
     TVM_INFO_USE_TENSORRT_RUNTIME="${USE_TENSORRT_RUNTIME}"
diff --git a/cmake/modules/contrib/ArmComputeLib.cmake b/cmake/modules/contrib/ArmComputeLib.cmake
index 83d5e2da1dfe..681a425ea42e 100644
--- a/cmake/modules/contrib/ArmComputeLib.cmake
+++ b/cmake/modules/contrib/ArmComputeLib.cmake
@@ -20,9 +20,7 @@
 # which is common with arm devices, we need to be able to cross-compile
 # a relay graph on x86 for AArch and then run the graph on AArch.
 if(USE_ARM_COMPUTE_LIB)
-    tvm_file_glob(GLOB ACL_RELAY_CONTRIB_SRC src/relay/backend/contrib/arm_compute_lib/*.cc)
     tvm_file_glob(GLOB ACL_RUNTIME_MODULE src/runtime/contrib/arm_compute_lib/acl_runtime.cc)
-    list(APPEND COMPILER_SRCS ${ACL_RELAY_CONTRIB_SRC})
 
     if(NOT USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR)
         list(APPEND COMPILER_SRCS ${ACL_RUNTIME_MODULE})
diff --git a/cmake/modules/contrib/ONNX.cmake b/cmake/modules/contrib/ONNX.cmake
deleted file mode 100644
index 55405f33ca04..000000000000
--- a/cmake/modules/contrib/ONNX.cmake
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-if(USE_TARGET_ONNX)
-  message(STATUS "Build with contrib.codegen_onnx")
-  tvm_file_glob(GLOB ONNX_CONTRIB_SRC src/runtime/contrib/onnx/onnx_module.cc)
-  list(APPEND RUNTIME_SRCS ${ONNX_CONTRIB_SRC})
-endif(USE_TARGET_ONNX)
diff --git a/conftest.py b/conftest.py
index 88e21f494113..81c1f90be45d 100644
--- a/conftest.py
+++ b/conftest.py
@@ -30,22 +30,7 @@
 # that should be allocated to test shards in a round-robin fashion. These are
 # taken from the 20 (arbitrary number) of tests as from
 # https://ci.tlcpack.ai/job/tvm/job/main/2907/testReport
-_slowest_tests = [
-    "tests/python/topi/python/test_topi_conv2d_int8.py::test_conv2d_nchw[int8]",
-    "tests/python/topi/python/test_topi_conv2d_int8.py::test_conv2d_nchw[uint8]",
-    "tests/python/topi/python/test_topi_upsampling.py::test_upsampling3d",
-    "tests/python/topi/python/test_topi_upsampling.py::test_upsampling3d",
-    "tests/python/topi/python/test_topi_conv2d_int8.py::test_conv2d_nchw[int8]",
-    "tests/python/topi/python/test_topi_conv2d_int8.py::test_conv2d_nchw[uint8]",
-    "tests/python/topi/python/test_topi_conv2d_NCHWc.py::test_conv2d_NCHWc",
-    "tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py::test_conv2d_hwnc_tensorcore",
-    "tests/python/contrib/test_tensorrt.py::test_binary[compile]",
-    "tests/python/topi/python/test_topi_conv2d_NCHWc.py::test_conv2d_NCHWc",
-    "tests/python/relay/test_py_converter.py::test_global_recursion",
-    "tests/python/relay/test_op_level6.py::test_topk",
-    "tests/python/topi/python/test_topi_conv2d_winograd.py::test_conv2d_nchw",
-    "tests/python/relay/test_py_converter.py::test_global_recursion",
-]
+_slowest_tests = []
 HARDCODED_ALLOCATIONS = {}
 for idx, test in enumerate(_slowest_tests):
     HARDCODED_ALLOCATIONS[test] = idx
diff --git a/docs/arch/index.rst b/docs/arch/index.rst
index 717876d2db12..4df78f6c3cd1 100644
--- a/docs/arch/index.rst
+++ b/docs/arch/index.rst
@@ -170,7 +170,7 @@ Summary and Discussions
 
 In summary, the key data structures in the compilation flows are:
 
-- IRModule: contains relay.Function and tir.PrimFunc
+- IRModule: contains relax.Function and tir.PrimFunc
 - runtime.Module: contains runtime.PackedFunc
 
 Most parts of the compilation are transformations among the key data structures.
@@ -254,7 +254,7 @@ The components in `tvm/ir` are shared by `tvm/relay` and `tvm/tir`, notable ones
 - PassContext and Pass
 - Op
 
-Different variants of functions(e.g. relay.Function and tir.PrimFunc) can co-exist in an IRModule.
+Different variants of functions(e.g. relax.Function and tir.PrimFunc) can co-exist in an IRModule.
 While these variants may not have the same content representation, they use the same data structure to represent types.
 As a consequence, we use the same data structure to represent function (type) signatures of these variants.
 The unified type system allows one function variant to call another function
diff --git a/docs/arch/pass_infra.rst b/docs/arch/pass_infra.rst
index 1e320dceba7c..ae92874fc7eb 100644
--- a/docs/arch/pass_infra.rst
+++ b/docs/arch/pass_infra.rst
@@ -193,7 +193,7 @@ optimization passes, e.g., function-level passes, module-level passes, and
 sequential passes.  Each subclass itself could act as a pass manager. For
 instance, they could collect the required passes and execute them or build
 a dependency graph based on the given metadata. The full definition of them
-can be found in `src/relay/ir/transform.cc`_ and `src/ir/transform.cc`_.
+can be found in `src/ir/transform.cc`_.
 
 Module-Level Passes
 ^^^^^^^^^^^^^^^^^^^
@@ -267,12 +267,6 @@ of passes for execution.
       Module operator()(const Module& mod, const PassContext& pass_ctx) const final;
     };
 
-Only a few passes currently in Relay are put in this group. For example,
-``FoldScaleAxis`` requires to dispatch ``ForwardFoldScaleAxis`` and
-``BackwardFoldScaleAxis`` internally. In addition, ``BackwardFoldScaleAxis`` is
-recommended to be fulfilled first. This pass, hence, is an ideal candidate for
-``SequentialPass``.
-
 The following code shows how individual passes in a sequential pass are invoked.
 Essentially, we sequentially execute each pass in a sequential pass using the
 order that they were appended to the pass list.
@@ -311,7 +305,7 @@ pass is registered with an API endpoint as we will show later.
 
     Pass GetPass(const std::string& pass_name) {
       using tvm::runtime::Registry;
-      std::string fpass_name = "relay._transform." + pass_name;
+      std::string fpass_name = "relax.transform." + pass_name;
       const auto* f = Registry::Get(fpass_name);
       ICHECK(f != nullptr) << "Cannot find " << fpass_name
                           << "to create the pass " << pass_name;
@@ -350,8 +344,8 @@ Pass Registration
 We've covered the concept of different level of passes and the context used for
 compilation. It would be interesting to see how easily users can register
 a pass.  Let's take const folding as an example. This pass has already been
-implemented to fold constants in a Relay function (found in
-`src/relay/transforms/fold_constant.cc`_).
+implemented to fold constants in a Relax function (found in
+`src/relax/transforms/fold_constant.cc`_).
 
 An API was provided to perform the ``Expr`` to ``Expr`` transformation.
 
@@ -368,7 +362,7 @@ indicates that no prerequisite is required for this pass. Otherwise, the pass
 developer has to identify and list them.
 
 Meanwhile, a pass API endpoint is registered with the name
-``relay._transform.FoldConstant``. This pass, therefore, becomes an entry in the
+``relax.transform.FoldConstant``. This pass, therefore, becomes an entry in the
 registry that can be accessed by both C++ (e.g. the ``GetPass`` above) and
 Python when needed.
 
@@ -378,19 +372,17 @@ Python when needed.
 
     Pass FoldConstant() {
       runtime::TypedPackedFunc<Function(Function, IRModule, PassContext)> pass_func =
-        [=](Function f, IRModule m, PassContext pc) {
-          return Downcast<Function>(FoldConstant(f));
-      };
-      return CreateFunctionPass(pass_func, 2, "FoldConstant", {});
+          [=](Function f, IRModule m, PassContext pc) { return ConstantFolder::Fold(f, m); };
+      return CreateFunctionPass(pass_func, 0, "FoldConstant", {});
     }
 
-    TVM_REGISTER_GLOBAL("relay._transform.FoldConstant")
+    TVM_REGISTER_GLOBAL("relax.transform.FoldConstant")
     .set_body_typed(FoldConstant);
 
     }  // namespace transform
 
 To allow other C++ modules to apply this pass, we declare a free function in
-`include/tvm/relay/transform.h`_ as the following:
+`include/tvm/relax/transform.h`_ as the following:
 
 .. code:: c++
 
@@ -543,7 +535,7 @@ Python Frontend
 
 Only some simple APIs are needed for the frontend side. For example, we can
 provide users the following APIs to create and execute a pass (full
-implementation is provided in `python/tvm/relay/transform/transform.py`_ and
+implementation is provided in `python/tvm/relax/transform/transform.py`_ and
 `python/tvm/ir/transform.py`_). The backend
 receives the information and decides which function it should use to create
 a Pass object.
@@ -586,96 +578,6 @@ loop unrolling pass
 
 Please refer to `src/tir/transforms/unroll_loop.cc`_ for more details.
 
-Pass Objects
-^^^^^^^^^^^^
-
-``Pass`` is the base class of all pass objects. All methods here are just simple
-wrappers that were implemented in the backend. They are defined for users to
-conveniently interact with the base class in Python. Only a ``__call__`` is
-defined in the pass base class to make the subclasses as callable objects so
-that they can be invoked easily (e.g., ``pass_xx(arg)``) for execution.
-
-.. code:: python
-
-    @register_relay_node
-    class Pass(RelayNode):
-       def __call__(self, mod):
-           return _transform.RunPass(self, mod)
-
-Some auxiliary APIs are provided to enable easy creation of passes from
-the Python frontend and to let the pass infra control the execution. For
-example, ``module_pass``, ``function_pass``, and ``sequential`` are provided to
-users so that they can customize their own pass or pass pipeline.
-
-For all the passes that are implemented in the C++ backend, we provide
-corresponding Python APIs in `python/tvm/ir/transform.py`_ and
-`python/tvm/relay/transform/transform.py`_, respectively. For instance,
-const folding has a Python API like the following:
-
-.. code:: python
-
-    def FoldConstant():
-        return _transform.FoldConstant()
-
-Users can build a pass through decoration like the following:
-
-.. code:: python
-
-    @relay.transform.module_pass(opt_level=2)
-    def transform(mod, ctx):
-       tp = relay.TensorType((10,), "float32")
-       x = relay.var("x", tp)
-       gv = relay.GlobalVar("abs")
-       func = relay.Function([x], relay.abs(x))
-       new_mod = tvm.IRModule({gv: func})
-       new_mod.update(mod)
-       return new_mod
-
-   module_pass = transform
-   assert isinstance(module_pass, transform.ModulePass)
-   assert module_pass.info.opt_level == 2
-
-The ``transform`` function here adds an ``abs`` function to the input module,
-but it could be any customized optimizations at the module level. After
-creating this ``module_pass``, users can apply it on any Relay module. For
-example, we can build an empty module and apply this pass to add an ``abs``
-function.
-
-.. code:: python
-
-    mod = tvm.IRModule()
-    mod = module_pass(mod)
-
-Correspondingly, we also offer such functionality for ``function_pass``. For
-instance, an example function-level pass could be written as the following:
-
-.. code:: python
-
-    @relay.transform.function_pass(opt_level=1)
-    class TestReplaceFunc:
-       def __init__(self, new_func):
-          self.new_func = new_func
-          def transform_function(self, func, mod, ctx):
-             # Just for demo purposes
-             # Transform func to new_func
-             return self.new_func
-
-    x = relay.var("x", shape=(10, 20))
-    f1 = relay.Function([x], x)
-    f2 = relay.Function([x], relay.log(x))
-    # fpass is now a special pass that replaces every
-    # function to f1
-    fpass = TestReplaceFunc(f1)
-    # Now every function in input_mod is replaced by f1
-    res_mod = fpass(input_mod)
-
-
-Alternatively, users can also directly register a pass without using the
-decorators and then invoke it. For more examples about how to customize your own
-optimization pipeline and debug Relay and tir passes, please refer to the
-`use pass infra`_ tutorial.
-
-
 .. _pass_instrument_py_frontend:
 
 Pass Instrument
@@ -741,17 +643,17 @@ new ``PassInstrument`` are called.
 
 .. _include/tvm/support/with.h: https://github.com/apache/tvm/blob/main/include/tvm/support/with.h
 
-.. _src/relay/ir/transform.cc: https://github.com/apache/tvm/blob/main/src/relay/ir/transform.cc
+.. _src/relax/ir/transform.cc: https://github.com/apache/tvm/blob/main/src/relax/ir/transform.cc
 
 .. _src/ir/transform.cc: https://github.com/apache/tvm/blob/main/src/ir/transform.cc
 
 .. _src/ir/instrument.cc: https://github.com/apache/tvm/blob/main/src/ir/instrument.cc
 
-.. _src/relay/transforms/fold_constant.cc: https://github.com/apache/tvm/blob/main/src/relay/transforms/fold_constant.cc
+.. _src/relax/transforms/fold_constant.cc: https://github.com/apache/tvm/blob/main/src/relax/transforms/fold_constant.cc
 
-.. _python/tvm/relay/transform/transform.py: https://github.com/apache/tvm/blob/main/python/tvm/relay/transform/transform.py
+.. _python/tvm/relax/transform/transform.py: https://github.com/apache/tvm/blob/main/python/tvm/relax/transform/transform.py
 
-.. _include/tvm/relay/transform.h: https://github.com/apache/tvm/blob/main/include/tvm/relay/transform.h
+.. _include/tvm/relax/transform.h: https://github.com/apache/tvm/blob/main/include/tvm/relax/transform.h
 
 .. _python/tvm/ir/transform.py: https://github.com/apache/tvm/blob/main/python/tvm/ir/transform.py
 
diff --git a/include/tvm/ir/affine_type.h b/include/tvm/ir/affine_type.h
deleted file mode 100644
index 5726e9eec1f0..000000000000
--- a/include/tvm/ir/affine_type.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file tvm/ir/affine_type.h
- * \brief Quantized Tensor Types.
- */
-#ifndef TVM_IR_AFFINE_TYPE_H_
-#define TVM_IR_AFFINE_TYPE_H_
-
-#include <tvm/ir/expr.h>
-#include <tvm/ir/type.h>
-
-namespace tvm {
-
-/*!
- * \brief AffineType representation
- * \sa AffineType
- */
-class AffineTypeNode : public Object {
- public:
-  /*!
-   * \brief Span that points to the original source code.
-   *        Reserved debug information.
-   */
-  mutable Span span;
-
-  static constexpr const char* _type_key = "AffineType";
-  static constexpr const bool _type_has_method_sequal_reduce = true;
-  static constexpr const bool _type_has_method_shash_reduce = true;
-  TVM_DECLARE_BASE_OBJECT_INFO(AffineTypeNode, Object);
-};
-
-/*!
- * \brief Managed reference to AffineTypeNode.
- * \sa AffineTypeNode
- */
-class AffineType : public ObjectRef {
- public:
-  TVM_DEFINE_OBJECT_REF_METHODS(AffineType, ObjectRef, AffineTypeNode);
-};
-
-/*!
- * \brief TensorAffineType representation
- * \sa TensorAffineType
- *
- *  This Type represents a quantized integer tensor that can be converted
- *  back to real space via the x_real = scale * (x_quant - zero_point)
- */
-class TensorAffineTypeNode : public AffineTypeNode {
- public:
-  /*! \brief The scale of this type */
-  RelayExpr scale;
-  /*! \brief The zero point of this type */
-  RelayExpr zero_point;
-  /*! \brief The data type of this type */
-  DataType dtype;
-  /*! \brief The axis for per-channel quantization */
-  int axis;
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    v->Visit("scale", &scale);
-    v->Visit("zero_point", &zero_point);
-    v->Visit("dtype", &dtype);
-    v->Visit("axis", &axis);
-  }
-
-  bool SEqualReduce(const TensorAffineTypeNode* other, SEqualReducer equal) const {
-    equal->MarkGraphNode();
-    return equal(scale, other->scale) && equal(zero_point, other->zero_point) &&
-           equal(dtype, other->dtype) && equal(axis, other->axis);
-  }
-
-  void SHashReduce(SHashReducer hash_reduce) const {
-    hash_reduce->MarkGraphNode();
-    hash_reduce(scale);
-    hash_reduce(zero_point);
-    hash_reduce(dtype);
-    hash_reduce(axis);
-  }
-
-  static constexpr const char* _type_key = "TensorAffineType";
-  TVM_DECLARE_BASE_OBJECT_INFO(TensorAffineTypeNode, AffineTypeNode);
-};
-
-/*!
- * \brief Managed reference to AffineTypes.
- * \sa AffineTypeNode
- */
-class TensorAffineType : public AffineType {
- public:
-  TVM_DLL TensorAffineType(RelayExpr scale, RelayExpr zero_point, DataType dtype, int axis);
-
-  TVM_DEFINE_OBJECT_REF_METHODS(TensorAffineType, AffineType, TensorAffineTypeNode);
-};
-
-/*!
- * \brief TupleAffineType representation
- * \sa TupleAffineType
- */
-class TupleAffineTypeNode : public AffineTypeNode {
- public:
-  /*! \brief The types of this tuple*/
-  Array<TensorAffineType> types;
-
-  void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("types", &types); }
-
-  bool SEqualReduce(const TupleAffineTypeNode* other, SEqualReducer equal) const {
-    equal->MarkGraphNode();
-    return equal(types, other->types);
-  }
-
-  void SHashReduce(SHashReducer hash_reduce) const {
-    hash_reduce->MarkGraphNode();
-    hash_reduce(types);
-  }
-
-  static constexpr const char* _type_key = "TupleAffineType";
-  TVM_DECLARE_BASE_OBJECT_INFO(TupleAffineTypeNode, AffineTypeNode);
-};
-
-/*!
- * \brief Managed reference to TupleAffineTypes.
- * \sa TupleAffineType
- */
-class TupleAffineType : public AffineType {
- public:
-  TVM_DLL TupleAffineType(Array<TensorAffineType> types);
-
-  TVM_DEFINE_OBJECT_REF_METHODS(TupleAffineType, AffineType, TupleAffineTypeNode);
-};
-
-}  // namespace tvm
-#endif  // TVM_IR_AFFINE_TYPE_H_
diff --git a/include/tvm/ir/expr.h b/include/tvm/ir/expr.h
index efde52385177..b3b4e8ab32fd 100644
--- a/include/tvm/ir/expr.h
+++ b/include/tvm/ir/expr.h
@@ -353,13 +353,13 @@ TVM_DLL PrimExpr operator~(PrimExpr a);
 /*!
  * \brief Base node of all non-primitive expressions.
  *
- * RelayExpr supports tensor types, functions and ADT as
- * first class citizens. The life-cycle of the corresponding
+ * RelaxExpr supports tensors and functions as first-class citizens.
+ * The life-cycle of the corresponding
  * objects are implicitly managed by the language.
  *
- * \sa RelayExpr
+ * \sa RelaxExpr
  */
-class RelayExprNode : public BaseExprNode {
+class RelaxExprNode : public BaseExprNode {
  public:
   /*!
    * \brief Stores the result of type inference(type checking).
@@ -393,55 +393,18 @@ class RelayExprNode : public BaseExprNode {
   template <typename TTypeNode>
   inline const TTypeNode* type_as() const;
 
-  /*!
-   * \brief The virtual device (VirtualDevice) for this node (the result of device planning).
-   * For first-order expressions (non functions), this describes where the result of evaluating the
-   * expression should be stored. Note that currently, all composite first-order values (tuples,
-   * references, ADTs) must be stored on the same virtual device. This means that it is not possible
-   * to store two tuple fields on different devices, so we only need one virtual device for these
-   * types.
-   *
-   * For expressions that have the function type, the virtual device describes where the result of
-   * the call to the function or closure is stored (instead of where the function itself is stored).
-   * For example, the virtual device of f = fn(x) { body } is the virtual device of f(y), not where
-   * the function itself is stored. Note that f(y)'s virtual device will be the same as the virtual
-   * device of body. For more details, see the documentation in
-   * src/relay/transforms/device_planner.cc.
-   *
-   * The VirtualDevice's Target field describes how the body of the function should be compiled.
-   *
-   * Set to VirtualDevice::FullyUnconstrained by default.
-   *
-   * \note Unfortunately, the type of virtual_device_ needs to be ObjectRef to avoid a circular
-   * import.
-   */
-  mutable ObjectRef virtual_device_;
-
-  /*!
-   * \return The virtual device (VirtualDevice).
-   * If the virtual device is not defined, returns VirtualDevice::FullyUnconstrained().
-   * Note that for function types, the virtual device is the device where the result of a
-   * call to the function is stored, not where the function itself lives.
-   * For example, the virtual device of f = fn(x) { body } is the virtual device of f(y), not where
-   * the function itself is stored. Note that f(y)'s virtual device will be the same as the virtual
-   * device of body.
-   *
-   * See the documentation of the virtual_device_ field (above) for more details.
-   */
-  VirtualDevice virtual_device() const;
-
-  static constexpr const char* _type_key = "RelayExpr";
+  static constexpr const char* _type_key = "RelaxExpr";
   static constexpr const uint32_t _type_child_slots = 22;
-  TVM_DECLARE_BASE_OBJECT_INFO(RelayExprNode, BaseExprNode);
+  TVM_DECLARE_BASE_OBJECT_INFO(RelaxExprNode, BaseExprNode);
 };
 
 /*!
- * \brief Managed reference to RelayExprNode.
- * \sa RelayExprNode
+ * \brief Managed reference to RelaxExprNode.
+ * \sa RelaxExprNode
  */
-class RelayExpr : public BaseExpr {
+class RelaxExpr : public BaseExpr {
  public:
-  TVM_DEFINE_OBJECT_REF_METHODS(RelayExpr, BaseExpr, RelayExprNode);
+  TVM_DEFINE_OBJECT_REF_METHODS(RelaxExpr, BaseExpr, RelaxExprNode);
 };
 
 class GlobalVar;
@@ -453,14 +416,13 @@ class GlobalVar;
  *
  * \sa GlobalVarNode
  */
-class GlobalVarNode : public RelayExprNode {
+class GlobalVarNode : public RelaxExprNode {
  public:
   /*! \brief The name of the variable, this only acts as a hint. */
   String name_hint;
 
   void VisitAttrs(AttrVisitor* v) {
     v->Visit("name_hint", &name_hint);
-    v->Visit("virtual_device_", &virtual_device_);
     v->Visit("span", &span);
     v->Visit("_checked_type_", &checked_type_);
     v->Visit("struct_info_", &struct_info_);
@@ -477,18 +439,18 @@ class GlobalVarNode : public RelayExprNode {
   }
 
   static constexpr const char* _type_key = "GlobalVar";
-  TVM_DECLARE_FINAL_OBJECT_INFO(GlobalVarNode, RelayExprNode);
+  TVM_DECLARE_FINAL_OBJECT_INFO(GlobalVarNode, RelaxExprNode);
 };
 
 /*!
  * \brief Managed reference to GlobalVarNode.
  * \sa GlobalVarNode
  */
-class GlobalVar : public RelayExpr {
+class GlobalVar : public RelaxExpr {
  public:
   TVM_DLL explicit GlobalVar(String name_hint, Type type = {}, Span span = {});
 
-  TVM_DEFINE_OBJECT_REF_METHODS(GlobalVar, RelayExpr, GlobalVarNode);
+  TVM_DEFINE_OBJECT_REF_METHODS(GlobalVar, RelaxExpr, GlobalVarNode);
   TVM_DEFINE_OBJECT_REF_COW_METHOD(GlobalVarNode);
 };
 
@@ -747,15 +709,15 @@ class Range : public ObjectRef {
 };
 
 // implementations
-inline const Type& RelayExprNode::checked_type() const {
+inline const Type& RelaxExprNode::checked_type() const {
   ICHECK(checked_type_.defined()) << "internal error: the type checker has "
                                   << "not populated the checked_type "
-                                  << "field for " << GetRef<RelayExpr>(this);
+                                  << "field for " << GetRef<RelaxExpr>(this);
   return this->checked_type_;
 }
 
 template <typename TTypeNode>
-inline const TTypeNode* RelayExprNode::type_as() const {
+inline const TTypeNode* RelaxExprNode::type_as() const {
   static_assert(std::is_base_of<TypeNode, TTypeNode>::value,
                 "TType must be a special case of type");
   ICHECK(checked_type_.defined())
@@ -872,7 +834,7 @@ struct PackedFuncValueConverter<tvm::FloatImm> {
  * integer arguments, instead of runtime::Int.  For backwards
  * compatibility where the callee has been updated to expected a
  * runtime::Int, the caller has not been updated to provide a
- * runtime::Int (e.g. relay script parsing), and the auto-unboxing of
+ * runtime::Int, and the auto-unboxing of
  * runtime::Int does not apply (e.g. making an `Array<runtime::Int>`),
  * allow the IntImm to be generated.
  */
diff --git a/include/tvm/ir/function.h b/include/tvm/ir/function.h
index 3845409968e7..2282cb979b5e 100644
--- a/include/tvm/ir/function.h
+++ b/include/tvm/ir/function.h
@@ -136,7 +136,7 @@ constexpr const char* kGlobalSymbol = "global_symbol";
  *
  * \sa BaseFunc
  */
-class BaseFuncNode : public RelayExprNode {
+class BaseFuncNode : public RelaxExprNode {
  public:
   /*! \brief Additional attributes storing the meta-data */
   DictAttrs attrs;
@@ -220,16 +220,16 @@ class BaseFuncNode : public RelayExprNode {
 
   static constexpr const char* _type_key = "BaseFunc";
   static constexpr const uint32_t _type_child_slots = 2;
-  TVM_DECLARE_BASE_OBJECT_INFO(BaseFuncNode, RelayExprNode);
+  TVM_DECLARE_BASE_OBJECT_INFO(BaseFuncNode, RelaxExprNode);
 };
 
 /*!
  * \brief Managed reference to BaseFuncNode.
  * \sa BaseFuncNode
  */
-class BaseFunc : public RelayExpr {
+class BaseFunc : public RelaxExpr {
  public:
-  TVM_DEFINE_OBJECT_REF_METHODS(BaseFunc, RelayExpr, BaseFuncNode);
+  TVM_DEFINE_OBJECT_REF_METHODS(BaseFunc, RelaxExpr, BaseFuncNode);
 };
 
 }  // namespace tvm
diff --git a/include/tvm/ir/memory_pools.h b/include/tvm/ir/memory_pools.h
deleted file mode 100644
index ebab13cf3adb..000000000000
--- a/include/tvm/ir/memory_pools.h
+++ /dev/null
@@ -1,361 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file tvm/ir/memory_pools.h
- * \brief The object definition for relay.build argument type of memory pools
- */
-#ifndef TVM_IR_MEMORY_POOLS_H_
-#define TVM_IR_MEMORY_POOLS_H_
-
-#include <tvm/runtime/registry.h>
-#include <tvm/target/target.h>
-
-struct TVMConstantInfo;
-namespace tvm {
-
-/*!
- * \brief Describes a pool of memory accessible by one or more targets.
- */
-struct PoolInfoNode : public Object {
- public:
-  /*! \brief The name of the memory pool */
-  String pool_name;
-  /*! \brief The expected size hint to be used by the allocator.
-   * The size_hint_bytes is set to kUnrestrictedPoolSizeHint
-   * to indicate the pool is not size restricted.
-   */
-  Integer size_hint_bytes;
-  /*! \brief The clock frequency of the memory in Hz */
-  Integer clock_frequency_hz;
-  /*! \brief The read bandwidth in bytes/cycle */
-  Integer read_bandwidth_bytes_per_cycle;
-  /*! \brief The write bandwidth in bytes/cycle */
-  Integer write_bandwidth_bytes_per_cycle;
-  /*! \brief The read latency in cycles */
-  Integer read_latency_cycles;
-  /*! \brief The write latency in cycles */
-  Integer write_latency_cycles;
-  /*! \brief The burst length in bytes for each Target */
-  Map<Target, Integer> target_burst_bytes;
-  /*! \brief Whether pool is internally generated.
-   * The internal pools will be generated as part of
-   * the entry point code generation of the executor
-   */
-  bool is_internal = false;
-
-  /*! \brief The targets linked to the pool */
-  Array<Target> targets;
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    v->Visit("pool_name", &pool_name);
-    v->Visit("targets", &targets);
-    v->Visit("size_hint_bytes", &size_hint_bytes);
-    v->Visit("clock_frequency_hz", &clock_frequency_hz);
-    v->Visit("read_bandwidth_bytes_per_cycle", &read_bandwidth_bytes_per_cycle);
-    v->Visit("write_bandwidth_bytes_per_cycle", &write_bandwidth_bytes_per_cycle);
-    v->Visit("read_latency_cycles", &read_latency_cycles);
-    v->Visit("write_latency_cycles", &write_latency_cycles);
-    v->Visit("target_burst_bytes", &target_burst_bytes);
-    v->Visit("is_internal", &is_internal);
-  }
-
-  bool SEqualReduce(const PoolInfoNode* other, SEqualReducer equal) const {
-    return equal(pool_name, other->pool_name) && equal(size_hint_bytes, other->size_hint_bytes) &&
-           equal(clock_frequency_hz, other->clock_frequency_hz) &&
-           equal(read_bandwidth_bytes_per_cycle, other->read_bandwidth_bytes_per_cycle) &&
-           equal(write_bandwidth_bytes_per_cycle, other->write_bandwidth_bytes_per_cycle) &&
-           equal(read_latency_cycles, other->read_latency_cycles) &&
-           equal(write_latency_cycles, other->write_latency_cycles) &&
-           equal(target_burst_bytes, other->target_burst_bytes) &&
-           equal(is_internal, other->is_internal);
-  }
-
-  void SHashReduce(SHashReducer hash_reduce) const {
-    hash_reduce(pool_name);
-    hash_reduce(size_hint_bytes);
-    hash_reduce(clock_frequency_hz);
-    hash_reduce(read_bandwidth_bytes_per_cycle);
-    hash_reduce(write_bandwidth_bytes_per_cycle);
-    hash_reduce(read_latency_cycles);
-    hash_reduce(write_latency_cycles);
-    hash_reduce(target_burst_bytes);
-    hash_reduce(is_internal);
-  }
-
-  static constexpr const char* _type_key = "ir.PoolInfo";
-  TVM_DECLARE_BASE_OBJECT_INFO(PoolInfoNode, Object);
-};
-
-/*!
- * \brief The string parameter to indicate read and write access to a pool
- * This needs to be kept in sync with PoolInfo.READ_WRITE_ACCESS in
- * python/tvm/ir/memory_pools.py
- */
-static constexpr const char* kTargetPoolReadWriteAccess = "rw";
-
-/*!
- * \brief The string parameter to indicate read only access to a pool
- * This needs to be kept in sync with PoolInfo.READ_ONLY_ACCESS in
- * python/tvm/ir/memory_pools.py
- */
-static constexpr const char* kTargetPoolReadOnlyAccess = "ro";
-
-/*! \brief The PoolSize is unrestricted for the memory planner */
-static const int kUnrestrictedPoolSizeHint = -1;
-
-/*! \brief The clock frequency is not known */
-static const int kUnknownClockFrequency = -1;
-
-/*! \brief The read bandwidth is not known */
-static const int kUnknownReadBandwidth = -1;
-
-/*! \brief The write bandwidth is not known */
-static const int kUnknownWriteBandwidth = -1;
-
-/*! \brief Base class for WorkspacePoolInfo and ConstantPoolInfo */
-class PoolInfo : public ObjectRef {
- protected:
-  TVM_DLL PoolInfo(String pool_name, Integer size_hint_bytes = kUnrestrictedPoolSizeHint,
-                   Integer clock_frequency_hz = kUnknownClockFrequency,
-                   Integer read_bandwidth_bytes_per_cycle = kUnknownReadBandwidth,
-                   Integer write_bandwidth_bytes_per_cycle = kUnknownWriteBandwidth,
-                   Integer read_latency_cycles = 0, Integer write_latency_cycles = 0,
-                   Map<Target, Integer> target_burst_bytes = {}, Bool is_internal = Bool(false));
-
- public:
-  TVM_DEFINE_OBJECT_REF_METHODS(PoolInfo, ObjectRef, PoolInfoNode);
-};
-
-/*!
- * \brief Describes a pool of memory properties
- */
-struct PoolInfoPropertiesNode : public Object {
-  /*! \brief The expected size hint to be used by the allocator.
-   * The size_hint_bytes is set to kUnrestrictedPoolSizeHint
-   * to indicate the pool is not size restricted.
-   */
-  Integer size_hint_bytes = kUnrestrictedPoolSizeHint;
-  /*! \brief The clock frequency of the memory in Hz */
-  Integer clock_frequency_hz = kUnknownClockFrequency;
-  /*! \brief The read bandwidth in bytes/cycle */
-  Integer read_bandwidth_bytes_per_cycle = kUnknownReadBandwidth;
-  /*! \brief The write bandwidth in bytes/cycle */
-  Integer write_bandwidth_bytes_per_cycle = kUnknownWriteBandwidth;
-  /*! \brief The read latency in cycles */
-  Integer read_latency_cycles = 0;
-  /*! \brief The write latency in cycles */
-  Integer write_latency_cycles = 0;
-  /*! \brief The burst length in bytes for each Target */
-  Map<Target, Integer> target_burst_bytes{};
-  /*! \brief Whether pool is internally generated.
-   * The internal pools will be generated as part of
-   * the entry point code generation of the executor
-   */
-  bool is_internal = false;
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    v->Visit("size_hint_bytes", &size_hint_bytes);
-    v->Visit("clock_frequency_hz", &clock_frequency_hz);
-    v->Visit("read_bandwidth_bytes_per_cycle", &read_bandwidth_bytes_per_cycle);
-    v->Visit("write_bandwidth_bytes_per_cycle", &write_bandwidth_bytes_per_cycle);
-    v->Visit("read_latency_cycles", &read_latency_cycles);
-    v->Visit("write_latency_cycles", &write_latency_cycles);
-    v->Visit("target_burst_bytes", &target_burst_bytes);
-    v->Visit("is_internal", &is_internal);
-  }
-
-  bool SEqualReduce(const PoolInfoPropertiesNode* other, SEqualReducer equal) const {
-    return equal(size_hint_bytes, other->size_hint_bytes) &&
-           equal(clock_frequency_hz, other->clock_frequency_hz) &&
-           equal(read_bandwidth_bytes_per_cycle, other->read_bandwidth_bytes_per_cycle) &&
-           equal(write_bandwidth_bytes_per_cycle, other->write_bandwidth_bytes_per_cycle) &&
-           equal(read_latency_cycles, other->read_latency_cycles) &&
-           equal(write_latency_cycles, other->write_latency_cycles) &&
-           equal(target_burst_bytes, other->target_burst_bytes) &&
-           equal(is_internal, other->is_internal);
-  }
-
-  void SHashReduce(SHashReducer hash_reduce) const {
-    hash_reduce(size_hint_bytes);
-    hash_reduce(clock_frequency_hz);
-    hash_reduce(read_bandwidth_bytes_per_cycle);
-    hash_reduce(write_bandwidth_bytes_per_cycle);
-    hash_reduce(read_latency_cycles);
-    hash_reduce(write_latency_cycles);
-    hash_reduce(target_burst_bytes);
-    hash_reduce(is_internal);
-  }
-
-  static constexpr const char* _type_key = "ir.PoolInfoProperties";
-  TVM_DECLARE_FINAL_OBJECT_INFO(PoolInfoPropertiesNode, Object);
-};
-
-class PoolInfoProperties : public ObjectRef {
- public:
-  TVM_DLL PoolInfoProperties(Integer size_hint_bytes,
-                             Integer clock_frequency_hz = kUnknownClockFrequency,
-                             Integer read_bandwidth_bytes_per_cycle = kUnknownReadBandwidth,
-                             Integer write_bandwidth_bytes_per_cycle = kUnknownWriteBandwidth,
-                             Integer read_latency_cycles = 0, Integer write_latency_cycles = 0,
-                             Map<Target, Integer> target_burst_bytes = {},
-                             Bool is_internal = Bool(false));
-  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PoolInfoProperties, ObjectRef, PoolInfoPropertiesNode);
-};
-
-/* \brief Represents RW memory area */
-struct WorkspacePoolInfoNode : public PoolInfoNode {
-  void VisitAttrs(tvm::AttrVisitor* v) { PoolInfoNode::VisitAttrs(v); }
-
-  bool SEqualReduce(const WorkspacePoolInfoNode* other, SEqualReducer equal) const {
-    return PoolInfoNode::SEqualReduce(other, equal);
-  }
-
-  void SHashReduce(SHashReducer hash_reduce) const { PoolInfoNode::SHashReduce(hash_reduce); }
-
-  static constexpr const char* _type_key = "ir.WorkspacePoolInfo";
-  TVM_DECLARE_FINAL_OBJECT_INFO(WorkspacePoolInfoNode, PoolInfoNode);
-};
-
-class WorkspacePoolInfo : public PoolInfo {
- public:
-  TVM_DLL WorkspacePoolInfo(
-      String pool_name, Array<Target> targets,
-      PoolInfoProperties properties = PoolInfoProperties(kUnrestrictedPoolSizeHint));
-  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(WorkspacePoolInfo, PoolInfo, WorkspacePoolInfoNode);
-};
-
-/*
- * \brief The ConstantInfoNode contains numeric literal in RO pool
- * Used to initialise RO memory in ConstantPoolInfo
- */
-struct ConstantInfoNode : public Object {
-  String name_hint;
-  Integer byte_offset;
-  runtime::NDArray data;
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    v->Visit("name_hint", &name_hint);
-    v->Visit("byte_offset", &byte_offset);
-    v->Visit("data", &data);
-  }
-
-  bool SEqualReduce(const ConstantInfoNode* other, SEqualReducer equal) const {
-    return equal(name_hint, other->name_hint) && equal(byte_offset, other->byte_offset) &&
-           equal(data, other->data);
-  }
-
-  void SHashReduce(SHashReducer hash_reduce) const {
-    hash_reduce(name_hint);
-    hash_reduce(byte_offset);
-    hash_reduce(data);
-  }
-
-  static constexpr const char* _type_key = "ir.ConstantInfo";
-  static constexpr bool _type_has_method_sequal_reduce = true;
-  static constexpr bool _type_has_method_shash_reduce = true;
-  TVM_DECLARE_FINAL_OBJECT_INFO(ConstantInfoNode, Object);
-};
-
-class ConstantInfo : public ObjectRef {
- public:
-  TVM_DLL ConstantInfo(const struct ::TVMConstantInfo* data);
-  ConstantInfo(String name, Integer byte_offset, runtime::NDArray data);
-  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(ConstantInfo, ObjectRef, ConstantInfoNode);
-};
-
-/* \brief ConstantPoolInfoNode represents an RO memory area initialized with
- * data from constant_info_array */
-struct ConstantPoolInfoNode : public PoolInfoNode {
-  Array<ConstantInfo> constant_info_array;
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    PoolInfoNode::VisitAttrs(v);
-    v->Visit("constant_info_array", &constant_info_array);
-  }
-
-  bool SEqualReduce(const ConstantPoolInfoNode* other, SEqualReducer equal) const {
-    return PoolInfoNode::SEqualReduce(other, equal) &&
-           equal(constant_info_array, other->constant_info_array);
-  }
-
-  void SHashReduce(SHashReducer hash_reduce) const {
-    PoolInfoNode::SHashReduce(hash_reduce);
-    hash_reduce(constant_info_array);
-  }
-
-  static constexpr const char* _type_key = "ir.ConstantPoolInfo";
-  TVM_DECLARE_FINAL_OBJECT_INFO(ConstantPoolInfoNode, PoolInfoNode);
-};
-
-class ConstantPoolInfo : public PoolInfo {
- public:
-  TVM_DLL ConstantPoolInfo(
-      String pool_name, Array<Target> targets, Array<ConstantInfo> constant_info_array,
-      PoolInfoProperties properties = PoolInfoProperties(kUnrestrictedPoolSizeHint));
-  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(ConstantPoolInfo, PoolInfo, ConstantPoolInfoNode);
-};
-
-/* \brief A container for WorkspacePoolInfo objects */
-struct WorkspaceMemoryPoolsNode : public Object {
-  Array<PoolInfo> pools;
-
-  void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("pools", &pools); }
-
-  bool SEqualReduce(const WorkspaceMemoryPoolsNode* other, SEqualReducer equal) const {
-    return equal(pools, other->pools);
-  }
-
-  void SHashReduce(SHashReducer hash_reduce) const { hash_reduce(pools); }
-
-  static constexpr const char* _type_key = "ir.WorkspaceMemoryPools";
-  TVM_DECLARE_FINAL_OBJECT_INFO(WorkspaceMemoryPoolsNode, Object);
-};
-
-class WorkspaceMemoryPools : public ObjectRef {
- public:
-  TVM_DLL WorkspaceMemoryPools(Array<PoolInfo> pools);
-  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(WorkspaceMemoryPools, ObjectRef, WorkspaceMemoryPoolsNode);
-};
-
-/* \brief A container for ConstantPoolInfo objects */
-struct ConstantMemoryPoolsNode : public Object {
-  Array<ConstantPoolInfo> pools;
-
-  void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("pools", &pools); }
-
-  bool SEqualReduce(const ConstantMemoryPoolsNode* other, SEqualReducer equal) const {
-    return equal(pools, other->pools);
-  }
-
-  void SHashReduce(SHashReducer hash_reduce) const { hash_reduce(pools); }
-
-  static constexpr const char* _type_key = "ir.ConstantMemoryPools";
-  TVM_DECLARE_FINAL_OBJECT_INFO(ConstantMemoryPoolsNode, Object);
-};
-
-class ConstantMemoryPools : public ObjectRef {
- public:
-  TVM_DLL ConstantMemoryPools(Array<ConstantPoolInfo> pools);
-  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(ConstantMemoryPools, ObjectRef, ConstantMemoryPoolsNode);
-};
-
-}  // namespace tvm
-
-#endif  // TVM_IR_MEMORY_POOLS_H_
diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h
index b3895ee725d1..0338096d7047 100644
--- a/include/tvm/ir/module.h
+++ b/include/tvm/ir/module.h
@@ -277,7 +277,7 @@ class IRModule : public ObjectRef {
    * \brief As for \p FromExprInContext, but assuming \p expr is bound to 'main' and no
    * imports.
    */
-  TVM_DLL static IRModule FromExpr(const RelayExpr& expr,
+  TVM_DLL static IRModule FromExpr(const RelaxExpr& expr,
                                    const Map<GlobalVar, BaseFunc>& global_funcs = {});
 
   /*!
@@ -310,42 +310,6 @@ namespace attr {
  */
 constexpr const char* kModuleName = "mod_name";
 
-/*!
- * \brief Executor targeted by the module
- *
- * Type: Executor
- *
- * \sa tvm::relay::Executor
- */
-constexpr const char* kExecutor = "executor";
-
-/*!
- * \brief Runtime target of the module
- *
- * Type: Runtime
- *
- * \sa tvm::relay::Runtime
- */
-constexpr const char* kRuntime = "runtime";
-
-/*!
- * \brief workspace memory pools of the module
- *
- * Type: WorkspaceMemoryPools
- *
- * \sa tvm::WorkspaceMemoryPools
- */
-constexpr const char* kWorkspaceMemoryPools = "workspace_memory_pools";
-
-/*!
- * \brief constant memory pools of the module
- *
- * Type: ConstantMemoryPools
- *
- * \sa tvm::ConstantMemoryPools
- */
-constexpr const char* kConstantMemoryPools = "constant_memory_pools";
-
 /*
  * \brief All the runtime::NDArrays extracted from PrimFunc tir::AllocateConst nodes. The
  * node will record the index into this array. See also kConstNameToConstant below, which is
diff --git a/include/tvm/ir/op.h b/include/tvm/ir/op.h
index a703f16f5a3b..7fbd1cbb84f1 100644
--- a/include/tvm/ir/op.h
+++ b/include/tvm/ir/op.h
@@ -55,7 +55,7 @@ class OpAttrMap;
  *
  * \sa Op
  */
-class OpNode : public RelayExprNode {
+class OpNode : public RelaxExprNode {
  public:
   /*! \brief name of the operator */
   String name;
@@ -111,7 +111,7 @@ class OpNode : public RelayExprNode {
   }
 
   static constexpr const char* _type_key = "Op";
-  TVM_DECLARE_FINAL_OBJECT_INFO(OpNode, RelayExprNode);
+  TVM_DECLARE_FINAL_OBJECT_INFO(OpNode, RelaxExprNode);
 
  private:
   /*! \return the internal attr registry index. */
@@ -135,7 +135,7 @@ class OpNode : public RelayExprNode {
  * \brief Managed reference class to OpNode.
  * \sa OpNode
  */
-class Op : public RelayExpr {
+class Op : public RelaxExpr {
  public:
   /*!
    * \brief Get additional registered attribute about operators.
@@ -160,7 +160,7 @@ class Op : public RelayExpr {
    */
   TVM_DLL static const Op& Get(const String& op_name);
 
-  TVM_DEFINE_OBJECT_REF_METHODS(Op, RelayExpr, OpNode)
+  TVM_DEFINE_OBJECT_REF_METHODS(Op, RelaxExpr, OpNode)
 
  private:
   /*!
@@ -286,7 +286,7 @@ class OpAttrMap : public AttrRegistryMap<Op, ValueType> {
    *         or if expr is not an Op.
    * \return the const reference to the content value.
    */
-  inline ValueType get(const RelayExpr& expr, ValueType def_value) const;
+  inline ValueType get(const RelaxExpr& expr, ValueType def_value) const;
 
   using TParent = AttrRegistryMap<Op, ValueType>;
   using TParent::count;
@@ -381,7 +381,7 @@ inline OpRegEntry& OpRegEntry::set_attr(  // NOLINT(*)
 // member functions of OpAttrMap
 
 template <typename ValueType>
-inline ValueType OpAttrMap<ValueType>::get(const RelayExpr& expr, ValueType def_value) const {
+inline ValueType OpAttrMap<ValueType>::get(const RelaxExpr& expr, ValueType def_value) const {
   ICHECK(expr.defined());
   if (const OpNode* op = expr.as<OpNode>()) {
     return this->map_.get(GetRef<Op>(op), def_value);
diff --git a/include/tvm/ir/tensor_type.h b/include/tvm/ir/tensor_type.h
deleted file mode 100644
index 7a700258f23c..000000000000
--- a/include/tvm/ir/tensor_type.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file tvm/ir/tensor_type.h
- * \brief Polymorphic tensor types.
- */
-#ifndef TVM_IR_TENSOR_TYPE_H_
-#define TVM_IR_TENSOR_TYPE_H_
-
-#include <tvm/ir/expr.h>
-#include <tvm/ir/type.h>
-
-namespace tvm {
-/*!
- * \brief Base of all Tensor types
- *  This container can hold TensorType or GenericTensorType.
- * \sa BaseTensorType, TensorTypeNode
- */
-class BaseTensorTypeNode : public TypeNode {
- public:
-  static constexpr const char* _type_key = "relay.BaseTensorType";
-  static constexpr const uint32_t _type_child_slots = 1;
-  TVM_DECLARE_BASE_OBJECT_INFO(BaseTensorTypeNode, TypeNode);
-};
-
-/*!
- * \brief Managed reference to BaseTensorTypeNode.
- * \sa BaseTensorTypeNode.
- */
-class BaseTensorType : public Type {
- public:
-  TVM_DEFINE_OBJECT_REF_METHODS(BaseTensorType, Type, BaseTensorTypeNode);
-};
-
-/*!
- * \brief This is the most commonly used type in relay.
- *  TensorType have a fixed dimension, data type.
- *
- *  The elements of shape can be either IntImm(constant integer),
- *  or any symbolic integer expression.
- *  The symbolic integer allows generic shape inference in certain cases.
- * \sa TensorType
- */
-class TensorTypeNode : public BaseTensorTypeNode {
- public:
-  /*!
-   * \brief The shape of the tensor,
-   *  represented by PrimExpr(tvm::Expr).
-   */
-  Array<PrimExpr> shape;
-  /*! \brief The content data type */
-  DataType dtype;
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    v->Visit("shape", &shape);
-    v->Visit("dtype", &dtype);
-    v->Visit("span", &span);
-  }
-
-  bool SEqualReduce(const TensorTypeNode* other, SEqualReducer equal) const {
-    return equal(shape, other->shape) && equal(dtype, other->dtype);
-  }
-
-  void SHashReduce(SHashReducer hash_reduce) const {
-    hash_reduce(shape);
-    hash_reduce(dtype);
-  }
-
-  /*! \brief Return product of elements in the shape.
-   *  \return (d1 * d_2 ... * d_n) if shape is (d_1, d_2, ..., d_n) and 1 if shape size is zero.
-   */
-  TVM_DLL PrimExpr Size() const;
-
-  static constexpr const char* _type_key = "relay.TensorType";
-  TVM_DECLARE_FINAL_OBJECT_INFO(TensorTypeNode, BaseTensorTypeNode);
-};
-
-/*!
- * \brief Managed reference to TensorTypeNode.
- * \sa TensorTypeNode.
- */
-class TensorType : public Type {
- public:
-  /*!
-   * \brief Constructor.
-   * \param shape The shape of the tensor.
-   * \param dtype The runtime dtype of the tensor's elements.
-   */
-  TVM_DLL TensorType(Array<PrimExpr> shape, DataType dtype);
-
-  /*!
-   * \brief Construct an scalar containing elements of dtype.
-   * \param dtype The runtime dtype of the tensor's elements.
-   * \return THe constructed type.
-   */
-  TVM_DLL static TensorType Scalar(DataType dtype);
-
-  TVM_DEFINE_OBJECT_REF_METHODS(TensorType, Type, TensorTypeNode);
-};
-
-// The following fields contains advanced typing
-// Only keep the class name and reserved for future usage.
-class GenericTensorType;
-// stores a DataType.
-class GenericDataType;
-// stores a DataType.
-class GenericShape;
-
-}  // namespace tvm
-#endif  // TVM_IR_TENSOR_TYPE_H_
diff --git a/include/tvm/ir/type_functor.h b/include/tvm/ir/type_functor.h
index eb213b17dfbc..2c145e480b84 100644
--- a/include/tvm/ir/type_functor.h
+++ b/include/tvm/ir/type_functor.h
@@ -24,7 +24,7 @@
 #ifndef TVM_IR_TYPE_FUNCTOR_H_
 #define TVM_IR_TYPE_FUNCTOR_H_
 
-#include <tvm/ir/tensor_type.h>
+#include <tvm/ir/type.h>
 #include <tvm/node/functor.h>
 
 #include <string>
@@ -75,7 +75,6 @@ class TypeFunctor<R(const Type& n, Args...)> {
     return vtable(n, this, std::forward<Args>(args)...);
   }
   // Functions that can be overriden by subclass
-  virtual R VisitType_(const TensorTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
   virtual R VisitType_(const FuncTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
   virtual R VisitType_(const TupleTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
   virtual R VisitType_(const PrimTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
@@ -90,7 +89,6 @@ class TypeFunctor<R(const Type& n, Args...)> {
   static FType InitVTable() {
     FType vtable;
     // Set dispatch
-    TVM_TYPE_FUNCTOR_DISPATCH(TensorTypeNode);
     TVM_TYPE_FUNCTOR_DISPATCH(FuncTypeNode);
     TVM_TYPE_FUNCTOR_DISPATCH(TupleTypeNode);
     TVM_TYPE_FUNCTOR_DISPATCH(PrimTypeNode);
@@ -106,7 +104,6 @@ class TypeFunctor<R(const Type& n, Args...)> {
  */
 class TVM_DLL TypeVisitor : public TypeFunctor<void(const Type& n)> {
  public:
-  void VisitType_(const TensorTypeNode* op) override;
   void VisitType_(const FuncTypeNode* op) override;
   void VisitType_(const TupleTypeNode* op) override;
   void VisitType_(const PrimTypeNode* op) override;
@@ -119,7 +116,6 @@ class TVM_DLL TypeVisitor : public TypeFunctor<void(const Type& n)> {
 class TVM_DLL TypeMutator : public TypeFunctor<Type(const Type& n)> {
  public:
   Type VisitType(const Type& t) override;
-  Type VisitType_(const TensorTypeNode* op) override;
   Type VisitType_(const FuncTypeNode* op) override;
   Type VisitType_(const TupleTypeNode* op) override;
   Type VisitType_(const PrimTypeNode* op) override;
diff --git a/include/tvm/node/structural_equal.h b/include/tvm/node/structural_equal.h
index f88b7e63b325..2dd732c72a4d 100644
--- a/include/tvm/node/structural_equal.h
+++ b/include/tvm/node/structural_equal.h
@@ -100,11 +100,9 @@ class ObjectPathPair : public ObjectRef {
  *  - Normal node: equality is recursively defined without the restriction
  *    of graph nodes.
  *
- *  Vars(tir::Var, TypeVar) and non-constant relay expression nodes are graph nodes.
- *  For example, it means that `%1 = %x + %y; %1 + %1` is not structurally equal
- *  to `%1 = %x + %y; %2 = %x + %y; %1 + %2` in relay.
+ *  Vars(tir::Var, relax::Var) nodes are graph nodes.
  *
- *  A var-type node(e.g. tir::Var, TypeVar) can be mapped as equal to another var
+ *  A var-type node(e.g. tir::Var) can be mapped as equal to another var
  *  with the same type if one of the following condition holds:
  *
  *  - They appear in a same definition point(e.g. function argument).
diff --git a/include/tvm/relax/dataflow_pattern.h b/include/tvm/relax/dataflow_pattern.h
index f7094b221221..df9fdcad9759 100644
--- a/include/tvm/relax/dataflow_pattern.h
+++ b/include/tvm/relax/dataflow_pattern.h
@@ -461,7 +461,7 @@ class CallPatternNode : public DFPatternNode {
  public:
   /*!
    * \note The op field can be:
-   *  - relay::Op which corresponds to the primitive operators.
+   *  - relax::Op which corresponds to the primitive operators.
    *  - user defined functions (Function, GlobalVar, Var).
    */
   DFPattern op;               /*!< The operator (function) being invoked */
diff --git a/include/tvm/relax/expr.h b/include/tvm/relax/expr.h
index 9afddeb807ab..fb6f0e40b130 100644
--- a/include/tvm/relax/expr.h
+++ b/include/tvm/relax/expr.h
@@ -35,8 +35,8 @@
 namespace tvm {
 namespace relax {
 
-using Expr = RelayExpr;
-using ExprNode = RelayExprNode;
+using Expr = RelaxExpr;
+using ExprNode = RelaxExprNode;
 /*!
  * \brief The unique identifier of variables.
  *
diff --git a/include/tvm/relax/type.h b/include/tvm/relax/type.h
index 210730bec644..ed832f6a0290 100644
--- a/include/tvm/relax/type.h
+++ b/include/tvm/relax/type.h
@@ -26,7 +26,6 @@
 
 #include <tvm/ir/attrs.h>
 #include <tvm/ir/env_func.h>
-#include <tvm/ir/tensor_type.h>
 #include <tvm/ir/type.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/tir/expr.h>
@@ -67,26 +66,12 @@ class ShapeType : public Type {
   TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(ShapeType, Type, ShapeTypeNode);
 };
 
-class ObjectTypeNode : public TypeNode {
- public:
-  void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("span", &span); }
-
-  bool SEqualReduce(const ObjectTypeNode* other, SEqualReducer equal) const { return true; }
-
-  void SHashReduce(SHashReducer hash_reduce) const { hash_reduce(0); }
-
-  static constexpr const char* _type_key = "relax.ObjectType";
-  TVM_DECLARE_FINAL_OBJECT_INFO(ObjectTypeNode, TypeNode);
-};
-
-class ObjectType : public Type {
- public:
-  TVM_DLL ObjectType(Span span = Span());
-
-  TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(ObjectType, Type, ObjectTypeNode);
-};
-
-class DynTensorTypeNode : public BaseTensorTypeNode {
+/*!
+ * \brief Dynamic tensor type that records only the number of dimensions and the dtype.
+ *
+ * Use relax::TensorStructInfo for more detailed (possibly dynamic) shape constraints.
+ */
+class TensorTypeNode : public TypeNode {
  public:
   /*!
    * \brief The number of dimensions of the tensor, use -1 to denote tensor with unknwon number of
@@ -102,7 +87,7 @@ class DynTensorTypeNode : public BaseTensorTypeNode {
     v->Visit("span", &span);
   }
 
-  bool SEqualReduce(const DynTensorTypeNode* other, SEqualReducer equal) const {
+  bool SEqualReduce(const TensorTypeNode* other, SEqualReducer equal) const {
     return equal(ndim, other->ndim) && equal(dtype, other->dtype);
   }
 
@@ -116,14 +101,14 @@ class DynTensorTypeNode : public BaseTensorTypeNode {
   inline bool IsUnknownDtype() const { return dtype.is_void(); }
 
   static constexpr const char* _type_key = "relax.DynTensorType";
-  TVM_DECLARE_FINAL_OBJECT_INFO(DynTensorTypeNode, BaseTensorTypeNode);
+  TVM_DECLARE_FINAL_OBJECT_INFO(TensorTypeNode, TypeNode);
 };
 
 /*!
- * \brief Managed reference to DynTensorTypeNode.
- * \sa DynTensorTypeNode.
+ * \brief Managed reference to TensorTypeNode.
+ * \sa TensorTypeNode.
  */
-class DynTensorType : public Type {
+class TensorType : public Type {
  public:
   /*!
    * \brief Constructor.
@@ -131,14 +116,36 @@ class DynTensorType : public Type {
    * \param dtype The runtime dtype of the tensor's elements.
    * \param span The span.
    */
-  TVM_DLL DynTensorType(int ndim, DataType dtype, Span span = Span());
+  TVM_DLL TensorType(int ndim, DataType dtype, Span span = Span());
 
   /*!
-   * \brief Create a DynTensorType with unknown ndim.
+   * \brief Create a TensorType with unknown ndim.
    */
-  TVM_DLL static DynTensorType CreateUnknownNDim(DataType dtype, Span span = Span());
+  TVM_DLL static TensorType CreateUnknownNDim(DataType dtype, Span span = Span());
+
+  TVM_DEFINE_OBJECT_REF_METHODS(TensorType, Type, TensorTypeNode);
+};
 
-  TVM_DEFINE_OBJECT_REF_METHODS(DynTensorType, Type, DynTensorTypeNode);
+using TensorTypeNode = TensorTypeNode;
+using TensorType = TensorType;
+
+class ObjectTypeNode : public TypeNode {
+ public:
+  void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("span", &span); }
+
+  bool SEqualReduce(const ObjectTypeNode* other, SEqualReducer equal) const { return true; }
+
+  void SHashReduce(SHashReducer hash_reduce) const { hash_reduce(0); }
+
+  static constexpr const char* _type_key = "relax.ObjectType";
+  TVM_DECLARE_FINAL_OBJECT_INFO(ObjectTypeNode, TypeNode);
+};
+
+class ObjectType : public Type {
+ public:
+  TVM_DLL ObjectType(Span span = Span());
+
+  TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(ObjectType, Type, ObjectTypeNode);
 };
 
 class PackedFuncTypeNode : public TypeNode {
diff --git a/include/tvm/runtime/logging.h b/include/tvm/runtime/logging.h
index 708f4bcaa9c4..440c7a4383c3 100644
--- a/include/tvm/runtime/logging.h
+++ b/include/tvm/runtime/logging.h
@@ -488,9 +488,9 @@ inline bool DebugLoggingEnabled() {
  * Filenames are canonicalized to be w.r.t. the src/ dir of the TVM tree. (VLOG's should not
  * appear under include/).
  *
- * To enable file \p relay/foo.cc up to level 2 and \p ir/bar.cc for level 0 only set:
+ * To enable file \p ir/bar.cc for level 0 only set:
  * \code
- * TVM_LOG_DEBUG="relay/foo.cc=2,ir/bar.cc=0"
+ * TVM_LOG_DEBUG="ir/bar.cc=0"
  * \endcode
  *
  * To enable all files up to level 3 but disable \p ir/bar.cc set:
diff --git a/include/tvm/target/generic_func.h b/include/tvm/target/generic_func.h
deleted file mode 100644
index bd498616f9a2..000000000000
--- a/include/tvm/target/generic_func.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file tvm/target/generic_func.h
- * \brief Generic function that can be specialzied on a per target basis.
- */
-#ifndef TVM_TARGET_GENERIC_FUNC_H_
-#define TVM_TARGET_GENERIC_FUNC_H_
-
-#include <tvm/runtime/packed_func.h>
-#include <tvm/support/with.h>
-#include <tvm/target/target.h>
-
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-namespace tvm {
-
-class GenericFuncNode;
-
-/*!
- * \brief Generic function that can be specialized on a per-target basis.
- */
-class GenericFunc : public ObjectRef {
- public:
-  GenericFunc() {}
-  explicit GenericFunc(ObjectPtr<Object> n) : ObjectRef(n) {}
-
-  /*!
-   * \brief Set the default function implementaiton.
-   * \param value The default function
-   * \param allow_override If true, this call may override a previously registered function. If
-   * false, an error will be logged if the call would override a previously registered function.
-   * \return reference to self.
-   */
-  TVM_DLL GenericFunc& set_default(const runtime::PackedFunc value, bool allow_override = false);
-  /*!
-   * \brief Register a specialized function
-   * \param tags The tags for this specialization
-   * \param value The specialized function
-   * \param allow_override If true, this call may override previously registered tags. If false,
-   * an error will be logged if the call would override previously registered tags.
-   * \return reference to self.
-   */
-  TVM_DLL GenericFunc& register_func(const std::vector<std::string>& tags,
-                                     const runtime::PackedFunc value, bool allow_override = false);
-  /*!
-   * \brief Call generic function by directly passing in unpacked format.
-   * \param args Arguments to be passed.
-   * \tparam Args arguments to be passed.
-   *
-   * \code
-   *   // Example code on how to call generic function
-   *   void CallGeneric(GenericFunc f) {
-   *     // call like normal functions by pass in arguments
-   *     // return value is automatically converted back
-   *     int rvalue = f(1, 2.0);
-   *   }
-   * \endcode
-   */
-  template <typename... Args>
-  inline runtime::TVMRetValue operator()(Args&&... args) const;
-  /*!
-   * \brief Invoke the relevant function for the current target context, set by set_target_context.
-   * Arguments are passed in packed format.
-   * \param args The arguments to pass to the function.
-   * \param ret The return value
-   */
-  TVM_DLL void CallPacked(runtime::TVMArgs args, runtime::TVMRetValue* ret) const;
-  /*!
-   * \brief Get the packed function specified for the current target context.
-   */
-  TVM_DLL PackedFunc GetPacked() const;
-  /*!
-   * \brief Find or register the GenericFunc instance corresponding to the give name
-   * \param name The name of the registered GenericFunc
-   * \return The GenericFunc instance
-   */
-  TVM_DLL static GenericFunc Get(const std::string& name);
-
-  /*!
-   * \brief Add a GenericFunc instance to the registry
-   * \param func The GenericFunc instance
-   * \param name The name of the registered GenericFunc
-   */
-  TVM_DLL static void RegisterGenericFunc(GenericFunc func, const std::string& name);
-
-  /*!
-   * \brief access the internal node container
-   * \return the pointer to the internal node container
-   */
-  inline GenericFuncNode* operator->();
-
-  // declare container type
-  using ContainerType = GenericFuncNode;
-
-  // Internal class.
-  struct Manager;
-
- private:
-  friend struct Manager;
-};
-
-template <typename... Args>
-inline runtime::TVMRetValue GenericFunc::operator()(Args&&... args) const {
-  const int kNumArgs = sizeof...(Args);
-  const int kArraySize = kNumArgs > 0 ? kNumArgs : 1;
-  TVMValue values[kArraySize];
-  int type_codes[kArraySize];
-  runtime::detail::for_each(runtime::TVMArgsSetter(values, type_codes),
-                            std::forward<Args>(args)...);
-  runtime::TVMRetValue rv;
-  CallPacked(runtime::TVMArgs(values, type_codes, kNumArgs), &rv);
-  return rv;
-}
-
-/*!
- * \brief Represents a generic function that can be specialized on a per-target basis.
- */
-class GenericFuncNode : public Object {
- public:
-  /*! \brief name of the function */
-  std::string name_;
-  /* \brief the generic builder */
-  runtime::PackedFunc generic_func_;
-  /* \brief map from keys to registered functions */
-  std::unordered_map<std::string, runtime::PackedFunc> dispatch_dict_;
-
-  void VisitAttrs(AttrVisitor* v) {}
-
-  static constexpr const char* _type_key = "GenericFunc";
-  TVM_DECLARE_FINAL_OBJECT_INFO(GenericFuncNode, Object);
-};
-
-inline GenericFuncNode* GenericFunc::operator->() {
-  return static_cast<GenericFuncNode*>(get_mutable());
-}
-
-#define TVM_GENERIC_FUNC_REG_VAR_DEF static TVM_ATTRIBUTE_UNUSED ::tvm::GenericFunc& __mk_##TVM
-
-/*!
- * \def TVM_REGISTER_GENERIC_FUNC
- * \brief Register a new generic function, or set a device-specific variant
- * of the corresponding function.
- *
- * \param name The name of the function
- */
-#define TVM_REGISTER_GENERIC_FUNC(name) \
-  TVM_STR_CONCAT(TVM_GENERIC_FUNC_REG_VAR_DEF, __COUNTER__) = ::tvm::GenericFunc::Get(#name)
-
-}  // namespace tvm
-#endif  // TVM_TARGET_GENERIC_FUNC_H_
diff --git a/include/tvm/target/virtual_device.h b/include/tvm/target/virtual_device.h
index 8ebcbf69bb11..5c6db4bc24d6 100644
--- a/include/tvm/target/virtual_device.h
+++ b/include/tvm/target/virtual_device.h
@@ -76,12 +76,8 @@ constexpr int kInvalidDeviceType = -1;
  * device_type must equal \p target->GetTargetDeviceType().
  *
  * Note that currently we assume if a function returns its result on a particular (virtual) device
- * then the function body is also executed on that device. See the overview comment in
- * src/relay/transforms/device_planner.cc for more details.
+ * then the function body is also executed on that device.
  *
- * By 'data' we include both tensors and additional supporting datastructures such as shapes,
- * Relay ADT items (including tuples), Relay references, and Relay closures. Typically non-tensor
- * data must reside on a 'CPU'-like host device with good support for scalars.
  *
  * By 'execution' we include both (fused) primitive operators, and all the Relay expressions
  * surrounding them which coordinates data and control flow. Again, typically non-primitive
diff --git a/include/tvm/tir/expr.h b/include/tvm/tir/expr.h
index 28cb022151d2..a157516f5342 100644
--- a/include/tvm/tir/expr.h
+++ b/include/tvm/tir/expr.h
@@ -886,7 +886,7 @@ class CallNode : public PrimExprNode {
    *  - It can be tvm::Op which corresponds to the primitive operators(intrinsics).
    *  - It can also be another function in the IRModule (GlobalVar).
    */
-  RelayExpr op;
+  RelaxExpr op;
 
   /*! \brief The arguments. */
   Array<PrimExpr> args;
@@ -917,7 +917,7 @@ class CallNode : public PrimExprNode {
  */
 class Call : public PrimExpr {
  public:
-  TVM_DLL Call(DataType dtype, RelayExpr op, Array<PrimExpr> args, Span span = Span());
+  TVM_DLL Call(DataType dtype, RelaxExpr op, Array<PrimExpr> args, Span span = Span());
   TVM_DEFINE_OBJECT_REF_METHODS(Call, PrimExpr, CallNode);
   TVM_DEFINE_OBJECT_REF_COW_METHOD(CallNode);
 };
diff --git a/python/setup.py b/python/setup.py
index 6cca9060bd2d..30b1e2174d02 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -208,7 +208,7 @@ def is_pure(self):
 
 def get_package_data_files():
     # Relay standard libraries
-    return ["relay/std/prelude.rly", "relay/std/core.rly"]
+    return []
 
 
 def long_description_contents():
diff --git a/python/tvm/contrib/cutlass/build.py b/python/tvm/contrib/cutlass/build.py
index ba598e9b225e..9dba93072623 100644
--- a/python/tvm/contrib/cutlass/build.py
+++ b/python/tvm/contrib/cutlass/build.py
@@ -39,7 +39,7 @@
 
 def has_cutlass():
     """Returns true if the CUTLASS custom codegen is available"""
-    return tvm.get_global_func("relay.ext.cutlass.create_c_source_module", True) is not None
+    return tvm.get_global_func("relax.ext.cutlass", True) is not None
 
 
 def _get_cutlass_path():
diff --git a/python/tvm/contrib/hexagon/session.py b/python/tvm/contrib/hexagon/session.py
index 50064e42ba08..2456aa244ee9 100644
--- a/python/tvm/contrib/hexagon/session.py
+++ b/python/tvm/contrib/hexagon/session.py
@@ -28,11 +28,6 @@
 from tvm import rpc as _rpc
 from tvm.contrib import utils
 import tvm.contrib.hexagon as hexagon
-from tvm.relay.backend.executor_factory import (
-    ExecutorFactoryModule,
-    AOTExecutorFactoryModule,
-    GraphExecutorFactoryModule,
-)
 from .tools import export_module, HEXAGON_SIMULATOR_NAME
 
 
@@ -206,86 +201,6 @@ def load_module(self, module: Union[str, pathlib.Path, tvm.runtime.Module]):
         )
         return self._rpc.get_function("tvm.hexagon.load_module")(str(remote_file_path))
 
-    def get_graph_executor(
-        self,
-        graph_json: str,
-        module_name: Union[str, pathlib.Path, tvm.runtime.Module],
-    ):
-        """Create a local GraphModule which consumes a remote libmod.
-
-        The session must be established (via __enter__) prior to
-        calling this function.
-
-        Parameters
-        ----------
-        module_name : Union[str, pathlib.Path, tvm.runtime.Module]
-            The remote module filename, following the same restrictions
-            as `load_module`.
-        graph_json : str
-            The string with the graph JSON.
-
-        Returns
-        -------
-        GraphModule :
-            Runtime graph module that can be used to execute the graph.
-
-        """
-        graph_mod = self.load_module(module_name)
-        self._set_device_type(graph_mod)
-        return tvm.contrib.graph_executor.create(graph_json, graph_mod, self.device)
-
-    def get_aot_executor(
-        self,
-        module_file: Union[str, pathlib.Path],
-    ):
-        """Create a local GraphModule which consumes a remote libmod.
-        The session must be established (via __enter__) prior to
-        calling this function.
-        Parameters
-        ----------
-        module_file : Union[str, pathlib.Path]
-            The remote module filename, following the same restrictions
-            as `load_module`. The filename should be an absolute path.
-        Returns
-        -------
-        GraphModule :
-            Runtime graph module that can be used to execute the graph.
-        """
-        # Temporary workaround for https://github.com/apache/tvm/issues/13741
-        self.aot_mod = self.load_module(module_file)
-        return tvm.runtime.executor.AotModule(self.aot_mod["default"](self.device))
-
-    def get_graph_debug_executor(
-        self,
-        graph_json: str,
-        module_name: Union[str, pathlib.Path, tvm.runtime.Module],
-        dump_root: Union[str, pathlib.Path] = None,
-    ):
-        """Create a local GraphModuleDebug which consumes a remote libmod.
-
-        Parameters
-        ----------
-        graph_json : str
-            The string with the graph JSON.
-         module_name : Union[str, pathlib.Path, tvm.runtime.Module]
-            The remote module filename, following the same restrictions
-            as `load_module`.
-        session : Session
-            Remote session. The session must be established (via __enter__)
-            prior to calling this function.
-
-        Returns
-        -------
-        GraphModuleDebug :
-            Runtime debug graph module that can be used to debug the graph.
-        """
-
-        graph_debug_mod = self.load_module(module_name)
-        self._set_device_type(graph_debug_mod)
-        return tvm.contrib.debugger.debug_executor.create(
-            graph_json, graph_debug_mod, self.device, dump_root=str(dump_root)
-        )
-
     def get_executor_from_factory(
         self, module: Union[ExecutorFactoryModule, relax.Executable, str], hexagon_arch: str = "v68"
     ):
@@ -294,17 +209,13 @@ def get_executor_from_factory(
         Parameters
         ----------
 
+        module : Union[relax.Executable, str]
+        module : Union[relax.Executable]
 
             The module to upload to the remote
             session and load.
         hexagon_arch : str
             The hexagon arch to be used
         """
-        if isinstance(module, AOTExecutorFactoryModule):
-            return self._aot_executor_from_factory(module)
-        if isinstance(module, GraphExecutorFactoryModule):
-            return self._graph_executor_from_factory(module)
         if isinstance(module, (relax.Executable, str)):
             return self._relax_vm_executable_executor(module, hexagon_arch=hexagon_arch)
 
@@ -332,32 +243,6 @@ def _set_device_type(self, module: Union[str, pathlib.Path, GraphExecutorFactory
             else:
                 self._requires_cpu_device = False
 
-    def _graph_executor_from_factory(
-        self,
-        module: Union[str, pathlib.Path, GraphExecutorFactoryModule],
-    ):
-        """Create a local GraphModule which consumes a remote libmod.
-
-        The session must be established (via __enter__) prior to
-        calling this function.
-
-        Parameters
-        ----------
-
-        module : GraphExecutorFactoryModule
-
-            The graph executor module to upload to the remote and load.
-            This will typically be the output of `tvm.relay.build`,
-            when passing `executor=Executor("graph")`.
-
-        Returns
-        -------
-        GraphModule :
-            Runtime graph module that can be used to execute the graph.
-
-        """
-        return self.get_graph_executor(module.get_graph_json(), module.get_lib())
-
     def _relax_vm_executable_executor(
         self, vm_exec: Union[relax.Executable, str], hexagon_arch: str
     ):
@@ -397,75 +282,6 @@ def _relax_vm_executable_executor(
         path = self.upload(path_exec, "exec.so")
         return self._rpc.get_function("tvm.hexagon.load_module")(str(path))
 
-    def _aot_executor_from_factory(
-        self,
-        module: Union[str, pathlib.Path, AOTExecutorFactoryModule],
-    ):
-        """Create a local GraphModule which consumes a remote libmod.
-
-        The session must be established (via __enter__) prior to
-        calling this function.
-
-        Parameters
-        ----------
-
-        module : AOTExecutorFactoryModule
-
-            The graph executor module to upload to the remote and load.
-            This will typically be the output of `tvm.relay.build`,
-            when passing `executor=Executor("aot")`.
-
-        Returns
-        -------
-        GraphModule :
-            Runtime graph module that can be used to execute the graph.
-
-        """
-
-        hexagon_arch = set(
-            target.mcpu.replace("hexagon", "")
-            for target in module.target
-            if "hexagon" in target.keys
-        )
-
-        self._set_device_type(module)
-
-        for target in module.target:
-            target_type = str(target).split()[0]
-
-        assert hexagon_arch, "No hexagon target architecture found"
-        assert len(hexagon_arch) == 1, f"Inconsistent hexagon architecture found, {hexagon_arch}"
-        hexagon_arch = hexagon_arch.pop()
-
-        with tempfile.TemporaryDirectory() as temp_dir:
-            temp_dir = pathlib.Path(temp_dir)
-            binary_name = "test_binary.so"
-            binary_path = temp_dir / binary_name
-
-            if target_type == "hexagon":
-                module.export_library(
-                    str(binary_path),
-                    fcompile=hexagon.create_aot_shared,
-                    fpack_imports=hexagon.pack_imports,
-                    hexagon_arch=hexagon_arch,
-                )
-            elif target_type == "llvm":
-                module.export_library(
-                    str(binary_path),
-                    fcompile=hexagon.create_shared,
-                    fpack_imports=hexagon.pack_imports,
-                    cc=hexagon.hexagon_clang_plus(),
-                )
-            else:
-                raise ValueError(
-                    "Incorrect Target kind.\n"
-                    "Target kind should be from these options: [hexagon, llvm]."
-                )
-
-            remote_file_path = self.upload(binary_path, binary_name)
-
-        return self.get_aot_executor(remote_file_path)
-
     def get_profile_output(self, mode: str, path: str):
         assert isinstance(mode, str), f"Invalid mode type, {type(mode)} != str"
         assert isinstance(path, str), f"Invalid path type, {type(path)} != str"
diff --git a/python/tvm/contrib/msc/core/transform/transform.py b/python/tvm/contrib/msc/core/transform/transform.py
index c6d7113f44f5..e78b5cb71450 100644
--- a/python/tvm/contrib/msc/core/transform/transform.py
+++ b/python/tvm/contrib/msc/core/transform/transform.py
@@ -53,7 +53,7 @@ def SetExprName(
         var_names = var_names or {}
         var_names = {k: msc_utils.legalize_expr_name(v) for k, v in var_names.items()}
         return relax_api.SetRelaxExprName(entry_name, target, var_names)  # type: ignore
-    return relay_api.SetRelayExprName(entry_name)  # type: ignore
+    return relay_api.SetRelaxExprName(entry_name)  # type: ignore
 
 
 def BindExprName(
@@ -75,7 +75,7 @@ def BindExprName(
     ret: tvm.ir.transform.Pass
     """
 
-    return relay_api.BindRelayExprName(name_key, seperator, entry_name)  # type: ignore
+    return relay_api.BindRelaxExprName(name_key, seperator, entry_name)  # type: ignore
 
 
 def SetExprLayout(allow_missing: bool = True, entry_name: str = "main") -> tvm.ir.transform.Pass:
diff --git a/python/tvm/contrib/target/__init__.py b/python/tvm/contrib/target/__init__.py
deleted file mode 100644
index 13a83393a912..000000000000
--- a/python/tvm/contrib/target/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
diff --git a/python/tvm/contrib/target/coreml.py b/python/tvm/contrib/target/coreml.py
deleted file mode 100644
index d8846ce5f1cd..000000000000
--- a/python/tvm/contrib/target/coreml.py
+++ /dev/null
@@ -1,233 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-argument, import-outside-toplevel, missing-function-docstring
-"""Utility to compile CoreML models"""
-
-import os
-import shutil
-
-import tvm._ffi
-from ...relay.expr_functor import ExprVisitor
-from .. import xcode, coreml_runtime
-
-
-def _convert_add(builder, name, inputs, outputs, args, attrs):
-    builder.add_elementwise(name=name, input_names=inputs, output_name=outputs[0], mode="ADD")
-
-
-def _convert_multiply(builder, name, inputs, outputs, args, attrs):
-    builder.add_elementwise(name=name, input_names=inputs, output_name=outputs[0], mode="MULTIPLY")
-
-
-def _convert_clip(builder, name, inputs, outputs, args, attrs):
-    builder.add_clip(
-        name=name,
-        input_name=inputs[0],
-        output_name=outputs[0],
-        min_value=attrs.a_min,
-        max_value=attrs.a_max,
-    )
-
-
-def _convert_batch_flatten(builder, name, inputs, outputs, args, attrs):
-    builder.add_flatten_to_2d(name=name, input_name=inputs[0], output_name=outputs[0])
-
-
-def _convert_expand_dims(builder, name, inputs, outputs, args, attrs):
-    if attrs.axis >= 0:
-        axes = list(range(attrs.axis, attrs.axis + attrs.num_newaxis))
-    else:
-        axes = list(range(attrs.axis - attrs.num_newaxis + 1, attrs.axis + 1))
-
-    builder.add_expand_dims(name=name, input_name=inputs[0], output_name=outputs[0], axes=axes)
-
-
-def _convert_relu(builder, name, inputs, outputs, args, attrs):
-    builder.add_activation(
-        name=name, non_linearity="RELU", input_name=inputs[0], output_name=outputs[0]
-    )
-
-
-def _convert_softmax(builder, name, inputs, outputs, args, attrs):
-    builder.add_softmax_nd(
-        name=name, input_name=inputs[0], output_name=outputs[0], axis=int(attrs["axis"])
-    )
-
-
-def _convert_conv2d(builder, name, inputs, outputs, args, attrs):
-    weight = args[1].data.numpy()
-    if attrs["kernel_layout"] == "OIHW":
-        # convert to 'HWIO'
-        weight = weight.transpose([2, 3, 1, 0])
-    kh, kw, kc, oc = weight.shape
-
-    builder.add_convolution(
-        name=name,
-        kernel_channels=kc,
-        output_channels=oc,
-        height=kh,
-        width=kw,
-        stride_height=int(attrs["strides"][0]),
-        stride_width=int(attrs["strides"][0]),
-        border_mode="valid",
-        groups=int(attrs["groups"]),
-        W=weight,
-        b=None,
-        has_bias=False,
-        input_name=inputs[0],
-        output_name=outputs[0],
-        dilation_factors=[int(v) for v in attrs["dilation"]],
-        padding_top=int(attrs["padding"][0]),
-        padding_bottom=int(attrs["padding"][2]),
-        padding_left=int(attrs["padding"][1]),
-        padding_right=int(attrs["padding"][3]),
-    )
-
-
-def _convert_global_avg_pool2d(builder, name, inputs, outputs, args, attrs):
-    builder.add_pooling(
-        name=name,
-        height=1,
-        width=1,
-        stride_height=1,
-        stride_width=1,
-        layer_type="AVERAGE",
-        padding_type="VALID",
-        input_name=inputs[0],
-        output_name=outputs[0],
-        is_global=True,
-    )
-
-
-_convert_map = {
-    "add": _convert_add,
-    "multiply": _convert_multiply,
-    "clip": _convert_clip,
-    "expand_dims": _convert_expand_dims,
-    "nn.relu": _convert_relu,
-    "nn.batch_flatten": _convert_batch_flatten,
-    "nn.softmax": _convert_softmax,
-    "nn.conv2d": _convert_conv2d,
-    "nn.global_avg_pool2d": _convert_global_avg_pool2d,
-}
-
-
-class CodegenCoreML(ExprVisitor):
-    """
-    A visitor to traverse subgraphs and build Core ML models.
-    """
-
-    def __init__(self, model_name, function):
-        import coremltools
-        from coremltools.models.neural_network import NeuralNetworkBuilder
-
-        ExprVisitor.__init__(self)
-        self.model_name = model_name
-        self.function = function
-        self.out_map = {}
-        self.model_inputs_ = []
-        self.buf_idx_ = 0
-
-        # Update inputs and outputs after we visit all the nodes.
-        # Set dummy values for now.
-        # TODO: support multiple outputs
-        inputs = [("", coremltools.models.datatypes.Array(1)) for _ in self.function.params]
-        outputs = [("", coremltools.models.datatypes.Array(1))]
-        self.builder = NeuralNetworkBuilder(inputs, outputs, disable_rank5_shape_mapping=True)
-
-    def visit_constant(self, const):
-        output = "buf_" + str(self.buf_idx_)
-        self.builder.add_load_constant_nd(
-            name=output,
-            output_name=output,
-            constant_value=const.data.numpy(),
-            shape=const.data.shape,
-        )
-        self.buf_idx_ = self.buf_idx_ + 1
-        self.out_map[const] = [output]
-
-    def visit_var(self, var):
-        name = var.name_hint
-        shape = [int(n) for n in var.type_annotation.shape]
-        dtype = var.type_annotation.dtype
-        self.model_inputs_.append((name, shape, dtype))
-        self.out_map[var] = [name]
-
-    def visit_call(self, call):
-        inputs = []
-        for arg in call.args:
-            super().visit(arg)
-            for out in self.out_map[arg]:
-                inputs.append(out)
-        outputs = ["buf_" + str(self.buf_idx_)]
-        op_name = call.op.name
-        layer_name = op_name + "_" + str(self.buf_idx_)
-
-        assert op_name in _convert_map, f"{op_name} is not supported"
-        _convert_map[op_name](self.builder, layer_name, inputs, outputs, call.args, call.attrs)
-
-        self.buf_idx_ = self.buf_idx_ + 1
-        self.out_map[call] = outputs
-
-    def compile(self, out_dir):
-        """
-        Build a Core ML model and compile it with Xcode toolchain.
-        """
-        import coremltools
-        from coremltools.proto.Model_pb2 import ArrayFeatureType
-
-        FEATURE_TYPE_MAP = {
-            "float32": ArrayFeatureType.FLOAT32,
-            "float64": ArrayFeatureType.DOUBLE,
-            "int32": ArrayFeatureType.INT32,
-        }
-
-        input_names, input_dims, input_dtypes = zip(*self.model_inputs_)
-        self.builder.set_input(input_names, input_dims)
-        for i, dtype in enumerate(input_dtypes):
-            assert dtype in FEATURE_TYPE_MAP
-            input_desc = self.builder.spec.description.input
-            input_desc[i].type.multiArrayType.dataType = FEATURE_TYPE_MAP[dtype]
-
-        output_dim = [int(n) for n in self.function.ret_type.shape]
-        self.builder.set_output(self.out_map[self.function.body], [output_dim])
-        for i, dtype in enumerate([self.function.ret_type.dtype]):
-            assert dtype in FEATURE_TYPE_MAP
-            output_desc = self.builder.spec.description.output
-            output_desc[i].type.multiArrayType.dataType = FEATURE_TYPE_MAP[dtype]
-
-        model = coremltools.models.MLModel(self.builder.spec)
-        xcode.compile_coreml(model, self.model_name, out_dir)
-
-
-@tvm._ffi.register_func("relay.ext.coremlcompiler")
-def coreml_compiler(func):
-    """
-    Create a CoreML runtime from a Relay module.
-    """
-    assert isinstance(func, tvm.relay.function.Function)
-    model_dir = os.getcwd()
-    name = str(func.attrs.global_symbol)
-    builder = CodegenCoreML(name, func)
-    builder.visit(func.body)
-    mlmodelc_path = f"{model_dir}/{name}.mlmodelc"
-    if os.path.exists(mlmodelc_path):
-        shutil.rmtree(mlmodelc_path)
-    builder.compile(model_dir)
-
-    dev = tvm.cpu(0)
-    return coreml_runtime.create(name, mlmodelc_path, dev).module
diff --git a/python/tvm/ir/__init__.py b/python/tvm/ir/__init__.py
index 3e893099f454..b74e9954d9cf 100644
--- a/python/tvm/ir/__init__.py
+++ b/python/tvm/ir/__init__.py
@@ -18,7 +18,6 @@
 """Common data structures across all IR variants."""
 
 from . import diagnostics, instrument, transform
-from .affine_type import TensorAffineType, TupleAffineType
 from .attrs import Attrs, DictAttrs, make_node
 from .base import (
     EnvFunc,
@@ -33,12 +32,11 @@
     structural_hash,
 )
 from .container import Array, Map
-from .expr import BaseExpr, GlobalVar, PrimExpr, Range, RelayExpr
+from .expr import BaseExpr, GlobalVar, PrimExpr, Range, RelaxExpr
 from .function import BaseFunc, CallingConv
 from .global_info import GlobalInfo, DummyGlobalInfo, VDevice
 from .module import IRModule
 from .op import Op, register_intrin_lowering, register_op_attr
-from .tensor_type import TensorType
 from .type import (
     FuncType,
     PointerType,
diff --git a/python/tvm/ir/affine_type.py b/python/tvm/ir/affine_type.py
deleted file mode 100644
index 24126f94b9c4..000000000000
--- a/python/tvm/ir/affine_type.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Types for quantized Tensors."""
-import tvm._ffi
-
-from . import _ffi_api
-from .base import Node
-
-
-class AffineType(Node):
-    """The base class of Affine Types."""
-
-    def __eq__(self, other):
-        """Compare two types for structural equivalence."""
-        return bool(tvm.ir.structural_equal(self, other))
-
-    def __ne__(self, other):
-        return not self.__eq__(other)
-
-    def __str__(self):
-        from tvm.relay import pretty_print  # pylint: disable=import-outside-toplevel
-
-        return pretty_print(self)
-
-
-@tvm._ffi.register_object("TensorAffineType")
-class TensorAffineType(AffineType):
-    """The quantized type of a tensor, with scale, zero point, and datatype
-
-    The real space value is calculated as x = x_q * scale + zero_point
-
-    Parameters
-    ----------
-    scale: Expr
-        The scale
-
-    zero_point: Expr
-        The zero_point
-
-    dtype : str
-        The content data type.
-
-    axis : int
-        The axis for per-channel quantization.
-    """
-
-    def __init__(self, scale, zero_point, dtype, axis=-1):
-        self.__init_handle_by_constructor__(
-            _ffi_api.TensorAffineType, scale, zero_point, dtype, axis
-        )
-
-
-@tvm._ffi.register_object("TupleAffineType")
-class TupleAffineType(AffineType):
-    """Affine types of a node with multiple outputs
-
-    Parameters
-    ----------
-    types : List[TensorAffineType]
-        The shape of the Tensor
-
-    """
-
-    def __init__(self, types):
-        self.__init_handle_by_constructor__(_ffi_api.TupleAffineType, types)
diff --git a/python/tvm/ir/attrs.py b/python/tvm/ir/attrs.py
index 6afb383c9f04..fc63138043fa 100644
--- a/python/tvm/ir/attrs.py
+++ b/python/tvm/ir/attrs.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-""" TVM Attribute module, which is mainly used for defining attributes of operators."""
+"""TVM Attribute module, which is mainly used for defining attributes of operators."""
 import tvm._ffi
 
 from tvm.runtime import Object
@@ -24,7 +24,7 @@
 
 @tvm._ffi.register_object
 class Attrs(Object):
-    """Attribute node, which is mainly use for defining attributes of relay operators.
+    """Attribute node, which is mainly used for defining attributes of operators.
 
     Used by function registered in python side, such as compute, schedule and alter_layout.
     Attrs is passed as the first argument to these functions.
diff --git a/python/tvm/ir/base.py b/python/tvm/ir/base.py
index 535b97e62dc1..2097423cbed3 100644
--- a/python/tvm/ir/base.py
+++ b/python/tvm/ir/base.py
@@ -157,11 +157,9 @@ def structural_equal(lhs, rhs, map_free_vars=False):
     - Normal node: equality is recursively defined without the restriction
       of graph nodes.
 
-    Vars(tir::Var, TypeVar) and non-constant relay expression nodes are graph nodes.
-    For example, it means that `%1 = %x + %y; %1 + %1` is not structurally equal
-    to `%1 = %x + %y; %2 = %x + %y; %1 + %2` in relay.
+    Vars(tir::Var, relax::Var) are graph nodes.
 
-    A var-type node(e.g. tir::Var, TypeVar) can be mapped as equal to another var
+    A var-type node(e.g. tir::Var) can be mapped as equal to another var
     with the same type if one of the following condition holds:
 
     - They appear in a same definition point(e.g. function argument).
diff --git a/python/tvm/ir/expr.py b/python/tvm/ir/expr.py
index 1dcb9f6cf600..d140b5867c6e 100644
--- a/python/tvm/ir/expr.py
+++ b/python/tvm/ir/expr.py
@@ -42,16 +42,16 @@ class PrimExpr(BaseExpr):
     dtype: str
 
 
-class RelayExpr(BaseExpr):
+class RelaxExpr(BaseExpr):
     """Base class of all non-primitive expressions."""
 
     @property
     def checked_type(self):
-        """Get the checked type of tvm.relay.Expr.
+        """Get the checked type of tvm.relax.Expr.
 
         Returns
         -------
-        checked_type : tvm.relay.Type
+        checked_type : tvm.ir.Type
             The checked type.
         """
         ret = self._checked_type_
@@ -72,7 +72,7 @@ def struct_info(self) -> Optional["tvm.relax.StructInfo"]:
 
 
 @tvm._ffi.register_object("GlobalVar")
-class GlobalVar(RelayExpr):
+class GlobalVar(RelaxExpr):
     """A global variable in the IR.
 
     GlobalVar is used to refer to the global functions
@@ -89,12 +89,12 @@ class GlobalVar(RelayExpr):
     def __init__(self, name_hint: str, type_annot: Optional[Type] = None):
         self.__init_handle_by_constructor__(_ffi_api.GlobalVar, name_hint, type_annot)
 
-    def __call__(self, *args: RelayExpr) -> BaseExpr:
+    def __call__(self, *args: RelaxExpr) -> BaseExpr:
         """Call the global variable.
 
         Parameters
         ----------
-        args: List[RelayExpr]
+        args: List[RelaxExpr]
             The arguments to the call.
 
         Returns
@@ -105,7 +105,7 @@ def __call__(self, *args: RelayExpr) -> BaseExpr:
         # pylint: disable=import-outside-toplevel
 
         # TODO(@relax-team): replace with Relax base class after it's introduced
-        if all(isinstance(x, RelayExpr) for x in args):
+        if all(isinstance(x, RelaxExpr) for x in args):
             from tvm import relax
 
             return relax.Call(self, args)
@@ -185,12 +185,12 @@ def __ne__(self, other: Object) -> bool:
 
 
 # TODO(@relax-team): remove when we have a RelaxExpr base class
-def is_relax_expr(expr: RelayExpr) -> bool:
-    """check if a RelayExpr is a Relax expresssion.
+def is_relax_expr(expr: RelaxExpr) -> bool:
+    """Check if a RelaxExpr is a Relax expression.
 
     Parameters
     ----------
-    expr : RelayExpr
+    expr : RelaxExpr
         The expression to check.
 
     Returns
diff --git a/python/tvm/ir/function.py b/python/tvm/ir/function.py
index a4ea094c740b..8527ce66f0cf 100644
--- a/python/tvm/ir/function.py
+++ b/python/tvm/ir/function.py
@@ -21,7 +21,7 @@
 from enum import IntEnum
 import tvm.runtime
 from tvm.runtime.object import Object
-from .expr import RelayExpr
+from .expr import RelaxExpr
 from .attrs import DictAttrs
 from . import _ffi_api
 
@@ -34,7 +34,7 @@ class CallingConv(IntEnum):
     DEVICE_KERNEL_LAUNCH = 2
 
 
-class BaseFunc(RelayExpr):
+class BaseFunc(RelaxExpr):
     """Base class of all functions."""
 
     @property
diff --git a/python/tvm/ir/instrument.py b/python/tvm/ir/instrument.py
index 4402d0fc22fe..72d4777194e5 100644
--- a/python/tvm/ir/instrument.py
+++ b/python/tvm/ir/instrument.py
@@ -213,7 +213,7 @@ def should_run(self, mod, pass_info)
 
         skip_annotate = SkipPass("AnnotateSpans")
         with tvm.transform.PassContext(instruments=[skip_annotate]):
-            tvm.relay.build(mod, "llvm")
+            tvm.build(mod, "llvm")
     """
 
     def create_pass_instrument(pi_cls):
@@ -249,8 +249,7 @@ def render():
 
             timing_inst = PassTimingInstrument()
             with tvm.transform.PassContext(instruments=[timing_inst]):
-                relay_mod = relay.transform.InferType()(relay_mod)
-                relay_mod = relay.transform.FoldScaleAxis()(relay_mod)
+                relax_mod = relax.transform.FuseOps()(relax_mod)
                 # before exiting the context, get profile results.
                 profiles = timing_inst.render()
         """
diff --git a/python/tvm/ir/json_compact.py b/python/tvm/ir/json_compact.py
index 756dbc4992f4..59da50e4d12e 100644
--- a/python/tvm/ir/json_compact.py
+++ b/python/tvm/ir/json_compact.py
@@ -16,8 +16,6 @@
 # under the License.
 """Tool to upgrade json from historical versions."""
 import json
-import tvm.ir
-import tvm.runtime
 
 
 def create_updater(node_map, from_ver, to_ver):
@@ -141,18 +139,6 @@ def _initialize_virtual_device(item, _):
     node_map = {
         # Base IR
         "GlobalVar": _initialize_virtual_device,
-        "relay.Var": _initialize_virtual_device,
-        "relay.Function": _initialize_virtual_device,
-        "relay.Tuple": _initialize_virtual_device,
-        "relay.Call": _initialize_virtual_device,
-        "relay.Let": _initialize_virtual_device,
-        "relay.If": _initialize_virtual_device,
-        "relay.TupleGetItem": _initialize_virtual_device,
-        "relay.RefCreate": _initialize_virtual_device,
-        "relay.RefRead": _initialize_virtual_device,
-        "relay.RefWrite": _initialize_virtual_device,
-        "relay.Match": _initialize_virtual_device,
-        "relay.Constant": _initialize_virtual_device,
     }
 
     return create_updater(node_map, "0.8", "0.9")
@@ -171,138 +157,6 @@ def _initialize_module_attributes(item, _):
     return create_updater(node_map, "0.7", "0.8")
 
 
-def create_updater_06_to_07():
-    """Create an update to upgrade json from v0.6 to v0.7
-
-    Returns
-    -------
-    fupdater : function
-        The updater function
-    """
-
-    def _ftype_var(item, nodes):
-        vindex = int(item["attrs"]["var"])
-        item["attrs"]["name_hint"] = nodes[vindex]["attrs"]["name"]
-        # set vindex to null
-        nodes[vindex]["type_key"] = ""
-        del item["attrs"]["var"]
-        assert item["type_key"].startswith("relay.")
-        item["type_key"] = item["type_key"][len("relay.") :]
-        return item
-
-    def _rename(new_name):
-        def _convert(item, _):
-            item["type_key"] = new_name
-            return item
-
-        return _convert
-
-    def _update_tir_var(new_name):
-        def _convert(item, _):
-            item["type_key"] = new_name
-            item["attrs"]["type_annotation"] = "0"
-            return item
-
-        return _convert
-
-    def _update_global_key(item, _):
-        if "global_key" in item:
-            item["repr_str"] = item["global_key"]
-            del item["global_key"]
-        return item
-
-    def _update_from_std_str(key):
-        def _convert(item, nodes):
-            str_val = item["attrs"][key]
-            jdata = json.loads(tvm.ir.save_json(tvm.runtime.String(str_val)))
-            root_idx = jdata["root"]
-            val = jdata["nodes"][root_idx]
-            sidx = len(nodes)
-            nodes.append(val)
-            item["attrs"][key] = f"{sidx}"
-            return item
-
-        return _convert
-
-    node_map = {
-        # Base IR
-        "SourceName": _update_global_key,
-        "EnvFunc": _update_global_key,
-        "relay.Op": [_update_global_key, _rename("Op")],
-        "relay.TypeVar": [_ftype_var, _update_from_std_str("name_hint")],
-        "TypeVar": _update_from_std_str("name_hint"),
-        "relay.Id": [_update_from_std_str("name_hint")],
-        "relay.GlobalTypeVar": [_ftype_var, _update_from_std_str("name_hint")],
-        "GlobalTypeVar": _update_from_std_str("name_hint"),
-        "relay.Type": _rename("Type"),
-        "relay.TupleType": _rename("TupleType"),
-        "relay.TypeConstraint": _rename("TypeConstraint"),
-        "relay.FuncType": _rename("FuncType"),
-        "relay.IncompleteType": _rename("IncompleteType"),
-        "relay.TypeRelation": _rename("TypeRelation"),
-        "relay.TypeCall": _rename("TypeCall"),
-        "relay.Constructor": _update_from_std_str("name_hint"),
-        "relay.Module": _rename("IRModule"),
-        "relay.SourceName": _rename("SourceName"),
-        "relay.Span": _rename("Span"),
-        "relay.GlobalVar": [_rename("GlobalVar"), _update_from_std_str("name_hint")],
-        "GlobalVar": _update_from_std_str("name_hint"),
-        "relay.Pass": _rename("transform.Pass"),
-        "relay.PassInfo": _rename("transform.PassInfo"),
-        "relay.PassContext": _rename("transform.PassContext"),
-        "relay.ModulePass": _rename("transform.ModulePass"),
-        "relay.Sequential": _rename("transform.Sequential"),
-        "StrMap": _rename("Map"),
-        # TIR
-        "Variable": [_update_tir_var("tir.Var"), _update_from_std_str("name")],
-        "SizeVar": [_update_tir_var("tir.SizeVar"), _update_from_std_str("name")],
-        "StringImm": [_rename("tir.StringImm"), _update_from_std_str("value")],
-        "Cast": _rename("tir.Cast"),
-        "Add": _rename("tir.Add"),
-        "Sub": _rename("tir.Sub"),
-        "Mul": _rename("tir.Mul"),
-        "Div": _rename("tir.Div"),
-        "Mod": _rename("tir.Mod"),
-        "FloorDiv": _rename("tir.FloorDiv"),
-        "FloorMod": _rename("tir.FloorMod"),
-        "Min": _rename("tir.Min"),
-        "Max": _rename("tir.Max"),
-        "EQ": _rename("tir.EQ"),
-        "NE": _rename("tir.NE"),
-        "LT": _rename("tir.LT"),
-        "LE": _rename("tir.LE"),
-        "GT": _rename("tir.GT"),
-        "GE": _rename("tir.GE"),
-        "And": _rename("tir.And"),
-        "Or": _rename("tir.Or"),
-        "Not": _rename("tir.Not"),
-        "Select": _rename("tir.Select"),
-        "BufferLoad": _rename("tir.BufferLoad"),
-        "Ramp": _rename("tir.Ramp"),
-        "Broadcast": _rename("tir.Broadcast"),
-        "Shuffle": _rename("tir.Shuffle"),
-        "Call": [_rename("tir.Call"), _update_from_std_str("name")],
-        "Let": _rename("tir.Let"),
-        "Any": _rename("tir.Any"),
-        "LetStmt": _rename("tir.LetStmt"),
-        "AssertStmt": _rename("tir.AssertStmt"),
-        "BufferStore": _rename("tir.BufferStore"),
-        "BufferRealize": _rename("tir.BufferRealize"),
-        "Allocate": _rename("tir.Allocate"),
-        "IfThenElse": _rename("tir.IfThenElse"),
-        "Evaluate": _rename("tir.Evaluate"),
-        "Prefetch": _rename("tir.Prefetch"),
-        "AttrStmt": [_rename("tir.AttrStmt"), _update_from_std_str("attr_key")],
-        "Layout": [_rename("tir.Layout"), _update_from_std_str("name")],
-        "Buffer": [
-            _rename("tir.Buffer"),
-            _update_from_std_str("name"),
-            _update_from_std_str("scope"),
-        ],
-    }
-    return create_updater(node_map, "0.6", "0.7")
-
-
 def upgrade_json(json_str):
     """Update json from a historical version.
 
diff --git a/python/tvm/ir/module.py b/python/tvm/ir/module.py
index ed7ea4439957..8347e218beb9 100644
--- a/python/tvm/ir/module.py
+++ b/python/tvm/ir/module.py
@@ -97,7 +97,7 @@ def __setitem__(self, var, val):
         return self._add(var, val, True)
 
     def _add(self, var, val, update=True):
-        if isinstance(val, _expr.RelayExpr):
+        if isinstance(val, _expr.RelaxExpr):
             if isinstance(var, string_types):
                 if _ffi_api.Module_ContainGlobalVar(self, var):
                     var = _ffi_api.Module_GetGlobalVar(self, var)
@@ -151,7 +151,7 @@ def update_func(self, var, func):
         var: GlobalVar
             The global variable.
 
-        func: tvm.relay.Function
+        func: tvm.ir.BaseFunc
             The function to be inserted.
         """
         return _ffi_api.Module_UpdateFunction(self, var, func)
@@ -231,7 +231,7 @@ def from_expr(expr, functions=None):
 
         Parameters
         ----------
-        expr: RelayExpr
+        expr: RelaxExpr
             The starting expression
 
         global_funcs: Optional[dict]
diff --git a/python/tvm/ir/op.py b/python/tvm/ir/op.py
index dae97f114b6e..932aef24c60d 100644
--- a/python/tvm/ir/op.py
+++ b/python/tvm/ir/op.py
@@ -19,11 +19,11 @@
 import tvm._ffi
 
 from . import _ffi_api
-from .expr import RelayExpr
+from .expr import RelaxExpr
 
 
 @tvm._ffi.register_object("Op")
-class Op(RelayExpr):
+class Op(RelaxExpr):
     """Primitive operator in the IR."""
 
     def __init__(self):
diff --git a/python/tvm/ir/tensor_type.py b/python/tvm/ir/tensor_type.py
deleted file mode 100644
index 495e0fe868e5..000000000000
--- a/python/tvm/ir/tensor_type.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Type relation and function for type checking."""
-import tvm._ffi
-
-from . import _ffi_api
-from .type import Type
-
-
-@tvm._ffi.register_object("relay.TensorType")
-class TensorType(Type):
-    """A concrete TensorType in Relay.
-
-    This is the type assigned to tensors with a known dtype and shape.
-    For example, a tensor of `float32` and `(5, 5)`.
-
-    Parameters
-    ----------
-    shape : List[tvm.ir.PrimExpr]
-        The shape of the Tensor
-
-    dtype : Optional[str]
-        The content data type.
-    """
-
-    def __init__(self, shape, dtype="float32"):
-        self.__init_handle_by_constructor__(_ffi_api.TensorType, shape, dtype)
-
-    @property
-    def concrete_shape(self):
-        """Get shape of the type as concrete tuple of int.
-
-        Returns
-        -------
-        shape : List[int]
-            The concrete shape of the Type.
-
-        Raises
-        ------
-        TypeError : If the shape is symbolic
-        """
-        return tuple(int(x) for x in self.shape)
-
-    def __str__(self):
-        from tvm.relay import pretty_print  # pylint: disable=import-outside-toplevel
-
-        return pretty_print(self)
diff --git a/python/tvm/ir/transform.py b/python/tvm/ir/transform.py
index b2937acaa781..c00a4de6dabb 100644
--- a/python/tvm/ir/transform.py
+++ b/python/tvm/ir/transform.py
@@ -365,15 +365,13 @@ def module_pass(pass_func=None, opt_level=None, name=None, required=None, tracea
 
     .. code-block:: python
 
-        @relay.transform.module_pass
+        @tvm.ir.transform.module_pass
         class CustomPipeline:
             def __init__(self, enable_fold):
                 self.enable_fold = enable_fold
-                self.cse = relay.transform.EliminateCommonSubexpr()
-                self.const_fold = relay.transform.FoldConstant()
+                self.const_fold = relax.transform.FoldConstant()
 
             def transform_module(self, mod, ctx):
-                mod = self.cse(mod, ctx)
                 if self.enable_fold:
                     mod = self.const_fold(mod, ctx)
                 return mod
@@ -389,15 +387,9 @@ def transform_module(self, mod, ctx):
 
     .. code-block:: python
 
-        @relay.transform.module_pass(opt_level=2)
+        @tvm.ir.transform.module_pass(opt_level=2)
         def transform(mod, ctx):
-            tp = relay.TensorType((10,), "float32")
-            x = relay.var("x", tp)
-            gv = relay.GlobalVar("var")
-            func = relay.Function([x], relay.abs(x))
-            new_mod = tvm.IRModule({gv: func})
-            new_mod.update(mod)
-            return new_mod
+            return relax.transform.FoldConstant()(mod)
 
         module_pass = transform
         assert isinstance(module_pass, transform.ModulePass)
diff --git a/python/tvm/ir/type.py b/python/tvm/ir/type.py
index 8ecf1ffb4a0f..914275415c89 100644
--- a/python/tvm/ir/type.py
+++ b/python/tvm/ir/type.py
@@ -94,10 +94,10 @@ class FuncType(Type):
 
     Parameters
     ----------
-    arg_types : List[tvm.relay.Type]
+    arg_types : List[tvm.ir.Type]
         The argument types
 
-    ret_type : tvm.relay.Type
+    ret_type : tvm.ir.Type
         The return type.
     """
 
diff --git a/python/tvm/meta_schedule/testing/custom_builder_runner.py b/python/tvm/meta_schedule/testing/custom_builder_runner.py
index 0e8ee435f7cf..7e7a3a1d9d9d 100644
--- a/python/tvm/meta_schedule/testing/custom_builder_runner.py
+++ b/python/tvm/meta_schedule/testing/custom_builder_runner.py
@@ -17,7 +17,7 @@
 """Customized builder and runner methods"""
 # pylint: disable=import-outside-toplevel
 
-from typing import TYPE_CHECKING, Dict, List, Optional, Union, Callable
+from typing import TYPE_CHECKING, Dict, Union, Callable
 
 if TYPE_CHECKING:
     import numpy as np  # type: ignore
@@ -25,118 +25,6 @@
     from tvm.meta_schedule.runner import EvaluatorConfig, RPCConfig
     from tvm.runtime import Device, Module, NDArray
     from tvm.target import Target
-    from tvm.runtime.vm import Executable
-
-
-def build_relay(
-    mod: "IRModule",
-    target: "Target",
-    params: Dict[str, "NDArray"],
-) -> "Module":
-    """Build a Relay IRModule
-
-    Parameters
-    ----------
-    mod : IRModule
-        The Relay IRModule to build.
-    target : Target
-        The target to build the module for.
-    params : Dict[str, NDArray]
-        The parameter dict to build the module with.
-
-    Returns
-    -------
-    mod : runtime.Module
-        The built module.
-    """
-    from tvm.relay.build_module import _build_module_no_factory as relay_build
-    from tvm.runtime import Module
-
-    result = relay_build(mod, target=target, target_host=None, params=params)
-    assert isinstance(result, Module)
-    return result
-
-
-def build_relay_with_tensorrt(
-    mod: "IRModule",
-    target: "Target",
-    params: Dict[str, "NDArray"],
-) -> "Module":
-    """Build a Relay IRModule with TensorRT BYOC
-
-    Parameters
-    ----------
-    mod : IRModule
-        The Relay IRModule to build.
-
-    target : Target
-        The target to build the module for.
-
-    params : Dict[str, NDArray]
-        The parameter dict to build the module with.
-
-    Returns
-    -------
-    mod : runtime.Module
-        The built module.
-    """
-    from tvm.ir.transform import PassContext
-    from tvm.relay.build_module import _build_module_no_factory as relay_build
-    from tvm.relay.op.contrib import tensorrt
-    from tvm.runtime import Module
-
-    mod = tensorrt.partition_for_tensorrt(mod, params)
-    with PassContext(opt_level=3):
-        result = relay_build(mod, target=target, target_host=None, params=params)
-    assert isinstance(result, Module)
-    return result
-
-
-def run_with_graph_executor(
-    rt_mod: "Module",
-    device: "Device",
-    evaluator_config: "EvaluatorConfig",
-    repeated_args: List["NDArray"],
-) -> List[float]:
-    """Run a Relay module with GraphExecutor
-
-    Parameters
-    ----------
-    rt_mod : Module
-        The Relay module to run.
-    device : Device
-        The device to run the module on.
-    evaluator_config : EvaluatorConfig
-        The evaluator configuration to run the module with.
-    repeated_args : List[NDArray]
-        The list of repeated arguments to run the module with.
-
-    Returns
-    -------
-    results : List[float]
-        The list of results.
-    """
-    import itertools
-
-    from tvm.contrib.graph_executor import GraphModule
-
-    graph_mod = GraphModule(rt_mod["default"](device))
-    evaluator = graph_mod.module.time_evaluator(
-        func_name="run",
-        dev=device,
-        number=evaluator_config.number,
-        repeat=evaluator_config.repeat,
-        min_repeat_ms=evaluator_config.min_repeat_ms,
-        f_preproc="cache_flush_cpu_non_first_arg"
-        if evaluator_config.enable_cpu_cache_flush
-        else "",
-    )
-    repeated_costs = []
-    for args in repeated_args:
-        profile_result = evaluator(*args)
-        repeated_costs.append(profile_result.results)
-    costs = [float(cost) for cost in itertools.chain.from_iterable(repeated_costs)]
-    return costs
 
 
 def run_module_via_rpc(
@@ -145,7 +33,6 @@ def run_module_via_rpc(
     dev_type: str,
     args: Union[Dict[int, "np.ndarray"], Dict[str, "np.ndarray"]],
     continuation: Callable,
-    backend: Optional[str] = "graph",
 ):
     """Execute a tvm.runtime.Module on RPC remote"""
     # pylint: disable=import-outside-toplevel
@@ -159,15 +46,11 @@ def run_module_via_rpc(
 
     with tempfile.TemporaryDirectory() as tmp_dir:
         filename = os.path.join(tmp_dir, "tvm_tmp_mod." + tar.output_format)
-        if backend == "vm":
-            code, lib = lib.save()
         lib.export_library(filename, fcompile=tar)
         session = rpc_config.connect_server()
         session.upload(filename)
         _, filename = os.path.split(filename)
         rt_mod = session.load_module(filename)
-        if backend == "vm":
-            rt_mod = session.get_function("runtime.Load_Executable")(code, rt_mod)
         dev = session.device(dev_type=dev_type, dev_id=0)
         nd_args = {k: ndarray.array(v, dev) for k, v in args.items()}
         return continuation(rt_mod, dev, nd_args)
diff --git a/python/tvm/meta_schedule/testing/dataset_collect_models.py b/python/tvm/meta_schedule/testing/dataset_collect_models.py
deleted file mode 100644
index 8992f73d2873..000000000000
--- a/python/tvm/meta_schedule/testing/dataset_collect_models.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-docstring
-
-import argparse
-import os
-from typing import List, Tuple
-
-from tqdm import tqdm  # type: ignore
-from tvm.meta_schedule.testing.relay_workload import get_network
-
-
-# pylint: disable=too-many-branches
-def _build_dataset() -> List[Tuple[str, List[int]]]:
-    network_keys = []
-    for name in [
-        "resnet_18",
-        "resnet_50",
-        "mobilenet_v2",
-        "mobilenet_v3",
-        "wide_resnet_50",
-        "resnext_50",
-        "densenet_121",
-        "vgg_16",
-    ]:
-        for batch_size in [1, 4, 8]:
-            for image_size in [224, 240, 256]:
-                network_keys.append((name, [batch_size, 3, image_size, image_size]))
-    # inception-v3
-    for name in ["inception_v3"]:
-        for batch_size in [1, 2, 4]:
-            for image_size in [299]:
-                network_keys.append((name, [batch_size, 3, image_size, image_size]))
-    # resnet3d
-    for name in ["resnet3d_18"]:
-        for batch_size in [1, 2, 4]:
-            for image_size in [112, 128, 144]:
-                network_keys.append((name, [batch_size, 3, image_size, image_size, 16]))
-    # bert
-    for name in ["bert_tiny", "bert_base", "bert_medium", "bert_large"]:
-        for batch_size in [1, 2, 4]:
-            for seq_length in [64, 128, 256]:
-                network_keys.append((name, [batch_size, seq_length]))
-    # dcgan
-    for name in ["dcgan"]:
-        for batch_size in [1, 4, 8]:
-            for image_size in [64]:
-                network_keys.append((name, [batch_size, 3, image_size, image_size]))
-    return network_keys
-
-
-def main():
-    model_cache_dir = args.model_cache_dir
-    try:
-        os.makedirs(model_cache_dir, exist_ok=True)
-    except OSError:
-        print(f"Directory {model_cache_dir} cannot be created successfully.")
-    keys = _build_dataset()
-    for name, input_shape in tqdm(keys):
-        get_network(name=name, input_shape=input_shape, cache_dir=model_cache_dir)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()  # pylint: disable=invalid-name
-    parser.add_argument(
-        "--model_cache_dir",
-        type=str,
-        help="Please provide the full path to the model cache dir.",
-    )
-    args = parser.parse_args()  # pylint: disable=invalid-name
-    main()
diff --git a/python/tvm/meta_schedule/testing/dataset_extract_tasks.py b/python/tvm/meta_schedule/testing/dataset_extract_tasks.py
deleted file mode 100644
index f299227aa399..000000000000
--- a/python/tvm/meta_schedule/testing/dataset_extract_tasks.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-docstring
-
-import argparse
-import glob
-import json
-import os
-from tqdm import tqdm  # type: ignore
-
-import tvm
-from tvm import meta_schedule as ms
-from tvm.ir import save_json
-from tvm.meta_schedule.testing.relay_workload import _load_cache
-from tvm.runtime import load_param_dict
-
-
-def _parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--model_cache_dir", type=str, help="Please provide the full path to the model cache dir."
-    )
-    parser.add_argument(
-        "--task_cache_dir", type=str, help="Please provide the full path to save extracted tasks."
-    )
-    parser.add_argument(
-        "--target", type=str, default="cuda", help="Please specify the target hardware for tuning."
-    )
-    return parser.parse_args()
-
-
-# pylint: disable=too-many-locals
-def extract_and_save_tasks(cache_file):
-    """Extract tuning tasks and cache the nonspatial ones in the given directory.
-
-    Parameters
-    ----------
-    cache_file : str
-        The filename of the cached model.
-
-    Returns
-    -------
-    None
-    """
-
-    mod, params_bytearray, _ = _load_cache(args.model_cache_dir, cache_file)
-    params = load_param_dict(params_bytearray)
-    try:
-        extracted_tasks = ms.relay_integration.extract_tasks(mod, target=args.target, params=params)
-    except tvm.error.TVMError as error:
-        print(str(error))
-        return
-    task_cache_path = os.path.join(
-        args.task_cache_dir, cache_file.split(".")[0] + "_extracted_tasks.json"
-    )
-    is_spatial = tvm.get_global_func("tir.schedule.IsSpatialPrimFunc")
-    with open(task_cache_path, "w", encoding="utf8") as file:
-        for i, task in enumerate(extracted_tasks):
-            subgraph = task.dispatched[0]
-            prim_func = subgraph[subgraph.get_global_vars()[0]]
-            if not is_spatial(prim_func):
-                subgraph_str = save_json(subgraph)
-                json_obj = [task.task_name, json.loads(subgraph_str)]
-                json_str = json.dumps(json_obj)
-                assert "\n" not in json_str, "Failed to generate single line string."
-                if i == len(extracted_tasks) - 1:
-                    file.write(json_str)
-                else:
-                    file.write(json_str + "\n")
-
-
-args = _parse_args()  # pylint: disable=invalid-name
-
-
-def main():
-    if not os.path.isdir(args.model_cache_dir):
-        raise Exception("Please provide a correct model cache dir.")
-    try:
-        os.makedirs(args.task_cache_dir, exist_ok=True)
-    except OSError:
-        print(f"Directory {args.task_cache_dir} cannot be created successfully.")
-
-    paths = glob.glob(os.path.join(args.model_cache_dir, "*.json"))  # pylint: disable=invalid-name
-    for path in tqdm(paths):
-        filename = path.split("/")[-1]
-        extract_and_save_tasks(filename)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/tvm/meta_schedule/testing/dataset_sample_candidates.py b/python/tvm/meta_schedule/testing/dataset_sample_candidates.py
deleted file mode 100644
index 6da11bf5e912..000000000000
--- a/python/tvm/meta_schedule/testing/dataset_sample_candidates.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-docstring
-
-import argparse
-import glob
-import json
-import os
-from typing import List
-
-from tqdm import tqdm  # type: ignore
-import tvm
-from tvm import meta_schedule as ms
-from tvm.ir import load_json
-from tvm.target import Target
-
-
-def _parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--task_cache_dir", type=str, help="Please provide the full path to the extracted tasks."
-    )
-    parser.add_argument(
-        "--candidate_cache_dir",
-        type=str,
-        help="Please provide the full path to save the sampled candidates.",
-    )
-    parser.add_argument(
-        "--target",
-        type=str,
-        default="nvidia/geforce-rtx-3070",
-        help="Please specify the target hardware for tuning.\
-                    Note: for generating dataset, the hardware does not need to be present.",
-    )
-    parser.add_argument(
-        "--init_population_size",
-        type=int,
-        default=256,
-        help="The initial population size used in evolutionary search.",
-    )
-    parser.add_argument(
-        "--num_samples_per_task",
-        type=int,
-        default=400,
-        help="The number of samples to gather per tuning task.",
-    )
-    parser.add_argument(
-        "--num_trials_per_iter",
-        type=int,
-        default=64,
-        help="The number of trials per iteration in evolutionary search.",
-    )
-    parser.add_argument(
-        "--max_trials_per_task",
-        type=int,
-        default=400,
-        help="The maximum number of trials per task in evolutionary search.",
-    )
-    parser.add_argument(
-        "--max_retry_per_task",
-        type=int,
-        default=10,
-        help="The maximum number of retry attempts allowed.",
-    )
-    parser.add_argument(
-        "--file_group",
-        type=int,
-        default=0,
-        help="To enable running multiple scripts in parallel, files [idx * 10 : (idx + 1) * 10]\
-        in the sorted file list from the given directory will be run.",
-    )
-    return parser.parse_args()
-
-
-# pylint: disable=too-many-locals
-def sample_candidates(task, task_name, model_name):
-    """Randomly sample candidates for a task and save the candidates in the given directory.
-
-    Parameters
-    ----------
-    task : IRModule
-        The initial ir module used for generating the search space.
-    task_name : str
-        The name of the task.
-    model_name : str
-        The name of the model.
-
-    Returns
-    -------
-    None
-    """
-    candidate_path = os.path.join(
-        args.candidate_cache_dir, model_name, task_name + "_candidates.json"
-    )
-    workload_path = os.path.join(args.candidate_cache_dir, model_name, task_name + "_workload.json")
-    database = ms.database.JSONDatabase(
-        path_workload=workload_path,
-        path_tuning_record=candidate_path,
-    )
-    sample_init_population = tvm.get_global_func(
-        "meta_schedule.SearchStrategyEvolutionarySearchSampleInitPopulation"
-    )
-    evolve_with_cost_model = tvm.get_global_func(
-        "meta_schedule.SearchStrategyEvolutionarySearchEvolveWithCostModel"
-    )
-    strategy = ms.search_strategy.EvolutionarySearch(init_measured_ratio=0.0)
-    target = Target(args.target)
-    context = ms.TuneContext(
-        mod=task,
-        target=target,
-        space_generator="post-order-apply",
-        search_strategy=strategy,
-        task_name=task_name,
-    )
-    context.initialize()
-    context.pre_tuning(
-        max_trials=args.max_trials_per_task,
-        num_trials_per_iter=args.num_trials_per_iter,
-        design_spaces=context.generate_design_space(),
-        database=database,
-        cost_model=ms.cost_model.RandomModel(),  # type: ignore
-    )
-
-    all_states: List[tvm.tir.Schedule] = []
-    num_retry, itr = 0, 0
-    states = sample_init_population(strategy, args.init_population_size)
-    while len(all_states) < args.num_samples_per_task and num_retry < args.max_retry_per_task:
-        states = evolve_with_cost_model(strategy, states, len(states))
-        all_states += states
-        if len(states) == 0:
-            states = sample_init_population(strategy, args.init_population_size)
-            num_retry += 1
-        else:
-            num_retry = 0
-        print(f"iter: {itr}, number of states sampled: {len(all_states)}")
-        itr += 1
-    all_states = all_states[: args.num_samples_per_task]
-
-    workload = ms.database.Workload(context.mod)
-    database.commit_workload(context.mod)
-    for state in all_states:
-        database.commit_tuning_record(ms.database.TuningRecord(state.trace, workload))
-
-
-args = _parse_args()  # pylint: disable=invalid-name
-
-
-def main():
-    if not os.path.isdir(args.task_cache_dir):
-        raise Exception("Please provide a correct task cache dir.")
-    try:
-        os.makedirs(args.candidate_cache_dir, exist_ok=True)
-    except OSError:
-        print(f"Directory {args.candidate_cache_dir} cannot be created successfully.")
-
-    task_paths = sorted(glob.glob(os.path.join(args.task_cache_dir, "*.json")))[
-        args.file_group * 10 : (args.file_group + 1) * 10
-    ]
-    print(f"Selected models: {task_paths}")
-    for num, task_path in enumerate(task_paths):
-        print(f"Processing model {num} ...")
-        with open(task_path, "rb") as file:
-            tasks = file.readlines()
-        model_name = task_path.split("/")[-1][len("relay-") :][: -len("_extracted_tasks.json")]
-        os.makedirs(os.path.join(args.candidate_cache_dir, model_name), exist_ok=True)
-        for task_str in tqdm(tasks):
-            task_name, task_mod = json.loads(task_str)
-            task_mod = load_json(json.dumps(task_mod))
-            sample_candidates(task_mod, task_name, model_name)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/tvm/meta_schedule/testing/tune_onnx.py b/python/tvm/meta_schedule/testing/tune_onnx.py
deleted file mode 100644
index 2100f0e7c973..000000000000
--- a/python/tvm/meta_schedule/testing/tune_onnx.py
+++ /dev/null
@@ -1,196 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-docstring
-import argparse
-import json
-import logging
-
-import onnx  # type: ignore
-import tvm
-from tvm import meta_schedule as ms
-from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
-from tvm.relay.frontend import from_onnx
-from tvm.support import describe
-from tvm.testing.utils import strtobool
-
-from .tune_utils import create_timer, generate_input_data
-
-
-def _parse_args():
-    args = argparse.ArgumentParser()
-    args.add_argument(
-        "--model-name",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--onnx-path",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--input-shape",
-        type=str,
-        required=True,
-        help='example: `[{"name": "input1", "dtype": "int64", "shape": [1, 1, 8]}]',
-    )
-    args.add_argument(
-        "--target",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--num-trials",
-        type=int,
-        required=True,
-    )
-    args.add_argument(
-        "--rpc-host",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--rpc-port",
-        type=int,
-        required=True,
-    )
-    args.add_argument(
-        "--rpc-key",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--work-dir",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--number",
-        type=int,
-        default=3,
-    )
-    args.add_argument(
-        "--repeat",
-        type=int,
-        default=1,
-    )
-    args.add_argument(
-        "--min-repeat-ms",
-        type=int,
-        default=100,
-    )
-    args.add_argument(
-        "--adaptive-training",
-        type=lambda x: bool(strtobool(x)),
-        help="example: True / False",
-        default=True,
-    )
-    args.add_argument(
-        "--cpu-flush",
-        type=lambda x: bool(strtobool(x)),
-        help="example: True / False",
-        required=True,
-    )
-    args.add_argument(
-        "--backend",
-        type=str,
-        choices=["graph", "vm"],
-        help="example: graph / vm",
-        required=True,
-    )
-    parsed = args.parse_args()
-    parsed.target = tvm.target.Target(parsed.target)
-    parsed.input_shape = json.loads(parsed.input_shape)
-    parsed.rpc_config = ms.runner.RPCConfig(
-        tracker_host=parsed.rpc_host,
-        tracker_port=parsed.rpc_port,
-        tracker_key=parsed.rpc_key,
-        session_timeout_sec=600,
-    )
-    return parsed
-
-
-logging.basicConfig(
-    format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
-)
-logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG)
-ARGS = _parse_args()
-
-
-def main():
-    describe()
-    print(f"Workload: {ARGS.model_name}")
-
-    onnx_model = onnx.load(ARGS.onnx_path)
-    shape_dict = {}
-    for item in ARGS.input_shape:
-        print(f"  input_name : {item['name']}")
-        print(f"  input_shape: {item['shape']}")
-        print(f"  input_dtype: {item['dtype']}")
-        shape_dict[item["name"]] = item["shape"]
-    mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True)
-    input_data = {
-        item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape
-    }
-
-    with ms.Profiler() as profiler:
-        database = ms.relay_integration.tune_relay(
-            mod=mod,
-            target=ARGS.target,
-            params=params,
-            work_dir=ARGS.work_dir,
-            max_trials_global=ARGS.num_trials,
-            num_trials_per_iter=64,
-            runner=ms.runner.RPCRunner(  # type: ignore
-                rpc_config=ARGS.rpc_config,
-                evaluator_config=ms.runner.EvaluatorConfig(
-                    number=ARGS.number,
-                    repeat=ARGS.repeat,
-                    min_repeat_ms=ARGS.min_repeat_ms,
-                    enable_cpu_cache_flush=ARGS.cpu_flush,
-                ),
-                alloc_repeat=1,
-            ),
-            cost_model=ms.cost_model.XGBModel(  # type: ignore
-                extractor=ms.feature_extractor.PerStoreFeature(),
-                adaptive_training=ARGS.adaptive_training,
-            ),
-            strategy=ms.search_strategy.EvolutionarySearch(),
-        )
-        lib = ms.relay_integration.compile_relay(
-            database=database,
-            mod=mod,
-            target=ARGS.target,
-            params=params,
-            backend=ARGS.backend,
-        )
-
-    print("Tuning Time:")
-    print(profiler.table())
-
-    run_module_via_rpc(
-        rpc_config=ARGS.rpc_config,
-        lib=lib,
-        dev_type=ARGS.target.kind.name,
-        args=input_data,
-        continuation=create_timer(ARGS.backend),
-        backend=ARGS.backend,
-    )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/tvm/meta_schedule/testing/tune_relay.py b/python/tvm/meta_schedule/testing/tune_relay.py
deleted file mode 100644
index 98eddf793fce..000000000000
--- a/python/tvm/meta_schedule/testing/tune_relay.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=missing-docstring
-import argparse
-import json
-import logging
-from typing import Dict
-
-import numpy as np  # type: ignore
-import tvm
-from tvm import meta_schedule as ms
-from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
-from tvm.meta_schedule.testing.relay_workload import get_network
-from tvm.meta_schedule.testing.tune_utils import create_timer, generate_input_data
-from tvm.support import describe
-from tvm.testing.utils import strtobool
-
-
-def _parse_args():
-    args = argparse.ArgumentParser()
-    args.add_argument(
-        "--workload",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--input-shape",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--target",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--num-trials",
-        type=int,
-        required=True,
-    )
-    args.add_argument(
-        "--rpc-host",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--rpc-port",
-        type=int,
-        required=True,
-    )
-    args.add_argument(
-        "--rpc-key",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--work-dir",
-        type=str,
-        required=True,
-    )
-    args.add_argument(
-        "--layout",
-        type=str,
-        default=None,
-    )
-    args.add_argument(
-        "--cache-dir",
-        type=str,
-        default=None,
-    )
-    args.add_argument(
-        "--number",
-        type=int,
-        default=3,
-    )
-    args.add_argument(
-        "--repeat",
-        type=int,
-        default=1,
-    )
-    args.add_argument(
-        "--min-repeat-ms",
-        type=int,
-        default=100,
-    )
-    args.add_argument(
-        "--adaptive-training",
-        type=lambda x: bool(strtobool(x)),
-        help="example: True / False",
-        default=True,
-    )
-    args.add_argument(
-        "--cpu-flush",
-        type=lambda x: bool(strtobool(x)),
-        help="example: True / False",
-        required=True,
-    )
-    args.add_argument(
-        "--backend",
-        type=str,
-        choices=["graph", "vm"],
-        help="example: graph / vm",
-        required=True,
-    )
-    parsed = args.parse_args()
-    parsed.target = tvm.target.Target(parsed.target)
-    parsed.input_shape = json.loads(parsed.input_shape)
-    parsed.rpc_config = ms.runner.RPCConfig(
-        tracker_host=parsed.rpc_host,
-        tracker_port=parsed.rpc_port,
-        tracker_key=parsed.rpc_key,
-        session_timeout_sec=600,
-    )
-    return parsed
-
-
-logging.basicConfig(
-    format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
-)
-logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG)
-ARGS = _parse_args()
-
-
-def main():
-    describe()
-    print(f"Workload: {ARGS.workload}")
-
-    mod, params, (input_name, input_shape, input_dtype) = get_network(
-        ARGS.workload,
-        ARGS.input_shape,
-        layout=ARGS.layout,
-        cache_dir=ARGS.cache_dir,
-    )
-    input_info = [
-        {
-            "name": input_name,
-            "shape": input_shape,
-            "dtype": input_dtype,
-        },
-    ]
-    input_data: Dict[str, np.ndarray] = {
-        item["name"]: generate_input_data(  # type: ignore
-            item["shape"],  # type: ignore
-            item["dtype"],  # type: ignore
-        )
-        for item in input_info
-    }
-    for item in input_info:
-        print(f"  input_name : {item['name']}")
-        print(f"  input_shape: {item['shape']}")
-        print(f"  input_dtype: {item['dtype']}")
-
-    with ms.Profiler() as profiler:
-        database = ms.relay_integration.tune_relay(
-            mod=mod,
-            target=ARGS.target,
-            work_dir=ARGS.work_dir,
-            max_trials_global=ARGS.num_trials,
-            num_trials_per_iter=64,
-            params=params,
-            runner=ms.runner.RPCRunner(  # type: ignore
-                rpc_config=ARGS.rpc_config,
-                evaluator_config=ms.runner.EvaluatorConfig(
-                    number=ARGS.number,
-                    repeat=ARGS.repeat,
-                    min_repeat_ms=ARGS.min_repeat_ms,
-                    enable_cpu_cache_flush=ARGS.cpu_flush,
-                ),
-                alloc_repeat=1,
-            ),
-            cost_model=ms.cost_model.XGBModel(  # type: ignore
-                extractor=ms.feature_extractor.PerStoreFeature(),
-                adaptive_training=ARGS.adaptive_training,
-            ),
-            strategy=ms.search_strategy.EvolutionarySearch(),
-        )
-        lib = ms.relay_integration.compile_relay(
-            database=database,
-            mod=mod,
-            target=ARGS.target,
-            params=params,
-            backend=ARGS.backend,
-        )
-
-    print("Tuning Time:")
-    print(profiler.table())
-
-    run_module_via_rpc(
-        rpc_config=ARGS.rpc_config,
-        lib=lib,
-        dev_type=ARGS.target.kind.name,
-        args=input_data,
-        continuation=create_timer(ARGS.backend),
-        backend=ARGS.backend,
-    )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/tvm/meta_schedule/testing/tune_utils.py b/python/tvm/meta_schedule/testing/tune_utils.py
index 96de3a971155..cb97b221b281 100644
--- a/python/tvm/meta_schedule/testing/tune_utils.py
+++ b/python/tvm/meta_schedule/testing/tune_utils.py
@@ -15,9 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """Testing utility functions in meta schedule"""
-from typing import Callable, Optional, Union, List, Dict
-from statistics import median
-import json
+from typing import Callable, Optional, List, Dict
 import numpy as np  # type: ignore
 
 import tvm
@@ -66,131 +64,6 @@ def generate_input_data(
     raise ValueError("Unsupported input datatype!")
 
 
-def create_timer(backend: str) -> Callable:
-    """Create a function to run and benchmark the performance of whole given runtime module,
-    or Executable in relay vm.
-
-    Parameters
-    ----------
-    backend : str
-        The backend to use, graph / vm.
-
-    Returns
-    -------
-    func : Callable
-        The function to benchmark the workload.
-    """
-
-    def f_timer(
-        rt_mod: Union[tvm.runtime.Module, tvm.runtime.vm.Executable],
-        dev: tvm.runtime.Device,
-        input_data: Dict[str, NDArray],
-    ) -> None:
-        """Run and benchmark the given runtime module, print out the result.
-
-        Parameters
-        ----------
-        rt_mod : Union[tvm.runtime.Module, tvm.runtime.vm.Executable]
-            The runtime module or vm executable.
-        dev : tvm.runtime.Device
-            The device type to run workload.
-        input_data : Dict[str, np.ndarray]
-            The input data as a dictionary.
-        """
-        from tvm.contrib.graph_executor import GraphModule  # pylint:disable=import-outside-toplevel
-        from tvm.runtime.vm import VirtualMachine  # pylint:disable=import-outside-toplevel
-
-        try:
-            if backend == "vm":
-                vm = VirtualMachine(rt_mod, dev)  # pylint: disable=invalid-name
-                ftimer = vm.benchmark(
-                    dev, min_repeat_ms=500, repeat=5, number=1, end_to_end=False, **input_data
-                )
-            elif backend == "graph":
-                mod = GraphModule(rt_mod["default"](dev))
-                for input_name, input_value in input_data.items():
-                    mod.set_input(input_name, input_value)
-                ftimer = mod.module.time_evaluator(
-                    "run", dev, min_repeat_ms=500, repeat=5, number=1
-                )()
-            else:
-                raise ValueError(f"Backend {backend} not supported in f_timer!")
-
-            results = list(np.array(ftimer.results) * 1000.0)  # type: ignore
-
-            print("Running time in time_evaluator: ", results)
-            print("-------------------------------")
-            print(f"    Min (ms) : {min(results)}")
-            print(f"    Max (ms) : {max(results)}")
-            print(f" Median (ms) : {median(results)}")
-            print(f"Average (ms) : {sum(results) / len(results)}")
-        except Exception as exc:  # pylint: disable=broad-except
-            print(
-                f"Run module f_timer via RPC failed, exception: {exc}",
-            )
-
-    return f_timer
-
-
-def create_time_per_layer(graph: str) -> Callable:
-    """Create a function to run and benchmark the per-layer performance of given runtime module,
-    given the graph output of the module from graph compiler.
-
-    Parameters
-    ----------
-    graph : str
-        The json format graph output of the module from graph compiler.
-
-    Returns
-    -------
-    func : Callable
-        The function using the json format graph.
-    """
-
-    def f_time_per_layer(
-        rt_mod: tvm.runtime.Module,
-        dev: tvm.runtime.Device,
-        input_data: Dict[str, NDArray],
-    ) -> None:
-        """Run and benchmark the per-layer performance of given runtime module,
-        print out the result.
-
-        Parameters
-        ----------
-        rt_mod : tvm.runtime.Module
-            The runtime module.
-        dev : tvm.runtime.Device
-            The device type to run workload.
-        input_data : Dict[str, np.ndarray]
-            The input data as a dictionary.
-        """
-        # pylint:disable=import-outside-toplevel
-        from tvm.contrib.debugger.debug_executor import create
-
-        # pylint:enable=import-outside-toplevel
-
-        try:
-            mod = create(graph, rt_mod, dev)
-            for input_name, input_value in input_data.items():
-                mod.set_input(input_name, input_value)
-            graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]]
-            graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)
-
-            print("Running time of each layer:")
-            print("---------------------------")
-            print("|graph_nodes| = ", len(graph_nodes))
-            print("|graph_time| = ", len(graph_time))
-
-            for k, v in zip(graph_nodes, graph_time):
-                print(k, float(v) * 1e6, "us")
-        except Exception as exc:  # pylint: disable=broad-except
-            print(
-                f"Run module f_time_per_layer via RPC failed, exception: {exc}",
-            )
-
-    return f_time_per_layer
-
-
 def create_calculator(backend: str) -> Callable:
     """Create a function to fetch the computing result of running the given runtime module.
 
diff --git a/python/tvm/relax/__init__.py b/python/tvm/relax/__init__.py
index dd3245441b3e..471a4ba9d337 100644
--- a/python/tvm/relax/__init__.py
+++ b/python/tvm/relax/__init__.py
@@ -55,7 +55,7 @@
     Type,
     ObjectType,
     ShapeType,
-    DynTensorType,
+    TensorType,
     TupleType,
     FuncType,
     PackedFuncType,
diff --git a/python/tvm/relax/analysis/analysis.py b/python/tvm/relax/analysis/analysis.py
index edcf02bf6aeb..af0772ea6cbe 100644
--- a/python/tvm/relax/analysis/analysis.py
+++ b/python/tvm/relax/analysis/analysis.py
@@ -334,7 +334,7 @@ def post_order_visit(expr, fvisit):
 
     Parameters
     ----------
-    expr : tvm.relay.Expr
+    expr : tvm.relax.Expr
         The input expression.
 
     fvisit : function
diff --git a/python/tvm/relax/expr.py b/python/tvm/relax/expr.py
index 190df4286056..58845ce48986 100644
--- a/python/tvm/relax/expr.py
+++ b/python/tvm/relax/expr.py
@@ -38,7 +38,7 @@
 # It is a workaround for mypy: https://github.com/python/mypy/issues/7866#issuecomment-549454370
 # This feature is not supported until python 3.10:
 # https://docs.python.org/3.10/whatsnew/3.10.html#pep-613-typealias
-Expr = Union[tvm.ir.RelayExpr]
+Expr = Union[tvm.ir.RelaxExpr]
 Type = Union[tvm.ir.Type]  # pylint: disable=invalid-name
 GlobalVar = Union[tvm.ir.GlobalVar]
 
@@ -827,9 +827,11 @@ def __init__(
                 )
 
         self.__init_handle_by_constructor__(
-            _ffi_api.DataflowVar  # type: ignore
-            if isinstance(name_hint, str)
-            else _ffi_api.DataflowVarFromId,  # type: ignore
+            (
+                _ffi_api.DataflowVar  # type: ignore
+                if isinstance(name_hint, str)
+                else _ffi_api.DataflowVarFromId  # type: ignore
+            ),
             name_hint,
             struct_info,
             span,
diff --git a/python/tvm/relax/frontend/onnx/onnx_frontend.py b/python/tvm/relax/frontend/onnx/onnx_frontend.py
index e1f09233bcff..dd4b8a425425 100644
--- a/python/tvm/relax/frontend/onnx/onnx_frontend.py
+++ b/python/tvm/relax/frontend/onnx/onnx_frontend.py
@@ -2426,7 +2426,7 @@ def _impl_v9(cls, bb, inputs, attr, params):
             total_output_shape = output_shape
 
         elif pads is not None:
-            # Get pads in the proper format for relay.
+            # Get pads in the proper format
             pads = _np.concatenate([[0, 0, 0, 0], list(pads)], axis=0)
             pads = _np.reshape(pads, [-1, 2])
             # Compute the total padding per axis.
diff --git a/python/tvm/relax/frontend/torch/fx_translator.py b/python/tvm/relax/frontend/torch/fx_translator.py
index 4c06a3e76f8e..724bb3fc2094 100644
--- a/python/tvm/relax/frontend/torch/fx_translator.py
+++ b/python/tvm/relax/frontend/torch/fx_translator.py
@@ -290,7 +290,6 @@ def _interpolate(self, node: fx.Node) -> relax.Var:
         #   input, size=None, scale_factor=None, mode='nearest', align_corners=None,
         #   recompute_scale_factor=None, antialias=False)
         # (TODO) this is a temporary implementation for interpolate that only considers NCHW layout
-        # it basically replicates the implementation in tvm.relay.frontend.pytorch
         data = self.env[node.args[0]]
         size = (
             node.args[1]
diff --git a/python/tvm/relax/op/qdq.py b/python/tvm/relax/op/qdq.py
index 6842e9e378fb..6231b5f9b934 100644
--- a/python/tvm/relax/op/qdq.py
+++ b/python/tvm/relax/op/qdq.py
@@ -35,7 +35,7 @@ def quantize(data: Expr, scale: Expr, zero_point: Expr, axis: int = -1, out_dtyp
     scale : tvm.relax.Expr
         The output scale.
 
-    zero_point : tvm.relay.Expr
+    zero_point : tvm.relax.Expr
         The output zero_point.
 
     axis : int
@@ -70,7 +70,7 @@ def dequantize(
     scale : tvm.relax.Expr
         The input scale.
 
-    zero_point : tvm.relay.Expr
+    zero_point : tvm.relax.Expr
         The input zero_point.
 
     axis : int
diff --git a/python/tvm/relax/testing/ast_printer.py b/python/tvm/relax/testing/ast_printer.py
index 4c670bbe74b2..40e617b38f1a 100644
--- a/python/tvm/relax/testing/ast_printer.py
+++ b/python/tvm/relax/testing/ast_printer.py
@@ -248,13 +248,13 @@ def visit_type_(self, type_node: relax.Type) -> str:
             return self.build_ast_node("PackedFuncType")
         if isinstance(type_node, tvm.ir.PrimType):
             return self.build_ast_node("PrimType", dtype=type_node.dtype)
-        if isinstance(type_node, relax.DynTensorType):
+        if isinstance(type_node, relax.TensorType):
             fields = {}
             if type_node.ndim is not None:
                 fields["ndim"] = str(type_node.ndim)
             if type_node.dtype != "":
                 fields["dtype"] = type_node.dtype
-            return self.build_ast_node("DynTensorType", **fields)
+            return self.build_ast_node("TensorType", **fields)
         if isinstance(type_node, relax.TupleType):
             return self.build_ast_node(
                 "TupleType", fields=self.build_list(map(self.visit_type_, type_node.fields))
diff --git a/python/tvm/relax/ty.py b/python/tvm/relax/ty.py
index 05492d6a9c34..b0afb069435a 100644
--- a/python/tvm/relax/ty.py
+++ b/python/tvm/relax/ty.py
@@ -17,7 +17,7 @@
 # pylint: disable=invalid-name, unused-import
 """The type nodes of the Relax language."""
 import tvm._ffi
-from tvm.ir import Type, TensorType, TupleType, FuncType, Span
+from tvm.ir import Type, TupleType, FuncType, Span
 
 from . import _ffi_api
 
@@ -47,7 +47,7 @@ def __init__(self, span: Span = None) -> None:
 
 
 @tvm._ffi.register_object("relax.DynTensorType")
-class DynTensorType(Type):
+class TensorType(Type):
     """A dynamic tensor type in Relax.
 
     This is the type assigned to tensors with a known dtype and unknown shape.
@@ -62,9 +62,7 @@ class DynTensorType(Type):
     """
 
     def __init__(self, ndim=-1, dtype="float32", span: Span = None) -> None:
-        self.__init_handle_by_constructor__(
-            _ffi_api.DynTensorType, ndim, dtype, span  # type: ignore
-        )
+        self.__init_handle_by_constructor__(_ffi_api.TensorType, ndim, dtype, span)  # type: ignore
 
 
 @tvm._ffi.register_object("relax.PackedFuncType")
diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index 9eb0c23db439..cc1b8fb1d727 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -1022,6 +1022,7 @@ def _aprofile_aem_fvp_compile_time_check():
     compile_time_check=_aprofile_aem_fvp_compile_time_check,
 )
 
+
 # check cpu features
 def _has_cpu_feat(features):
     cpu = codegen.llvm_get_system_cpu()
@@ -2182,25 +2183,3 @@ def test_compare(self, before, expected, transform):
                 f"or an instance of `tvm.tir.PrimFunc`.  "
                 f"Instead, received {type(expected)}."
             )
-
-
-class _control_span_filling:
-    def __init__(self, on=True):
-        self._on = on
-        self._pass_ctx = tvm.transform.PassContext(config={"relay.frontend.fill_span": self._on})
-
-    def __enter__(self):
-        self._pass_ctx.__enter__()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self._pass_ctx.__exit__(exc_type, exc_val, exc_tb)
-
-
-class enable_span_filling(_control_span_filling):
-    def __init__(self):
-        super().__init__()
-
-
-class disable_span_filling(_control_span_filling):
-    def __init__(self):
-        super().__init__(on=False)
diff --git a/python/tvm/tir/analysis/analysis.py b/python/tvm/tir/analysis/analysis.py
index e98c176dd093..72f18acca500 100644
--- a/python/tvm/tir/analysis/analysis.py
+++ b/python/tvm/tir/analysis/analysis.py
@@ -19,7 +19,6 @@
 from typing import Dict, List, Optional, Union
 
 import tvm
-from tvm import Object
 from tvm.ir import IRModule
 from tvm.tir.expr import Var
 from tvm.tir.stmt import Block, BufferRegion, PrimExpr
@@ -165,7 +164,7 @@ def get_block_read_write_region(
 
 
 def calculate_allocated_bytes(
-    func_or_mod: Union[PrimFunc, IRModule]
+    func_or_mod: Union[PrimFunc, IRModule],
 ) -> Union[Dict[str, int], Dict[str, Dict[str, int]]]:
     """Calculate allocated memory per memory scope required by TIR PrimFuncs.
 
@@ -248,34 +247,6 @@ def undefined_vars(node: Union[Stmt, PrimExpr], defs: Optional[List[Var]] = None
     return _ffi_api.UndefinedVars(node, defs)  # type: ignore # pylint: disable=no-member
 
 
-def get_prim_func_arg_and_result_memory_constraints(
-    func: PrimFunc, relay_func_type: Object
-) -> List[str]:
-    """Returns the memory (aka storage) scope constraints for all the arguments and result
-    of func. However the result will be w.r.t. the func's representation as a Relay Function
-    of relay_func_type before lowering and conversion to DPS.
-
-    Visible for testing.
-
-    Parameters
-    ----------
-    func: tvm.tir.PrimFunc
-        The function to retrieve constraints from.
-
-    relay_func_type: tvm.relay.FuncType
-        The type of the Relay Function from which the func was derived.
-
-    Returns
-    -------
-    result: List[AnyStr]
-        Memory scope constraints for funcs args and result in Relay form. The empty string
-        denotes 'no constraint'.
-    """
-    return _ffi_api.GetPrimFuncArgAndResultMemoryConstraints(  # type: ignore # pylint: disable=no-member
-        func, relay_func_type
-    )
-
-
 def verify_well_formed(obj: Union[PrimFunc, IRModule], assert_mode: bool = True) -> bool:
     """Verify if the given TIR is well-formed. The verification includes:
         - Check if expressions not contain vars that is defined outside the block.
diff --git a/python/tvm/topi/signal.py b/python/tvm/topi/signal.py
index 64a804a851ab..162b9a735ebd 100644
--- a/python/tvm/topi/signal.py
+++ b/python/tvm/topi/signal.py
@@ -35,7 +35,7 @@ def stft(
     This gives frequency components of the signal as they change over time.
     Parameters
     ----------
-    data : relay.Expr
+    data : te.Tensor
         Either a 1-D tensor or a 2-D batch tensor.
     n_fft : int
         The size of Fourier transform
@@ -43,7 +43,7 @@ def stft(
         The distance between neighboring sliding window frames
     win_length : int
         The size of window frame and STFT filter
-    window : relay.Expr
+    window : te.Tensor
         A 1-D tensor window frame
     normalized : bool
         Whether to return the normalized STFT results
@@ -51,7 +51,7 @@ def stft(
         Whether to return onesided result or fill with conjugate symmetry
     Returns
     -------
-    output : relay.Expr
+    output : te.Tensor
         Tensor containing the STFT result
     Examples
     --------
@@ -60,7 +60,7 @@ def stft(
         data = [1, 2, 3, 4, 5, 6]
         window = [4, 3, 2]
         [n_fft, hop_length, win_length, normalized, onesided] = [3, 3, 3, False, True]
-        relay.stft(data, n_fft, hop_length, win_length, window, normalized, onesided)
+        topi.stft(data, n_fft, hop_length, win_length, window, normalized, onesided)
         -> [[[15.0000,  0.0000], [34.0000,  0.0000]], [[ 4.5000,  0.8660], [ 1.0000, -1.7321]]]
     """
 
@@ -136,10 +136,10 @@ def dft(
 
     Parameters
     ----------
-    re_data : relay.Expr
+    re_data : te.Tensor
         N-D tensor, real part of the input signal.
 
-    im_data : relay.Expr
+    im_data : te.Tensor
         N-D tensor, imaginary part of the input signal.
         If the signal is real, then the values of this tensor are zeros.
 
@@ -148,9 +148,9 @@ def dft(
 
     Returns
     -------
-    re_output : relay.Expr
+    re_output : te.Tensor
         The Fourier Transform of the input (Real part).
-    im_output : relay.Expr
+    im_output : te.Tensor
         The Fourier Transform of the input (Imaginary part).
     """
 
diff --git a/python/tvm/topi/sparse_reshape.py b/python/tvm/topi/sparse_reshape.py
index b25bd854a7f9..6bf3857e2e49 100644
--- a/python/tvm/topi/sparse_reshape.py
+++ b/python/tvm/topi/sparse_reshape.py
@@ -32,19 +32,19 @@ def sparse_reshape(
 
     Parameters
     ----------
-    sparse_indices : relay.Expr
+    sparse_indices : te.Tensor
         A 2-D tensor[N, n_dim] of integers containing location of sparse values, where N is the
         number of sparse values and n_dim is the number of dimensions of the dense_shape
 
-    prev_shape : relay.Expr
+    prev_shape : te.Tensor
         A 1-D tensor containing the previous shape of the dense tensor
 
-    new_shape : relay.Expr
+    new_shape : te.Expr
         A 1-D tensor containing the new shape of the dense tensor
 
     Returns
     -------
-    result: relay.Expr
+    result: te.Tensor
         Output tensor.
 
     Examples
@@ -58,7 +58,7 @@ def sparse_reshape(
                             [1, 2, 3]]
         prev_shape = [2, 3, 4]
         new_shape = [9, -1]
-        new_sparse_indices, new_shape = relay.sparse_reshape(
+        new_sparse_indices, new_shape = topi.sparse_reshape(
             sparse_indices, prev_shape, new_shape)
         new_sparse_indices = [[0, 0],
                               [0, 1],
diff --git a/python/tvm/topi/testing/one_hot.py b/python/tvm/topi/testing/one_hot.py
index 0c4b0600fa2a..4b9c3432da4c 100644
--- a/python/tvm/topi/testing/one_hot.py
+++ b/python/tvm/topi/testing/one_hot.py
@@ -48,7 +48,7 @@ def one_hot(indices, on_value, off_value, depth, axis, dtype):
 
     Returns
     -------
-    ret : relay.Expr
+    ret : tvm.te.Tensor
         The one-hot tensor.
     """
     oshape = []
diff --git a/python/tvm/topi/transform.py b/python/tvm/topi/transform.py
index 6101c2e57d21..b8605aa58a2e 100644
--- a/python/tvm/topi/transform.py
+++ b/python/tvm/topi/transform.py
@@ -793,12 +793,12 @@ def one_hot(indices, on_value, off_value, depth, axis, dtype):
     axis : int
         Axis to fill.
 
-    dtype : relay.DataType
+    dtype : str
         Data type of the output tensor.
 
     Returns
     -------
-    ret : relay.Expr
+    ret : tvm.te.Tensor
         The one-hot tensor.
 
     Examples
@@ -807,7 +807,7 @@ def one_hot(indices, on_value, off_value, depth, axis, dtype):
 
         indices = [0, 1, 2]
 
-        relay.one_hot(indices, 3) =
+        topi.one_hot(indices, 3) =
             [[1, 0, 0],
              [0, 1, 0],
              [0, 0, 1]]
@@ -823,15 +823,15 @@ def unravel_index(indices, shape):
 
     Parameters
     ----------
-    indices : relay.Expr
+    indices : tvm.te.Tensor
         An integer array containing indices.
 
-    shape : relay.Expr
+    shape : tvm.te.Tensor
         The shape of the array.
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.te.Tensor
         The tuple of coordinate arrays.
     """
 
@@ -874,10 +874,10 @@ def matrix_set_diag(data, diagonal, k=0, align="RIGHT_LEFT"):
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.te.Tensor
         Input Tensor.
 
-    diagonal : relay.Expr
+    diagonal : tvm.te.Tensor
         Values to be filled in the diagonal.
 
     k : int or tuple of int, optional
@@ -897,7 +897,7 @@ def matrix_set_diag(data, diagonal, k=0, align="RIGHT_LEFT"):
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.te.Tensor
         New tensor with given diagonal values.
 
     Examples
@@ -964,7 +964,7 @@ def sliding_window(data, axis, window_shape, strides):
 
     Parameters
     ----------
-    data : relay.Expr
+    data : tvm.te.Tensor
         The input data to the operator.
 
     axis : int
@@ -983,7 +983,7 @@ def sliding_window(data, axis, window_shape, strides):
 
     Returns
     -------
-    result : relay.Expr
+    result : tvm.te.Tensor
         The resulting tensor.
     """
     return cpp.sliding_window(data, axis, window_shape, strides)
@@ -1011,7 +1011,7 @@ def trilu(data, k, upper):
 
     Returns
     -------
-    ret : relay.Expr
+    ret : tvm.te.Tensor
         The new tensor with appropriate diagonals set to zero.
 
     Examples
@@ -1022,7 +1022,7 @@ def trilu(data, k, upper):
              [3, 4, 5],
              [6, 7, 8]]
 
-        relay.trilu(x, True, 0) =
+        topi.trilu(x, 0, True) =
             [[0, 1, 2],
              [0, 4, 5],
              [0, 0, 8]]
diff --git a/src/contrib/msc/core/ir/graph_builder.cc b/src/contrib/msc/core/ir/graph_builder.cc
index 27115cb13065..1cc0c4af6a3b 100644
--- a/src/contrib/msc/core/ir/graph_builder.cc
+++ b/src/contrib/msc/core/ir/graph_builder.cc
@@ -844,7 +844,7 @@ void RelaxWeightsExtractor::VisitExpr_(const relax::CallNode* op) {
 }
 
 void RelayFuncAttrGetter::VisitExpr_(const relay::CallNode* op) {
-  RelayExprVisitor::VisitExpr_(op);
+  RelaxExprVisitor::VisitExpr_(op);
   if (op->attrs.defined()) {
     Map<String, String> attrs;
     AttrGetter getter(&attrs);
@@ -1107,7 +1107,7 @@ void RelayGraphBuilder::VisitExpr_(const relay::FunctionNode* op) {
   if (name_opt.defined()) {
     StartFuncScope(SpanUtils::GetAttr(op->span, msc_attr::kName));
   }
-  RelayExprVisitor::VisitExpr_(op);
+  RelaxExprVisitor::VisitExpr_(op);
   if (HasFuncScope()) {
     AddNode(GetRef<relay::Function>(op));
     EndFuncScope();
@@ -1120,7 +1120,7 @@ void RelayGraphBuilder::VisitExpr_(const relay::CallNode* op) {
     if (name_opt.defined()) {
       for (size_t i = 0; i < op->args.size(); i++) {
         if (!expr_tensor_map_.count(op->args[i])) {
-          RelayExprVisitor::VisitExpr(op->args[i]);
+          RelaxExprVisitor::VisitExpr(op->args[i]);
         }
         ICHECK(expr_tensor_map_.count(op->args[i]))
             << "Can not find argument " << relay::PrettyPrint(op->args[i]);
@@ -1128,7 +1128,7 @@ void RelayGraphBuilder::VisitExpr_(const relay::CallNode* op) {
       }
     }
   }
-  RelayExprVisitor::VisitExpr_(op);
+  RelaxExprVisitor::VisitExpr_(op);
   if (!HasFuncScope() && op->op->IsInstance<OpNode>()) {
     try {
       AddNode(GetRef<relay::Call>(op));
@@ -1144,12 +1144,12 @@ void RelayGraphBuilder::VisitExpr_(const relay::CallNode* op) {
 }
 
 void RelayGraphBuilder::VisitExpr_(const relay::TupleNode* val) {
-  RelayExprVisitor::VisitExpr_(val);
+  RelaxExprVisitor::VisitExpr_(val);
   AddNode(GetRef<relay::Tuple>(val));
 }
 
 void RelayGraphBuilder::VisitExpr_(const relay::TupleGetItemNode* val) {
-  RelayExprVisitor::VisitExpr_(val);
+  RelaxExprVisitor::VisitExpr_(val);
   AddNode(GetRef<relay::TupleGetItem>(val));
 }
 
diff --git a/src/contrib/msc/core/ir/graph_builder.h b/src/contrib/msc/core/ir/graph_builder.h
index 269a8a213ce8..9fd855455c1e 100644
--- a/src/contrib/msc/core/ir/graph_builder.h
+++ b/src/contrib/msc/core/ir/graph_builder.h
@@ -48,9 +48,9 @@ namespace tvm {
 namespace contrib {
 namespace msc {
 
-using Expr = tvm::RelayExpr;
+using Expr = tvm::RelaxExpr;
 using RelaxExprVisitor = tvm::relax::ExprVisitor;
-using RelayExprVisitor = tvm::relay::ExprVisitor;
+using RelaxExprVisitor = tvm::relay::ExprVisitor;
 
 using tvm::runtime::NDArray;
 
@@ -348,7 +348,7 @@ class RelaxWeightsExtractor : public RelaxExprVisitor {
   IRModule ref_module_;
 };
 
-class RelayFuncAttrGetter : public RelayExprVisitor {
+class RelayFuncAttrGetter : public RelaxExprVisitor {
  public:
   /*! \brief Get the attributes as Map<String, String>*/
   Map<String, String> GetAttrs(const Expr& expr) {
@@ -381,7 +381,7 @@ class RelayFuncScope {
   Array<String> func_weights_;
 };
 
-class RelayGraphBuilder : public RelayExprVisitor {
+class RelayGraphBuilder : public RelaxExprVisitor {
  public:
   /*!
    * \brief The constructor of RelayGraphBuilder
@@ -391,7 +391,7 @@ class RelayGraphBuilder : public RelayExprVisitor {
    */
   explicit RelayGraphBuilder(const IRModule& ref_module, const String& name,
                              const std::string& options = "")
-      : RelayExprVisitor() {
+      : RelaxExprVisitor() {
     ref_module_ = ref_module;
     if (options.size() > 0) {
       std::istringstream is(options);
@@ -444,7 +444,7 @@ class RelayGraphBuilder : public RelayExprVisitor {
   std::stack<RelayFuncScope> func_scopes_;
 };
 
-class RelayWeightsExtractor : public RelayExprVisitor {
+class RelayWeightsExtractor : public RelaxExprVisitor {
  public:
   /*! \brief Visit the constant and save weights*/
   Map<MSCTensor, NDArray> GetWeights(const relay::Function& func);
diff --git a/src/contrib/msc/core/transform/layout_utils.h b/src/contrib/msc/core/transform/layout_utils.h
index e7781a95a8f7..787c73cc8404 100644
--- a/src/contrib/msc/core/transform/layout_utils.h
+++ b/src/contrib/msc/core/transform/layout_utils.h
@@ -37,7 +37,7 @@ namespace tvm {
 namespace contrib {
 namespace msc {
 
-using Expr = tvm::RelayExpr;
+using Expr = tvm::RelaxExpr;
 using namespace tvm::relax;
 
 /*!
diff --git a/src/contrib/msc/core/transform/rewrite_utils.h b/src/contrib/msc/core/transform/rewrite_utils.h
index 2693a6ccd2eb..307581b274ec 100644
--- a/src/contrib/msc/core/transform/rewrite_utils.h
+++ b/src/contrib/msc/core/transform/rewrite_utils.h
@@ -37,7 +37,7 @@ namespace tvm {
 namespace contrib {
 namespace msc {
 
-using Expr = tvm::RelayExpr;
+using Expr = tvm::RelaxExpr;
 using namespace tvm::relax;
 
 /*!
diff --git a/src/contrib/msc/core/transform/set_expr_name.cc b/src/contrib/msc/core/transform/set_expr_name.cc
index 163d86833593..0418513c145a 100644
--- a/src/contrib/msc/core/transform/set_expr_name.cc
+++ b/src/contrib/msc/core/transform/set_expr_name.cc
@@ -339,9 +339,9 @@ namespace relay {
 /*!
  * \brief Name setter for Relay
  */
-class RelayExprNameSetter : public ExprVisitor {
+class RelaxExprNameSetter : public ExprVisitor {
  public:
-  explicit RelayExprNameSetter(const IRModule& ref_module) : ref_module_(ref_module) {}
+  explicit RelaxExprNameSetter(const IRModule& ref_module) : ref_module_(ref_module) {}
 
   void VisitExpr_(const ConstantNode* op) final {
     ExprVisitor::VisitExpr_(op);
@@ -460,16 +460,16 @@ class RelayExprNameSetter : public ExprVisitor {
   IRModule ref_module_;
 };  // class ExprNameSetter
 
-void SetRelayExprName(const IRModule& ref_module, const Expr& e) {
-  RelayExprNameSetter(ref_module).VisitExpr(e);
+void SetRelaxExprName(const IRModule& ref_module, const Expr& e) {
+  RelaxExprNameSetter(ref_module).VisitExpr(e);
 }
 
 /*!
  * \brief Name binder for Relay
  */
-class RelayExprNameBinder : public ExprVisitor {
+class RelaxExprNameBinder : public ExprVisitor {
  public:
-  explicit RelayExprNameBinder(const String& name_key, const String& seperator)
+  explicit RelaxExprNameBinder(const String& name_key, const String& seperator)
       : name_key_(name_key), seperator_(seperator) {}
 
   void VisitExpr_(const ConstantNode* op) final {
@@ -523,33 +523,33 @@ class RelayExprNameBinder : public ExprVisitor {
   String seperator_;
 };  // class ExprNameBinder
 
-void BindRelayExprName(const Expr& e, const String& name_key, const String& seperator) {
-  RelayExprNameBinder(name_key, seperator).VisitExpr(e);
+void BindRelaxExprName(const Expr& e, const String& name_key, const String& seperator) {
+  RelaxExprNameBinder(name_key, seperator).VisitExpr(e);
 }
 
 namespace transform {
 
-Pass SetRelayExprName(const String& entry_name) {
+Pass SetRelaxExprName(const String& entry_name) {
   runtime::TypedPackedFunc<IRModule(IRModule, PassContext)> pass_func = [=](IRModule m,
                                                                             PassContext pc) {
-    relay::SetRelayExprName(m, m->Lookup(entry_name));
+    relay::SetRelaxExprName(m, m->Lookup(entry_name));
     return m;
   };
-  return CreateModulePass(pass_func, 0, "SetRelayExprName", {});
+  return CreateModulePass(pass_func, 0, "SetRelaxExprName", {});
 }
 
-TVM_REGISTER_GLOBAL("relay._transform.SetRelayExprName").set_body_typed(SetRelayExprName);
+TVM_REGISTER_GLOBAL("relay._transform.SetRelaxExprName").set_body_typed(SetRelaxExprName);
 
-Pass BindRelayExprName(const String& name_key, const String& seperator, const String& entry_name) {
+Pass BindRelaxExprName(const String& name_key, const String& seperator, const String& entry_name) {
   runtime::TypedPackedFunc<IRModule(IRModule, PassContext)> pass_func = [=](IRModule m,
                                                                             PassContext pc) {
-    relay::BindRelayExprName(m->Lookup(entry_name), name_key, seperator);
+    relay::BindRelaxExprName(m->Lookup(entry_name), name_key, seperator);
     return m;
   };
-  return CreateModulePass(pass_func, 0, "BindRelayExprName", {});
+  return CreateModulePass(pass_func, 0, "BindRelaxExprName", {});
 }
 
-TVM_REGISTER_GLOBAL("relay._transform.BindRelayExprName").set_body_typed(BindRelayExprName);
+TVM_REGISTER_GLOBAL("relay._transform.BindRelaxExprName").set_body_typed(BindRelaxExprName);
 
 }  // namespace transform
 }  // namespace relay
diff --git a/src/contrib/msc/core/utils.h b/src/contrib/msc/core/utils.h
index 7fb9c87a99f9..9bcaba2a271f 100644
--- a/src/contrib/msc/core/utils.h
+++ b/src/contrib/msc/core/utils.h
@@ -37,7 +37,7 @@ namespace tvm {
 namespace contrib {
 namespace msc {
 
-using Expr = tvm::RelayExpr;
+using Expr = tvm::RelaxExpr;
 using RelaxCall = tvm::relax::Call;
 using RelayCall = tvm::relay::Call;
 
diff --git a/src/ir/affine_type.cc b/src/ir/affine_type.cc
deleted file mode 100644
index 87235fe20ade..000000000000
--- a/src/ir/affine_type.cc
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file src/ir/affine_type.cc
- * \brief The Type information for quantized nodes.
- */
-#include <tvm/ir/affine_type.h>
-#include <tvm/runtime/registry.h>
-#include <tvm/tir/op.h>
-
-namespace tvm {
-
-using tvm::ReprPrinter;
-using namespace tvm::runtime;
-
-TensorAffineType::TensorAffineType(RelayExpr scale, RelayExpr zero_point, DataType dtype,
-                                   int axis) {
-  ObjectPtr<TensorAffineTypeNode> n = make_object<TensorAffineTypeNode>();
-  n->scale = std::move(scale);
-  n->zero_point = std::move(zero_point);
-  n->dtype = std::move(dtype);
-  n->axis = std::move(axis);
-  data_ = std::move(n);
-}
-
-TVM_REGISTER_NODE_TYPE(TensorAffineTypeNode);
-
-TVM_REGISTER_GLOBAL("ir.TensorAffineType")
-    .set_body_typed([](RelayExpr scale, RelayExpr zero_point, DataType dtype, int axis) {
-      return TensorAffineType(scale, zero_point, dtype, axis);
-    });
-
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<TensorAffineTypeNode>([](const ObjectRef& ref, ReprPrinter* p) {
-      auto* node = static_cast<const TensorAffineTypeNode*>(ref.get());
-      p->stream << "TensorAffineType(" << node->scale << ", " << node->zero_point << ", "
-                << node->dtype << ", " << node->axis << ")";
-    });
-
-TupleAffineType::TupleAffineType(Array<TensorAffineType> types) {
-  ObjectPtr<TupleAffineTypeNode> n = make_object<TupleAffineTypeNode>();
-  n->types = std::move(types);
-  data_ = std::move(n);
-}
-
-TVM_REGISTER_NODE_TYPE(TupleAffineTypeNode);
-
-TVM_REGISTER_GLOBAL("ir.TupleAffineType").set_body_typed([](Array<TensorAffineType> types) {
-  return TupleAffineType(types);
-});
-
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<TupleAffineTypeNode>([](const ObjectRef& ref, ReprPrinter* p) {
-      auto* node = static_cast<const TupleAffineTypeNode*>(ref.get());
-      p->stream << "TupleAffineType([";
-      for (size_t i = 0; i < node->types.size(); ++i) {
-        p->stream << node->types[i];
-        if (i < node->types.size() - 1) {
-          p->stream << ", ";
-        }
-      }
-      p->stream << "])";
-    });
-
-}  // namespace tvm
diff --git a/src/ir/diagnostic.cc b/src/ir/diagnostic.cc
index 8eeb4b3e6fd6..2c9d7e8e933f 100644
--- a/src/ir/diagnostic.cc
+++ b/src/ir/diagnostic.cc
@@ -263,8 +263,6 @@ void ReportAt(const DiagnosticContext& context, std::ostream& out, const Span& s
   // If the source name is not in the current source map, sources were not annotated.
   if (it == context->module->source_map->source_map.end()) {
     LOG(FATAL) << "The source maps are not populated for this module. "
-               << "Please use `tvm.relay.transform.AnnotateSpans` to attach source maps for error "
-                  "reporting.\n"
                << "Error: " << diagnostic->message;
   }
 
diff --git a/src/ir/function.cc b/src/ir/function.cc
index 3c92787530a6..1afe195a958c 100644
--- a/src/ir/function.cc
+++ b/src/ir/function.cc
@@ -48,11 +48,6 @@ TVM_REGISTER_GLOBAL("ir.BaseFuncWithAttrs")
       if (func->IsInstance<tir::PrimFuncNode>()) {
         return WithAttrs(Downcast<tir::PrimFunc>(std::move(func)), attr_map);
       }
-      if (const auto* f = runtime::Registry::Get("relay.ir.FuncWithAttrs")) {
-        if (Optional<BaseFunc> ret = (*f)(func, attr_map)) {
-          return ret.value();
-        }
-      }
       if (const auto* f = runtime::Registry::Get("relax.FuncWithAttrs")) {
         if (Optional<BaseFunc> ret = (*f)(func, attr_map)) {
           return ret.value();
diff --git a/src/ir/memory_pools.cc b/src/ir/memory_pools.cc
deleted file mode 100644
index 912ad80b3cce..000000000000
--- a/src/ir/memory_pools.cc
+++ /dev/null
@@ -1,242 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file src/ir/memory_pools.cc
- * \brief The object definition for relay.build argument type of memory pools
- */
-
-#include <tvm/ir/memory_pools.h>
-
-namespace tvm {
-
-PoolInfo::PoolInfo(String pool_name, Integer size_hint_bytes, Integer clock_frequency_hz,
-                   Integer read_bandwidth_bytes_per_cycle, Integer write_bandwidth_bytes_per_cycle,
-                   Integer read_latency_cycles, Integer write_latency_cycles,
-                   Map<Target, Integer> target_burst_bytes, Bool is_internal) {
-  auto poolinfo_node = make_object<PoolInfoNode>();
-  poolinfo_node->pool_name = pool_name;
-  poolinfo_node->size_hint_bytes = size_hint_bytes;
-  poolinfo_node->clock_frequency_hz = clock_frequency_hz;
-  poolinfo_node->read_bandwidth_bytes_per_cycle = read_bandwidth_bytes_per_cycle;
-  poolinfo_node->write_bandwidth_bytes_per_cycle = write_bandwidth_bytes_per_cycle;
-  poolinfo_node->read_latency_cycles = read_latency_cycles;
-  poolinfo_node->write_latency_cycles = write_latency_cycles;
-  poolinfo_node->target_burst_bytes = target_burst_bytes;
-  poolinfo_node->is_internal = is_internal;
-  data_ = std::move(poolinfo_node);
-}
-
-TVM_REGISTER_NODE_TYPE(PoolInfoNode);
-
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<PoolInfoNode>([](const ObjectRef& ref, ReprPrinter* p) {
-      auto* node = static_cast<const PoolInfoNode*>(ref.get());
-      p->stream << "PoolInfoNode(\n"
-                << "  pool_name=" << node->pool_name
-                << ",\n  size_hint_bytes=" << node->size_hint_bytes
-                << ",\n  clock_frequency_hz=" << node->clock_frequency_hz
-                << ",\n  read_bandwidth_bytes_per_cycle=" << node->read_bandwidth_bytes_per_cycle
-                << ",\n  write_bandwidth_bytes_per_cycle=" << node->write_bandwidth_bytes_per_cycle
-                << ",\n  read_latency_cycles=" << node->read_latency_cycles
-                << ",\n  write_latency_cycles=" << node->write_latency_cycles
-                << ",\n  target_burst_bytes=" << node->target_burst_bytes << ")";
-    });
-
-PoolInfoProperties::PoolInfoProperties(Integer size_hint_bytes, Integer clock_frequency_hz,
-                                       Integer read_bandwidth_bytes_per_cycle,
-                                       Integer write_bandwidth_bytes_per_cycle,
-                                       Integer read_latency_cycles, Integer write_latency_cycles,
-                                       Map<Target, Integer> target_burst_bytes, Bool is_internal) {
-  auto poolinfo_properties_node = make_object<PoolInfoPropertiesNode>();
-  poolinfo_properties_node->size_hint_bytes = size_hint_bytes;
-  poolinfo_properties_node->clock_frequency_hz = clock_frequency_hz;
-  poolinfo_properties_node->read_bandwidth_bytes_per_cycle = read_bandwidth_bytes_per_cycle;
-  poolinfo_properties_node->write_bandwidth_bytes_per_cycle = write_bandwidth_bytes_per_cycle;
-  poolinfo_properties_node->read_latency_cycles = read_latency_cycles;
-  poolinfo_properties_node->write_latency_cycles = write_latency_cycles;
-  poolinfo_properties_node->target_burst_bytes = target_burst_bytes;
-  poolinfo_properties_node->is_internal = is_internal;
-  data_ = std::move(poolinfo_properties_node);
-}
-
-TVM_REGISTER_NODE_TYPE(PoolInfoPropertiesNode);
-TVM_REGISTER_GLOBAL("ir.PoolInfoProperties")
-    .set_body_typed([](Integer size_hint_bytes, Integer clock_frequency_hz,
-                       Integer read_bandwidth_bytes_per_cycle,
-                       Integer write_bandwidth_bytes_per_cycle, Integer read_latency_cycles,
-                       Integer write_latency_cycles, Map<Target, Integer> target_burst_bytes) {
-      return PoolInfoProperties(size_hint_bytes, clock_frequency_hz, read_bandwidth_bytes_per_cycle,
-                                write_bandwidth_bytes_per_cycle, read_latency_cycles,
-                                write_latency_cycles, target_burst_bytes, Bool(false));
-    });
-
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<PoolInfoPropertiesNode>([](const ObjectRef& ref, ReprPrinter* p) {
-      auto* node = static_cast<const PoolInfoPropertiesNode*>(ref.get());
-      p->stream << "PoolInfoPropertiesNode(\n"
-                << "  size_hint_bytes=" << node->size_hint_bytes
-                << ",\n  clock_frequency_hz=" << node->clock_frequency_hz
-                << ",\n  read_bandwidth_bytes_per_cycle=" << node->read_bandwidth_bytes_per_cycle
-                << ",\n  write_bandwidth_bytes_per_cycle=" << node->write_bandwidth_bytes_per_cycle
-                << ",\n  read_latency_cycles=" << node->read_latency_cycles
-                << ",\n  write_latency_cycles=" << node->write_latency_cycles
-                << ",\n  target_burst_bytes=" << node->target_burst_bytes << ")";
-    });
-
-WorkspacePoolInfo::WorkspacePoolInfo(String pool_name, Array<Target> targets,
-                                     PoolInfoProperties properties) {
-  auto poolinfo_node = make_object<WorkspacePoolInfoNode>();
-  poolinfo_node->pool_name = pool_name;
-  poolinfo_node->size_hint_bytes = properties->size_hint_bytes;
-  poolinfo_node->targets = targets;
-  poolinfo_node->clock_frequency_hz = properties->clock_frequency_hz;
-  poolinfo_node->read_bandwidth_bytes_per_cycle = properties->read_bandwidth_bytes_per_cycle;
-  poolinfo_node->write_bandwidth_bytes_per_cycle = properties->write_bandwidth_bytes_per_cycle;
-  poolinfo_node->read_latency_cycles = properties->read_latency_cycles;
-  poolinfo_node->write_latency_cycles = properties->write_latency_cycles;
-  poolinfo_node->target_burst_bytes = properties->target_burst_bytes;
-  poolinfo_node->is_internal = properties->is_internal;
-  data_ = std::move(poolinfo_node);
-}
-
-TVM_REGISTER_NODE_TYPE(WorkspacePoolInfoNode);
-TVM_REGISTER_GLOBAL("ir.WorkspacePoolInfo")
-    .set_body_typed([](String pool_name, Array<Target> targets, PoolInfoProperties properties) {
-      return WorkspacePoolInfo(pool_name, targets, properties);
-    });
-
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<WorkspacePoolInfoNode>([](const ObjectRef& ref, ReprPrinter* p) {
-      auto* node = static_cast<const WorkspacePoolInfoNode*>(ref.get());
-      p->stream << "WorkspacePoolInfoNode(\n"
-                << "  pool_name=" << node->pool_name << ",\n  targets=" << node->targets
-                << ",\n  size_hint_bytes=" << node->size_hint_bytes
-                << ",\n  clock_frequency_hz=" << node->clock_frequency_hz
-                << ",\n  read_bandwidth_bytes_per_cycle=" << node->read_bandwidth_bytes_per_cycle
-                << ",\n  write_bandwidth_bytes_per_cycle=" << node->write_bandwidth_bytes_per_cycle
-                << ",\n  read_latency_cycles=" << node->read_latency_cycles
-                << ",\n  write_latency_cycles=" << node->write_latency_cycles
-                << ",\n  target_burst_bytes=" << node->target_burst_bytes
-                << ",\n  is_internal=" << node->is_internal << ")"
-                << "\n";
-    });
-
-ConstantInfo::ConstantInfo(String name_hint, Integer byte_offset, runtime::NDArray data) {
-  auto constant_info_node = make_object<ConstantInfoNode>();
-  constant_info_node->name_hint = name_hint;
-  constant_info_node->byte_offset = byte_offset;
-  constant_info_node->data = data;
-  data_ = std::move(constant_info_node);
-}
-
-TVM_REGISTER_NODE_TYPE(ConstantInfoNode);
-TVM_REGISTER_GLOBAL("ir.ConstantInfo")
-    .set_body_typed([](String name_hint, Integer byte_offset, runtime::NDArray data) {
-      return ConstantInfo(name_hint, byte_offset, data);
-    });
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<ConstantInfoNode>([](const ObjectRef& ref, ReprPrinter* p) {
-      auto* node = static_cast<const ConstantInfoNode*>(ref.get());
-      p->stream << "ConstantInfoNode(\n"
-                << "name_hint=" << node->name_hint << ",\n byte_offset=" << node->byte_offset
-                << ",\n data=" << node->data << ")";
-    });
-
-ConstantPoolInfo::ConstantPoolInfo(String pool_name, Array<Target> targets,
-                                   Array<ConstantInfo> constant_info_array,
-                                   PoolInfoProperties properties) {
-  auto constant_poolinfo_node = make_object<ConstantPoolInfoNode>();
-  constant_poolinfo_node->pool_name = pool_name;
-  constant_poolinfo_node->constant_info_array = constant_info_array;
-  constant_poolinfo_node->targets = targets;
-
-  constant_poolinfo_node->size_hint_bytes = properties->size_hint_bytes;
-  constant_poolinfo_node->clock_frequency_hz = properties->clock_frequency_hz;
-  constant_poolinfo_node->read_bandwidth_bytes_per_cycle =
-      properties->read_bandwidth_bytes_per_cycle;
-  constant_poolinfo_node->write_bandwidth_bytes_per_cycle =
-      properties->write_bandwidth_bytes_per_cycle;
-  constant_poolinfo_node->read_latency_cycles = properties->read_latency_cycles;
-  constant_poolinfo_node->write_latency_cycles = properties->write_latency_cycles;
-  constant_poolinfo_node->target_burst_bytes = properties->target_burst_bytes;
-  constant_poolinfo_node->is_internal = properties->is_internal;
-  data_ = std::move(constant_poolinfo_node);
-}
-
-TVM_REGISTER_NODE_TYPE(ConstantPoolInfoNode);
-TVM_REGISTER_GLOBAL("ir.ConstantPoolInfo")
-    .set_body_typed([](String pool_name, Array<Target> targets,
-                       Array<ConstantInfo> constant_info_array, PoolInfoProperties properties) {
-      return ConstantPoolInfo(pool_name, targets, constant_info_array, properties);
-    });
-
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<ConstantPoolInfoNode>([](const ObjectRef& ref, ReprPrinter* p) {
-      auto* node = static_cast<const ConstantPoolInfoNode*>(ref.get());
-      p->stream << "ConstantPoolInfoNode(\n"
-                << "  pool_name=" << node->pool_name << ",\n  targets=" << node->targets
-                << ",\n  constant_info_array=" << node->constant_info_array
-                << ",\n  size_hint_bytes=" << node->size_hint_bytes
-                << ",\n  clock_frequency_hz=" << node->clock_frequency_hz
-                << ",\n  read_bandwidth_bytes_per_cycle=" << node->read_bandwidth_bytes_per_cycle
-                << ",\n  write_bandwidth_bytes_per_cycle=" << node->write_bandwidth_bytes_per_cycle
-                << ",\n  read_latency_cycles=" << node->read_latency_cycles
-                << ",\n  write_latency_cycles=" << node->write_latency_cycles
-                << ",\n  target_burst_bytes=" << node->target_burst_bytes
-                << ",\n  is_internal=" << node->is_internal << ")"
-                << "\n";
-    });
-
-WorkspaceMemoryPools::WorkspaceMemoryPools(Array<PoolInfo> pools) {
-  auto workspace_memory_pools_node = make_object<WorkspaceMemoryPoolsNode>();
-  workspace_memory_pools_node->pools = pools;
-  data_ = std::move(workspace_memory_pools_node);
-}
-
-TVM_REGISTER_NODE_TYPE(WorkspaceMemoryPoolsNode);
-TVM_REGISTER_GLOBAL("ir.WorkspaceMemoryPools").set_body_typed([](Array<PoolInfo> pools) {
-  return WorkspaceMemoryPools(pools);
-});
-
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<WorkspaceMemoryPoolsNode>([](const ObjectRef& ref, ReprPrinter* p) {
-      auto* node = static_cast<const WorkspaceMemoryPoolsNode*>(ref.get());
-      p->stream << "WorkspaceMemoryPoolsNode(\n"
-                << "pools=" << node->pools << ")";
-    });
-
-ConstantMemoryPools::ConstantMemoryPools(Array<ConstantPoolInfo> pools) {
-  auto constant_memory_pools_node = make_object<ConstantMemoryPoolsNode>();
-  constant_memory_pools_node->pools = pools;
-  data_ = std::move(constant_memory_pools_node);
-}
-
-TVM_REGISTER_NODE_TYPE(ConstantMemoryPoolsNode);
-TVM_REGISTER_GLOBAL("ir.ConstantMemoryPools").set_body_typed([](Array<ConstantPoolInfo> pools) {
-  return ConstantMemoryPools(pools);
-});
-
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<ConstantMemoryPoolsNode>([](const ObjectRef& ref, ReprPrinter* p) {
-      auto* node = static_cast<const ConstantMemoryPoolsNode*>(ref.get());
-      p->stream << "ConstantMemoryPoolsNode(\n"
-                << "pools=" << node->pools << ")";
-    });
-}  // namespace tvm
diff --git a/src/ir/module.cc b/src/ir/module.cc
index b27dec5719ab..54a10b84cf9f 100644
--- a/src/ir/module.cc
+++ b/src/ir/module.cc
@@ -211,7 +211,7 @@ IRModule IRModuleNode::ShallowCopy() {
   return IRModule(this->functions, this->source_map, this->attrs, this->global_infos);
 }
 
-IRModule IRModule::FromExpr(const RelayExpr& expr,
+IRModule IRModule::FromExpr(const RelaxExpr& expr,
                             const tvm::Map<GlobalVar, BaseFunc>& global_funcs) {
   auto mod = IRModule(global_funcs);
   String gv_name;
@@ -266,10 +266,7 @@ TVM_REGISTER_GLOBAL("ir.Module_Clone").set_body_typed([](IRModule mod) -> IRModu
 
 TVM_REGISTER_GLOBAL("ir.Module_Add")
     .set_body_typed([](IRModule mod, GlobalVar var, ObjectRef val, bool update) -> IRModule {
-      ICHECK(val->IsInstance<RelayExprNode>());
-      if (const auto* f = runtime::Registry::Get("relay.ir.IRModuleAdd")) {
-        return (*f)(mod, var, val, update);
-      }
+      ICHECK(val->IsInstance<RelaxExprNode>());
       mod->Add(var, Downcast<BaseFunc>(val), update);
       return mod;
     });
diff --git a/src/ir/source_map.cc b/src/ir/source_map.cc
index 721a30affa3f..339b08d6ad49 100644
--- a/src/ir/source_map.cc
+++ b/src/ir/source_map.cc
@@ -28,8 +28,6 @@
 
 namespace tvm {
 
-TVM_REGISTER_PASS_CONFIG_OPTION("relay.frontend.fill_span", Bool);
-
 ObjectPtr<Object> GetSourceNameNode(const String& name) {
   // always return pointer as the reference can change as map re-allocate.
   // or use another level of indirection by creating a unique_ptr
diff --git a/src/ir/tensor_type.cc b/src/ir/tensor_type.cc
deleted file mode 100644
index 0fab0acb8964..000000000000
--- a/src/ir/tensor_type.cc
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file src/ir/tensor_type.cc
- * \brief The type system AST nodes of Relay.
- */
-#include <tvm/ir/tensor_type.h>
-#include <tvm/runtime/registry.h>
-#include <tvm/tir/op.h>
-
-namespace tvm {
-
-using tvm::ReprPrinter;
-using namespace tvm::runtime;
-
-TensorType::TensorType(Array<PrimExpr> shape, DataType dtype) {
-  ObjectPtr<TensorTypeNode> n = make_object<TensorTypeNode>();
-  n->shape = std::move(shape);
-  n->dtype = std::move(dtype);
-  data_ = std::move(n);
-}
-
-TensorType TensorType::Scalar(DataType dtype) { return TensorType({}, dtype); }
-
-PrimExpr TensorTypeNode::Size() const {
-  if (shape.size() == 0) {
-    return tir::make_const(DataType::Int(64), 1);
-  }
-
-  PrimExpr size = shape[0];
-  for (size_t i = 1; i < shape.size(); ++i) {
-    size *= shape[i];
-  }
-  return size;
-}
-
-TVM_REGISTER_NODE_TYPE(TensorTypeNode);
-
-TVM_REGISTER_GLOBAL("ir.TensorType").set_body_typed([](Array<PrimExpr> shape, DataType dtype) {
-  return TensorType(shape, dtype);
-});
-
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<TensorTypeNode>([](const ObjectRef& ref, ReprPrinter* p) {
-      auto* node = static_cast<const TensorTypeNode*>(ref.get());
-      p->stream << "TensorType(" << node->shape << ", " << node->dtype << ")";
-    });
-
-}  // namespace tvm
diff --git a/src/ir/transform.cc b/src/ir/transform.cc
index f0b879acbc03..0ed80310eb97 100644
--- a/src/ir/transform.cc
+++ b/src/ir/transform.cc
@@ -465,8 +465,6 @@ Pass GetPass(const String& pass_name) {
   if (pass_name.operator std::string().find("transform.") != std::string::npos) {
     f = Registry::Get(pass_name);
   } else if ((f = Registry::Get("transform." + pass_name))) {
-    // pass
-  } else if ((f = Registry::Get("relay._transform." + pass_name))) {
   }
   ICHECK(f != nullptr) << "Cannot use " << pass_name << " to create the pass";
   return (*f)();
@@ -686,11 +684,6 @@ TVM_REGISTER_GLOBAL("transform.OverrideInstruments")
 
 Pass PrintIR(String header, bool show_meta_data) {
   auto pass_func = [header, show_meta_data](IRModule mod, const PassContext& ctx) {
-    if (const auto* f = runtime::Registry::Get("relay.ir.PrintIR")) {
-      if ((*f)(mod, header, show_meta_data)) {
-        return mod;
-      }
-    }
     LOG(INFO) << "PrintIR(" << header << "):\n" << mod;
     return mod;
   };
diff --git a/src/ir/type_functor.cc b/src/ir/type_functor.cc
index 1bfc435cff72..774c9d8f245f 100644
--- a/src/ir/type_functor.cc
+++ b/src/ir/type_functor.cc
@@ -27,8 +27,6 @@
 
 namespace tvm {
 
-void TypeVisitor::VisitType_(const TensorTypeNode* op) {}
-
 void TypeVisitor::VisitType_(const FuncTypeNode* op) {
   for (auto arg_type : op->arg_types) {
     this->VisitType(arg_type);
@@ -57,11 +55,6 @@ Array<Type> TypeMutator::MutateArray(Array<Type> arr) {
   return arr.Map([this](const Type& ty) { return VisitType(ty); });
 }
 
-Type TypeMutator::VisitType_(const TensorTypeNode* op) {
-  // TODO(tvm-team) recursively visit to replace Var
-  return GetRef<Type>(op);
-}
-
 Type TypeMutator::VisitType_(const FuncTypeNode* op) {
   bool changed = false;
 
diff --git a/src/relax/analysis/graph_partitioner.cc b/src/relax/analysis/graph_partitioner.cc
index 53f66f42160a..69408afbf474 100644
--- a/src/relax/analysis/graph_partitioner.cc
+++ b/src/relax/analysis/graph_partitioner.cc
@@ -225,11 +225,6 @@ size_t GraphPartitioner::CountFusedNodesWithNewChild(IndexedForwardGraph::Node*
   return target->FindRoot()->num_nodes + CountNodesUptoSink_(child, dom_parent);
 }
 
-size_t GraphPartitioner::CountAdditionalArgs_(const TensorTypeNode* ttype, bool with_strides) {
-  // TODO(@syfeng): need to clean this up
-  return 0;
-}
-
 size_t GraphPartitioner::CountArgs_(IndexedForwardGraph::Node* src,
                                     const IndexedForwardGraph& graph, bool update_postpone) {
   std::unordered_set<Group*> visited_groups;
@@ -274,11 +269,6 @@ size_t GraphPartitioner::CountArgsLimit_(const IndexedForwardGraph::Node* child)
   size_t output_args = 0;
   while (outputs_list != nullptr) {
     output_args++;
-    if (auto call_node = GetRef<ObjectRef>(outputs_list->value.node->ref).as<CallNode>()) {
-      if (const auto* ttype = call_node->checked_type().as<TensorTypeNode>()) {
-        output_args += CountAdditionalArgs_(ttype, false);
-      }
-    }
     outputs_list = outputs_list->next;
   }
   return (max_function_args_ > output_args) ? max_function_args_ - output_args : 0;
@@ -302,26 +292,16 @@ void GraphPartitioner::InitGroups(const IndexedForwardGraph& graph) {
       for (auto& it : call_node->args) {
         if (it.as<VarNode>() || it.as<TupleGetItemNode>()) {
           args_num++;
-          if (const auto* ttype = it.as<ExprNode>()->checked_type().as<TensorTypeNode>()) {
-            args_num += CountAdditionalArgs_(ttype);
-          }
         }
       }
     } else if (auto tuple_node = GetRef<ObjectRef>(obj).as<TupleNode>()) {
       for (auto& it : tuple_node->fields) {
         if (it.as<VarNode>() || it.as<TupleGetItemNode>()) {
           args_num++;
-          if (const auto* ttype = it.as<ExprNode>()->checked_type().as<TensorTypeNode>()) {
-            args_num += CountAdditionalArgs_(ttype);
-          }
         }
       }
     } else if (GetRef<ObjectRef>(obj).as<VarNode>()) {
       args_num++;
-      if (const auto* ttype =
-              GetRef<ObjectRef>(obj).as<ExprNode>()->checked_type().as<TensorTypeNode>()) {
-        args_num += CountAdditionalArgs_(ttype);
-      }
     }
     return args_num;
   };
diff --git a/src/relax/analysis/graph_partitioner.h b/src/relax/analysis/graph_partitioner.h
index 6abf10c5a602..cad9923488df 100644
--- a/src/relax/analysis/graph_partitioner.h
+++ b/src/relax/analysis/graph_partitioner.h
@@ -26,6 +26,7 @@
 #define TVM_RELAX_ANALYSIS_GRAPH_PARTITIONER_H_
 
 #include <tvm/relax/op_attr_types.h>
+#include <tvm/relax/type.h>
 
 #include <unordered_map>
 #include <unordered_set>
@@ -266,11 +267,6 @@ class GraphPartitioner {
   void CommitFuse(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink);
 
   size_t CountNodesUptoSink_(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink);
-  // Count the number of additional arguments. In the case of dynamic shape,
-  // generated function takes several additional arguments, such as the sizes of
-  // the dynamic dimensions and strides.
-  // This function calculates the number of such additional arguments.
-  size_t CountAdditionalArgs_(const TensorTypeNode* ttype, bool with_strides = true);
   // Calculate the number of arguments for the node.
   size_t CountArgs_(IndexedForwardGraph::Node* src, const IndexedForwardGraph& graph,
                     bool update_postpone = true);
diff --git a/src/relax/analysis/struct_info_analysis.cc b/src/relax/analysis/struct_info_analysis.cc
index 50931cbf38a2..d44252e86fd2 100644
--- a/src/relax/analysis/struct_info_analysis.cc
+++ b/src/relax/analysis/struct_info_analysis.cc
@@ -48,7 +48,7 @@ class StaticTypeDeriver : public StructInfoFunctor<Type(const StructInfo&)> {
   }
 
   Type VisitStructInfo_(const TensorStructInfoNode* op) final {
-    return DynTensorType(op->ndim, op->dtype);
+    return TensorType(op->ndim, op->dtype);
   }
 
   // module: distributed
@@ -87,7 +87,7 @@ StructInfo StructInfoFromType(const Type& type) {
     return PrimStructInfo(prim_type->dtype, prim_type->span);
   } else if (const ShapeTypeNode* shape_type = type.as<ShapeTypeNode>()) {
     return ShapeStructInfo(shape_type->ndim, type->span);
-  } else if (const DynTensorTypeNode* tensor_type = type.as<DynTensorTypeNode>()) {
+  } else if (const TensorTypeNode* tensor_type = type.as<TensorTypeNode>()) {
     return TensorStructInfo(tensor_type->dtype, tensor_type->ndim);
   } else if (const TupleTypeNode* tuple_type = type.as<TupleTypeNode>()) {
     Array<StructInfo> fields;
diff --git a/src/relax/ir/dataflow_matcher.cc b/src/relax/ir/dataflow_matcher.cc
index 417a78f0d04b..baac1e4c238d 100644
--- a/src/relax/ir/dataflow_matcher.cc
+++ b/src/relax/ir/dataflow_matcher.cc
@@ -594,7 +594,7 @@ bool DFPatternMatcher::VisitDFPattern_(const PrimArrPatternNode* op, const Expr&
 bool DFPatternMatcher::VisitDFPattern_(const DataTypePatternNode* op, const Expr& expr) {
   // no need to jump, as var.dtype == value.dtype
   auto expr_type = expr.as<ExprNode>()->checked_type();
-  if (const DynTensorTypeNode* tensor_type = expr_type.as<DynTensorTypeNode>()) {
+  if (const TensorTypeNode* tensor_type = expr_type.as<TensorTypeNode>()) {
     return (StructuralEqual()(op->dtype, tensor_type->dtype)) && VisitDFPattern(op->pattern, expr);
   }
   return false;
diff --git a/src/relax/ir/expr.cc b/src/relax/ir/expr.cc
index ca97744c5125..4c7566665744 100644
--- a/src/relax/ir/expr.cc
+++ b/src/relax/ir/expr.cc
@@ -337,7 +337,7 @@ Constant::Constant(runtime::NDArray data, Optional<StructInfo> struct_info_annot
   } else {
     TensorStructInfo tinfo(ShapeExpr(values), n->data.DataType(), VDevice(), span);
     n->struct_info_ = tinfo;
-    n->checked_type_ = DynTensorType(tinfo->ndim, tinfo->dtype);
+    n->checked_type_ = TensorType(tinfo->ndim, tinfo->dtype);
   }
 
   data_ = std::move(n);
diff --git a/src/relax/ir/type.cc b/src/relax/ir/type.cc
index 49ef1d7163f1..82b95b556bc2 100644
--- a/src/relax/ir/type.cc
+++ b/src/relax/ir/type.cc
@@ -50,26 +50,26 @@ TVM_REGISTER_NODE_TYPE(ObjectTypeNode);
 
 TVM_REGISTER_GLOBAL("relax.ObjectType").set_body_typed([](Span span) { return ObjectType(span); });
 
-DynTensorType::DynTensorType(int ndim, DataType dtype, Span span) {
-  ObjectPtr<DynTensorTypeNode> n = make_object<DynTensorTypeNode>();
+TensorType::TensorType(int ndim, DataType dtype, Span span) {
+  ObjectPtr<TensorTypeNode> n = make_object<TensorTypeNode>();
   n->ndim = std::move(ndim);
   n->dtype = std::move(dtype);
   n->span = span;
   data_ = std::move(n);
 }
 
-DynTensorType DynTensorType::CreateUnknownNDim(DataType dtype, Span span) {
-  ObjectPtr<DynTensorTypeNode> n = make_object<DynTensorTypeNode>();
+TensorType TensorType::CreateUnknownNDim(DataType dtype, Span span) {
+  ObjectPtr<TensorTypeNode> n = make_object<TensorTypeNode>();
   n->ndim = -1;
   n->dtype = std::move(dtype);
   n->span = std::move(span);
-  return DynTensorType(std::move(n));
+  return TensorType(std::move(n));
 }
 
-TVM_REGISTER_NODE_TYPE(DynTensorTypeNode);
+TVM_REGISTER_NODE_TYPE(TensorTypeNode);
 
-TVM_REGISTER_GLOBAL("relax.DynTensorType").set_body_typed([](int ndim, DataType dtype, Span span) {
-  return DynTensorType(ndim, dtype, span);
+TVM_REGISTER_GLOBAL("relax.TensorType").set_body_typed([](int ndim, DataType dtype, Span span) {
+  return TensorType(ndim, dtype, span);
 });
 
 PackedFuncType::PackedFuncType(Span span) {
diff --git a/src/relax/transform/fold_constant.cc b/src/relax/transform/fold_constant.cc
index d6da79c484cf..ff193acf143e 100644
--- a/src/relax/transform/fold_constant.cc
+++ b/src/relax/transform/fold_constant.cc
@@ -185,7 +185,7 @@ class ConstantFolder : public ExprMutator {
     bool output_not_tuple = call->sinfo_args.size() == 1;
     // Pattern 0: call constant function, const argument with const shape.
     if (func && arr_args && shape && output_not_tuple) {
-      DynTensorType ret_type = Downcast<DynTensorType>(call->checked_type());
+      TensorType ret_type = Downcast<TensorType>(call->checked_type());
       // value_or will return value if it is not null, otherwise return or
       return ConstEvaluateCallTIR(func.value(), arr_args.value(), shape.value(), ret_type->dtype)
           .value_or({});
diff --git a/src/relax/transform/fuse_tir.cc b/src/relax/transform/fuse_tir.cc
index 8fba54628153..37f3520d3df8 100644
--- a/src/relax/transform/fuse_tir.cc
+++ b/src/relax/transform/fuse_tir.cc
@@ -973,7 +973,7 @@ class FusedTIRConstructor : public ExprVisitor {
 
   /*! \brief Get DynTensor numbers from recursive Tuples. */
   static size_t GetTotalTensorSize(const Type& type) {
-    if (type.as<DynTensorTypeNode>()) {
+    if (type.as<TensorTypeNode>()) {
       return 1;
     } else if (const auto* tuple_type = type.as<TupleTypeNode>()) {
       size_t num = 0;
@@ -982,7 +982,7 @@ class FusedTIRConstructor : public ExprVisitor {
       }
       return num;
     } else {
-      LOG(FATAL) << "DynTensorType and TupleType are expect, but got: " << type;
+      LOG(FATAL) << "TensorType and TupleType are expect, but got: " << type;
       return 0;
     }
   }
diff --git a/src/runtime/contrib/bnns/bnns_wrp.h b/src/runtime/contrib/bnns/bnns_wrp.h
index b31e97e554da..f395561a7f6c 100644
--- a/src/runtime/contrib/bnns/bnns_wrp.h
+++ b/src/runtime/contrib/bnns/bnns_wrp.h
@@ -287,7 +287,7 @@ class TView {
     return res;
   }
 
-  /** Check if view is empty and doesn't relay to any tensor */
+  /** Check if view is empty and doesn't map to any tensor */
   operator bool() const { return origin_ != nullptr; }
 
   /** Get BNNS descriptor for particular View. Batch and Party attributed are ignored. */
diff --git a/src/runtime/contrib/cblas/gemm_common.h b/src/runtime/contrib/cblas/gemm_common.h
index 91341976bd02..2c7738e458b0 100644
--- a/src/runtime/contrib/cblas/gemm_common.h
+++ b/src/runtime/contrib/cblas/gemm_common.h
@@ -120,7 +120,7 @@ inline void CallU8S8S32Gemm(TVMArgs args, TVMRetValue* ret, TGemmOp op) {
   bool transb = args[4];
 
   // Set the sgemm attributes. Currently, support is limited to CblasFixOffset with all offsets
-  // equal to 0. This is sufficient for relay dense.
+  // equal to 0. This is sufficient for dense.
   std::string offset_ctype = "CblasFixOffset";
   int16_t offset_a = 0;
   int16_t offset_b = 0;
diff --git a/src/runtime/contrib/onnx/onnx_module.cc b/src/runtime/contrib/onnx/onnx_module.cc
deleted file mode 100644
index 813211ca7c36..000000000000
--- a/src/runtime/contrib/onnx/onnx_module.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file onnx_module.cc
- * \brief ONNX Module without runtime support
- */
-#include <tvm/runtime/packed_func.h>
-#include <tvm/runtime/registry.h>
-
-namespace tvm {
-namespace codegen {
-using namespace tvm::runtime;
-
-class ONNXSourceModuleNode : public runtime::ModuleNode {
- public:
-  explicit ONNXSourceModuleNode(const std::string& code, const std::string& symbol,
-                                const Array<String>& const_vars)
-      : code_(code), symbol_(symbol), const_vars_(const_vars) {}
-  const char* type_key() const { return "onnx"; }
-
-  /*! \brief Get the property of the runtime module .*/
-  int GetPropertyMask() const final { return ModulePropertyMask::kRunnable; };
-
-  PackedFunc GetFunction(const String& name, const ObjectPtr<Object>& sptr_to_self) final {
-    if (name == "get_symbol") {
-      return PackedFunc(
-          [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->symbol_; });
-    } else if (name == "get_const_vars") {
-      return PackedFunc(
-          [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->const_vars_; });
-    } else {
-      LOG(FATAL) << "ONNX Source module cannot execute, to get executable module"
-                 << " build TVM with 'onnx' runtime support";
-      return PackedFunc(nullptr);
-    }
-  }
-
-  String GetSource(const String& format) final { return code_; }
-
-  void SaveToFile(const String& path, const String& format) final {
-    ICHECK_EQ(format, "onnx") << "Can only save to onnx format";
-    ICHECK_NE(code_.length(), 0);
-    const PackedFunc* to_onnx_ = runtime::Registry::Get("relay.ext.onnx.save_to_file");
-    (*to_onnx_)(code_, path, format);
-  }
-
- protected:
-  String code_;
-  std::string symbol_;
-  Array<String> const_vars_;
-};
-
-Module ONNXSourceModuleNodeCreate(const String& code, const String& symbol,
-                                  const Array<String>& const_vars) {
-  auto n = make_object<ONNXSourceModuleNode>(code.operator std::string(),
-                                             symbol.operator std::string(), const_vars);
-  return Module(n);
-}
-
-TVM_REGISTER_GLOBAL("runtime.ONNXModuleCreate").set_body_typed(ONNXSourceModuleNodeCreate);
-
-}  // namespace codegen
-}  // namespace tvm
diff --git a/src/runtime/contrib/tensorrt/tensorrt_calibrator.h b/src/runtime/contrib/tensorrt/tensorrt_calibrator.h
index 523676b94702..491714acd927 100755
--- a/src/runtime/contrib/tensorrt/tensorrt_calibrator.h
+++ b/src/runtime/contrib/tensorrt/tensorrt_calibrator.h
@@ -16,7 +16,7 @@
  * under the License.
 
  * file runtime/contrib/tensorrt/tensorrt_builder.h
- * brief Contains TensorRTBuilder class which can be used to convert a relay
+ * brief Contains TensorRTBuilder class which can be used to convert a graph
  * program into a TRT engine which can be used for inference.
 */
 
diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.h b/src/runtime/contrib/tensorrt/tensorrt_ops.h
index e2ef341b4ad6..950d9d183a7b 100644
--- a/src/runtime/contrib/tensorrt/tensorrt_ops.h
+++ b/src/runtime/contrib/tensorrt/tensorrt_ops.h
@@ -198,7 +198,7 @@ class TensorRTOpConverter {
 };
 
 /*!
- * \brief Get the map of available TensorRTOpConverters, where the key is the name of the relay op.
+ * \brief Get the map of available TensorRTOpConverters, where the key is the name of the op.
  * \return Map of TensorRTOpConverters.
  */
 const std::unordered_map<std::string, std::unique_ptr<TensorRTOpConverter>>& GetOpConverters();
diff --git a/src/script/printer/ir/ir.cc b/src/script/printer/ir/ir.cc
index ac3bc5584c66..f98d64dfa3a4 100644
--- a/src/script/printer/ir/ir.cc
+++ b/src/script/printer/ir/ir.cc
@@ -16,7 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-#include <tvm/ir/tensor_type.h>
+#include <tvm/ir/type.h>
 
 #include "./utils.h"
 
@@ -142,13 +142,6 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
       return IR(d, "Op")->Call({LiteralDoc::Str(op->name, p->Attr("name"))});
     });
 
-TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
-    .set_dispatch<TensorType>("", [](TensorType type, ObjectPath p, IRDocsifier d) -> Doc {
-      return IR(d, "TensorType")
-          ->Call({d->AsDoc<ExprDoc>(type->shape, p->Attr("shape")),
-                  LiteralDoc::DataType(type->dtype, p->Attr("dtype"))});
-    });
-
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<FuncType>("", [](FuncType func_type, ObjectPath p, IRDocsifier d) -> Doc {
       return IR(d, "FuncType")
@@ -168,11 +161,6 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     });
 
 std::string ReprPrintIRModule(const ObjectRef& mod, const PrinterConfig& cfg) {
-  if (const auto* f = runtime::Registry::Get("relay.ir.PrintRelayModule")) {
-    if (Optional<String> s = (*f)(mod)) {
-      return s.value();
-    }
-  }
   return ReprPrintIR(mod, cfg);
 }
 
diff --git a/src/script/printer/relax/type.cc b/src/script/printer/relax/type.cc
index d13d90b1d5ed..9b26a942be82 100644
--- a/src/script/printer/relax/type.cc
+++ b/src/script/printer/relax/type.cc
@@ -36,8 +36,8 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
         });
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
-    .set_dispatch<relax::DynTensorType>(
-        "", [](relax::DynTensorType n, ObjectPath n_p, IRDocsifier d) -> Doc {
+    .set_dispatch<relax::TensorType>(  //
+        "", [](relax::TensorType n, ObjectPath n_p, IRDocsifier d) -> Doc {
           return Relax(d, "Tensor")
               ->Call({}, {"ndim", "dtype"},
                      {LiteralDoc::Int(n->ndim, n_p->Attr("ndim")),
@@ -45,7 +45,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
         });
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
-    .set_dispatch<relax::PackedFuncType>(
+    .set_dispatch<relax::PackedFuncType>(  //
         "", [](relax::PackedFuncType n, ObjectPath n_p, IRDocsifier d) -> Doc {
           return Relax(d, "PackedFunc");  // TODO(@junrushao): verify if this is correct
         });
@@ -80,7 +80,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
 
 TVM_SCRIPT_REPR(relax::ShapeTypeNode, ReprPrintRelax);
 TVM_SCRIPT_REPR(relax::ObjectTypeNode, ReprPrintRelax);
-TVM_SCRIPT_REPR(relax::DynTensorTypeNode, ReprPrintRelax);
+TVM_SCRIPT_REPR(relax::TensorTypeNode, ReprPrintRelax);
 TVM_SCRIPT_REPR(relax::PackedFuncTypeNode, ReprPrintRelax);
 TVM_REGISTER_GLOBAL("script.printer.ReprPrintRelax").set_body_typed(ReprPrintRelax);
 
diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc
index fd77427c70b5..b981fcd6d73f 100644
--- a/src/support/libinfo.cc
+++ b/src/support/libinfo.cc
@@ -231,10 +231,6 @@
 #define TVM_INFO_USE_COREML "NOT-FOUND"
 #endif
 
-#ifndef TVM_INFO_USE_TARGET_ONNX
-#define TVM_INFO_USE_TARGET_ONNX "NOT-FOUND"
-#endif
-
 #ifndef TVM_INFO_USE_ARM_COMPUTE_LIB
 #define TVM_INFO_USE_ARM_COMPUTE_LIB "NOT-FOUND"
 #endif
@@ -351,7 +347,6 @@ TVM_DLL Map<String, String> GetLibInfo() {
       {"USE_SORT", TVM_INFO_USE_SORT},
       {"USE_SPIRV_KHR_INTEGER_DOT_PRODUCT", TVM_INFO_USE_SPIRV_KHR_INTEGER_DOT_PRODUCT},
       {"USE_STACKVM_RUNTIME", TVM_INFO_USE_STACKVM_RUNTIME},
-      {"USE_TARGET_ONNX", TVM_INFO_USE_TARGET_ONNX},
       {"USE_TENSORFLOW_PATH", TVM_INFO_USE_TENSORFLOW_PATH},
       {"USE_TENSORRT_CODEGEN", TVM_INFO_USE_TENSORRT_CODEGEN},
       {"USE_TENSORRT_RUNTIME", TVM_INFO_USE_TENSORRT_RUNTIME},
diff --git a/src/target/generic_func.cc b/src/target/generic_func.cc
deleted file mode 100644
index 3135f6a9f240..000000000000
--- a/src/target/generic_func.cc
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*!
- * \file src/target/generic_func.cc
- */
-#include <dmlc/thread_local.h>
-#include <tvm/node/node.h>
-#include <tvm/node/repr_printer.h>
-#include <tvm/runtime/registry.h>
-#include <tvm/target/generic_func.h>
-#include <tvm/target/target.h>
-#include <tvm/tir/expr.h>
-
-#include <algorithm>
-#include <mutex>
-#include <stack>
-
-#include "../runtime/object_internal.h"
-
-namespace tvm {
-
-TVM_REGISTER_NODE_TYPE(GenericFuncNode);
-
-struct GenericFunc::Manager {
-  std::unordered_map<std::string, GenericFunc> fmap;
-  // mutex
-  std::mutex mutex;
-
-  Manager() {}
-
-  static Manager* Global() {
-    static Manager inst;
-    return &inst;
-  }
-};
-
-GenericFunc GenericFunc::Get(const std::string& name) {
-  Manager* m = Manager::Global();
-  std::lock_guard<std::mutex> lock(m->mutex);
-  auto it = m->fmap.find(name);
-  if (it == m->fmap.end()) {
-    auto f = make_object<GenericFuncNode>();
-    f->name_ = name;
-    auto gf = GenericFunc(f);
-    m->fmap[name] = gf;
-    return gf;
-  } else {
-    return it->second;
-  }
-}
-
-void GenericFunc::RegisterGenericFunc(GenericFunc func, const std::string& name) {
-  Manager* m = Manager::Global();
-  std::lock_guard<std::mutex> lock(m->mutex);
-  auto it = m->fmap.find(name);
-  ICHECK(it == m->fmap.end()) << "GenericFunc already registered " << name;
-  func->name_ = name;
-  m->fmap[name] = func;
-}
-
-GenericFunc& GenericFunc::set_default(const PackedFunc value, bool allow_override) {
-  auto node = static_cast<GenericFuncNode*>(operator->());
-  if (!allow_override) {
-    ICHECK(node->generic_func_ == nullptr)
-        << "Generic function already registered for " << node->name_;
-  }
-  node->generic_func_ = value;
-  return *this;
-}
-
-GenericFunc& GenericFunc::register_func(const std::vector<std::string>& tags,
-                                        const PackedFunc value, bool allow_override) {
-  for (auto& t : tags) {
-    if (!allow_override) {
-      auto iter = (*this)->dispatch_dict_.find(t);
-      ICHECK(iter == (*this)->dispatch_dict_.end())
-          << "Tag " << t << " already registered for schedule factory " << (*this)->name_;
-    }
-    (*this)->dispatch_dict_[t] = value;
-  }
-  return *this;
-}
-
-void GenericFunc::CallPacked(TVMArgs args, TVMRetValue* ret) const {
-  auto node = static_cast<const GenericFuncNode*>(get());
-  auto target = Target::Current(true);
-  PackedFunc func;
-
-  if (target.defined()) {
-    for (auto& k : target->GetKeys()) {
-      auto iter = node->dispatch_dict_.find(k);
-      if (iter != node->dispatch_dict_.end()) {
-        func = iter->second;
-        break;
-      }
-    }
-  }
-
-  if (func == nullptr) {
-    ICHECK(node->generic_func_ != nullptr) << "No generic function registered for " << node->name_;
-    func = node->generic_func_;
-  }
-
-  func.CallPacked(args, ret);
-}
-
-PackedFunc GenericFunc::GetPacked() const {
-  auto node = static_cast<const GenericFuncNode*>(get());
-  auto target = Target::Current(true);
-  if (target.defined()) {
-    for (auto& k : target->GetKeys()) {
-      auto iter = node->dispatch_dict_.find(k);
-      if (iter != node->dispatch_dict_.end()) {
-        return iter->second;
-      }
-    }
-  }
-  return node->generic_func_;
-}
-
-TVM_REGISTER_GLOBAL("target.GenericFuncCreate").set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = GenericFunc(make_object<GenericFuncNode>());
-});
-
-TVM_REGISTER_GLOBAL("target.GenericFuncGetGlobal").set_body([](TVMArgs args, TVMRetValue* ret) {
-  std::string func_name = args[0];
-  *ret = GenericFunc::Get(func_name);
-});
-
-TVM_REGISTER_GLOBAL("target.GenericFuncSetDefault").set_body([](TVMArgs args, TVMRetValue* ret) {
-  GenericFunc generic_func = args[0];
-  PackedFunc func = args[1];
-  bool allow_override = args[2];
-  // Intentionally copy and not de-allocate it, to avoid free pyobject during shutdown
-  runtime::ObjectInternal::ObjectRetain((TVMObjectHandle)(func.get()));
-  generic_func.set_default(func, allow_override);
-});
-
-TVM_REGISTER_GLOBAL("target.GenericFuncRegisterFunc").set_body([](TVMArgs args, TVMRetValue* ret) {
-  GenericFunc generic_func = args[0];
-  PackedFunc func = args[1];
-  Array<runtime::String> tags = args[2];
-  bool allow_override = args[3];
-  // Intentionally copy and not de-allocate it, to avoid free pyobject during shutdown
-  runtime::ObjectInternal::ObjectRetain((TVMObjectHandle)(func.get()));
-  std::vector<std::string> tags_vector;
-  for (auto& tag : tags) {
-    tags_vector.push_back(tag);
-  }
-
-  generic_func.register_func(tags_vector, func, allow_override);
-});
-
-TVM_REGISTER_GLOBAL("target.GenericFuncCallFunc").set_body([](TVMArgs args, TVMRetValue* ret) {
-  GenericFunc generic_func = args[0];
-  TVMArgs func_args(&args.values[1], &args.type_codes[1], args.num_args - 1);
-
-  generic_func.CallPacked(func_args, ret);
-});
-
-TVM_REGISTER_GLOBAL("target.GenericFuncGetPackedFunc").set_body([](TVMArgs args, TVMRetValue* ret) {
-  GenericFunc generic_func = args[0];
-  *ret = generic_func.GetPacked();
-});
-
-}  // namespace tvm
diff --git a/src/te/operation/create_primfunc.h b/src/te/operation/create_primfunc.h
index 496ee45ba447..dc045156d114 100644
--- a/src/te/operation/create_primfunc.h
+++ b/src/te/operation/create_primfunc.h
@@ -42,8 +42,6 @@ PrimFunc CreatePrimFuncWithConstants(const Array<te::Tensor>& arg_list,
                                      const Array<runtime::NDArray>& constants,
                                      std::optional<DataType> index_dtype_override = std::nullopt);
 
-// Relax version
-// TODO(relax-team) combine with the relay version
 /*! \brief Use Tensor Expression to create a schedulable TensorIR func. */
 PrimFunc CreatePrimFunc(const Array<ObjectRef>& arg_list,
                         std::optional<DataType> index_dtype_override);
diff --git a/src/tir/ir/expr.cc b/src/tir/ir/expr.cc
index c38237a664f7..ca28520f8f77 100644
--- a/src/tir/ir/expr.cc
+++ b/src/tir/ir/expr.cc
@@ -544,7 +544,7 @@ TVM_REGISTER_GLOBAL("tir.Let").set_body_typed([](Var var, PrimExpr value, PrimEx
 TVM_REGISTER_NODE_TYPE(LetNode);
 
 // Call
-Call::Call(DataType dtype, RelayExpr op, Array<PrimExpr> args, Span span) {
+Call::Call(DataType dtype, RelaxExpr op, Array<PrimExpr> args, Span span) {
   for (size_t i = 0; i < args.size(); ++i) {
     ICHECK(args[i].defined()) << "arg " << i << " is not defined()";
   }
@@ -558,7 +558,7 @@ Call::Call(DataType dtype, RelayExpr op, Array<PrimExpr> args, Span span) {
 }
 
 TVM_REGISTER_GLOBAL("tir.Call")
-    .set_body_typed([](DataType type, RelayExpr op,
+    .set_body_typed([](DataType type, RelaxExpr op,
                        Array<Variant<runtime::String, IterVar, BufferRegion, PrimExpr>> args,
                        Span span) {
       Array<PrimExpr> prim_expr_args;
diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py
index c27bd410f454..119cc2ea7864 100644
--- a/tests/lint/check_file_type.py
+++ b/tests/lint/check_file_type.py
@@ -48,8 +48,6 @@
     "cu",
     "cuh",
     "bat",
-    # relay text format
-    "rly",
     # configurations
     "mk",
     "in",
diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py
index c728ae842cf1..0f8ca9789a64 100644
--- a/tests/python/contrib/test_hexagon/infrastructure.py
+++ b/tests/python/contrib/test_hexagon/infrastructure.py
@@ -16,12 +16,11 @@
 # under the License.
 # pylint: disable=invalid-name
 
-""" Hexagon testing infrastructure """
+"""Hexagon testing infrastructure"""
 
 import numpy
 import tvm
 from tvm import te
-from tvm.relay.backend import Executor
 
 
 def ceildiv(o, d):
@@ -113,19 +112,6 @@ def build_and_run(inputs, func, target: str, target_host: str, *args, **kwargs):
     return tensors[-1].asnumpy()
 
 
-def build_module(relay_mod, target):
-    """builds a relay module for a specified target"""
-    params = {}
-    executor = Executor("aot", {"link-params": True})
-    lowered = tvm.relay.build(
-        relay_mod,
-        tvm.target.Target(target, host=target),
-        executor=executor,
-        params=params,
-    )
-    return lowered
-
-
 def run_module(mod, inputs):
     """invokes run function of specified module with inputs provided"""
     mod.set_input(**inputs)
diff --git a/tests/python/contrib/test_hexagon/test_relax_integration.py b/tests/python/contrib/test_hexagon/test_relax_integration.py
index 89539b795105..7d90adbc959a 100644
--- a/tests/python/contrib/test_hexagon/test_relax_integration.py
+++ b/tests/python/contrib/test_hexagon/test_relax_integration.py
@@ -19,102 +19,9 @@
 import numpy as np
 import pytest
 import tvm.testing
-from tvm import relay, relax, runtime
+from tvm import relax, runtime
 from tvm.relax.testing import relay_translator
 from tvm.contrib.hexagon.session import Session
-from tvm.relay import testing
-
-
-class TestConv2d:
-    """Test conv2d op"""
-
-    n_batch = tvm.testing.parameter(1, relay.Any())
-
-    @tvm.testing.requires_hexagon
-    def test_conv2d(self, hexagon_session: Session, n_batch):
-        """Test Relax conv2d op and compare with Relay"""
-        dtype = "float32"
-        data = relay.var("data", relay.TensorType((n_batch, 64, 64, 3), dtype))
-        weight = relay.var("weight", relay.TensorType((5, 5, 3, 8), dtype))
-        y = relay.nn.conv2d(
-            data,
-            weight,
-            padding=(2, 2),
-            kernel_size=(5, 5),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-            out_dtype="float32",
-        )
-        f = relay.Function([data, weight], y)
-        relay_mod = tvm.IRModule.from_expr(f)
-
-        target_hexagon = tvm.target.hexagon("v68")
-        target = tvm.target.Target(target_hexagon, host=target_hexagon)
-        relax_mod = relay_translator.from_relay(relay_mod["main"], target)
-
-        exe = relax.build(relax_mod, target)
-        dev = hexagon_session.device
-        vm_mod = hexagon_session.get_executor_from_factory(exe)
-        vm_rt = relax.VirtualMachine(vm_mod, dev)
-
-        data_np = np.random.rand(1, 64, 64, 3).astype(np.float32)
-        weight_np = np.random.rand(5, 5, 3, 8).astype(np.float32)
-
-        # Run on hexagon and get result
-        data = tvm.nd.array(data_np, dev)
-        weight = tvm.nd.array(weight_np, dev)
-        vm_rt.set_input("main", data, weight)
-        vm_rt.invoke_stateful("main")
-        hexagon_res = vm_rt.get_outputs("main")
-
-        # Compile and run on Relay for comparison.
-        dev = tvm.cpu()
-        data = tvm.nd.array(data_np, dev)
-        weight = tvm.nd.array(weight_np, dev)
-
-        target = tvm.target.Target("llvm", host="llvm")
-        vm_exec = relay.vm.compile(relay_mod, target=target)
-        vm_factory = runtime.vm.VirtualMachine(vm_exec, tvm.cpu())
-        relay_res = vm_factory.invoke("main", data, weight)
-        tvm.testing.assert_allclose(hexagon_res.numpy(), relay_res.numpy(), rtol=1e-3)
-
-
-class TestMLP:
-    """Test MLP"""
-
-    n_batch = tvm.testing.parameter(1, relay.Any())
-
-    @tvm.testing.requires_hexagon
-    def test_mlp(self, hexagon_session: Session, n_batch):
-        """Test Relax MLP and compare with Relay"""
-        relay_mod, params = testing.mlp.get_workload(batch_size=n_batch, dtype="float32")
-
-        target_hexagon = tvm.target.hexagon("v68")
-        target = tvm.target.Target(target_hexagon, host=target_hexagon)
-        relax_mod = relay_translator.from_relay(relay_mod["main"], target, params)
-
-        exe = relax.build(relax_mod, target)
-        hexagon_device = hexagon_session.device
-
-        vm_mod = hexagon_session.get_executor_from_factory(exe)
-        vm_rt = relax.VirtualMachine(vm_mod, hexagon_device)
-
-        shape = (1, 1, 28, 28)
-        data_np = np.random.rand(*shape).astype("float32")
-        data = tvm.nd.array(data_np, hexagon_device)
-        vm_rt.set_input("main", data)
-        vm_rt.invoke_stateful("main")
-        hexagon_res = vm_rt.get_outputs("main")
-
-        # Compile and run on Relay for comparison.
-        cpu_dev = tvm.cpu()
-        data = tvm.nd.array(data_np, cpu_dev)
-
-        target = tvm.target.Target("llvm", host="llvm")
-        vm_exec = relay.vm.compile(relay_mod, target=target)
-        vm_factory = runtime.vm.VirtualMachine(vm_exec, cpu_dev)
-        relay_res = vm_factory.invoke("main", data, **params)
-        tvm.testing.assert_allclose(hexagon_res.numpy(), relay_res.numpy(), rtol=1e-3)
 
 
 def get_onnx_mobilenet():
@@ -197,40 +104,5 @@ def test_mobilenet(hexagon_session: Session):
     tvm.testing.assert_allclose(hexagon_res.numpy(), llvm_res.numpy(), rtol=1e-3)
 
 
-@pytest.mark.skip("takes too long (~20min)")
-@tvm.testing.requires_hexagon
-def test_mobilenet_dyn(hexagon_session: Session):
-    """Test MobileNet workload with dynamic batch size"""
-    relay_mod, params = testing.mobilenet.get_workload(batch_size=relay.Any(), dtype="float32")
-    data_np = np.random.rand(1, 3, 224, 224).astype("float32")
-
-    target_hexagon = tvm.target.hexagon("v68")
-    target = tvm.target.Target(target_hexagon, host=target_hexagon)
-
-    # translate the relay mobilenet and bind params
-    relax_mod = relay_translator.from_relay(relay_mod["main"], target, params)
-
-    # Compile and run on Hexagon.
-    exe = relax.build(relax_mod, target)
-    dev = hexagon_session.device
-
-    vm_mod = hexagon_session.get_executor_from_factory(exe)
-    vm_rt = relax.VirtualMachine(vm_mod, dev)
-    data = tvm.nd.array(data_np, dev)
-    vm_rt.set_input("main", data)
-    vm_rt.invoke_stateful("main")
-    hexagon_res = vm_rt.get_outputs("main")
-
-    # Compile and run on Relay for comparison.
-    dev = tvm.cpu()
-    data = tvm.nd.array(data_np, dev)
-
-    target = tvm.target.Target("llvm", host="llvm")
-    vm_exec = relay.vm.compile(relay_mod, target=target)
-    vm_factory = runtime.vm.VirtualMachine(vm_exec, tvm.cpu())
-    relay_res = vm_factory.invoke("main", data, **params)
-    tvm.testing.assert_allclose(hexagon_res.numpy(), relay_res.numpy(), rtol=1e-3)
-
-
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/ir/test_ir_type.py b/tests/python/ir/test_ir_type.py
index 0c95bc4554aa..d1f9ab3100eb 100644
--- a/tests/python/ir/test_ir_type.py
+++ b/tests/python/ir/test_ir_type.py
@@ -31,16 +31,9 @@ def test_prim_type():
     assert x.dtype == "int32"
 
 
-def test_tensor_type_bad_constructor():
-    try:
-        x = tvm.ir.TensorType("xx", "xx")
-    except tvm.error.TVMError:
-        pass
-
-
 def test_func_type():
     arg_types = tvm.runtime.convert([])
-    ret_type = tvm.ir.TensorType((1, 2, 3), "float32")
+    ret_type = tvm.ir.PrimType("float32")
     tf = tvm.ir.FuncType(arg_types, ret_type)
     assert tf.arg_types == arg_types
     assert tf.ret_type == ret_type
@@ -52,7 +45,7 @@ def test_func_type():
 
 def test_tuple_type():
     tf = tvm.ir.FuncType([], tvm.ir.TupleType([]))
-    tt = tvm.ir.TensorType(tvm.runtime.convert([1, 2, 3]), "float32")
+    tt = tvm.ir.PrimType("float32")
     fields = tvm.runtime.convert([tf, tt])
 
     tup_ty = tvm.ir.TupleType(fields)
diff --git a/tests/python/nightly/test_nnapi/infrastructure.py b/tests/python/nightly/test_nnapi/infrastructure.py
index a86c681f0bc0..9f3813ef6179 100644
--- a/tests/python/nightly/test_nnapi/infrastructure.py
+++ b/tests/python/nightly/test_nnapi/infrastructure.py
@@ -79,7 +79,7 @@ def _rewriter(
 
 
 def _build(mod, enable_nnapi):
-    if isinstance(mod, tvm.relay.expr.Call):
+    if isinstance(mod, tvm.relax.expr.Call):
         mod = tvm.IRModule.from_expr(mod)
 
     if enable_nnapi:
diff --git a/tests/python/relax/test_analysis_struct_info_analysis.py b/tests/python/relax/test_analysis_struct_info_analysis.py
index b2931549e92b..2d5afdc73f11 100644
--- a/tests/python/relax/test_analysis_struct_info_analysis.py
+++ b/tests/python/relax/test_analysis_struct_info_analysis.py
@@ -54,7 +54,7 @@ def test_get_static_type_tensor():
     s4 = rx.TensorStructInfo([1, n + 1, m], "int64")
 
     tvm.ir.assert_structural_equal(
-        rx.analysis.get_static_type(s4), rx.DynTensorType(ndim=3, dtype="int64")
+        rx.analysis.get_static_type(s4), rx.TensorType(ndim=3, dtype="int64")
     )
 
 
@@ -71,7 +71,7 @@ def test_get_static_type_tuple():
         rx.analysis.get_static_type(t1),
         rx.TupleType(
             [
-                rx.TupleType([rx.DynTensorType(ndim=3, dtype="int64"), rx.ObjectType()]),
+                rx.TupleType([rx.TensorType(ndim=3, dtype="int64"), rx.ObjectType()]),
                 rx.ShapeType(ndim=3),
             ]
         ),
@@ -88,9 +88,9 @@ def fn_info(c):
         return rx.FuncStructInfo([x, y], z)
 
     def fn_type():
-        x = rx.DynTensorType(ndim=3, dtype="float32")
-        y = rx.DynTensorType(ndim=3, dtype="float32")
-        z = rx.DynTensorType(ndim=2, dtype="float32")
+        x = rx.TensorType(ndim=3, dtype="float32")
+        y = rx.TensorType(ndim=3, dtype="float32")
+        z = rx.TensorType(ndim=2, dtype="float32")
         return rx.FuncType([x, y], z)
 
     f0 = fn_info(1)
diff --git a/tests/python/relax/test_analysis_suggest_layout_transforms.py b/tests/python/relax/test_analysis_suggest_layout_transforms.py
index 6e47c1d681d2..03eaef0267b6 100644
--- a/tests/python/relax/test_analysis_suggest_layout_transforms.py
+++ b/tests/python/relax/test_analysis_suggest_layout_transforms.py
@@ -476,7 +476,7 @@ def expected(
 
 
 def test_op_upsampling():
-    # relay materializes the layout if H, W or D dimensions are moved or tiled.
+    # relax materializes the layout if H, W or D dimensions are moved or tiled.
     @T.prim_func(private=True)
     def before(
         arg: T.Buffer((32, 64, 224, 224), "float32"),
diff --git a/tests/python/relax/test_ast_printer.py b/tests/python/relax/test_ast_printer.py
index 1df7dcf36f79..6f55c542ea05 100644
--- a/tests/python/relax/test_ast_printer.py
+++ b/tests/python/relax/test_ast_printer.py
@@ -270,8 +270,8 @@ def test_types():
     assert strip_whitespace(printer.visit_type_(object_type)) == "ObjectType()"
     packed_type = rx.PackedFuncType()
     assert strip_whitespace(printer.visit_type_(packed_type)) == "PackedFuncType()"
-    tensor_type = rx.DynTensorType(ndim=2, dtype="int32")
-    assert strip_whitespace(printer.visit_type_(tensor_type)) == "DynTensorType(ndim=2,dtype=int32)"
+    tensor_type = rx.TensorType(ndim=2, dtype="int32")
+    assert strip_whitespace(printer.visit_type_(tensor_type)) == "TensorType(ndim=2,dtype=int32)"
     unit_type = rx.TupleType([])
     assert strip_whitespace(printer.visit_type_(unit_type)) == "TupleType(fields=[])"
     tuple_type = rx.TupleType([rx.ShapeType(), object_type])
@@ -284,7 +284,7 @@ def test_types():
     func_type = rx.FuncType([tensor_type], unit_type)
     assert_fields(
         "FuncType",
-        {"arg_types": "[DynTensorType(ndim=2, dtype=int32)]", "ret_type": "TupleType(fields=[])"},
+        {"arg_types": "[TensorType(ndim=2, dtype=int32)]", "ret_type": "TupleType(fields=[])"},
         printer.visit_type_(func_type),
     )
 
@@ -622,7 +622,7 @@ def f() -> R.Shape:
     arg = call.args[0]
     arg_str = strip_whitespace(dump_ast(arg))
     # the constant should have a tensor type
-    assert "checked_type_=DynTensorType(ndim=0" in arg_str
+    assert "checked_type_=TensorType(ndim=0" in arg_str
 
     call_str = strip_whitespace(dump_ast(call))
     # we expect the shape_of call to have a checked_type_ of ShapeType
diff --git a/tests/python/relax/test_blockbuilder_core.py b/tests/python/relax/test_blockbuilder_core.py
index da030935c323..efab59a0e683 100644
--- a/tests/python/relax/test_blockbuilder_core.py
+++ b/tests/python/relax/test_blockbuilder_core.py
@@ -206,7 +206,7 @@ def test_binary_shape_type_deduction():
             gv0 = bb.emit_output(lv3)
         bb.emit_func_output(gv0)
 
-        assert isinstance(gv0.checked_type, rx.DynTensorType)
+        assert isinstance(gv0.checked_type, rx.TensorType)
         assert gv0.checked_type.ndim == 1
         assert gv0.checked_type.dtype == "float16"
 
diff --git a/tests/python/relax/test_dataflow_pattern.py b/tests/python/relax/test_dataflow_pattern.py
index a534b7c0c7c9..ab6c6df31bdc 100644
--- a/tests/python/relax/test_dataflow_pattern.py
+++ b/tests/python/relax/test_dataflow_pattern.py
@@ -227,7 +227,7 @@ def test_not_pattern():
 
 
 def test_type_pattern():
-    assert wildcard().has_type(rx.DynTensorType(2, "float32")).match(bindings[0].var)
+    assert wildcard().has_type(rx.TensorType(2, "float32")).match(bindings[0].var)
 
 
 def test_dtype_pattern():
@@ -486,7 +486,7 @@ def main(x: R.Tensor((32, 32), "float32")) -> R.Tensor:
 
 
 def test_distinguish_diamond_and_parallel():
-    # relay pattern lang cannot distinguish the two cases above.
+    # pattern lang cannot distinguish the two cases above.
     diamond = SmallDiamond["main"].body.blocks[0]
     parallel = SmallParallel["main"].body.blocks[0]
 
diff --git a/tests/python/relax/test_expr.py b/tests/python/relax/test_expr.py
index b20c9ef2d982..c20f0b268173 100644
--- a/tests/python/relax/test_expr.py
+++ b/tests/python/relax/test_expr.py
@@ -48,7 +48,7 @@ def test_var() -> None:
     assert v1.name_hint == "v1"
     for s0, s1 in zip(v1.struct_info.shape, shape):
         assert s0 == s1
-    assert v1.checked_type == rx.DynTensorType(2, "float32")
+    assert v1.checked_type == rx.TensorType(2, "float32")
     tvm.ir.assert_structural_equal(v1.struct_info, rx.TensorStructInfo(shape, "float32"))
 
 
@@ -62,7 +62,7 @@ def test_dataflow_var() -> None:
     v1 = rx.DataflowVar("v1", R.Tensor(shape, "float16"))
     assert v1.name_hint == "v1"
 
-    assert v1._checked_type_ == rx.DynTensorType(2, "float16")
+    assert v1._checked_type_ == rx.TensorType(2, "float16")
     assert isinstance(v1, rx.DataflowVar)
     tvm.ir.assert_structural_equal(v1.struct_info, rx.TensorStructInfo(shape, "float16"))
 
@@ -128,7 +128,7 @@ def test_match_cast() -> None:
     assert b1.pattern[0] == m
     assert b1.pattern[1] == n
     assert b1.var is not None
-    assert b1.var.checked_type == rx.DynTensorType(2, "float32")
+    assert b1.var.checked_type == rx.TensorType(2, "float32")
 
 
 def test_match_cast() -> None:
diff --git a/tests/python/relax/test_struct_info.py b/tests/python/relax/test_struct_info.py
index 33dcd7e9d77e..71f9827a149c 100644
--- a/tests/python/relax/test_struct_info.py
+++ b/tests/python/relax/test_struct_info.py
@@ -57,9 +57,9 @@ def test_shape_type():
 
 
 def test_dyn_tensor_type():
-    t0 = rx.DynTensorType()
+    t0 = rx.TensorType()
     assert t0.ndim == -1
-    t1 = rx.DynTensorType(3, "int32")
+    t1 = rx.TensorType(3, "int32")
     assert t1.ndim == 3
     assert t1.dtype == "int32"
 
diff --git a/tests/python/relax/test_transform_merge_composite_functions.py b/tests/python/relax/test_transform_merge_composite_functions.py
index 27537edd9e5f..fccf5a2f5e8e 100644
--- a/tests/python/relax/test_transform_merge_composite_functions.py
+++ b/tests/python/relax/test_transform_merge_composite_functions.py
@@ -159,7 +159,7 @@ def main(
 
     @R.function(private=True)
     def fused_relax_nn_gelu(
-        lv: R.Tensor((1, 64, 54, 54), dtype="float32")
+        lv: R.Tensor((1, 64, 54, 54), dtype="float32"),
     ) -> R.Tensor((1, 64, 54, 54), dtype="float32"):
         R.func_attr({"Primitive": 1, "Composite": "compiler_A.gelu"})
         with R.dataflow():
@@ -169,7 +169,7 @@ def fused_relax_nn_gelu(
 
     @R.function(private=True)
     def fused_relax_nn_relu(
-        lv1: R.Tensor((1, 64, 54, 54), dtype="float32")
+        lv1: R.Tensor((1, 64, 54, 54), dtype="float32"),
     ) -> R.Tensor((1, 64, 54, 54), dtype="float32"):
         R.func_attr({"Primitive": 1, "Composite": "compiler_A.relu"})
         with R.dataflow():
@@ -243,7 +243,7 @@ def lv(
 
         @R.function
         def lv1(
-            lv11: R.Tensor((1, 64, 54, 54), dtype="float32")
+            lv11: R.Tensor((1, 64, 54, 54), dtype="float32"),
         ) -> R.Tensor((1, 64, 54, 54), dtype="float32"):
             # function attr dict
             R.func_attr({"Composite": "compiler_A.relu"})
@@ -257,7 +257,7 @@ def lv1(
 
         @R.function
         def lv21(
-            lv4: R.Tensor((1, 64, 54, 54), dtype="float32")
+            lv4: R.Tensor((1, 64, 54, 54), dtype="float32"),
         ) -> R.Tensor((1, 64, 54, 54), dtype="float32"):
             # function attr dict
             R.func_attr({"Composite": "compiler_A.gelu"})
@@ -321,7 +321,7 @@ def main(
 
     @R.function(private=True)
     def fused_relax_nn_gelu(
-        lv: R.Tensor((1, 64, 54, 54), dtype="float32")
+        lv: R.Tensor((1, 64, 54, 54), dtype="float32"),
     ) -> R.Tensor((1, 64, 54, 54), dtype="float32"):
         R.func_attr({"Primitive": 1, "Composite": "compiler_B.gelu"})
         with R.dataflow():
@@ -331,7 +331,7 @@ def fused_relax_nn_gelu(
 
     @R.function(private=True)
     def fused_relax_nn_relu(
-        lv1: R.Tensor((1, 64, 54, 54), dtype="float32")
+        lv1: R.Tensor((1, 64, 54, 54), dtype="float32"),
     ) -> R.Tensor((1, 64, 54, 54), dtype="float32"):
         R.func_attr({"Primitive": 1, "Composite": "compiler_A.relu"})
         with R.dataflow():
@@ -418,7 +418,7 @@ def lv(
 
         @R.function
         def lv1(
-            lv11: R.Tensor((1, 64, 54, 54), dtype="float32")
+            lv11: R.Tensor((1, 64, 54, 54), dtype="float32"),
         ) -> R.Tensor((1, 64, 54, 54), dtype="float32"):
             R.func_attr({"Composite": "compiler_A.relu"})
             with R.dataflow():
@@ -432,13 +432,13 @@ def lv1(
 
     @R.function
     def fused_relax_nn_gelu1_compiler_B(
-        lv2: R.Tensor((1, 64, 54, 54), dtype="float32")
+        lv2: R.Tensor((1, 64, 54, 54), dtype="float32"),
     ) -> R.Tensor((1, 64, 54, 54), dtype="float32"):
         R.func_attr({"Codegen": "compiler_B"})
 
         @R.function
         def lv21(
-            lv3: R.Tensor((1, 64, 54, 54), dtype="float32")
+            lv3: R.Tensor((1, 64, 54, 54), dtype="float32"),
         ) -> R.Tensor((1, 64, 54, 54), dtype="float32"):
             R.func_attr({"Composite": "compiler_B.gelu"})
             with R.dataflow():
@@ -489,7 +489,7 @@ def main(
 
     @R.function(private=True)
     def fused_relax_nn_relu(
-        x11: R.Tensor((10,), dtype="float32")
+        x11: R.Tensor((10,), dtype="float32"),
     ) -> R.Tensor((10,), dtype="float32"):
         R.func_attr({"Primitive": 1, "Composite": "compiler_A.relu"})
         with R.dataflow():
@@ -499,7 +499,7 @@ def fused_relax_nn_relu(
 
     @R.function(private=True)
     def fused_relax_nn_gelu(
-        x21: R.Tensor((10,), dtype="float32")
+        x21: R.Tensor((10,), dtype="float32"),
     ) -> R.Tensor((10,), dtype="float32"):
         R.func_attr({"Primitive": 1, "Composite": "compiler_A.gelu"})
         with R.dataflow():
@@ -599,7 +599,7 @@ def main(x1: R.Tensor((10,), dtype="float32")) -> R.Tensor((10,), dtype="float32
 
     @R.function(private=True)
     def fused_relax_nn_relu(
-        x11: R.Tensor((10,), dtype="float32")
+        x11: R.Tensor((10,), dtype="float32"),
     ) -> R.Tensor((10,), dtype="float32"):
         R.func_attr({"Primitive": 1, "Composite": "compiler_A.relu"})
         with R.dataflow():
@@ -609,7 +609,7 @@ def fused_relax_nn_relu(
 
     @R.function(private=True)
     def fused_relax_nn_gelu(
-        x21: R.Tensor((10,), dtype="float32")
+        x21: R.Tensor((10,), dtype="float32"),
     ) -> R.Tensor((10,), dtype="float32"):
         R.func_attr({"Primitive": 1, "Composite": "compiler_A.gelu"})
         with R.dataflow():
@@ -644,7 +644,7 @@ def main(x1: R.Tensor((10,), dtype="float32")) -> R.Tensor((10,), dtype="float32
 
     @R.function
     def fused_relax_nn_relu1_compiler_A(
-        x11: R.Tensor((10,), dtype="float32")
+        x11: R.Tensor((10,), dtype="float32"),
     ) -> R.Tensor((10,), dtype="float32"):
         # function attr dict
         R.func_attr({"Codegen": "compiler_A"})
@@ -722,7 +722,7 @@ def main(
 
     @R.function(private=True)
     def fused_relax_nn_relu(
-        add2: R.Tensor((10,), dtype="float32")
+        add2: R.Tensor((10,), dtype="float32"),
     ) -> R.Tensor((10,), dtype="float32"):
         R.func_attr({"Primitive": 1, "Composite": "compiler_A.relu"})
         with R.dataflow():
@@ -742,7 +742,7 @@ def fused_relax_add(
 
     @R.function(private=True)
     def fused_relax_nn_gelu(
-        x31: R.Tensor((10,), dtype="float32")
+        x31: R.Tensor((10,), dtype="float32"),
     ) -> R.Tensor((10,), dtype="float32"):
         R.func_attr({"Primitive": 1, "Composite": "compiler_B.gelu"})
         with R.dataflow():
@@ -817,7 +817,7 @@ def lv31(add2: R.Tensor((10,), dtype="float32")) -> R.Tensor((10,), dtype="float
 
     @R.function
     def fused_relax_nn_gelu1_compiler_B(
-        x3: R.Tensor((10,), dtype="float32")
+        x3: R.Tensor((10,), dtype="float32"),
     ) -> R.Tensor((10,), dtype="float32"):
         R.func_attr({"Codegen": "compiler_B"})
 
@@ -995,7 +995,6 @@ def test_merge_producers_cyclic_dep():
 def test_merge_compiler_regions_example():
     """
     A tricky example from https://discuss.tvm.apache.org/t/relay-improved-graph-partitioning-algorithm/5830
-    See also the corresponding test case for Relay MergeCompilerRegions in relay/test_pass_merge_compiler_regions.py.
     """
     check(
         MergeCompilerRegionsExample,
@@ -1131,7 +1130,7 @@ def main(A: R.Tensor([10], dtype="float32")) -> R.Tensor([10], dtype="float32"):
 
         @R.function(private=True)
         def fused_relax_nn_relu(
-            Input: R.Tensor([10], dtype="float32")
+            Input: R.Tensor([10], dtype="float32"),
         ) -> R.Tensor([10], dtype="float32"):
             R.func_attr({"Composite": "compiler_A.relu", "Primitive": 1})
             with R.dataflow():
@@ -1152,7 +1151,7 @@ def relu(
 
         @R.function(private=True)
         def fused_relax_nn_gelu(
-            Input: R.Tensor([10], dtype="float32")
+            Input: R.Tensor([10], dtype="float32"),
         ) -> R.Tensor([10], dtype="float32"):
             R.func_attr({"Composite": "compiler_A.gelu", "Primitive": 1})
             with R.dataflow():
@@ -1174,13 +1173,13 @@ def main(A: R.Tensor([10], dtype="float32")) -> R.Tensor([10], dtype="float32"):
 
         @R.function
         def fused_relax_nn_relu1_compiler_A(
-            Input: R.Tensor([10], dtype="float32")
+            Input: R.Tensor([10], dtype="float32"),
         ) -> R.Tensor([10], dtype="float32"):
             R.func_attr({"Codegen": "compiler_A"})
 
             @R.function
             def composite_lambda(
-                Input: R.Tensor([10], dtype="float32")
+                Input: R.Tensor([10], dtype="float32"),
             ) -> R.Tensor([10], dtype="float32"):
                 R.func_attr({"Composite": "compiler_A.relu"})
                 with R.dataflow():
@@ -1204,13 +1203,13 @@ def relu(
 
         @R.function
         def fused_relax_nn_gelu1_compiler_A(
-            Input: R.Tensor([10], dtype="float32")
+            Input: R.Tensor([10], dtype="float32"),
         ) -> R.Tensor([10], dtype="float32"):
             R.func_attr({"Codegen": "compiler_A"})
 
             @R.function
             def composite_lambda(
-                Input: R.Tensor([10], dtype="float32")
+                Input: R.Tensor([10], dtype="float32"),
             ) -> R.Tensor([10], dtype="float32"):
                 R.func_attr({"Composite": "compiler_A.gelu"})
                 with R.dataflow():
diff --git a/tests/python/relax/test_transform_normalize.py b/tests/python/relax/test_transform_normalize.py
index 335ca7c70a12..b5559b4972d4 100644
--- a/tests/python/relax/test_transform_normalize.py
+++ b/tests/python/relax/test_transform_normalize.py
@@ -190,7 +190,7 @@ def test_normalize_if_branches():
     # an if node's branches must be seq exprs
     x = relax.Var("x", R.Tensor([], "int32"))
     y = relax.Var("y", R.Tensor([], "int32"))
-    # TODO(@relax-team): z has a shape of () and type of DynTensorType(ndim=0),
+    # TODO(@relax-team): z has a shape of () and type of TensorType(ndim=0),
     # but normalization fails to infer these even though it should
     z = relax.Var("z")
     cond = relax.Var("cond", R.Tensor([], "bool"))
diff --git a/tests/scripts/task_mypy.sh b/tests/scripts/task_mypy.sh
index a8a788f4ce13..c8d29237bb24 100755
--- a/tests/scripts/task_mypy.sh
+++ b/tests/scripts/task_mypy.sh
@@ -38,15 +38,5 @@ mypy  --check-untyped-defs python/tvm/script/printer
 echo "Checking MyPy Type defs in the TIR package with unittest"
 MYPYPATH=$TVM_PATH/python mypy --check-untyped-defs tests/python/tvmscript/test_tvmscript_type.py
 
-echo "Checking MyPy Type defs in tvm.relay.op.contrib"
-mypy --disallow-untyped-defs python/tvm/relay/op/contrib/cublas.py
-mypy --disallow-untyped-defs python/tvm/relay/op/contrib/cudnn.py
-mypy --disallow-untyped-defs python/tvm/relay/op/contrib/te_target.py
-mypy --disallow-untyped-defs python/tvm/relay/op/contrib/tensorrt.py
-
-#TODO(@mikepapadim): This is failing atm
-# echo "Checking MyPy Type defs in the tvm.relay.backend.contrib.ethosu package."
-# mypy  --check-untyped-defs python/tvm/relay/backend/contrib/ethosu/
-
 echo "Checking MyPy Type defs in the tvmscript IRBuilder package."
 mypy  --check-untyped-defs python/tvm/script/ir_builder

From 2d9016ce0a74abf445c75ea2fbc5290df0ba820f Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 17 Feb 2025 15:04:58 -0500
Subject: [PATCH 10/10] fix

---
 tests/python/relax/test_tvmscript_printer_relax.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/python/relax/test_tvmscript_printer_relax.py b/tests/python/relax/test_tvmscript_printer_relax.py
index e93547d83e3c..52f935a894c8 100644
--- a/tests/python/relax/test_tvmscript_printer_relax.py
+++ b/tests/python/relax/test_tvmscript_printer_relax.py
@@ -233,7 +233,7 @@ def test_object_type():
 
 
 def test_dyn_tensor_type():
-    obj = relax.DynTensorType()
+    obj = relax.TensorType()
     _assert_print(obj, 'R.Tensor(ndim=-1, dtype="float32")')
 
 
@@ -256,7 +256,7 @@ def test_func_type():
             relax.ObjectType(),
             relax.ShapeType(ndim=3),
         ],
-        ret_type=relax.DynTensorType(
+        ret_type=relax.TensorType(
             ndim=3,
             dtype="float32",
         ),