diff --git a/apps/android_rpc/tests/android_rpc_test.py b/apps/android_rpc/tests/android_rpc_test.py
index 0027cc4ba352..ba6c0f9c9679 100644
--- a/apps/android_rpc/tests/android_rpc_test.py
+++ b/apps/android_rpc/tests/android_rpc_test.py
@@ -56,37 +56,15 @@ def test_rpc_module():
     tracker = rpc.connect_tracker(tracker_host, tracker_port)
     remote = tracker.request(key, priority=0, session_timeout=60)
 
-    # Compile the Graph for CPU target
-    s = te.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=64)
-    s[B].parallel(xi)
-    s[B].pragma(xo, "parallel_launch_point")
-    s[B].pragma(xi, "parallel_barrier_when_finish")
-    f = tvm.build(s, [A, B], target, name="myadd_cpu")
-    path_dso_cpu = temp.relpath("cpu_lib.so")
-    f.export_library(path_dso_cpu, fcompile=ndk.create_shared)
+    mod = tvm.IRModule.from_expr(te.create_prim_func([A, B]).with_attr("global_symbol", "myadd"))
+    sch = tvm.tir.Schedule(mod)
+    (x,) = sch.get_loops(block=sch.get_block("B"))  # unpack expects exactly one loop
+    xo, xi = sch.split(x, [None, 32])  # split the loop bound above; None = inferred outer extent
+    sch.bind(xo, "blockIdx.x")  # outer loop -> blockIdx.x
+    sch.bind(xi, "threadIdx.x")  # inner loop (extent 32) -> threadIdx.x
 
-    # Execute the portable graph on cpu target
-    print("Run CPU test ...")
-    dev = remote.cpu(0)
-    remote.upload(path_dso_cpu)
-    f2 = remote.load_module("cpu_lib.so")
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
-    time_f = f2.time_evaluator(f2.entry_name, dev, number=10)
-    cost = time_f(a, b).mean
-    print("%g secs/op\n" % cost)
-    np.testing.assert_equal(b.numpy(), a.numpy() + 1)
-
-    # Compile the Graph for OpenCL target
     if test_opencl:
-        s = te.create_schedule(B.op)
-        xo, xi = s[B].split(B.op.axis[0], factor=64)
-        s[B].bind(xi, te.thread_axis("threadIdx.x"))
-        s[B].bind(xo, te.thread_axis("blockIdx.x"))
-        # Build the dynamic lib.
-        # If we don't want to do metal and only use cpu, just set target to be target
-        f = tvm.build(s, [A, B], tvm.target.Target("opencl", host=target), name="myadd")
+        f = tvm.build(sch.mod, target=tvm.target.Target("opencl", host=target))
         path_dso_cl = temp.relpath("dev_lib_cl.so")
         f.export_library(path_dso_cl, fcompile=ndk.create_shared)
 
@@ -101,29 +79,6 @@ def test_rpc_module():
         print("%g secs/op\n" % cost)
         np.testing.assert_equal(b.numpy(), a.numpy() + 1)
 
-    # Compile the Graph for Vulkan target
-    if test_vulkan:
-        s = te.create_schedule(B.op)
-        xo, xi = s[B].split(B.op.axis[0], factor=64)
-        s[B].bind(xi, te.thread_axis("threadIdx.x"))
-        s[B].bind(xo, te.thread_axis("blockIdx.x"))
-        # Build the dynamic lib.
-        # If we don't want to do metal and only use cpu, just set target to be target
-        f = tvm.build(s, [A, B], tvm.target.Target("vulkan", host=target), name="myadd")
-        path_dso_vulkan = temp.relpath("dev_lib_vulkan.so")
-        f.export_library(path_dso_vulkan, fcompile=ndk.create_shared)
-
-        print("Run GPU(Vulkan Flavor) test ...")
-        dev = remote.vulkan(0)
-        remote.upload(path_dso_vulkan)
-        f1 = remote.load_module("dev_lib_vulkan.so")
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
-        time_f = f1.time_evaluator(f1.entry_name, dev, number=10)
-        cost = time_f(a, b).mean
-        print("%g secs/op\n" % cost)
-        np.testing.assert_equal(b.numpy(), a.numpy() + 1)
-
 
 if __name__ == "__main__":
     test_rpc_module()
diff --git a/apps/ios_rpc/tests/ios_rpc_test.py b/apps/ios_rpc/tests/ios_rpc_test.py
index f0c31cd7d268..3e807adf484c 100644
--- a/apps/ios_rpc/tests/ios_rpc_test.py
+++ b/apps/ios_rpc/tests/ios_rpc_test.py
@@ -50,25 +50,19 @@ def test_rpc_module(host, port, key, mode):
     A = te.placeholder((n,), name="A")
     B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
     temp = utils.tempdir()
-    s = te.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=64)
-    s[B].bind(xi, te.thread_axis("threadIdx.x"))
-    s[B].bind(xo, te.thread_axis("blockIdx.x"))
+    mod = tvm.IRModule.from_expr(te.create_prim_func([A, B]).with_attr("global_symbol", "myadd"))
+    sch = tvm.tir.Schedule(mod)
+    (i,) = sch.get_loops(block=sch.get_block("B"))  # B is computed over A.shape == (n,): one loop
+    i0, i1 = sch.split(i, [None, 32])  # i1 gets extent 32; None lets the outer extent be inferred
+    sch.bind(i0, "blockIdx.x")  # outer loop -> blockIdx.x
+    sch.bind(i1, "threadIdx.x")  # inner loop -> threadIdx.x
+
     # Build the dynamic lib.
     # If we don't want to do metal and only use cpu, just set target to be target
-    f = tvm.build(s, [A, B], tvm.target.Target("metal", host=target), name="myadd")
+    f = tvm.build(sch.mod, target=tvm.target.Target("metal", host=target))
     path_dso1 = temp.relpath("dev_lib.dylib")
     f.export_library(path_dso1, fcompile=xcode.create_dylib, arch=arch, sdk=sdk)
 
-    s = te.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=64)
-    s[B].parallel(xi)
-    s[B].pragma(xo, "parallel_launch_point")
-    s[B].pragma(xi, "parallel_barrier_when_finish")
-    f = tvm.build(s, [A, B], target, name="myadd_cpu")
-    path_dso2 = temp.relpath("cpu_lib.dylib")
-    f.export_library(path_dso2, fcompile=xcode.create_dylib, arch=arch, sdk=sdk)
-
     # connect to the proxy
     if mode == "tracker":
         remote = MODES[mode](host, port).request(key)
@@ -84,17 +78,6 @@ def test_rpc_module(host, port, key, mode):
     cost = time_f(a, b).mean
     print("Metal: %g secs/op" % cost)
     np.testing.assert_equal(b.numpy(), a.numpy() + 1)
-    # CPU
-    dev = remote.cpu(0)
-    remote.upload(path_dso2)
-    f2 = remote.load_module("cpu_lib.dylib")
-    a_np = np.random.uniform(size=1024).astype(A.dtype)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
-    time_f = f2.time_evaluator(f2.entry_name, dev, number=10)
-    cost = time_f(a, b).mean
-    print("CPU: %g secs/op" % cost)
-    np.testing.assert_equal(b.numpy(), a.numpy() + 1)
 
 
 if __name__ == "__main__":
diff --git a/ci/jenkins/generated/arm_jenkinsfile.groovy b/ci/jenkins/generated/arm_jenkinsfile.groovy
index 5e48cc65004b..03ea3a028040 100644
--- a/ci/jenkins/generated/arm_jenkinsfile.groovy
+++ b/ci/jenkins/generated/arm_jenkinsfile.groovy
@@ -60,7 +60,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2025-02-15T10:14:10.162250
+// Generated at 2025-02-15T20:02:41.820729
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // These are set at runtime from data in ci/jenkins/docker-images.yml, update
@@ -545,274 +545,3 @@ def build() {
   }
 }
 build()
-
-
-
-def shard_run_integration_aarch64_1_of_4(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_arm)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=arm',
-            'TEST_STEP_NAME=integration: aarch64',
-            'TVM_NUM_SHARDS=4',
-            'TVM_SHARD_INDEX=0',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_arm)
-              python_unittest(ci_arm)
-              sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('integration: aarch64 1 of 4')
-  }
-}
-
-def shard_run_integration_aarch64_2_of_4(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_arm)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=arm',
-            'TEST_STEP_NAME=integration: aarch64',
-            'TVM_NUM_SHARDS=4',
-            'TVM_SHARD_INDEX=1',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_arm)
-              python_unittest(ci_arm)
-              sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('integration: aarch64 2 of 4')
-  }
-}
-
-def shard_run_integration_aarch64_3_of_4(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_arm)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=arm',
-            'TEST_STEP_NAME=integration: aarch64',
-            'TVM_NUM_SHARDS=4',
-            'TVM_SHARD_INDEX=2',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_arm)
-              python_unittest(ci_arm)
-              sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('integration: aarch64 3 of 4')
-  }
-}
-
-def shard_run_integration_aarch64_4_of_4(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_arm)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=arm',
-            'TEST_STEP_NAME=integration: aarch64',
-            'TVM_NUM_SHARDS=4',
-            'TVM_SHARD_INDEX=3',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_arm)
-              python_unittest(ci_arm)
-              sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('integration: aarch64 4 of 4')
-  }
-}
-
-
-
-def test() {
-  stage('Test') {
-    environment {
-      SKIP_SLOW_TESTS = "${skip_slow_tests}"
-    }
-    parallel(
-    'integration: aarch64 1 of 4': {
-      try {
-      shard_run_integration_aarch64_1_of_4('ARM-GRAVITON3-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_integration_aarch64_1_of_4('ARM-GRAVITON3')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'integration: aarch64 2 of 4': {
-      try {
-      shard_run_integration_aarch64_2_of_4('ARM-GRAVITON3-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_integration_aarch64_2_of_4('ARM-GRAVITON3')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'integration: aarch64 3 of 4': {
-      try {
-      shard_run_integration_aarch64_3_of_4('ARM-GRAVITON3-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_integration_aarch64_3_of_4('ARM-GRAVITON3')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'integration: aarch64 4 of 4': {
-      try {
-      shard_run_integration_aarch64_4_of_4('ARM-GRAVITON3-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_integration_aarch64_4_of_4('ARM-GRAVITON3')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    )
-  }
-}
-test()
diff --git a/ci/jenkins/generated/cpu_jenkinsfile.groovy b/ci/jenkins/generated/cpu_jenkinsfile.groovy
index b54fdf51ca3c..627bb85862f3 100644
--- a/ci/jenkins/generated/cpu_jenkinsfile.groovy
+++ b/ci/jenkins/generated/cpu_jenkinsfile.groovy
@@ -60,7 +60,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2025-02-15T10:14:10.181874
+// Generated at 2025-02-15T19:40:24.687837
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // These are set at runtime from data in ci/jenkins/docker-images.yml, update
@@ -553,158 +553,21 @@ build()
 
 
 
-def shard_run_integration_CPU_1_of_4(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_cpu)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=cpu',
-            'TEST_STEP_NAME=integration: CPU',
-            'TVM_NUM_SHARDS=4',
-            'TVM_SHARD_INDEX=0',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_cpu)
-              sh (
-                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('integration: CPU 1 of 4')
-  }
-}
-
-def shard_run_integration_CPU_2_of_4(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_cpu)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=cpu',
-            'TEST_STEP_NAME=integration: CPU',
-            'TVM_NUM_SHARDS=4',
-            'TVM_SHARD_INDEX=1',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_cpu)
-              sh (
-                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('integration: CPU 2 of 4')
-  }
-}
-
-def shard_run_integration_CPU_3_of_4(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_cpu)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=cpu',
-            'TEST_STEP_NAME=integration: CPU',
-            'TVM_NUM_SHARDS=4',
-            'TVM_SHARD_INDEX=2',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_cpu)
-              sh (
-                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('integration: CPU 3 of 4')
-  }
-}
 
-def shard_run_integration_CPU_4_of_4(node_type) {
+def shard_run_unittest_CPU_1_of_2(node_type) {
   echo 'Begin running on node_type ' + node_type
   if (!skip_ci && is_docs_only_build != 1) {
     node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-cpu") {
         // NOTE: if exception happens, it will be caught outside
         init_git()
         docker_init(ci_cpu)
         timeout(time: max_time, unit: 'MINUTES') {
           withEnv([
             'PLATFORM=cpu',
-            'TEST_STEP_NAME=integration: CPU',
-            'TVM_NUM_SHARDS=4',
-            'TVM_SHARD_INDEX=3',
+            'TEST_STEP_NAME=unittest: CPU',
+            'TVM_NUM_SHARDS=2',
+            'TVM_SHARD_INDEX=0',
             "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
             sh(
                   script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
@@ -712,16 +575,14 @@ def shard_run_integration_CPU_4_of_4(node_type) {
                 )
 
               ci_setup(ci_cpu)
-              sh (
-                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
+              cpp_unittest(ci_cpu)
+              python_unittest(ci_cpu)
           })
         }
         // only run upload if things are successful
         try {
           sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results",
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/unittest_CPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -733,13 +594,11 @@ def shard_run_integration_CPU_4_of_4(node_type) {
     }
     echo 'End running on node_type ' + node_type
   } else {
-    Utils.markStageSkippedForConditional('integration: CPU 4 of 4')
+    Utils.markStageSkippedForConditional('unittest: CPU 1 of 2')
   }
 }
 
-
-
-def shard_run_unittest_CPU_1_of_1(node_type) {
+def shard_run_unittest_CPU_2_of_2(node_type) {
   echo 'Begin running on node_type ' + node_type
   if (!skip_ci && is_docs_only_build != 1) {
     node(node_type) {
@@ -751,8 +610,8 @@ def shard_run_unittest_CPU_1_of_1(node_type) {
           withEnv([
             'PLATFORM=cpu',
             'TEST_STEP_NAME=unittest: CPU',
-            'TVM_NUM_SHARDS=1',
-            'TVM_SHARD_INDEX=0',
+            'TVM_NUM_SHARDS=2',
+            'TVM_SHARD_INDEX=1',
             "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
             sh(
                   script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
@@ -779,7 +638,7 @@ def shard_run_unittest_CPU_1_of_1(node_type) {
     }
     echo 'End running on node_type ' + node_type
   } else {
-    Utils.markStageSkippedForConditional('unittest: CPU 1 of 1')
+    Utils.markStageSkippedForConditional('unittest: CPU 2 of 2')
   }
 }
 
@@ -790,60 +649,9 @@ def test() {
       SKIP_SLOW_TESTS = "${skip_slow_tests}"
     }
     parallel(
-    'integration: CPU 1 of 4': {
-      try {
-      shard_run_integration_CPU_1_of_4('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_integration_CPU_1_of_4('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'integration: CPU 2 of 4': {
-      try {
-      shard_run_integration_CPU_2_of_4('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_integration_CPU_2_of_4('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'integration: CPU 3 of 4': {
-      try {
-      shard_run_integration_CPU_3_of_4('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_integration_CPU_3_of_4('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'integration: CPU 4 of 4': {
+    'unittest: CPU 1 of 2': {
       try {
-      shard_run_integration_CPU_4_of_4('CPU-SMALL-SPOT')
+      shard_run_unittest_CPU_1_of_2('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
         if (is_last_build()) {
           // retry if at last build
@@ -851,16 +659,16 @@ def test() {
           // and try again via on demand node
           echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
           currentBuild.result = 'SUCCESS'
-          shard_run_integration_CPU_4_of_4('CPU-SMALL')
+          shard_run_unittest_CPU_1_of_2('CPU-SMALL')
         } else {
           echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
           throw ex
         }
       }
     },
-    'unittest: CPU 1 of 1': {
+    'unittest: CPU 2 of 2': {
       try {
-      shard_run_unittest_CPU_1_of_1('CPU-SMALL-SPOT')
+      shard_run_unittest_CPU_2_of_2('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
         if (is_last_build()) {
           // retry if at last build
@@ -868,7 +676,7 @@ def test() {
           // and try again via on demand node
           echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
           currentBuild.result = 'SUCCESS'
-          shard_run_unittest_CPU_1_of_1('CPU-SMALL')
+          shard_run_unittest_CPU_2_of_2('CPU-SMALL')
         } else {
           echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
           throw ex
diff --git a/ci/jenkins/generated/hexagon_jenkinsfile.groovy b/ci/jenkins/generated/hexagon_jenkinsfile.groovy
index da20f33bbb3d..a9014337a74a 100644
--- a/ci/jenkins/generated/hexagon_jenkinsfile.groovy
+++ b/ci/jenkins/generated/hexagon_jenkinsfile.groovy
@@ -60,7 +60,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2025-02-15T10:14:10.056677
+// Generated at 2025-02-15T19:31:36.031215
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // These are set at runtime from data in ci/jenkins/docker-images.yml, update
@@ -552,519 +552,12 @@ build()
 
 
 
-
-def shard_run_test_Hexagon_1_of_8(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_hexagon)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=hexagon',
-            'TEST_STEP_NAME=test: Hexagon',
-            'TVM_NUM_SHARDS=8',
-            'TVM_SHARD_INDEX=0',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_hexagon)
-              cpp_unittest(ci_hexagon)
-              sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 1 of 8')
-  }
-}
-
-def shard_run_test_Hexagon_2_of_8(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_hexagon)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=hexagon',
-            'TEST_STEP_NAME=test: Hexagon',
-            'TVM_NUM_SHARDS=8',
-            'TVM_SHARD_INDEX=1',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_hexagon)
-              sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 2 of 8')
-  }
-}
-
-def shard_run_test_Hexagon_3_of_8(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_hexagon)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=hexagon',
-            'TEST_STEP_NAME=test: Hexagon',
-            'TVM_NUM_SHARDS=8',
-            'TVM_SHARD_INDEX=2',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_hexagon)
-              sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 3 of 8')
-  }
-}
-
-def shard_run_test_Hexagon_4_of_8(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_hexagon)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=hexagon',
-            'TEST_STEP_NAME=test: Hexagon',
-            'TVM_NUM_SHARDS=8',
-            'TVM_SHARD_INDEX=3',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_hexagon)
-              sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 4 of 8')
-  }
-}
-
-def shard_run_test_Hexagon_5_of_8(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_hexagon)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=hexagon',
-            'TEST_STEP_NAME=test: Hexagon',
-            'TVM_NUM_SHARDS=8',
-            'TVM_SHARD_INDEX=4',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_hexagon)
-              sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 5 of 8')
-  }
-}
-
-def shard_run_test_Hexagon_6_of_8(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_hexagon)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=hexagon',
-            'TEST_STEP_NAME=test: Hexagon',
-            'TVM_NUM_SHARDS=8',
-            'TVM_SHARD_INDEX=5',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_hexagon)
-              sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 6 of 8')
-  }
-}
-
-def shard_run_test_Hexagon_7_of_8(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_hexagon)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=hexagon',
-            'TEST_STEP_NAME=test: Hexagon',
-            'TVM_NUM_SHARDS=8',
-            'TVM_SHARD_INDEX=6',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_hexagon)
-              sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 7 of 8')
-  }
-}
-
-def shard_run_test_Hexagon_8_of_8(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_hexagon)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=hexagon',
-            'TEST_STEP_NAME=test: Hexagon',
-            'TVM_NUM_SHARDS=8',
-            'TVM_SHARD_INDEX=7',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_hexagon)
-              sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 8 of 8')
-  }
-}
-
-
 def test() {
   stage('Test') {
     environment {
       SKIP_SLOW_TESTS = "${skip_slow_tests}"
     }
     parallel(
-    'test: Hexagon 1 of 8': {
-      try {
-      shard_run_test_Hexagon_1_of_8('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_test_Hexagon_1_of_8('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'test: Hexagon 2 of 8': {
-      try {
-      shard_run_test_Hexagon_2_of_8('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_test_Hexagon_2_of_8('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'test: Hexagon 3 of 8': {
-      try {
-      shard_run_test_Hexagon_3_of_8('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_test_Hexagon_3_of_8('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'test: Hexagon 4 of 8': {
-      try {
-      shard_run_test_Hexagon_4_of_8('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_test_Hexagon_4_of_8('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'test: Hexagon 5 of 8': {
-      try {
-      shard_run_test_Hexagon_5_of_8('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_test_Hexagon_5_of_8('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'test: Hexagon 6 of 8': {
-      try {
-      shard_run_test_Hexagon_6_of_8('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_test_Hexagon_6_of_8('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'test: Hexagon 7 of 8': {
-      try {
-      shard_run_test_Hexagon_7_of_8('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_test_Hexagon_7_of_8('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'test: Hexagon 8 of 8': {
-      try {
-      shard_run_test_Hexagon_8_of_8('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_test_Hexagon_8_of_8('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
     )
   }
 }
diff --git a/ci/jenkins/templates/arm_jenkinsfile.groovy.j2 b/ci/jenkins/templates/arm_jenkinsfile.groovy.j2
index aa999408a7e2..0781bc92dbe5 100644
--- a/ci/jenkins/templates/arm_jenkinsfile.groovy.j2
+++ b/ci/jenkins/templates/arm_jenkinsfile.groovy.j2
@@ -33,25 +33,3 @@
   make_cpp_tests(ci_arm, 'build')
   {{ m.upload_artifacts(tag='arm', filenames=tvm_lib + cpptest) }}
 {% endcall %}
-
-{% set test_method_names = [] %}
-
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="integration: aarch64",
-  num_shards=4,
-  ws="tvm/ut-python-arm",
-  platform="arm",
-  docker_image="ci_arm",
-  test_method_names=test_method_names,
-) %}
-  {{ m.download_artifacts(tag='arm') }}
-  ci_setup(ci_arm)
-  python_unittest(ci_arm)
-  sh (
-    script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
-    label: 'Run CPU integration tests',
-  )
-{% endcall %}
-
-
-{{ m.invoke_tests(node="ARM-GRAVITON3", test_method_names=test_method_names) -}}
diff --git a/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2 b/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2
index e34132c94111..c84b0c48a29f 100644
--- a/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2
+++ b/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2
@@ -40,27 +40,12 @@
 
 {% set test_method_names = [] %}
 
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="integration: CPU",
-  num_shards=4,
-  ws="tvm/integration-python-cpu",
-  platform="cpu",
-  docker_image="ci_cpu",
-  test_method_names=test_method_names,
-) %}
-  {{ m.download_artifacts(tag='cpu') }}
-  ci_setup(ci_cpu)
-  sh (
-    script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-    label: 'Run CPU integration tests',
-  )
-{% endcall %}
 
 {% call(shard_index, num_shards) m.sharded_test_step(
   name="unittest: CPU",
   ws="tvm/ut-python-cpu",
   platform="cpu",
-  num_shards=1,
+  num_shards=2,
   docker_image="ci_cpu",
   test_method_names=test_method_names,
 ) %}
diff --git a/ci/jenkins/templates/hexagon_jenkinsfile.groovy.j2 b/ci/jenkins/templates/hexagon_jenkinsfile.groovy.j2
index 91d3ce9ece42..b4177b332987 100644
--- a/ci/jenkins/templates/hexagon_jenkinsfile.groovy.j2
+++ b/ci/jenkins/templates/hexagon_jenkinsfile.groovy.j2
@@ -41,23 +41,4 @@
 
 {% set test_method_names = [] %}
 
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="test: Hexagon",
-  ws="tvm/test-hexagon",
-  platform="hexagon",
-  docker_image="ci_hexagon",
-  test_method_names=test_method_names,
-  num_shards=8,
-) %}
-  {{ m.download_artifacts(tag='hexagon') }}
-  ci_setup(ci_hexagon)
-  {% if shard_index == 1 %}
-  cpp_unittest(ci_hexagon)
-  {% endif %}
-  sh (
-    script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-    label: 'Run Hexagon tests',
-  )
-{% endcall %}
-
 {{ m.invoke_tests(node="CPU-SMALL", test_method_names=test_method_names) -}}
diff --git a/docker/Dockerfile.ci_wasm b/docker/Dockerfile.ci_wasm
index 6860c51d7277..83a9f0e9f0e8 100644
--- a/docker/Dockerfile.ci_wasm
+++ b/docker/Dockerfile.ci_wasm
@@ -64,8 +64,8 @@ RUN bash /install/ubuntu_install_emscripten.sh
 ENV EMSDK=/emsdk
 ENV PATH=${PATH}:${EMSDK}:${EMSDK}/upstream/emscripten
 ENV EMSCRIPTEN=${EMSDK}/upstream/emscripten
-ENV BINARYEN=${EMSDK}/upstream
-ENV LLVM=${EMSDK}/upstream/bin
+ENV EM_BINARYEN_ROOT=${EMSDK}/upstream
+ENV EM_LLVM_ROOT=${EMSDK}/upstream/bin
 
 # sccache
 COPY install/ubuntu_install_sccache.sh /install/ubuntu_install_sccache.sh
diff --git a/docs/how_to/tutorials/cross_compilation_and_rpc.py b/docs/how_to/tutorials/cross_compilation_and_rpc.py
index c7e302693de7..81c73fd051ef 100644
--- a/docs/how_to/tutorials/cross_compilation_and_rpc.py
+++ b/docs/how_to/tutorials/cross_compilation_and_rpc.py
@@ -104,7 +104,7 @@
 n = tvm.runtime.convert(1024)
 A = te.placeholder((n,), name="A")
 B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
-s = te.create_schedule(B.op)
+mod = tvm.IRModule.from_expr(te.create_prim_func([A, B]).with_attr("global_symbol", "add_one"))
 
 ######################################################################
 # Then we cross compile the kernel.
@@ -119,7 +119,7 @@
 else:
     target = "llvm -mtriple=armv7l-linux-gnueabihf"
 
-func = tvm.build(s, [A, B], target=target, name="add_one")
+func = tvm.build(mod, target=target, name="add_one")
 # save the lib at a local temp folder
 temp = utils.tempdir()
 path = temp.relpath("lib.tar")
@@ -231,11 +231,13 @@ def run_opencl():
     target = tvm.target.Target("opencl", host="llvm -mtriple=aarch64-linux-gnu")
 
     # create schedule for the above "add one" compute declaration
-    s = te.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=32)
-    s[B].bind(xo, te.thread_axis("blockIdx.x"))
-    s[B].bind(xi, te.thread_axis("threadIdx.x"))
-    func = tvm.build(s, [A, B], target=target)
+    mod = tvm.IRModule.from_expr(te.create_prim_func([A, B]))
+    sch = tvm.tir.Schedule(mod)
+    (x,) = sch.get_loops(block=sch.get_block("B"))  # block "B" has a single loop
+    xo, xi = sch.split(x, [None, 32])  # split that loop: inner extent 32, outer inferred
+    sch.bind(xo, "blockIdx.x")  # outer loop -> blockIdx.x
+    sch.bind(xi, "threadIdx.x")  # inner loop -> threadIdx.x
+    func = tvm.build(sch.mod, target=target)
 
     remote = rpc.connect(opencl_device_host, opencl_device_port)
 
diff --git a/docs/reference/api/python/contrib.rst b/docs/reference/api/python/contrib.rst
index 0eb3024c2d08..e85d3bec5caf 100644
--- a/docs/reference/api/python/contrib.rst
+++ b/docs/reference/api/python/contrib.rst
@@ -104,11 +104,6 @@ tvm.contrib.rocm
 .. automodule:: tvm.contrib.rocm
     :members:
 
-tvm.contrib.sparse
-~~~~~~~~~~~~~~~~~~
-.. automodule:: tvm.contrib.sparse
-    :members:
-
 
 tvm.contrib.spirv
 ~~~~~~~~~~~~~~~~~
diff --git a/docs/reference/api/python/te.rst b/docs/reference/api/python/te.rst
index 83e0042db1b9..363dae675d84 100644
--- a/docs/reference/api/python/te.rst
+++ b/docs/reference/api/python/te.rst
@@ -23,11 +23,3 @@ tvm.te
    :members:
    :imported-members:
    :autosummary:
-
-
-tvm.te.hybrid
--------------
-.. automodule:: tvm.te.hybrid
-   :members:
-   :imported-members:
-   :autosummary:
diff --git a/golang/sample/deploy.py b/golang/sample/deploy.py
deleted file mode 100644
index fa4cbd433549..000000000000
--- a/golang/sample/deploy.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Get Started with TVM Go
-=======================
-"""
-from __future__ import absolute_import, print_function
-
-import tvm
-from tvm import te
-import numpy as np
-
-# Global declarations of environment.
-
-tgt = "llvm"
-
-######################################################################
-# Describe the Computation
-# ------------------------
-n = te.var("n")
-A = te.placeholder((n,), name="A")
-B = te.placeholder((n,), name="B")
-C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
-
-######################################################################
-# Schedule the Computation
-# ------------------------
-s = te.create_schedule(C.op)
-
-######################################################################
-# Compilation
-# -----------
-fadd = tvm.build(s, [A, B, C], tgt, name="myadd")
-
-######################################################################
-# Save Compiled Module
-# --------------------
-from tvm.contrib import cc
-from tvm.contrib import utils
-
-fadd.save("deploy.o")
-cc.create_shared("deploy.so", ["deploy.o"])
diff --git a/jvm/README.md b/jvm/README.md
index c7535f0311b4..62b685010c2e 100644
--- a/jvm/README.md
+++ b/jvm/README.md
@@ -89,35 +89,6 @@ It is your job to verify the types of callback arguments, as well as the type of
 
 You can register the Java function by `Function.register` and use `Function.getFunction` to get the registered function later.
 
-## Use TVM to Generate Shared Library
-
-There's nothing special for this part. The following Python snippet generate add_cpu.so which add two vectors on CPU.
-
-```python
-import os
-import tvm
-from tvm import te
-from tvm.contrib import cc, utils
-
-def test_add(target_dir):
-    n = te.var("n")
-    A = te.placeholder((n,), name='A')
-    B = te.placeholder((n,), name='B')
-    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
-    s = te.create_schedule(C.op)
-    fadd = tvm.build(s, [A, B, C], "llvm", name="myadd")
-
-    fadd.save(os.path.join(target_dir, "add_cpu.o"))
-    cc.create_shared(os.path.join(target_dir, "add_cpu.so"),
-            [os.path.join(target_dir, "add_cpu.o")])
-
-if __name__ == "__main__":
-    import sys
-    if len(sys.argv) != 2:
-        sys.exit(-1)
-    test_add(sys.argv[1])
-```
-
 ## Run the Generated Shared Library
 
 The following code snippet demonstrate how to load generated shared library (add_cpu.so).
diff --git a/jvm/core/src/test/scripts/test_add_cpu.py b/jvm/core/src/test/scripts/test_add_cpu.py
deleted file mode 100644
index 9a93d4e74694..000000000000
--- a/jvm/core/src/test/scripts/test_add_cpu.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import os
-
-import tvm
-from tvm import te
-from tvm.contrib import cc, utils
-
-
-def test_add(target_dir):
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
-    s = te.create_schedule(C.op)
-    fadd = tvm.build(s, [A, B, C], "llvm", name="myadd")
-
-    fadd.save(os.path.join(target_dir, "add_cpu.o"))
-    cc.create_shared(
-        os.path.join(target_dir, "add_cpu.so"), [os.path.join(target_dir, "add_cpu.o")]
-    )
-
-
-if __name__ == "__main__":
-    import sys
-
-    if len(sys.argv) != 2:
-        sys.exit(-1)
-    test_add(sys.argv[1])
diff --git a/jvm/core/src/test/scripts/test_add_gpu.py b/jvm/core/src/test/scripts/test_add_gpu.py
deleted file mode 100644
index 0eea5671baed..000000000000
--- a/jvm/core/src/test/scripts/test_add_gpu.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import os
-
-import tvm
-from tvm import te
-from tvm.contrib import cc, nvcc, utils
-
-
-@tvm.register_func("tvm_callback_cuda_compile", override=True)
-def tvm_callback_cuda_compile(code, target):
-    ptx = nvcc.compile_cuda(code, target_format="ptx")
-    return ptx
-
-
-def test_add(target_dir):
-    if not tvm.runtime.enabled("cuda"):
-        print("skip %s because cuda is not enabled..." % __file__)
-        return
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
-
-    s = te.create_schedule(C.op)
-
-    bx, tx = s[C].split(C.op.axis[0], factor=64)
-    s[C].bind(bx, te.thread_axis("blockIdx.x"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-    fadd_cuda = tvm.build(s, [A, B, C], tvm.target.Target("cuda", host="llvm"), name="myadd")
-
-    fadd_cuda.save(os.path.join(target_dir, "add_cuda.o"))
-    fadd_cuda.imported_modules[0].save(os.path.join(target_dir, "add_cuda.ptx"))
-    cc.create_shared(
-        os.path.join(target_dir, "add_cuda.so"), [os.path.join(target_dir, "add_cuda.o")]
-    )
-
-
-if __name__ == "__main__":
-    import sys
-
-    if len(sys.argv) != 2:
-        sys.exit(-1)
-    test_add(sys.argv[1])
diff --git a/python/tvm/contrib/peak.py b/python/tvm/contrib/peak.py
deleted file mode 100644
index 78dae846d6ca..000000000000
--- a/python/tvm/contrib/peak.py
+++ /dev/null
@@ -1,394 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""measure bandwidth and compute peak"""
-
-import logging
-import tvm
-from tvm import te
-from tvm.target import Target
-from . import utils
-from .. import rpc
-
-
-def _convert_to_remote(func, remote):
-    """convert module function to remote rpc function"""
-    temp = utils.tempdir()
-    path_dso = temp.relpath("tmp_func.tar")
-    func.export_library(path_dso)
-
-    remote.upload(path_dso)
-    func = remote.load_module("tmp_func.tar")
-    return func
-
-
-def measure_bandwidth_sum(
-    total_item,
-    item_per_thread,
-    stride,
-    base_type,
-    bits,
-    lanes,
-    target,
-    target_host,
-    remote,
-    dev,
-    n_times,
-):
-    """measure memory bandwidth of gpu by product reduction for a given type
-
-    The IR for measurement is
-
-    for each thread
-        for i in 1..num_per_thread:
-            y[global_id] = y[global_id] * x[base + i * stride]
-
-    Parameters
-    ----------
-    total_item: int
-        number of elements in input array
-    item_per_thread: int
-        number of elements each thread accumulates
-    stride: int
-        stride in memory access
-    base_type: str
-        can be "int", "float"
-    bits: int
-        can be 16, 32
-    lanes: int
-       lane of the vector type, can be 1, 2, 4, 8, 16
-    target: :any:`tvm.target.Target`
-        the target and option of the compilation.
-    target_host : str or :any:`tvm.target.Target`
-        host compilation target
-    dev: Device
-        the device of array
-    remote: tvm.rpc.RPCSession
-        remote rpc session
-    n_times: int
-        number of runs for taking mean
-
-    Returns
-    -------
-    GBPS: float
-         gigabyte per second
-    """
-    target, target_host = Target.canon_target_and_host(target, target_host)
-
-    n, m = total_item, item_per_thread
-    n //= lanes
-
-    base_type = str(base_type) + str(bits)
-    dtype = base_type if lanes == 1 else base_type + "x" + str(lanes)
-
-    k = te.reduce_axis((0, m), name="k")
-
-    x = te.placeholder((n,), dtype=dtype, name="x")
-    op = te.comm_reducer(lambda x, y: x * y, lambda t: tvm.tir.const(1, dtype=t), name="sum")
-    y = te.compute(
-        (n // m,), lambda i: op(x[i // stride * stride * m + i % stride + k * stride], axis=k)
-    )
-    s = te.create_schedule(y.op)
-
-    yo, yi = s[y].split(y.op.axis[0], target.max_num_threads)
-    s[y].bind(yo, te.thread_axis("blockIdx.x"))
-    s[y].bind(yi, te.thread_axis("threadIdx.x"))
-    s[y].unroll(k)
-
-    try:
-        func = tvm.build(s, [x, y], target)
-
-        x = tvm.nd.empty((n,), dtype=dtype, device=dev)
-        y = tvm.nd.empty((n // m,), dtype=dtype, device=dev)
-
-        func = _convert_to_remote(func, remote)
-        time_f = func.time_evaluator(func.entry_name, dev, number=n_times)
-        time = time_f(x, y).mean
-    except tvm._ffi.base.TVMError:
-        # build error (occur when device does not support half)
-        return -1
-
-    return 1.0 * (total_item * bits / 8) / 1e9 / time
-
-
-def measure_bandwidth_all_types(
-    total_item, item_per_thread, n_times, target, target_host, remote, dev, verbose=True
-):
-    """measure memory bandwidth for all types
-
-    Parameters
-    ----------
-    total_item: int
-        number of elements in input array
-    item_per_thread: int
-        number of elements each thread accmulates
-    n_times: int
-        number of runs for averaging
-    target: :any:`tvm.target.Target`
-        the target and option of the compilation.
-    target_host : str or :any:`tvm.target.Target`
-        host compilation target
-    remote: tvm.rpc.RPCSession
-        remote rpc session
-    dev: Device
-        the device of array
-    verbose: bool
-        whether outputs immediate result
-
-    Returns
-    -------
-    result: list
-        a list of (type_name, GBPS) pairs
-    """
-    target, target_host = Target.canon_target_and_host(target, target_host)
-    max_threads = target.max_num_threads
-
-    result = []
-    for base_type in ["float"]:
-        for bits in [32]:
-            for lanes in [1, 2, 4, 8, 16]:
-                max_speed = -1e9
-                # try different strides
-                for stride in [max_threads, total_item // (lanes * item_per_thread)]:
-                    speed = measure_bandwidth_sum(
-                        total_item,
-                        item_per_thread,
-                        stride,
-                        base_type,
-                        bits,
-                        lanes,
-                        target,
-                        target_host,
-                        remote,
-                        dev,
-                        n_times,
-                    )
-                    max_speed = max(max_speed, speed)
-                type_name = base_type + str(bits)
-                result.append([f"{type_name}x{lanes}", max_speed])
-                if verbose:
-                    logging.info("\t%-10s %.2f GBPS", result[-1][0], result[-1][1])
-    return result
-
-
-def measure_compute_mad(
-    total_item, item_per_thread, base_type, bits, lanes, target, target_host, remote, dev, n_times
-):
-    """measure peak compute speed by computing mad for a type
-
-    The IR for measurement is
-
-    for each thread
-        for i in 1..item_per_thread
-            x = mad(x, x, y)
-            y = mad(y, y, x)
-
-    Parameters
-    ----------
-    total_item: int
-        number of elements in input array
-    item_per_thread: int
-        number of operations each thread does
-    base_type: str
-        can be "int", "float"
-    bits: int
-        can be 16, 32
-    lanes: int
-       lane of the vector type, can be 1, 2, 4, 8, 16
-    target: :any:`tvm.target.Target`
-        the target and option of the compilation.
-    target_host : str or :any:`tvm.target.Target`
-        host compilation target
-    remote: tvm.rpc.RPCSession
-        if it is not None, use remote rpc session
-    dev: Device
-        the device of array
-    n_times: int
-        number of runs for taking mean
-
-    Returns
-    -------
-    GOPS: float
-         giga operation per second
-    """
-    target, target_host = Target.canon_target_and_host(target, target_host)
-
-    n = total_item
-
-    if bits >= 64 or lanes >= 16:
-        n //= 2
-
-    max_threads = target.max_num_threads
-
-    base_type = str(base_type) + str(bits)
-    dtype = base_type if lanes == 1 else base_type + "x" + str(lanes)
-
-    def extern(ins, outs):
-        # pylint: disable=unused-argument
-        """construct measurement function by building IR directly"""
-        ib = tvm.tir.ir_builder.create()
-
-        bx = te.thread_axis("blockIdx.x")
-        tx = te.thread_axis("threadIdx.x")
-
-        ib.scope_attr(bx, "thread_extent", n // max_threads)
-        ib.scope_attr(tx, "thread_extent", max_threads)
-
-        idx = bx.var * max_threads + tx.var
-
-        a = ib.allocate(dtype, (1), name="a", scope="local")
-        b = ib.allocate(dtype, (1), name="b", scope="local")
-
-        a[0] = outs[0].vload(idx, dtype)
-        b[0] = outs[0].vload(idx, dtype)
-
-        if base_type.find("float") != -1:
-
-            def mad_func(x, y):
-                return x * x + y
-
-        else:
-
-            def mad_func(x, y):
-                return y * y + x
-
-        for _ in range(item_per_thread // 4 // lanes):
-            a[0] = mad_func(a[0], b[0])
-            b[0] = mad_func(b[0], a[0])
-
-        ib.emit(outs[0].vstore(idx, b[0]))
-        return ib.get()
-
-    y = te.extern((n,), [], extern, name="y", dtype=dtype)
-    s = te.create_schedule(y.op)
-
-    try:
-        func = tvm.build(s, [y], target)
-        func = _convert_to_remote(func, remote)
-        time_f = func.time_evaluator(func.entry_name, dev, number=n_times)
-        y = tvm.nd.empty((n,), dtype=dtype, device=dev)
-        time = time_f(y).mean
-    except tvm._ffi.base.TVMError:
-        # build error (occur when device does not support half)
-        return -1
-
-    return 1.0 * (n * item_per_thread) / 1e9 / time
-
-
-def measure_compute_all_types(
-    total_item, item_per_thread, n_times, target, target_host, remote, dev, verbose=True
-):
-    """measure peak flops for all types
-
-    Parameters
-    ----------
-    total_item: int
-        number of elements in input array
-    item_per_thread: int
-        number of elements each thread accmulates
-    n_times: int
-        number of runs for averaging
-    target: :any:`tvm.target.Target`
-        the target and option of the compilation.
-    target_host : str or :any:`tvm.target.Target`
-        host compilation target
-    remote: tvm.rpc.RPCSession
-        remote rpc session
-    dev: Device
-        the device of array
-    verbose: bool
-        whether outputs immediate result
-
-    Returns
-    -------
-    result: list
-        a list of (type_name, GFLOPS/GIOPS) pairs
-    """
-    target, target_host = Target.canon_target_and_host(target, target_host)
-
-    result = []
-    for base_type in ["float", "int"]:
-        for bits in [16, 32, 64]:
-            for lanes in [1, 2, 4, 8, 16]:
-                if base_type == "int" and bits != 32:  # only measure int32
-                    continue
-
-                max_speed = -1e9
-                for per_thread in [item_per_thread // 2, item_per_thread, item_per_thread * 2]:
-                    speed = measure_compute_mad(
-                        total_item,
-                        per_thread,
-                        base_type,
-                        bits,
-                        lanes,
-                        target,
-                        target_host,
-                        remote,
-                        dev,
-                        n_times,
-                    )
-                    max_speed = max(max_speed, speed)
-                type_name = base_type + str(bits)
-                result.append([f"{type_name}x{lanes}", max_speed])
-
-                unit = "GFLOPS" if base_type == "float" else "GIOPS"
-
-                if verbose:
-                    logging.info("\t%-10s %.2f %s", result[-1][0], result[-1][1], unit)
-
-    return result
-
-
-def measure_peak_all(target, target_host, host, port):
-    """measure memory bandwidth and peak compute for gpu devices
-
-    Parameters
-    ----------
-    target: str or :any:`tvm.target.Target`
-    target_host: str
-    host: str
-    port: int
-    """
-
-    target, target_host = Target.canon_target_and_host(target, target_host)
-    remote = rpc.connect(host, port)
-    n_times = 20
-
-    bandwidth_total_item = 1 << 25
-    bandwidth_item_per_thread = 32
-
-    compute_total_item = 1 << 21
-    compute_item_per_thread = 4096
-
-    if str(target).startswith("opencl"):
-        dev = remote.cl()
-    elif str(target).startswith("cuda"):
-        dev = remote.cuda()
-    elif str(target).startswith("metal"):
-        dev = remote.metal()
-    else:
-        raise RuntimeError("Unsupported target")
-
-    logging.info("========== measure memory bandwidth ==========")
-    measure_bandwidth_all_types(
-        bandwidth_total_item, bandwidth_item_per_thread, n_times, target, target_host, remote, dev
-    )
-
-    logging.info("========== measure peak compute ==========")
-    measure_compute_all_types(
-        compute_total_item, compute_item_per_thread, n_times, target, target_host, remote, dev
-    )
diff --git a/python/tvm/contrib/sparse.py b/python/tvm/contrib/sparse.py
deleted file mode 100644
index 9f94ff24f906..000000000000
--- a/python/tvm/contrib/sparse.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Tensor and Operation class for computation declaration."""
-# pylint: disable=invalid-name
-import warnings
-import numpy as _np
-from tvm.runtime import ndarray as _nd
-from tvm import te
-from tvm.tir import expr as _expr
-from tvm.te import tensor as _tensor
-
-
-float32 = "float32"
-itype = "int32"
-
-
-class CSRNDArray(object):
-    """Sparse tensor object in CSR format."""
-
-    def __init__(self, arg1, device=None, shape=None):
-        """Construct a sparse matrix in CSR format.
-
-        Parameters
-        ----------
-        arg1 : numpy.ndarray or a tuple with (data, indices, indptr)
-            The corresponding a dense numpy array,
-            or a tuple for constructing a sparse matrix directly.
-
-        device: Device
-            The corresponding device.
-
-        shape : tuple of int
-            The shape of the array
-        """
-        if isinstance(arg1, tuple):
-            assert len(arg1) == 3
-            self.data, self.indices, self.indptr = arg1
-            self.shape = shape
-        elif isinstance(arg1, _np.ndarray):
-            source_array = arg1
-            ridx, cidx = _np.nonzero(source_array)
-            data = source_array[ridx, cidx]
-            self.data = _nd.array(data, device)
-            indices = _np.nonzero(source_array)[1].astype(itype)
-            self.indices = _nd.array(indices, device)
-            indptr = [0] + _np.apply_along_axis(
-                _np.count_nonzero, axis=1, arr=source_array
-            ).tolist()
-            indptr = _np.cumsum(_np.array(indptr, itype)).astype(itype)
-            self.indptr = _nd.array(indptr, device)
-            self.shape = source_array.shape
-        else:
-            raise RuntimeError(
-                f"Construct CSRNDArray with either a tuple (data, indices, indptr) "
-                f"or a numpy.array, can't handle type {type(arg1)}."
-            )
-        self.stype = "csr"
-        self.dtype = self.data.dtype
-        assert self.shape is not None
-        assert isinstance(self.data, _nd.NDArray)
-        assert isinstance(self.indices, _nd.NDArray)
-        assert str(self.indices.dtype) == "int32" or str(self.indices.dtype) == "int64", str(
-            self.indices.dtype
-        )
-        assert isinstance(self.indptr, _nd.NDArray)
-        assert str(self.indptr.dtype) == "int32" or str(self.indptr.dtype) == "int64", str(
-            self.indptr.dtype
-        )
-
-    def asnumpy(self):
-        """Construct a full matrix and convert it to numpy array. This API will be deprecated
-        in TVM v0.8 release. Please use `numpy` instead."""
-        warnings.warn(
-            "CSRNDArray.asnumpy() will be deprecated in TVM v0.8 release. "
-            "Please use CSRNDArray.numpy() instead.",
-            DeprecationWarning,
-        )
-        return self.numpy()
-
-    def numpy(self):
-        """Construct a full matrix and convert it to numpy array."""
-        full = _np.zeros(self.shape, self.dtype)
-        ridx = _np.diff(self.indptr.numpy())
-        ridx = _np.hstack([_np.ones((v,), itype) * i for i, v in enumerate(ridx)])
-        full[ridx, self.indices.numpy().astype(itype)] = self.data.numpy()
-        return full
-
-
-def array(source_array, device=None, shape=None, stype="csr"):
-    """Construct a sparse NDArray from numpy.ndarray"""
-    ret = None
-    if stype == "csr":
-        ret = CSRNDArray(source_array, shape=shape, device=device)
-    else:
-        raise NotImplementedError(f"stype={stype} is not supported yet.")
-    return ret
-
-
-class SparsePlaceholderOp(object):
-    """Placeholder class for sparse tensor representations."""
-
-    def __init__(self, shape, nonzeros, dtype, name):
-        # pylint: disable=unused-argument
-        """Contructing a bare bone structure for a sparse matrix
-
-        Parameters
-        ----------
-        shape: Tuple of Expr
-            The shape of the tensor
-
-        nonzeros: int
-            The number of non-zero values
-
-        dtype: str, optional
-            The data type of the tensor
-
-        name: str, optional
-            The name hint of the tensor
-        """
-        self.shape = shape
-        self.dtype = dtype
-        self.name = name
-        self.stype = "unknown"
-
-
-class CSRPlaceholderOp(SparsePlaceholderOp):
-    """Placeholder class for CSR based sparse tensor representation."""
-
-    def __init__(self, shape, nonzeros, dtype, name):
-        """Contructing a bare bone structure for a csr_matrix
-
-        Parameters
-        ----------
-        shape: Tuple of Expr
-            The shape of the tensor
-
-        nonzeros: int
-            The number of non-zero values
-
-        dtype: str, optional
-            The data type of the tensor
-
-        name: str, optional
-            The name hint of the tensor
-        """
-        SparsePlaceholderOp.__init__(self, shape, nonzeros, dtype, name)
-        self.stype = "csr"
-        self.data = te.placeholder((nonzeros,), dtype=dtype, name=self.name + "_data")
-        self.indices = te.placeholder((nonzeros,), dtype=itype, name=self.name + "_indices")
-        self.indptr = te.placeholder((self.shape[0] + 1,), dtype=itype, name=self.name + "_indptr")
-        assert isinstance(self.data, _tensor.Tensor)
-        assert isinstance(self.indices, _tensor.Tensor)
-        assert isinstance(self.indptr, _tensor.Tensor)
-
-
-def placeholder(shape, nonzeros=None, dtype=None, name="placeholder", stype=None):
-    """Construct an empty sparse tensor object.
-
-    Parameters
-    ----------
-    shape: Tuple of Expr
-        The shape of the tensor
-
-    nonzeros: int
-        The number of non-zero values
-
-    dtype: str, optional
-        The data type of the tensor
-
-    name: str, optional
-        The name hint of the tensor
-
-    stype: str, optional
-        The name storage type of the sparse tensor (e.g. csr, coo, ell)
-
-    Returns
-    -------
-    tensor: SparsePlaceholderOp
-        The created sparse tensor placeholder
-    """
-    shape = (shape,) if isinstance(shape, _expr.PrimExpr) else shape
-    nonzeros = 0 if nonzeros is None else nonzeros
-    dtype = float32 if dtype is None else dtype
-    stype = "csr" if stype is None else stype
-    ret = None
-    if stype == "csr":
-        ret = CSRPlaceholderOp(shape=shape, nonzeros=nonzeros, dtype=dtype, name=name)
-    else:
-        raise NotImplementedError(f"stype={stype} is not supported yet.")
-    return ret
diff --git a/python/tvm/contrib/tedd.py b/python/tvm/contrib/tedd.py
deleted file mode 100644
index 680297729789..000000000000
--- a/python/tvm/contrib/tedd.py
+++ /dev/null
@@ -1,798 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=import-outside-toplevel, nested-min-max
-"""Tensor Expression Debug Display (TEDD), visualizing Tensor Expression"""
-import html
-import json
-import warnings
-from graphviz import Digraph
-from graphviz import Source
-import tvm
-
-TVMDD_TABLE_BODY_WIDTH = 30
-# Must match enum IterVarType defined in include/tvm/expr.h
-ITERVAR_TYPE_STRING_MAP = {
-    0: ("kDataPar", "#FFFFFF"),
-    1: ("kThreadIndex", "#2980B9"),
-    2: ("kCommReduce", "#FAD7A0"),
-    3: ("kOrdered", "#D35400"),
-    4: ("kOpaque", "#ABB2B9"),
-    5: ("kUnrolled", "#D2B4DE"),
-    6: ("kVectorized", "#AED6F1"),
-    7: ("kParallelized", "#F5B7B1"),
-    8: ("kTensorized", "#A9DFBF"),
-}
-
-PALETTE = {
-    0: "#000000",
-    1: "#922B21",
-    2: "#76448A",
-    3: "#1F618D",
-    4: "#148F77",
-    5: "#B7950B",
-    6: "#AF601A",
-    7: "#F5B7B1",
-    8: "#A9DFBF",
-}
-
-PALETTE_SIZE = 9
-
-
-def dom_path_to_string(dom_path, prefix=""):
-    path_string = prefix
-    for index in dom_path:
-        path_string = path_string + "_" + str(index)
-    return path_string
-
-
-def insert_dot_id(sch):
-    """Insert unique ID for each node in the DOM tree.
-    They are used as Dot node ID.
-    """
-    for stage_idx, stage in enumerate(sch["stages"]):
-        dom_path = [stage_idx]
-        stage["id"] = dom_path_to_string(dom_path, stage["type"])
-        for itervar_idx, itervar in enumerate(stage["all_itervars"]):
-            dom_path = [stage_idx, itervar_idx]
-            itervar["id"] = dom_path_to_string(dom_path, itervar["type"])
-        for rel_idx, rel in enumerate(stage["relations"]):
-            dom_path = [stage_idx, rel_idx]
-            rel["id"] = dom_path_to_string(dom_path, rel["type"])
-        for tensor_idx, tensor in enumerate(stage["output_tensors"]):
-            dom_path = [stage_idx, tensor_idx]
-            tensor["id"] = dom_path_to_string(dom_path, tensor["type"])
-    return sch
-
-
-def itervar_equal(iv_a, iv_b):
-    """A helper method that compares the equality of two iterative variables"""
-    # Adopt the following method to assure the equality between two itervars.
-    # The plain comparison might fail (i.e. iv_a == iv_b) after the change of
-    # domain bounds from InferBound.
-    def _var_equal(v_a, v_b):
-        condtions = [
-            v_a.name == v_b.name,
-            v_a.dtype == v_b.dtype,
-            v_a.type_annotation == v_b.type_annotation,
-        ]
-        return all(c for c in condtions)
-
-    condtions = [
-        _var_equal(iv_a.var, iv_b.var),
-        iv_a.iter_type == iv_b.iter_type,
-        iv_a.thread_tag == iv_b.thread_tag,
-    ]
-    return all(c for c in condtions)
-
-
-class ObjectManager:
-    """A helper class tracking schedule objects, e.g. stage, IterVar,
-    relationship, and tensor, to their DOM path."""
-
-    def __init__(self, sch):
-        self.dict = {}
-        for stage_idx, stage in enumerate(sch.stages):
-            self.dict[stage] = [stage_idx]
-            for itervar_idx, itervar in enumerate(stage.all_iter_vars):
-                self.dict[itervar] = [stage_idx, itervar_idx]
-                # the itervars of leaf should also be mapped to the original one
-                for leaf_iv in stage.leaf_iter_vars:
-                    if itervar_equal(leaf_iv, itervar):
-                        self.dict[leaf_iv] = [stage_idx, itervar_idx]
-            for rel_idx, rel in enumerate(stage.relations):
-                self.dict[rel] = [stage_idx, rel_idx]
-            for tensor_idx in range(stage.op.num_outputs):
-                self.dict[frozenset({stage.op.name, tensor_idx})] = [stage_idx, tensor_idx]
-
-    def get_dom_path(self, obj):
-        if obj is None:
-            return None
-        assert obj in self.dict, "Node is no found."
-        return self.dict[obj]
-
-
-def get_or_create_dot_id(obj, prefix="", assert_on_missing=False):
-    """If obj's ID has been registered, return it.
-    If not, either assert or create a unique and legal ID, register and
-    return it, according to assert_on_missing.
-    ID must be a unique and legal Dotty ID.
-
-     Parameters
-     ----------
-     obj : objet
-                 Serve as the key to the ID.
-
-     prefix : string
-                 Prefix to attach to the ID.  Usually use obj's non-unique
-                 name as prefix.
-
-     assert_on_missing : bool
-                 Assert or not if object doesn't have a registered ID.
-    """
-    prefix = prefix.replace(".", "_")
-    if not hasattr(get_or_create_dot_id, "obj_id_dict"):
-        get_or_create_dot_id.obj_id_dict = {}
-    if obj not in get_or_create_dot_id.obj_id_dict:
-        if assert_on_missing:
-            assert False, "dot_id " + str(obj) + " has not been registered."
-        else:
-            get_or_create_dot_id.obj_id_dict[obj] = prefix + hex(id(obj))
-    return get_or_create_dot_id.obj_id_dict[obj]
-
-
-def get_port_id(is_input, index):
-    return "I_" + str(index) if is_input else "O_" + str(index)
-
-
-def get_itervar_type_info(iter_type):
-    assert iter_type < len(ITERVAR_TYPE_STRING_MAP), "Unknown IterVar type: " + str(iter_type)
-    return ITERVAR_TYPE_STRING_MAP[iter_type]
-
-
-def get_itervar_label_color(itervar, iv_type):
-    type_info = get_itervar_type_info(iv_type)
-    return (
-        linebrk(str(itervar["name"]) + "(" + type_info[0] + ")", TVMDD_TABLE_BODY_WIDTH),
-        type_info[1],
-    )
-
-
-def linebrk(s, n):
-    """Break input string s with <br/> for every n charactors."""
-    result = ""
-    j = 0
-    for i, c in enumerate(s):
-        if j == n and i != len(s) - 1:
-            result = result + "\n"
-            j = 0
-        j = j + 1
-        result = result + c
-    result = html.escape(str(result), quote=True)
-    result = result.replace("\n", "<br/>")
-    return result
-
-
-def create_graph(name="", rankdir="BT"):
-    graph = Digraph(name=name)
-    graph.graph_attr["rankdir"] = rankdir
-    return graph
-
-
-def itervar_label(itervar, index, index_color, label):
-    return (
-        '<TR><TD PORT="'
-        + itervar["id"]
-        + '" BGCOLOR="'
-        + index_color
-        + '">'
-        + str(index)
-        + '</TD><TD BGCOLOR="white" PORT="itervar">'
-        + label
-        + "<br/>"
-        + str(itervar["properties"]["range"])
-        + "</TD></TR>"
-    )
-
-
-def stage_label(stage):
-    return stage["name"] + "<br/>Scope: " + stage["properties"]["scope"]
-
-
-def legend_label():
-    """Generate legend labels."""
-    label = '<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0" CELLPADDING="4">'
-    for iter_type in ITERVAR_TYPE_STRING_MAP:
-        name, color = ITERVAR_TYPE_STRING_MAP[iter_type]
-        label += (
-            '<TR><TD BGCOLOR="' + color + '"></TD>' + '<TD BGCOLOR="white">' + name + "</TD></TR>"
-        )
-    label += "</TABLE>>"
-    return label
-
-
-def leaf_itervars(stage):
-    filtered = filter(lambda x: (x["index"] >= 0), stage["all_itervars"])
-    return sorted(filtered, key=lambda x: x["index"])
-
-
-def legend_dot(g):
-    with g.subgraph(name="cluster_legend") as subgraph:
-        subgraph.attr(label="Legend")
-        label = legend_label()
-        subgraph.node("legend", label, shape="none", margin="0")
-
-
-def extract_dom_for_viz(sch, need_range=True):
-    json_str = dump_json(sch, need_range)
-    s = json.loads(json_str)
-    s = insert_dot_id(s)
-    return s
-
-
-def dump_graph(dot_string, show_svg=True, dot_file_path="", output_dot_string=False):
-    """Output dot_string in various formats."""
-    if dot_file_path:
-        try:
-            dot_file = open(dot_file_path, "w+")
-            dot_file.write(dot_string)
-            dot_file.close()
-        except IOError:
-            print("Cannot open file: " + dot_file_path)
-    if show_svg:
-        from IPython.display import display
-        from IPython.display import SVG
-
-        src = Source(dot_string)
-        display(SVG(src.pipe(format="svg")))
-    if output_dot_string:
-        return dot_string
-    return None
-
-
-def dump_json(sch, need_range):
-    """Serialize data for visualization from a schedule in JSON format.
-
-    Parameters
-    ----------
-    sch : schedule
-                The schedule object to serialize
-
-    Returns
-    -------
-    json : string
-        Serialized JSON string
-    """
-
-    def encode_itervar(itervar, stage, index, range_map):
-        """Extract and encode IterVar visualization data to a dictionary"""
-        ivrange = range_map[itervar] if range_map is not None and itervar in range_map else None
-        bind_thread = None
-        tensor_intrin = None
-        if itervar in stage.iter_var_attrs:
-            attr = stage.iter_var_attrs[itervar]
-            iv_type = attr.iter_type
-            # binding
-            bind_thread = str(attr.bind_thread.var) if attr.bind_thread is not None else None
-            # tensorization
-            if attr.tensor_intrin is not None:
-                tensor_intrin = str(attr.tensor_intrin.body)
-                # remove the final \n
-                tensor_intrin = tensor_intrin[0:-1] if tensor_intrin[-1] == "\n" else tensor_intrin
-            else:
-                tensor_intrin = None
-        else:
-            iv_type = itervar.iter_type
-        itervar_dict = {
-            "type": "IterVar",
-            "index": index,
-            "name": str(itervar.var),
-            "itervar_type": iv_type,
-            "properties": {
-                "thread": bind_thread,
-                "intrin": tensor_intrin,
-                "range": str(ivrange) if ivrange is not None else "range(N/A)",
-            },
-        }
-        return itervar_dict
-
-    def encode_itervars(stage, range_map):
-        """Extract and encode IterVars visualization data from a stage to a dictionary"""
-
-        def get_leaf_itervar_index(itervar, leaf_iv):
-            for leaf_index, ivar in enumerate(leaf_iv):
-                if itervar_equal(ivar, itervar):
-                    return leaf_index
-            return -1
-
-        itervars = []
-        for itervar in stage.all_iter_vars:
-            leaf_index = get_leaf_itervar_index(itervar, stage.leaf_iter_vars)
-            itervars.append(encode_itervar(itervar, stage, leaf_index, range_map))
-        return itervars
-
-    def encode_itervar_relation(obj_manager, rel):
-        """Extract and encode IterVar Relationship visualization data to a dictionary"""
-        rel_type = type(rel)
-        if rel_type is tvm.te.schedule.Split:
-            node_type = "Split_Relation"
-            rel_dict = {
-                "type": node_type,
-                "parent": obj_manager.get_dom_path(rel.parent),
-                "outer": obj_manager.get_dom_path(rel.outer),
-                "inner": obj_manager.get_dom_path(rel.inner),
-            }
-        elif rel_type is tvm.te.schedule.Fuse:
-            node_type = "Fuse_Relation"
-            rel_dict = {
-                "type": node_type,
-                "fused": obj_manager.get_dom_path(rel.fused),
-                "outer": obj_manager.get_dom_path(rel.outer),
-                "inner": obj_manager.get_dom_path(rel.inner),
-            }
-        elif rel_type is tvm.te.schedule.Singleton:
-            node_type = "Singleton_Relation"
-            rel_dict = {
-                "type": node_type,
-                "iter": obj_manager.get_dom_path(rel.iter),
-            }
-        else:
-            return None
-        return rel_dict
-
-    def encode_itervar_relations(obj_manager, stage):
-        relations = []
-        for i in range(len(stage.relations)):
-            rel = encode_itervar_relation(obj_manager, stage.relations[i])
-            if rel is not None:
-                relations.append(rel)
-        return relations
-
-    def encode_tensor(obj_manager, tensor, stage):
-        """Extract and encode tensor visualization data to a dictionary"""
-        tensor_dict = {
-            "type": "Tensor",
-            "source": obj_manager.get_dom_path(stage),
-            "value_index": tensor.value_index,
-            "shape": str(tensor.op.output(tensor.value_index).shape),
-            "data_type": tensor.op.output(tensor.value_index).dtype,
-        }
-        return tensor_dict
-
-    def encode_tensors(obj_manager, stage):
-        tensors = []
-        for i in range(stage.op.num_outputs):
-            tensor = stage.op.output(i)
-            tensors.append(encode_tensor(obj_manager, tensor, stage))
-        tensors.sort(key=lambda tensor: tensor["value_index"])
-        return tensors
-
-    def encode_stage(obj_manager, stage, range_map):
-        """Extract and encode stage visualization data to a dictionary"""
-        stage_dict = {
-            "type": "Stage",
-            "name": stage.op.name,
-            "attaching_to": obj_manager.get_dom_path(stage.attach_ivar),
-            "compute": str(stage.op.body) if hasattr(stage.op, "body") else None,
-            "properties": {
-                "scope": stage.scope,
-            },
-            "all_itervars": encode_itervars(stage, range_map),
-            "relations": encode_itervar_relations(obj_manager, stage),
-            "input_tensors": [
-                obj_manager.get_dom_path(frozenset({tensor.op.name, tensor.value_index}))
-                for tensor in stage.op.input_tensors
-            ],
-            "output_tensors": encode_tensors(obj_manager, stage),
-        }
-        return stage_dict
-
-    def encode_schedule(sch, need_range):
-        """Extract and encode data from a schedule for visualization to a nested dictionary.
-        It is useful for JSON to serialize schedule.
-
-            Parameters
-            ----------
-            sch : schedule
-                        The schedule object to extract
-
-            Returns
-            -------
-            dict : dictionary
-                A nested dictionary
-        """
-        assert isinstance(
-            sch, tvm.te.schedule.Schedule
-        ), "Input is not a tvm.te.schedule.Schedule object."
-        range_map = None
-        if need_range:
-            try:
-                range_map = tvm.te.schedule.InferBound(sch)
-            except tvm._ffi.base.TVMError as expt:
-                warnings.warn(
-                    "Ranges are not available, because InferBound fails with the following error:\n"
-                    + str(expt)
-                )
-
-        obj_manager = ObjectManager(sch)
-        stages = []
-        for stage in sch.stages:
-            stages.append(encode_stage(obj_manager, stage, range_map))
-        return {
-            "type": "Schedule",
-            "stages": stages,
-        }
-
-    return json.dumps(sch, default=lambda s: encode_schedule(s, need_range))
-
-
-def viz_schedule_tree(sch, show_svg=False, dot_file_path="", output_dot_string=False):
-    """Top level API to render schedule tree
-
-    Parameters
-    ----------
-    sch : schedule
-                The schedule object to visualize
-
-    show_svg : bool
-                Display graph as SVG, useful for Jupyter notebooks.
-
-    dot_file_path : string
-                Dot file to save the graph.
-
-    output_dot_string : bool
-                Return dot file content or an empty string.
-
-    Returns
-    -------
-    dot_string : string
-        Dot file content or an empty string according to output_dot_string
-
-    Examples
-    --------
-    The following code writes a schedule tree to a dot file.
-
-    .. code-block:: python
-        tedd.viz_schedule_tree(s, dot_file_path = '/tmp/example.dot')
-
-    Use the following code to render a SVG graph in a Jupyter notebook.
-
-    .. code-block:: python
-        tedd.viz_schedule_tree(s, show_svg = True)
-    """
-
-    def create_schedule_tree_graph(name=""):
-        return create_graph(name=name, rankdir="BT")
-
-    def root_dot(g):
-        g.node("ROOT", "ROOT", shape="oval", margin="0")
-
-    def stage_node_dot(g, stage):
-        node_label = stage_node_label(stage)
-        g.node(stage["id"], node_label, shape="none", margin="0")
-
-    def stage_node_label(stage):
-        """Return a html format label for the given stage."""
-        label = (
-            '<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0" '
-            'CELLPADDING="4"> <TR><TD BGCOLOR="lightgrey" '
-            'COLSPAN="2" PORT="stage">' + stage_label(stage) + "</TD></TR>"
-        )
-
-        for leafiv in leaf_itervars(stage):
-            iv_type = leafiv["itervar_type"]
-            var_attr_label = ""
-            if "thread" in leafiv["properties"] and leafiv["properties"]["thread"] is not None:
-                var_attr_label = (
-                    var_attr_label
-                    + '<br/><font color="#2980B9">('
-                    + str(leafiv["properties"]["thread"])
-                    + ")</font>"
-                )
-            if "intrin" in leafiv["properties"] and leafiv["properties"]["intrin"] is not None:
-                var_attr_label = (
-                    var_attr_label
-                    + "<br/>"
-                    + linebrk(
-                        "(tensor_intrin:" + str(leafiv["properties"]["intrin"]) + ")",
-                        TVMDD_TABLE_BODY_WIDTH,
-                    )
-                )
-            var_label, color = get_itervar_label_color(leafiv, iv_type)
-            label += itervar_label(leafiv, leafiv["index"], color, var_label + var_attr_label)
-        if stage["compute"] is not None:
-            label += (
-                '<TR><TD COLSPAN="2">'
-                + linebrk(str(stage["compute"]), TVMDD_TABLE_BODY_WIDTH)
-                + "</TD></TR>"
-            )
-        label += "</TABLE>>"
-        return label
-
-    def compute_at_dot(g, stage):
-        """If the given stage attaches to another stage, create an edge from it
-        stage to its attach point; otherwise, create an edge to the ROOT.
-        """
-        src = stage["id"]
-        dst = (
-            dom_path_to_string([stage["attaching_to"][0]], "Stage")
-            + ":"
-            + dom_path_to_string(stage["attaching_to"], "IterVar")
-            if stage["attaching_to"] is not None
-            else "ROOT"
-        )
-        color = (
-            PALETTE[stage["attaching_to"][1] + 1]
-            if stage["attaching_to"] is not None and stage["attaching_to"][1] < PALETTE_SIZE - 1
-            else PALETTE[0]
-        )
-        g.edge(src, dst, color=color)
-
-    graph = create_schedule_tree_graph("Schedule Tree")
-    s = extract_dom_for_viz(sch)
-    legend_dot(graph)
-    for stage in s["stages"]:
-        stage_node_dot(graph, stage)
-    for stage in s["stages"]:
-        compute_at_dot(graph, stage)
-    root_dot(graph)
-    return dump_graph(graph.source, show_svg, dot_file_path, output_dot_string)
-
-
-def viz_itervar_relationship_graph(sch, show_svg=False, dot_file_path="", output_dot_string=False):
-    """Top level API to render IterVar relationship graph
-
-    Parameters
-    ----------
-    sch : schedule
-                The schedule object to visualize
-
-    show_svg : bool
-                Display graph as SVG, useful for Jupyter notebooks.
-
-    dot_file_path : string
-                Dot file to save the graph.
-
-    output_dot_string : bool
-                Return dot file content or an empty string.
-
-    Examples
-    --------
-    The following code writes Ian tervar relationship graph to a dot file.
-
-    .. code-block:: python
-        tedd.viz_def viz_itervar_relationship_graph(sch,
-            (s, dot_file_path = '/tmp/example.dot')
-
-    Use the following code to render a SVG graph in a Jupyter notebook.
-
-    .. code-block:: python
-        tedd.viz_def viz_itervar_relationship_graph(sch,
-            (s, show_svg = True)
-    """
-
-    def create_itervar_relation_graph(name=""):
-        return create_graph(name=name, rankdir="TB")
-
-    def itervar_node_dot(g, itervar, iv_type, index):
-        label = itervar_node_label(itervar, iv_type, index)
-        g.node(itervar["id"], label, shape="none", margin="0")
-
-    def itervar_node_label(itervar, iv_type, index):
-        label = (
-            '<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0" '
-            'CELLPADDING="4">'
-            + itervar_label(
-                itervar,
-                index,
-                get_itervar_label_color(itervar, iv_type)[1],
-                get_itervar_label_color(itervar, iv_type)[0],
-            )
-            + "</TABLE>>"
-        )
-        return label
-
-    def itervar_relation_node_dot(g, node_id, node_label, input_ports, output_ports):
-        label = itervar_relation_node_label(node_label, input_ports, output_ports)
-        g.node(node_id, label, shape="none", margin="0")
-
-    def itervar_relation_node_label(node_label, input_ports, output_ports):
-        """Return a html format label for an itervar relationship node
-        including node_label and input/output ports.
-        """
-        label = '<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0" ' 'CELLPADDING="4">' + "<TR>"
-        max_port_num = max(len(input_ports), len(output_ports))
-        for i in range(max_port_num):
-            if i < len(input_ports):
-                input_port = input_ports[i]
-                label += '<TD BGCOLOR="lightgrey" PORT="' + input_port + '">' + input_port + "</TD>"
-            else:
-                label += '<TD BGCOLOR="white"></TD>'
-        label += "</TR>"
-        label += (
-            '<TR><TD BGCOLOR="white" COLSPAN="'
-            + str(max_port_num)
-            + '" PORT="relation">'
-            + node_label
-            + "</TD></TR>"
-        )
-        label += "<TR>"
-        for i in range(max_port_num):
-            if i < len(output_ports):
-                output_port = output_ports[i]
-                label += (
-                    '<TD BGCOLOR="lightgrey" PORT="' + output_port + '">' + output_port + "</TD>"
-                )
-            else:
-                label += '<TD BGCOLOR="white"></TD>'
-        label += "</TR>"
-        label += "</TABLE>>"
-        return label
-
-    def itervar_relation_dot(g, node, node_id):
-        """Create an itervar relationship node."""
-        node_type = node["type"]
-        if node_type == "Split_Relation":
-            node_type = "Split"
-            itervar_relation_node_dot(g, node_id, node_type, ["Input"], ["Outer", "Inner"])
-            parent = dom_path_to_string(node["parent"], "IterVar")
-            outer = dom_path_to_string(node["outer"], "IterVar")
-            inner = dom_path_to_string(node["inner"], "IterVar")
-            g.edge(parent + ":itervar", node_id + ":Input")
-            g.edge(node_id + ":Outer", outer + ":itervar")
-            g.edge(node_id + ":Inner", inner + ":itervar")
-        elif node_type == "Fuse_Relation":
-            node_type = "Fuse"
-            itervar_relation_node_dot(g, node_id, node_type, ["Outer", "Inner"], ["Fused"])
-            fused = dom_path_to_string(node["fused"], "IterVar")
-            outer = dom_path_to_string(node["outer"], "IterVar")
-            inner = dom_path_to_string(node["inner"], "IterVar")
-            g.edge(outer + ":itervar", node_id + ":Outer")
-            g.edge(inner + ":itervar", node_id + ":Inner")
-            g.edge(node_id + ":Fused", fused + ":itervar")
-        elif node_type == "Singleton_Relation":
-            node_type = "Singleton"
-            itervar_relation_node_dot(g, node_id, node_type, [], ["Iter"])
-            itervar = dom_path_to_string(node["inner"], "IterVar")
-            g.edge(node_id + ":Iter", itervar + ":itervar")
-        else:
-            assert False, "Unknown IterVarRelationNode: " + node_type
-
-    def stage_node_dot(g, stage):
-        """Create a stage node."""
-        with g.subgraph(name="cluster_" + stage["id"]) as subgraph:
-            subgraph.attr(label=stage["name"])
-            if stage["all_itervars"]:
-                for itervar in stage["all_itervars"]:
-                    iv_type = itervar["itervar_type"]
-                    itervar_node_dot(subgraph, itervar, iv_type, itervar["index"])
-                for rel in stage["relations"]:
-                    node_id = rel["id"]
-                    itervar_relation_dot(subgraph, rel, node_id)
-            else:
-                subgraph.node(stage["name"] + "_placeholder", style="invis")
-
-    graph = create_itervar_relation_graph("IterVar Relationship Graph")
-    s = extract_dom_for_viz(sch)
-    legend_dot(graph)
-    for stage in s["stages"]:
-        stage_node_dot(graph, stage)
-
-    return dump_graph(graph.source, show_svg, dot_file_path, output_dot_string)
-
-
-def viz_dataflow_graph(sch, show_svg=False, dot_file_path="", output_dot_string=False):
-    """Top level API to render dataflow graph
-
-    Parameters
-    ----------
-    sch : schedule
-                The schedule object to visualize
-
-    show_svg : bool
-                Display graph as SVG, useful for Jupyter notebooks.
-
-    dot_file_path : string
-                Dot file to save the graph.
-
-    output_dot_string : bool
-                Return dot file content or an empty string.
-
-    Examples
-    --------
-    The following code writes a dataflow graph to a dot file.
-
-    .. code-block:: python
-        tedd.viz_dataflow_graph(s, dot_file_path = '/tmp/example.dot')
-
-    Use the following code to render a SVG graph in a Jupyter notebook.
-
-    .. code-block:: python
-        tedd.viz_dataflow_graph(s, show_svg = True)"""
-
-    def create_dataflow_graph(name=""):
-        return create_graph(name=name, rankdir="LR")
-
-    def tensor_node_dot(g, tensor):
-        """Create a tensor node."""
-        label = tensor_node_label(tensor)
-        g.node(tensor["id"], label, shape="oval", margin="0")
-
-    def tensor_node_label(tensor):
-        """Return a html format label for the given tensor."""
-        label = str(tensor["shape"]) + "\n" + str(tensor["data_type"])
-        return label
-
-    def stage_node_dot(g, stage):
-        """Create a stage node."""
-        label = stage_node_label(stage)
-        g.node(stage["id"], label, shape="none", margin="0")
-
-    def stage_node_label(stage):
-        """Return a html format label for the given stage."""
-        rows = max(1, max(len(stage["output_tensors"]), len(stage["input_tensors"])))
-        label = '<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0" ' 'CELLPADDING="4">'
-        for i in range(rows):
-            label += "<TR>"
-            if i < len(stage["input_tensors"]):
-                port_id = get_port_id(True, i)
-                label += (
-                    '<TD BGCOLOR="lightgrey" COLSPAN="2" PORT="' + port_id + '">' + str(i) + "</TD>"
-                )
-            else:
-                label += '<TD BGCOLOR="white" COLSPAN="2"></TD>'
-            if i == 0:
-                label += (
-                    '<TD BGCOLOR="white" COLSPAN="2" ROWSPAN="'
-                    + str(rows)
-                    + '">'
-                    + stage_label(stage)
-                    + "</TD>"
-                )
-            if i < len(stage["output_tensors"]):
-                port_id = get_port_id(False, i)
-                label += (
-                    '<TD BGCOLOR="lightgrey" COLSPAN="2" PORT="' + port_id + '">' + str(i) + "</TD>"
-                )
-            else:
-                label += '<TD BGCOLOR="white" COLSPAN="2"></TD>'
-            label += "</TR>"
-        label += "</TABLE>>"
-        return label
-
-    def dfg_dot(g, sch):
-        """Create edges among stages."""
-        stages = sch["stages"]
-        for stage in stages:
-            for i in range(len(stage["input_tensors"])):
-                src = dom_path_to_string(stage["input_tensors"][i], "Tensor")
-                dst = stage["id"] + ":" + get_port_id(True, i)
-                g.edge(src, dst)
-            for i in range(len(stage["output_tensors"])):
-                src = stage["id"] + ":" + get_port_id(False, i)
-                dst = stage["output_tensors"][i]["id"]
-                g.edge(src, dst)
-
-    graph = create_dataflow_graph("Dataflow Graph")
-    s = extract_dom_for_viz(sch, need_range=False)
-    for stage in s["stages"]:
-        stage_node_dot(graph, stage)
-        for tensor in stage["output_tensors"]:
-            tensor_node_dot(graph, tensor)
-
-    dfg_dot(graph, s)
-
-    return dump_graph(graph.source, show_svg, dot_file_path, output_dot_string)
diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py
index fb325de1d3ab..94006111ffa2 100644
--- a/python/tvm/driver/build_module.py
+++ b/python/tvm/driver/build_module.py
@@ -17,106 +17,36 @@
 
 # pylint: disable=invalid-name
 """The build utils in python."""
-from typing import Union, Optional, List, Mapping
+from typing import Union, Optional
 
-import warnings
 
 import tvm.tir
 
-from tvm import te
 
-from tvm.runtime import Module
 from tvm.runtime import ndarray
-from tvm.ir import container
 from tvm.tir import PrimFunc
 from tvm.ir.module import IRModule
-from tvm.te import tensor
 from tvm.target import Target
-from tvm.tir.buffer import Buffer
-from tvm.tir.expr import Var
 from tvm.driver import _ffi_api as _driver_ffi
 
 from . import _ffi_api as ffi
 
 
-def get_binds(args, compact=False, binds=None):
-    """Internal function to get binds and arg_list given arguments.
-    Parameters
-    ----------
-    args : list of Buffer or Tensor or Var
-        The argument lists to the function.
-    compact : bool
-        If the statement has already bound to a compact buffer.
-    binds : dict of :any:`Tensor` to :any:`Buffer`, optional
-        Dictionary that maps the Tensor to Buffer which specified the data layout
-        requirement of the function. By default, a new compact buffer is created
-        for each tensor in the argument.
-    Returns
-    -------
-    binds: dict
-        The bind specification
-    arg_list: list
-        The list of symbolic buffers of arguments.
-    """
-    binds, arg_list = ffi.get_binds(args, compact, binds)
-    return binds, arg_list
-
-
-def schedule_to_module(
-    sch: te.Schedule,
-    args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None,
-    name: str = "main",
-    binds: Optional[Mapping[tensor.Tensor, Buffer]] = None,
-) -> IRModule:
-    """According to the given schedule, form a function.
-
-    This is a low-level function intended for testing purposes, and
-    does not apply any optimization passes.  In general, `tvm.lower`
-    and `tvm.build` should be used instead.
-
-    Parameters
-    ----------
-    sch : tvm.te.schedule.Schedule
-        The given scheduler to form the raw body
-    args : list of Buffer or Tensor or Var
-        The argument lists to the function.
-    name : str
-        The name of result function, default name is "main"
-    binds : dict of :any:`Tensor` to :any:`Buffer`, optional
-        The binds information
-    Returns
-    -------
-    The body formed according to the given schedule
-    """
-    return ffi.schedule_to_module(sch, args, name, binds)
-
-
 def lower(
-    inp: Union[te.Schedule, PrimFunc, IRModule],
-    args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None,
+    inp: Union[PrimFunc, IRModule],
     name: str = "main",
-    binds: Optional[Mapping[tensor.Tensor, Buffer]] = None,
     simple_mode: bool = False,
 ) -> IRModule:
     """Lowering step before build into target.
 
     Parameters
     ----------
-    inp : Union[tvm.te.schedule.Schedule, tvm.tir.PrimFunc, IRModule]
+    inp : Union[tvm.tir.PrimFunc, IRModule]
         The TE schedule or TensorIR PrimFunc/IRModule to be built
 
-    args : Optional[List[Union[tvm.tir.Buffer, tensor.Tensor, tir.Var]]]
-        The argument lists to the function for TE schedule.
-
-        It should be None if we want to lower TensorIR.
     name : str
         The name of the result function.
 
-    binds : Optional[Mapping[tensor.Tensor, tvm.tir.Buffer]]
-        Dictionary that maps the Tensor to Buffer which specified the data layout
-        requirement of the function. By default, a new compact buffer is created
-        for each tensor in the argument.
-
     simple_mode : bool
         Whether only output simple and compact statement, this will skip
         LoopPartition, api wrapper generation and Unrolling.
@@ -130,139 +60,65 @@ def lower(
         return ffi.lower_module(inp, simple_mode)
     if isinstance(inp, PrimFunc):
         return ffi.lower_primfunc(inp, name, simple_mode)
-    if isinstance(inp, te.Schedule):
-        return ffi.lower_schedule(inp, args, name, binds, simple_mode)
     raise ValueError(
         f"Expected input to be an IRModule, PrimFunc or te.Schedule, but got {type(inp)}"
     )
 
 
 def build(
-    inputs: Union[te.Schedule, PrimFunc, IRModule, Mapping[str, IRModule]],
-    args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None,
+    inputs: Union[PrimFunc, IRModule],
     target: Optional[Union[str, Target]] = None,
-    target_host: Optional[Union[str, Target]] = None,
-    runtime: Optional[
-        "tvm.relay.backend.Runtime"
-    ] = None,  # Type is annotated this way to avoid cyclic dependency
-    name: Optional[str] = "default_function",
-    binds: Optional[Mapping[tensor.Tensor, Buffer]] = None,
+    name: str = "main",
 ):
     """Build a function with arguments as signature. Code will be generated
     for devices coupled with target information.
 
     Parameters
     ----------
-    inputs : Union[tvm.te.schedule.Schedule, tvm.tir.PrimFunc, IRModule, Mapping[str, IRModule]]
+    inputs : Union[tvm.tir.PrimFunc, IRModule]
         The input to be built
 
-    args : Optional[List[Union[tvm.tir.Buffer, tensor.Tensor, tir.Var]]]
-        The argument lists to the function.
-
     target : Optional[Union[str, Target]]
         The target and option of the compilation.
 
-    target_host : Optional[Union[str, Target]]
-        Host compilation target, if target is device.
-        When TVM compiles device specific program such as CUDA,
-        we also need host(CPU) side code to interact with the driver
-        setup the dimensions and parameters correctly.
-        target_host is used to specify the host side codegen target.
-        By default, llvm is used if it is enabled,
-        otherwise a stackvm interpreter is used.
-
-    runtime : Optional[Runtime]
-        Runtime to generate artifacts for
-
-    name : Optional[str]
+    name : str
         The name of result function.
 
-    binds : Optional[Mapping[tensor.Tensor, tvm.tir.Buffer]]
-        Dictionary that maps the binding of symbolic buffer to Tensor.
-        By default, a new buffer is created for each tensor in the argument.
-
     Returns
     -------
     ret : tvm.module
         A module that combines both host and device code.
 
-    Examples
-    ________
-    There are two typical example uses of this function depending on the type
-    of the argument `inputs`:
-    1. it is an IRModule.
-
-    .. code-block:: python
-
-        n = 2
-        A = te.placeholder((n,), name='A')
-        B = te.placeholder((n,), name='B')
-        C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
-        s = tvm.te.create_schedule(C.op)
-        m = tvm.lower(s, [A, B, C], name="test_add")
-        rt_mod = tvm.build(m, target="llvm")
-
-    2. it is a dict of compilation target to IRModule.
-
-    .. code-block:: python
-
-        n = 2
-        A = te.placeholder((n,), name='A')
-        B = te.placeholder((n,), name='B')
-        C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
-        s1 = tvm.te.create_schedule(C.op)
-        with tvm.target.cuda() as cuda_tgt:
-          s2 = topi.cuda.schedule_injective(cuda_tgt, [C])
-          m1 = tvm.lower(s1, [A, B, C], name="test_add1")
-          m2 = tvm.lower(s2, [A, B, C], name="test_add2")
-          rt_mod = tvm.build({"llvm": m1, "cuda": m2})
-
     Note
     ----
     See the note on :any:`tvm.target` on target string format.
     """
-    if isinstance(inputs, te.Schedule):
-        if args is None:
-            raise ValueError("args must be given for build from schedule")
-        input_mod = lower(inputs, args, name=name, binds=binds)
-    elif isinstance(inputs, (list, tuple, container.Array)):
-        merged_mod = tvm.IRModule({})
-        for x in inputs:
-            merged_mod.update(lower(x))
-        input_mod = merged_mod
-    elif isinstance(inputs, PrimFunc):
+    if isinstance(inputs, PrimFunc):
         input_mod = lower(inputs, name=name)
     elif isinstance(inputs, tvm.IRModule):
         assert (
             len(inputs.get_global_vars()) > 0
         ), "Expected a non-empty IRModule, but the IRModule contained no functions."
         input_mod = lower(inputs)
-    elif not isinstance(inputs, (dict, container.Map)):
-        raise ValueError(
-            f"Inputs must be te.Schedule, IRModule, PrimFunc, "
-            f"or dict of target to IRModule, "
-            f"but got {type(inputs)}."
-        )
-
-    if not isinstance(inputs, (dict, container.Map)):
-        target = Target.current() if target is None else target
-        if target is None and isinstance(input_mod, tvm.IRModule):
-            target_mod = {}
-            for gvar, func in input_mod.functions.items():
-                tgt = func.attrs["target"] if "target" in func.attrs else "llvm"
-                if tgt not in target_mod:
-                    target_mod[tgt] = {}
-                target_mod[tgt][gvar] = func
-
-            target_input_mod = {}
-            for tgt in target_mod.keys():
-                tir_mod = tvm.IRModule(target_mod[tgt])
-                tir_mod = tir_mod.with_attrs(input_mod.attrs)
-                target_input_mod[tgt] = tir_mod
-        else:
-            target_input_mod = {target: input_mod}
     else:
-        target_input_mod = {tgt: lower(mod) for tgt, mod in inputs.items()}
+        raise ValueError("Inputs must be IRModule or PrimFunc")
+
+    target = Target.current() if target is None else target
+    if target is None and isinstance(input_mod, tvm.IRModule):
+        target_mod = {}
+        for gvar, func in input_mod.functions.items():
+            tgt = func.attrs["target"] if "target" in func.attrs else "llvm"
+            if tgt not in target_mod:
+                target_mod[tgt] = {}
+            target_mod[tgt][gvar] = func
+
+        target_input_mod = {}
+        for tgt in target_mod.keys():
+            tir_mod = tvm.IRModule(target_mod[tgt])
+            tir_mod = tir_mod.with_attrs(input_mod.attrs)
+            target_input_mod[tgt] = tir_mod
+    else:
+        target_input_mod = {target: input_mod}
 
     # Because modules can be created from a variety of sources, we annotate them
     # with the relevant attributes here to ensure they propagate
@@ -271,18 +127,10 @@ def build(
         if not isinstance(tgt, (str, Target)):
             raise ValueError("The key of inputs must be str or " "Target when inputs is dict.")
         if not isinstance(mod, tvm.IRModule):
-            raise ValueError("inputs must be Schedule, IRModule, " "or dict of str to IRModule.")
-        annotated_mods[tgt] = mod.with_attr("runtime", runtime)
+            raise ValueError("inputs must be IRModule, " "or dict of str to IRModule.")
+        annotated_mods[tgt] = mod
 
-    # TODO(mbs): Both CompilationConfig and TIRToRuntime implement the same host target
-    #  defaulting logic, but there's currently no way to get back the decided host.
-    if target_host is not None:
-        warnings.warn(
-            "target_host parameter is going to be deprecated. "
-            "Please pass in tvm.target.Target(target, host=target_host) instead."
-        )
-
-    annotated_mods, target_host = Target.canon_target_map_and_host(annotated_mods, target_host)
+    annotated_mods, target_host = Target.canon_target_map_and_host(annotated_mods)
     if not target_host:
         for tar, mod in annotated_mods.items():
             device_type = ndarray.device(tar.kind.name, 0).device_type
@@ -296,41 +144,4 @@ def build(
 
     rt_mod_host = _driver_ffi.tir_to_runtime(annotated_mods, target_host)
 
-    annotated_mods, target_host = Target.canon_target_map_and_host(annotated_mods, target_host)
-
-    if not isinstance(target_host, Target):
-        target_host = Target(target_host)
-
-    if str(runtime) == "crt" and runtime["system-lib"]:
-        if target_host.kind.name == "c":
-            create_csource_crt_metadata_module = tvm._ffi.get_global_func(
-                "runtime.CreateCSourceCrtMetadataModule"
-            )
-            to_return = create_csource_crt_metadata_module([rt_mod_host], target_host, runtime)
-        elif target_host.kind.name == "llvm":
-            create_llvm_crt_metadata_module = tvm._ffi.get_global_func(
-                "runtime.CreateLLVMCrtMetadataModule"
-            )
-            to_return = create_llvm_crt_metadata_module([rt_mod_host], target_host, runtime)
-    else:
-        to_return = rt_mod_host
-
-    return OperatorModule.from_module(to_return, ir_module_by_target=annotated_mods, name=name)
-
-
-class OperatorModule(Module):
-    """Wraps the Module returned by tvm.build() and captures additional outputs of that function."""
-
-    @classmethod
-    def from_module(cls, mod, **kwargs):
-        # NOTE(areusch): It is generally unsafe to continue using `mod` from this point forward.
-        # If an exception occurs in cls.__init__, handle will be deleted. For this reason,
-        # set mod.handle to None.
-        handle = mod.handle
-        mod.handle = None
-        return cls(handle, **kwargs)
-
-    def __init__(self, handle, ir_module_by_target=None, name=None):
-        super(OperatorModule, self).__init__(handle)
-        self.ir_module_by_target = ir_module_by_target
-        self.name = name
+    return rt_mod_host
diff --git a/python/tvm/exec/measure_peak.py b/python/tvm/exec/measure_peak.py
deleted file mode 100644
index 178e60089245..000000000000
--- a/python/tvm/exec/measure_peak.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""measure bandwidth and compute peak
-
-e.g.
-python3 -m tvm.exec.measure_peak --target cuda --rpc-host 127.0.0.1 --rpc-port 9090
-python3 -m tvm.exec.measure_peak --target opencl --target-host "llvm -mtriple=aarch64-linux-gnu" \
-        --rpc-host $TVM_OPENCL_DEVICE_HOST --rpc-port 9090
-"""
-
-import argparse
-import logging
-
-from tvm.target import Target
-from ..contrib.peak import measure_peak_all
-
-
-def main():
-    """Main function"""
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--target", type=str, default="llvm", help="The build target")
-    parser.add_argument(
-        "--target-host", type=str, default=None, help="The host code compilation target"
-    )
-    parser.add_argument(
-        "--rpc-host", type=str, default="127.0.0.1", help="the hostname of the server"
-    )
-    parser.add_argument("--rpc-port", type=int, default=9090, help="The port of the RPC")
-
-    args = parser.parse_args()
-    logging.basicConfig(level=logging.INFO)
-
-    args.target, args.target_host = Target.canon_target_and_host(args.target, args.target_host)
-    measure_peak_all(args.target, args.target_host, args.rpc_host, args.rpc_port)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/tvm/relax/frontend/torch/fx_translator.py b/python/tvm/relax/frontend/torch/fx_translator.py
index ce1f284be6bc..d57d24bf2f77 100644
--- a/python/tvm/relax/frontend/torch/fx_translator.py
+++ b/python/tvm/relax/frontend/torch/fx_translator.py
@@ -99,7 +99,6 @@ def convert(node: fx.Node) -> relax.Var:
     ########## Neural Network ##########
 
     def _adaptive_avg_pool2d_module(self, node: fx.Node) -> relax.Var:
-
         module = self.named_modules[node.target]
         x = self.env[node.args[0]]
         output_size = module.output_size
diff --git a/python/tvm/relax/vm_build.py b/python/tvm/relax/vm_build.py
index cfa4143b66c3..ac4d9698a072 100644
--- a/python/tvm/relax/vm_build.py
+++ b/python/tvm/relax/vm_build.py
@@ -179,10 +179,12 @@ def _vmcodegen(
     raise ValueError(f"Unknown exec_mode {exec_mode}")
 
 
-def _autodetect_system_lib_req(
-    target: Optional[tvm.target.Target] = None, system_lib: Optional[bool] = None
+def _auto_attach_system_lib_prefix(
+    tir_mod: tvm.IRModule,
+    target: Optional[tvm.target.Target] = None,
+    system_lib: Optional[bool] = None,
 ):
-    """Automatically detect system lib requirement"""
+    """Automatically detect system lib req and attach prefix attr"""
     if target is not None:
         host = target if target.host is None else target.host
         if system_lib is None:
@@ -191,9 +193,9 @@ def _autodetect_system_lib_req(
                 system_lib = True
 
     if system_lib:
-        # use packed-func to avoid relay dep.
-        return tvm.get_global_func("relay.backend.CreateRuntime")("cpp", {"system-lib": system_lib})
-    return None
+        if tir_mod.get_attr("system_lib_prefix") is None:
+            return tir_mod.with_attr("system_lib_prefix", "")
+    return tir_mod
 
 
 def _vmlink(
@@ -246,11 +248,8 @@ def _vmlink(
     relax_ext_libs = []
     tir_ext_libs = []
     if tir_mod is not None and len(tir_mod.get_global_vars()) > 0:
-        lib = tvm.build(
-            tir_mod,
-            target=target,
-            runtime=_autodetect_system_lib_req(target, system_lib),
-        )
+        tir_mod = _auto_attach_system_lib_prefix(tir_mod, target, system_lib)
+        lib = tvm.build(tir_mod, target=target)
     for ext_mod in ext_libs:
         if ext_mod.is_device_module:
             tir_ext_libs.append(ext_mod)
diff --git a/python/tvm/te/__init__.py b/python/tvm/te/__init__.py
index 0907ea2ebf85..e7b394ebf76c 100644
--- a/python/tvm/te/__init__.py
+++ b/python/tvm/te/__init__.py
@@ -28,21 +28,11 @@
 from tvm.tir import comm_reducer, min, max, sum
 from tvm.tir import add, subtract, multiply
 
-from .schedule import (
-    Schedule,
-    Stage,
-    create_schedule,
-    SpecializedCondition,
-    AXIS_SEPARATOR,
-)
 from .tensor import TensorSlice, Tensor
-from .tensor_intrin import decl_tensor_intrin
 from .tag import tag_scope
 from .operation import placeholder, compute, scan, extern, var, size_var, const
-from .operation import thread_axis, reduce_axis
+from .operation import thread_axis, reduce_axis, AXIS_SEPARATOR
 from .operation import create_prim_func
 from .operation import extern_primfunc
 
-from .tensor import PlaceholderOp, ComputeOp, TensorComputeOp, ScanOp, ExternOp, HybridOp
-from .autodiff import gradient
-from . import hybrid
+from .tensor import PlaceholderOp, ComputeOp, TensorComputeOp, ScanOp, ExternOp
diff --git a/python/tvm/te/autodiff.py b/python/tvm/te/autodiff.py
deleted file mode 100644
index f8650839948d..000000000000
--- a/python/tvm/te/autodiff.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Automatic differentiation of tensor expressions."""
-from . import _ffi_api
-
-
-def gradient(output, inputs, head=None):
-    """Perform reverse-mode automatic differentiation.
-
-    Parameters
-    ----------
-    output : Tensor
-        The tensor to differentiate.
-
-    inputs : List[Tensor]
-        The list of input tensors to be differentiated wrt.
-
-    head : Tensor
-        The adjoint of the output, in other words, some tensor, by which the Jacobians
-        will be multiplied. Its shape must be of the form `prefix + output.shape`.
-        If `None` is passed, the identity tensor of shape `output.shape + output.shape`
-        will be used.
-
-    Returns
-    -------
-    tensors: List[Tensor]
-        The result gradient, in the same order as the inputs
-
-    Example
-    -------
-    .. code-block:: python
-
-        x = tvm.placeholder((32, 3, 28, 28), name='x')
-        w1 = tvm.placeholder((10, 3, 3, 3), name='w1')
-        w2 = tvm.placeholder((10, 10, 3, 3), name='w2')
-        z1 = topi.nn.conv2d(x, w1, 1, 1, 1)
-        z2 = topi.nn.conv2d(z1, w2, 1, 1, 1)
-        y = topi.sum(z2)
-
-        # produce gradients
-        [dw1, dw2] = tvm.gradient(y, [w1, w2])
-
-        # produce Jacobians
-        [jw1, jw2] = tvm.gradient(z2, [w1, w2])
-
-        # produce gradients, the head adjoint for z2 is provided manually
-        [dw1, dw2] = tvm.gradient(z2, [w1, w2], topi.full_like(z2, 1.0))
-
-    """
-    if not isinstance(inputs, list):
-        inputs = [inputs]
-    return _ffi_api.Gradient(output, inputs, head)
diff --git a/python/tvm/te/hybrid/__init__.py b/python/tvm/te/hybrid/__init__.py
deleted file mode 100644
index cd320c6b209c..000000000000
--- a/python/tvm/te/hybrid/__init__.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Hybrid Programming APIs of TVM Python Package.
-
-This package maps a subset of python to HalideIR so that:
-1. Users can write some preliminary versions of the computation patterns
-have not been supported yet and verify it across the real execution and
-python semantic emulation.
-2. So far, it is a text format dedicated to HalideIR Phase 0. Refer tvm.lower
-for more details. A larger ambition of this module is to support all levels of
-HalideIR.
-"""
-
-# TODO(@were): Make this module more complete.
-# 1. Support HalideIR dumping to Hybrid Script
-# 2. Support multi-level HalideIR
-import inspect
-import tvm._ffi
-import tvm.te.schedule
-from tvm._ffi.base import decorate
-
-from .module import HybridModule
-from .parser import source_to_op
-from .utils import _pruned_source
-
-
-def script(pyfunc):
-    """Decorate a python function as hybrid script.
-
-    The hybrid function support emulation mode and parsing to
-    the internal language IR.
-
-    Returns
-    -------
-    hybrid_func : function
-        A decorated hybrid script function.
-    """
-    # pylint: disable=import-outside-toplevel, missing-docstring
-    def wrapped_func(func, *args, **kwargs):
-        from .utils import _is_tvm_arg_types
-
-        if _is_tvm_arg_types(args):
-            src = _pruned_source(func)
-            closure_vars = inspect.getclosurevars(func).nonlocals
-            closure_vars.update(inspect.getclosurevars(func).globals)
-            return source_to_op(src, args, func.__globals__, closure_vars)
-
-        from .runtime import _enter_hybrid_runtime, _restore_runtime
-
-        intersect = _enter_hybrid_runtime(func)
-        value = func(*args, **kwargs)
-        _restore_runtime(func, intersect)
-        return value
-
-    return decorate(pyfunc, wrapped_func)
-
-
-def build(sch, inputs, outputs, name="hybrid_func"):
-    """Dump the current schedule to hybrid module
-
-    Parameters
-    ----------
-    sch: tvm.te.Schedule
-        The schedule to be dumped
-
-    inputs: An array of Tensors or Vars
-        The inputs of the function body
-
-    outputs: An array of Tensors
-        The outputs of the function body
-
-    Returns
-    -------
-    module: HybridModule
-        The built results is wrapped in a HybridModule.
-        The usage of HybridModule is roughly the same as normal TVM-built modules.
-    """
-    sch = sch.normalize()
-    bounds = tvm.te.schedule.InferBound(sch)
-    stmt = tvm.te.schedule.ScheduleOps(sch, bounds)
-
-    src = _Dump(stmt, inputs, outputs, name)
-
-    return HybridModule(src, name)
-
-
-tvm._ffi._init_api("tvm.hybrid", __name__)
diff --git a/python/tvm/te/hybrid/calls.py b/python/tvm/te/hybrid/calls.py
deleted file mode 100644
index 948a0d7665ff..000000000000
--- a/python/tvm/te/hybrid/calls.py
+++ /dev/null
@@ -1,183 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Intrinsics of TVM-Python Hybrid Script for Python compilation time
-semantic support."""
-
-from tvm.runtime import const, convert
-import tvm.te
-from tvm.ir.container import Array
-from tvm.target import Target
-from tvm.tir import expr as _expr
-from tvm.tir import call_intrin
-from tvm.tir.stmt import ForKind
-
-from .utils import _internal_assert
-
-# pylint: disable=redefined-builtin,invalid-name
-
-LOOP_INTRIN = {
-    "range": ForKind.SERIAL,
-    "unroll": ForKind.UNROLLED,
-    "parallel": ForKind.PARALLEL,
-    "vectorize": ForKind.VECTORIZED,
-    "const_range": (ForKind.UNROLLED,),
-}
-
-
-def _range(annotation, args):
-    """Handling TVM loop types"""
-    n = args.__len__()
-    if n == 1:
-        low, ext = const(0, dtype="int32"), args[0]
-    else:
-        _internal_assert(n == 2, "A loop intrinsic should only have 1 or 2 arguments!")
-        low, ext = args[0], args[1]
-    if not tvm.tir.analysis.expr_deep_equal(low, const(0, dtype="int32")):
-        ext = ext - low
-    kind = LOOP_INTRIN[annotation]
-    iter_var = None
-    return iter_var, low, ext, kind
-
-
-range = unroll = vectorize = parallel = const_range = _range  # pylint: disable=invalid-name
-
-
-def bind(func_id, args):
-    """Handling TVM thread binding"""
-    _internal_assert(func_id == "bind", "This function cannot be directly invoked!")
-    _internal_assert(args.__len__() == 2, "A loop bind should only have 2 arguments!")
-    _internal_assert(isinstance(args[0], str), "A loop bind's first argument should be a string!")
-    low, ext = const(0, "int32"), args[1]
-    iter_var = tvm.te.thread_axis((low, ext), args[0])
-    kind = None
-    return iter_var, low, ext, kind
-
-
-def _math_intrin(func_id, args):
-    # pylint: disable=import-outside-toplevel
-    from tvm.tir import op
-
-    return getattr(op, func_id)(*args)
-
-
-sqrt = (
-    log
-) = exp = tanh = sigmoid = power = popcount = round = _math_intrin  # pylint: disable=invalid-name
-
-
-def _min_max(func_id, args):
-    _internal_assert(args.__len__() == 2, "Max/Min function should have 2 elements")
-    return getattr(_expr, func_id.title())(args[0], args[1])
-
-
-min = max = _min_max  # pylint: disable=invalid-name
-
-
-def _allocate_tensor(func_id, args):
-    """Handling TVM tensor allocation.
-    You may refer hybrid.intrin.allocate for more details."""
-    n = args.__len__()
-    _internal_assert(
-        isinstance(convert(args[0]), Array), "allocate's first argument should be a tuple of shape!"
-    )
-    shape = args[0]
-    for i in shape:
-        _internal_assert(isinstance(i, (_expr.PrimExpr, int)), "The shape should be an expression")
-    if n > 1:
-        _internal_assert(isinstance(args[1], str), "The data type should be an str")
-        _internal_assert(
-            args[1].startswith("int") or args[1].startswith("float"),
-            "The data type should be either int or float!",
-        )
-        dtype = args[1]
-    else:
-        dtype = "float32"
-    if n > 2:
-        _internal_assert(isinstance(args[2], str), "The data scope should be an string")
-        _internal_assert(func_id != "output_tensor", "Output tensor cannot specify scope")
-        scope = args[2]
-    else:
-        scope = "global" if func_id != "output_tensor" else "output"
-    return (shape, dtype, scope)
-
-
-output_tensor = allocate = _allocate_tensor  # pylint: disable=invalid-name
-
-
-def len(func_id, args):
-    """Iterpret the len function"""
-    _internal_assert(args.__len__() == 1, "Only 1 argument is expected!")
-    _internal_assert(func_id == "len", "This function cannot be directly invoked!")
-    try:
-        return convert(args[0].__len__())
-    except:  # pylint: disable=bare-except
-        _internal_assert(args[0].shape.__len__() == 1, "Only one-dimension array can get len")
-        return convert(args[0].shape[0])
-
-
-def _cast(func_id, args):
-    _internal_assert(
-        args.__len__() == 1,
-        f"Casting to {func_id} only supports a single argument",
-    )
-    # The FFI can handle any conversion of `args[0]` into PrimExpr, if
-    # required.
-    return _expr.Cast(func_id, args[0])
-
-
-float16 = float32 = float64 = _cast  # pylint: disable=invalid-name
-int8 = int16 = int32 = int64 = _cast  # pylint: disable=invalid-name
-uint8 = uint16 = uint32 = uint64 = _cast  # pylint: disable=invalid-name
-
-
-def ceil_div(func_id, args):
-    _internal_assert(func_id == "ceil_div", "This function cannot be directly invoked!")
-    _internal_assert(args.__len__() == 2, "2 arguments expected for division!")
-    a, b = args
-    return (a + b - 1) // b
-
-
-def likely(func_id, args):
-    _internal_assert(args.__len__() == 1, "Only one expression can be likely")
-    _internal_assert(func_id == "likely", "This function cannot be directly invoked!")
-    return call_intrin(args[0].dtype, "tir.likely", *args)
-
-
-def max_num_threads(func_id, args):
-    """Set the maximum number of threads."""
-    _internal_assert(func_id == "max_num_threads", "This function cannot be directly invoked!")
-    _internal_assert(args.__len__() <= 1, "At most one argument accepted!")
-    if args.__len__() == 0:
-        res = Target.current().max_num_threads
-    else:
-        _internal_assert(isinstance(args[0], _expr.IntImm), "In tvm bool should be uint")
-        res = Target.current(args[0].value).max_num_threads
-    return convert(res)
-
-
-def inf(func_id, args):
-    """Infinity"""
-    _internal_assert(func_id == "inf", "This function cannot be directly invoked!")
-    _internal_assert(args.__len__() == 1, "One argument accepted!")
-    return tvm.tir.max_value(args[0])
-
-
-def ninf(func_id, args):
-    """Negative infinity"""
-    _internal_assert(func_id == "ninf", "This function cannot be directly invoked!")
-    _internal_assert(args.__len__() == 1, "One argument accepted!")
-    return tvm.tir.min_value(args[0])
diff --git a/python/tvm/te/hybrid/module.py b/python/tvm/te/hybrid/module.py
deleted file mode 100644
index 729805b31b6b..000000000000
--- a/python/tvm/te/hybrid/module.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Methods and data structures to support dumping HalideIR to Hybrid Script.
-This allows users to do quick hack to generated HalideIR and cast it back to
-TVM modules.
-
-To enable this feature, you need to build with -DUSE_HYBRID_DUMP=ON.
-"""
-
-import ast
-
-from tvm.contrib import utils
-from .utils import _internal_assert
-from .utils import _is_tvm_arg_types
-from .parser import source_to_op
-
-
-class HybridModule(object):
-    """The usage of Hybrid Module is very similar to conventional TVM module,
-    but conventional TVM module requires a function body which is already fully
-    lowered. This contradicts to the fact that Hybrid Module is originally a text
-    format for Phase 0 HalideIR. Thus, a totally separated module is defined."""
-
-    def __init__(self, src=None, name=None):
-        """The constructor of this a hybrid module
-
-        Parameters
-        ----------
-        src : str
-            The source code of this module
-
-        name : str
-            The name of this module
-        """
-        self.src_ = self.name = self.func_ = self.root_ = None
-        if src is not None:
-            temp = utils.tempdir()
-            dst = temp.relpath("script.py")
-            with open(dst, "w") as f:
-                f.write(f"import tvm\n@tvm.te.hybrid.script\n{src}")
-
-            if name is not None:
-                self.name = name
-            self.load(dst)
-
-    def __call__(self, *args):
-        if _is_tvm_arg_types(args):
-            return source_to_op(self.root_, args, globals(), {})
-        return self.func_(*args)
-
-    def get_source(self):
-        return self.src_
-
-    def save(self, path):
-        if not path.endswith(".py"):
-            path = path + ".py"
-        with open(path, "w") as f:
-            f.write(self.src_)
-
-    def load(self, path):
-        """Load the module from a python file
-
-        Parameters
-        ----------
-        path : str
-            Path to the given python file
-        """
-        with open(path, "r") as f:
-            self.src_ = f.read()
-
-        src = self.src_
-
-        class FindFunc(ast.NodeVisitor):
-            """Find the function in module to be loaded module."""
-
-            # pylint: disable=invalid-name
-            def __init__(self):
-                self.name = None
-                self.root = None
-
-            def visit_FunctionDef(self, node):
-                _internal_assert(self.name is None, "For now, only one function supported!")
-                self.name = node.name
-                _internal_assert(self.root is None, "For now, only one function supported!")
-                self.root = node
-
-        root = ast.parse(src)
-        finder = FindFunc()
-        finder.visit(root)
-        _internal_assert(finder.name is not None and finder.root is not None, "No function found!")
-        if self.name is None:
-            self.name = finder.name
-        self.root_ = finder.root
-
-        _, local_ = {}, {}
-        exec(self.src_, _, local_)  # pylint: disable=exec-used
-        local_.pop("tvm")
-        assert len(local_) == 1
-        self.func_ = list(local_.values())[0]
diff --git a/python/tvm/te/hybrid/parser.py b/python/tvm/te/hybrid/parser.py
deleted file mode 100644
index bd5a060cd01c..000000000000
--- a/python/tvm/te/hybrid/parser.py
+++ /dev/null
@@ -1,658 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Hybrid Script Parser"""
-
-import ast
-import operator
-import logging
-import sys
-import numbers
-
-from enum import Enum
-from tvm.ir import Array, Range
-import tvm.runtime
-import tvm.tir
-import tvm.te
-import tvm.te._ffi_api
-import tvm.arith
-
-from tvm.tir import expr as _expr
-from tvm.tir import stmt as _stmt
-from tvm.te.tensor import Tensor, Operation
-from tvm.tir import all as _all
-from tvm.tir import any as _any
-
-from .utils import _internal_assert
-from . import calls
-from . import utils
-from .preprocessor import determine_variable_usage
-
-
-def concat_list_to_block(lst):
-    """Concatenate a list of Python IR nodes to HalideIR Block"""
-    if not lst:
-        return utils.make_nop()
-    n = len(lst)
-    if n == 1:
-        return lst[0]
-    return _stmt.SeqStmt(lst)
-
-
-def visit_list_to_block(visit, lst):
-    """Visit and concatenate a list of Python IR nodes to HalideIR Block"""
-    lst = [visit(stmt) for stmt in lst if not utils.is_docstring(stmt)]
-    lst = [stmt for stmt in lst if not tvm.ir.structural_equal(stmt, utils.make_nop())]
-    if not lst:
-        return utils.make_nop()
-    return concat_list_to_block(lst)
-
-
-class Symbol(Enum):
-    """Enumerates types in the symbol table"""
-
-    Callable = 0
-    Input = 1
-    OutputBuffer = 2
-    GlobalBuffer = 3
-    LocalBuffer = 4
-    SharedBuffer = 5
-    ConstVar = 6
-    BufferVar = 7
-    LoopVar = 8
-    ConstLoopVar = 9
-    ThreadBind = 10
-
-
-def _floordiv(x, y):
-    if isinstance(x, _expr.ExprOp) or isinstance(y, _expr.ExprOp):
-        return tvm.tir.floordiv(x, y)
-    return operator.floordiv(x, y)
-
-
-def _floormod(x, y):
-    if isinstance(x, _expr.ExprOp) or isinstance(y, _expr.ExprOp):
-        return tvm.tir.floormod(x, y)
-    return operator.mod(x, y)
-
-
-class HybridParser(ast.NodeVisitor):
-    """Python AST visitor pass which finally lowers it to HalideIR"""
-
-    _binop_maker = {
-        ast.Add: operator.add,
-        ast.Sub: operator.sub,
-        ast.Mult: operator.mul,
-        ast.Div: operator.div if sys.version_info[0] == 2 else operator.truediv,
-        ast.FloorDiv: _floordiv,
-        ast.Mod: _floormod,
-        ast.BitOr: operator.or_,
-        ast.BitAnd: operator.and_,
-        ast.BitXor: operator.xor,
-        ast.Gt: operator.gt,
-        ast.GtE: operator.ge,
-        ast.Lt: operator.lt,
-        ast.LtE: operator.le,
-        ast.Eq: operator.eq,
-        ast.NotEq: operator.ne,
-        ast.And: _all,
-        ast.Or: _any,
-    }
-
-    _unaryop_maker = {ast.USub: operator.neg, ast.Invert: operator.invert, ast.Not: operator.not_}
-
-    def __init__(self, args, usage, symbols, closure_vars, func_name=None):
-        """
-        Parameters
-        ----------
-        args: A list of tvm.te.placeholder or te.var
-            Provided by the user, the argument list of the function to be lowered.
-
-        usage: A dict of variables used in last in this function
-            Provided by last lower pass, which collects this information
-
-        symbols : list of str
-            The symbol list of the global context of the function.
-
-        closure_vars: dict
-            A dict of external name reference captured by this function.
-
-        Returns
-        -------
-        func_name: str
-            The name of the function to be lowered; if not provided,
-            the compiler will use the name in the AST
-        """
-        self.args = list(args)
-        self.usage = usage.copy()
-
-        self.symbols = {}  # Symbol table
-        for k, v in symbols.items():
-            if callable(v):
-                self.add_symbol(k, Symbol.Callable, v)
-
-        self.closure_vars = closure_vars
-
-        self.binds = {}  # Thread binds
-        self.device = 0  # Is it generating device
-
-        self.func_name = func_name  # The name of the function to be lowered
-        self.outputs = []  # Output tensors' name
-        self.side_effect = set()  # Tensors with side effects
-        self.parsed_body = None  # The parsed HalideIR body
-        self.analyzer = tvm.arith.Analyzer()
-        self.returned = False  # If this function has a valid return
-
-    def add_symbol(self, key, ty, val):  # pylint: disable=invalid-name
-        """Add value to the symbol table context"""
-        if key in self.symbols.keys():
-            old = str(self.symbols[key])
-            new = str((ty, val))
-            _internal_assert(False, f"Name conflict in symbol table! [{key}] {old} -> {new}")
-
-        self.symbols[key] = ty, val
-
-        if ty == Symbol.ThreadBind:
-            if val.var.name not in self.binds.keys():
-                self.binds[val.var.name] = val
-                return
-            val_ = self.binds[val.var.name]
-            _internal_assert(
-                tvm.tir.analysis.expr_deep_equal(val_.dom.extent, val.dom.extent),
-                "Thread extents should be uniform!",
-            )
-            self.symbols[key] = ty, val_
-
-    def wrap_up_realize(self, node, body):
-        """Wrap up all the variables which will no longer be used"""
-        to_pop = []
-        for key, val in self.usage.items():
-            _, level, _ = val
-            if key not in self.symbols:
-                # don't realize the symbols that are never visited
-                continue
-            if level != node:
-                continue
-            _internal_assert(key in self.symbols.keys(), f"Unknown symbol {key}!")
-
-            ty, entry = self.symbols[key]  # pylint: disable=invalid-name
-            if ty in [Symbol.Input, Symbol.OutputBuffer]:
-                continue
-            if "Buffer" in ty.name:
-                _buf = entry
-                _scope = "global" if ty is Symbol.BufferVar else ty.name[:-6].lower()
-                to_pop.append(key)
-            else:
-                continue
-
-            if _scope == "global":
-                body = self.wrap_up_binds(body)
-
-            _domain = [Range.from_min_extent(0, i) for i in _buf.shape]
-            _dtype = _buf.dtype
-            _true = tvm.runtime.convert(True)
-            body = tvm.tir.ProducerRealize(_buf, _domain, _true, body, tvm.runtime.convert(_scope))
-
-        for elem in to_pop:
-            self.symbols.pop(elem)
-
-        return body
-
-    def wrap_up_binds(self, body):
-        for _, iter_var in self.binds.items():
-            ext = iter_var.dom.extent
-            body = tvm.tir.AttrStmt(iter_var, "thread_extent", ext, body)
-        self.binds = {}
-        return body
-
-    # pylint: disable=invalid-name, missing-docstring
-    def visit_Module(self, node):
-        _internal_assert(
-            len(node.body) == 1, "Only one-function source code will be fed to this parser!"
-        )
-        return self.visit(node.body[0])
-
-    def visit_FunctionDef(self, node):
-        _internal_assert(
-            len(node.args.args) == len(self.args),
-            "The number of arguments passed to the \
-                         function should be the same as it is defined!",
-        )
-        if self.func_name is None:
-            self.func_name = node.name
-        for idx, arg in enumerate(node.args.args):
-            _attr = "id" if sys.version_info[0] < 3 else "arg"  # To make py2 and 3 compatible
-            self.add_symbol(getattr(arg, _attr), Symbol.Input, self.args[idx])
-        res = visit_list_to_block(self.visit, node.body)
-        res = self.wrap_up_realize(node, res)
-        return self.wrap_up_binds(res)
-
-    def visit_Expr(self, node):
-        return self.visit(node.value)
-
-    def visit_Name(self, node):
-        name = node.id
-        if sys.version_info[0] == 2 and name in ["True", "False"]:
-            return tvm.runtime.convert(ast.literal_eval(name))
-
-        if name in self.closure_vars:
-            return tvm.runtime.convert(self.closure_vars[name])
-
-        ty, entry = self.symbols[name]
-        _internal_assert(name in self.symbols, f"Unknown symbol {name}!")
-        if ty in [Symbol.LoopVar, Symbol.Input, Symbol.ConstLoopVar]:
-            return entry
-        if ty is Symbol.ThreadBind:
-            return entry.var
-        if ty is Symbol.ConstVar:
-            return entry if isinstance(node.ctx, ast.Load) else None
-        if ty is Symbol.BufferVar:
-            if isinstance(node.ctx, ast.Load):
-                return tvm.tir.ProducerLoad(entry, [tvm.runtime.const(0, "int32")])
-            return entry, [tvm.runtime.const(0, "int32")]
-        # Do I need any assertion here?
-        return entry
-
-    def visit_Num(self, node):
-        if isinstance(node.n, numbers.Integral):
-            dtype = "int32"
-        elif isinstance(node.n, float):
-            dtype = "float32"
-        else:
-            _internal_assert(
-                isinstance(node.n, bool), "The data type should be one of (int, float, bool)"
-            )
-            dtype = "bool"
-        return tvm.runtime.const(node.n, dtype)
-
-    def visit_NameConstant(self, node):
-        return tvm.tir.const(node.value)
-
-    def visit_AugAssign(self, node):
-        buf = self.visit(node.target)
-        rhs = self.visit(node.value)
-        if isinstance(buf, tuple):
-            _internal_assert(len(buf) == 2, "LHS is supposed to be (buf, args)!")
-            buf, args = buf
-        else:
-            args = [tvm.runtime.const(0, "int32")]
-        _internal_assert(isinstance(buf, Tensor), "LHS is supposed to be Tensor!")
-
-        read = tvm.tir.ProducerLoad(buf, args)
-        value = HybridParser._binop_maker[type(node.op)](read, rhs)
-
-        return tvm.tir.ProducerStore(buf, value, args)
-
-    def visit_Assign(self, node):
-        rhs = self.visit(node.value)
-        if isinstance(rhs, Operation):
-            rmap = {}
-            _internal_assert(
-                len(node.targets) == rhs.num_outputs, "Unable to detuple the outs to targets"
-            )
-            for i in range(rhs.num_outputs):
-                _internal_assert(
-                    isinstance(node.targets[i], ast.Name),
-                    "You should bind a pure name to the tensors",
-                )
-                self.add_symbol(node.targets[i].id, Symbol.GlobalBuffer, rhs.output(i))
-                rmap[rhs.outputs[i].op] = rhs.output(i)
-            return utils.replace_io(rhs.body, rmap)
-
-        _internal_assert(len(node.targets) == 1, "So far only one-valued assignment is supported!")
-        lhs = node.targets[0]
-        if isinstance(rhs, _expr.PrimExpr):
-            rhs = self.analyzer.simplify(rhs)
-        if isinstance(lhs, ast.Name):
-            # TODO: support defined intermediate buffer later
-            lhs_ = lhs
-            lhs = lhs.id
-            if lhs in self.symbols.keys():
-                ty, _ = self.symbols[lhs]
-                _internal_assert(ty != Symbol.LoopVar, "Loop variable cannot be overwritten!")
-            decl, _, rw = self.usage[lhs]
-            if decl == lhs_:
-                _internal_assert(
-                    lhs not in self.symbols.keys(),
-                    "This value should not be defined before this point!",
-                )
-                if isinstance(rhs, tuple):
-                    shape, dtype, scope = rhs
-                    ph = tvm.te.placeholder(shape, dtype=dtype, name=lhs)
-                    self.add_symbol(lhs, getattr(Symbol, scope.title() + "Buffer"), ph)
-                    if scope == "output":
-                        self.outputs.append(lhs)
-                    return utils.make_nop()
-                if isinstance(rhs, utils.halide_imm_types) and ast.Store not in rw:
-                    self.add_symbol(lhs, Symbol.ConstVar, rhs)
-                else:
-                    _internal_assert(
-                        self.device == 0,
-                        "Single variable not supported in devices' side!\n"
-                        + "If you are using GPU, please allocate a 'local' spad "
-                        + "outside the bind body",
-                    )
-                    ph = tvm.te.placeholder((1,), dtype=rhs.dtype, name=lhs)
-                    self.add_symbol(lhs, Symbol.BufferVar, ph)
-            lhs = self.visit(lhs_)
-            if lhs is not None:
-                buf, args = lhs
-                return tvm.tir.ProducerStore(buf, rhs, args)
-            return utils.make_nop()
-
-        lhs, args = self.visit(lhs)
-        _internal_assert(
-            isinstance(lhs, Tensor), "An array access's LHS is expected to be a expr.Call!"
-        )
-        res = tvm.tir.ProducerStore(lhs, rhs, args)
-        return res
-
-    def visit_Index(self, node):
-        if isinstance(node.value, ast.Tuple):
-            return self.visit(node.value)
-        return [self.visit(node.value)]
-
-    def visit_Attribute(self, node):
-        buf = self.visit(node.value)
-        return getattr(buf, node.attr)
-
-    def visit_Subscript(self, node):
-        args = self.visit(node.slice)
-        if sys.version_info >= (3, 9):
-            if not isinstance(node.slice, ast.Tuple):
-                args = [args]
-
-        arr = self.visit(node.value)
-        if isinstance(arr, (Array, list, tuple)):
-            for i in args:
-                if isinstance(i, numbers.Integral):
-                    arr = arr[i]
-                else:
-                    _internal_assert(
-                        isinstance(i, (_expr.IntImm,)), "All indices are supposed to be constants"
-                    )
-                    arr = arr[i.value]
-            return arr
-        if isinstance(node.ctx, ast.Load):
-            return tvm.tir.ProducerLoad(arr, args)
-        return arr, args
-
-    def visit_With(self, node):
-        if sys.version_info[0] < 3:
-            context = node.context_expr
-            option = node.optional_vars
-        else:
-            _internal_assert(len(node.items) == 1, "Only one with element is supported so far!")
-            context = node.items[0].context_expr
-            option = node.items[0].optional_vars
-        _internal_assert(isinstance(context, ast.Call), "The object must be a Python func call!")
-        _internal_assert(isinstance(option, ast.Name), "The object after 'as' must be an id!")
-        self.annotation[option.id] = context.func.id
-        return visit_list_to_block(self.visit, node.body)
-
-    def visit_If(self, node):
-        cond = self.analyzer.simplify(self.visit(node.test))
-
-        # Return no IfThenElse if proven
-        if isinstance(cond, _expr.IntImm):
-            if cond.value:
-                return visit_list_to_block(self.visit, node.body)
-            if node.orelse:
-                return visit_list_to_block(self.visit, node.orelse)
-            return utils.make_nop()
-
-        if_body = visit_list_to_block(self.visit, node.body)
-
-        if node.orelse:
-            else_body = visit_list_to_block(self.visit, node.orelse)
-        else:
-            else_body = None
-        return tvm.tir.IfThenElse(cond, if_body, else_body)
-
-    def visit_IfExp(self, node):
-        cond = self.visit(node.test)
-        if_body = self.visit(node.body)
-        else_body = self.visit(node.orelse)
-        return tvm.tir.Select(cond, if_body, else_body)
-
-    def visit_Compare(self, node):
-        _internal_assert(len(node.ops) == len(node.comparators), "#compare ops != #comparators")
-        ops = [self.visit(node.left)]
-        ops += [self.visit(i) for i in node.comparators]
-        res = []
-        for i in range(len(node.ops)):
-            lhs = ops[i]
-            rhs = ops[i + 1]
-            res.append(HybridParser._binop_maker[type(node.ops[i])](lhs, rhs))
-        return _all(*res)
-
-    def visit_BoolOp(self, node):
-        n = len(node.values)
-        if n == 1:
-            _internal_assert(isinstance(node.op, ast.Not), "Unary is supposed to be not!")
-            return operator.not_(self.visit(node.values[0]))
-        _internal_assert(isinstance(node.op, (ast.And, ast.Or)), "Binary is supposed to be and/or!")
-        values = [self.visit(i) for i in node.values]
-        return HybridParser._binop_maker[type(node.op)](*values)
-
-    def visit_UnaryOp(self, node):
-        operand = self.visit(node.operand)
-        return HybridParser._unaryop_maker[type(node.op)](operand)
-
-    def visit_BinOp(self, node):
-        lhs = self.visit(node.left)
-        rhs = self.visit(node.right)
-        return HybridParser._binop_maker[type(node.op)](lhs, rhs)
-
-    def visit_Call(self, node):
-        # Yet, no function pointer supported
-        _internal_assert(
-            isinstance(node.func, ast.Name), "Only id-function function call is supported so far!"
-        )
-
-        func_id = node.func.id
-        args = [self.visit(i) for i in node.args]
-        # Intrinsics'
-        if hasattr(calls, func_id):
-            return getattr(calls, func_id)(func_id, args)
-        # Contexts'
-        _internal_assert(
-            func_id in self.symbols.keys(),
-            f"The function called ({func_id}) is not in the context either!",
-        )
-        ty, entry = self.symbols[func_id]
-        _internal_assert(ty is Symbol.Callable, "Are you sure what you call is a function?!")
-        outs = entry(*args)
-        op = outs.op if isinstance(outs, Tensor) else outs[0].op
-        return op
-
-    def visit_For(self, node):
-        iter_var, low, ext, kind = self.visit(node.iter)
-        _internal_assert(
-            isinstance(node.target, ast.Name), "The loop iterator should be a variable!"
-        )
-
-        _name = node.target.id
-
-        if isinstance(kind, tuple):
-            low = self.analyzer.simplify(low)
-            ext = self.analyzer.simplify(ext)
-            _internal_assert(
-                isinstance(low, _expr.ConstExpr) and isinstance(ext, _expr.ConstExpr),
-                "Const range should start from a const " + "and iterate const times",
-            )
-
-            low, ext = low.value, ext.value
-            if ext > 114514:
-                logging.log(
-                    logging.CRITICAL, "[Warning] Are you sure to unroll a large loop in Python?"
-                )
-
-            bodies = []
-            for i in range(low, low + ext):
-                self.add_symbol(_name, Symbol.ConstLoopVar, i)
-                body = visit_list_to_block(self.visit, node.body)
-                body = self.wrap_up_realize(node, body)
-                bodies.append(body)
-                self.symbols.pop(_name)
-            return concat_list_to_block(bodies)
-
-        if iter_var is None:
-            _internal_assert(kind is not None, "The loop iterating function parse error!")
-            if isinstance(ext, _expr.PrimExpr):
-                dtype = ext.dtype
-            elif isinstance(ext, int):
-                dtype = "int32"
-            else:
-                raise NotImplementedError(f"Unsupported type of ext: {type(ext)}")
-            offset = iter_var = tvm.te.var(_name, dtype=dtype)
-            if not tvm.tir.analysis.expr_deep_equal(low, tvm.runtime.const(0, "int32")):
-                offset = iter_var + low
-            self.add_symbol(_name, Symbol.LoopVar, offset)
-            _body = visit_list_to_block(self.visit, node.body)
-        else:
-            _internal_assert(kind is None, "The loop bind function parse error!")
-            self.add_symbol(_name, Symbol.ThreadBind, iter_var)
-            self.device += 1
-            _body = visit_list_to_block(self.visit, node.body)
-            self.device -= 1
-
-        _body = self.wrap_up_realize(node, _body)
-
-        if kind is None:
-            res = _body
-        else:
-            _internal_assert(
-                not isinstance(kind, tuple), "Micro expansion should be handled before!"
-            )
-            res = tvm.tir.For(iter_var, tvm.runtime.const(0, "int32"), ext, kind, _body)
-
-        self.symbols.pop(_name)
-        return res
-
-    def visit_Return(self, node):
-        _internal_assert(
-            all(ty != Symbol.LoopVar for ty, _ in self.symbols.values()),
-            "Return should not be in a loop body!",
-        )
-        ids = []
-        if isinstance(node.value, ast.Name):
-            ids = [node.value.id]
-        else:
-            _internal_assert(
-                isinstance(node.value, ast.Tuple),
-                "You should return either a single tensor or a tuple",
-            )
-            _internal_assert(
-                all(isinstance(i, ast.Name) for i in node.value.elts), "What do you return?"
-            )
-            ids = [i.id for i in node.value.elts]
-        _internal_assert(len(set(ids)) == len(ids), "Duplicated tensors in the return tuples")
-        if len(ids) < len(self.outputs):
-            logging.log(logging.CRITICAL, "[Warning] Not all the output buffers returned!")
-        self.outputs = [self.symbols[i][1] for i in ids]
-        self.returned = True
-        return utils.make_nop()
-
-    def visit_Tuple(self, node):
-        return tuple(self.visit(i) for i in node.elts)
-
-    def visit_Str(self, node):
-        return node.s
-
-    def visit_Assert(self, node):
-        test = self.visit(node.test)
-        mesg = tvm.runtime.convert(self.visit(node.msg))
-        return tvm.tir.AssertStmt(test, mesg, utils.make_nop())
-
-
-def parse_python(src, args, symbols, closure_vars):
-    """The helper function of calling the AST visitor
-
-    Parameters
-    ----------
-    src : ast.node or str
-        If an ast.node, then directly lower it.
-        If a str, then parse it to ast and lower it.
-
-    args : list of Tensors or Vars
-        The argument lists to the function.
-        It is NOT encouraged to write a function without arguments.
-        It is NOT encouraged to write a function with side effect.
-
-    symbols : list of str
-        The symbol list of the global context of the function.
-
-    closure_vars: dict
-        A dict of external name reference captured by this function.
-
-    Returns
-    -------
-    root : Stmt
-        The result Halide IR and the parser class instance.
-    """
-    root = ast.parse(src) if isinstance(src, str) else src
-    _internal_assert(root, ast.AST)
-    var_usage = determine_variable_usage(root, args, symbols, closure_vars)
-    parser = HybridParser(args, var_usage, symbols, closure_vars)
-    parser.parsed_body = parser.visit(root)
-    _internal_assert(parser.returned, "No valid return found in the function body!")
-    return parser
-
-
-def source_to_op(src, args, symbols, closure_vars):
-    """Another level of wrapper
-
-    Parameters
-    ----------
-    src : ast.node or str
-        If an ast.node, then directly lower it.
-        If a str, then parse it to ast and lower it.
-
-    args : list of Tensors or Vars
-        The argument lists to the function.
-        It is NOT encouraged to write a function without arguments.
-        It is NOT encouraged to write a function with side effect.
-
-    symbols : list of str
-        The symbol list of the global context of the function.
-
-    closure_vars: dict
-        A dict of external name reference captured by this function.
-
-    Returns
-    -------
-    res : list of output tensors
-        The result of output tensors of the formed OpNode.
-    """
-    parser = parse_python(src, args, symbols, closure_vars)
-
-    input_tensors = []
-
-    def get_input_tensors(arg):
-        if isinstance(arg, Tensor):
-            input_tensors.append(arg)
-        elif isinstance(arg, Array):
-            for i in arg:
-                get_input_tensors(i)
-
-    for i in args:
-        get_input_tensors(i)
-    op = tvm.te._ffi_api.HybridOp(
-        parser.func_name, "HybridOp", None, input_tensors, parser.outputs, parser.parsed_body
-    )
-    res = [op.output(i) for i in range(len(parser.outputs))]
-    return res[0] if len(res) == 1 else res
diff --git a/python/tvm/te/hybrid/preprocessor.py b/python/tvm/te/hybrid/preprocessor.py
deleted file mode 100644
index 6af584060e9b..000000000000
--- a/python/tvm/te/hybrid/preprocessor.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Determines the declaration, r/w status, and last use of each variable"""
-
-import ast
-import sys
-from .runtime import HYBRID_GLOBALS
-from .utils import _internal_assert
-
-
-class PyVariableUsage(ast.NodeVisitor):
-    """The vistor class to determine the declaration, r/w status, and last use of each variable"""
-
-    # pylint: disable=invalid-name
-    # pylint: disable=missing-docstring
-    def __init__(self, args, symbols, closure_vars):
-        self.status = {}
-        self.scope_level = []
-        self._args = {}
-        self.args = args
-        self.aug_assign_ = False
-        self.symbols = symbols
-        self.closure_vars = closure_vars
-
-    def visit_FunctionDef(self, node):
-        self.scope_level.append(node)
-        _internal_assert(
-            len(node.args.args) == len(self.args),
-            "#arguments passed should be the same as #arguments defined",
-        )
-        for idx, arg in enumerate(node.args.args):
-            _attr = "id" if sys.version_info[0] < 3 else "arg"  # To make py2 and 3 compatible
-            self._args[getattr(arg, _attr)] = self.args[idx]
-        for i in node.body:
-            self.visit(i)
-
-    def visit_For(self, node):
-        _internal_assert(isinstance(node.target, ast.Name), "For's iterator should be an id")
-        self.visit(node.iter)
-        self.scope_level.append(node)
-        for i in node.body:
-            self.visit(i)
-        self.scope_level.pop()
-
-    def visit_Call(self, node):
-        # No function pointer supported so far
-        _internal_assert(isinstance(node.func, ast.Name), "Function call should be an id")
-        func_id = node.func.id
-        _internal_assert(
-            func_id
-            in list(HYBRID_GLOBALS.keys())
-            + ["range", "max", "min", "len"]
-            + list(self.symbols.keys()),
-            "Function call id " + func_id + " not in intrinsics' list",
-        )
-        for elem in node.args:
-            self.visit(elem)
-
-    def visit_AugAssign(self, node):
-        self.aug_assign_ = True
-        self.generic_visit(node)
-        self.aug_assign_ = False
-
-    def visit_Name(self, node):
-        # If it is True or False, we do not worry about it!
-        if sys.version_info[0] == 2 and node.id in ["True", "False"]:
-            return
-        # If it is from the argument list or loop variable, we do not worry about it!
-        if node.id in self._args.keys():
-            return
-        fors = [loop.target.id for loop in self.scope_level if isinstance(loop, ast.For)]
-        if node.id in fors:
-            return
-        # The loop variable cannot be overwritten when iteration
-        _internal_assert(
-            not isinstance(node.ctx, ast.Store) or node.id not in fors,
-            "Iter var cannot be overwritten",
-        )
-
-        if node.id not in self.status.keys():
-            # It is a captured value in closure
-            if node.id in self.closure_vars:
-                try:
-                    ast.literal_eval(str(self.closure_vars[node.id]))
-                except ValueError:
-                    raise ValueError("Only support capturing constant values in closure")
-                return
-
-            _internal_assert(isinstance(node.ctx, ast.Store), f"Undeclared variable {node.id}")
-            if self.aug_assign_:
-                raise ValueError('"First store" cannot be an AugAssign')
-            self.status[node.id] = (node, self.scope_level[-1], set())
-        else:
-            decl, loop, usage = self.status[node.id]
-            usage.add(type(node.ctx))
-            _internal_assert(
-                loop in self.scope_level, f"{node.id} is used out of the scope it is defined!"
-            )
-            self.status[node.id] = (decl, loop, usage)
-
-
-def determine_variable_usage(root, args, symbols, closure_vars):
-    """The helper function for calling the dedicated visitor."""
-    visitor = PyVariableUsage(args, symbols, closure_vars)
-    visitor.visit(root)
-    return visitor.status
diff --git a/python/tvm/te/hybrid/runtime.py b/python/tvm/te/hybrid/runtime.py
deleted file mode 100644
index 615bd7e43a7d..000000000000
--- a/python/tvm/te/hybrid/runtime.py
+++ /dev/null
@@ -1,175 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Intrinsics of TVM-Python Hybrid Script for Python emulation runtime"""
-
-import numpy
-from tvm.target import Target
-
-
-class bind(object):  # pylint: disable=invalid-name
-    """GPU bind software emulataion runtime."""
-
-    def __init__(self, _, ext):
-        self.ext = ext
-
-    def __iter__(self):
-        i = 0
-        while i < self.ext:
-            yield i
-            i += 1
-
-
-def allocate(shape, dtype="float32", scope="global"):  # pylint: disable=unused-argument
-    """Allocate a buffer with given shape
-
-    Parameters
-    ----------
-    shape: Tuple
-        The shape of the tensor to be allocated
-    dtype: string
-        The data type of the tensor
-    scope: string
-        The storage scope of the tensor
-
-    Returns
-    -------
-    tensor: numpy.array
-        The tensor allocated
-    """
-    return numpy.zeros(shape).astype(dtype)
-
-
-def rsqrt(x):
-    """
-    Computes reciprocal of square root of x element-wise
-
-    Parameters
-    ----------
-    x: Tensor
-
-    Returns
-    -------
-    res: Tensor
-        The result of reciprocal of square root of x
-    """
-    return numpy.ones_like(x) / numpy.sqrt(x)
-
-
-def popcount(x):
-    """
-    Count ones in the binary representation of number x
-
-    Parameters
-    ----------
-    x: Integer
-        The number to be counted
-
-    Returns
-    -------
-    cnt: Integer
-        The number of ones in the binary representation of number x
-    """
-    cnt = 0
-    while x:
-        x -= x & -x
-        cnt += 1
-    return cnt
-
-
-def sigmoid(x):
-    """
-    Sigmoid function of x, aka 1/(1+exp(-x)).
-
-    Parameters
-    ----------
-    x: a real number
-
-    Returns
-    -------
-    res: a real number
-        The result of sigmoid function
-    """
-    return 1 / (1 + numpy.exp(-x))
-
-
-def max_num_threads(allow_none=True):
-    """Get max number of threads for GPU targets."""
-    return Target.current(allow_none).max_num_threads
-
-
-def inf(dtype):
-    return numpy.iinfo(dtype).max
-
-
-def ninf(dtype):
-    return numpy.iinfo(dtype).min
-
-
-HYBRID_GLOBALS = {
-    "unroll": range,
-    "vectorize": range,
-    "parallel": range,
-    "const_range": range,
-    "bind": bind,
-    "allocate": allocate,
-    "output_tensor": allocate,
-    "sqrt": numpy.sqrt,
-    "rsqrt": rsqrt,
-    "log": numpy.log,
-    "tanh": numpy.tanh,
-    "power": numpy.power,
-    "exp": numpy.exp,
-    "sigmoid": sigmoid,
-    "popcount": popcount,
-    "round": round,
-    "likely": lambda cond: cond,
-    "uint8": numpy.uint8,
-    "uint16": numpy.uint16,
-    "uint32": numpy.uint32,
-    "uint64": numpy.uint64,
-    "int8": numpy.int8,
-    "int16": numpy.int16,
-    "int32": numpy.int32,
-    "int64": numpy.int64,
-    "float16": numpy.float16,
-    "float32": numpy.float32,
-    "float64": numpy.float64,
-    "ceil_div": lambda a, b: (a + b - 1) // b,
-    "max_num_threads": max_num_threads,
-    "inf": inf,
-    "ninf": inf,
-}
-
-
-def _enter_hybrid_runtime(func):
-    """Put hybrid runtime variables into the global scope"""
-    _globals = func.__globals__
-    intersect = []
-    for elem in list(HYBRID_GLOBALS.keys()):
-        if elem in _globals.keys():
-            intersect.append((elem, _globals[elem]))
-        _globals[elem] = HYBRID_GLOBALS[elem]
-    return intersect
-
-
-def _restore_runtime(func, intersect):
-    """Rollback the modification caused by hybrid runtime"""
-    _globals = func.__globals__
-    for elem in list(HYBRID_GLOBALS.keys()):
-        _globals.pop(elem)
-    for k, v in intersect:
-        _globals[k] = v
diff --git a/python/tvm/te/hybrid/utils.py b/python/tvm/te/hybrid/utils.py
deleted file mode 100644
index a515938fa524..000000000000
--- a/python/tvm/te/hybrid/utils.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=inconsistent-return-statements
-"""Internal utilities for parsing Python subset to TIR"""
-
-import ast
-import inspect
-import logging
-import sys
-import numpy
-
-import tvm.runtime
-from tvm._ffi.base import numeric_types
-from tvm.ir.container import Array
-
-from tvm.tir import expr as _expr
-from tvm.tir import stmt as _stmt
-from tvm.te.tensor import Tensor
-
-
-# pylint: disable=invalid-name
-np_arg_types = (numpy.ndarray, *numeric_types)
-tvm_arg_types = (Tensor, Array, _expr.Var, _expr.ConstExpr, *numeric_types, list, tuple, str)
-halide_imm_types = (_expr.IntImm, _expr.FloatImm, *numeric_types)
-
-
-def _internal_assert(cond, err):
-    """Simplify the code segment like if not XXX then raise an error"""
-    if not cond:
-        raise ValueError(err)
-
-
-# Useful constants. In avoid of runtime dependences, we use function calls to return them.
-def make_nop():
-    """Returns a 'no operation' node in HalideIR."""
-    return _stmt.Evaluate(tvm.runtime.const(0, dtype="int32"))
-
-
-def is_docstring(node):
-    """Checks if a Python AST node is a docstring"""
-    return isinstance(node, ast.Expr) and isinstance(node.value, ast.Str)
-
-
-def _pruned_source(func):
-    """Prune source code's extra leading spaces"""
-    try:
-        lines = inspect.getsource(func).split("\n")
-        leading_space = len(lines[0]) - len(lines[0].lstrip(" "))
-        lines = [line[leading_space:] for line in lines]
-        return "\n".join(lines)
-    except IOError as err:
-        if sys.version_info[0] == 2 and str(err) == "could not get source code":
-            logging.log(
-                logging.CRITICAL,
-                "This module is not fully operated under Python2... " "Please move to Python3!",
-            )
-            raise err
-
-
-def replace_io(body, rmap):
-    """Replacing tensors usage according to the dict given"""
-    # pylint: disable=import-outside-toplevel
-    from tvm.tir import stmt_functor
-
-    def replace(op):
-        if isinstance(op, _stmt.ProducerStore) and op.producer.op in rmap.keys():
-            buf = rmap[op.producer.op]
-            return _stmt.ProducerStore(buf, op.value, op.indices)
-        if isinstance(op, _expr.ProducerLoad) and op.producer.op in rmap.keys():
-            buf = rmap[op.producer.op]
-            return _expr.ProducerLoad(buf, op.indices)
-        return None
-
-    return stmt_functor.ir_transform(body, None, replace, ["tir.ProducerStore", "tir.ProducerLoad"])
-
-
-def _is_tvm_arg_types(args):
-    """Determine a list of element is either a list of tvm arguments of a list of numpy arguments.
-    If neither is true, raise a value error."""
-    if all(isinstance(elem, tvm_arg_types) for elem in args):
-        return True
-    elif all(isinstance(elem, np_arg_types) for elem in args):
-        return False
-    else:
-        raise ValueError(
-            f"Expected arguments to be entirely TVM types, "
-            f"or entirely numpy types, "
-            f"but received {[type(elem) for elem in args]}"
-        )
diff --git a/python/tvm/te/operation.py b/python/tvm/te/operation.py
index 63a3ecd57b1c..a9681c6df040 100644
--- a/python/tvm/te/operation.py
+++ b/python/tvm/te/operation.py
@@ -620,3 +620,6 @@ def tir_matmul(a: T.handle, b: T.handle, c: T.handle) -> None:
     if not isinstance(ops, (list, tuple, Array)):
         ops = [ops]
     return _ffi_api.CreatePrimFunc(ops, index_dtype_override)
+
+
+AXIS_SEPARATOR = tvm.tir.IndexMap.AXIS_SEPARATOR
diff --git a/python/tvm/te/schedule.py b/python/tvm/te/schedule.py
deleted file mode 100644
index 87a4eda728df..000000000000
--- a/python/tvm/te/schedule.py
+++ /dev/null
@@ -1,665 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-import
-"""The computation schedule api of TVM."""
-import collections
-import inspect
-from typing import Callable, List
-
-import tvm._ffi
-from tvm._ffi.base import string_types
-from tvm.ir import container as _container
-from tvm.runtime import Object, convert
-from tvm.tir import Buffer, IndexMap, IterVar, Var
-
-from . import _ffi_api
-from . import tensor as _tensor
-
-
-@tvm._ffi.register_object
-class Split(Object):
-    """Split operation on axis."""
-
-
-@tvm._ffi.register_object
-class Fuse(Object):
-    """Fuse operation on axis."""
-
-
-@tvm._ffi.register_object
-class Singleton(Object):
-    """Singleton axis."""
-
-
-def create_schedule(ops):
-    """Create a schedule for list of ops
-
-    Parameters
-    ----------
-    ops : list of Operations
-        The source expression.
-
-    Returns
-    -------
-    sch : schedule.Schedule
-        The created schedule.
-    """
-    if not isinstance(ops, (list, _container.Array)):
-        ops = [ops]
-    return _ffi_api.CreateSchedule(ops)
-
-
-@tvm._ffi.register_object
-class Schedule(Object):
-    """Schedule for all the stages."""
-
-    def __getitem__(self, k):
-        if isinstance(k, _tensor.Tensor):
-            k = k.op
-        if not isinstance(k, _tensor.Operation):
-            raise ValueError("Expect schedule key to be Tensor or Operation")
-        if k not in self.stage_map:
-            raise ValueError(f"Cannot find the operation {k} in schedule")
-        return self.stage_map[k]
-
-    def normalize(self):
-        """Build a normalized schedule from the current schedule.
-
-        Insert necessary rebase to make certain iter var to start from 0.
-        This is needed before bound inference and followup step.
-
-        Returns
-        -------
-        sch : Schedule
-            The normalized schedule.
-        """
-        return _ffi_api.ScheduleNormalize(self)
-
-    def create_group(self, outputs, inputs, include_inputs=False):
-        """Create stage group by giving output and input boundary.
-
-        The operators between outputs and inputs are placed as member of group.
-        outputs are include in the group, while inputs are not included.
-
-        Parameters
-        ----------
-        outputs : list of Tensors
-            The outputs of the group.
-
-        inputs : list of Tensors
-            The inputs of the group.
-
-        include_inputs : boolean, optional
-            Whether include input operations in the group if they are used by outputs.
-
-        Returns
-        -------
-        group : Stage
-            A virtual stage represents the group, user can use compute_at to move
-            the attachment point of the group.
-        """
-        if isinstance(outputs, _tensor.Tensor):
-            outputs = [outputs]
-        if isinstance(inputs, _tensor.Tensor):
-            inputs = [inputs]
-        return _ffi_api.ScheduleCreateGroup(self, outputs, inputs, include_inputs)
-
-    def cache_read(self, tensor, scope, readers):
-        """Create a cache read of original tensor for readers.
-
-        This will mutate the body of the readers.
-        A new cache stage will be created for the tensor.
-        Call this before doing any split/fuse schedule.
-
-        Parameters
-        ----------
-        tensor : Tensor
-            The tensor to be cached.
-        scope : str
-            The scope of cached
-        readers : list of Tensor or Operation
-            The readers to read the cache.
-
-        Returns
-        -------
-        cache : Tensor
-            The created cache tensor.
-        """
-        if isinstance(readers, (_tensor.Tensor, _tensor.Operation)):
-            readers = [readers]
-        readers = [t.op if isinstance(t, _tensor.Tensor) else t for t in readers]
-        return _ffi_api.ScheduleCacheRead(self, tensor, scope, readers)
-
-    def cache_write(self, tensor, scope):
-        """Create a cache write of original tensor, before storing into tensor.
-
-        This will mutate the body of the tensor.
-        A new cache stage will created before feed into the tensor.
-
-        This function can be used to support data layout transformation.
-        If there is a split/fuse/reorder on the data parallel axis of tensor
-        before cache_write is called. The intermediate cache stores
-        the data in the layout as the iteration order of leave axis.
-        The data will be transformed back to the original layout in the original tensor.
-        User can further call compute_inline to inline the original layout and keep
-        the data stored in the transformed layout.
-
-        Parameters
-        ----------
-        tensor : Tensor, list or tuple
-            The tensors to be feed to. All the tensors must be produced by one computeOp
-        scope : str
-            The scope of cached
-
-        Returns
-        -------
-        cache : Tensor
-            The created cache tensor.
-        """
-        return _ffi_api.ScheduleCacheWrite(self, tensor, scope)
-
-    def rfactor(self, tensor, axis, factor_axis=0):
-        """Factor a reduction axis in tensor's schedule to be an explicit axis.
-
-        This will create a new stage that generated the new tensor with axis
-        as the first dimension. The tensor's body will be rewritten as a reduction
-        over the factored tensor.
-
-        Parameters
-        ----------
-        tensor : Tensor
-            The tensor to be factored.
-        axis : IterVar
-            The reduction axis in the schedule to be factored.
-        factor_axis : int
-            The position where the new axis is placed.
-
-        Returns
-        -------
-        tfactor : Tensor or Array of Tensor
-            The created factored tensor.
-        """
-        factored = _ffi_api.ScheduleRFactor(self, tensor, axis, factor_axis)
-        return factored[0] if len(factored) == 1 else factored
-
-
-@tvm._ffi.register_object
-class Stage(Object):
-    """A Stage represents schedule for one operation."""
-
-    def split(self, parent, factor=None, nparts=None, disable_predication=False):
-        """Split the stage either by factor providing outer scope, or both
-
-        Parameters
-        ----------
-        parent : IterVar
-             The parent iter var.
-
-        factor : Expr, optional
-             The splitting factor
-
-        nparts : Expr, optional
-             The number of outer parts.
-
-        disable_predication : bool, optional
-            If enabled, don't create a predicate for guarding the loop. This can
-            be useful when splitting with scalable factors that the schedule writer
-            knows are divisible by the loop bound.
-
-            Warning: enabling this feature may result in incorrect code generation
-            if not used carefully.
-
-        Returns
-        -------
-        outer : IterVar
-            The outer variable of iteration.
-
-        inner : IterVar
-            The inner variable of iteration.
-        """
-        if nparts is not None:
-            if factor is not None:
-                raise ValueError("Do not need to provide both outer and nparts")
-            outer, inner = _ffi_api.StageSplitByNParts(self, parent, nparts, disable_predication)
-        else:
-            if factor is None:
-                raise ValueError("Either nparts or factor need to be provided")
-            outer, inner = _ffi_api.StageSplitByFactor(self, parent, factor, disable_predication)
-        return outer, inner
-
-    def fuse(self, *args):
-        """Fuse multiple consecutive iteration variables into a single iteration variable.
-
-        fused = fuse(...fuse(fuse(args[0], args[1]), args[2]),..., args[-1])
-        The order is from outer to inner.
-
-        Parameters
-        ----------
-        args : list of IterVars
-            Itervars that proceeds each other
-
-        Returns
-        -------
-        fused : IterVar
-            The fused variable of iteration.
-        """
-        fused = _ffi_api.StageFuse(self, args)
-        return fused
-
-    def set_scope(self, scope):
-        """Set the thread scope of this stage
-
-        Parameters
-        ----------
-        scope : str
-            The thread scope of this stage
-        """
-        return _ffi_api.StageSetScope(self, scope)
-
-    def bind(self, ivar, thread_ivar):
-        """Bind ivar to thread index thread_ivar
-
-        Parameters
-        ----------
-        ivar : IterVar
-            The iteration to be binded to thread.
-
-        thread_ivar : IterVar
-            The thread to be binded.
-        """
-        _ffi_api.StageBind(self, ivar, thread_ivar)
-
-    def env_threads(self, threads):
-        """Mark threads to be launched at the outer scope of composed op.
-
-        Parameters
-        ----------
-        threads : list of threads
-            The threads to be launched.
-        """
-        if isinstance(threads, IterVar):
-            threads = [threads]
-        _ffi_api.StageEnvThreads(self, threads)
-
-    def set_store_predicate(self, predicate):
-        """Set predicate under which store to the array can be performed.
-
-        Use this when there are duplicated threads doing the same store and we only
-        need one of them to do the store.
-
-        Parameters
-        ----------
-        predicate : Expr
-            The guard condition fo store.
-        """
-        _ffi_api.StageSetStorePredicate(self, predicate)
-
-    def compute_at(self, parent, scope):
-        """Attach the stage at parent's scope
-
-        Parameters
-        ----------
-        parent : Stage
-            The parent stage
-
-        scope : IterVar
-            The loop scope t be attached to.
-        """
-        _ffi_api.StageComputeAt(self, parent, scope)
-
-    def compute_inline(self):
-        """Mark stage as inline
-
-        Parameters
-        ----------
-        parent : Stage
-            The parent stage
-        """
-        _ffi_api.StageComputeInline(self)
-
-    def compute_root(self):
-        """Attach the stage at parent, and mark it as root
-
-        Parameters
-        ----------
-        parent : Stage
-            The parent stage
-        """
-        _ffi_api.StageComputeRoot(self)
-
-    def reorder(self, *args):
-        """reorder the arguments in the specified order.
-
-        Parameters
-        ----------
-        args : list of IterVar
-            The order to be ordered
-        """
-        _ffi_api.StageReorder(self, args)
-
-    def tile(self, x_parent, y_parent, x_factor, y_factor):
-        """Perform tiling on two dimensions
-
-        The final loop order from outmost to inner most are
-        [x_outer, y_outer, x_inner, y_inner]
-
-        Parameters
-        ----------
-        x_parent : IterVar
-            The original x dimension
-        y_parent : IterVar
-            The original y dimension
-        x_factor : Expr
-            The stride factor on x axis
-        y_factor : Expr
-            The stride factor on y axis
-
-        Returns
-        -------
-        x_outer : IterVar
-            Outer axis of x dimension
-        y_outer : IterVar
-            Outer axis of y dimension
-        x_inner : IterVar
-            Inner axis of x dimension
-        p_y_inner : IterVar
-            Inner axis of y dimension
-        """
-        x_outer, y_outer, x_inner, y_inner = _ffi_api.StageTile(
-            self, x_parent, y_parent, x_factor, y_factor
-        )
-        return x_outer, y_outer, x_inner, y_inner
-
-    def vectorize(self, var):
-        """Vectorize the iteration.
-
-        Parameters
-        ----------
-        var : IterVar
-            The iteration to be vectorize
-        """
-        _ffi_api.StageVectorize(self, var)
-
-    def tensorize(self, var, tensor_intrin):
-        """Tensorize the computation enclosed by var with tensor_intrin
-
-        Parameters
-        ----------
-        var : IterVar
-            The iteration boundary of tensorization.
-
-        tensor_intrin : TensorIntrin
-            The tensor intrinsic used for computation.
-        """
-        _ffi_api.StageTensorize(self, var, tensor_intrin)
-
-    def unroll(self, var):
-        """Unroll the iteration.
-
-        Parameters
-        ----------
-        var : IterVar
-            The iteration to be unrolled.
-        """
-        _ffi_api.StageUnroll(self, var)
-
-    def parallel(self, var):
-        """Parallelize the iteration.
-
-        Parameters
-        ----------
-        var : IterVar
-            The iteration to be parallelized.
-        """
-        _ffi_api.StageParallel(self, var)
-
-    def pragma(self, var, pragma_type, pragma_value=None):
-        """Annotate the iteration with pragma
-
-        This will translate to a pragma_scope surrounding
-        the corresponding loop generated.
-        Useful to support experimental features and extensions.
-
-        Parameters
-        ----------
-        var : IterVar
-            The iteration to be anotated
-
-        pragma_type : str
-             The pragma string to be annotated
-
-        pragma_value : Expr, optional
-             The pragma value to pass along the pragma
-
-        Note
-        ----
-        Most pragmas are advanced/experimental features
-        and may subject to change. List of supported pragmas:
-
-        - **debug_skip_region**
-
-          Force skip the region marked by the axis and turn it into no-op.
-          This is useful for debug purposes.
-
-        - **parallel_launch_point**
-
-          Specify to launch parallel threads outside the
-          specified iteration loop. By default the threads
-          launch at the point of parallel construct.
-          This pragma moves the launching point to even outer scope.
-          The threads are launched once and reused across multiple
-          parallel constructs as BSP style program.
-
-        - **parallel_barrier_when_finish**
-
-          Insert a synchronization barrier between working threads
-          after the specified loop iteration finishes.
-
-        - **parallel_stride_pattern**
-
-          Hint parallel loop to execute in strided pattern.
-          :code:`for (int i = task_id; i < end; i += num_task)`
-
-        """
-        if isinstance(pragma_value, string_types):
-            pragma_value = convert(pragma_value)
-        _ffi_api.StagePragma(self, var, pragma_type, pragma_value)
-
-    def prefetch(self, tensor, var, offset):
-        """Prefetch the specified variable
-
-        Parameters
-        ----------
-        tensor : Tensor
-            The tensor to be prefetched
-        var : IterVar
-            The loop point at which the prefetching is applied
-        offset : Expr
-            The number of iterations to be prefetched before actual execution
-        """
-        _ffi_api.StagePrefetch(self, tensor, var, offset)
-
-    def storage_align(self, axis, factor, offset):
-        """Set alignment requirement for specific axis
-
-        This ensures that stride[axis] == k * factor + offset for some k.
-        This is useful to set memory layout to for more friendly memory
-        access pattern. For example, we can set alignment to be
-        factor=2, offset=1 to avoid bank conflict for thread access on
-        higher dimension in GPU shared memory.
-
-        Parameters
-        ----------
-        axis : IterVar
-            The axis dimension to be aligned.
-        factor : int
-            The factor in alignment specification.
-        offset : int
-            The offset in the alignment specification.
-        """
-        _ffi_api.StageStorageAlign(self, axis, factor, offset)
-
-    def double_buffer(self):
-        """Compute the current stage via double buffering.
-
-        This can only be applied to intermediate stage.
-        This will double the storage cost of the current stage.
-        Can be useful to hide load latency.
-        """
-        _ffi_api.StageDoubleBuffer(self)
-
-    def rolling_buffer(self):
-        """Compute the current stage via rolling buffering.
-
-        This can only be applied to intermediate stage.
-        This will change the storage cost of the current stage.
-        """
-        _ffi_api.StageRollingBuffer(self)
-
-    def transform_layout(self, mapping_function: Callable[..., List[tvm.tir.PrimExpr]]):
-        """Defines the layout transformation for the current stage's tensor.
-
-        The map from initial_indices to final_indices must be an
-        invertible affine transformation.  This method may be called
-        more than once for a given tensor, in which case each
-        transformation is applied sequentially.
-
-        If the stage is a ComputeOp, then the iteration order of the
-        compute stage is rewritten to be a row-major traversal of the
-        tensor, and the new loop iteration variables are returned.
-        For all other stages, the loop iteration order is unmodified,
-        and the return value is None.
-
-        Parameters
-        ----------
-        mapping_function : Callable[..., List[tvm.tir.PrimExpr]]
-
-            A callable that accepts N arguments of type tvm.tir.Var,
-            and outputs a list of PrimExpr.  The input arguments
-            represent the location of a value in the current stage's
-            tensor, using the pre-transformation layout.  The return
-            value of the function gives the location of that value in
-            the current stage's tensor, using the post-transformation
-            layout.
-
-        Returns
-        -------
-        new_iter_vars : Optional[List[tvm.tir.IterVar]]
-
-            If the stage is a ComputeOp, then the return will be the
-            updated loop iteration variables over the data array, in
-            the same order as the output values from the
-            `mapping_function`.
-
-            Otherwise, the return value is None.
-
-        Examples
-        --------
-        .. code-block:: python
-
-            # ``A`` is a tensor whose compute definition is in NHWC
-            # format, and should be transformed into NCHWc format.
-
-            s[A].transform_layout(
-                lambda n,h,w,c: [n, c//4, h, w, c%4]
-            )
-
-
-        .. code-block:: python
-
-            # ``A`` is a tensor whose compute definition is in an
-            # arbitrary format, and should be transformed such that
-            # the last index is split, with the slower-changing index
-            # of the split placed at the slowest changing dimension.
-
-            s[A].transform_layout(
-                lambda *indices, i: [i//4, *indices, i%4]
-            )
-
-        .. code-block:: python
-
-            # ``B`` is a tensor defined by te.compute to be a copy of
-            # ``A`, and should be transformed such that ``B``'s layout
-            # is a transpose of ``A``'s layout.  The loop iteration
-            # that computes ``B`` will correspond to ``B``'s memory
-            # layout.
-
-            A = te.placeholder([n,m])
-            B = te.compute(A.shape, lambda i,j: A[i,j])
-            s = te.create_schedule(B.op)
-
-            s[B].transform_layout(lambda i,j: [j,i])
-
-        """
-
-        ndim = len(self.op.output(0).shape)
-        index_map, axis_separators = IndexMap.from_func_with_separators(
-            mapping_function, ndim=ndim, index_dtype="int32"
-        )
-
-        new_iter_vars = _ffi_api.StageTransformLayout(
-            self, index_map.initial_indices, index_map.final_indices
-        )
-        _ffi_api.StageSetAxisSeparators(self, axis_separators)
-
-        return new_iter_vars or None
-
-
-@tvm._ffi.register_object
-class SpecializedCondition(Object):
-
-    """Specialized condition to enable op specialization."""
-
-    def __init__(self, conditions):
-        """Create a specialized condition.
-
-        .. note::
-            Conditions are represented in conjunctive joint form (CNF).
-            Each condition should be a simple expression, e.g., n > 16,
-            m % 8 == 0, etc., where n, m are tvm.Var that represents a
-            dimension in the tensor shape.
-
-        Parameters
-        ----------
-        conditions : List of tvm.Expr
-            List of conditions in conjunctive joint form (CNF).
-        """
-        if not isinstance(conditions, (list, _container.Array)):
-            conditions = [conditions]
-        self.__init_handle_by_constructor__(_ffi_api.CreateSpecializedCondition, conditions)
-
-    @staticmethod
-    def current():
-        """Returns the current specialized condition"""
-        return _ffi_api.GetCurrentSpecialization()
-
-    def __enter__(self):
-        _ffi_api.EnterSpecializationScope(self)
-        return self
-
-    def __exit__(self, ptype, value, trace):
-        _ffi_api.ExitSpecializationScope(self)
-
-
-# Sentinel value used to indicate which groups of pre-flattening axes
-# should be used to post-flattening axes.  Moved from
-# te.AXIS_SEPARATOR to tir.IndexMap.AXIS_SEPARATOR for general use,
-# maintained here for backwards compatibility.
-AXIS_SEPARATOR = IndexMap.AXIS_SEPARATOR
-
-
-tvm._ffi._init_api("schedule", __name__)
diff --git a/python/tvm/te/tensor.py b/python/tvm/te/tensor.py
index 930667242e29..53ab9d0b5b59 100644
--- a/python/tvm/te/tensor.py
+++ b/python/tvm/te/tensor.py
@@ -190,13 +190,3 @@ def scan_axis(self):
 @tvm._ffi.register_object
 class ExternOp(Operation):
     """External operation."""
-
-
-@tvm._ffi.register_object
-class HybridOp(Operation):
-    """Hybrid operation."""
-
-    @property
-    def axis(self):
-        """Represent the IterVar axis, also defined when it is a HybridOp"""
-        return self.__getattr__("axis")
diff --git a/python/tvm/te/tensor_intrin.py b/python/tvm/te/tensor_intrin.py
deleted file mode 100644
index ff633af02d13..000000000000
--- a/python/tvm/te/tensor_intrin.py
+++ /dev/null
@@ -1,146 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Tensor intrinsics"""
-import tvm._ffi
-import tvm.tir
-
-from tvm.runtime import Object, convert
-from tvm.ir import Range
-from .tensor import PlaceholderOp
-
-from . import tensor as _tensor
-from . import _ffi_api
-
-
-def _get_region(tslice):
-    region = []
-    for idx in tslice.indices:
-        if isinstance(idx, slice):
-            assert idx.step is None
-            region.append(Range(idx.start, idx.stop))
-        else:
-            if isinstance(idx, tvm.tir.IterVar):
-                begin = idx.var
-            else:
-                begin = idx
-            region.append(Range.from_min_extent(begin, 1))
-    return region
-
-
-@tvm._ffi.register_object
-class TensorIntrin(Object):
-    """Tensor intrinsic functions for certain computation.
-
-    See Also
-    --------
-    decl_tensor_intrin: Construct a TensorIntrin
-    """
-
-    def __call__(self, *args, **kwargs):
-        tensors = [x.tensor for x in args if isinstance(x, _tensor.TensorSlice)]
-        scalar_inputs = [x for x in args if not isinstance(x, _tensor.TensorSlice)]
-        regions = [_get_region(x) for x in args if isinstance(x, _tensor.TensorSlice)]
-        reduce_axis = []
-        if "reduce_axis" in kwargs:
-            reduce_axis = kwargs["reduce_axis"]
-            if not isinstance(reduce_axis, (list, tuple)):
-                reduce_axis = [reduce_axis]
-            reduce_axis = convert(reduce_axis)
-        if scalar_inputs:
-            scalar_inputs = convert(scalar_inputs)
-        return _ffi_api.TensorIntrinCall(self, tensors, regions, reduce_axis, scalar_inputs)
-
-
-def decl_tensor_intrin(
-    op, fcompute, name="tensor_intrin", binds=None, scalar_params=None, default_buffer_params=None
-):
-    """Declare a tensor intrinsic function.
-
-    Parameters
-    ----------
-    op: Operation
-        The symbolic description of the intrinsic operation
-
-    fcompute: lambda function of inputs, outputs-> stmt
-        Specifies the IR statement to do the computation.
-        See the following note for function signature of fcompute
-
-        .. note::
-             **Parameters**
-
-             - **ins** (list of :any:`tvm.tir.Buffer`) - Placeholder for each inputs
-             - **outs** (list of :any:`tvm.tir.Buffer`) - Placeholder for each outputs
-
-             **Returns**
-
-             - **stmt** (:any:`tvm.tir.Stmt`, or tuple of three stmts)
-             - If a single stmt is returned, it represents the body
-             - If tuple of three stmts are returned they corresponds to body,
-               reduce_init, reduce_update
-
-    name: str, optional
-        The name of the intrinsic.
-
-    binds: dict of :any:`Tensor` to :any:`tvm.tir.Buffer`, optional
-        Dictionary that maps the Tensor to Buffer which specified the data layout
-        requirement of the function. By default, a new compact buffer is created
-        for each tensor in the argument.
-
-    scalar_params: a list of variables used by op, whose values will be passed
-                   as scalar_inputs when the tensor intrinsic is called.
-
-    default_buffer_params: Optional[dict]
-        Dictionary of buffer arguments to be passed when constructing a buffer.
-
-    Returns
-    -------
-    intrin: TensorIntrin
-        A TensorIntrin that can be used in tensorize schedule.
-    """
-    if not isinstance(op, _tensor.Operation):
-        raise TypeError("expect Operation")
-    inputs = op.input_tensors
-    binds = binds if binds else {}
-    tensors = list(inputs)
-    for i in range(op.num_outputs):
-        tensors.append(op.output(i))
-
-    binds_list = []
-    for t in inputs:
-        if not isinstance(t.op, PlaceholderOp):
-            raise ValueError("Do not yet support composition op")
-
-    default_buffer_params = {} if default_buffer_params is None else default_buffer_params
-    for t in tensors:
-        buf = (
-            binds[t]
-            if t in binds
-            else tvm.tir.decl_buffer(t.shape, t.dtype, t.op.name, **default_buffer_params)
-        )
-        binds_list.append(buf)
-
-    if scalar_params:
-        body = fcompute(binds_list[: len(inputs)], binds_list[len(inputs) :], scalar_params)
-    else:
-        body = fcompute(binds_list[: len(inputs)], binds_list[len(inputs) :])
-        scalar_params = []
-    if isinstance(body, (tvm.tir.PrimExpr, tvm.tir.Stmt)):
-        body = [body]
-    body = [tvm.tir.Evaluate(x) if isinstance(x, tvm.tir.PrimExpr) else x for x in body]
-    if len(body) < 3:
-        body += [None] * (3 - len(body))
-    return _ffi_api.TensorIntrin(name, op, inputs, binds_list, scalar_params, *body)
diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index 8df32c810543..b3123a20d3e9 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -327,8 +327,7 @@ def _compute_body(*us):
 
         A = tvm.te.compute([r.extent.value for v, r in vranges.items()], _compute_body)
         args = [tvm.nd.empty(A.shape, A.dtype)]
-        sch = tvm.te.create_schedule(A.op)
-        mod = tvm.build(sch, [A])
+        mod = tvm.build(tvm.IRModule.from_expr(tvm.te.create_prim_func([A])))
         mod(*args)
         return args[0].numpy()
 
diff --git a/python/tvm/tir/buffer.py b/python/tvm/tir/buffer.py
index 1109cc3d66d6..72c2a40fedd2 100644
--- a/python/tvm/tir/buffer.py
+++ b/python/tvm/tir/buffer.py
@@ -304,29 +304,6 @@ def decl_buffer(
     buffer : tvm.tir.Buffer
         The created buffer
 
-    Example
-    -------
-    Here's an example of how broadcast buffer can be used to define a symbolic broadcast operation,
-
-    .. code-block:: python
-
-        m0, m1, m2 = te.var("m0"), te.var("m1"), te.var("m2")
-        n0, n1, n2 = te.var("n0"), te.var("n1"), te.var("n2")
-        o0, o1, o2 = te.var("o0"), te.var("o1"), te.var("o2")
-        A = te.placeholder((m0, m1, m2), name='A')
-        B = te.placeholder((n0, n1, n2), name='B')
-        C = te.compute((o0, o1, o2), lambda i, j, k: A[i, j, k] + B[i, j, k], name='C')
-        Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast")
-        Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast")
-        s = te.create_schedule(C.op)
-        fadd = tvm.build(s, [A, B, C], target='llvm', name='bcast_add', binds={A:Ab, B:Bb})
-        dev = tvm.cpu(0)
-        a = tvm.nd.array(np.random.uniform(size=(2, 4, 3)).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=(2, 1, 3)).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros((2, 4, 3), dtype=C.dtype), dev)
-        fadd(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
     Note
     ----
     Buffer data structure reflects the DLTensor structure in dlpack.
diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py
index 3588c04d8fa2..1de6941c9923 100644
--- a/python/tvm/topi/__init__.py
+++ b/python/tvm/topi/__init__.py
@@ -39,9 +39,7 @@
 from .sort import *
 from .scatter import *
 from .scatter_elements import *
-from .sparse_fill_empty_rows import *
 from .sparse_reshape import *
-from .argwhere import *
 from .scan import *
 from .einsum import *
 from .unique import *
@@ -49,9 +47,7 @@
 from .signal import *
 from . import nn
 from . import utils
-from . import vision
 from . import image
-from . import random
 from . import gpu
 
 # error reporting
diff --git a/python/tvm/topi/argwhere.py b/python/tvm/topi/argwhere.py
deleted file mode 100644
index c2b658a4e92f..000000000000
--- a/python/tvm/topi/argwhere.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-arguments, too-many-nested-blocks
-"""Argwhere operator"""
-import tvm
-from tvm.te import hybrid
-
-
-@hybrid.script
-def hybrid_argwhere_1d(output_shape, condition):
-    """Find the indices of elements of a 1-D tensor that are non-zero.
-
-    Parameters
-    ----------
-    condition : tvm.te.Tensor
-        1-D tensor with boolean values.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        Indices of non-zero elements.
-    """
-    a = output_tensor(output_shape, "int32")
-    a1 = condition.shape[0]
-    valid_index = 0
-    for i1 in range(a1):
-        if condition[i1] != 0:
-            a[valid_index, 0] = i1
-            valid_index += 1
-    return a
-
-
-@hybrid.script
-def hybrid_argwhere_2d(output_shape, condition):
-    """Find the indices of elements of a 2-D tensor that are non-zero.
-
-    Parameters
-    ----------
-    condition : tvm.te.Tensor
-        2-D tensor with boolean values.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        Indices of non-zero elements.
-    """
-    a = output_tensor(output_shape, "int32")
-    a1 = condition.shape[0]
-    a2 = condition.shape[1]
-    valid_index = 0
-    for i1 in range(a1):
-        for i2 in range(a2):
-            if condition[i1, i2] != 0:
-                a[valid_index, 0] = i1
-                a[valid_index, 1] = i2
-                valid_index += 1
-    return a
-
-
-@hybrid.script
-def hybrid_argwhere_3d(output_shape, condition):
-    """Find the indices of elements of a 3-D tensor that are non-zero.
-
-    Parameters
-    ----------
-    condition : tvm.te.Tensor
-        3-D tensor with boolean values.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        Indices of non-zero elements.
-    """
-    a = output_tensor(output_shape, "int32")
-    a1 = condition.shape[0]
-    a2 = condition.shape[1]
-    a3 = condition.shape[2]
-    valid_index = 0
-    for i1 in range(a1):
-        for i2 in range(a2):
-            for i3 in range(a3):
-                if condition[i1, i2, i3] != 0:
-                    a[valid_index, 0] = i1
-                    a[valid_index, 1] = i2
-                    a[valid_index, 2] = i3
-                    valid_index += 1
-    return a
-
-
-@hybrid.script
-def hybrid_argwhere_4d(output_shape, condition):
-    """Find the indices of elements of a 4-D tensor that are non-zero.
-
-    Parameters
-    ----------
-    condition : tvm.te.Tensor
-        4-D tensor with boolean values.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        Indices of non-zero elements.
-    """
-    a = output_tensor(output_shape, "int32")
-    a1 = condition.shape[0]
-    a2 = condition.shape[1]
-    a3 = condition.shape[2]
-    a4 = condition.shape[3]
-    valid_index = 0
-    for i1 in range(a1):
-        for i2 in range(a2):
-            for i3 in range(a3):
-                for i4 in range(a4):
-                    if condition[i1, i2, i3, i4] != 0:
-                        a[valid_index, 0] = i1
-                        a[valid_index, 1] = i2
-                        a[valid_index, 2] = i3
-                        a[valid_index, 3] = i4
-                        valid_index += 1
-    return a
-
-
-@hybrid.script
-def hybrid_argwhere_5d(output_shape, condition):
-    """Find the indices of elements of a 5-D tensor that are non-zero.
-
-    Parameters
-    ----------
-    condition : tvm.te.Tensor
-        5-D tensor with boolean values.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        Indices of non-zero elements.
-    """
-    a = output_tensor(output_shape, "int32")
-    a1 = condition.shape[0]
-    a2 = condition.shape[1]
-    a3 = condition.shape[2]
-    a4 = condition.shape[3]
-    a5 = condition.shape[4]
-    valid_index = 0
-    for i1 in range(a1):
-        for i2 in range(a2):
-            for i3 in range(a3):
-                for i4 in range(a4):
-                    for i5 in range(a5):
-                        if condition[i1, i2, i3, i4, i5] != 0:
-                            a[valid_index, 0] = i1
-                            a[valid_index, 1] = i2
-                            a[valid_index, 2] = i3
-                            a[valid_index, 3] = i4
-                            a[valid_index, 4] = i5
-                            valid_index += 1
-    return a
-
-
-@tvm.target.generic_func
-def argwhere(output_shape, condition):
-    """Find the indices of elements of a tensor that are non-zero.
-
-    Parameters
-    ----------
-    condition : tvm.te.Tensor
-        Tensor with boolean values.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        Indices of non-zero elements.
-    """
-    if len(condition.shape) == 1:
-        return hybrid_argwhere_1d(output_shape.shape, condition)
-    if len(condition.shape) == 2:
-        return hybrid_argwhere_2d(output_shape.shape, condition)
-    if len(condition.shape) == 3:
-        return hybrid_argwhere_3d(output_shape.shape, condition)
-    if len(condition.shape) == 4:
-        return hybrid_argwhere_4d(output_shape.shape, condition)
-    if len(condition.shape) == 5:
-        return hybrid_argwhere_5d(output_shape.shape, condition)
-    raise ValueError("Does not support rank higher than 5 in argwhere")
diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py
index 5ee625577e38..e145add5f01b 100644
--- a/python/tvm/topi/nn/conv2d.py
+++ b/python/tvm/topi/nn/conv2d.py
@@ -615,68 +615,6 @@ def conv2d_NCHWc_int8(
     )
 
 
-def conv2d_gemm_weight_transform(kernel, tile_N, tile_K, use_scalable_vectors=False, use_sme=False):
-    """Weight transformation for winograd
-
-    Parameters
-    ----------
-    kernel: Tensor
-        The raw kernel tensor with layout "NHWC".
-    tile_N: int
-        Tile size across N axis of the weight transformation for ConvGemm. (N = OC)
-    tile_K: int
-        Tile size across K axis of the weight transformation for ConvGemm. (K = KW * KH * IC)
-    use_scalable_vectors : bool
-        determines if operations on scalable vectors are expected
-    use_sme : bool
-        determines if SME operations on scalable vectors are expected
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [CI*KH*KW,CO]
-    """
-    KH, KW, IC, OC = get_const_tuple(kernel.shape)
-    K = KH * KW * IC
-    N = OC
-
-    kernel_flat = te.compute(
-        (K, N), lambda x, y: kernel[(x // IC) // KW, (x // IC) % KW, x % IC, y], "weight_flatten"
-    )
-
-    pad_N, pad_K = tvm.topi.arm_cpu.arm_utils.get_conv2d_weights_padding(N, K, tile_N, tile_K)
-
-    N_padded = N + pad_N
-    K_padded = K + pad_K
-
-    if pad_K != 0 or pad_N != 0:
-        kernel_flat = pad(
-            kernel_flat, pad_before=(0, 0), pad_after=(pad_K, pad_N), name="weight_padding"
-        )
-
-    if use_sme and kernel.dtype == "float16":
-        return te.compute(
-            (N_padded, K_padded), lambda x, y: kernel_flat[y, x], name="weight_transpose"
-        )
-
-    if use_scalable_vectors or use_sme:
-        return kernel_flat
-
-    if kernel.dtype in ["int8", "uint8"]:
-        B_inter_t = te.compute(
-            (N_padded // tile_N, K_padded // tile_K, tile_N, tile_K),
-            lambda x, y, z, w: kernel_flat[w + tile_K * y, z + tile_N * x],
-            name="weight_block_reshape",
-        )
-    else:
-        B_inter_t = te.compute(
-            (N_padded // tile_N, K_padded // tile_K, tile_K, tile_N),
-            lambda x, y, z, w: kernel_flat[z + tile_K * y, w + tile_N * x],
-            name="weight_block_reshape",
-        )
-    return B_inter_t
-
-
 def conv2d_winograd_weight_transform(kernel, tile_size):
     """Weight transformation for winograd
 
@@ -712,29 +650,6 @@ def conv2d_winograd_weight_transform(kernel, tile_size):
     )
 
 
-def conv2d_winograd_nnpack_weight_transform(kernel, convolution_algorithm, out_dtype):
-    """Weight transformation for winograd
-
-    Parameters
-    ----------
-    kernel: Tensor
-        The raw kernel tensor with layout "NCHW". Only 3x3 kernel is supported for now.
-    convolution_algorithm: int
-        The convolution algorithm for Winograd NNPACK.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        4-D with shape [alpha, alpha, CO, CI]
-    """
-    # pylint: disable=import-outside-toplevel
-    from tvm.contrib import nnpack
-
-    return nnpack.convolution_inference_weight_transform(
-        kernel, algorithm=convolution_algorithm, dtype=out_dtype
-    )
-
-
 def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtype=None):
     """Group convolution operator in NCHW layout.
 
diff --git a/python/tvm/topi/random/__init__.py b/python/tvm/topi/random/__init__.py
deleted file mode 100644
index ee8d1d6385b7..000000000000
--- a/python/tvm/topi/random/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""Pseudorandom generator kernels and operators."""
-from __future__ import absolute_import
-
-from .kernel import *
diff --git a/python/tvm/topi/random/kernel.py b/python/tvm/topi/random/kernel.py
deleted file mode 100644
index 464ea9634ab5..000000000000
--- a/python/tvm/topi/random/kernel.py
+++ /dev/null
@@ -1,657 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Pseudorandom number kernels."""
-import math
-import numpy as np
-
-import tvm
-import tvm.topi
-
-from ... import tir
-from ...tir import ir_builder
-
-
-# Threefry PRNG with splitting based on
-# - J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1,
-#   2, 3," SC '11: Proceedings of 2011 International Conference for High Performance Computing,
-#   Networking, Storage and Analysis, Seattle, WA, 2011, pp. 1-12, doi: 10.1145/2063384.2063405.
-# - Claessen, K. ; Palka, M. (2013) "Splittable Pseudorandom Number Generators using Cryptographic
-#   Hashing". Proceedings of Haskell Symposium 2013 pp. 47-58.  MLA
-# - Ferguson, Niels, et al. "The Skein hash function family." Submission to NIST (round 3) 7.7.5
-#   (2010): 3.
-
-
-# Threefry is a counter based PRNG: given a unique input, it generates a unique random number. As
-# there is no state to maintain, we can apply it to a sequence of numbers (0..N) to generate a
-# sequence of random numbers in parallel. In order to make the PRNG splittable (that is we can
-# generate a sequence of random numbers in one place, and another sequence in another), we add a
-# path and key in addition to the counter. The path allows us to encode a sequence of splits (a 0 in
-# the path indicates the left result of a split, a 1 indicates the right). To avoid continuously
-# growing the path, we can compress an existing path into the key portion of the generator by
-# hashing the current key, path, and counter to create the new key (this same technique is used if
-# we run out of room for the counter). They key is initialized with a unique initial state.
-#
-# Random numbers are generated by applying the Threefry hash to the current key, path, and counter.
-
-# This module use encoding e4 from the appendix of "Splittable Pseudorandom Number Generators using
-# Cryptographic Hashing" (confusingly, the definition in the paper uses e3 to define the encoding
-# function). This encoding uses a 10 element uint64 tensor where each byte means the following:
-
-# .. code-block:
-
-#     gen:
-#     words: 0 1 2 3 | 4 5  | 6 7     | 8 9
-#     usage: key     | path | counter | position of next step in path encoded in binary
-#                                       ex: 0b00010 -> next path entry goes one from the right
-
-# Right now, counter only uses the rightmost word.
-
-# Threefry rotation constants from the Skein paper ("The Skein Hash Function Family"
-# https://www.schneier.com/wp-content/uploads/2015/01/skein.pdf)
-_ROTATIONS = {
-    4: [[14, 16], [52, 57], [23, 40], [5, 37], [25, 33], [46, 12], [58, 22], [32, 32]],
-    8: [
-        [46, 36, 19, 37],
-        [33, 27, 14, 42],
-        [17, 49, 36, 39],
-        [44, 9, 54, 56],
-        [39, 30, 34, 24],
-        [13, 50, 10, 17],
-        [25, 29, 39, 43],
-        [8, 35, 56, 22],
-    ],
-    16: [
-        [24, 13, 8, 47, 8, 17, 22, 37],
-        [38, 19, 10, 55, 49, 18, 23, 52],
-        [33, 4, 51, 13, 34, 41, 59, 17],
-        [5, 20, 48, 41, 47, 28, 16, 25],
-        [41, 9, 37, 31, 12, 47, 44, 30],
-        [16, 34, 56, 51, 4, 53, 42, 41],
-        [31, 44, 47, 46, 19, 42, 44, 25],
-        [9, 48, 35, 52, 23, 31, 37, 20],
-    ],
-}
-
-# Threefry permutation constants from the Skein paper ("The Skein Hash Function Family"
-# https://www.schneier.com/wp-content/uploads/2015/01/skein.pdf)
-_PERMUTATIONS = {
-    4: [0, 3, 2, 1],
-    8: [2, 1, 4, 7, 6, 5, 0, 3],
-    16: [0, 9, 2, 13, 6, 11, 4, 15, 10, 7, 12, 3, 14, 5, 8, 1],
-}
-
-
-def _threefry(
-    irb, key_buf, key_offset, counter_buf, counter_offset, out_buf, out_offset, out_shape
-):
-    """IRBuilder code for running Threefry
-
-    Parameters
-    ----------
-    irb: IRBuilder
-        IRBuilder that this code will be generated for.
-
-    key_buf: BufferVar
-        Buffer to read the key from.
-
-    key_offset: number
-        Threefry will write to :code:`key_buf[key_offset:key_offset+4]`
-
-    counter_buf: BufferVar
-        Buffer to read the counter from.
-
-    counter_offset: number
-        Threefry will write to :code:`counter_buf[counter_offset:counter_offset+4]`
-
-    out_buf: BufferVar
-        Buffer to read the counter from.
-
-    out_offset: number
-        Threefry will write to :code:`out_buf[out_offset:out_offset+4*product(out_shape)]`
-
-    out_shape: number
-        Determines the number of output states to generate. :code:`state[i]` will correspond to
-        counter+i.
-    """
-    nrounds = 20
-    nwords = 4
-    iwidth = 64
-    assert nrounds % 4 == 0
-    assert nwords in [4, 8, 16]
-
-    # The paper has constants for 32 bit threefry, but we keep the implementation simple by only
-    # using 64-bit words.
-    assert key_buf.dtype == "uint64", "threefry only supports 64-bit keys"
-    assert key_buf.dtype == counter_buf.dtype, "threefry key and counter must be the same dtype"
-
-    def mix(a, b, rotation):
-        x = a + b  # wrapping
-        y = x ^ ((b << rotation) | (b >> (iwidth - rotation)))
-        return [x, y]
-
-    # temporary buffer for holding the results of _PERMUTATIONS
-    tmp = irb.allocate(out_buf.dtype, out_shape * nwords, name="tmp", scope="global")
-    tmp_offset = 0
-
-    # Initialize entire key. It is composed of the original key with one
-    # element appended. The appended element is the xor of all key words plus a
-    # constant.
-    full_key = irb.allocate("uint64", nwords + 1, name="full_key", scope="global")
-    for i in range(nwords):
-        full_key[i] = key_buf[key_offset + i]
-    # initial key constant, full_key[nwords] is equivalent to k_{N_W} in the Skein paper.
-    full_key[nwords] = tvm.tir.const(0x1BD11BDAA9FC1A22, dtype="uint64")
-    for i in range(nwords):
-        full_key[nwords] ^= key_buf[key_offset + i]
-
-    with irb.for_range(0, out_shape, dtype="uint64", name="i") as i:
-        for j in range(nwords):
-            out_buf[out_offset + i * nwords + j] = counter_buf[counter_offset + j] + i
-
-    def key_schedule(s, i):
-        # Threefry uses no tweak, so the key schedule is simple
-        if i == nwords - 1:
-            return full_key[(s + i) % (nwords + 1)] + tvm.tir.const(s, dtype="uint64")
-        return full_key[(s + i) % (nwords + 1)]
-
-    with irb.for_range(0, out_shape, name="l") as l:  # pylint: disable=invalid-name
-        for i in range(nrounds // 4):
-            for j in range(nwords):
-                out_buf[out_offset + l * nwords + j] += key_schedule(i, j)  # wrapping
-            for k in range(4):
-                for j in range(nwords // 2):
-                    (
-                        out_buf[out_offset + l * nwords + j * 2 + 0],
-                        out_buf[out_offset + l * nwords + j * 2 + 1],
-                    ) = mix(
-                        out_buf[out_offset + l * nwords + j * 2 + 0],
-                        out_buf[out_offset + l * nwords + j * 2 + 1],
-                        _ROTATIONS[nwords][(i * 4 + k) % 8][j],
-                    )
-                for j in range(nwords):
-                    tmp[tmp_offset + l * nwords + j] = out_buf[
-                        out_offset + l * nwords + _PERMUTATIONS[nwords][j]
-                    ]
-                # number of rounds is even, so out always contains the result
-                (out_buf, tmp) = (tmp, out_buf)
-                (out_offset, tmp_offset) = (tmp_offset, out_offset)
-
-
-def threefry_generate(gen, out_shape):
-    """Generate a series of random values
-
-    Notes
-    -----
-    This function uses the counter portion of the generator state to generate a series of random
-    numbers in parallel. Random number `i` is generated by applying Threefry to the current
-    generator state with the counter portion incremented by `i`. This means that each random number
-    is generated independently from each other random number, so we can compute them in parallel.
-
-    If there is not enough room left in the counter to generate the desired shape of random values,
-    then a new generator is created by applying Threefry to the current key, path, and counter.
-    This new generator will have a reset counter.
-
-    Warning
-    -------
-    Threeyfry requires that unsigned integer arithmetic wraps on overflow. Currently TVM has no
-    guarantee of this, so threefry contains an internal assert to check wrapping behavior. This
-    assert may or may not run depending on your platform, so it is recommended you run
-    :py:func:`threefry_test_wrapping` to verify wrapping behavior.
-
-    Parameters
-    ----------
-    gen : Tensor[10, uint64]
-        Generator state. Can be create with :py:func:`tvm.relay.random.threefry_key`. This should
-        not be reused in another function, otherwise random numbers will be repeated.
-
-    out_shape : Sequence[int]
-        Output shape of the random numbers.
-
-    Returns
-    -------
-    new_gen : Tensor[10, uint64]
-        The new generator state to be used in subsequent calls.
-
-    rand : Tensor[out_shape, uint64]
-        Tensor of random numbers with shape `out_shape`.
-    """
-    out_len = tir.const(1)
-    for s in out_shape:
-        out_len *= s
-    assert (
-        out_len.value <= 2**64 - 1
-    ), f"Can only generate up to 2^64 random numbers, but {out_len} were requested."
-
-    def gen_ir(gen_ptr, out_gen_ptr, out_array_ptr):
-        irb = ir_builder.create()
-        gen = irb.buffer_ptr(gen_ptr)
-        out_gen = irb.buffer_ptr(out_gen_ptr)
-        out_array = irb.buffer_ptr(out_array_ptr)
-
-        # Check that unsigned arithmetic wraps, as it is required to implement threefry correctly.
-        irb.emit(
-            tvm.tir.AssertStmt(
-                tvm.tir.const(0xFFFFFFFFFFFFFFFF, "uint64") + tvm.tir.const(1, "uint64")
-                == tvm.tir.const(0, "uint64"),
-                tvm.tir.StringImm(
-                    "Unsigned integer arithmetic is not wrapping, but threefry requires wrapping."
-                ),
-                tvm.tir.Evaluate(0),
-            )
-        )
-
-        # Create a temporary array to hold the generator state we will use to create the random
-        # numbers. We cannot use gen because we may need to update the key + path if there is not
-        # enough room in the counter.
-        tmp = irb.allocate(gen.dtype, 10, name="tmp", scope="global")
-
-        # TODO(tkonolige): for now we only use the last word of the counter for counting. It is too
-        # much work to figure out how to do 128 bit addition.
-
-        # Max value for counter should be 2**64-2 because we need to reserve a special value to
-        # indicate the counter is used up.
-        with irb.if_scope(gen[7] < tir.const(2**64 - 1, dtype=gen.dtype) - out_len):
-            for i in range(10):
-                tmp[i] = gen[i]
-        with irb.else_scope():
-            # no room left in the counter, we have to change the path or key
-            with irb.if_scope(gen[8] == 0 and gen[9] == 0):
-                # out of room in the path, have to generate new key
-
-                # The paper says the counter that we will be hashing should be a special value of
-                # all ones. We need to allocate some space for it because we cannot overwrite gen.
-                tmp_counter = irb.allocate(gen.dtype, 2, name="tmp_counter", scope="global")
-                tmp_counter[0] = tir.const(0xFFFFFFFFFFFFFFFF, dtype=gen.dtype)
-                tmp_counter[1] = tir.const(0xFFFFFFFFFFFFFFFF, dtype=gen.dtype)
-                _threefry(irb, gen, 0, tmp_counter, 0, tmp, 0, 1)
-                tmp[4] = tir.const(0, dtype=gen.dtype)  # zero path, i.e. no path
-                tmp[5] = tir.const(0, dtype=gen.dtype)
-                tmp[6] = tir.const(0, dtype=gen.dtype)  # zero counter
-                tmp[7] = tir.const(0, dtype=gen.dtype)
-                tmp[8] = tir.const(1 << 63, dtype=gen.dtype)  # one in the leftmost position
-                tmp[9] = tir.const(0, dtype=gen.dtype)
-            with irb.else_scope():
-                tmp[0] = gen[0]
-                tmp[1] = gen[1]
-                tmp[2] = gen[2]
-                tmp[3] = gen[3]
-                tmp[4] = gen[4] | gen[8]  # add a 1 to the path
-                tmp[5] = gen[5] | gen[9]
-                tmp[6] = tir.const(0, dtype=gen.dtype)  # zero counter
-                tmp[7] = tir.const(0, dtype=gen.dtype)
-                _shift_right(irb, gen[8], gen[9], tmp, 8, tmp, 9)
-
-        # Compute random values
-        if out_len.value >= 4:
-            _threefry(irb, tmp, 0, tmp, 4, out_array, 0, out_len // 4)
-        if out_len.value % 4 != 0:
-            remaining = irb.allocate(gen.dtype, 4, name="remaining", scope="global")
-            tmp[7] = tmp[7] + tir.Cast(gen.dtype, out_len // 4 * 4)  # increment counter
-            _threefry(irb, tmp, 0, tmp, 4, remaining, 0, 1)
-            with irb.for_range(0, out_len % 4, dtype="uint64", name="i") as i:
-                out_array[out_len // 4 * 4 + i] = remaining[i]
-
-        # Update generator state
-        out_gen[0] = tmp[0]  # key stays the same
-        out_gen[1] = tmp[1]
-        out_gen[2] = tmp[2]
-        out_gen[3] = tmp[3]
-        out_gen[4] = tmp[4]  # path stays the same
-        out_gen[5] = tmp[5]
-        out_gen[6] = tir.const(0, dtype=gen.dtype)  # unused, leave it as 0
-        if out_len.value % 4 != 0:
-            # increment counter for the remaining
-            # as we will generate 4 random numbers for the remaining, increase 4 here.
-            # the main increment was done before the second _threefry.
-            out_gen[7] = tmp[7] + tir.Cast(gen.dtype, 4)
-        else:
-            out_gen[7] = tmp[7] + tir.Cast(gen.dtype, out_len)  # increment counter
-        out_gen[8] = tmp[8]  # path unchanged, so no update here
-        out_gen[9] = tmp[9]
-
-        return irb.get()
-
-    out_gen = tvm.tir.decl_buffer((10,), name="out_gen", dtype="uint64")
-    out_array = tvm.tir.decl_buffer(out_shape, name="out_array", dtype="uint64")
-    return tvm.te.extern(
-        [out_gen.shape, out_array.shape],
-        [gen],
-        lambda ins, outs: gen_ir(ins[0], outs[0], outs[1]),
-        out_buffers=[out_gen, out_array],
-        name="threefry_generate",
-        tag="threefry_generate",
-    )
-
-
-def _shift_right(irb, a, b, out_a, a_off, out_b, b_off):
-    """Binary shift a 128bit number composed of two 64 bit words right by one."""
-    with irb.if_scope(a == 1):
-        out_a[a_off] = tir.const(0, dtype=a.dtype)
-        out_b[b_off] = tir.const(0x8000000000000000, dtype=a.dtype)
-    with irb.else_scope():
-        with irb.if_scope(a == 0):
-            out_a[a_off] = tir.const(0, dtype=a.dtype)
-            out_b[b_off] = b >> 1
-        with irb.else_scope():
-            out_a[a_off] = a >> 1
-            out_b[b_off] = tir.const(0, dtype=a.dtype)
-
-
-def threefry_split(gen):
-    """Split a single generator state into two new ones
-
-    Notes
-    -----
-    The new generator is created by appending a one (for the right output) or a zero (for the left
-    output) to the end of the path portion of the generator If there is no longer and room in the
-    path, then we create a new key portion of the generator by applying Threefry to the old state,
-    path, and counter. i.e. :code:`new_key = threefry(old_key, [old_path, old_counter])`. This
-    resets the path portion of the new generator.
-
-    Parameters
-    ----------
-    gen : Tensor[10, uint64]
-        Generator state. Can be create with :py:func:`tvm.relay.random.threefry_key`. This should
-        not be reused in another function, otherwise random numbers will be repeated.
-
-    Returns
-    -------
-    out_gen_left : Tensor[10, uint64]
-        New generator state that is distinct from `out_gen_right`.
-
-    out_gen_right : Tensor[10, uint64]
-        New generator state that is distinct from `out_gen_left`.
-    """
-
-    def gen_ir(gen_ptr, out_left_ptr, out_right_ptr):
-        irb = ir_builder.create()
-        gen = irb.buffer_ptr(gen_ptr)
-        out_left = irb.buffer_ptr(out_left_ptr)
-        out_right = irb.buffer_ptr(out_right_ptr)
-
-        with irb.if_scope(gen[8] == 0 and gen[9] == 0):
-            # Generate new key because we have run out of room to extend the path
-            _threefry(irb, gen, 0, gen, 4, out_left, 0, 1)
-            out_left[4] = tir.const(0, dtype=gen.dtype)
-            out_left[5] = tir.const(0, dtype=gen.dtype)
-            out_left[6] = tir.const(0, dtype=gen.dtype)  # counter gets zeroed
-            out_left[7] = tir.const(0, dtype=gen.dtype)  # counter gets zeroed
-            out_left[8] = tir.const(
-                1 << 62, dtype=gen.dtype
-            )  # one in the second from the leftmost position
-            out_left[9] = tir.const(0, dtype=gen.dtype)
-
-            out_right[0] = out_left[0]
-            out_right[1] = out_left[1]
-            out_right[2] = out_left[2]
-            out_right[3] = out_left[3]
-            out_right[4] = tir.const(1 << 63, dtype=gen.dtype)  # one in the leftmost position
-            out_right[5] = tir.const(0, dtype=gen.dtype)
-            out_right[6] = tir.const(0, dtype=gen.dtype)
-            out_right[7] = tir.const(0, dtype=gen.dtype)
-            out_right[8] = tir.const(
-                1 << 62, dtype=gen.dtype
-            )  # one in the second from the leftmost position
-            out_right[9] = tir.const(0, dtype=gen.dtype)
-        with irb.else_scope():
-            out_left[0] = gen[0]
-            out_left[1] = gen[1]
-            out_left[2] = gen[2]
-            out_left[3] = gen[3]
-            out_left[4] = gen[4]  # adding a zero here, but its already zero padded
-            out_left[5] = gen[5]
-            out_left[6] = gen[6]
-            out_left[7] = gen[7]
-            # move path position over one bit
-            _shift_right(irb, gen[8], gen[9], out_left, 8, out_left, 9)
-
-            out_right[0] = gen[0]
-            out_right[1] = gen[1]
-            out_right[2] = gen[2]
-            out_right[3] = gen[3]
-            out_right[4] = gen[4] | gen[8]  # add a one to the path
-            out_right[5] = gen[5] | gen[9]
-            out_right[6] = gen[6]
-            out_right[7] = gen[7]
-            _shift_right(irb, gen[8], gen[9], out_right, 8, out_right, 9)
-
-        return irb.get()
-
-    out_left = tvm.tir.decl_buffer((10,), name="out_left", dtype="uint64")
-    out_right = tvm.tir.decl_buffer((10,), name="out_right", dtype="uint64")
-    return tvm.te.extern(
-        [out_left.shape, out_right.shape],
-        [gen],
-        lambda ins, outs: gen_ir(ins[0], outs[0], outs[1]),
-        out_buffers=[out_left, out_right],
-        name="threefry_split",
-        tag="threefry_split",
-    )
-
-
-def threefry_test_wrapping(target, device):
-    """Test that unsigned arithmetic wraps on overflow.
-
-    Parameters
-    ----------
-    target : tvm.target.Target
-        Target to run against
-    device : tvm.runtime.Device
-        Context to run the test on
-
-    Returns
-    -------
-    is_wrapping : bool
-        Whether or not unsigned integer arithmetic is wrapping for this target, context pair. True
-        indicates that threefry will work on this platform.
-    """
-    if isinstance(target, str):
-        target = tvm.target.Target(target)
-
-    def gen_ir(out_ptr):
-        irb = ir_builder.create()
-        out = irb.buffer_ptr(out_ptr)
-        if "gpu" in target.keys:
-            thread_x = tvm.te.thread_axis("threadIdx.x")
-            irb.scope_attr(thread_x, "thread_extent", 1)
-        out[0] = tvm.tir.const(0xFFFFFFFFFFFFFFFF, "uint64") + tvm.tir.const(1, "uint64")
-        return irb.get()
-
-    out = tvm.tir.decl_buffer((1,), dtype="uint64")
-    f = tvm.te.extern(
-        [out.shape], [], lambda ins, outs: gen_ir(outs[0]), dtype="uint64", out_buffers=[out]
-    )
-    s = tvm.te.create_schedule([f.op])
-    out_ary = tvm.nd.array(np.ones((1,), "uint64"), device)
-    tvm.build(s, [f], target=target)(out_ary)
-    return out_ary.numpy()[0] == 0
-
-
-def uniform(gen, low, high, out_shape, out_dtype):
-    """Draw samples from a uniform distribution.
-
-    Samples are uniformly distributed over the half-open interval [low, high)
-    (includes low, but excludes high). In other words, any value within the
-    given interval is equally likely to be drawn by uniform.
-
-    Parameters
-    ----------
-    gen : ThreefryKey
-        Generator state. Can be create with :py:func:`tvm.relay.threefry_key`. This should not be
-        reused in another function, otherwise random numbers will be repeated.
-
-    low : Tensor[(), out_dtype]
-        Lower boundary of the output interval. All values generated will be
-        greater than or equal to low.
-
-    high : Tensor[(), out_dtype]
-        Upper boundary of the output interval. All values generated will be
-        less than high.
-
-    out_shape : Sequence[int]
-        Output shape of the random numbers.
-
-    out_dtype : str
-        The output dtype.
-
-    Returns
-    -------
-    new_gen : ThreefryKey
-        New generator state that is distinct from `gen`.
-
-    out : Tensor[out_shape, out_dtype]
-        Tensor of random numbers with shape `out_shape` and type `out_dtype`.
-    """
-    new_gen, random_bits = threefry_generate(gen, out_shape)
-    assert out_dtype in (
-        "float32",
-        "float64",
-    ), f"Only support float32 or float64 for now, got {out_dtype}"
-    if out_dtype == "float32":
-        random_dtype = "uint32"
-        nbits = 32
-        nfraction = 23
-    elif out_dtype == "float64":
-        random_dtype = "uint64"
-        nbits = 64
-        nfraction = 52
-    nexp = nbits - nfraction - 1
-    random_bits = random_bits.astype(random_dtype)
-
-    fraction = tvm.topi.right_shift(
-        random_bits, tvm.tir.const(nbits - nfraction, dtype=random_dtype)
-    )
-    exponent = tvm.topi.left_shift(
-        tvm.topi.full(out_shape, random_dtype, (1 << (nexp - 1)) - 1),
-        tvm.tir.const(nfraction, dtype=random_dtype),
-    )
-    mantissa = tvm.topi.bitwise_or(fraction, exponent).astype(random_dtype)
-    standard_uniform_values = tvm.topi.reinterpret(mantissa, out_dtype) - tvm.tir.const(
-        1, dtype=out_dtype
-    )
-    uniform_values = tvm.topi.add(tvm.topi.multiply(standard_uniform_values, high - low), low)
-
-    return new_gen, uniform_values
-
-
-def normal(gen, mean, scale, out_shape, out_dtype):
-    """Draw samples from a normal distribution.
-    The algorithm is based on Box-Muller transform
-
-    Parameters
-    ----------
-    gen : ThreefryKey
-        Generator state. Can be create with :py:func:`tvm.relay.threefry_key`. This should not be
-        reused in another function, otherwise random numbers will be repeated.
-
-    mean : Tensor[(), out_dtype]
-        The mean of the normal distribution.
-
-    scale : Tensor[(), out_dtype]
-        The standard deviation of the normal distribution.
-
-    out_shape : Sequence[int]
-        Output shape of the random numbers.
-
-    out_dtype : str
-        The output dtype.
-
-    Returns
-    -------
-    new_gen : ThreefryKey
-        New generator state that is distinct from `gen`.
-
-    out : Tensor[out_shape, out_dtype]
-        Tensor of random numbers with shape `out_shape` and type `out_dtype`.
-    """
-    out_shape = list(out_shape)
-    # Box-Muller transform need two pieces of original uniform data
-    out_shape.insert(0, 2)
-    new_gen, uniform_values = uniform(
-        gen, tvm.tir.const(0.0, out_dtype), tvm.tir.const(1.0, out_dtype), out_shape, out_dtype
-    )
-    two_pi = tvm.tir.const(2.0 * math.pi, out_dtype)
-    uniform_values_1 = tvm.topi.strided_slice(uniform_values, [0], [1], strides=[1], axes=[0])
-    uniform_values_1 = tvm.topi.squeeze(uniform_values_1, axis=0)
-    uniform_values_2 = tvm.topi.strided_slice(uniform_values, [1], [2], strides=[1], axes=[0])
-    uniform_values_2 = tvm.topi.squeeze(uniform_values_2, axis=0)
-    uniform_values_1 = tvm.topi.subtract(tvm.tir.const(1.0, out_dtype), uniform_values_1)
-    sqrt_values = tvm.topi.sqrt(
-        tvm.topi.multiply(tvm.tir.const(-2.0, out_dtype), tvm.topi.log(uniform_values_1))
-    )
-    sin_values = tvm.topi.sin(tvm.topi.multiply(two_pi, uniform_values_2))
-    random_values = tvm.topi.add(
-        tvm.topi.multiply(tvm.topi.multiply(sqrt_values, sin_values), scale), mean
-    )
-
-    return new_gen, random_values
-
-
-def multinomial(gen, probs, num_samples):
-    """Draw samples from a multinomial distribution defined by the input tensor.
-
-    Parameters
-    ----------
-    gen : ThreefryKey
-        Generator state. Can be created with :py:func:`tvm.relay.threefry_key`. This should not be
-        reused in another function, otherwise random numbers will be repeated.
-
-    probs: Tensor[(input_rows, indices), float]
-        A tensor containing the probabilities to sample from. Each value represents the
-        probability of choosing its corresponding index. If a tensor is provided, the last dimension
-        is treated independently. Negative values in this tensor will be clipped to zero to
-        represent they have no chance of being selected.
-
-    num_samples: int
-        Number of samples to draw from each row.
-
-    Returns
-    -------
-    new_gen : ThreefryKey
-        New generator state that is distinct from `gen`.
-
-    out : Tensor[(input_rows, num_samples), int64]
-        Tensor of sampled indices with shape `input_rows x num_samples` and type `out_dtype`.
-    """
-    # Convert to float for consistent behavior.
-    probs = tvm.topi.cast(probs, "float32")
-    # Clip negative values to 0.
-    probs = tvm.topi.maximum(probs, 0)
-    # Normalize input probabilities.
-    probs = tvm.topi.divide(probs, tvm.topi.expand_dims(tvm.topi.sum(probs, axis=-1), -1))
-    # Convert probability to cumulative sum.
-    cumulative_probs = tvm.topi.cumsum(probs, axis=-1)
-    # Sample a set of uniform values.
-    new_gen, uniform_values = uniform(
-        gen,
-        tvm.tir.const(0.0, "float32"),
-        tvm.tir.const(1.0, "float32"),
-        [*probs.shape[:-1], num_samples],
-        "float32",
-    )
-    # Find index corresponding to sampled values.
-    closest_prob = tvm.topi.subtract(
-        tvm.topi.expand_dims(cumulative_probs, axis=-1),
-        tvm.topi.expand_dims(uniform_values, axis=-2),
-    )
-    zeros = tvm.topi.full_like(closest_prob, 0)
-    ones = tvm.topi.full_like(closest_prob, 1)
-    # Find the smallest positive index for each sample.
-    cond = tvm.topi.greater(closest_prob, zeros)
-    closest_non_neg = tvm.topi.where(cond, closest_prob, ones)
-    sampled_indices = tvm.topi.argmin(closest_non_neg, axis=-2)
-    return new_gen, sampled_indices
diff --git a/python/tvm/topi/sparse_fill_empty_rows.py b/python/tvm/topi/sparse_fill_empty_rows.py
deleted file mode 100644
index 10dc6ee3bfa3..000000000000
--- a/python/tvm/topi/sparse_fill_empty_rows.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHnew_sparse_indices WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=no-else-return, too-many-locals, too-many-arguments, too-many-branches
-# pylint: disable=undefined-variable, invalid-name
-"""SparseFillEmptyRows operator"""
-from ..te import hybrid
-
-
-@hybrid.script
-def _sparse_fill_empty_rows(
-    sparse_indices,
-    sparse_values,
-    dense_shape,
-    default_value,
-    new_sparse_indices_shape,
-    new_sparse_values_shape,
-    empty_row_indicator_shape,
-):
-    default_value_ = int64(default_value[0])
-    new_sparse_indices = output_tensor(new_sparse_indices_shape, "int64")
-    new_sparse_values = output_tensor(new_sparse_values_shape, "int64")
-    empty_row_indicator = output_tensor(empty_row_indicator_shape, "int64")
-    new_sparse_indices_row_id = 0
-
-    if int64(sparse_indices.shape[0]) == int64(0):  # Handle Empty Case
-        #  Fill all rows with default values
-        for i in range(0, new_sparse_indices_shape[0]):
-            new_sparse_indices[i, 0] = int64(i)
-            new_sparse_values[i] = default_value_
-            empty_row_indicator[i] = int64(1)
-            for k in range(1, int64(new_sparse_indices_shape[1])):
-                new_sparse_indices[i, k] = int64(0)
-
-        return (new_sparse_indices, new_sparse_values, empty_row_indicator)
-
-    else:
-        # Iterate through sparse_indices and add rows if/when required
-        for i in range(0, int64(sparse_indices.shape[0])):
-            if i == 0:
-                prev_row_id = int64(0)
-            else:
-                prev_row_id = int64(sparse_indices[i - 1, 0] + 1)
-            row_id = int64(sparse_indices[i, 0])
-
-            # Since input is in row-major order, add rows between prev_row_id and row_id
-            for j in range(prev_row_id, row_id):
-                new_sparse_indices[new_sparse_indices_row_id, 0] = int64(j)
-                for k in range(1, int64(new_sparse_indices_shape[1])):
-                    new_sparse_indices[new_sparse_indices_row_id, k] = int64(0)
-                empty_row_indicator[prev_row_id] = int64(1)
-                new_sparse_values[new_sparse_indices_row_id] = default_value_
-                new_sparse_indices_row_id += 1
-
-            # Add current element to output
-            new_sparse_indices[new_sparse_indices_row_id, 0] = row_id
-            for k in range(1, int64(new_sparse_indices_shape[1])):
-                new_sparse_indices[new_sparse_indices_row_id, k] = int64(sparse_indices[i, k])
-            new_sparse_values[new_sparse_indices_row_id] = int64(sparse_values[i])
-            empty_row_indicator[row_id] = int64(0)
-            new_sparse_indices_row_id += 1
-
-        # Add rows with default value if last row id of sparse_indices is not dense_shape[0] - 1
-        for i in range(
-            int64(sparse_indices[sparse_indices.shape[0] - 1, 0] + 1), int64(dense_shape[0])
-        ):
-
-            new_sparse_indices[new_sparse_indices_row_id, 0] = int64(i)
-            for k in range(1, int64(new_sparse_indices_shape[1])):
-                new_sparse_indices[new_sparse_indices_row_id, k] = int64(0)
-            empty_row_indicator[i] = int64(1)
-            new_sparse_values[new_sparse_indices_row_id] = default_value_
-            new_sparse_indices_row_id += 1
-
-        return (new_sparse_indices, new_sparse_values, empty_row_indicator)
-
-
-def sparse_fill_empty_rows(
-    sparse_indices,
-    sparse_values,
-    dense_shape,
-    default_value,
-    new_sparse_indices_shape,
-    new_sparse_values_shape,
-    empty_row_indicator_shape,
-):
-    return _sparse_fill_empty_rows(
-        sparse_indices,
-        sparse_values,
-        dense_shape,
-        default_value,
-        new_sparse_indices_shape,
-        new_sparse_values_shape,
-        empty_row_indicator_shape,
-    )
diff --git a/python/tvm/topi/transform.py b/python/tvm/topi/transform.py
index 2844825a4a73..c1f5bce94870 100644
--- a/python/tvm/topi/transform.py
+++ b/python/tvm/topi/transform.py
@@ -20,7 +20,6 @@
 
 import tvm
 from tvm import te, topi
-from tvm.te import hybrid
 
 from . import cpp, tag
 from .utils import const_vector, make_idx, within_index
@@ -982,35 +981,6 @@ def adv_index(data, indices):
     return cpp.adv_index(data, indices)
 
 
-@hybrid.script
-def invert_permutation(data):
-    """Computes the inverse permutation of data.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        Input data
-
-    Returns
-    -------
-    result : tvm.te.Tensor
-        Output tensor
-
-    Examples
-    --------
-    .. code-block:: python
-
-        data = [3, 4, 0, 2, 1]
-        topi.invert_permutation(data) = [2, 4, 3, 0, 1]
-    """
-    result = output_tensor(data.shape, data.dtype)
-    nums = data.shape[0]
-    for ind in range(nums):
-        r_ind = data[ind]
-        result[r_ind] = ind
-    return result
-
-
 def sliding_window(data, axis, window_shape, strides):
     """Slide a window over the data tensor.
 
diff --git a/python/tvm/topi/unique.py b/python/tvm/topi/unique.py
index 983c48615334..9c9732013413 100644
--- a/python/tvm/topi/unique.py
+++ b/python/tvm/topi/unique.py
@@ -17,9 +17,6 @@
 # pylint: disable=invalid-name
 """Unique operator"""
 from tvm import te, tir
-from ..te import hybrid
-from .scan import cumsum
-from .sort import sort, argsort
 
 
 def _calc_adjacent_diff_ir(data, output, binop=tir.Sub):
@@ -82,234 +79,3 @@ def _calc_adjacent_diff(data, out_dtype="int32", binop=tir.Sub):
         name="_calc_adjacent_diff",
         tag="_calc_adjacent_diff_cpu",
     )
-
-
-@hybrid.script
-def _calc_num_unique(inc_scan):
-    """Helper function to get the number of unique elements fron inc_scan tensor"""
-    output = output_tensor((1,), "int32")
-    output[0] = inc_scan[inc_scan.shape[0] - 1] + int32(1)
-    return output
-
-
-def _calc_unique_ir(
-    data, argsorted_indices, inc_scan, index_converter, unique_elements, inverse_indices, counts
-):
-    """Low level IR to calculate unique elements, inverse indices, and counts (optional) of
-    unique elements of 1-D array.
-
-    Parameters
-    ----------
-    data : Buffer
-        Input 1-D Buffer.
-
-    argsorted_indices : Buffer
-        A buffer that stores the argsorted indices of the input data.
-
-    inc_scan : Buffer
-        A buffer that stores the inclusive scan of the binary tir.NE adjacent difference
-        of the sorted data.
-
-    index_converter (optional) : Buffer
-        An optional index converter that transforms the unique element index
-        such that new_idx = index_converter[old_idx].
-
-    unique_elements : Buffer
-        A buffer that stores the unique elements.
-
-    inverse_indices : Buffer
-        A buffer that stores the index of each input data element in the unique element array.
-
-    counts (optional) : Buffer
-        A buffer that stores the count of each unique element.
-    """
-    ib = tir.ir_builder.create()
-    data_ptr = ib.buffer_ptr(data)
-    argsorted_indices_ptr = ib.buffer_ptr(argsorted_indices)
-    inc_scan_ptr = ib.buffer_ptr(inc_scan)
-    unique_elements_ptr = ib.buffer_ptr(unique_elements)
-    inverse_indices_ptr = ib.buffer_ptr(inverse_indices)
-
-    index_converter_ptr = None
-    if isinstance(index_converter, tir.Buffer):
-        index_converter_ptr = ib.buffer_ptr(index_converter)
-
-    if isinstance(counts, tir.Buffer):
-        counts_ptr = ib.buffer_ptr(counts)
-        # use indices_ptr as a tmp buffer to store tids with inc_scan[tid] != inc_scan[tid-1]
-        unique_seq_indices_ptr = ib.buffer_ptr(inverse_indices)
-
-    data_length = data.shape[0]
-
-    # if need to return counts
-    if isinstance(counts, tir.Buffer):
-        num_unique = inc_scan_ptr[inc_scan.shape[0] - 1] + 1
-        num_elements = data.shape[0]
-        unique_seq_indices_ptr[num_unique - 1] = num_elements
-        with ib.new_scope():
-            with ib.for_range(0, data_length, kind="parallel") as i:
-                with ib.if_scope(i > 0):
-                    with ib.if_scope(inc_scan_ptr[i] != inc_scan_ptr[i - 1]):
-                        unique_seq_indices_ptr[inc_scan_ptr[i] - 1] = i
-        with ib.new_scope():
-            with ib.for_range(0, num_unique, kind="parallel") as i:
-                unique_idx = i if not index_converter_ptr else index_converter_ptr[i]
-                with ib.if_scope(i == 0):
-                    counts_ptr[unique_idx] = unique_seq_indices_ptr[i]
-                with ib.else_scope():
-                    counts_ptr[unique_idx] = (
-                        unique_seq_indices_ptr[i] - unique_seq_indices_ptr[i - 1]
-                    )
-    # calculate unique elements and inverse indices
-    with ib.new_scope():
-        with ib.for_range(0, data_length, kind="parallel") as i:
-            data_idx = argsorted_indices_ptr[i]
-            unique_idx = (
-                inc_scan_ptr[i] if not index_converter_ptr else index_converter_ptr[inc_scan_ptr[i]]
-            )
-            inverse_indices_ptr[data_idx] = unique_idx
-            with ib.if_scope(i == 0):
-                unique_elements_ptr[unique_idx] = data_ptr[data_idx]
-            with ib.else_scope():
-                with ib.if_scope(inc_scan_ptr[i] != inc_scan_ptr[i - 1]):
-                    unique_elements_ptr[unique_idx] = data_ptr[data_idx]
-    return ib.get()
-
-
-@hybrid.script
-def _calc_first_occurence(argsorted_indices, inc_scan):
-    """Hybrid script to calculate the first occurence of each unique element in the input data.
-
-    Parameters
-    ----------
-    argsorted_indices : tvm.te.Tensor
-        A tensor that stores the argsorted indices of the input data.
-
-    inc_scan : tvm.te.Tensor
-        A tensor that stores the inclusive scan of the binary tir.NE adjacent difference
-        of the sorted data.
-
-    first_occurence : tvm.te.Tensor
-        A tensor that stores the first occurence of each unique element in the input data.
-    """
-    first_occurence = output_tensor(argsorted_indices.shape, "int32")
-    for i in parallel(argsorted_indices.shape[0]):
-        first_occurence[i] = argsorted_indices.shape[0]
-    for i in parallel(argsorted_indices.shape[0]):
-        if i == 0 or inc_scan[i] != inc_scan[i - 1]:
-            first_occurence[inc_scan[i]] = argsorted_indices[i]
-    return first_occurence
-
-
-def unique(data, is_sorted=True, return_counts=False):
-    """
-    Find the unique elements of a 1-D tensor. Please note `output` and `counts` are all padded to
-    have the same length of `data` and element with index >= num_unique[0] has undefined value.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        A 1-D tensor of integers.
-
-    sorted : bool
-        Whether to sort the unique elements in ascending order before returning as output.
-
-    return_counts : bool
-        Whether to return the count of each unique element.
-
-    Returns
-    -------
-    unique : tvm.te.Tensor
-        A 1-D tensor containing the unique elements of the input data tensor. The same size as
-        the input data. If there are less unique elements than input data, the end of the tensor
-        is padded with zeros.
-
-    indices : tvm.te.Tensor
-        A 1-D tensor. The same size as output. For each entry in output, it contains
-        the index of its first occurence in the input data. The end of the tensor is padded
-        with the length of the input data.
-
-    inverse_indices : tvm.te.Tensor
-        A 1-D tensor. For each entry in data, it contains the index of that data element in
-        the unique array. (Note that inverse_indices is very similar to indices if output is not
-        sorted.)
-
-    num_unique : tvm.te.Tensor
-        A 1-D tensor with size=1 containing the number of unique elements in the input data tensor.
-
-    counts (optional) : tvm.te.Tensor
-        A 1-D tensor containing the count of each unique element in the output.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, False)
-        output          =  [4, 5, 1, 2, 3, _, _, _]
-        indices         =  [0, 1, 2, 3, 4, _, _, _]
-        inverse_indices =  [0, 1, 2, 3, 4, 4, 0, 1]
-        num_unique      =  [5]
-
-        [output, indices, num_unique, counts] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, True)
-        output          =  [4, 5, 1, 2, 3, _, _, _]
-        indices         =  [0, 1, 2, 3, 4, _, _, _]
-        inverse_indices =  [0, 1, 2, 3, 4, 4, 0, 1]
-        num_unique      =  [5]
-        counts          =  [2, 2, 1, 1, 2, _, _, _]
-
-        [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], True)
-        output          =  [1, 2, 3, 4, 5, _, _, _]
-        indices         =  [2, 3, 4, 0, 1, _, _, _]
-        inverse_indices =  [3, 4, 0, 1, 2, 2, 3, 4]
-        num_unique      =  [5]
-    """
-    sorted_data = sort(data)
-    argsorted_indices = argsort(data, dtype="int32")
-    # adjacent difference
-    adjacent_diff = _calc_adjacent_diff(sorted_data, "int32", tir.NE)
-    # inclusive scan
-    inc_scan = cumsum(adjacent_diff, dtype="int32", exclusive=0)
-    # total number of unique elements
-    num_unique_elements = _calc_num_unique(inc_scan)
-    # prepare outputs
-    if return_counts:
-        out_data_shape = [data.shape] * 3
-        out_dtypes = [data.dtype, "int32", "int32"]
-    else:
-        out_data_shape = [data.shape] * 2
-        out_dtypes = [data.dtype, "int32"]
-    # prepare inputs and fcompute
-
-    first_occurence = _calc_first_occurence(argsorted_indices, inc_scan)
-    if is_sorted:
-        in_data = [data, argsorted_indices, inc_scan]
-        if return_counts:
-            fcompute = lambda ins, outs: _calc_unique_ir(*ins, None, *outs)
-        else:
-            fcompute = lambda ins, outs: _calc_unique_ir(*ins, None, *outs, None)
-
-        indices = first_occurence
-    else:
-        # calculate index converter by sorting unique elements by their first occurence
-        argsorted_first_occurence = argsort(first_occurence, dtype="int32")
-        index_converter = argsort(argsorted_first_occurence, dtype="int32")
-        in_data = [data, argsorted_indices, inc_scan, index_converter]
-        if return_counts:
-            fcompute = lambda ins, outs: _calc_unique_ir(*ins, *outs)
-        else:
-            fcompute = lambda ins, outs: _calc_unique_ir(*ins, *outs, None)
-        # First occurence is in order of sorted unique output, if we sort the first_occurence array
-        # we get the correct result
-        indices = sort(first_occurence)
-
-    outs = te.extern(
-        out_data_shape,
-        in_data,
-        fcompute,
-        dtype=out_dtypes,
-        name="_calc_unique",
-        tag="_calc_unique_cpu",
-    )
-    if return_counts:
-        return [outs[0], indices, outs[1], num_unique_elements, outs[2]]
-    return [outs[0], indices, outs[1], num_unique_elements]
diff --git a/python/tvm/topi/vision/__init__.py b/python/tvm/topi/vision/__init__.py
deleted file mode 100644
index 2861d31de0f4..000000000000
--- a/python/tvm/topi/vision/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""VISION network operators"""
-from __future__ import absolute_import as _abs
-
-from . import ssd
-from .reorg import *
-from .nms import *
-from .rcnn import *
diff --git a/python/tvm/topi/vision/nms.py b/python/tvm/topi/vision/nms.py
deleted file mode 100644
index 7bd94745e226..000000000000
--- a/python/tvm/topi/vision/nms.py
+++ /dev/null
@@ -1,1183 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=import-error, invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable, too-many-nested-blocks, too-many-branches, too-many-statements, too-many-function-args
-"""Non-maximum suppression operator"""
-import tvm
-from tvm import te
-
-from tvm.te import hybrid
-from tvm.tir import if_then_else
-
-from ..sort import argsort
-from ..math import cast
-from ..transform import reshape, gather
-from .. import reduction
-from ..scan import cumsum
-from .nms_util import (
-    binary_search,
-    collect_selected_indices,
-    collect_selected_indices_and_scores,
-    run_all_class_nms,
-)
-
-
-@hybrid.script
-def hybrid_rearrange_box_out(data, one, batch_size, num_anchors):
-    """Hybrid routine to rearrange nms output to
-    move all valid entries to top.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor or numpy NDArray
-        NMS output. 3-D tensor with shape
-        [batch_size, num_anchors, 6].
-
-    one: tvm.tir.const
-        Constant one with the same dtype as data.
-
-    batch_size: tvm.tir.IntImm or tvm.tir.Var
-        Batch size. We need to pass it in since hybrid script doesn't support
-        binding variable to symbolic dim.
-
-    num_anchors: tvm.tir.IntImm or tvm.tir.Var
-        Number of anchors.
-
-    Returns
-    -------
-    output : tvm.te.Tensor or numpy NDArray
-        Transformed NMS output. 3-D tensor with shape
-        [batch_size, num_anchors, 6].
-    """
-    elem_length = data.shape[2]
-    output = output_tensor((batch_size, num_anchors, elem_length), data.dtype)
-    valid_indices = allocate((batch_size,), "int32")
-
-    for i in parallel(batch_size):
-        valid_indices[i] = 0
-        for j in range(num_anchors):
-            if data[i, j, 0] >= 0:
-                for k in range(elem_length):
-                    output[i, valid_indices[i], k] = data[i, j, k]
-                valid_indices[i] += 1
-            if j >= valid_indices[i]:
-                for k in range(elem_length):
-                    output[i, j, k] = -one
-    return output
-
-
-@hybrid.script
-def hybrid_rearrange_indices_out(data, one, batch_size, num_anchors):
-    """Hybrid routine to rearrange nms output to
-    move all valid entries to top.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor or numpy NDArray
-        NMS output. 3-D tensor with shape
-        [batch_size, num_anchors, 6] or
-        [batch_size, num_anchors, 5], or 2-D
-        tensor with shape [batch_size, num_anchors].
-
-    one: tvm.tir.const
-        Constant one with the same dtype as data.
-
-    batch_size: tvm.tir.IntImm or tvm.tir.Var
-        Batch size. We need to pass it in since hybrid script doesn't support
-        binding variable to symbolic dim.
-
-    num_anchors: tvm.tir.IntImm or tvm.tir.Var
-        Number of anchors.
-
-    Returns
-    -------
-    output : tvm.te.Tensor or numpy NDArray
-        2-D tensor with shape [batch_size, num_anchors].
-
-    valid_box_count : tvm.te.Tensor or numpy NDArray
-        Tensor with shape [batch_size, 1], indicates
-        the valid number of boxes.
-    """
-    valid_box_count = output_tensor((batch_size, 1), "int32")
-    output = output_tensor((batch_size, num_anchors), data.dtype)
-    valid_indices = allocate((batch_size,), "int32")
-
-    for i in parallel(batch_size):
-        valid_indices[i] = 0
-        for j in range(num_anchors):
-            if data[i, j] >= 0:
-                output[i, valid_indices[i]] = data[i, j]
-                valid_indices[i] += 1
-            if data[i, j] > num_anchors or data[i, j] < -num_anchors:
-                output[i, valid_indices[i]] = 0
-                valid_indices[i] += 1
-            if j >= valid_indices[i]:
-                output[i, j] = -one
-        valid_box_count[i, 0] = valid_indices[i]
-
-    return output, valid_box_count
-
-
-@hybrid.script
-def hybrid_get_valid_counts(
-    data, score_threshold, id_index, score_index, one, batch_size, num_anchors
-):
-    """Hybrid routine to get valid count of bounding boxes
-    given a score threshold. Also moves valid boxes to the
-    top of input data.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor or numpy NDArray
-        Input data. 3-D tensor with shape [batch_size, num_anchors, 6]
-        or [batch_size, num_anchors, 5].
-
-    score_threshold : tvm.te.Tensor
-        Lower limit of score for valid bounding boxes.
-
-    id_index : tvm.tir.const
-        index of the class categories, -1 to disable.
-
-    score_index: tvm.tir.const
-        Index of the scores/confidence of boxes.
-
-    one: tvm.tir.const
-        Constant one with the same dtype as data.
-
-    batch_size: tvm.tir.IntImm or tvm.tir.Var
-        Batch size. We need to pass it in since hybrid script doesn't support
-        binding variable to symbolic dim.
-
-    num_anchors: tvm.tir.IntImm or tvm.tir.Var
-        Number of anchors.
-
-    Returns
-    -------
-    valid_count : tvm.te.Tensor or numpy NDArray
-        1-D tensor for valid number of boxes.
-
-    out_tensor : tvm.te.Tensor or numpy NDArray
-        Rearranged data tensor.
-
-    out_indices: tvm.te.Tensor or numpy NDArray
-        Related index in input data.
-    """
-    box_data_length = data.shape[2]
-    valid_count = output_tensor((batch_size,), "int32")
-    out_tensor = output_tensor((batch_size, num_anchors, box_data_length), data.dtype)
-    out_indices = output_tensor((batch_size, num_anchors), "int32")
-    for i in parallel(batch_size):
-        valid_count[i] = 0
-        for j in range(num_anchors):
-            score = data[i, j, score_index]
-            if score > score_threshold and (id_index < 0 or data[i, j, id_index] >= 0):
-                for k in range(box_data_length):
-                    out_tensor[i, valid_count[i], k] = data[i, j, k]
-                out_indices[i, valid_count[i]] = j
-                valid_count[i] += 1
-            if j >= valid_count[i]:
-                for k in range(box_data_length):
-                    out_tensor[i, j, k] = -one
-                out_indices[i, j] = -1
-    return valid_count, out_tensor, out_indices
-
-
-def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
-    """Get valid count of bounding boxes given a score threshold.
-    Also moves valid boxes to the top of input data.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        Input data. 3-D tensor with shape [batch_size, num_anchors, 6]
-        or [batch_size, num_anchors, 5].
-
-    score_threshold : optional, float
-        Lower limit of score for valid bounding boxes.
-
-    id_index : optional, int
-        index of the class categories, -1 to disable.
-
-    score_index: optional, int
-        Index of the scores/confidence of boxes.
-
-    Returns
-    -------
-    valid_count : tvm.te.Tensor
-        1-D tensor for valid number of boxes.
-
-    out_tensor : tvm.te.Tensor
-        Rearranged data tensor.
-
-    out_indices: tvm.te.Tensor or numpy NDArray
-        Related index in input data.
-    """
-    if isinstance(score_threshold, (float, int)):
-        score_threshold = tvm.tir.const(score_threshold, dtype=data.dtype)
-    id_index_const = tvm.tir.const(id_index, "int32")
-    score_index_const = tvm.tir.const(score_index, "int32")
-    return hybrid_get_valid_counts(
-        data,
-        score_threshold,
-        id_index_const,
-        score_index_const,
-        tvm.tir.const(1, data.dtype),
-        data.shape[0],
-        data.shape[1],
-    )
-
-
-@hybrid.script
-def hybrid_nms(
-    data,
-    sorted_index,
-    valid_count,
-    indices,
-    batch_size,
-    num_anchors,
-    max_output_size,
-    iou_threshold,
-    force_suppress,
-    top_k,
-    coord_start,
-    score_index,
-    id_index,
-    return_indices,
-    zero,
-    one,
-):
-    """Hybrid routing for non-maximum suppression.
-
-    Parameters
-    ----------
-    data: tvm.te.Tensor or numpy NDArray
-        Bounding boxes with class and score. 3-D tensor with shape
-        [batch_size, num_anchors, 6]. It could be the second output
-        out_tensor of get_valid_counts.
-
-    sorted_index : tvm.te.Tensor or numpy NDArray
-        Bounding box indexes sorted by score, with shape
-        [batch_size, num_anchors].
-
-    valid_count : tvm.te.Tensor or numpy NDArray
-        1-D tensor for valid number of boxes. It could be the output
-        valid_count of get_valid_counts.
-
-    indices : tvm.te.Tensor or numpy.NDArray
-        indices in original tensor, with shape [batch_size, num_anchors],
-        represents the index of box in original data. It could be the third
-        output out_indices of get_valid_counts. The values in the second
-        dimension are like the output of arange(num_anchors) if get_valid_counts
-        is not used before non_max_suppression.
-
-    batch_size: tvm.tir.IntImm or tvm.tir.Var
-        Batch size. We need to pass it in since hybrid script doesn't support
-        binding variable to symbolic dim.
-
-    num_anchors: tvm.tir.IntImm or tvm.tir.Var
-        The number of anchors.
-
-    max_output_size : tvm.te.Tensor
-        Max number of output valid boxes for each instance.
-        Return all valid boxes if max_output_size < 0.
-
-    iou_threshold : tvm.te.Tensor
-        Overlapping(IoU) threshold to suppress object with smaller score.
-
-    force_suppress : tvm.tir.const
-        Whether to suppress all detections regardless of class_id.
-
-    top_k : tvm.tir.const
-        Keep maximum top k detections before nms, -1 for no limit.
-
-    coord_start : tvm.tir.const
-        Start index of the consecutive 4 coordinates.
-
-    score_index: tvm.tir.const
-        Index of the scores/confidence of boxes.
-
-    id_index : tvm.tir.const
-        index of the class categories, -1 to disable.
-
-    return_indices : tvm.tir.const
-        Whether to return box indices in input data.
-
-    zero: tvm.tir.const
-        Constant zero with the same dtype as data.
-
-    one: tvm.tir.const
-        Constant one with the same dtype as data.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        3-D tensor with shape [batch_size, num_anchors, 6]
-        or [batch_size, num_anchors, 5].
-
-    box_indices: tvm.te.Tensor
-        2-D tensor with shape [batch_size, num_anchors].
-    """
-
-    box_data_length = data.shape[2]
-
-    # box_indices is the expected indices of boxes
-    box_indices = output_tensor((batch_size, num_anchors), sorted_index.dtype)
-    output = output_tensor(
-        (
-            batch_size,
-            num_anchors,
-            box_data_length,
-        ),
-        data.dtype,
-    )
-
-    for i in range(batch_size):
-        if iou_threshold > 0:
-            if valid_count[i] > 0:
-                # Reorder output
-                nkeep = valid_count[i]
-                if 0 < top_k < nkeep:
-                    nkeep = top_k
-                for j in parallel(nkeep):
-                    for k in range(box_data_length):
-                        output[i, j, k] = data[i, sorted_index[i, j], k]
-                    box_indices[i, j] = sorted_index[i, j]
-                if 0 < top_k < valid_count[i]:
-                    for j in parallel(valid_count[i] - nkeep):
-                        for k in range(box_data_length):
-                            output[i, j + nkeep, k] = -one
-                        box_indices[i, j + nkeep] = -1
-
-            # Apply nms
-            box_start_idx = coord_start
-            batch_idx = i
-            num_valid_boxes = 0
-
-            for j in range(valid_count[i]):
-                if num_valid_boxes == max_output_size:
-                    for k in range(box_data_length):
-                        output[i, j, k] = -one
-                    box_indices[i, j] = -1
-
-                elif output[i, j, score_index] > 0:
-                    box_a_idx = j
-                    is_valid_box = 1
-
-                    # a_l: left, a_t: top, a_r: right, a_b: bottom
-                    a_l = min(
-                        output[batch_idx, box_a_idx, box_start_idx],
-                        output[batch_idx, box_a_idx, box_start_idx + 2],
-                    )
-                    a_t = min(
-                        output[batch_idx, box_a_idx, box_start_idx + 1],
-                        output[batch_idx, box_a_idx, box_start_idx + 3],
-                    )
-                    a_r = max(
-                        output[batch_idx, box_a_idx, box_start_idx],
-                        output[batch_idx, box_a_idx, box_start_idx + 2],
-                    )
-                    a_b = max(
-                        output[batch_idx, box_a_idx, box_start_idx + 1],
-                        output[batch_idx, box_a_idx, box_start_idx + 3],
-                    )
-
-                    # check if current box j is valid by calculating iou with
-                    # all existing valid boxes
-                    for k in range(j):
-                        check_iou = 0
-                        if (
-                            is_valid_box == 1
-                            and k < j
-                            and output[i, k, score_index] > 0
-                            and (id_index < 0 or output[i, k, id_index] >= 0)
-                        ):
-                            if force_suppress:
-                                check_iou = 1
-                            elif id_index < 0 or output[i, j, id_index] == output[i, k, id_index]:
-                                check_iou = 1
-
-                        if check_iou > 0:
-                            box_b_idx = k
-
-                            # b_l: left, b_t: top, b_r: right, b_b: bottom
-                            b_l = min(
-                                output[batch_idx, box_b_idx, box_start_idx],
-                                output[batch_idx, box_b_idx, box_start_idx + 2],
-                            )
-                            b_t = min(
-                                output[batch_idx, box_b_idx, box_start_idx + 1],
-                                output[batch_idx, box_b_idx, box_start_idx + 3],
-                            )
-                            b_r = max(
-                                output[batch_idx, box_b_idx, box_start_idx],
-                                output[batch_idx, box_b_idx, box_start_idx + 2],
-                            )
-                            b_b = max(
-                                output[batch_idx, box_b_idx, box_start_idx + 1],
-                                output[batch_idx, box_b_idx, box_start_idx + 3],
-                            )
-
-                            # Overlapping width and height
-                            w = max(zero, min(a_r, b_r) - max(a_l, b_l))
-                            h = max(zero, min(a_b, b_b) - max(a_t, b_t))
-
-                            # Overlapping area
-                            area = h * w
-
-                            # total area of the figure formed by box a and box b
-                            # except for overlapping area
-                            u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area
-
-                            # get the iou
-                            iou = zero if u <= zero else area / u
-
-                            if iou >= iou_threshold:
-                                is_valid_box = 0
-
-                    if is_valid_box == 0:
-                        for k in range(box_data_length):
-                            output[i, j, k] = -one
-                        box_indices[i, j] = -1
-                    else:
-                        num_valid_boxes += 1
-
-        else:
-            for j in parallel(valid_count[i]):
-                for k in range(box_data_length):
-                    output[i, j, k] = data[i, j, k]
-                box_indices[i, j] = j
-
-        # Set invalid entry to be -1
-        for j in parallel(num_anchors - valid_count[i]):
-            for k in range(box_data_length):
-                output[i, j + valid_count[i], k] = -one
-            box_indices[i, j + valid_count[i]] = -1
-
-        if return_indices:
-            for j in range(valid_count[i]):
-                idx = box_indices[i, j]
-                if box_indices[i, j] >= 0:
-                    box_indices[i, j] = indices[i, idx]
-
-    return output, box_indices
-
-
-@tvm.target.generic_func
-def non_max_suppression(
-    data,
-    valid_count,
-    indices,
-    max_output_size=-1,
-    iou_threshold=0.5,
-    force_suppress=False,
-    top_k=-1,
-    coord_start=2,
-    score_index=1,
-    id_index=0,
-    return_indices=True,
-    invalid_to_bottom=False,
-):
-    """Non-maximum suppression operator for object detection.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        3-D tensor with shape [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5].
-
-    valid_count : tvm.te.Tensor
-        1-D tensor for valid number of boxes.
-
-    indices : tvm.te.Tensor
-        2-D tensor with shape [batch_size, num_anchors].
-
-    max_output_size : optional, int or tvm.te.Tensor
-        Max number of output valid boxes for each instance.
-        Return all valid boxes if the value of max_output_size is less than 0.
-
-    iou_threshold : optional, float or tvm.te.Tensor
-        Non-maximum suppression threshold.
-
-    force_suppress : optional, boolean
-        Whether to suppress all detections regardless of class_id.
-
-    top_k : optional, int
-        Keep maximum top k detections before nms, -1 for no limit.
-
-    coord_start : required, int
-        Start index of the consecutive 4 coordinates.
-
-    score_index: optional, int
-        Index of the scores/confidence of boxes.
-
-    id_index : optional, int
-        index of the class categories, -1 to disable.
-
-    return_indices : optional, boolean
-        Whether to return box indices in input data.
-
-    invalid_to_bottom : optional, boolean
-        Whether to move all valid bounding boxes to the top.
-
-    Returns
-    -------
-    out : tvm.te.Tensor or tuple of tvm.te.Tensor
-        3-D tensor with shape [batch_size, num_anchors, 6]
-        or [batch_size, num_anchors, 5]. Out is a tuple of tvm.te.Tensor
-        if return_indices is True, the Tensor in the tuple is 2-D tensor
-        with shape [batch_size, num_anchors] and shape
-        [batch_size, num_valid_anchors] respectively.
-
-    Example
-    --------
-    .. code-block:: python
-
-        # An example to use non_max_suppression
-        dshape = (1, 5, 6)
-        data = te.placeholder(dshape, name="data")
-        valid_count = te.placeholder((dshape[0],), dtype="int32", name="valid_count")
-        iou_threshold = 0.7
-        force_suppress = True
-        top_k = -1
-        out = non_max_suppression(data, valid_count, indices, iou_threshold=iou_threshold,
-                                  force_suppress=force_suppress, top_k=top_k)
-        np_data = np.random.uniform(dshape)
-        np_valid_count = np.array([4])
-        s = topi.generic.schedule_nms(out)
-        f = tvm.build(s, [data, valid_count, out], "llvm")
-        dev = tvm.cpu()
-        tvm_data = tvm.nd.array(np_data, dev)
-        tvm_valid_count = tvm.nd.array(np_valid_count, dev)
-        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev)
-        f(tvm_data, tvm_valid_count, tvm_out)
-    """
-    batch_size = data.shape[0]
-    num_anchors = data.shape[1]
-    if isinstance(max_output_size, int):
-        max_output_size = tvm.tir.const(max_output_size, dtype="int32")
-    if isinstance(iou_threshold, float):
-        iou_threshold = tvm.tir.const(iou_threshold, dtype=data.dtype)
-    score_axis = score_index
-    score_shape = (batch_size, num_anchors)
-    score_tensor = te.compute(score_shape, lambda i, j: data[i, j, score_axis])
-    sort_tensor = argsort(score_tensor, valid_count=valid_count, axis=1, is_ascend=False)
-
-    out, box_indices = hybrid_nms(
-        data,
-        sort_tensor,
-        valid_count,
-        indices,
-        batch_size,
-        num_anchors,
-        max_output_size,
-        iou_threshold,
-        tvm.tir.const(force_suppress, dtype="bool"),
-        tvm.tir.const(top_k, dtype="int32"),
-        tvm.tir.const(coord_start, dtype="int32"),
-        tvm.tir.const(score_index, dtype="int32"),
-        tvm.tir.const(id_index, dtype="int32"),
-        tvm.tir.const(return_indices, dtype="bool"),
-        zero=tvm.tir.const(0, dtype=data.dtype),
-        one=tvm.tir.const(1, dtype=data.dtype),
-    )
-
-    if return_indices:
-        return hybrid_rearrange_indices_out(
-            box_indices,
-            one=tvm.tir.const(1, dtype="int32"),
-            batch_size=batch_size,
-            num_anchors=num_anchors,
-        )
-
-    if invalid_to_bottom:
-        out = hybrid_rearrange_box_out(
-            out,
-            one=tvm.tir.const(1, dtype=data.dtype),
-            batch_size=batch_size,
-            num_anchors=num_anchors,
-        )
-    return out
-
-
-def _nms_loop(
-    ib,
-    batch_size,
-    top_k,
-    iou_threshold,
-    max_output_size,
-    valid_count,
-    on_new_valid_box_func,
-    on_new_invalidated_box_func,
-    needs_bbox_check_func,
-    calc_overlap_func,
-    out_scores,
-    num_valid_boxes,
-):
-    def nms_inner_loop(ib, i, j, nkeep, num_valid_boxes_local):
-        # The box j is valid, invalidate other boxes that overlap with j above iou_threshold
-        on_new_valid_box_func(ib, 0, num_valid_boxes_local[0], i, j)
-        num_valid_boxes_local[0] += 1
-
-        num_boxes_to_check = nkeep - (j + 1)
-
-        with ib.for_range(0, num_boxes_to_check, name="_k", kind="parallel") as _k:
-            k = j + 1 + _k
-
-            with ib.if_scope(
-                tvm.tir.all(
-                    k < nkeep,
-                    out_scores[i, k] > 0,  # is the box k still valid?
-                    needs_bbox_check_func(i, j, k),
-                )
-            ):
-                iou = calc_overlap_func(i, j, k)
-
-                with ib.if_scope(iou >= iou_threshold):
-                    # invalidate the box k
-                    out_scores[i, k] = -1.0
-                    on_new_invalidated_box_func(i, k)
-
-    with ib.for_range(0, batch_size, name="i") as i:
-        nkeep = if_then_else(tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i])
-        max_output_size = if_then_else(max_output_size > 0, max_output_size, nkeep)
-
-        with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
-            num_valid_boxes_local = ib.allocate(
-                "int32", (1,), name="num_valid_boxes_local", scope="local"
-            )
-            box_idx = ib.allocate("int32", (1,), name="box_idx", scope="local")
-            num_valid_boxes_local[0] = 0
-            box_idx[0] = 0
-
-            # Apply nms
-            # No need to do more iteration if we have already reached max_output_size boxes
-            with ib.while_loop(
-                tvm.tir.all(box_idx[0] < nkeep, num_valid_boxes_local[0] < max_output_size)
-            ):
-                # Proceed to the inner loop if the box with id box_idx is still valid
-                with ib.if_scope(out_scores[i, box_idx[0]] > -1.0):
-                    nms_inner_loop(ib, i, box_idx[0], nkeep, num_valid_boxes_local)
-                box_idx[0] += 1
-
-            num_valid_boxes[i] = num_valid_boxes_local[0]
-
-        with ib.else_scope():
-            num_valid_boxes[i] = 0
-
-    return ib.get()
-
-
-def _get_valid_box_count(scores, score_threshold):
-    batch_classes, num_boxes = scores.shape
-
-    def searchsorted_ir(scores, valid_count):
-        ib = tvm.tir.ir_builder.create()
-        scores = ib.buffer_ptr(scores)
-        valid_count = ib.buffer_ptr(valid_count)
-
-        with ib.for_range(0, batch_classes, name="i", kind="parallel") as i:
-            binary_search(ib, i, num_boxes, scores, score_threshold, valid_count)
-
-        return ib.get()
-
-    scores_buf = tvm.tir.decl_buffer(scores.shape, scores.dtype, "scores_buf", data_alignment=8)
-
-    return te.extern(
-        [(batch_classes,)],
-        [scores],
-        lambda ins, outs: searchsorted_ir(ins[0], outs[0]),
-        dtype=["int32"],
-        in_buffers=[scores_buf],
-        name="searchsorted",
-        tag="searchsorted",
-    )
-
-
-def _collect_selected_indices_ir(num_class, selected_indices, num_detections, row_offsets, out):
-    batch_classes, _ = selected_indices.shape
-
-    ib = tvm.tir.ir_builder.create()
-
-    selected_indices = ib.buffer_ptr(selected_indices)
-    num_detections = ib.buffer_ptr(num_detections)
-    row_offsets = ib.buffer_ptr(row_offsets)
-    out = ib.buffer_ptr(out)
-
-    with ib.for_range(0, batch_classes, name="i", kind="parallel") as i:
-        i = cast(i, "int64")
-        batch_id = i // num_class
-        class_id = i % num_class
-
-        with ib.for_range(0, num_detections[i], name="j") as j:
-            out[row_offsets[i] + j, 0] = batch_id
-            out[row_offsets[i] + j, 1] = class_id
-            out[row_offsets[i] + j, 2] = cast(selected_indices[i, j], "int64")
-
-    return ib.get()
-
-
-def _collect_selected_indices_and_scores_ir(
-    selected_indices,
-    selected_scores,
-    num_detections,
-    row_offsets,
-    num_total_detections,
-    collected_indices,
-    collected_scores,
-):
-    batch_size, num_class = row_offsets.shape
-    num_boxes = selected_indices.shape[1]
-
-    ib = tvm.tir.ir_builder.create()
-
-    selected_indices = ib.buffer_ptr(selected_indices)
-    selected_scores = ib.buffer_ptr(selected_scores)
-    num_detections = ib.buffer_ptr(num_detections)
-    row_offsets = ib.buffer_ptr(row_offsets)
-    num_total_detections = ib.buffer_ptr(num_total_detections)
-    collected_indices = ib.buffer_ptr(collected_indices)
-    collected_scores = ib.buffer_ptr(collected_scores)
-    zero = cast(0, "int64")
-
-    with ib.for_range(0, batch_size * num_class, name="i", kind="parallel") as i:
-        i = cast(i, "int64")
-        batch_id = i // num_class
-        class_id = i % num_class
-
-        with ib.for_range(0, num_boxes, name="j") as j:
-            with ib.if_scope(j < num_detections[batch_id, class_id]):
-                offset = row_offsets[batch_id, class_id] + j
-                collected_indices[batch_id, offset, 0] = class_id
-                collected_indices[batch_id, offset, 1] = cast(selected_indices[i, j], "int64")
-                collected_scores[batch_id, offset] = selected_scores[i, j]
-            with ib.else_scope():
-                offset = (
-                    num_total_detections[batch_id]
-                    + class_id * num_boxes
-                    - row_offsets[batch_id, class_id]
-                    + j
-                    - num_detections[batch_id, class_id]
-                )
-                collected_indices[batch_id, offset, 0] = zero
-                collected_indices[batch_id, offset, 1] = zero
-                collected_scores[batch_id, offset] = 0.0
-
-    return ib.get()
-
-
-def all_class_non_max_suppression(
-    boxes,
-    scores,
-    max_output_boxes_per_class,
-    iou_threshold,
-    score_threshold,
-    output_format="onnx",
-):
-    """Non-maximum suppression operator for object detection, corresponding to ONNX
-    NonMaxSuppression and TensorFlow combined_non_max_suppression.
-    NMS is performed for each class separately.
-
-    Parameters
-    ----------
-    boxes : tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_boxes, 4)
-
-    scores: tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_classes, num_boxes)
-
-    max_output_boxes_per_class : int or tvm.te.Tensor, optional
-        The maxinum number of output selected boxes per class
-
-    iou_threshold : float or tvm.te.Tensor, optionaIl
-        IoU test threshold
-
-    score_threshold : float or tvm.te.Tensor, optional
-        Score threshold to filter out low score boxes early
-
-    output_format : str, optional
-        "onnx" or "tensorflow", see below.
-
-    Returns
-    -------
-    out : list of tvm.te.Tensor
-        If `output_format` is "onnx", the output is two tensors. The first is `indices` of size
-        `(batch_size * num_class* num_boxes , 3)` and the second is a scalar tensor
-        `num_total_detection` of shape `(1,)` representing the total number of selected
-        boxes. The three values in `indices` encode batch, class, and box indices.
-        Rows of `indices` are ordered such that selected boxes from batch 0, class 0 come
-        first, in descending of scores, followed by boxes from batch 0, class 1 etc. Out of
-        `batch_size * num_class* num_boxes` rows of indices, only the first `num_total_detection`
-        rows are valid.
-
-        If `output_format` is "tensorflow", the output is three tensors, the first
-        is `indices` of size `(batch_size, num_class * num_boxes , 2)`, the second is `scores` of
-        size `(batch_size, num_class * num_boxes)`, and the third is `num_total_detection` of size
-        `(batch_size,)` representing the total number of selected boxes per batch. The two values
-        in `indices` encode class and box indices. Of num_class * num_boxes boxes in `indices` at
-        batch b, only the first `num_total_detection[b]` entries are valid. The second axis of
-        `indices` and `scores` are sorted within each class by box scores, but not across classes.
-        So the box indices and scores for the class 0 come first in a sorted order, followed by
-        the class 1 etc.
-    """
-    batch, num_class, num_boxes = scores.shape
-    scores = reshape(scores, (batch * num_class, num_boxes))
-
-    sorted_indices = argsort(scores, axis=1, is_ascend=False, dtype="int32")
-    sorted_scores = gather(scores, 1, sorted_indices)
-
-    valid_count = _get_valid_box_count(sorted_scores, score_threshold)
-
-    selected_indices, selected_scores, num_detections = run_all_class_nms(
-        boxes,
-        sorted_scores,
-        sorted_indices,
-        valid_count,
-        max_output_boxes_per_class,
-        iou_threshold,
-        _nms_loop,
-        return_scores=(output_format == "tensorflow"),
-    )
-
-    if output_format == "onnx":
-        row_offsets = cumsum(num_detections, exclusive=True, dtype="int64")
-        num_total_detections = reduction.sum(cast(num_detections, "int64"), axis=1)
-
-        selected_indices = collect_selected_indices(
-            num_class, selected_indices, num_detections, row_offsets, _collect_selected_indices_ir
-        )
-        return [selected_indices, num_total_detections]
-
-    num_detections_per_batch = reshape(num_detections, (batch, num_class))
-    row_offsets = cumsum(num_detections_per_batch, exclusive=True, dtype="int64", axis=1)
-    num_total_detections = reduction.sum(cast(num_detections_per_batch, "int64"), axis=1)
-
-    selected_indices, selected_scores = collect_selected_indices_and_scores(
-        selected_indices,
-        selected_scores,
-        num_detections_per_batch,
-        row_offsets,
-        num_total_detections,
-        _collect_selected_indices_and_scores_ir,
-    )
-
-    return [selected_indices, selected_scores, num_total_detections]
-
-
-@hybrid.script
-def hybrid_regular_nms(
-    boxes,
-    scores,
-    max_detections_per_class,
-    max_detections,
-    batch_size,
-    num_boxes,
-    num_classes,
-    num_classes_with_background,
-    iou_threshold,
-    score_threshold,
-):
-    """Hybrid routing for regular non-maximum suppression.
-
-    Parameters
-    ----------
-    boxes : tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_boxes, 4)
-
-    scores: tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_boxes, num_classes_with_background)
-
-    max_detections_per_class : tvm.tir.const
-        The maxinum number of output selected boxes per class
-
-    max_detections : tvm.tir.const
-        The maxinum number of output selected boxes
-
-    batch_size : tvm.tir.IntImm or tvm.tir.Var
-        The number of batches
-
-    num_boxes : tvm.tir.IntImm or tvm.tir.Var
-        The number of bounding boxes
-
-    num_classes : tvm.tir.const
-        The number of classes without background
-
-    num_classes_with_background : tvm.tir.IntImm or tvm.tir.Var
-        The number of classes including background ones
-
-    iou_threshold : tvm.tir.const
-        IoU test threshold
-
-    score_threshold : tvm.tir.const
-        Score threshold to filter out low score boxes early
-
-    Returns
-    -------
-    detection_boxes : tvm.te.Tensor
-        3-D tensor with shape [batch_size, max_detections, 4].
-
-    detection_classes : tvm.te.Tensor
-        2-D tensor with shape [batch_size, max_detections].
-
-    detection_scores : tvm.te.Tensor
-        2-D tensor with shape [batch_size, max_detections].
-
-    num_detections : tvm.te.Tensor
-        1-D tensor with shape [batch_size].
-    """
-    # output tensors
-    detection_boxes = output_tensor((batch_size, max_detections, 4), boxes.dtype)
-    detection_classes = output_tensor((batch_size, max_detections), "int32")
-    detection_scores = output_tensor((batch_size, max_detections), scores.dtype)
-    num_detections = output_tensor((batch_size,), "int32")
-
-    # scratch buffers
-    class_scores = allocate((num_boxes,), scores.dtype)
-    keep_indices = allocate((num_boxes,), "int32")
-    keep_scores = allocate((num_boxes,), scores.dtype)
-    sorted_indices = allocate((max_detections + num_boxes,), "int32")
-    sorted_scores = allocate((max_detections + num_boxes,), scores.dtype)
-    active_box_candidate = allocate((num_boxes,), "int32")
-    selected = allocate((num_boxes,), "int32")
-    box_indices_after_regular_nms = allocate((max_detections + num_boxes,), "int32")
-    scores_after_regular_nms = allocate((max_detections + num_boxes,), scores.dtype)
-
-    label_offset = num_classes_with_background - num_classes
-    tmp_idx = 0
-
-    for batch_idx in range(batch_size):
-        size_of_sorted_indices = 0
-
-        for class_id in range(num_classes):
-            for box_id in range(num_boxes):
-                # get scores of boxes corresponding to all anchors for single class
-                class_scores[box_id] = scores[batch_idx, box_id, class_id + label_offset]
-
-            # perform non-maximal suppression on single class
-
-            # select detections above score threshold
-            num_scores_kept = 0
-            for i in range(num_boxes):
-                if class_scores[i] >= score_threshold:
-                    keep_scores[num_scores_kept] = class_scores[i]
-                    keep_indices[num_scores_kept] = i
-                    num_scores_kept += 1
-
-            # iota
-            for i in range(num_scores_kept):
-                sorted_indices[i] = i
-            # decreasing sort of scores
-            for i in range(num_scores_kept):
-                for j in range(num_scores_kept - i - 1):
-                    if keep_scores[sorted_indices[j]] < keep_scores[sorted_indices[j + 1]]:
-                        tmp_idx = sorted_indices[j]
-                        sorted_indices[j] = sorted_indices[j + 1]
-                        sorted_indices[j + 1] = tmp_idx
-
-            selected_size = 0
-
-            for i in range(num_scores_kept):
-                active_box_candidate[i] = 1
-
-            num_active_candidate = num_scores_kept
-            for i in range(num_scores_kept):
-                if (
-                    num_active_candidate != 0
-                    and selected_size < min(num_scores_kept, max_detections_per_class)
-                    and active_box_candidate[i] == 1
-                ):
-                    selected[selected_size] = keep_indices[sorted_indices[i]]
-                    selected_size += 1
-
-                    active_box_candidate[i] = 0
-                    num_active_candidate -= 1
-
-                    for j in range(i + 1, num_scores_kept):
-                        if active_box_candidate[j] == 1:
-                            # compute IOU
-                            i_ymin = boxes[batch_idx, keep_indices[sorted_indices[i]], 0]
-                            i_xmin = boxes[batch_idx, keep_indices[sorted_indices[i]], 1]
-                            i_ymax = boxes[batch_idx, keep_indices[sorted_indices[i]], 2]
-                            i_xmax = boxes[batch_idx, keep_indices[sorted_indices[i]], 3]
-
-                            j_ymin = boxes[batch_idx, keep_indices[sorted_indices[j]], 0]
-                            j_xmin = boxes[batch_idx, keep_indices[sorted_indices[j]], 1]
-                            j_ymax = boxes[batch_idx, keep_indices[sorted_indices[j]], 2]
-                            j_xmax = boxes[batch_idx, keep_indices[sorted_indices[j]], 3]
-
-                            area_i = (i_ymax - i_ymin) * (i_xmax - i_xmin)
-                            area_j = (j_ymax - j_ymin) * (j_xmax - j_xmin)
-
-                            iou = 0.0
-                            if area_i > 0 and area_j > 0:
-                                intersection_ymin = max(i_ymin, j_ymin)
-                                intersection_xmin = max(i_xmin, j_xmin)
-                                intersection_ymax = min(i_ymax, j_ymax)
-                                intersection_xmax = min(i_xmax, j_xmax)
-                                intersection_area = max(
-                                    intersection_ymax - intersection_ymin, 0.0
-                                ) * max(intersection_xmax - intersection_xmin, 0.0)
-                                iou = intersection_area / (area_i + area_j - intersection_area)
-
-                            if iou > iou_threshold:
-                                active_box_candidate[j] = 0
-                                num_active_candidate -= 1
-
-            # end of non-maximal suppression on single class
-
-            # add selected indices from non-max suppression of boxes in this class
-            output_index = size_of_sorted_indices
-            for i in range(selected_size):
-                selected_index = selected[i]
-
-                box_indices_after_regular_nms[output_index] = (
-                    selected_index * num_classes_with_background + class_id + label_offset
-                )
-                scores_after_regular_nms[output_index] = class_scores[selected_index]
-
-                output_index += 1
-
-            # sort the max scores among the selected indices
-            # get the indices for top scores
-            num_indices_to_sort = min(output_index, max_detections)
-
-            # iota
-            for i in range(output_index):
-                sorted_indices[i] = i
-            # deacreasing sort of scores
-            for i in range(output_index):
-                for j in range(output_index - i - 1):
-                    if (
-                        scores_after_regular_nms[sorted_indices[j]]
-                        < scores_after_regular_nms[sorted_indices[j + 1]]
-                    ):
-                        tmp_idx = sorted_indices[j]
-                        sorted_indices[j] = sorted_indices[j + 1]
-                        sorted_indices[j + 1] = tmp_idx
-
-            # copy values to temporary vectors
-            for i in range(num_indices_to_sort):
-                sorted_scores[i] = scores_after_regular_nms[sorted_indices[i]]
-                sorted_indices[i] = box_indices_after_regular_nms[sorted_indices[i]]
-
-            # copy scores and indices from temporary vectors
-            for i in range(num_indices_to_sort):
-                box_indices_after_regular_nms[i] = sorted_indices[i]
-                scores_after_regular_nms[i] = sorted_scores[i]
-
-            size_of_sorted_indices = num_indices_to_sort
-
-        # fill output tensors
-        for output_box_index in range(max_detections):
-            box_ymin = 0.0
-            box_xmin = 0.0
-            box_ymax = 0.0
-            box_xmax = 0.0
-            class_idx = 0
-            selected_score = 0.0
-
-            if output_box_index < size_of_sorted_indices:
-                anchor_idx = (
-                    box_indices_after_regular_nms[output_box_index] // num_classes_with_background
-                )
-
-                box_ymin = boxes[batch_idx, anchor_idx, 0]
-                box_xmin = boxes[batch_idx, anchor_idx, 1]
-                box_ymax = boxes[batch_idx, anchor_idx, 2]
-                box_xmax = boxes[batch_idx, anchor_idx, 3]
-                class_idx = (
-                    box_indices_after_regular_nms[output_box_index]
-                    - anchor_idx * num_classes_with_background
-                    - label_offset
-                )
-                selected_score = scores_after_regular_nms[output_box_index]
-
-            detection_boxes[batch_idx, output_box_index, 0] = box_ymin
-            detection_boxes[batch_idx, output_box_index, 1] = box_xmin
-            detection_boxes[batch_idx, output_box_index, 2] = box_ymax
-            detection_boxes[batch_idx, output_box_index, 3] = box_xmax
-            detection_classes[batch_idx, output_box_index] = class_idx
-            detection_scores[batch_idx, output_box_index] = selected_score
-
-        num_detections[batch_idx] = size_of_sorted_indices
-
-    return detection_boxes, detection_classes, detection_scores, num_detections
-
-
-def regular_non_max_suppression(
-    boxes,
-    scores,
-    max_detections_per_class,
-    max_detections,
-    num_classes,
-    iou_threshold,
-    score_threshold,
-):
-    """Regular non-maximum suppression operator for object detection, corresponding to TFLite's
-    regular NMS. NMS is performed for each class separately.
-
-    Parameters
-    ----------
-    boxes : tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_boxes, 4). The four values in boxes
-        encode (ymin, xmin, ymax, xmax) coordinates of a box
-
-    scores: tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_boxes, num_classes_with_background)
-
-    max_detections_per_class : int
-        The maxinum number of output selected boxes per class
-
-    max_detections : int
-        The maxinum number of output selected boxes
-
-    num_classes : int
-        The number of classes without background
-
-    iou_threshold : float
-        IoU test threshold
-
-    score_threshold : float
-        Score threshold to filter out low score boxes early
-
-    Returns
-    -------
-    out : list of tvm.te.Tensor
-        The output is a list of four tensors. The first is `detection_boxes` of size
-        `(batch_size, max_detections , 4)`, the second is `detection_classes` of size
-        `(batch_size, max_detections)`, the third is `detection_scores` of size
-        `(batch_size, max_detections)`, and the fourth is `num_detections` of size `(batch_size,)`
-        representing the total number of selected boxes per batch.
-    """
-    batch_size, num_boxes, num_classes_with_background = scores.shape
-
-    detection_boxes, detection_classes, detection_scores, num_detections = hybrid_regular_nms(
-        boxes=boxes,
-        scores=scores,
-        max_detections_per_class=tvm.tir.const(max_detections_per_class, dtype="int32"),
-        max_detections=tvm.tir.const(max_detections, dtype="int32"),
-        batch_size=batch_size,
-        num_boxes=num_boxes,
-        num_classes=tvm.tir.const(num_classes, dtype="int32"),
-        num_classes_with_background=num_classes_with_background,
-        iou_threshold=tvm.tir.const(iou_threshold, dtype="float32"),
-        score_threshold=tvm.tir.const(score_threshold, dtype="float32"),
-    )
-
-    return [
-        detection_boxes,
-        cast(detection_classes, dtype="float32"),
-        detection_scores,
-        num_detections,
-    ]
diff --git a/python/tvm/topi/vision/nms_util.py b/python/tvm/topi/vision/nms_util.py
deleted file mode 100644
index d12592fd111a..000000000000
--- a/python/tvm/topi/vision/nms_util.py
+++ /dev/null
@@ -1,338 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Common utilities used in Non-maximum suppression operators"""
-import tvm
-from tvm import te
-
-
-def _get_boundaries(output, box_idx):
-    l = tvm.te.min(
-        output[box_idx],
-        output[box_idx + 2],
-    )
-    t = tvm.te.min(
-        output[box_idx + 1],
-        output[box_idx + 3],
-    )
-    r = tvm.te.max(
-        output[box_idx],
-        output[box_idx + 2],
-    )
-    b = tvm.te.max(
-        output[box_idx + 1],
-        output[box_idx + 3],
-    )
-    return l, t, r, b
-
-
-def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
-    """Calculate overlap of two boxes."""
-    a_l, a_t, a_r, a_b = _get_boundaries(out_tensor, box_a_idx)
-    b_l, b_t, b_r, b_b = _get_boundaries(out_tensor, box_b_idx)
-
-    # Overlapping width and height
-    w = tvm.te.max(0.0, tvm.te.min(a_r, b_r) - tvm.te.max(a_l, b_l))
-    h = tvm.te.max(0.0, tvm.te.min(a_b, b_b) - tvm.te.max(a_t, b_t))
-
-    # Overlapping area
-    area = h * w
-
-    # total area of the figure formed by box a and box b
-    # except for overlapping area
-    u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area
-    return tvm.tir.Select(u <= 0.0, 0.0, area / u)
-
-
-def binary_search(ib, y, num_boxes, scores, score_threshold, out):
-    """Binary search for score_threshold on scores sorted in descending order"""
-    lo = ib.allocate("int32", (1,), name="lo", scope="local")
-    hi = ib.allocate("int32", (1,), name="hi", scope="local")
-
-    lo[0] = 0
-    hi[0] = num_boxes
-
-    with ib.while_loop(lo[0] < hi[0]):
-        mid = (hi[0] + lo[0]) >> 1
-        with ib.if_scope(scores[y, mid] > score_threshold):
-            lo[0] = mid + 1
-        with ib.else_scope():
-            hi[0] = mid
-
-    out[y] = lo[0]
-
-
-def collect_selected_indices(num_class, selected_indices, num_detections, row_offsets, ir):
-    """Collect selected indices from the core NMS loop into one linear output
-
-    Parameters
-    ----------
-    num_class : int
-
-    selected_indices: tvm.te.Tensor
-        2-D tensor with shape (batch_size * num_classes, num_boxes), representing the indices
-        of selected boxes by the core NMS loop.
-
-    num_detections tvm.te.Tensor
-        1-D tensor with shape (batch_size * num_classes,), representing
-        the number of boxes selected by the core NMS loop, per batch and class
-
-    row_offsets tvm.te.Tensor
-        1-D tensor with shape (batch_size * num_classes,), this should be the exclusive scan
-        of num_detections
-
-    ir : function
-        A function to generate IR for CPU or GPU, see its usage in vision/nms.py and cuda/nms.py
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        The output is indices of size (batch_size * num_class* num_boxes , 3).
-        Rows of indices are ordered such that selected boxes from batch 0, class 0 come
-        first, in descending of scores, followed by boxes from batch 0, class 1 etc.
-    """
-    batch_class, num_boxes = selected_indices.shape
-    return te.extern(
-        [(batch_class * num_boxes, 3)],
-        [selected_indices, num_detections, row_offsets],
-        lambda ins, outs: ir(num_class, ins[0], ins[1], ins[2], outs[0]),
-        dtype=["int64"],
-        name="collect_indices",
-        tag="collect_indices",
-    )
-
-
-def collect_selected_indices_and_scores(
-    selected_indices, selected_scores, num_detections, row_offsets, num_total_detections, ir
-):
-    """Collect selected indices and scores from the core NMS loop into one linear output
-
-    Parameters
-    ----------
-    num_class : int
-
-    selected_indices: tvm.te.Tensor
-        2-D tensor with shape (batch_size * num_classes, num_boxes), representing the indices
-        of selected boxes by the core NMS loop.
-
-    selected_indices: tvm.te.Tensor
-        2-D tensor with shape (batch_size * num_classes, num_boxes), representing the scores
-        of selected boxes by the core NMS loop.
-
-    num_detections tvm.te.Tensor
-        2-D tensor with shape (batch_size, num_classes), representing
-        the number of boxes selected by the core NMS loop, per batch and class
-
-    row_offsets tvm.te.Tensor
-        2-D tensor with shape (batch_size, num_classes), this should be the exclusive scan
-        of num_detections along axis 1
-
-    ir : function
-        A function to generate IR for CPU or GPU, see its usage in vision/nms.py and cuda/nms.py
-
-    Returns
-    -------
-    out : [tvm.te.Tensor, tvm.te.Tensor]
-        The output is two tensors. The first is indices of size
-        (batch_size, num_class* num_boxes, 2), and the second is scores of size
-        (batch_size, num_class* num_boxes).
-    """
-    batch_size, num_class = row_offsets.shape
-    num_boxes = selected_indices.shape[1]
-    return te.extern(
-        [(batch_size, num_class * num_boxes, 2), (batch_size, num_class * num_boxes)],
-        [selected_indices, selected_scores, num_detections, row_offsets, num_total_detections],
-        lambda ins, outs: ir(ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], outs[1]),
-        dtype=["int64", "float32"],
-        name="collect_indices_and_scores",
-        tag="collect_indices_and_scores",
-    )
-
-
-def _all_class_nms_ir(
-    boxes,
-    sorted_scores,
-    sorted_indices,
-    valid_count,
-    batch_class,
-    num_class,
-    num_anchors,
-    iou_threshold,
-    max_output_size_per_class,
-    box_indices,
-    selected_scores,
-    num_valid_boxes,
-    nms_loop,
-):
-    ib = tvm.tir.ir_builder.create()
-    boxes = ib.buffer_ptr(boxes)
-    sorted_scores = ib.buffer_ptr(sorted_scores)
-    sorted_indices = ib.buffer_ptr(sorted_indices)
-    valid_count = ib.buffer_ptr(valid_count)
-    box_indices = ib.buffer_ptr(box_indices)
-    num_valid_boxes = ib.buffer_ptr(num_valid_boxes)
-
-    if selected_scores is not None:
-        selected_scores = ib.buffer_ptr(selected_scores)
-
-    if isinstance(iou_threshold, float):
-        iou_threshold = tvm.tir.FloatImm("float32", iou_threshold)
-
-    if isinstance(max_output_size_per_class, int):
-        max_output_size_per_class = tvm.tir.const(max_output_size_per_class)
-
-    def calc_overlap(i, j, k):
-        offset_j = sorted_indices[i, j] * 4
-        offset_k = sorted_indices[i, k] * 4
-        batch_id = i // num_class
-        base_bbox_idx = batch_id * num_anchors * 4
-        return calculate_overlap(
-            boxes,
-            base_bbox_idx + offset_j,
-            base_bbox_idx + offset_k,
-        )
-
-    def on_new_valid_box(ib, tid, num_current_valid_box, i, j):
-        with ib.if_scope(tid + 0 == 0):
-            box_indices[i, num_current_valid_box] = sorted_indices[i, j]
-
-            if selected_scores is not None:
-                selected_scores[i, num_current_valid_box] = sorted_scores[i, j]
-
-    def on_new_invalidated_box(*_):
-        pass
-
-    def needs_bbox_check(*_):
-        return tvm.tir.const(True)
-
-    return nms_loop(
-        ib,
-        batch_class,
-        tvm.tir.IntImm("int32", -1),  # top_k
-        iou_threshold,
-        max_output_size_per_class,
-        valid_count,
-        on_new_valid_box,
-        on_new_invalidated_box,
-        needs_bbox_check,
-        calc_overlap,
-        sorted_scores,
-        num_valid_boxes,
-    )
-
-
-def run_all_class_nms(
-    boxes,
-    sorted_scores,
-    sorted_indices,
-    valid_count,
-    max_output_size_per_class,
-    iou_threshold,
-    nms_loop,
-    return_scores=False,
-):
-    """The core all class NMS routine
-
-    Parameters
-    ----------
-    boxes : tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_boxes, 4)
-
-    sorted_scores: tvm.te.Tensor
-        2-D tensor with shape (batch_size * num_classes, num_boxes)
-        One of the outputs from argsort
-
-    sorted_indices: tvm.te.Tensor
-        2-D tensor with shape (batch_size * num_classes, num_boxes)
-        The other output from argsort
-
-    valid_count: tvm.te.Tensor
-        1-D tensor with shape (batch_size * num_classes,), representing
-        the number of boxes whose score is above score_threshold, per batch and class
-
-    max_output_boxes_per_class : int or tvm.te.Tensor, optional
-        The maxinum number of output selected boxes per class
-
-    iou_threshold : float or tvm.te.Tensor, optionaIl
-        IoU test threshold
-
-    nms_loop : function
-        A core NMS loop, see its usage in vision/nms.py and cuda/nms.py
-
-    return_scores : bool, optional
-        Whether or not to return selected scores, needed by the tensorflow output format.
-
-    Returns
-    -------
-    out : a list of tvm.te.Tensor
-        The output is three tensors, the first and second are indices and scores of size
-        (batch_size * num_class, num_boxes), and the third is a tensor
-        num_selected_boxes of shape (batch_size * num_class,) representing the total number of
-        selected boxes per batch and class. If return_scores is False, the second output is
-        None.
-    """
-    batch, num_boxes, _ = boxes.shape
-    batch_class = sorted_scores.shape[0]
-    num_class = batch_class // batch
-
-    if return_scores is False:
-        selected_indices, num_detections = te.extern(
-            [(batch_class, num_boxes), (1, batch_class)],
-            [boxes, sorted_scores, sorted_indices, valid_count],
-            lambda ins, outs: _all_class_nms_ir(
-                ins[0],  # boxes
-                ins[1],  # sorted_scores
-                ins[2],  # sorted_indices
-                ins[3],  # valid_count
-                batch_class,
-                num_class,
-                num_boxes,
-                iou_threshold,
-                max_output_size_per_class,
-                outs[0],  # box_indices
-                None,  # scores
-                outs[1],  # num_selected_boxes
-                nms_loop,
-            ),
-            dtype=["int32", "int32"],
-            name="all_class_nms",
-            tag="all_class_nms",
-        )
-        return selected_indices, None, num_detections
-
-    return te.extern(
-        [(batch_class, num_boxes), (batch_class, num_boxes), (1, batch_class)],
-        [boxes, sorted_scores, sorted_indices, valid_count],
-        lambda ins, outs: _all_class_nms_ir(
-            ins[0],  # boxes
-            ins[1],  # sorted_scores
-            ins[2],  # sorted_indices
-            ins[3],  # valid_count
-            batch_class,
-            num_class,
-            num_boxes,
-            iou_threshold,
-            max_output_size_per_class,
-            outs[0],  # box_indices
-            outs[1],  # selected scores
-            outs[2],  # num_selected_boxes
-            nms_loop,
-        ),
-        dtype=["int32", "float32", "int32"],
-        name="all_class_nms",
-        tag="all_class_nms",
-    )
diff --git a/python/tvm/topi/vision/rcnn/__init__.py b/python/tvm/topi/vision/rcnn/__init__.py
deleted file mode 100644
index e5693e869445..000000000000
--- a/python/tvm/topi/vision/rcnn/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""Faster R-CNN and Mask R-CNN operators"""
-from .roi_align import *
-from .roi_pool import *
-from .proposal import *
diff --git a/python/tvm/topi/vision/rcnn/proposal.py b/python/tvm/topi/vision/rcnn/proposal.py
deleted file mode 100644
index 12a0d6bcf0a0..000000000000
--- a/python/tvm/topi/vision/rcnn/proposal.py
+++ /dev/null
@@ -1,448 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, singleton-comparison, bad-continuation
-"""Proposal operator"""
-import math
-import tvm
-from tvm import te
-from ...utils import get_const_tuple, get_const_int
-from ...sort import argsort
-
-
-def generate_anchor(ratio, scale, base_size):
-    """Generate anchor"""
-    w = h = float(base_size)
-    x_ctr = 0.5 * (w - 1.0)
-    y_ctr = 0.5 * (h - 1.0)
-    size = w * h
-    size_ratios = math.floor(size / ratio)
-    new_w = math.floor(math.sqrt(size_ratios) + 0.5) * scale
-    new_h = math.floor((new_w / scale * ratio) + 0.5) * scale
-    return (
-        x_ctr - 0.5 * (new_w - 1.0),
-        y_ctr - 0.5 * (new_h - 1.0),
-        x_ctr + 0.5 * (new_w - 1.0),
-        y_ctr + 0.5 * (new_h - 1.0),
-    )
-
-
-def reg_bbox(x1, y1, x2, y2, dx, dy, dw, dh):
-    """Bounding box regression function"""
-    bbox_w = x2 - x1 + 1.0
-    bbox_h = y2 - y1 + 1.0
-    ctr_x = x1 + 0.5 * (bbox_w - 1.0)
-    ctr_y = y1 + 0.5 * (bbox_h - 1.0)
-
-    pred_ctr_x = dx * bbox_w + ctr_x
-    pred_ctr_y = dy * bbox_h + ctr_y
-    pred_w = te.exp(dw) * bbox_w
-    pred_h = te.exp(dh) * bbox_h
-
-    pred_x1 = pred_ctr_x - 0.5 * (pred_w - 1.0)
-    pred_y1 = pred_ctr_y - 0.5 * (pred_h - 1.0)
-    pred_x2 = pred_ctr_x + 0.5 * (pred_w - 1.0)
-    pred_y2 = pred_ctr_y + 0.5 * (pred_h - 1.0)
-    return pred_x1, pred_y1, pred_x2, pred_y2
-
-
-def reg_iou(x1, y1, x2, y2, dx1, dy1, dx2, dy2):
-    """Bounding box regression function"""
-    pred_x1 = x1 + dx1
-    pred_y1 = y1 + dy1
-    pred_x2 = x2 + dx2
-    pred_y2 = y2 + dy2
-    return pred_x1, pred_y1, pred_x2, pred_y2
-
-
-def predict_bbox_ir(
-    cls_prob_buf,
-    bbox_pred_buf,
-    im_info_buf,
-    out_buf,
-    scales,
-    ratios,
-    feature_stride,
-    rpn_min_size,
-    iou_loss,
-):
-    """Predict bounding boxes based on anchors, scores and deltas.
-
-    Parameters
-    ----------
-    cls_prob_buf : tvm.te.schedule.Buffer
-        4-D with shape [batch, 2 * num_anchors, height, width]
-
-    bbox_pred_buf : tvm.te.schedule.Buffer
-        4-D with shape [batch, 4 * num_anchors, height, width]
-
-    im_info_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch, 3]
-
-    out_buf : tvm.te.schedule.Buffer
-        3-D with shape [batch, num_bbox, 5]
-        The last dimension is in format of [w_start, h_start, w_end, h_end, score]
-
-    scales : list/tuple of float
-        Scales of anchor windows.
-
-    ratios : list/tuple of float
-        Ratios of anchor windows.
-
-    feature_stride : int
-        The size of the receptive field each unit in the convolution layer of the rpn, for example
-        the product of all stride's prior to this layer.
-
-    rpn_min_size : int
-        Minimum height or width in proposal.
-
-    iou_loss : bool
-        Usage of IoU loss.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch, num_anchors, height, width = get_const_tuple(cls_prob_buf.shape)
-    num_anchors //= 2
-    ib = tvm.tir.ir_builder.create()
-
-    p_score = ib.buffer_ptr(cls_prob_buf)
-    p_delta = ib.buffer_ptr(bbox_pred_buf)
-    p_im_info = ib.buffer_ptr(im_info_buf)
-    p_out = ib.buffer_ptr(out_buf)
-
-    idxm = tvm.tir.indexmod
-    idxd = tvm.tir.indexdiv
-
-    with ib.for_range(0, batch * height * width) as tid:
-        w = idxm(tid, width)
-        h = idxm(idxd(tid, width), height)
-        b = idxd(idxd(tid, width), height)
-
-        for k in range(num_anchors):
-            out_index = tid * num_anchors + k
-            ratio = ratios[k // len(scales)]
-            scale = scales[k % len(scales)]
-            anchor = generate_anchor(ratio, scale, feature_stride)
-            im_height = p_im_info[b * 3]
-            im_width = p_im_info[b * 3 + 1]
-            x1 = anchor[0] + w * feature_stride
-            y1 = anchor[1] + h * feature_stride
-            x2 = anchor[2] + w * feature_stride
-            y2 = anchor[3] + h * feature_stride
-
-            delta = [
-                p_delta[((((b * num_anchors + k) * 4 + i) * height + h) * width + w)]
-                for i in range(4)
-            ]
-            regression_func = reg_iou if iou_loss else reg_bbox
-            pred_x1, pred_y1, pred_x2, pred_y2 = regression_func(x1, y1, x2, y2, *delta)
-
-            pred_x1 = tvm.te.max(tvm.te.min(pred_x1, im_width - 1.0), 0.0)
-            pred_y1 = tvm.te.max(tvm.te.min(pred_y1, im_height - 1.0), 0.0)
-            pred_x2 = tvm.te.max(tvm.te.min(pred_x2, im_width - 1.0), 0.0)
-            pred_y2 = tvm.te.max(tvm.te.min(pred_y2, im_height - 1.0), 0.0)
-
-            real_height = (im_height / feature_stride).astype("int32")
-            real_width = (im_width / feature_stride).astype("int32")
-
-            bbox_w = pred_x2 - pred_x1 + 1.0
-            bbox_h = pred_y2 - pred_y1 + 1.0
-            min_size = p_im_info[b * 3 + 2] * rpn_min_size
-
-            pred_score = p_score[((b * num_anchors * 2 + num_anchors + k) * height + h) * width + w]
-            pred_score = tvm.tir.Select(
-                tvm.tir.any(h >= real_height, w >= real_width), -1.0, pred_score
-            )
-            p_out[out_index * 5 + 0] = pred_x1
-            p_out[out_index * 5 + 1] = pred_y1
-            p_out[out_index * 5 + 2] = pred_x2
-            p_out[out_index * 5 + 3] = pred_y2
-            p_out[out_index * 5 + 4] = pred_score
-
-            with ib.if_scope(tvm.tir.any(bbox_w < min_size, bbox_h < min_size)):
-                p_out[out_index * 5 + 0] -= min_size / 2.0
-                p_out[out_index * 5 + 1] -= min_size / 2.0
-                p_out[out_index * 5 + 2] += min_size / 2.0
-                p_out[out_index * 5 + 3] += min_size / 2.0
-                p_out[out_index * 5 + 4] = -1.0
-
-    return ib.get()
-
-
-def argsort_ir(data_buf, out_index_buf):
-    """Batched odd-even transposition sort.
-
-    Parameters
-    ----------
-    data_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch, num_bbox]
-
-    out_index_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch, num_bbox]. Indices of data in sorted order.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch, num_bbox = get_const_tuple(data_buf.shape)
-    ib = tvm.tir.ir_builder.create()
-    p_data = ib.buffer_ptr(data_buf)
-    index_out = ib.buffer_ptr(out_index_buf)
-    temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local")
-    temp_index = ib.allocate("int32", (1,), name="temp_index", scope="local")
-    idxm = tvm.tir.indexmod
-    with ib.for_range(0, batch, kind="unroll") as b:
-        start = b * num_bbox
-        for i in range(2):
-            with ib.for_range(0, (num_bbox + 1) // 2) as tid:
-                bbox_id = tid * 2 + i
-                with ib.if_scope(bbox_id < num_bbox):
-                    index_out[start + bbox_id] = bbox_id
-        with ib.for_range(0, num_bbox) as k:
-            with ib.for_range(0, (num_bbox + 1) // 2) as tid:
-                offset = start + 2 * tid + idxm(k, 2)
-                with ib.if_scope(
-                    tvm.tir.all(offset + 1 < num_bbox, p_data[offset] < p_data[offset + 1])
-                ):
-                    temp_data[0] = p_data[offset]
-                    p_data[offset] = p_data[offset + 1]
-                    p_data[offset + 1] = temp_data[0]
-                    temp_index[0] = index_out[offset]
-                    index_out[offset] = index_out[offset + 1]
-                    index_out[offset + 1] = temp_index[0]
-    return ib.get()
-
-
-def nms_ir(sorted_bbox_buf, out_buf, nms_threshold):
-    """Non-maximum suppression.
-
-    Parameters
-    ----------
-    sorted_bbox_buf : tvm.te.schedule.Buffer
-        3-D with shape [batch, num_bbox, 5]. The last dimension is in format of
-        [w_start, h_start, w_end, h_end, score].
-
-    out_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box should be removed.
-
-    nms_threshold : float
-        Non-maximum suppression threshold.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-
-    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
-        """Calculate overlap of two boxes."""
-        w = tvm.te.max(
-            0.0,
-            tvm.te.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
-            - tvm.te.max(out_tensor[box_a_idx], out_tensor[box_b_idx])
-            + 1.0,
-        )
-        h = tvm.te.max(
-            0.0,
-            tvm.te.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
-            - tvm.te.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1])
-            + 1.0,
-        )
-        i = w * h
-        u = (
-            (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx] + 1.0)
-            * (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1] + 1.0)
-            + (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx] + 1.0)
-            * (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1] + 1.0)
-            - i
-        )
-        return i / u
-
-    batch, num_bbox = get_const_tuple(out_buf.shape)
-    ib = tvm.tir.ir_builder.create()
-    p_data = ib.buffer_ptr(sorted_bbox_buf)
-    p_out = ib.buffer_ptr(out_buf)
-    with ib.for_range(0, batch, kind="unroll", name="n") as b:
-        base_idx = b * num_bbox
-        for i in range(num_bbox):
-            p_out[base_idx + i] = False
-        with ib.for_range(0, num_bbox - 1) as l:
-            with ib.for_range(0, num_bbox) as i:
-                with ib.if_scope(tvm.tir.all(i < num_bbox, i > l, p_out[base_idx + l] == False)):
-                    iou = calculate_overlap(p_data, (base_idx + l) * 5, (base_idx + i) * 5)
-                    with ib.if_scope(iou > nms_threshold):
-                        p_out[base_idx + i] = True
-    return ib.get()
-
-
-def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf):
-    """Copy output after applying nms to continuous memory.
-
-    Parameters
-    ----------
-    sorted_bbox_buf : tvm.te.schedule.Buffer
-        3-D with shape [batch, num_bbox, 5]. The last dimension is in format of
-        [w_start, h_start, w_end, h_end, score].
-
-    remove_mask_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box should be removed.
-
-    out_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of
-        [batch_index, w_start, h_start, w_end, h_end].
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch, num_bbox, _ = get_const_tuple(sorted_bbox_buf.shape)
-    rpn_post_nms_top_n = get_const_int(out_buf.shape[0]) // batch
-    ib = tvm.tir.ir_builder.create()
-    i = ib.allocate("int32", (batch,), "i", scope="local")
-    p_sorted_bbox = ib.buffer_ptr(sorted_bbox_buf)
-    p_remove = ib.buffer_ptr(remove_mask_buf)
-    p_out = ib.buffer_ptr(out_buf)
-
-    nkeep = ib.allocate("int32", (batch,), "nkeep", scope="local")
-
-    with ib.for_range(0, batch) as b:
-        nkeep[b] = 0
-        i[b] = 0
-
-    with ib.for_range(0, num_bbox) as j:
-        with ib.for_range(0, batch) as b:
-            with ib.if_scope(p_remove[b * num_bbox + j] == False):
-                nkeep[b] += 1
-    with ib.for_range(0, batch) as b:
-        with ib.if_scope(nkeep[b] > 0):
-            with ib.for_range(
-                0, te.ceil(tvm.tir.const(rpn_post_nms_top_n, "float32") / nkeep[b]).astype("int32")
-            ):
-                with ib.for_range(0, num_bbox) as j:
-                    offset_j = (b * num_bbox + j) * 5
-                    offset_i = (b * rpn_post_nms_top_n + i[b]) * 5
-                    with ib.if_scope(
-                        tvm.tir.all(
-                            i[b] < rpn_post_nms_top_n, p_remove[(b * num_bbox + j)] == False
-                        )
-                    ):
-                        p_out[offset_i] = tvm.tir.Cast("float32", b)
-                        with ib.for_range(0, 4, kind="unroll") as k:
-                            p_out[offset_i + k + 1] = p_sorted_bbox[offset_j + k]
-                        i[b] = i[b] + 1
-
-    body = ib.get()
-    return body
-
-
-def proposal(
-    cls_prob,
-    bbox_pred,
-    im_info,
-    scales,
-    ratios,
-    feature_stride,
-    threshold,
-    rpn_pre_nms_top_n,
-    rpn_post_nms_top_n,
-    rpn_min_size,
-    iou_loss,
-):
-    """Proposal operator.
-
-    Parameters
-    ----------
-    cls_prob : tvm.te.Tensor
-        4-D with shape [batch, 2 * num_anchors, height, width]
-
-    bbox_pred : tvm.te.Tensor
-        4-D with shape [batch, 4 * num_anchors, height, width]
-
-    im_info : tvm.te.Tensor
-        2-D with shape [batch, 3]
-
-    scales : list/tuple of float
-        Scales of anchor windows.
-
-    ratios : list/tuple of float
-        Ratios of anchor windows.
-
-    feature_stride : int
-        The size of the receptive field each unit in the convolution layer of the rpn, for example
-        the product of all stride's prior to this layer.
-
-    threshold : float
-        Non-maximum suppression threshold.
-
-    rpn_pre_nms_top_n : int
-        Number of top scoring boxes to apply NMS. -1 to use all boxes.
-
-    rpn_post_nms_top_n : int
-        Number of top scoring boxes to keep after applying NMS to RPN proposals.
-
-    rpn_min_size : int
-        Minimum height or width in proposal.
-
-    iou_loss : bool
-        Usage of IoU loss.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        2-D tensor with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of
-        [batch_index, w_start, h_start, w_end, h_end].
-    """
-    # pylint: disable=unused-argument
-    batch, _, height, width = get_const_tuple(cls_prob.shape)
-    num_anchors = len(scales) * len(ratios)
-    num_bbox = height * width * num_anchors
-    rpn_pre_nms_top_n = min(rpn_pre_nms_top_n, num_bbox) if rpn_pre_nms_top_n > 0 else num_bbox
-
-    bbox = te.extern(
-        (batch, num_bbox, 5),
-        [cls_prob, bbox_pred, im_info],
-        lambda ins, outs: predict_bbox_ir(
-            ins[0], ins[1], ins[2], outs[0], scales, ratios, feature_stride, rpn_min_size, iou_loss
-        ),
-        dtype=bbox_pred.dtype,
-    )
-    score = te.compute((batch, num_bbox), lambda b, i: bbox[b, i, 4], tag="bbox_score")
-    valid_count_shape = (1,)
-    valid_count = te.compute(valid_count_shape, lambda i: num_bbox)
-    sorted_index = argsort(score, valid_count=valid_count, axis=1, is_ascend=False)
-    sorted_bbox = te.compute(
-        (batch, rpn_pre_nms_top_n, 5),
-        lambda b, i, j: bbox[b, sorted_index[b, i], j],
-        tag="sorted_bbox",
-    )
-    nms_remove_mask = te.extern(
-        (batch, rpn_pre_nms_top_n),
-        [sorted_bbox],
-        lambda ins, outs: nms_ir(ins[0], outs[0], threshold),
-        dtype="bool",
-    )
-    nms_out = te.extern(
-        (batch * rpn_post_nms_top_n, 5),
-        [sorted_bbox, nms_remove_mask],
-        lambda ins, outs: prepare_output_ir(ins[0], ins[1], outs[0]),
-        dtype=sorted_bbox.dtype,
-    )
-    return nms_out
diff --git a/python/tvm/topi/vision/rcnn/roi_align.py b/python/tvm/topi/vision/rcnn/roi_align.py
deleted file mode 100644
index 238e02964356..000000000000
--- a/python/tvm/topi/vision/rcnn/roi_align.py
+++ /dev/null
@@ -1,228 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Roi align operator"""
-import tvm
-from tvm import te
-from ...utils import get_const_tuple
-from ...cpp.utils import bilinear_sample_nchw, bilinear_sample_nhwc
-
-
-def _sample_common(
-    i,
-    c,
-    ph,
-    pw,
-    rois,
-    pooled_size_h,
-    pooled_size_w,
-    spatial_scale,
-    sample_ratio,
-    dtype,
-    avg_mode,
-    bilinear_func,
-):
-    roi = rois[i]
-    batch_index = roi[0].astype("int32")
-    roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4]
-    roi_start_h *= spatial_scale
-    roi_end_h *= spatial_scale
-    roi_start_w *= spatial_scale
-    roi_end_w *= spatial_scale
-
-    # force malformed ROIs to be 1x1
-    roi_h = tvm.te.max(roi_end_h - roi_start_h, tvm.tir.const(1.0, dtype))
-    roi_w = tvm.te.max(roi_end_w - roi_start_w, tvm.tir.const(1.0, dtype))
-
-    bin_h = roi_h / pooled_size_h
-    bin_w = roi_w / pooled_size_w
-
-    if sample_ratio > 0:
-        roi_bin_grid_h = roi_bin_grid_w = tvm.tir.const(sample_ratio, "int32")
-    else:
-        roi_bin_grid_h = te.ceil(roi_h / pooled_size_h).astype("int32")
-        roi_bin_grid_w = te.ceil(roi_w / pooled_size_w).astype("int32")
-
-    count = roi_bin_grid_h * roi_bin_grid_w
-    rh = te.reduce_axis((0, roi_bin_grid_h), name="rh")
-    rw = te.reduce_axis((0, roi_bin_grid_w), name="rw")
-    roi_start_h += ph * bin_h
-    roi_start_w += pw * bin_w
-
-    if avg_mode:
-        return te.sum(
-            bilinear_func(
-                batch_index,
-                c,
-                roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
-                roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w,
-            )
-            / count,
-            axis=[rh, rw],
-        )
-    # max mode
-    return te.max(
-        bilinear_func(
-            batch_index,
-            c,
-            roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
-            roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w,
-        ),
-        axis=[rh, rw],
-    )
-
-
-def roi_align_nchw(data, rois, pooled_size, spatial_scale, mode, sample_ratio=-1):
-    """ROI align operator in NCHW layout.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        4-D with shape [batch, channel, height, width]
-
-    rois : tvm.te.Tensor
-        2-D with shape [num_roi, 5]. The last dimension should be in format of
-        [batch_index, w_start, h_start, w_end, h_end]
-
-    pooled_size : int or list/tuple of two ints
-        output size, or [out_height, out_width]
-
-    spatial_scale : float
-        Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal
-        of total stride in convolutional layers, which should be in range (0.0, 1.0]
-
-    mode : int or str
-        There are two modes, average and max. For the average mode, you can pass b'avg' or 0, and
-        for the max mode, you can pass b'max' or 1.
-
-    sample_ratio : int
-        Optional sampling ratio of ROI align, using adaptive size by default.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        4-D with shape [num_roi, channel, pooled_size, pooled_size]
-    """
-    avg_mode = mode in (b"avg", 0)
-    max_mode = mode in (b"max", 1)
-    assert avg_mode or max_mode, "Mode must be avg or max. Please pass in a valid mode."
-    dtype = rois.dtype
-    _, channel, height, width = get_const_tuple(data.shape)
-    num_roi, _ = get_const_tuple(rois.shape)
-
-    if isinstance(pooled_size, int):
-        pooled_size_h = pooled_size_w = pooled_size
-    else:
-        pooled_size_h, pooled_size_w = pooled_size
-
-    def _bilinear(i, c, y, x):
-        outside = tvm.tir.any(y < -1.0, x < -1.0, y > height, x > width)
-        y = tvm.te.min(tvm.te.max(y, 0.0), height - 1)
-        x = tvm.te.min(tvm.te.max(x, 0.0), width - 1)
-        val = bilinear_sample_nchw(data, (i, c, y, x), height - 1, width - 1)
-        return tvm.tir.if_then_else(outside, 0.0, val)
-
-    def _sample(i, c, ph, pw):
-        return _sample_common(
-            i,
-            c,
-            ph,
-            pw,
-            rois,
-            pooled_size_h,
-            pooled_size_w,
-            spatial_scale,
-            sample_ratio,
-            dtype,
-            avg_mode,
-            _bilinear,
-        )
-
-    return te.compute(
-        (num_roi, channel, pooled_size_h, pooled_size_w), _sample, tag="pool,roi_align_nchw"
-    )
-
-
-def roi_align_nhwc(data, rois, pooled_size, spatial_scale, mode, sample_ratio=-1):
-    """ROI align operator in NHWC layout.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        4-D with shape [batch, height, width, channel]
-
-    rois : tvm.te.Tensor
-        2-D with shape [num_roi, 5]. The last dimension should be in format of
-        [batch_index, w_start, h_start, w_end, h_end]
-
-    pooled_size : int or list/tuple of two ints
-        output size, or [out_height, out_width]
-
-    spatial_scale : float
-        Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal
-        of total stride in convolutional layers, which should be in range (0.0, 1.0]
-
-    mode : int or str
-        There are two modes, average and max. For the average mode, you can pass b'avg' or 0, and
-        for the max mode, you can pass b'max' or 1.
-
-    sample_ratio : int
-        Optional sampling ratio of ROI align, using adaptive size by default.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        4-D with shape [num_roi, pooled_size, pooled_size, channel]
-    """
-    avg_mode = mode in (b"avg", 0)
-    max_mode = mode in (b"max", 1)
-    assert avg_mode or max_mode, "Mode must be avg or max. Please pass in a valid mode."
-    dtype = rois.dtype
-    _, height, width, channel = get_const_tuple(data.shape)
-    num_roi, _ = get_const_tuple(rois.shape)
-
-    if isinstance(pooled_size, int):
-        pooled_size_h = pooled_size_w = pooled_size
-    else:
-        pooled_size_h, pooled_size_w = pooled_size
-
-    def _bilinear(i, c, y, x):
-        outside = tvm.tir.any(y < -1.0, x < -1.0, y > height, x > width)
-        y = tvm.te.min(tvm.te.max(y, 0.0), height - 1)
-        x = tvm.te.min(tvm.te.max(x, 0.0), width - 1)
-        val = bilinear_sample_nhwc(data, (i, y, x, c), height - 1, width - 1)
-        return tvm.tir.if_then_else(outside, 0.0, val)
-
-    def _sample(i, ph, pw, c):
-        return _sample_common(
-            i,
-            c,
-            ph,
-            pw,
-            rois,
-            pooled_size_h,
-            pooled_size_w,
-            spatial_scale,
-            sample_ratio,
-            dtype,
-            avg_mode,
-            _bilinear,
-        )
-
-    return te.compute(
-        (num_roi, pooled_size_h, pooled_size_w, channel), _sample, tag="pool,roi_align_nchw"
-    )
diff --git a/python/tvm/topi/vision/rcnn/roi_pool.py b/python/tvm/topi/vision/rcnn/roi_pool.py
deleted file mode 100644
index dd1429bcb3c5..000000000000
--- a/python/tvm/topi/vision/rcnn/roi_pool.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""ROI pool operator"""
-import tvm
-from tvm import te
-from ...utils import get_const_tuple
-
-
-def roi_pool_nchw(data, rois, pooled_size, spatial_scale):
-    """ROI pool operator in NCHW layout.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        4-D with shape [batch, channel, height, width]
-
-    rois : tvm.te.Tensor
-        2-D with shape [num_roi, 5]. The last dimension should be in format of
-        [batch_index, w_start, h_start, w_end, h_end]
-
-    pooled_size : int or list/tuple of two ints
-        output size, or [out_height, out_width]
-
-    spatial_scale : float
-        Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal
-        of total stride in convolutional layers, which should be in range (0.0, 1.0]
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        4-D with shape [num_roi, channel, pooled_size, pooled_size]
-    """
-    dtype = rois.dtype
-    _, channel, height, width = get_const_tuple(data.shape)
-    num_roi, _ = get_const_tuple(rois.shape)
-
-    if isinstance(pooled_size, int):
-        pooled_size_h = pooled_size_w = pooled_size
-    else:
-        pooled_size_h, pooled_size_w = pooled_size
-
-    def _pool(i, c, ph, pw):
-        roi = rois[i]
-        batch_index = roi[0].astype("int32")
-        roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4]
-
-        roi_start_h = te.round(roi_start_h * spatial_scale).astype("int32")
-        roi_start_w = te.round(roi_start_w * spatial_scale).astype("int32")
-        roi_end_h = te.round(roi_end_h * spatial_scale).astype("int32")
-        roi_end_w = te.round(roi_end_w * spatial_scale).astype("int32")
-
-        # force malformed ROIs to be 1x1
-        roi_h = tvm.te.max(roi_end_h - roi_start_h + 1, tvm.tir.const(1, "int32"))
-        roi_w = tvm.te.max(roi_end_w - roi_start_w + 1, tvm.tir.const(1, "int32"))
-
-        bin_h = roi_h.astype(dtype) / pooled_size_h
-        bin_w = roi_w.astype(dtype) / pooled_size_w
-
-        # use epsilon to prevent floating point precision loss in floor/ceil
-        epsilon = tvm.tir.const(0.00001, dtype)
-        hstart = te.floor(ph * bin_h + epsilon).astype("int32")
-        wstart = te.floor(pw * bin_w + epsilon).astype("int32")
-        hend = te.ceil((ph + 1) * bin_h - epsilon).astype("int32")
-        wend = te.ceil((pw + 1) * bin_w - epsilon).astype("int32")
-        hstart = tvm.te.min(tvm.te.max(hstart + roi_start_h, 0), height)
-        wstart = tvm.te.min(tvm.te.max(wstart + roi_start_w, 0), width)
-        hend = tvm.te.min(tvm.te.max(hend + roi_start_h, 0), height)
-        wend = tvm.te.min(tvm.te.max(wend + roi_start_w, 0), width)
-
-        non_empty = tvm.tir.all(hstart < hend, wstart < wend)
-        min_value = lambda dtype: tvm.tir.if_then_else(
-            non_empty, tvm.te.min_value(dtype), tvm.tir.const(0.0, dtype)
-        )
-        # pylint: disable=unnecessary-lambda
-        _max = te.comm_reducer(lambda x, y: tvm.te.max(x, y), min_value, name="max")
-        rh = te.reduce_axis((0, hend - hstart), "rh")
-        rw = te.reduce_axis((0, wend - wstart), "rw")
-        return _max(data[batch_index, c, hstart + rh, wstart + rw], axis=[rh, rw])
-
-    return te.compute((num_roi, channel, pooled_size_h, pooled_size_w), _pool, tag="pool,roi_pool")
diff --git a/python/tvm/topi/vision/reorg.py b/python/tvm/topi/vision/reorg.py
deleted file mode 100644
index 9883085f9f40..000000000000
--- a/python/tvm/topi/vision/reorg.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-REORG Operator
-====================
-Reorg operator, used in darknet.
-"""
-from __future__ import absolute_import as _abs
-from .. import cpp
-
-
-def reorg(data, stride):
-    """Reorg forward operators.
-
-    Parameters
-    ----------
-    Input : tvm.te.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width]
-
-    stride : int
-        Stride value for reorganization
-
-    Returns
-    -------
-    Output : tvm.te.Tensor
-        4-D with shape [batch, out_channel, out_height, out_width]
-    """
-    return cpp.vision.reorg(data, stride)
diff --git a/python/tvm/topi/vision/ssd/__init__.py b/python/tvm/topi/vision/ssd/__init__.py
deleted file mode 100644
index 1ac388da9a1e..000000000000
--- a/python/tvm/topi/vision/ssd/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""VISION network operators"""
-from __future__ import absolute_import as _abs
-
-from .multibox import *
diff --git a/python/tvm/topi/vision/ssd/multibox.py b/python/tvm/topi/vision/ssd/multibox.py
deleted file mode 100644
index 234bfd795328..000000000000
--- a/python/tvm/topi/vision/ssd/multibox.py
+++ /dev/null
@@ -1,369 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable
-"""SSD multibox operators"""
-import tvm
-
-from tvm.te import hybrid
-from tvm.tir import exp, sqrt
-
-from tvm import topi
-
-from ..nms import non_max_suppression
-
-
-@hybrid.script
-def hybrid_multibox_prior(data, sizes, ratios, steps, offsets):
-    """Hybrid routing for multibox_prior operator.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor or numpy NDArray
-        4-D tensor with shape [batch, channel, height, width]]
-
-    sizes : tvm ConsExpr
-        Sizes for anchor boxes.
-
-    ratios : tvm ConsExpr
-        Ratios for anchor boxes.
-
-    steps : tvm ConsExpr
-        Priorbox step across y and x, -1 for auto calculation.
-
-    offsets : tvm ConsExpr
-        Priorbox center offsets, y and x respectively.
-
-    Returns
-    -------
-    output : tvm.te.Tensor or numpy NDArray
-        3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4]
-    """
-    in_height = data.shape[2]
-    in_width = data.shape[3]
-    num_sizes = len(sizes)
-    num_ratios = len(ratios)
-    num_boxes = in_height * in_width * (num_sizes + num_ratios - 1)
-    output = output_tensor((1, num_boxes, 4), "float32")
-    steps_h = steps[0] * 1.0 if steps[0] > 0 else 1.0 / in_height
-    steps_w = steps[1] * 1.0 if steps[1] > 0 else 1.0 / in_width
-    offset_h = offsets[0]
-    offset_w = offsets[1]
-
-    # Need to define var out of const_range + if
-    w = 0.0
-    h = 0.0
-
-    for i in parallel(in_height):
-        center_h = (i + offset_h) * steps_h
-        for j in range(in_width):
-            center_w = (j + offset_w) * steps_w
-            for k in const_range(num_sizes + num_ratios - 1):
-                if k < num_sizes:
-                    w = float32(sizes[k] * in_height) / in_width / 2.0
-                    h = sizes[k] / 2.0
-                else:
-                    w = (
-                        float32(sizes[0] * in_height)
-                        / in_width
-                        * sqrt(ratios[k - num_sizes + 1] * 1.0)
-                        / 2.0
-                    )
-                    h = sizes[0] / sqrt(ratios[k - num_sizes + 1] * 1.0) / 2.0
-                count = (
-                    i * in_width * (num_sizes + num_ratios - 1)
-                    + j * (num_sizes + num_ratios - 1)
-                    + k
-                )
-                output[0, count, 0] = center_w - w
-                output[0, count, 1] = center_h - h
-                output[0, count, 2] = center_w + w
-                output[0, count, 3] = center_h + h
-
-    return output
-
-
-def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False):
-    """Generate prior(anchor) boxes from data, sizes and ratios.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        4-D with shape [batch, c_in, h_in, w_in]]
-
-    sizes : tuple of float
-        Tuple of sizes for anchor boxes.
-
-    ratios : tuple of float
-        Tuple of ratios for anchor boxes.
-
-    steps : Tuple of float
-        Priorbox step across y and x, -1 for auto calculation.
-
-    offsets : tuple of int
-        Priorbox center offsets, y and x respectively.
-
-    clip : boolean
-        Whether to clip out-of-boundary boxes.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4]
-    """
-    out = hybrid_multibox_prior(
-        data,
-        tvm.runtime.convert(sizes),
-        tvm.runtime.convert(ratios),
-        tvm.runtime.convert(steps),
-        tvm.runtime.convert(offsets),
-    )
-    if clip:
-        out = topi.clip(out, 0, 1)
-    return out
-
-
-@hybrid.script
-def _hybrid_transform_loc(anchor, pred_loc, variance, clip, batch_idx, anchor_idx):
-    """Transform prior anchor box to output box through location predictions."""
-    al = anchor[0, anchor_idx, 0]
-    at = anchor[0, anchor_idx, 1]
-    ar = anchor[0, anchor_idx, 2]
-    ab = anchor[0, anchor_idx, 3]
-
-    px = pred_loc[batch_idx, 0]
-    py = pred_loc[batch_idx, 1]
-    pw = pred_loc[batch_idx, 2]
-    ph = pred_loc[batch_idx, 3]
-
-    vx = variance[0]
-    vy = variance[1]
-    vw = variance[2]
-    vh = variance[3]
-
-    output = output_tensor((4,), pred_loc.dtype)
-
-    aw = ar - al
-    ah = ab - at
-    ax = (al + ar) / 2.0
-    ay = (at + ab) / 2.0
-    ox = px * vx * aw + ax
-    oy = py * vy * ah + ay
-    ow = exp(pw * vw) * aw / 2.0
-    oh = exp(ph * vh) * ah / 2.0
-    output[0] = max(0.0, min(1.0, ox - ow)) if clip else ox - ow
-    output[1] = max(0.0, min(1.0, oy - oh)) if clip else oy - oh
-    output[2] = max(0.0, min(1.0, ox + ow)) if clip else ox + ow
-    output[3] = max(0.0, min(1.0, oy + oh)) if clip else oy + oh
-    return output
-
-
-@hybrid.script
-def hybrid_multibox_transform_loc(
-    cls_prob,
-    loc_pred,
-    anchor,
-    clip,
-    threshold,
-    variances,
-    keep_background,
-):
-    """Hybrid routing for transform location in multibox_detection operator.
-
-    Parameters
-    ----------
-    cls_prob : tvm.te.Tensor or numpy NDArray
-        3-D tensor of class probabilities.
-
-    loc_pred : tvm.te.Tensor or numpy NDArray
-        2-D tensor of location regression predictions.
-
-    anchor : tvm.te.Tensor or numpy NDArray
-        3-D tensor of prior anchor boxes.
-
-    clip : tvm.tir.const
-        Whether to clip out-of-boundary boxes.
-
-    threshold : tvm.tir.const
-        Threshold to be a positive prediction.
-
-    variances : tvm.nd.NDArray
-        Variances to be decoded from box regression output.
-
-    keep_background : tvm.tir.const
-        Whether to keep boxes detected as background or not.
-
-    Returns
-    -------
-    out_loc : tvm.te.Tensor or numpy NDArray
-        3-D tensor of transformed location.
-
-    valid_count : tvm.te.Tensor or numpy NDArray
-        1_d tensor of valid counts for boxes.
-    """
-    batch_size = cls_prob.shape[0]
-    num_classes = cls_prob.shape[1]
-    num_anchors = cls_prob.shape[2]
-    pred_coord = allocate(
-        (
-            batch_size,
-            4,
-        ),
-        loc_pred.dtype,
-    )
-    out_loc = output_tensor((batch_size, num_anchors, 6), loc_pred.dtype)
-    valid_count = output_tensor((batch_size,), "int32")
-
-    start_cls_idx = 0 if keep_background else 1
-
-    for i in parallel(batch_size):
-        valid_count[i] = 0
-        for j in range(num_anchors):
-            # Find the predicted class id and probability
-            score = -1.0
-            cls_id = 0
-            for k in range(start_cls_idx, num_classes):
-                temp = cls_prob[i, k, j]
-                cls_id = k if temp > score else cls_id
-                score = max(temp, score)
-            if cls_id > 0 and score < threshold:
-                cls_id = 0
-            # [id, prob, xmin, ymin, xmax, ymax]
-            # Remove background if 'keep_background=False', restore original id
-            if keep_background or cls_id > 0:
-                out_loc[i, valid_count[i], 0] = cls_id - 0.0 if keep_background else cls_id - 1.0
-                out_loc[i, valid_count[i], 1] = score
-                for l in range(4):
-                    pred_coord[i, l] = loc_pred[i, j * 4 + l]
-                out_coord = _hybrid_transform_loc(anchor, pred_coord, variances, clip, i, j)
-                out_loc[i, valid_count[i], 2] = out_coord[0]
-                out_loc[i, valid_count[i], 3] = out_coord[1]
-                out_loc[i, valid_count[i], 4] = out_coord[2]
-                out_loc[i, valid_count[i], 5] = out_coord[3]
-                valid_count[i] += 1
-
-    return out_loc, valid_count
-
-
-def multibox_transform_loc(
-    cls_prob,
-    loc_pred,
-    anchor,
-    clip=True,
-    threshold=0.01,
-    variances=(0.1, 0.1, 0.2, 0.2),
-    keep_background=False,
-):
-    """Location transformation for multibox detection
-
-    Parameters
-    ----------
-    cls_prob : tvm.te.Tensor
-        Class probabilities.
-
-    loc_pred : tvm.te.Tensor
-        Location regression predictions.
-
-    anchor : tvm.te.Tensor
-        Prior anchor boxes.
-
-    clip : boolean
-        Whether to clip out-of-boundary boxes.
-
-    threshold : float
-        Threshold to be a positive prediction.
-
-    variances : tuple of float
-        Variances to be decoded from box regression output.
-
-    keep_background : boolean
-        Whether to keep boxes detected as background or not.
-
-    Returns
-    -------
-    ret : tuple of tvm.te.Tensor
-    """
-
-    return hybrid_multibox_transform_loc(
-        cls_prob,
-        loc_pred,
-        anchor,
-        tvm.tir.const(clip, "bool"),
-        tvm.tir.const(threshold, "float32"),
-        tvm.runtime.convert(variances),
-        tvm.tir.const(keep_background, "bool"),
-    )
-
-
-def multibox_detection(
-    cls_prob,
-    loc_pred,
-    anchor,
-    clip=True,
-    threshold=0.01,
-    nms_threshold=0.5,
-    force_suppress=False,
-    variances=(0.1, 0.1, 0.2, 0.2),
-    nms_topk=-1,
-):
-    """Convert multibox detection predictions.
-
-    Parameters
-    ----------
-    cls_prob : tvm.te.Tensor
-        Class probabilities.
-
-    loc_pred : tvm.te.Tensor
-        Location regression predictions.
-
-    anchor : tvm.te.Tensor
-        Prior anchor boxes.
-
-    clip : boolean
-        Whether to clip out-of-boundary boxes.
-
-    nms_threshold : float
-        Non-maximum suppression threshold.
-
-    force_suppress : boolean
-        Whether to suppress all detections regardless of class_id.
-
-    threshold : float
-        Threshold to be a positive prediction.
-
-    variances : tuple of float
-        Variances to be decoded from box regression output.
-
-    nms_topk : int
-        Keep maximum top k detections before nms, -1 for no limit.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_anchors, 6)
-    """
-    inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances)
-    out = non_max_suppression(
-        inter_out[0],
-        inter_out[1],
-        inter_out[1],
-        max_output_size=-1,
-        iou_threshold=nms_threshold,
-        force_suppress=force_suppress,
-        top_k=nms_topk,
-        return_indices=False,
-    )
-    return out
diff --git a/python/tvm/utils/__init__.py b/python/tvm/utils/__init__.py
deleted file mode 100644
index 33abc352b0f0..000000000000
--- a/python/tvm/utils/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utilities operating at a graph/model or other "high" level"""
-
-from .roofline import roofline_analysis
diff --git a/python/tvm/utils/roofline/__init__.py b/python/tvm/utils/roofline/__init__.py
deleted file mode 100644
index 45cc880c5b85..000000000000
--- a/python/tvm/utils/roofline/__init__.py
+++ /dev/null
@@ -1,279 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utilities for computing an approximate roofline model"""
-from typing import Dict, Optional, Union
-
-import numpy as np
-
-from ... import IRModule, auto_scheduler, build, get_global_func, nd, relay, tir, topi, transform
-from ...contrib import utils
-from ...ir.expr import GlobalVar
-from ...ir.instrument import pass_instrument
-from ...rpc.base import RPC_SESS_MASK
-from ...rpc.client import RPCSession
-from ...runtime import Device, num_threads, profiler_vm, profiling
-from ...script import tir as T
-from ...target import Target
-from . import cuda, registry, x86
-
-
-def _create_args(mod: IRModule, dev: Device, func_name: str = "main", remote=None):
-    if dev.device_type >= RPC_SESS_MASK:
-        random_fill = remote.get_function("tvm.contrib.random.random_fill")
-    else:
-        random_fill = get_global_func("tvm.contrib.random.random_fill")
-    assert random_fill, "Please make sure USE_RANDOM is ON in config.cmake"
-    args = []
-    for arg in mod[func_name].params:
-        ary = nd.empty(
-            [x.value for x in arg.type_annotation.shape],
-            arg.type_annotation.dtype,
-            device=dev,
-        )
-        random_fill(ary)
-        args.append(ary)
-    return args
-
-
-@pass_instrument
-class SaveLoweredTIR:
-    """Save TIR functions for analysis.
-
-    We need the TIR function in a form that can be handled by
-    `auto_scheduler.feature.named_features_from_primfunc`, but which
-    is the closest to the final lowered form as possible.  Right now this
-    means right before tir.SplitHostDevice.
-
-    """
-
-    def __init__(self, before_pass: str = "tir.SplitHostDevice"):
-        """
-        Parameters
-        ----------
-        before_pass: str
-            Pass before which the TIR is saved.
-        """
-        self.functions = {}
-        self.before_pass = before_pass
-
-    def run_before_pass(self, mod, info):
-        if info.name == self.before_pass:
-            for v, func in mod.functions.items():
-                if isinstance(func, tir.PrimFunc):
-                    self.functions[v] = func
-
-
-def roofline_from_existing(
-    report: profiling.Report,
-    tir_functions: Dict[GlobalVar, tir.PrimFunc],
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession] = None,
-) -> profiling.Report:
-    """Add roofline and other estimated statistics to an existing profiling report.
-
-    :py:func:`roofline_analysis` should always be used instead of this function
-    unless you need a custom compilation pipeline.
-
-    Calculating roofline statistics requires features extracted the TIR
-    functions in addition to per-operator runtime information (`report`) of the
-    same TIR features. The features and TIR functions are not included with the
-    compiled library used to generate the per-operator runtime. It is essential
-    that the per-operator information comes from the exact same compilation
-    pipeline as the TIR functions.
-
-
-    Example
-    -------
-
-    ..code: : python
-
-        import tvm
-        import tvm.relay
-
-        mod, params = tvm.relay.testing.mlp.get_workload()
-
-        # it is recommended to use SaveLoweredTIR to get out the tir primfuncs
-        save_tir = tvm.utils.roofline.SaveLoweredTIR()
-        with tvm.transform.PassContext(opt_level=3, pass_instrument=[save_tir]):
-            lib = relay.vm.compile(mod, params=params, target=target)
-
-        vmexec = profiler_vm.VirtualMachineProfiler(lib, dev)
-        report = vmexec.profile(*inputs)
-
-        roofline_report = roofline_from_existing(report, save_tir.functions, target, dev)
-
-
-    Parameters
-    ----------
-    report : Report
-        Existing profiling report from :py:method:`VirtualMachineProfiler.profile`.
-    tir_functions : Dict[GlobalVar, PrimFunc]
-        TIR primfuncs from the module run to generate `report`. It is nessesary
-        that these functions come before the `tir.MakePackedAPI` pass and are
-        compatible with auto_scheduler featurization.
-        :py:class:`SaveLoweredTIR` is the recommended way to collect these
-        functions.
-    target : Target
-        TVM target that `report` was generated with.
-    dev : Device
-        Device that `report` was generated with.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-
-    Returns
-    -------
-    profiling.Report
-        New profiling report that includes all information from `report`
-        along with additional roofline metrics. See
-        :py:func:`roofline_analysis` for more information on which metrics
-        are included.
-    """
-
-    all_features = {
-        prim.attrs["hash"]: (name, prim, auto_scheduler.feature.named_features_from_primfunc(prim))
-        for name, prim in tir_functions.items()
-        if isinstance(prim, tir.PrimFunc) and "hash" in prim.attrs.keys()
-    }
-
-    new_configuration = dict(report.configuration.items())
-    new_calls = []
-    for call in report.calls:
-        if "Hash" in call.keys() and call["Hash"] in all_features:
-            _, prim, features = all_features[call["Hash"]]
-            if features is None:
-                continue
-
-            with target:
-                flops, peak_flops, flops_name = registry.estimate_peak_flops(
-                    prim, features, target, dev, remote
-                )
-                loaded_bytes, peak_bandwidth, bandwidth_name = registry.estimate_peak_bandwidth(
-                    prim, features, target, dev, remote
-                )
-            new_configuration[f"Estimated Peak FLOP/s ({flops_name})"] = profiling.Ratio(peak_flops)
-            new_configuration[
-                f"Estimated Peak Bandwidth ({bandwidth_name}, byte/second)"
-            ] = profiling.Ratio(peak_bandwidth)
-            ridge_point = peak_flops / peak_bandwidth
-
-            runtime = call["Duration (us)"].microseconds * 1e-6
-            arith_inten = flops / loaded_bytes
-            call = dict(call)
-            call["Loaded Bytes"] = profiling.Count(int(loaded_bytes))
-            call["Estimated FLOPs"] = profiling.Count(int(flops))
-            call["Arithmetic Intensity"] = profiling.Ratio(arith_inten)
-            call["FLOP/s"] = profiling.Ratio(flops / runtime)
-            call["Bandwidth"] = profiling.Ratio(loaded_bytes / runtime)
-            compute_bound = arith_inten > ridge_point
-            call["Bound"] = "compute" if compute_bound else "memory"
-            per_mem_bound = (loaded_bytes / runtime) / peak_bandwidth * 100
-            per_compute_bound = (flops / runtime) / peak_flops * 100.0
-            # We use ratio here because the percentages should be averaged instead of summed.
-            call["Percent of Theoretical Optimal"] = profiling.Ratio(
-                per_compute_bound if compute_bound else per_mem_bound
-            )
-            new_calls.append(call)
-        else:
-            new_calls.append(call)
-    return profiling.Report(new_calls, report.device_metrics, new_configuration)
-
-
-def roofline_analysis(
-    mod: IRModule,
-    params: Dict[str, nd.NDArray],
-    target: Union[str, Target],
-    dev: Device,
-    remote: Optional[RPCSession] = None,
-) -> profiling.Report:
-    """
-    Create a profiling report that contains roofline and other estimated
-    statistics from running a module on the VM.
-
-    The roofline model measures how close a operator gets to best possible
-    memory bandwidth or FLOP/s depending on whether it is memory or compute
-    bound. This computation uses the runtime of the operator along with two
-    numbers extracted from the TIR code: bytes of memory touched and number of
-    floating point operations.
-
-    These statistics are calculated by analyzing the lowered TIR of each
-    operator, so they are estimates of the true values. The statistics are:
-      - Bound: Is the operator memory or compute bound. This is computed by
-        assuming that the operator could perfectly cache all loads -- each byte
-        of memory is only loaded once.
-      - Percent of Theoretical Optimal: What percent of theoretical optimal for
-        the bound. i.e. percent of peak memory bandwidth if memory bound,
-        percent of peak FLOP/s if compute bound.
-      - Loaded Bytes: estimation of the number of bytes loaded from main memory.
-      - Estimated Flops: estimated number of floating point operations.
-      - Arithmetic Intensity: ratio of FLOPs per byte of data.
-      - FLOP/s: floating point operations per second.
-      - Bandwidth: Number of bytes loaded per second.
-
-    Parameters
-    ----------
-    mod : IRModule
-      Uncompiled input module
-
-    params : Dict[str, nd.NDArray]
-
-    target : Union[str, Target]
-      Target to run on.
-
-    dev : Device
-      Device to run on.
-
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-
-    Returns
-    -------
-
-    report : profiling.Report
-      Profiling report which includes the estimated statistics.
-    """
-    if isinstance(target, str):
-        target = Target(target)
-
-    save_tir = SaveLoweredTIR()
-    # copy existing context but add our instrument
-    pass_ctx = transform.PassContext.current()
-    with transform.PassContext(
-        opt_level=pass_ctx.opt_level,
-        required_pass=pass_ctx.required_pass,
-        disabled_pass=pass_ctx.disabled_pass,
-        instruments=list(pass_ctx.instruments) + [save_tir],
-        config=pass_ctx.config,
-    ):
-        lib = relay.vm.compile(mod, params=params, target=target)
-    # upload to remote if running over rpc
-    if dev.device_type >= RPC_SESS_MASK:
-        if remote is None:
-            raise RuntimeError("A RPCSession must be provided when using a remote device.")
-        temp = utils.tempdir()
-        path = temp.relpath("roofline_lib.tar")
-        lib.mod.export_library(path)
-        remote.upload(path)
-        lib = remote.load_module("roofline_lib.tar")
-    vmexec = profiler_vm.VirtualMachineProfiler(lib, dev)
-
-    args = _create_args(mod, dev, remote=remote)
-    report = vmexec.profile(*args)
-
-    return roofline_from_existing(report, save_tir.functions, target, dev, remote=remote)
diff --git a/python/tvm/utils/roofline/cuda.py b/python/tvm/utils/roofline/cuda.py
deleted file mode 100644
index b83a902b7fda..000000000000
--- a/python/tvm/utils/roofline/cuda.py
+++ /dev/null
@@ -1,407 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Estimation of peak flops and memory bandwidth for cuda devices"""
-import functools
-import re
-from typing import Dict, Optional, Tuple
-
-import numpy as np
-
-from ... import build, nd, transform
-from ...contrib import nvcc, utils
-from ...rpc.base import RPC_SESS_MASK
-from ...rpc.client import RPCSession
-from ...runtime import Device
-from ...script import tir as T
-from ...target import Target
-from ...tir import PrimFunc
-from . import registry
-
-
-@functools.lru_cache(maxsize=None)
-def estimate_peak_flops_tensorcore(
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession],
-    mat_dtype: str = "float16",
-    acc_dtype: str = "float32",
-) -> Tuple[float, float, str]:
-    """Estimate the peak FLOP/s of a cuda device with tensorcores.
-
-    This estimate should only be used to compare with operators that can use
-    dense tensorcore mma instructions.
-
-    References
-    ----------
-    Wei Sun, Ang Li, Tong Geng, Sander Stuijk, Henk Corporaal: "Dissecting
-    Tensor Cores via Microbenchmarks: Latency, Throughput and Numerical
-    Behaviors", 2022; http://arxiv.org/abs/2206.02874
-    https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.1.pdf
-
-    Parameters
-    ----------
-    target : Target
-        Target to run on. This should be as specific to the actual hardware as
-        possible.
-    dev : Device
-        Device to run on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-    mat_dtype : str
-        Dtype of matrices passed to mma instructions.
-    acc_dtype : str
-        Dtype of accumulator to use with mma instructions. Should be compatible
-        with `mat_dtype`.
-
-    Returns
-    -------
-    peak_flops : float
-        Approximate sustained FLOP/s of this target/device combo assuming
-        mma instructions. Addition and multiplications are each counted as
-        separate FLOPs.
-    """
-
-    @T.prim_func
-    def peak_flops_tensorcore_tir(
-        inp: T.Buffer((16, 16), mat_dtype),
-        out: T.Buffer((16, 16), acc_dtype),
-        n: T.int32,
-        sms: T.int32,
-    ):
-        # pylint: disable=invalid-name, missing-function-docstring
-        A = T.alloc_buffer((16, 16), dtype=mat_dtype, scope="wmma.matrix_a")
-        B = T.alloc_buffer((16, 16), dtype=mat_dtype, scope="wmma.matrix_b")
-        C = T.alloc_buffer((16, 16), dtype=acc_dtype, scope="wmma.accumulator")
-        for _ in T.thread_binding(sms, thread="blockIdx.x"):
-            for _ in T.thread_binding(
-                8, thread="threadIdx.y"
-            ):  # need 8 warps to get enough in-SM parallelism
-                for _ in T.thread_binding(32, thread="threadIdx.x"):
-                    T.evaluate(
-                        T.tvm_load_matrix_sync(
-                            A.data,
-                            16,
-                            16,
-                            16,
-                            0,
-                            T.tvm_access_ptr(
-                                T.type_annotation(dtype=mat_dtype),
-                                inp.data,
-                                0,
-                                16,
-                                1,
-                                dtype="handle",
-                            ),
-                            16,
-                            "row_major",
-                            dtype="handle",
-                        )
-                    )
-                    T.evaluate(T.tvm_fill_fragment(B.data, 16, 16, 16, 0, 0, dtype="handle"))
-                    T.evaluate(T.tvm_fill_fragment(C.data, 16, 16, 16, 0, 0, dtype="handle"))
-                    for _ in range(n):
-                        T.evaluate(
-                            T.tvm_mma_sync(
-                                C.data, 0, A.data, 0, B.data, 0, C.data, 0, dtype="handle"
-                            )
-                        )
-                    T.evaluate(
-                        T.tvm_store_matrix_sync(
-                            C.data,
-                            16,
-                            16,
-                            16,
-                            0,
-                            T.tvm_access_ptr(
-                                T.type_annotation(dtype=acc_dtype),
-                                out.data,
-                                0,
-                                16,
-                                2,
-                                dtype="handle",
-                            ),
-                            16,
-                            "row_major",
-                            dtype="handle",
-                        )
-                    )
-
-    n = 100000
-    sms = dev.multi_processor_count
-    specialized = peak_flops_tensorcore_tir.specialize(
-        {peak_flops_tensorcore_tir.params[2]: n, peak_flops_tensorcore_tir.params[3]: sms}
-    )
-    with transform.PassContext(opt_level=3):
-        f = build(specialized, target=target)
-
-    # upload to remote if running over rpc
-    if dev.device_type >= RPC_SESS_MASK:
-        if remote is None:
-            raise RuntimeError("A RPCSession must be provided when using a remote device.")
-        temp = utils.tempdir()
-        path = temp.relpath("peak_mma_flops.tar")
-        f.export_library(path)
-        remote.upload(path)
-        f = remote.load_module("peak_mma_flops.tar")
-
-    x = nd.empty((16, 16), dtype=mat_dtype, device=dev)
-    y = nd.empty((16, 16), dtype=acc_dtype, device=dev)
-    times = f.time_evaluator(f.entry_name, dev, repeat=10, number=1)(x, y)
-    # each mma operation computes 16 x 16 x 16 FLOPs
-    return n * 16 * 16 * 16 * 2 * sms * 8 / times.min
-
-
-@functools.lru_cache(maxsize=None)
-def estimate_peak_flops_fma(
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession],
-    dtype: str,
-) -> Tuple[float, float, str]:
-    """Estimate the peak FLOP/s of a cuda device with fma operations (not using tensor cores).
-
-    References
-    ----------
-    https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.1.pdf
-
-    Parameters
-    ----------
-    target : Target
-        Target to run on. This should be as specific to the actual hardware as
-        possible.
-    dev : Device
-        Device to run on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-    dtype : str
-        Dtype of fma operation
-
-    Returns
-    -------
-    peak_flops : float
-        Approximate sustained FLOP/s of this target/device combo assuming
-        fma instructions. Addition and multiplications are each counted as
-        separate FLOPs.
-    """
-
-    vec_width = 32
-    warps = 16  # need 16 warps to get enough in-SM parallelism
-    sms = dev.multi_processor_count
-    n = 100000
-
-    @T.prim_func
-    def peak_flops_fma_tir(
-        A: T.Buffer((sms, warps, vec_width), dtype),
-        B: T.Buffer((sms, warps, vec_width), dtype),
-    ):
-        # pylint: disable=invalid-name, missing-function-docstring
-        shared = T.alloc_buffer((sms, warps, vec_width), dtype=dtype, scope="shared")
-        for sm in T.thread_binding(sms, thread="blockIdx.x"):
-            for warp in T.thread_binding(warps, thread="threadIdx.y"):
-                for t in T.thread_binding(vec_width, thread="threadIdx.x"):
-                    shared[sm, warp, t] = A[sm, warp, t]
-                    for _ in range(n):
-                        shared[sm, warp, t] = (
-                            shared[sm, warp, t] * shared[sm, warp, t] + shared[sm, warp, t]
-                        )
-                    B[sm, warp, t] = shared[sm, warp, t]
-
-    with transform.PassContext(opt_level=3):
-        f = build(peak_flops_fma_tir, target=target)
-
-    # upload to remote if running over rpc
-    if dev.device_type >= RPC_SESS_MASK:
-        if remote is None:
-            raise RuntimeError("A RPCSession must be provided when using a remote device.")
-        temp = utils.tempdir()
-        path = temp.relpath("peak_fma_flops.tar")
-        f.export_library(path)
-        remote.upload(path)
-        f = remote.load_module("peak_fma_flops.tar")
-
-    x = nd.empty((sms, warps, vec_width), dtype=dtype, device=dev)
-    y = nd.empty((sms, warps, vec_width), dtype=dtype, device=dev)
-    times = f.time_evaluator(f.entry_name, dev, repeat=10, number=1)(x, y)
-    return n * warps * sms * vec_width * 2 / times.min
-
-
-@registry.estimate_peak_flops.register("cuda")
-def estimate_peak_flops(
-    func: PrimFunc,  # pylint: disable=unused-argument
-    features: Dict[str, np.ndarray],
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession],
-) -> Tuple[float, float, str]:
-    """Estimate the peak FLOP/s of a cuda device.
-
-    Parameters
-    ----------
-    func : PrimFunc
-        Function to estimate peak flops for. Used to check if a specific kind
-        intrinsic or dtype could be used with this function.
-    features : Dict[str, np.ndarry]
-        Features extracted from `func`. Used to check if a specific kind
-        intrinsic or dtype could be used with this function.
-    target : Target
-        Target to run on. This should be as specific to the actual hardware as
-        possible.
-    dev : Device
-        Device to run on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-
-    Returns
-    -------
-    flops : float
-        Estimated number of flops used by `func`.
-    peak_flops : float
-        Approximate sustained FLOP/s of this target/device combo. Addition and
-        multiplications are each counted as separate FLOPs.
-    name : str
-        Dtype/intrinsic used by `func` to achieve peak flops.
-    """
-    has_tensorcore = nvcc.have_tensorcore(dev.compute_version)
-    # assume that the first argument dtype is the same as all the others
-    dtype = list(func.buffer_map.values())[0].dtype
-    if dtype == "float16" and has_tensorcore:
-        peak_flops = estimate_peak_flops_tensorcore(target, dev, remote)
-        name = "float16 tensorcore"
-    else:
-        peak_flops = estimate_peak_flops_fma(target, dev, remote, dtype)
-        name = f"{dtype} fma"
-    flops = np.sum(
-        features["float_addsub"]
-        + features["float_mul"]
-        + features["float_mad"] * 2
-        + features["float_divmod"]
-    )
-    return flops, peak_flops, name
-
-
-@T.prim_func
-def peak_bandwidth_tir(a: T.handle, b: T.handle, blocks: T.int32, warp_size: T.int32) -> None:
-    # pylint: disable=invalid-name, missing-function-docstring
-    N = T.int32()
-    A = T.match_buffer(a, [blocks, N, 4, warp_size], "float32")
-    B = T.match_buffer(b, [blocks, 4, warp_size], "float32")
-    for i in T.thread_binding(blocks, "blockIdx.x"):
-        for k in T.serial(N):
-            for l in T.unroll(4):
-                # vectorized load is necessary to hit peak bandwidth
-                for j in T.thread_binding(warp_size, "threadIdx.x"):
-                    # += is necessary to introduce a data dependency for all
-                    # elements of A, preventing the backend from removing the
-                    # `k` loop and setting `k` to the loop extent.
-                    B[i, l, j] += A[i, k, l, j]
-
-
-@functools.lru_cache(maxsize=None)
-def estimate_peak_bandwidth_global_mem(
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession] = None,
-) -> Tuple[float, float, str]:
-    """Estimate peak bandwidth of global memory. See estimate_peak_bandwidth"""
-    warp_size = dev.warp_size
-    # These sizes seem large enough to give the card time to hit a fixpoint on memory bandwidth
-    blocks = 1024
-    size = 1024
-
-    specialized = peak_bandwidth_tir.specialize(
-        {peak_bandwidth_tir.params[2]: blocks, peak_bandwidth_tir.params[3]: warp_size}
-    )
-    with transform.PassContext(opt_level=3):
-        f = build(specialized, target=target)
-
-    # upload to remote if running over rpc
-    if dev.device_type >= RPC_SESS_MASK:
-        if remote is None:
-            raise RuntimeError("A RPCSession must be provided when using a remote device.")
-        temp = utils.tempdir()
-        path = temp.relpath("peak_bandwidth.tar")
-        f.export_library(path)
-        remote.upload(path)
-        f = remote.load_module("peak_bandwidth.tar")
-
-    a = nd.empty((blocks, size, 4, warp_size), dtype="float32", device=dev)
-    b = nd.empty((blocks, 4, warp_size), dtype="float32", device=dev)
-    times = f.time_evaluator(f.entry_name, dev, repeat=10, number=1)(a, b)
-    return a.numpy().size * 4 / times.min  # 4 bytes per float32
-
-
-@registry.estimate_peak_bandwidth.register("cuda")
-def estimate_peak_bandwidth(
-    func: PrimFunc,  # pylint: disable=unused-argument
-    features: Dict[str, np.ndarray],
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession] = None,
-) -> Tuple[float, float, str]:
-    """Estimate peak memory bandwidth of a target/device combo.
-
-    Peak bandwidth is estimated by running a small experiment on the underlying
-    hardware. The peak bandwidth measurement assumes that vector instructions
-    are being used to load the data.
-
-    Parameters
-    ----------
-    func : PrimFunc
-        Function to estimate peak bandwidth for. Used to check if a specific
-        kind of memory could be used with this function.
-    features : Dict[str, np.ndarry]
-        Features extracted from `func`. Used to check if a specific kind of
-        memory could be used with this function.
-    target : Target
-        Target to use for measurement. This target should be as specific to the
-        underlying hardware as possible.
-    dev : Device
-        Device to measure peak bandwidth on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-
-    Returns
-    -------
-    loaded_bytes : float
-        Estimated bytes loaded by `func`.
-    peak_bandwidth : float
-        Peak memory bandwidth in bytes/seconds.
-    name : str
-        Name of the memory being used.
-    """
-    # autoscheduler features do not take into account that 1.
-    # global and shared memory have very different performance
-    # characteristics -- both are included in the same bytes
-    # touched count 2. multiple threads accessing the same byte
-    # of memory does not use the same amount of bandwidth as
-    # multiple threads accessing different bytes of memory. We
-    # use unique bytes accessed here to avoid these two issues,
-    # but this does bias results towards being more compute
-    # bound.
-    loaded_bytes = sum(
-        [
-            np.sum(x)
-            for (k, x) in features.items()
-            if re.match(r"^B[0-9]+\.unique_bytes$", k) is not None
-        ]
-    )
-    peak_bandwidth = estimate_peak_bandwidth_global_mem(target, dev, remote)
-    return loaded_bytes, peak_bandwidth, "global"
diff --git a/python/tvm/utils/roofline/registry.py b/python/tvm/utils/roofline/registry.py
deleted file mode 100644
index 9358529b38ec..000000000000
--- a/python/tvm/utils/roofline/registry.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Definition of generic functions for estimating peak flops and bandwidth"""
-from typing import Dict, Optional, Tuple
-
-import numpy as np
-
-from ...rpc.client import RPCSession
-from ...runtime import Device
-from ...target import Target, generic_func
-from ...tir import PrimFunc
-
-
-@generic_func
-def estimate_peak_bandwidth(
-    func: PrimFunc,
-    features: Dict[str, np.ndarray],
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession] = None,
-) -> Tuple[float, float, str]:
-    """Estimate peak memory bandwidth of a target/device combo.
-
-    Peak bandwidth is estimated by running a small experiment on the underlying
-    hardware. The peak bandwidth measurement assumes that vector instructions
-    are being used to load the data.
-
-    Parameters
-    ----------
-    func : PrimFunc
-        Function to estimate peak bandwidth for. Used to check if a specific
-        kind of memory could be used with this function.
-    features : Dict[str, np.ndarry]
-        Features extracted from `func`. Used to check if a specific kind of
-        memory could be used with this function.
-    target : Target
-        Target to use for measurement. This target should be as specific to the
-        underlying hardware as possible.
-    dev : Device
-        Device to measure peak bandwidth on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-
-    Returns
-    -------
-    loaded_bytes : float
-        Estimated bytes loaded by `func`.
-    peak_bandwidth : float
-        Peak memory bandwidth in bytes/seconds.
-    name : str
-        Name of the memory being used.
-    """
-    raise NotImplementedError()
-
-
-@generic_func
-def estimate_peak_flops(
-    func: PrimFunc,
-    features: Dict[str, np.ndarray],
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession],
-) -> Tuple[float, float, str]:
-    """
-    Estimate the maximum number of FLOP/s this target/device combo is capable
-    of reaching by running a test program. This is a generic function that
-    should be overridden for each target.
-
-    Parameters
-    ----------
-    func : PrimFunc
-        Function to estimate peak flops for. Used to check if a specific kind
-        intrinsic or dtype could be used with this function.
-    features : Dict[str, np.ndarry]
-        Features extracted from `func`. Used to check if a specific kind
-        intrinsic or dtype could be used with this function.
-    target : Target
-        Target to run on. This should be as specific to the actual hardware as
-        possible to make sure that LLVM generates the best vector code.
-    dev : Device
-        Device to run on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-
-    Returns
-    -------
-    flops : float
-        Estimated number of flops used by `func`.
-    peak_flops : float
-        Approximate sustained FLOP/s of this target/device combo assuming
-        vectorized FMA instructions. Each FMA operation counts as two FLOPs.
-    name : str
-        Dtype/intrinsic used by `func` to achieve peak flops.
-    """
-    raise NotImplementedError()
diff --git a/python/tvm/utils/roofline/x86.py b/python/tvm/utils/roofline/x86.py
deleted file mode 100644
index 5d2dd27e523b..000000000000
--- a/python/tvm/utils/roofline/x86.py
+++ /dev/null
@@ -1,331 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Estimate peak flops and bandwidth for x86 devices"""
-import functools
-import re
-from typing import Dict, Optional, Tuple
-
-import numpy as np
-
-from ... import build, get_global_func, nd, transform
-from ...contrib import utils
-from ...rpc.base import RPC_SESS_MASK
-from ...rpc.client import RPCSession
-from ...runtime import DataType, Device, num_threads
-from ...script import tir as T
-from ...target import Target, x86
-from ...tir import PrimFunc
-from . import registry
-
-
-def _detect_vec_width_registers(
-    target: Target, vec_width: Optional[int], num_vector_registers: Optional[int]
-):
-    """Get the vector width and number of vector registers for a target.
-
-    Parameters
-    ----------
-    target : Target
-        Target to detect vector width and registers for.
-    vec_width : Optional[int]
-        If None, try and detect vector width from target. Otherwise provided input is used.
-    num_vector_registers : Optional[int]
-        If None, try and number of vector registers from target. Otherwise provided input is used.
-
-    Returns
-    -------
-    vec_width: int
-        Width of a vector register on `target` in bytes.
-    num_vector_registers: int
-        Number of vector registers on `target`.
-    """
-    if vec_width is None:
-        # Only implemented for x86 so far...
-        if (
-            str(target.kind) == "llvm"
-            and target.device_name == ""
-            and len(target.keys) == 1
-            and target.keys[0] == "cpu"
-        ):
-            with target:
-                vec_width = x86.get_simd_32bit_lanes() * 4  # in number of bytes
-        else:
-            raise RuntimeError(f"Cannot determine vector width for target {target}")
-    if num_vector_registers is None:
-        if target.device_name == "":  # indicates x86
-            num_vector_registers = 16  # Assuming for all platforms, probably wrong on older ones
-        else:
-            raise RuntimeError(f"Cannot determine number of vector registers for target {target}")
-    return vec_width, num_vector_registers
-
-
-@functools.lru_cache(maxsize=None)
-def estimate_peak_fma_vector_flops(
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession],
-    dtype: DataType,
-    vec_width: Optional[int] = None,
-    num_vector_registers: Optional[int] = None,
-):
-    """Estimate peak flops assuming vector fma instructions and no explicit
-    intrinsics. See estimate_peak_fma_flops.
-    """
-
-    @T.prim_func
-    def peakflops_fma_tir(
-        a: T.handle,
-        vec_width: T.int32,
-        iters: T.int32,
-        num_vector_registers: T.int32,
-        threads: T.int32,
-    ) -> None:
-        # pylint: disable=invalid-name, missing-function-docstring
-        A = T.match_buffer(a, [threads, num_vector_registers, vec_width], dtype)
-        for t in T.parallel(threads):
-            for _j in range(iters):
-                for l in T.unroll(num_vector_registers):
-                    # We want to use as few registers as possible, so we perform
-                    # all operations on the same element
-                    for k in T.vectorized(vec_width):
-                        A[t, l, k] = A[t, l, k] * A[t, l, k] + A[t, l, k]
-
-    vec_width, num_vector_registers = _detect_vec_width_registers(
-        target, vec_width, num_vector_registers
-    )
-    vec_width //= DataType(dtype).bits // 8
-    iters = 1000000
-    nthreads = num_threads()
-    specialized = peakflops_fma_tir.specialize(
-        {
-            peakflops_fma_tir.params[1]: vec_width,
-            peakflops_fma_tir.params[2]: iters,
-            peakflops_fma_tir.params[3]: num_vector_registers,
-            peakflops_fma_tir.params[4]: nthreads,
-        }
-    )
-    with transform.PassContext(opt_level=3):
-        f = build(specialized, target=target)
-
-    # upload to remote if running over rpc
-    if dev.device_type >= RPC_SESS_MASK:
-        if remote is None:
-            raise RuntimeError("A RPCSession must be provided when using a remote device.")
-        temp = utils.tempdir()
-        path = temp.relpath("peak_fma_flops.tar")
-        f.export_library(path)
-        remote.upload(path)
-        f = remote.load_module("peak_fma_flops.tar")
-        random_fill = remote.get_function("tvm.contrib.random.random_fill")
-    else:
-        random_fill = get_global_func("tvm.contrib.random.random_fill")
-    assert random_fill, "Please make sure USE_RANDOM is ON in config.cmake"
-
-    a = nd.empty((nthreads, num_vector_registers, vec_width), dtype=dtype, device=dev)
-    random_fill(a)
-    times = f.time_evaluator(f.entry_name, dev, repeat=100, number=1)(a)
-    flops = 2 * vec_width * num_vector_registers * nthreads * iters  # fma is two flops
-    return flops / times.min
-
-
-@registry.estimate_peak_flops.register("cpu")
-def estimate_peak_fma_flops(
-    func: PrimFunc,
-    features: Dict[str, np.ndarray],
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession],
-    vec_width: Optional[int] = None,
-    num_vector_registers: Optional[int] = None,
-) -> Tuple[float, float, str]:
-    """
-    Estimate the maximum number of FLOP/s this target/device combo is capable
-    of reaching by running a test program. This assumes vectorized FMA
-    (fused-multiply-add) instructions.
-
-
-    Parameters
-    ----------
-    func : PrimFunc
-        Function to estimate peak flops for. Used to check if a specific kind
-        intrinsic or dtype could be used with this function.
-    features : Dict[str, np.ndarry]
-        Features extracted from `func`. Used to check if a specific kind
-        intrinsic or dtype could be used with this function.
-    target : Target
-        Target to run on. This should be as specific to the actual hardware as
-        possible to make sure that LLVM generates the best vector code.
-    dev : Device
-        Device to run on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-    vec_width : Optional[int]
-        Vector width of SIMD units on the underlying hardware. Will try to
-        infer if no value is provided.
-    num_vector_registers : Optional[int]
-        Number of vector registers on the underlying hardware. Will try to
-        infer if no value is provided.
-
-    Returns
-    -------
-    flops : float
-        Estimated number of flops used by `func`.
-    peak_flops : float
-        Approximate sustained FLOP/s of this target/device combo assuming
-        vectorized FMA instructions. Each FMA operation counts as two FLOPs.
-    name : str
-        Dtype/intrinsic used by `func` to achieve peak flops.
-    """
-    # assume that the first argument's dtype is the one we want
-    dtype = list(func.buffer_map.values())[0].dtype
-    if "int" in dtype:
-        flops = np.sum(
-            features["int_addsub"]
-            + features["int_mul"]
-            + features["int_mad"] * 2
-            + features["int_divmod"]
-        )
-    else:
-        flops = np.sum(
-            features["float_addsub"]
-            + features["float_mul"]
-            + features["float_mad"] * 2
-            + features["float_divmod"]
-        )
-    peak_flops = estimate_peak_fma_vector_flops(
-        target, dev, remote, dtype, vec_width, num_vector_registers
-    )
-    return flops, peak_flops, f"{dtype} FMA"
-
-
-@T.prim_func
-def peak_bandwidth_tir(a: T.handle, b: T.handle, threads: T.int32, vec_width: T.int32) -> None:
-    # pylint: disable=invalid-name, missing-function-docstring
-    N = T.int32()
-    A = T.match_buffer(a, [threads, N, 4, vec_width], "float32")
-    B = T.match_buffer(b, [threads, 4, vec_width], "float32")
-    # Parallelism is necessary to hit all cores/nodes
-    for i in T.parallel(threads):
-        for k in T.serial(N):
-            for l in T.unroll(4):
-                # vectorized load is necessary to hit peak bandwidth
-                for j in T.vectorized(vec_width):
-                    # += is necessary to introduce a data dependency for all
-                    # elements of A, preventing the backend from removing the
-                    # `k` loop and setting `k` to the loop extent.
-                    B[i, l, j] += A[i, k, l, j]
-
-
-@functools.lru_cache(maxsize=None)
-def estimate_peak_bandwidth_dram(
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession],
-    vec_width: Optional[int] = None,
-) -> float:
-    """Estimate peak bandwidth for DRAM. See estimate_peak_bandwidth."""
-    vec_width, _ = _detect_vec_width_registers(target, vec_width, 1)
-    specialized = peak_bandwidth_tir.specialize(
-        {
-            peak_bandwidth_tir.params[3]: vec_width,
-        }
-    )
-    with transform.PassContext(opt_level=3):
-        f = build(specialized, target=target)
-
-    # upload to remote if running over rpc
-    if dev.device_type >= RPC_SESS_MASK:
-        if remote is None:
-            raise RuntimeError("A RPCSession must be provided when using a remote device.")
-        temp = utils.tempdir()
-        path = temp.relpath("peak_bandwidth.tar")
-        f.export_library(path)
-        remote.upload(path)
-        f = remote.load_module("peak_bandwidth.tar")
-        random_fill = remote.get_function("tvm.contrib.random.random_fill")
-    else:
-        random_fill = get_global_func("tvm.contrib.random.random_fill")
-    assert random_fill, "Please make sure USE_RANDOM is ON in config.cmake"
-
-    threads = num_threads()
-    # Data size needs to be larger than last level of cache. We don't have a
-    # way of getting cache sizes, so this number should give us a large enough
-    # size.
-    size = 10**8 // (4 * threads * vec_width)
-    a = nd.empty((threads, size, 4, vec_width), dtype="float32", device=dev)
-    random_fill(a)
-    b = nd.empty((threads, 4, vec_width), dtype="float32", device=dev)
-    random_fill(b)
-    times = f.time_evaluator(f.entry_name, dev, repeat=10, number=1)(a, b, threads)
-    return a.numpy().size * 4 / times.min  # 4 bytes per float32
-
-
-@registry.estimate_peak_bandwidth.register("cpu")
-def estimate_peak_bandwidth(
-    func: PrimFunc,  # pylint: disable=unused-argument
-    features: Dict[str, np.ndarray],
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession],
-    vec_width: Optional[int] = None,
-) -> Tuple[float, float, str]:
-    """Estimate peak memory bandwidth of a target/device combo.
-
-    Peak bandwidth is estimated by running a small experiment on the underlying
-    hardware. The peak bandwidth measurement assumes that vector instructions
-    are being used to load the data.
-
-    Parameters
-    ----------
-    func : PrimFunc
-        Function to estimate peak bandwidth for. Used to check if a specific
-        kind of memory could be used with this function.
-    features : Dict[str, np.ndarry]
-        Features extracted from `func`. Used to check if a specific kind of
-        memory could be used with this function.
-    target : Target
-        Target to use for measurement. This target should be as specific to the
-        underlying hardware as possible.
-    dev : Device
-        Device to measure peak bandwidth on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-    vec_width : Optional[int]
-        Vector unit width, determined from target if not supplied.
-
-    Returns
-    -------
-    loaded_bytes : float
-        Estimated bytes loaded by `func`.
-    peak_bandwidth : float
-        Peak memory bandwidth in bytes/seconds.
-    name : str
-        Name of the memory being used.
-    """
-    # Ideally we'd be able to use this code to measure peak bandwidth of the
-    # different cache levels. If we could just generate load commands, then we
-    # could use those in a tight loop. Instead we need some code that is
-    # limited on the cache bandwidth. With the L1 cache we need an operation
-    # that has a very low arithmetic intensity and we haven't come up with one
-    # yet.
-    peak_bandwidth = estimate_peak_bandwidth_dram(target, dev, remote, vec_width)
-    loaded_bytes = sum(
-        [np.sum(x) for (k, x) in features.items() if re.match(r"^B[0-9]+\.bytes$", k) is not None]
-    )
-    return loaded_bytes, peak_bandwidth, "DRAM"
diff --git a/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py b/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py
index bc3ae64b46c1..67d5d84a0c1d 100644
--- a/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py
+++ b/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py
@@ -30,7 +30,7 @@
 @tvm.testing.requires_llvm
 def test_llvm_add_pipeline():
     """all-platform-minimal-test: Check LLVM enablement."""
-    nn = 1024
+    nn = 128
     n = tvm.runtime.convert(nn)
     A = te.placeholder((n,), name="A")
     B = te.placeholder((n,), name="B")
@@ -38,23 +38,15 @@ def test_llvm_add_pipeline():
     BB = te.compute((n,), lambda *i: B(*i), name="B")
     T = te.compute(A.shape, lambda *i: AA(*i) + BB(*i), name="T")
     C = te.compute(A.shape, lambda *i: T(*i), name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=4)
-    xo1, xo2 = s[C].split(xo, factor=13)
-    s[C].parallel(xo2)
-    s[C].pragma(xo1, "parallel_launch_point")
-    s[C].pragma(xo2, "parallel_stride_pattern")
-    s[C].pragma(xo2, "parallel_barrier_when_finish")
-    s[C].vectorize(xi)
+
+    sch = tvm.tir.Schedule(te.create_prim_func([A, B, C]))
+    xo, xi = sch.split(sch.get_loops("C")[0], factors=[None, 4])
+    sch.parallel(xo)
+    sch.vectorize(xi)
 
     def check_llvm():
-        # Specifically allow offset to test codepath when offset is available
-        Ab = tvm.tir.decl_buffer(
-            A.shape, A.dtype, elem_offset=te.size_var("Aoffset"), offset_factor=8, name="A"
-        )
-        binds = {A: Ab}
         # BUILD and invoke the kernel.
-        f = tvm.build(s, [A, B, C], "llvm", binds=binds)
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         n = nn
diff --git a/tests/python/all-platform-minimal-test/test_runtime_ndarray.py b/tests/python/all-platform-minimal-test/test_runtime_ndarray.py
index 8f929b1c1a76..d01f9599ffe0 100644
--- a/tests/python/all-platform-minimal-test/test_runtime_ndarray.py
+++ b/tests/python/all-platform-minimal-test/test_runtime_ndarray.py
@@ -69,28 +69,6 @@ def test_memory_usage(target, dev, dtype):
     assert dev.available_global_memory == available_memory_before
 
 
-@pytest.mark.skip(reason="Skip for passing windows test on CI")
-def test_fp16_conversion():
-    n = 100
-
-    for src, dst in [("float32", "float16"), ("float16", "float32")]:
-        A = te.placeholder((n,), dtype=src)
-        B = te.compute((n,), lambda i: A[i].astype(dst))
-
-        s = te.create_schedule([B.op])
-        func = tvm.build(s, [A, B], "llvm")
-
-        x_tvm = tvm.nd.array(100 * np.random.randn(n).astype(src) - 50)
-        y_tvm = tvm.nd.array(100 * np.random.randn(n).astype(dst) - 50)
-
-        func(x_tvm, y_tvm)
-
-        expected = x_tvm.numpy().astype(dst)
-        real = y_tvm.numpy()
-
-        tvm.testing.assert_allclose(expected, real)
-
-
 def test_dtype():
     dtype = tvm.DataType("handle")
     assert dtype.type_code == tvm.DataTypeCode.HANDLE
diff --git a/tests/python/codegen/test_target_codegen_aarch64.py b/tests/python/codegen/test_target_codegen_aarch64.py
index 366198c7de6a..8bd0cb17267d 100644
--- a/tests/python/codegen/test_target_codegen_aarch64.py
+++ b/tests/python/codegen/test_target_codegen_aarch64.py
@@ -43,9 +43,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: A[i] * B[i], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and mul instructions using z registers
         assembly = f.get_source("asm")
@@ -75,9 +73,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: A[i] + B[i], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and add instructions using z registers
         assembly = f.get_source("asm")
@@ -107,9 +103,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: A[i] - B[i], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and sub instructions using z registers
         assembly = f.get_source("asm")
@@ -140,9 +134,7 @@ def check_correct_assembly(type):
         B = te.placeholder(m, dtype=type, name="B")
         C = te.placeholder(m, dtype=type, name="C")
         D = te.compute((m), lambda i: A[i] * B[i] + C[i], name="D")
-        s = te.create_schedule([D.op])
-
-        f = tvm.build(s, [A, B, C, D], target)
+        f = tvm.build(te.create_prim_func([A, B, C, D]), target=target)
 
         # Verify we see SVE load instructions and either mad or mla instructions using z registers
         assembly = f.get_source("asm")
@@ -172,9 +164,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: tvm.te.max(A[i], B[i]))
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and cmgt + sel instructions or a max instruction, all using z registers
         assembly = f.get_source("asm")
@@ -208,9 +198,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: tvm.te.min(A[i], B[i]))
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and cmgt + sel instructions or a min instruction, all using z registers
         assembly = f.get_source("asm")
@@ -244,9 +232,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: tvm.te.div(A[i], B[i]))
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and div instructions using z registers
         assembly = f.get_source("asm")
@@ -256,7 +242,7 @@ def check_correct_assembly(type):
         )
 
         assert len(loads) > 1
-        assert len(matches) > 1
+        assert len(matches) >= 1
 
     check_correct_assembly(type=dtype)
 
@@ -275,9 +261,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: tvm.te.floormod(A[i], B[i]), name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and mls instructions using z registers
         assembly = f.get_source("asm")
@@ -307,9 +291,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: A[i] == B[i], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and cmpeq or cmeq instructions using z registers
         assembly = f.get_source("asm")
@@ -339,9 +321,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: A[i] != B[i], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and cmpgt, cmgt, cmpne or cmne instructions, all using z registers
         assembly = f.get_source("asm")
@@ -370,9 +350,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: A[i] | B[i], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and orr instructions using z registers
         assembly = f.get_source("asm")
@@ -401,9 +379,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: A[i] & B[i], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and and instructions using z registers
         assembly = f.get_source("asm")
@@ -431,9 +407,7 @@ def check_correct_assembly(type):
         m = te.var("m")
         A = te.placeholder(m, dtype=type, name="A")
         C = te.compute((m), lambda i: ~A[i], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, C], target)
+        f = tvm.build(te.create_prim_func([A, C]), target=target)
 
         # Verify we see SVE load instructions and eor instructions using z registers
         assembly = f.get_source("asm")
@@ -466,9 +440,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype="int32", name="B")
         C = te.compute((m), lambda i: A[B[i]], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see gather instructions in the assembly
         assembly = f.get_source("asm")
@@ -557,10 +529,7 @@ def test_vscale_range_function_attribute(mattr, expect_attr):
     m = te.var("m")
     A = te.placeholder(m, dtype="float32", name="A")
     C = te.compute((m), lambda i: A[i] + 1, name="C")
-    s = te.create_schedule([C.op])
-
-    with tvm.target.Target(target) as target:
-        f = tvm.build(s, [A, C], target)
+    f = tvm.build(te.create_prim_func([A, C]), target=target)
 
     # Check if the vscale_range() attribute exists
     ll = f.get_source("ll")
diff --git a/tests/python/codegen/test_target_codegen_arm.py b/tests/python/codegen/test_target_codegen_arm.py
index b5c69d6df1a6..9357d38e667b 100644
--- a/tests/python/codegen/test_target_codegen_arm.py
+++ b/tests/python/codegen/test_target_codegen_arm.py
@@ -28,10 +28,9 @@ def check_correct_assembly(type, elements, counts):
         n = tvm.runtime.convert(elements)
         A = te.placeholder(n, dtype=type, name="A")
         B = te.compute(A.shape, lambda i: tvm.tir.popcount(A[i]), name="B")
-        s = te.create_schedule(B.op)
-        s[B].vectorize(s[B].op.axis[0])
-        f = tvm.build(s, [A, B], target)
-
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        sch.vectorize(sch.get_loops("B")[0])
+        f = tvm.build(sch.mod, target=target)
         # Verify we see the correct number of vpaddl and vcnt instructions in the assembly
         assembly = f.get_source("asm")
         matches = re.findall("vpaddl", assembly)
@@ -59,9 +58,9 @@ def check_correct_assembly(N):
             lambda n: te.sum(A[k, n].astype("int32") * B[k, n].astype("int32"), axis=[k]),
             name="C",
         )
-        s = te.create_schedule(C.op)
-        s[C].vectorize(s[C].op.axis[0])
-        f = tvm.build(s, [A, B, C], target)
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B, C]))
+        sch.vectorize(sch.get_loops("C")[0])
+        f = tvm.build(sch.mod, target=target)
 
         # Verify we see the correct number of vmlal.s16 instructions
         assembly = f.get_source("asm")
@@ -83,9 +82,9 @@ def check_broadcast_correct_assembly(N):
             lambda n: te.sum(A[k, n].astype("int32") * B[k].astype("int32"), axis=[k]),
             name="C",
         )
-        s = te.create_schedule(C.op)
-        s[C].vectorize(s[C].op.axis[0])
-        f = tvm.build(s, [A, B, C], target)
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B, C]))
+        sch.vectorize(sch.get_loops("C")[0])
+        f = tvm.build(sch.mod, target=target)
 
         # Verify we see the correct number of vmlal.s16 instructions
         assembly = f.get_source("asm")
diff --git a/tests/python/codegen/test_target_codegen_bool.py b/tests/python/codegen/test_target_codegen_bool.py
index b9f4437110c8..a575c0cec9c9 100644
--- a/tests/python/codegen/test_target_codegen_bool.py
+++ b/tests/python/codegen/test_target_codegen_bool.py
@@ -35,29 +35,24 @@ def compute(arr_size):
 
 
 @tvm.testing.fixture
-def schedule(target, compute):
+def get_module(target, compute):
     target = tvm.target.Target(target)
     A, B, C, D = compute
     if target.kind.name == "llvm":
-        s = te.create_schedule(D.op)
-        xo, xi = s[C].split(C.op.axis[0], factor=4)
-        xo1, xo2 = s[C].split(xo, factor=13)
-        s[C].parallel(xo2)
+        return tvm.IRModule.from_expr(te.create_prim_func([A, B, D]))
 
-    else:
-        s = te.create_schedule(D.op)
-        for stage in [C, D]:
-            xo, xi = s[stage].split(stage.op.axis[0], factor=4)
-            s[stage].bind(xo, te.thread_axis("blockIdx.x"))
-            s[stage].bind(xi, te.thread_axis("threadIdx.x"))
-
-    return s
+    sch = tvm.tir.Schedule(te.create_prim_func([A, B, D]))
+    for stage in ["C", "D"]:
+        xo, xi = sch.split(sch.get_loops(stage)[0], factors=[None, 4])
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xi, "threadIdx.x")  # inner loop -> threads within each block
+    return sch.mod
 
 
 @tvm.testing.uses_gpu
-def test_cmp_load_store(target, dev, arr_size, compute, schedule):
+def test_cmp_load_store(target, dev, arr_size, compute, get_module):
     A, B, _, D = compute
-    f = tvm.build(schedule, [A, B, D], target)
+    f = tvm.build(get_module, target=target)
 
     a_np = np.random.uniform(size=arr_size).astype(A.dtype)
     b_np = np.random.uniform(size=arr_size).astype(B.dtype)
diff --git a/tests/python/codegen/test_target_codegen_c_host.py b/tests/python/codegen/test_target_codegen_c_host.py
index 3aca0fc8c77e..d7a7cbc8a44b 100644
--- a/tests/python/codegen/test_target_codegen_c_host.py
+++ b/tests/python/codegen/test_target_codegen_c_host.py
@@ -31,61 +31,19 @@ def test_add():
     A = te.placeholder((n,), name="A")
     B = te.placeholder((n,), name="B")
     C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
-    s = te.create_schedule(C.op)
 
     def check_c():
-        mhost = tvm.build(s, [A, B, C], "c", name="test_fadd")
-        temp = utils.tempdir()
-        path_dso = temp.relpath("temp.so")
-        mhost.export_library(path_dso)
-        m = tvm.runtime.load_module(path_dso)
-        fadd = m["test_fadd"]
-        dev = tvm.cpu(0)
-        # launch the kernel.
-        n = nn
-        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
-        fadd(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
-    check_c()
-
-
-def test_add_pipeline():
-    nn = 1024
-    n = tvm.runtime.convert(nn)
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    AA = te.compute((n,), lambda *i: A(*i), name="A")
-    BB = te.compute((n,), lambda *i: B(*i), name="B")
-    T = te.compute(A.shape, lambda *i: AA(*i) + BB(*i), name="T")
-    C = te.compute(A.shape, lambda *i: T(*i), name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=4)
-    xo1, xo2 = s[C].split(xo, factor=13)
-    s[C].parallel(xo2)
-    s[C].pragma(xo1, "parallel_launch_point")
-    s[C].pragma(xo2, "parallel_stride_pattern")
-    s[C].pragma(xo2, "parallel_barrier_when_finish")
-    # FIXME(tvm-team): vector operators are not supported for codegen to C yet
-    # s[C].vectorize(xi)
-
-    def check_c():
-        # Specifically allow offset to test codepath when offset is available
-        Ab = tvm.tir.decl_buffer(
-            A.shape, A.dtype, elem_offset=te.size_var("Aoffset"), offset_factor=8, name="A"
+        mhost = tvm.build(
+            tvm.IRModule.from_expr(
+                te.create_prim_func([A, B, C]).with_attr("global_symbol", "test_fadd")
+            ),
+            target="c",
         )
-        binds = {A: Ab}
-        # BUILD and invoke the kernel.
-        f1 = tvm.lower(s, [A, B, C], name="test_fadd_pipeline")
-        mhost = tvm.build(f1, target="c")
-
         temp = utils.tempdir()
         path_dso = temp.relpath("temp.so")
         mhost.export_library(path_dso)
         m = tvm.runtime.load_module(path_dso)
-        fadd = m["test_fadd_pipeline"]
+        fadd = m["test_fadd"]
         dev = tvm.cpu(0)
         # launch the kernel.
         n = nn
@@ -105,10 +63,14 @@ def test_reinterpret():
     B = te.compute(
         A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.reinterpret", 2 + A(*i)), name="B"
     )
-    s = te.create_schedule(B.op)
 
     def check_c():
-        mhost = tvm.build(s, [A, B], "c", name="test_reinterpret")
+        mhost = tvm.build(
+            tvm.IRModule.from_expr(
+                te.create_prim_func([A, B]).with_attr("global_symbol", "test_reinterpret")
+            ),
+            target="c",
+        )
         temp = utils.tempdir()
         path_dso = temp.relpath("temp.so")
         mhost.export_library(path_dso)
@@ -129,10 +91,14 @@ def test_ceil():
     n = tvm.runtime.convert(nn)
     A = te.placeholder((n,), name="A", dtype="float32")
     B = te.compute(A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.ceil", A(*i)), name="B")
-    s = te.create_schedule(B.op)
 
     def check_c():
-        mhost = tvm.build(s, [A, B], "c", name="test_ceil")
+        mhost = tvm.build(
+            tvm.IRModule.from_expr(
+                te.create_prim_func([A, B]).with_attr("global_symbol", "test_ceil")
+            ),
+            target="c",
+        )
         temp = utils.tempdir()
         path_dso = temp.relpath("temp.so")
         mhost.export_library(path_dso)
@@ -153,10 +119,14 @@ def test_floor():
     n = tvm.runtime.convert(nn)
     A = te.placeholder((n,), name="A", dtype="float32")
     B = te.compute(A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.floor", A(*i)), name="B")
-    s = te.create_schedule(B.op)
 
     def check_c():
-        mhost = tvm.build(s, [A, B], "c", name="test_floor")
+        mhost = tvm.build(
+            tvm.IRModule.from_expr(
+                te.create_prim_func([A, B]).with_attr("global_symbol", "test_floor")
+            ),
+            target="c",
+        )
         temp = utils.tempdir()
         path_dso = temp.relpath("temp.so")
         mhost.export_library(path_dso)
@@ -177,10 +147,14 @@ def test_round():
     n = tvm.runtime.convert(nn)
     A = te.placeholder((n,), name="A", dtype="float32")
     B = te.compute(A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.round", A(*i)), name="B")
-    s = te.create_schedule(B.op)
 
     def check_c():
-        mhost = tvm.build(s, [A, B], "c", name="test_round")
+        mhost = tvm.build(
+            tvm.IRModule.from_expr(
+                te.create_prim_func([A, B]).with_attr("global_symbol", "test_round")
+            ),
+            target="c",
+        )
         temp = utils.tempdir()
         path_dso = temp.relpath("temp.so")
         mhost.export_library(path_dso)
@@ -196,42 +170,6 @@ def check_c():
     check_c()
 
 
-def test_call_packed():
-    def fake_func(fname="fake.func"):
-        ib = tvm.tir.ir_builder.create()
-        A = ib.pointer("float32", name="A")
-        fake_func1 = tvm.tir.call_packed(fname, A[0])
-
-        ib.emit(fake_func1)
-        body = ib.get()
-        return A, body
-
-    def check_global_packed_func():
-        fname = "fake.func"
-        A, body = fake_func(fname)
-        func1 = tvm.tir.PrimFunc([A], body).with_attr("global_symbol", "func1")
-        B, body = fake_func()
-        func2 = tvm.tir.PrimFunc([B], body).with_attr("global_symbol", "func2")
-        mod = tvm.IRModule({"fake_func1": func1, "fake_func2": func2})
-        fcode = tvm.build(mod, None, "c")
-        src = fcode.get_source()
-
-        # there are two locations calling the packed func
-        assert src.count(fname) == 2
-
-        suffix = "_packed"
-        packed_func_name = fname + suffix
-        # func name will be standardized by GetUniqueName and not exists anymore
-        assert src.find(packed_func_name) == -1
-
-        packed_func_real_name = "_".join(fname.split(".")) + suffix
-        func_declaration = "static void* %s = NULL;" % packed_func_real_name
-        # src only has 1 valid declaration
-        assert src.count(func_declaration) == 1
-
-    check_global_packed_func()
-
-
 def test_subroutine_call():
     @I.ir_module
     class mod:
diff --git a/tests/python/codegen/test_target_codegen_cross_llvm.py b/tests/python/codegen/test_target_codegen_cross_llvm.py
index 8758ae2a04e8..9dc001e1949a 100644
--- a/tests/python/codegen/test_target_codegen_cross_llvm.py
+++ b/tests/python/codegen/test_target_codegen_cross_llvm.py
@@ -32,10 +32,11 @@ def test_llvm_add_pipeline():
     A = te.placeholder((n,), name="A")
     B = te.placeholder((n,), name="B")
     C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=4)
-    s[C].parallel(xo)
-    s[C].vectorize(xi)
+
+    sch = tvm.tir.Schedule(te.create_prim_func([A, B, C]))
+    xo, xi = sch.split(sch.get_loops("C")[0], factors=[None, 4])
+    sch.parallel(xo)
+    sch.vectorize(xi)
 
     def verify_elf(path, e_machine):
         with open(path, "rb") as fi:
@@ -48,7 +49,7 @@ def verify_elf(path, e_machine):
     def build_i386():
         temp = utils.tempdir()
         target = "llvm -mtriple=i386-pc-linux-gnu"
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(sch.mod, target=target)
         path = temp.relpath("myadd.o")
         f.save(path)
         verify_elf(path, 0x03)
@@ -59,7 +60,7 @@ def build_arm():
             print("Skip because %s is not enabled.." % target)
             return
         temp = utils.tempdir()
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(sch.mod, target=target)
         path = temp.relpath("myadd.o")
         f.save(path)
         verify_elf(path, 0x28)
diff --git a/tests/python/codegen/test_target_codegen_cuda.py b/tests/python/codegen/test_target_codegen_cuda.py
index 7b370f3e3211..ae3173a14dee 100644
--- a/tests/python/codegen/test_target_codegen_cuda.py
+++ b/tests/python/codegen/test_target_codegen_cuda.py
@@ -28,9 +28,6 @@
 import tvm.testing
 import pytest
 
-tx = te.thread_axis("threadIdx.x")
-bx = te.thread_axis("blockIdx.x")
-
 
 @tvm.testing.requires_gpu
 @tvm.testing.requires_cuda
@@ -46,11 +43,13 @@ def check_cuda(dtype, n, lanes):
             return
         A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes))
         B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B")
-        s = te.create_schedule(B.op)
-        xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
-        s[B].bind(xo, bx)
-        s[B].bind(xi, tx)
-        fun = tvm.build(s, [A, B], "cuda")
+
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, num_thread])
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xi, "threadIdx.x")
+        fun = tvm.build(sch.mod, target="cuda")
+
         dev = tvm.cuda(0)
         a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes)))
         c = tvm.nd.empty((n,), B.dtype, dev)
@@ -96,14 +95,15 @@ def np_bf162np_float(arr):
     def check_cuda(n, lanes):
         A = te.placeholder((n,), name="A", dtype="bfloat16x%d" % lanes)
         B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B")
-        s = te.create_schedule(B.op)
-        xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
-        s[B].bind(xo, bx)
-        s[B].bind(xi, tx)
+
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, num_thread])
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xi, "threadIdx.x")
         with tvm.transform.PassContext(
             disabled_pass=["tir.BF16Promote", "tir.BF16CastElimination", "tir.BF16TypeLowering"]
         ):
-            fun = tvm.build(s, [A, B], "cuda")
+            fun = tvm.build(sch.mod, target="cuda")
         dev = tvm.cuda(0)
         np_a = np.random.uniform(size=(n, lanes)).astype("float32")
         np_a = np_bf162np_float(np_float2np_bf16(np_a))
@@ -134,11 +134,12 @@ def check_cuda(dtype, n, lanes):
         D = te.compute(
             (n,), lambda i: tvm.tir.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]), name="D"
         )
-        s = te.create_schedule(D.op)
-        xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
-        s[D].bind(xo, bx)
-        s[D].bind(xi, tx)
-        fun = tvm.build(s, [A, B, C, D], "cuda")
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B, C, D]))
+        xo, xi = sch.split(sch.get_loops("D")[0], factors=[None, num_thread])
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xi, "threadIdx.x")
+        fun = tvm.build(sch.mod, target="cuda")
+
         np_a = np.random.randint(low=-128, high=127, size=(n, lanes))
         np_b = np.random.randint(low=-128, high=127, size=(n, lanes))
         np_c = np.random.randint(low=0, high=127, size=(n,))
@@ -163,11 +164,13 @@ def check_cuda(dtype, n, lanes):
         dev = tvm.cuda(0)
         A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes))
         B = te.compute((n,), lambda i: A[i], name="B")
-        s = te.create_schedule(B.op)
-        block, thread = s[B].split(B.op.axis[0], factor=num_thread)
-        s[B].bind(block, bx)
-        s[B].bind(thread, tx)
-        fun = tvm.build(s, [A, B], "cuda", name="vector_load")
+
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, num_thread])
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xi, "threadIdx.x")
+        fun = tvm.build(sch.mod, target="cuda")
+
         np_a = np.random.randint(low=-128, high=127, size=(n, lanes))
         a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np_a)
         b = tvm.nd.empty((n,), B.dtype, dev)
@@ -187,12 +190,14 @@ def test_cuda_make_int8():
     def check_cuda(n, value, lanes):
         dtype = "int8"
         dev = tvm.cuda(0)
-        A = te.compute((n, lanes), lambda i, j: tvm.tir.const(value, dtype=dtype))
-        s = te.create_schedule(A.op)
-        y, x = s[A].op.axis
-        s[A].vectorize(x)
-        s[A].bind(y, bx)
-        fun = tvm.build(s, [A], "cuda", name="make_int8x4")
+        A = te.compute((n, lanes), lambda i, j: tvm.tir.const(value, dtype=dtype), name="A")
+
+        sch = tvm.tir.Schedule(te.create_prim_func([A]))
+        y, x = sch.get_loops("A")
+        sch.vectorize(x)
+        sch.bind(y, "blockIdx.x")
+        fun = tvm.build(sch.mod, target="cuda")
+
         np_a = np.full((n, lanes), value, dtype=dtype)
         a = tvm.nd.empty(np_a.shape, dtype, dev)
         fun(a)
@@ -215,13 +220,13 @@ def test_cuda_make_int4():
     def check_cuda(n, value, lanes):
         dtype = "int4"
         dev = tvm.cuda(0)
-        A = te.compute((n, lanes), lambda i, j: tvm.tir.const(value, dtype=dtype))
-        s = te.create_schedule(A.op)
-        y, x = s[A].op.axis
-        s[A].vectorize(x)
-        s[A].bind(y, bx)
-        kernel_name = "make_int4x" + str(lanes)
-        fun = tvm.build(s, [A], "cuda", name=kernel_name)
+        A = te.compute((n, lanes), lambda i, j: tvm.tir.const(value, dtype=dtype), name="A")
+        sch = tvm.tir.Schedule(te.create_prim_func([A]))
+        y, x = sch.get_loops("A")
+        sch.vectorize(x)
+        sch.bind(y, "blockIdx.x")
+        fun = tvm.build(sch.mod, target="cuda")
+
         np_a = np.full((n, lanes), value, dtype="int8")
         a = tvm.nd.empty((n, lanes), dtype, dev)
         fun(a)
@@ -246,9 +251,13 @@ def check_inf_nan(dev, n, value, dtype):
         A = te.placeholder((n,), name="A", dtype=dtype)
         inf_value = tvm.tir.const(value, dtype=dtype)
         C = te.compute((n,), lambda i: inf_value, name="C")
-        s = te.create_schedule(C.op)
-        s[C].bind(s[C].op.axis[0], tx)
-        fun = tvm.build(s, [A, C], target)
+
+        sch = tvm.tir.Schedule(te.create_prim_func([A, C]))
+        xo, xi = sch.split(sch.get_loops("C")[0], factors=[None, 8])
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xi, "threadIdx.x")
+        fun = tvm.build(sch.mod, target="cuda")
+
         a = tvm.nd.empty((n,), A.dtype, dev)
         c = tvm.nd.empty((n,), A.dtype, dev)
         # Only need to test compiling here
@@ -264,53 +273,6 @@ def check_inf_nan(dev, n, value, dtype):
     check_inf_nan(dev, 1, float("nan"), "float64")
 
 
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_cuda_shuffle():
-    idxm = tvm.tir.indexmod
-    a = te.placeholder((64,), "int32")
-    b = te.placeholder((64,), "int32")
-    c = te.compute((64,), lambda x: a[x] + b[x - idxm(x, 4) + (3 - idxm(x, 4))])
-    sch = te.create_schedule(c.op)
-    x = c.op.axis[0]
-    xo, xi = sch[c].split(x, 4)
-    thrx = te.thread_axis("threadIdx.x")
-    sch[c].bind(xo, thrx)
-    sch[c].vectorize(xi)
-
-    def MyVectorize():
-        def vectorizer(op):
-            if op.kind == tvm.tir.ForKind.VECTORIZED:
-                idx = tvm.tir.Ramp(4 * thrx.var, 1, 4)
-                store = op.body
-                value = store.value
-                new_a = tvm.tir.BufferLoad(value.a.buffer, [idx])
-                bs, ids = [], []
-                for i in range(4):
-                    bs.append(tvm.tir.BufferLoad(value.b.buffer, [4 * thrx.var + i]))
-                    ids.append(3 - i)
-                new_b = tvm.tir.Shuffle(bs, ids)
-                return tvm.tir.BufferStore(store.buffer, new_a + new_b, [idx])
-            return None
-
-        def _transform(f, *_):
-            return f.with_body(
-                tvm.tir.stmt_functor.ir_transform(f.body, None, vectorizer, ["tir.For"])
-            )
-
-        return tvm.tir.transform.prim_func_pass(_transform, opt_level=0, name="MyVectorize")
-
-    with tvm.transform.PassContext(config={"tir.add_lower_pass": [(1, MyVectorize())]}):
-        module = tvm.build(sch, [a, b, c], target="cuda")
-        a_ = np.array(list(range(64)), dtype="int32")
-        b_ = np.array((list(range(4))[::-1]) * 16, dtype="int32")
-        c_ = np.zeros((64,), dtype="int32")
-        ref = a_ + np.array((list(range(4))) * 16, dtype="int32")
-        nda, ndb, ndc = [tvm.nd.array(i, tvm.cuda(0)) for i in [a_, b_, c_]]
-        module(nda, ndb, ndc)
-        tvm.testing.assert_allclose(ndc.numpy(), ref)
-
-
 @tvm.testing.parametrize_targets("cuda", "rocm")
 def test_crossthread_reduction1(target, dev):
     n = te.var("n")
@@ -320,12 +282,13 @@ def test_crossthread_reduction1(target, dev):
     B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
 
     def sched(nthd):
-        s = te.create_schedule(B.op)
-        ko, _ = s[B].split(B.op.reduce_axis[0], nparts=nthd)
-        s[B].bind(ko, te.thread_axis("threadIdx.x"))
-        s[B].bind(B.op.axis[0], te.thread_axis("blockIdx.x"))
-        func = tvm.build(s, [A, B], target)
-        return func
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        x, k = sch.get_loops("B")
+        ko, _ = sch.split(k, factors=[nthd, None])
+        sch.bind(ko, "threadIdx.x")
+        sch.bind(x, "blockIdx.x")
+        fun = tvm.build(sch.mod, target=target)  # honor the parametrized target (cuda/rocm)
+        return fun
 
     def verify(nthd):
         func = sched(nthd)
@@ -355,13 +318,14 @@ def test_crossthread_reduction2(target, dev):
     B = te.compute((n,), lambda i: te.sum(A[i, k0, k1], axis=(k0, k1)), name="B")
 
     def sched(nthdx, nthdy):
-        s = te.create_schedule(B.op)
-        k0o, _ = s[B].split(B.op.reduce_axis[0], nparts=nthdx)
-        k1o, _ = s[B].split(B.op.reduce_axis[1], nparts=nthdy)
-        s[B].bind(k0o, te.thread_axis("threadIdx.x"))
-        s[B].bind(k1o, te.thread_axis("threadIdx.y"))
-        s[B].bind(B.op.axis[0], te.thread_axis("blockIdx.x"))
-        func = tvm.build(s, [A, B], target)
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        x, k0, k1 = sch.get_loops("B")
+        k0o, _ = sch.split(k0, factors=[nthdx, None])
+        k1o, _ = sch.split(k1, factors=[nthdy, None])
+        sch.bind(k0o, "threadIdx.x")
+        sch.bind(k1o, "threadIdx.y")
+        sch.bind(x, "blockIdx.x")
+        func = tvm.build(sch.mod, target=target)  # honor the parametrized target (cuda/rocm)
         return func
 
     def verify(nthdx, nthdy):
@@ -389,42 +353,13 @@ def test_cuda_reduction_binding():
     k = te.reduce_axis((0, 32), "k")
     A = te.placeholder((96, 32), name="A")
     B = te.compute((96,), lambda m: te.sum(A[m, k], axis=k), name="B")
-    s = te.create_schedule(B.op)
-
-    s[B].reorder(B.op.reduce_axis[0], B.op.axis[0])
-
-    mo, _ = s[B].split(B.op.axis[0], 32)
-    s[B].bind(mo, te.thread_axis("blockIdx.x"))
 
-    fcuda = tvm.build(s, [A, B], "cuda")
-
-
-@tvm.testing.parametrize_targets("cuda", "rocm")
-def test_rfactor_predicates(target, dev):
-    n = te.reduce_axis((0, 129), "n")
-    A = te.placeholder((129,), name="A")
-    B = te.compute((1,), lambda b: te.sum(A[n], axis=n), name="B")
-
-    s = te.create_schedule(B.op)
-
-    _, ni = s[B].split(s[B].op.reduce_axis[0], factor=8)
-
-    BF = s.rfactor(B, ni, 0)
-    s[B].set_store_predicate(tx.var.equal(0))
-
-    s[B].bind(s[B].op.reduce_axis[0], tx)
-    s[B].bind(s[B].op.axis[0], bx)
-
-    s[BF].compute_at(s[B], s[B].op.axis[0])
-
-    _, noi = s[BF].split(s[BF].op.reduce_axis[0], factor=2)
-
-    BF2 = s.rfactor(BF, noi, 0)
-
-    s[BF].bind(s[BF].op.axis[0], tx)
-    s[BF2].compute_at(s[BF], s[BF].op.axis[1])
-
-    fcuda = tvm.build(s, [A, B], target)
+    sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+    x, k = sch.get_loops("B")
+    sch.reorder(k, x)
+    mo, _ = sch.split(x, factors=[None, 32])
+    sch.bind(mo, "blockIdx.x")
+    func = tvm.build(sch.mod, target="cuda")
 
 
 @tvm.testing.requires_gpu
@@ -436,15 +371,14 @@ def test_cuda_const_float_to_half():
     shape = (2, 3, 4)
     a = te.placeholder(shape, dtype="float16", name="a")
     b = tvm.tir.const(0.5, dtype="float16")
-    c = te.compute(shape, lambda i, j, k: a[i, j, k] > b, name="c")
-    s = te.create_schedule(c.op)
-    axes = [axis for axis in c.op.axis]
-    fused = s[c].fuse(*axes)
-    bx, tx = s[c].split(fused, factor=64)
-    s[c].bind(bx, te.thread_axis("blockIdx.x"))
-    s[c].bind(tx, te.thread_axis("threadIdx.x"))
-
-    func = tvm.build(s, [a, c], "cuda")
+    c = te.compute(shape, lambda i, j, k: a[i, j, k] > b, name="C")
+
+    sch = tvm.tir.Schedule(te.create_prim_func([a, c]))
+    xo, xi = sch.split(sch.fuse(*sch.get_loops("C")), factors=[None, 64])
+    sch.bind(xo, "blockIdx.x")
+    sch.bind(xi, "threadIdx.x")
+    func = tvm.build(sch.mod, target="cuda")
+
     dev = tvm.cuda(0)
     a_np = np.random.uniform(size=shape).astype(a.dtype)
     c_np = np.zeros(shape=shape, dtype=c.dtype)
@@ -463,13 +397,14 @@ def test_cuda_floordiv_with_vectorization():
         k = 37
         A = te.placeholder((n,), name="A")
         B = te.compute((n,), lambda i: A[tvm.tir.floordiv(i, k)], name="B")
-        s = te.create_schedule(B.op)
-        xo, xi = s[B].split(B.op.axis[0], nparts=1)
-        xio, xii = s[B].split(xi, factor=4)
-        s[B].vectorize(xii)
-        s[B].bind(xo, bx)
-        s[B].bind(xio, tx)
-        func = tvm.build(s, [A, B], "cuda")
+
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        xo, xi = sch.split(sch.get_loops("B")[0], factors=[1, None])
+        xio, xii = sch.split(xi, factors=[None, 4])
+        sch.vectorize(xii)
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xio, "threadIdx.x")
+        func = tvm.build(sch.mod, target="cuda")
 
         dev = tvm.cuda(0)
         a_np = np.random.uniform(size=(n,)).astype(A.dtype)
@@ -489,13 +424,13 @@ def test_cuda_floormod_with_vectorization():
         k = 37
         A = te.placeholder((n,), name="A")
         B = te.compute((n,), lambda i: A[tvm.tir.floormod(i, k)], name="B")
-        s = te.create_schedule(B.op)
-        xo, xi = s[B].split(B.op.axis[0], nparts=1)
-        xio, xii = s[B].split(xi, factor=4)
-        s[B].vectorize(xii)
-        s[B].bind(xo, bx)
-        s[B].bind(xio, tx)
-        func = tvm.build(s, [A, B], "cuda")
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        xo, xi = sch.split(sch.get_loops("B")[0], factors=[1, None])
+        xio, xii = sch.split(xi, factors=[None, 4])
+        sch.vectorize(xii)
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xio, "threadIdx.x")
+        func = tvm.build(sch.mod, target="cuda")
 
         dev = tvm.cuda(0)
         a_np = np.random.uniform(size=(n,)).astype(A.dtype)
@@ -521,11 +456,11 @@ def check(t0, t1, factor):
         C = te.compute((n,), lambda i: A[i] + topi.cast(B[i], A.dtype), name="C")
 
         # schedule
-        s = tvm.te.create_schedule(C.op)
-        ob, ib = s[C].split(s[C].op.axis[0], factor=factor)
-        s[C].vectorize(ib)
-        s[C].bind(ob, tx)
-        func = tvm.build(s, [A, B, C], "cuda")
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B, C]))
+        ob, ib = sch.split(sch.get_loops("C")[0], factors=[None, factor])
+        sch.vectorize(ib)
+        sch.bind(ob, "threadIdx.x")
+        func = tvm.build(sch.mod, target="cuda")
 
         # correctness
         dev = tvm.cuda(0)
@@ -570,15 +505,16 @@ def skip(t0, t1):
     check("uint8", "int8", 16)
 
 
-def sched(B):
-    s = te.create_schedule(B.op)
-    io, ii = s[B].split(s[B].op.axis[0], nparts=1)
-    iio, iii = s[B].split(ii, nparts=32)
-    _, iiii = s[B].split(iii, factor=4)
-    s[B].vectorize(iiii)
-    s[B].bind(io, bx)
-    s[B].bind(iio, tx)
-    return s
+def sched(A, B):
+    # Tile B's loop as 1 block x 32 threads x rest x 4 (vectorized) and build for CUDA.
+    sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+    io, ii = sch.split(sch.get_loops("B")[0], factors=[1, None])
+    iio, iii = sch.split(ii, factors=[32, None])
+    _, iiii = sch.split(iii, factors=[None, 4])
+    sch.vectorize(iiii)
+    sch.bind(io, "blockIdx.x")
+    sch.bind(iio, "threadIdx.x")
+    return tvm.build(sch.mod, target="cuda")
 
 
 @tvm.testing.requires_gpu
@@ -627,8 +563,7 @@ def run_test(tvm_intrin, np_func, dtype):
         n = 128
         A = te.placeholder((n,), dtype=dtype, name="A")
         B = te.compute((n,), lambda *i: tvm_intrin(A(*i)), name="B")
-        s = sched(B)
-        f = tvm.build(s, [A, B], "cuda")
+        f = sched(A, B)
         dev = tvm.cuda(0)
         a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev)
         b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), dev)
@@ -653,8 +588,7 @@ def run_test(tvm_intrin, np_func):
         n = 128
         A = te.placeholder((n,), dtype=dtype, name="A")
         B = te.compute((n,), lambda i: tvm_intrin(A[i], c2), name="B")
-        s = sched(B)
-        f = tvm.build(s, [A, B], "cuda")
+        f = sched(A, B)
         dev = tvm.cuda(0)
         a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev)
         b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), dev)
@@ -679,8 +613,7 @@ def run_test(dtype):
         n = 128
         A = te.placeholder((n,), dtype=dtype, name="A")
         B = te.compute((n,), lambda i: tvm.tir.popcount(A[i]), name="B")
-        s = sched(B)
-        f = tvm.build(s, [A, B], "cuda")
+        f = sched(A, B)
         dev = tvm.cuda(0)
         a = tvm.nd.array(np.random.randint(0, 100000, size=n).astype(A.dtype), dev)
         b = tvm.nd.array(np.zeros(shape=(n,)).astype(B.dtype), dev)
@@ -711,12 +644,14 @@ def check_cuda(dtype, n, l, padding, lanes):
             ),
             name="B",
         )
-        s = te.create_schedule(B.op)
-        block, thread, vectorize = s[B].op.axis
-        s[B].bind(block, bx)
-        s[B].bind(thread, tx)
-        s[B].vectorize(vectorize)
-        fun = tvm.build(s, [A, B], "cuda", name="vector_load_permute_pad")
+
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        block, thread, vectorize = sch.get_loops("B")
+        sch.bind(block, "blockIdx.x")
+        sch.bind(thread, "threadIdx.x")
+        sch.vectorize(vectorize)
+        fun = tvm.build(sch.mod, target="cuda")
+
         np_a = np.random.randint(low=-128, high=127, size=(n, l)).astype(A.dtype)
         a = tvm.nd.empty((n, l), A.dtype, dev).copyfrom(np_a)
         b = tvm.nd.empty((n // lanes, l + padding * 2, lanes), B.dtype, dev)
@@ -736,205 +671,6 @@ def check_cuda(dtype, n, l, padding, lanes):
     check_cuda("float32", 64, 16, 3, 4)
 
 
-def vcf_check_common(s, args):
-    N = 512
-
-    # To check if every vectorize loop transforms to ramp expr successfully
-    stmt = tvm.lower(s, args)
-    # Use this as a stack flag to show whether this stmt is inside a BroadcastNode
-    inside_broadcast = [False]
-
-    # Possible patterns:
-    # Reduce init:          BufferStore[Ramp] = Broadcast(0)
-    # Shared memory copy:   BufferStore[Ramp] = BufferLoad[Ramp]
-    # Compute:              BufferStore[Ramp] = BufferLoad[Ramp] ... Broadcast[Load]
-
-    def pre_visit(stmt):
-        if isinstance(stmt, tvm.tir.Broadcast):
-            inside_broadcast[0] = True
-            # Check Broadcast[Imm numbers] or Broadcast[Load] patterns
-            assert isinstance(stmt.value, (tvm.tir.IntImm, tvm.tir.FloatImm, tvm.tir.BufferLoad))
-
-        if isinstance(stmt, (tvm.tir.BufferStore, tvm.tir.BufferLoad)):
-            is_ramp_index = isinstance(stmt.indices[-1], tvm.tir.Ramp)
-            is_vectorized_buffer = re.match(r"^.*x\d+$", stmt.buffer.dtype)
-            if isinstance(stmt, tvm.tir.BufferLoad):
-                # Check Broadcast[BufferLoad] or BufferLoad[Ramp] patterns
-                assert inside_broadcast[0] or is_ramp_index or is_vectorized_buffer
-                # Skip the rest of the BufferLoad
-                return stmt
-            else:
-                assert is_ramp_index or is_vectorized_buffer
-
-        return None
-
-    def post_visit(stmt):
-        if isinstance(stmt, tvm.tir.Broadcast):
-            inside_broadcast[0] = False
-        return None
-
-    tvm.tir.stmt_functor.ir_transform(stmt["main"].body, pre_visit, post_visit)
-
-    tgt = tvm.target.cuda()
-    mod = tvm.build(s, args, tgt)
-    # To check if every vectorize loop transforms to correct instruction
-    # print(mod.imported_modules[0].get_source())
-
-    dev = tvm.device("cuda", 0)
-    a = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), dev)
-    b = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), dev)
-    c = tvm.nd.array(np.zeros((512, 512), dtype="float32"), dev)
-    mod(a, b, c)
-    tvm.testing.assert_allclose(c.numpy(), np.dot(a.numpy(), b.numpy()), rtol=1e-5)
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_vectorized_cooperative_fetching_x():
-    N = 512
-    A = te.placeholder((N, N), name="A", dtype="float32")
-    B = te.placeholder((N, N), name="B", dtype="float32")
-    k = te.reduce_axis((0, N), name="k")
-    C = te.compute((N, N), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k))
-    s = te.create_schedule(C.op)
-    i, j = s[C].op.axis
-    k = s[C].op.reduce_axis[0]
-
-    AA = s.cache_read(A, "shared", [C])
-    BB = s.cache_read(B, "shared", [C])
-
-    i3, i4 = s[C].split(i, factor=4)
-    i2, i3 = s[C].split(i3, factor=2)
-    i1, i2 = s[C].split(i2, factor=8)
-    i0, i1 = s[C].split(i1, factor=1)
-    j3, j4 = s[C].split(j, factor=4)
-    j2, j3 = s[C].split(j3, factor=2)
-    j1, j2 = s[C].split(j2, factor=8)
-    j0, j1 = s[C].split(j1, factor=2)
-    k1, k2 = s[C].split(k, factor=8)
-    k0, k1 = s[C].split(k1, factor=8)
-    s[C].reorder(i0, j0, i1, j1, i2, j2, k0, k1, i3, j3, k2, i4, j4)
-    block_it = s[C].fuse(i0, j0)
-    s[C].bind(block_it, tvm.te.thread_axis("blockIdx.x"))
-    vthread_it = s[C].fuse(i1, j1)
-    s[C].bind(vthread_it, tvm.te.thread_axis("vthread"))
-    thread_it = s[C].fuse(i2, j2)
-    s[C].bind(thread_it, tvm.te.thread_axis("threadIdx.x"))
-    s[C].vectorize(j4)
-
-    s[AA].compute_at(s[C], k0)
-    iaa, jaa = s[AA].op.axis
-    s[BB].compute_at(s[C], k0)
-    ibb, jbb = s[BB].op.axis
-    aa_fused = s[AA].fuse(iaa, jaa)
-    bb_fused = s[BB].fuse(ibb, jbb)
-    aa1, aa2 = s[AA].split(aa_fused, factor=4)
-    aa0, aa1 = s[AA].split(aa1, factor=64)
-    bb1, bb2 = s[BB].split(bb_fused, factor=4)
-    bb0, bb1 = s[BB].split(bb1, factor=64)
-    s[AA].bind(aa1, tvm.te.thread_axis("threadIdx.x"))
-    s[AA].vectorize(aa2)
-    s[BB].bind(bb1, tvm.te.thread_axis("threadIdx.x"))
-    s[BB].vectorize(bb2)
-
-    vcf_check_common(s, [A, B, C])
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_vectorized_cooperative_fetching_xy():
-    N = 512
-    A = te.placeholder((N, N), name="A")
-    B = te.placeholder((N, N), name="B")
-    k = te.reduce_axis((0, N), name="k")
-    C = te.compute((N, N), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k))
-    s = te.create_schedule(C.op)
-    i, j = s[C].op.axis
-    k = s[C].op.reduce_axis[0]
-
-    AA = s.cache_read(A, "shared", [C])
-    BB = s.cache_read(B, "shared", [C])
-
-    i3, i4 = s[C].split(i, factor=4)
-    i2, i3 = s[C].split(i3, factor=2)
-    i1, i2 = s[C].split(i2, factor=8)
-    i0, i1 = s[C].split(i1, factor=1)
-    j3, j4 = s[C].split(j, factor=4)
-    j2, j3 = s[C].split(j3, factor=2)
-    j1, j2 = s[C].split(j2, factor=8)
-    j0, j1 = s[C].split(j1, factor=2)
-    k1, k2 = s[C].split(k, factor=8)
-    k0, k1 = s[C].split(k1, factor=8)
-    s[C].reorder(i0, j0, i1, j1, i2, j2, k0, k1, i3, j3, k2, i4, j4)
-    block_it = s[C].fuse(i0, j0)
-    s[C].bind(block_it, tvm.te.thread_axis("blockIdx.x"))
-    vthread_it = s[C].fuse(i1, j1)
-    s[C].bind(vthread_it, tvm.te.thread_axis("vthread"))
-    s[C].bind(i2, tvm.te.thread_axis("threadIdx.y"))
-    s[C].bind(j2, tvm.te.thread_axis("threadIdx.x"))
-    s[C].vectorize(j4)
-
-    s[AA].compute_at(s[C], k0)
-    iaa, jaa = s[AA].op.axis
-    s[BB].compute_at(s[C], k0)
-    ibb, jbb = s[BB].op.axis
-    aa_fused = s[AA].fuse(iaa, jaa)
-    bb_fused = s[BB].fuse(ibb, jbb)
-    aa2, aa3 = s[AA].split(aa_fused, factor=4)
-    aa1, aa2 = s[AA].split(aa2, factor=8)
-    aa0, aa1 = s[AA].split(aa1, factor=8)
-    bb2, bb3 = s[BB].split(bb_fused, factor=4)
-    bb1, bb2 = s[BB].split(bb2, factor=8)
-    bb0, bb1 = s[BB].split(bb1, factor=8)
-    s[AA].bind(aa1, tvm.te.thread_axis("threadIdx.y"))
-    s[AA].bind(aa2, tvm.te.thread_axis("threadIdx.x"))
-    s[AA].vectorize(aa3)
-    s[BB].bind(bb1, tvm.te.thread_axis("threadIdx.y"))
-    s[BB].bind(bb2, tvm.te.thread_axis("threadIdx.x"))
-    s[BB].vectorize(bb3)
-
-    vcf_check_common(s, [A, B, C])
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_unrolled_vectorization():
-    dtype = "float32"
-    target = "cuda"
-
-    # Compute declaration
-    N = 128
-    A = te.placeholder((N, N), name="A")
-    B = te.placeholder((N, N), name="B")
-    k = te.reduce_axis((0, N), name="k")
-    C = te.compute((N, N), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="C")
-
-    # Schedule
-    s = te.create_schedule([C.op])
-    CC = s.cache_write(C, "local")
-    i, j = s[C].op.axis
-    bx, tx, ii, ji = s[C].tile(i, j, 1, 2)
-    s[C].bind(bx, te.thread_axis("blockIdx.x"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-    s[C].vectorize(ji)
-    s[CC].compute_at(s[C], tx)
-    i, j = s[CC].op.axis
-    k = s[CC].op.reduce_axis[0]
-    ko, ki = s[CC].split(k, 2)
-    s[CC].unroll(ki)
-    s[CC].vectorize(j)
-
-    # Check correctness
-    dev = tvm.device(target)
-    a_tvm = tvm.nd.array(np.ones((N, N)).astype(dtype), device=dev)
-    b_tvm = tvm.nd.array(np.ones((N, N)).astype(dtype), device=dev)
-    c_tvm = tvm.nd.empty((N, N), device=dev)
-    func_tvm = tvm.build(s, [A, B, C], target=target)
-    func_tvm(a_tvm, b_tvm, c_tvm)
-    c_np = c_tvm.numpy()
-    tvm.testing.assert_allclose(c_np, N * np.ones((N, N)))
-
-
 @tvm.testing.requires_gpu
 @tvm.testing.requires_cuda
 def test_try_unaligned_vector_load():
@@ -950,16 +686,15 @@ def get_compute_aligned():
         return get_compute(4, 2, 2)
 
     def build(A, C, N, C_N):
-        s = te.create_schedule(C.op)
-        oi, ii = s[C].split(C.op.axis[0], factor=2)
-        s[C].bind(oi, te.thread_axis("threadIdx.x"))
-        s[C].vectorize(ii)  # BUG: misalignment
-
-        tgt = tvm.target.Target(target="cuda", host="llvm")
-        dev = tvm.device(tgt.kind.name, 0)
-        f = tvm.build(s, [A, C], tgt, name="foo")
-        kernel_source = f.imported_modules[0].get_source()
+        sch = tvm.tir.Schedule(te.create_prim_func([A, C]))
+        oi, ii = sch.split(sch.get_loops("C")[0], factors=[None, 2])
+        sch.bind(oi, "threadIdx.x")
+        sch.vectorize(ii)  # BUG: misalignment
 
+        f = tvm.build(sch.mod, target="cuda")
+
+        kernel_source = f.imported_modules[0].get_source()
+        dev = tvm.cuda()
         a_data = np.arange(0, N).astype(A.dtype)
         a = tvm.nd.array(a_data, dev)
         c = tvm.nd.array(np.zeros(C_N, dtype=C.dtype), dev)
@@ -984,28 +719,6 @@ def build(A, C, N, C_N):
     assert np.allclose(c, expected), f"expected={expected}\nactual={c}"
 
 
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_cuda_save_kernels_for_profiling():
-    num_thread = 8
-
-    def check_cuda(n, lanes):
-        dtype = "float32"
-        A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes))
-        B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B")
-        s = te.create_schedule(B.op)
-        xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
-        s[B].bind(xo, bx)
-        s[B].bind(xi, tx)
-        tempdir = utils.tempdir()
-        tmp_path = str(tempdir.path)
-        with tvm.transform.PassContext(opt_level=3, config={"cuda.kernels_output_dir": tmp_path}):
-            _ = tvm.build(s, [A, B], "cuda")
-        assert "tvm_kernels.cu" in os.listdir(tmp_path)
-
-    check_cuda(64, 2)
-
-
 @tvm.testing.requires_gpu
 @tvm.testing.requires_cuda
 def test_cuda_thread_sync_inside_condition():
diff --git a/tests/python/codegen/test_target_codegen_device.py b/tests/python/codegen/test_target_codegen_device.py
index b4181fb7b014..ad27356961aa 100644
--- a/tests/python/codegen/test_target_codegen_device.py
+++ b/tests/python/codegen/test_target_codegen_device.py
@@ -19,6 +19,7 @@
 from tvm.contrib import utils
 import numpy as np
 import tvm.testing
+from tvm import tir
 
 
 @tvm.testing.requires_gpu
@@ -29,16 +30,25 @@ def test_large_uint_imm():
     num_thread = 2
 
     A = te.compute((n,), lambda *i: tvm.tir.const(value, "uint64") + other, name="A")
-    s = te.create_schedule(A.op)
-    xo, xi = s[A].split(A.op.axis[0], factor=num_thread)
-    s[A].bind(xi, te.thread_axis("threadIdx.x"))
-    s[A].bind(xo, te.thread_axis("blockIdx.x"))
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A])
+    sch = tir.Schedule(mod)
+
+    # Get block and loop
+    block = sch.get_block("A")
+    loop = sch.get_loops(block)[0]
+
+    # Split and bind
+    xo, xi = sch.split(loop, factors=[None, num_thread])
+    sch.bind(xi, "threadIdx.x")
+    sch.bind(xo, "blockIdx.x")
 
     def check_target(device):
         if not tvm.testing.device_enabled(device):
             return
         dev = tvm.device(device, 0)
-        f = tvm.build(s, [A], device)
+        f = tvm.build(sch.mod, target=device)
         # launch the kernel.
         a = tvm.nd.empty((n,), dtype=A.dtype, device=dev)
         f(a)
@@ -55,23 +65,36 @@ def test_add_pipeline():
     B = te.placeholder((), name="B")
     C = te.compute(A.shape, lambda *i: A(*i) + B(), name="C")
     D = te.compute(A.shape, lambda *i: C(*i) + 1, name="D")
-    s = te.create_schedule(D.op)
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, B, D])
+    sch = tir.Schedule(mod)
+
+    # Get blocks and loops
+    c_block = sch.get_block("C")
+    d_block = sch.get_block("D")
+    c_loop = sch.get_loops(c_block)[0]
+    d_loop = sch.get_loops(d_block)[0]
 
     # GPU schedule have to split by gridIdx and threadIdx
     num_thread = 256
-    xo, xi = s[C].split(C.op.axis[0], factor=num_thread)
-    s[C].bind(xi, te.thread_axis("threadIdx.x"))
-    s[C].bind(xo, te.thread_axis("blockIdx.x"))
 
-    xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
-    s[D].bind(xi, te.thread_axis("threadIdx.x"))
-    s[D].bind(xo, te.thread_axis("blockIdx.x"))
+    # Schedule C
+    c_xo, c_xi = sch.split(c_loop, factors=[None, num_thread])
+    sch.bind(c_xi, "threadIdx.x")
+    sch.bind(c_xo, "blockIdx.x")
+
+    # Schedule D
+    d_xo, d_xi = sch.split(d_loop, factors=[None, num_thread])
+    sch.bind(d_xi, "threadIdx.x")
+    sch.bind(d_xo, "blockIdx.x")
 
     def check_target(device, host="stackvm"):
         if not tvm.testing.device_enabled(device) or not tvm.testing.device_enabled(host):
             return
         dev = tvm.device(device, 0)
-        mhost = tvm.driver.build(s, [A, B, D], target=tvm.target.Target(device, host))
+        target = tvm.target.Target(device, host)
+        mhost = tvm.build(sch.mod, target=target)
         f = mhost.entry_func
         # launch the kernel.
         n = 1027
diff --git a/tests/python/codegen/test_target_codegen_extern.py b/tests/python/codegen/test_target_codegen_extern.py
index 38fac332e9de..378eb427fd54 100644
--- a/tests/python/codegen/test_target_codegen_extern.py
+++ b/tests/python/codegen/test_target_codegen_extern.py
@@ -18,6 +18,8 @@
 from tvm import te
 import numpy as np
 import tvm.testing
+import pytest
+from tvm import tir
 
 
 @tvm.testing.uses_gpu
@@ -56,18 +58,18 @@ def extern_generator_gpu(ins, outs):
 
     C_cpu = te.extern(A.shape, [A], extern_generator, name="C")
     C_gpu = te.extern(A.shape, [A], extern_generator_gpu, name="C")
-    s_cpu = te.create_schedule(C_cpu.op)
-    s_gpu = te.create_schedule(C_gpu.op)
-    print(tvm.lower(s_cpu, [A, C_cpu], simple_mode=True))
-    print(tvm.lower(s_gpu, [A, C_gpu], simple_mode=True))
+
+    # Create IRModules directly
+    mod_cpu = tvm.IRModule.from_expr(te.create_prim_func([A, C_cpu]))
+    mod_gpu = tvm.IRModule.from_expr(te.create_prim_func([A, C_gpu]))
 
     def check_target(target):
         if not tvm.testing.device_enabled(target):
             return
-        s = s_gpu if target in ["opencl", "cuda"] else s_cpu
+        mod = mod_gpu if target in ["opencl", "cuda"] else mod_cpu
         C = C_gpu if target in ["opencl", "cuda"] else C_cpu
         # build and invoke the kernel.
-        f = tvm.build(s, [A, C], target)
+        f = tvm.build(mod, target=target)
         dev = tvm.device(target, 0)
         # launch the kernel.
         n = nn
@@ -91,7 +93,9 @@ def extern_generator(ins, outs):
         return tvm.tir.call_packed("my_extern_array_func1", ins[0], outs[0])
 
     C = te.extern(A.shape, [A], extern_generator, name="C")
-    s = te.create_schedule(C.op)
+
+    # Create IRModule directly
+    mod = tvm.IRModule.from_expr(te.create_prim_func([A, C]))
 
     @tvm.register_func
     def my_extern_array_func1(aa, bb):
@@ -101,7 +105,7 @@ def check_target(target):
         if not tvm.testing.device_enabled(target):
             return
         # build and invoke the kernel.
-        f = tvm.build(s, [A, C], target)
+        f = tvm.build(mod, target=target)
         dev = tvm.cpu(0)
         # launch the kernel.
         n = nn
@@ -115,6 +119,7 @@ def check_target(target):
     check_target("llvm")
 
 
+@pytest.mark.skip("LEGACY-TO-FIX: limitation of create_prim_func with intermediate buffer")
 def test_pack_buffer_intermediate():
     nn = 1024
     n = tvm.runtime.convert(nn)
@@ -126,13 +131,13 @@ def extern_generator(ins, outs):
         return tvm.tir.call_packed("my_extern_array_func2", ins[0], outs[0])
 
     C = te.extern(B.shape, [B], extern_generator, name="C")
-    s = te.create_schedule(C.op)
+    mod = tvm.IRModule.from_expr(te.create_prim_func([A, C]))
 
     def check_target(target):
         if not tvm.testing.device_enabled(target):
             return
         # build and invoke the kernel.
-        f = tvm.build(s, [A, C], target)
+        f = tvm.build(mod, target=target)
         dev = tvm.cpu(0)
         # launch the kernel.
         n = nn
diff --git a/tests/python/codegen/test_target_codegen_hexagon.py b/tests/python/codegen/test_target_codegen_hexagon.py
index c97637f927b7..37e62e5b34ef 100644
--- a/tests/python/codegen/test_target_codegen_hexagon.py
+++ b/tests/python/codegen/test_target_codegen_hexagon.py
@@ -15,14 +15,15 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import numpy as np
 import os
-import pytest
 import re
 import sys
+import numpy as np
+import pytest
 import tvm
 import tvm.testing
 import tvm.contrib.hexagon as hexagon
+from tvm import te
 
 
 @pytest.fixture(autouse=True)
@@ -39,28 +40,17 @@ def register_linker():
 def test_basic():
     target = tvm.target.hexagon("v66", hvx=128)
 
-    def check_add(offload):
+    def check_add():
         A = tvm.te.placeholder((128,), dtype="uint8", name="A")
         B = tvm.te.placeholder((128,), dtype="uint8", name="A")
         C = tvm.te.compute((128,), lambda i: A[i] + B[i], name="C")
-        s = tvm.te.create_schedule(C.op)
-
-        if offload:
-            xo, xi = s[C].split(s[C].op.axis[0], nparts=1)
-            s[C].bind(xo, tvm.te.thread_axis("pipeline"))
-            m = tvm.build(s, [C, A, B], target=target, name="offload_add")
-            hexm = m.imported_modules[0]
-        else:
-            hexm = tvm.build(
-                s, [C, A, B], target=tvm.target.Target(target, target), name="native_add"
-            )
-
+        mod = tvm.IRModule.from_expr(te.create_prim_func([C, A, B]))
+        hexm = tvm.build(mod, target=tvm.target.Target(target, target))
         asm = hexm.get_source("s")
         vadds = re.findall(r"v[0-9]+.b = vadd\(v[0-9]+.b,v[0-9]+.b\)", asm)
         assert vadds  # Check that it's non-empty
 
-    check_add(True)
-    check_add(False)
+    check_add()
 
 
 @tvm.testing.requires_hexagon
@@ -69,48 +59,22 @@ def test_llvm_target_features():
     # Define some trivial compute
     A = tvm.te.placeholder((128,), dtype="uint8", name="A")
     C = tvm.te.compute((128,), lambda i: A[i] + 1, name="C")
-    s = tvm.te.create_schedule(C.op)
-    m = tvm.build(s, [C, A], target=tvm.target.Target(target, target), name="add_one")
+    mod = tvm.IRModule.from_expr(te.create_prim_func([C, A]).with_attr("global_symbol", "add_one"))
+    m = tvm.build(mod, target=tvm.target.Target(target, target))
     llvm_ir = m.get_source("ll")
     # Make sure we find +hvx-length128b in "attributes".
     fs = re.findall(r"attributes.*\+hvx-length128b", llvm_ir)
     assert fs  # Check that it's non-empty
 
 
-@tvm.testing.requires_hexagon
-def test_alloc_vtcm():
-    target = tvm.target.hexagon("v66")
-
-    buf_len = 2048
-    A = tvm.te.placeholder((buf_len,), name="A", dtype="int8")
-    B = tvm.te.placeholder((buf_len,), name="B", dtype="int8")
-
-    A_buf = tvm.te.compute((buf_len,), lambda *i: A(*i), "A_buf")
-    B_buf = tvm.te.compute((buf_len,), lambda *i: B(*i), "B_buf")
-    C = tvm.te.compute((buf_len,), lambda *i: A_buf(*i) + B_buf(*i), name="C")
-    s = tvm.te.create_schedule(C.op)
-
-    # Use VTCM for each buffer.
-    s[A_buf].set_scope("local.vtcm")
-    s[B_buf].set_scope("local.vtcm")
-
-    config = {"tir.add_lower_pass": hexagon.ir_lower_vtcm_pass()}
-    with tvm.transform.PassContext(config=config):
-        irmod = tvm.lower(s, [A, B, C], name="alloc_vtcm")
-
-    calls = re.findall("HexagonBackend[A-Za-z]*VTCM", str(irmod["alloc_vtcm"]))
-    assert "HexagonBackendAllocateVTCM" in calls
-    assert "HexagonBackendFreeVTCM" in calls
-
-
 @tvm.testing.requires_hexagon
 def test_llvm_options():
     target = tvm.target.hexagon("v66", llvm_options="-hexagon-noopt")
     Zero = tvm.te.compute((10,), lambda _: tvm.tir.const(0, "int32"))
-    s = tvm.te.create_schedule(Zero.op)
-    tvm.build(s, [Zero], target=target, name="zero")
+    mod = tvm.IRModule.from_expr(te.create_prim_func([Zero]))
     # Check that BuildHexagon hasn't crashed because of target attribute
     # type mismatch.
+    tvm.build(mod, target=tvm.target.Target(target, target))
     assert re.search("-hexagon-noopt", str(target))
 
 
diff --git a/tests/python/codegen/test_target_codegen_llvm.py b/tests/python/codegen/test_target_codegen_llvm.py
index d629d93d365e..e3ccff49ba1b 100644
--- a/tests/python/codegen/test_target_codegen_llvm.py
+++ b/tests/python/codegen/test_target_codegen_llvm.py
@@ -26,6 +26,7 @@
 import tvm
 import tvm.testing
 from tvm import te
+from tvm import tir
 from tvm.contrib import clang, utils
 from tvm.script import tir as T, ir as I
 from tvm.target.codegen import llvm_get_intrinsic_name, llvm_lookup_intrinsic_id
@@ -85,8 +86,13 @@ def use_llvm_intrinsic(A, C):
     C = tvm.te.extern(
         (1, 1), [A], lambda ins, outs: use_llvm_intrinsic(ins[0], outs[0]), name="C", dtype="int32"
     )
-    s = tvm.te.create_schedule(C.op)
-    f = tvm.build(s, [A, C], target="llvm")
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, C])
+    sch = tir.Schedule(mod)
+
+    # Build from scheduled TIR
+    f = tvm.build(sch.mod, target="llvm")
 
 
 @tvm.testing.requires_llvm
@@ -108,10 +114,13 @@ def test_llvm_large_uintimm():
     value = (1 << 63) + 123
     other = tvm.tir.const(3, "uint64")
     A = te.compute((), lambda: tvm.tir.const(value, "uint64") + other, name="A")
-    s = te.create_schedule(A.op)
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A])
+    sch = tir.Schedule(mod)
 
     def check_llvm():
-        f = tvm.build(s, [A], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         a = tvm.nd.empty((), dtype=A.dtype, device=dev)
@@ -122,24 +131,38 @@ def check_llvm():
 
 
 @tvm.testing.requires_llvm
-def test_llvm_persist_parallel():
+def test_llvm_multi_parallel():
     n = 128
     A = te.placeholder((n,), name="A")
     B = te.compute(A.shape, lambda *i: A(*i) + 1, name="B")
     C = te.compute(A.shape, lambda *i: te.sqrt(B(*i)) * 2 + 2, name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=8)
-    xo1, xo2 = s[C].split(xo, nparts=1)
-    s[B].compute_at(s[C], xo1)
-    s[B].parallel(s[B].op.axis[0])
-    s[B].pragma(s[B].op.axis[0], "parallel_barrier_when_finish")
-    s[C].parallel(xi)
-    s[C].pragma(xo1, "parallel_launch_point")
-    s[C].pragma(xi, "parallel_stride_pattern")
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, C])
+    sch = tir.Schedule(mod)
+
+    # Get blocks and loops
+    c_block = sch.get_block("C")
+    b_block = sch.get_block("B")
+    c_loop = sch.get_loops(c_block)[0]
+
+    # Split C's loop (the parallel annotations are applied further below)
+    xo, xi = sch.split(c_loop, factors=[None, 8])
+    xo1, xo2 = sch.split(xo, factors=[1, None])
+
+    # Move computation of B
+    sch.compute_at(b_block, xo1)
+
+    # Outermost loop enclosing B after compute_at (presumably xo1 itself -- TODO confirm)
+    b_loop = sch.get_loops(b_block)[0]
+
+    # Apply parallel scheduling
+    sch.parallel(b_loop)
+    sch.parallel(xi)
 
     def check_llvm():
         # BUILD and invoke the kernel.
-        f = tvm.build(s, [A, C], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
@@ -156,12 +179,22 @@ def check_llvm(nn, base):
         n = tvm.runtime.convert(nn)
         A = te.placeholder((n + base), name="A")
         C = te.compute((n,), lambda i: A(nn + base - i - 1), name="C")
-        s = te.create_schedule(C.op)
-        xo, xi = s[C].split(C.op.axis[0], factor=4)
-        s[C].parallel(xo)
-        s[C].vectorize(xi)
+
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, C])
+        sch = tir.Schedule(mod)
+
+        # Get block and loop
+        block = sch.get_block("C")
+        loop = sch.get_loops(block)[0]
+
+        # Split and parallelize
+        xo, xi = sch.split(loop, factors=[None, 4])
+        sch.parallel(xo)
+        sch.vectorize(xi)
+
         # build and invoke the kernel.
-        f = tvm.build(s, [A, C], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         n = nn
@@ -178,29 +211,31 @@ def check_llvm(nn, base):
 
 @tvm.testing.requires_llvm
 def test_llvm_vadd_pipeline():
-    def check_llvm(n, lanes):
-        A = te.placeholder((n,), name="A", dtype="float32x%d" % lanes)
-        B = te.compute((n,), lambda i: A[i], name="B")
-        C = te.compute((n,), lambda i: B[i] + tvm.tir.const(1, A.dtype), name="C")
-        s = te.create_schedule(C.op)
-        xo, xi = s[C].split(C.op.axis[0], nparts=2)
-        _, xi = s[C].split(xi, factor=2)
-        s[C].parallel(xo)
-        s[C].vectorize(xi)
-        s[B].compute_at(s[C], xo)
-        xo, xi = s[B].split(B.op.axis[0], factor=2)
-        s[B].vectorize(xi)
-        # build and invoke the kernel.
-        f = tvm.build(s, [A, C], "llvm")
-        dev = tvm.cpu(0)
-        # launch the kernel.
-        a = tvm.nd.empty((n,), A.dtype).copyfrom(np.random.uniform(size=(n, lanes)))
-        c = tvm.nd.empty((n,), C.dtype, dev)
-        f(a, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1)
-
-    check_llvm(64, 2)
-    check_llvm(512, 2)
+    n = te.size_var("n")
+    A = te.placeholder((n,), name="A")
+    B = te.placeholder((n,), name="B")
+    C = te.compute((n,), lambda i: A[i] + B[i], name="C")
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, B, C])
+    sch = tir.Schedule(mod)
+
+    # Get block and loop
+    block = sch.get_block("C")
+    loop = sch.get_loops(block)[0]
+
+    # Split the loop
+    _, inner = sch.split(loop, factors=[None, 4])
+    sch.vectorize(inner)
+    # Build and verify
+    f = tvm.build(sch.mod, target="llvm")
+    dev = tvm.cpu(0)
+    n = 128
+    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
+    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
+    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
+    f(a, b, c)
+    tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
 
 
 @tvm.testing.requires_llvm
@@ -209,12 +244,22 @@ def check_llvm(nn, base, stride):
         n = tvm.runtime.convert(nn)
         A = te.placeholder((n + base, stride), name="A")
         C = te.compute((n, stride), lambda i, j: A(base + i, j) + 1, name="C")
-        s = te.create_schedule(C.op)
-        xo, xi = s[C].split(C.op.axis[0], factor=4)
-        s[C].parallel(xo)
-        s[C].vectorize(xi)
+
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, C])
+        sch = tir.Schedule(mod)
+
+        # Get block and loops
+        block = sch.get_block("C")
+        i_loop, j_loop = sch.get_loops(block)
+
+        # Split and parallelize
+        xo, xi = sch.split(i_loop, factors=[None, 4])
+        sch.parallel(xo)
+        sch.vectorize(xi)
+
         # build and invoke the kernel.
-        f = tvm.build(s, [A, C], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         n = nn
@@ -237,11 +282,14 @@ def test_llvm_temp_space():
     A = te.placeholder((n,), name="A")
     B = te.compute(A.shape, lambda i: A(i) + 1, name="B")
     C = te.compute(A.shape, lambda i: B(i) + 1, name="C")
-    s = te.create_schedule(C.op)
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, C])
+    sch = tir.Schedule(mod)
 
     def check_llvm():
         # build and invoke the kernel.
-        f = tvm.build(s, [A, C], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         n = nn
@@ -255,36 +303,37 @@ def check_llvm():
 
 @tvm.testing.requires_llvm
 def test_multiple_func():
-    nn = 1024
-    n = tvm.runtime.convert(nn)
+    # Define the computation
+    n = te.size_var("n")
     A = te.placeholder((n,), name="A")
     B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=4)
-    s[C].parallel(xo)
-    s[C].vectorize(xi)
-
-    def check_llvm():
-        # build two functions
-        f2 = tvm.lower(s, [A, B, C], name="fadd1")
-        f1 = tvm.lower(s, [A, B, C], name="fadd2")
-        m = tvm.build([f1, f2], "llvm")
-        fadd2 = m["fadd2"]
-        fadd1 = m["fadd1"]
+    C = te.compute((n,), lambda i: A[i] + B[i], name="C")
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, B, C])
+    sch = tir.Schedule(mod)
+
+    # Create two functions with different names
+    mod = tvm.IRModule(
+        {
+            "fadd1": sch.mod["main"].with_attr("global_symbol", "fadd1"),
+            "fadd2": sch.mod["main"].with_attr("global_symbol", "fadd2"),
+        }
+    )
 
-        dev = tvm.cpu(0)
-        # launch the kernel.
-        n = nn
-        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
-        fadd1(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-        fadd2(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
+    # Build and verify
+    f = tvm.build(mod, target="llvm")
+    dev = tvm.cpu(0)
+    n = 10
+    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
+    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
+    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
 
-    check_llvm()
+    # Test both functions
+    f["fadd1"](a, b, c)
+    tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
+    f["fadd2"](a, b, c)
+    tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
 
 
 @tvm.testing.requires_llvm
@@ -292,9 +341,13 @@ def test_llvm_condition():
     def check_llvm(n, offset):
         A = te.placeholder((n,), name="A")
         C = te.compute((n,), lambda i: tvm.tir.if_then_else(i >= offset, A[i], 0.0), name="C")
-        s = te.create_schedule(C.op)
+
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, C])
+        sch = tir.Schedule(mod)
+
         # build and invoke the kernel.
-        f = tvm.build(s, [A, C], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev)
@@ -312,9 +365,13 @@ def test_llvm_bool():
     def check_llvm(n):
         A = te.placeholder((n,), name="A", dtype="int32")
         C = te.compute((n,), lambda i: A[i].equal(1).astype("float"), name="C")
-        s = te.create_schedule(C.op)
+
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, C])
+        sch = tir.Schedule(mod)
+
         # build and invoke the kernel.
-        f = tvm.build(s, [A, C], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev)
@@ -334,9 +391,13 @@ def check_llvm(n):
         k = te.reduce_axis((0, n), name="k")
         C = te.compute((), lambda: te.sum(A[k] * scale(), axis=k), name="C")
         D = te.compute((), lambda: C() + 1)
-        s = te.create_schedule(D.op)
+
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, scale, D])
+        sch = tir.Schedule(mod)
+
         # build and invoke the kernel.
-        f = tvm.build(s, [A, scale, D], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev)
@@ -358,9 +419,13 @@ def check_llvm(n):
             k = te.reduce_axis((0, n), name="k")
             C = te.compute((), lambda: te.sum(A[k] * scale(), axis=k), name="C")
             D = te.compute((), lambda: C() + 1)
-            s = te.create_schedule(D.op)
+
+            # Convert to TIR and create schedule
+            mod = te.create_prim_func([A, scale, D])
+            sch = tir.Schedule(mod)
+
             # build and invoke the kernel.
-            f = tvm.build(s, [A, scale, D], "llvm")
+            f = tvm.build(sch.mod, target="llvm")
             dev = tvm.cpu(0)
             # launch the kernel.
             a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev)
@@ -378,10 +443,21 @@ def test_alignment():
     n = tvm.runtime.convert(1024)
     A = te.placeholder((n,), name="A")
     B = te.compute(A.shape, lambda i: A[i] * 3, name="B")
-    s = te.create_schedule(B.op)
-    bx, tx = s[B].split(B.op.axis[0], factor=8)
-    s[B].vectorize(tx)
-    f = tvm.build(s, [A, B], "llvm", name="test_alignment")
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, B]).with_attr("global_symbol", "test_alignment")
+    sch = tir.Schedule(mod)
+
+    # Get block and loop
+    block = sch.get_block("B")
+    loop = sch.get_loops(block)[0]
+
+    # Split and vectorize
+    _, tx = sch.split(loop, factors=[None, 8])
+    sch.vectorize(tx)
+
+    # Build; the function name comes from the "global_symbol" attribute set above
+    f = tvm.build(sch.mod, target="llvm")
 
     lines = f.get_source().split("\n")
 
@@ -452,8 +528,12 @@ def clipb(x):
             lambda i, j: (div(clipa(A[i]), clipb(B[j])), mod(clipa(A[i]), clipb(B[j]))),
         )
 
-        s = te.create_schedule([D.op, M.op])
-        f = tvm.build(s, [A, B, D, M], "llvm")
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, B, D, M])
+        sch = tir.Schedule(mod)
+
+        # Build from scheduled TIR
+        f = tvm.build(sch.mod, target="llvm")
 
         # Fill input arrays with values
         A_arr = tvm.nd.empty((end - start + 1,), dtype)
@@ -477,7 +557,7 @@ def _show_info():
             print("dtype: {}".format(dtype))
             print("dividend range: [{}, {}]".format(start, end))
             print("divisor range: [{}, {}]".format(dstart, dend))
-            lowered = tvm.lower(s, [A, B, D, M], simple_mode=True)
+            lowered = tvm.lower(sch.mod, simple_mode=True)
             print("Lowered code:")
             print(lowered)
 
@@ -557,8 +637,12 @@ def check_llvm_reciprocal(n):
         A = te.placeholder((n,), name="A")
         B = te.compute((n,), lambda i: te.div(1.0, (1e37 * A[i])), name="B")
 
-        s = te.create_schedule(B.op)
-        f = tvm.build(s, [A, B], "llvm")
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, B])
+        sch = tir.Schedule(mod)
+
+        # Build from scheduled TIR
+        f = tvm.build(sch.mod, target="llvm")
 
         a = tvm.nd.array(np.full((n,), 100, "float32"))
         b = tvm.nd.empty((n,), "float32")
@@ -573,8 +657,12 @@ def check_llvm_sigmoid(n):
         A = te.placeholder((n,), name="A")
         B = te.compute((n,), lambda i: te.sigmoid(A[i]), name="B")
 
-        s = te.create_schedule(B.op)
-        f = tvm.build(s, [A, B], "llvm")
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, B])
+        sch = tir.Schedule(mod)
+
+        # Build from scheduled TIR
+        f = tvm.build(sch.mod, target="llvm")
 
         a = tvm.nd.array(np.full((n,), -1000, "float32"))
         b = tvm.nd.empty((n,), "float32")
@@ -593,10 +681,19 @@ def test_dwarf_debug_information():
     A = te.placeholder((n,), name="A")
     B = te.placeholder((n,), name="B")
     C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=4)
-    s[C].parallel(xo)
-    s[C].vectorize(xi)
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, B, C])
+    sch = tir.Schedule(mod)
+
+    # Get block and loop
+    block = sch.get_block("C")
+    loop = sch.get_loops(block)[0]
+
+    # Split and parallelize
+    xo, xi = sch.split(loop, factors=[None, 4])
+    sch.parallel(xo)
+    sch.vectorize(xi)
 
     def check_llvm_object():
         if tvm.target.codegen.llvm_version_major() < 5:
@@ -604,9 +701,13 @@ def check_llvm_object():
         if tvm.target.codegen.llvm_version_major() > 6:
             return
         # build two functions
-        f2 = tvm.lower(s, [A, B, C], name="fadd1")
-        f1 = tvm.lower(s, [A, B, C], name="fadd2")
-        m = tvm.build([f1, f2], "llvm")
+        mod = tvm.IRModule(
+            {
+                "fadd1": sch.mod["main"].with_attr("global_symbol", "fadd1"),
+                "fadd2": sch.mod["main"].with_attr("global_symbol", "fadd2"),
+            }
+        )
+        m = tvm.build(mod, target="llvm")
         temp = utils.tempdir()
         o_path = temp.relpath("temp.o")
         m.save(o_path)
@@ -638,9 +739,13 @@ def check_llvm_ir():
         if tvm.target.codegen.llvm_version_major() > 6:
             return
         # build two functions
-        f2 = tvm.lower(s, [A, B, C], name="fadd1")
-        f1 = tvm.lower(s, [A, B, C], name="fadd2")
-        m = tvm.build([f1, f2], target="llvm -mtriple=aarch64-linux-gnu")
+        mod = tvm.IRModule(
+            {
+                "fadd1": sch.mod["main"].with_attr("global_symbol", "fadd1"),
+                "fadd2": sch.mod["main"].with_attr("global_symbol", "fadd2"),
+            }
+        )
+        m = tvm.build(mod, target="llvm -mtriple=aarch64-linux-gnu")
         ll = m.get_source("ll")
 
         # On non-Darwin OS, don't explicitly specify DWARF version.
@@ -650,7 +755,7 @@ def check_llvm_ir():
         assert re.search(r"""llvm.dbg.value""", ll)
 
         # Try Darwin, require DWARF-2
-        m = tvm.build([f1, f2], target="llvm -mtriple=x86_64-apple-darwin-macho")
+        m = tvm.build(mod, target="llvm -mtriple=x86_64-apple-darwin-macho")
         ll = m.get_source("ll")
         assert re.search(r"""i32 4, !"Dwarf Version", i32 2""", ll)
         assert re.search(r"""llvm.dbg.value""", ll)
@@ -664,7 +769,10 @@ def test_llvm_shuffle():
     a = te.placeholder((8,), "int32")
     b = te.placeholder((8,), "int32")
     c = te.compute((8,), lambda x: a[x] + b[7 - x])
-    sch = te.create_schedule(c.op)
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([a, b, c])
+    sch = tir.Schedule(mod)
 
     def my_vectorize():
         def vectorizer(op):
@@ -685,8 +793,8 @@ def _transform(f, *_):
         return tvm.tir.transform.prim_func_pass(_transform, opt_level=0, name="my_vectorize")
 
     with tvm.transform.PassContext(config={"tir.add_lower_pass": [(1, my_vectorize())]}):
-        ir = tvm.lower(sch, [a, b, c], simple_mode=True)
-        module = tvm.build(sch, [a, b, c])
+        ir = tvm.lower(sch.mod, simple_mode=True)
+        module = tvm.build(sch.mod)
         a_ = tvm.nd.array(np.arange(1, 9, dtype="int32"))
         b_ = tvm.nd.array(np.arange(8, 0, -1, dtype="int32"))
         c_ = tvm.nd.array(np.zeros((8,), dtype="int32"))
@@ -727,12 +835,21 @@ def dotest(do_vectorize):
         np.random.seed(122)
         A = te.placeholder((32,), dtype="bfloat16")
         B = te.placeholder((32,), dtype="bfloat16")
-        d = te.compute((32,), lambda x: A[x] + B[x])
-        sch = te.create_schedule(d.op)
+        D = te.compute((32,), lambda x: A[x] + B[x], name="D")
+
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, B, D])
+        sch = tir.Schedule(mod)
+
+        # Get block and loop
+        block = sch.get_block("D")
+        loop = sch.get_loops(block)[0]
+
+        # Apply vectorization if requested
         if do_vectorize:
-            sch[d].vectorize(d.op.axis[0])
+            sch.vectorize(loop)
 
-        module = tvm.build(sch, [A, B, d])
+        module = tvm.build(sch.mod, target="llvm")
         npa = np.random.rand(32).astype("float32")
         npb = np.random.rand(32).astype("float32")
         va = np_bf16_cast_and_cast_back(npa)
@@ -762,72 +879,6 @@ def test_llvm_crt_static_lib():
     module.save("test.o")
 
 
-def atomic_add(x, y):
-    return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y)
-
-
-@tvm.testing.requires_llvm
-def test_llvm_lower_atomic():
-    def do_atomic_add(A):
-        ib = tvm.tir.ir_builder.create()
-        n = A.shape[0]
-        atomic_add_return = ib.allocate(A.dtype, (1,), name="atomic_add_return", scope="local")
-        one = tvm.tir.const(1, A.dtype)
-        A_ptr = ib.buffer_ptr(A)
-        with ib.for_range(0, n, name="i", kind="parallel") as i:
-            atomic_add_return[0] = atomic_add(
-                tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one
-            )
-        return ib.get()
-
-    A = tvm.te.placeholder((100,), dtype="int32", name="A")
-    C = tvm.te.extern((100,), [A], lambda ins, _: do_atomic_add(ins[0]), name="C", dtype="int32")
-    s = tvm.te.create_schedule(C.op)
-    # This does not work because of pointer type mismatch
-    # TVMError: LLVM module verification failed with the following errors:
-    # Argument value type does not match pointer operand type!
-    # %21 = atomicrmw add i8* %7, i32 1 monotonic
-    # i8
-    # f = tvm.build(s, [A], target="llvm")
-
-
-@tvm.testing.requires_llvm
-@tvm.testing.requires_gpu
-def test_llvm_gpu_lower_atomic():
-    def do_atomic_add(A):
-        ib = tvm.tir.ir_builder.create()
-        n = A.shape[0]
-        atomic_add_return = ib.allocate(A.dtype, (1,), name="atomic_add_return", scope="local")
-        one = tvm.tir.const(1, A.dtype)
-        A_ptr = ib.buffer_ptr(A)
-        nthread_tx = 64
-        with ib.new_scope():
-            nthread_bx = (n + nthread_tx - 1) // nthread_tx
-            tx = te.thread_axis("threadIdx.x")
-            bx = te.thread_axis("blockIdx.x")
-            ib.scope_attr(tx, "thread_extent", nthread_tx)
-            ib.scope_attr(bx, "thread_extent", nthread_bx)
-            atomic_add_return[0] = atomic_add(
-                tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one
-            )
-        return ib.get()
-
-    size = 1024
-    # CI uses LLVM 8, which does not support float atomic
-    for dtype in ["int32"]:
-        A = tvm.te.placeholder((size,), dtype=dtype, name="A")
-        C = tvm.te.extern((size,), [A], lambda ins, _: do_atomic_add(ins[0]), dtype=dtype)
-        s = tvm.te.create_schedule(C.op)
-        f = tvm.build(s, [A], target="nvptx")
-
-        dev = tvm.cuda()
-        a = tvm.nd.array(np.zeros((size,)).astype(A.dtype), dev)
-        f(a)
-        ref = np.zeros((size,)).astype(A.dtype)
-        ref[0] = size
-        tvm.testing.assert_allclose(a.numpy(), ref, rtol=1e-5)
-
-
 @tvm.testing.requires_llvm
 def test_llvm_order_functions():
     """Check that functions in the LLVM module are ordered alphabetically."""
@@ -850,7 +901,7 @@ def make_call_extern(caller, callee):
         "Kirby": make_call_extern("Kirby", "Fred"),
     }
     mod = tvm.IRModule(functions=functions)
-    ir_text = tvm.build(mod, None, target="llvm").get_source("ll")
+    ir_text = tvm.build(mod, target="llvm").get_source("ll")
     # Skip functions whose names start with _.
     matches = re.findall(r"^define[^@]*@([a-zA-Z][a-zA-Z0-9_]*)", ir_text, re.MULTILINE)
     assert matches == sorted(matches)
@@ -879,13 +930,14 @@ def check_llvm(use_file):
         temp = utils.tempdir()
         ll_path = temp.relpath("temp.ll")
         ll_code = clang.create_llvm(cc_code, output=ll_path)
-        s = te.create_schedule(B.op)
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+
         if use_file:
-            s[B].pragma(s[B].op.axis[0], "import_llvm", ll_path)
+            sch.annotate(sch.get_loops("B")[0], "pragma_import_llvm", ll_path)
         else:
-            s[B].pragma(s[B].op.axis[0], "import_llvm", ll_code)
+            sch.annotate(sch.get_loops("B")[0], "pragma_import_llvm", ll_code)
         # BUILD and invoke the kernel.
-        f = tvm.build(s, [A, B], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
@@ -910,7 +962,7 @@ def test_llvm_scalar_concat():
     # This will crash in LLVM codegen if CodeGenLLVM::CreateVecConcat doesn't convert
     # scalars to single-lane LLVM vectors.
     with tvm.transform.PassContext(config={"tir.disable_assert": True}):
-        m = tvm.build(mod, [x, y, z], target="llvm")
+        m = tvm.build(mod, target="llvm")
 
 
 @tvm.testing.requires_llvm
@@ -925,7 +977,7 @@ def threadpool_nested_parallel_loop(
                 B[i, j] = A[i, j] * 2.0
 
     with pytest.raises(tvm.TVMError) as e:
-        tvm.build({"llvm": tvm.IRModule.from_expr(threadpool_nested_parallel_loop)})
+        tvm.build(tvm.IRModule.from_expr(threadpool_nested_parallel_loop), target="llvm")
     msg = str(e)
     assert msg.find("Nested parallel loop is not supported") != -1
 
@@ -939,13 +991,16 @@ def test_llvm_target_attributes():
     A = te.placeholder((n,), name="A", dtype="float32")
     B = te.compute((n,), lambda i: A[i], name="B")
     C = te.compute((n,), lambda i: B[i] + tvm.tir.const(1, A.dtype), name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], nparts=2)
-    s[C].parallel(xo)
+
+    sch = tvm.tir.Schedule(
+        te.create_prim_func([A, B, C, n]).with_attr("global_symbol", "test_func")
+    )
+    xo, xi = sch.split(sch.get_loops("C")[0], factors=[2, None])
+    sch.parallel(xo)
 
     target_llvm = "llvm -mtriple=x86_64-linux-gnu -mcpu=skylake -mattr=+avx512f"
     target = tvm.target.Target(target_llvm, host=target_llvm)
-    module = tvm.build(s, [A, B, C, n], target=target, name="test_func")
+    module = tvm.build(sch.mod, target=target)
 
     llvm_ir = module.get_source()
     llvm_ir_lines = llvm_ir.split("\n")
@@ -996,7 +1051,7 @@ def tir_assume_func(A: T.Buffer((4, 4), "int32"), B: T.Buffer((14,), "int32")):
     mod = tvm.IRModule.from_expr(tir_assume_func)
     inp = te.placeholder((4, 4), name="A", dtype="int32")
     out = te.placeholder((14,), name="B", dtype="int32")
-    m = tvm.build(mod, [inp, out], target="llvm")
+    m = tvm.build(mod, target="llvm")
 
 
 @tvm.testing.requires_llvm
diff --git a/tests/python/codegen/test_target_codegen_opencl.py b/tests/python/codegen/test_target_codegen_opencl.py
index 079553665ffb..90af959472c5 100644
--- a/tests/python/codegen/test_target_codegen_opencl.py
+++ b/tests/python/codegen/test_target_codegen_opencl.py
@@ -135,9 +135,12 @@ def test_opencl_erf():
     def check_erf(dev, n, dtype):
         A = te.placeholder((n,), name="A", dtype=dtype)
         C = te.compute(A.shape, lambda *i: te.erf(A(*i)), name="C")
-        s = te.create_schedule(C.op)
-        s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x"))
-        fun = tvm.build(s, [A, C], target)
+        func = te.create_prim_func([A, C])
+        sch = tvm.tir.Schedule(func)
+        (x,) = sch.get_loops(sch.get_block("C"))
+        sch.bind(x, "threadIdx.x")
+        fun = tvm.build(sch.mod, target=target)
+
         source_str = fun.imported_modules[0].get_source()
         matches = re.findall("erf", source_str)
         error_matches = re.findall("erff", source_str)
diff --git a/tests/python/codegen/test_target_codegen_rocm.py b/tests/python/codegen/test_target_codegen_rocm.py
index a0990c330f03..4c7592034ef0 100644
--- a/tests/python/codegen/test_target_codegen_rocm.py
+++ b/tests/python/codegen/test_target_codegen_rocm.py
@@ -18,41 +18,8 @@
 import tvm.testing
 from tvm import te
 import numpy as np
-import unittest
 from tvm.script import tir as T
 
-tx = te.thread_axis("threadIdx.x")
-ty = te.thread_axis("threadIdx.y")
-bx = te.thread_axis("blockIdx.x")
-by = te.thread_axis("blockIdx.y")
-
-
-@tvm.testing.requires_rocm
-def test_rocm_cross_thread_reduction():
-    # based on the reduction tutorial
-    n = te.size_var("n")
-    m = te.size_var("m")
-    A = te.placeholder((n, m), name="A")
-    k = te.reduce_axis((0, m), "k")
-    B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
-    s = te.create_schedule(B.op)
-    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
-    BF = s.rfactor(B, ki)
-    xo, xi = s[B].split(s[B].op.axis[0], factor=32)
-    s[B].bind(xo, bx)
-    s[B].bind(xi, ty)
-    s[B].bind(s[B].op.reduce_axis[0], tx)
-    s[BF].compute_at(s[B], s[B].op.reduce_axis[0])
-    s[B].set_store_predicate(tx.var.equal(0))
-    frocm = tvm.build(s, [A, B], "rocm")
-
-    nn = 128
-    dev = tvm.rocm(0)
-    a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), dev)
-    b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), dev)
-    frocm(a, b)
-    tvm.testing.assert_allclose(b.numpy(), np.sum(a.numpy(), axis=1), rtol=1e-4)
-
 
 @tvm.testing.requires_rocm
 def test_rocm_inf_nan():
@@ -60,9 +27,11 @@ def check_inf_nan(dev, n, value, dtype):
         A = te.placeholder((n,), name="A", dtype=dtype)
         inf_value = tvm.tir.const(value, dtype=dtype)
         C = te.compute((n,), lambda i: inf_value, name="C")
-        s = te.create_schedule(C.op)
-        s[C].bind(s[C].op.axis[0], tx)
-        fun = tvm.build(s, [A, C], "rocm")
+        sch = tvm.tir.Schedule(te.create_prim_func([A, C]))
+        xo, xi = sch.split(sch.get_loops("C")[0], factors=[None, 128])
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xi, "threadIdx.x")
+        fun = tvm.build(sch.mod, "rocm")
         a = tvm.nd.empty((n,), A.dtype, dev)
         c = tvm.nd.empty((n,), A.dtype, dev)
         # Only need to test compiling here
@@ -78,19 +47,6 @@ def check_inf_nan(dev, n, value, dtype):
     check_inf_nan(dev, 1, float("nan"), "float64")
 
 
-@tvm.testing.requires_rocm
-def test_rocm_reduction_binding():
-    k = te.reduce_axis((0, 32), "k")
-    A = te.placeholder((96, 32), name="A")
-    B = te.compute((96,), lambda m: te.sum(A[m, k], axis=k), name="B")
-    s = te.create_schedule(B.op)
-
-    s[B].reorder(B.op.reduce_axis[0], B.op.axis[0])
-
-    mo, _ = s[B].split(B.op.axis[0], 32)
-    s[B].bind(mo, bx)
-
-
 @tvm.testing.requires_rocm
 def test_rocm_copy():
     def check_rocm(dtype, n):
@@ -116,11 +72,12 @@ def test_rocm_vectorize_add():
     def check_rocm(dtype, n, lanes):
         A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes))
         B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B")
-        s = te.create_schedule(B.op)
-        xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
-        s[B].bind(xo, bx)
-        s[B].bind(xi, tx)
-        fun = tvm.build(s, [A, B], "rocm")
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, 4])
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xi, "threadIdx.x")
+        fun = tvm.build(sch.mod, target="rocm")
+
         dev = tvm.rocm(0)
         a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes)))
         c = tvm.nd.empty((n,), B.dtype, dev)
@@ -179,13 +136,3 @@ def func(
     b = tvm.nd.array(np.zeros((4,)).astype("float32"), dev)
     mod(a, b)
     tvm.testing.assert_allclose(b.numpy(), np.exp2(a.numpy()))
-
-
-if __name__ == "__main__":
-    test_rocm_cross_thread_reduction()
-    test_rocm_inf_nan()
-    test_rocm_reduction_binding()
-    test_rocm_copy()
-    test_rocm_vectorize_add()
-    test_rocm_warp_shuffle()
-    test_rocm_vectorized_exp()
diff --git a/tests/python/codegen/test_target_codegen_vulkan.py b/tests/python/codegen/test_target_codegen_vulkan.py
index 9d00f047cb69..0e1aa1a0403b 100644
--- a/tests/python/codegen/test_target_codegen_vulkan.py
+++ b/tests/python/codegen/test_target_codegen_vulkan.py
@@ -26,7 +26,7 @@
 
 import tvm
 import tvm.testing
-from tvm import te
+from tvm import te, tir
 from tvm.topi.math import cast
 from tvm.script import tir as T, ir as I
 from tvm.tir import TensorIntrin, IntImm, Cast, Schedule
@@ -60,9 +60,10 @@
         ]
     )
 )
-def test_vector_comparison(target, dtype):
-    n = (1024,)
-    A = te.placeholder(n, dtype=dtype, name="A")
+def test_vector_comparison(target, dev, dtype):
+    target = tvm.target.Target(target)
+    n = 1024
+    A = te.placeholder((n,), dtype=dtype, name="A")
     B = te.compute(
         A.shape,
         lambda i: tvm.tir.Select(
@@ -70,14 +71,18 @@ def test_vector_comparison(target, dtype):
         ),
         name="B",
     )
-    s = te.create_schedule(B.op)
 
-    (bx, tx) = s[B].split(s[B].op.axis[0], factor=128)
-    (tx, vx) = s[B].split(tx, factor=4)
-    s[B].bind(bx, te.thread_axis("blockIdx.x"))
-    s[B].bind(tx, te.thread_axis("threadIdx.x"))
-    s[B].vectorize(vx)
-    f = tvm.build(s, [A, B], target)
+    # Create IRModule
+    mod = tvm.IRModule.from_expr(te.create_prim_func([A, B]))
+    sch = tir.Schedule(mod)
+    (bx, tx) = sch.split(sch.get_loops("B")[0], factors=[None, 128])
+    (tx, vx) = sch.split(tx, factors=[None, 4])
+    sch.bind(bx, "blockIdx.x")
+    sch.bind(tx, "threadIdx.x")
+    sch.vectorize(vx)
+
+    # Build
+    f = tvm.build(sch.mod, target=target)
 
     # Verify we generate the boolx4 type declaration and the OpSelect
     # v4{float,half,int} instruction
@@ -102,133 +107,48 @@ def test_array_copy(dev, dtype, fuzz_seed):
 
 @tvm.testing.exclude_targets("llvm")
 def test_array_vectorize_add(target, dev, dtype):
+    target = tvm.target.Target(target)
     arr_size = 64
     lanes = 2
-    if "opencl" in target and dtype == "float16":
-        pytest.xfail("Opencl target does not support float16")
 
-    num_thread = 8
+    if "opencl" in str(target) and dtype == "float16":
+        pytest.xfail("Opencl target does not support float16")
 
     A = te.placeholder((arr_size,), name="A", dtype="%sx%d" % (dtype, lanes))
-    B = te.compute((arr_size,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B")
-    s = te.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
-    s[B].bind(xo, te.thread_axis("blockIdx.x"))
-    s[B].bind(xi, te.thread_axis("threadIdx.x"))
-    fun = tvm.build(s, [A, B], target)
+    B = te.compute(A.shape, lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B")
+
+    sch = tir.Schedule(te.create_prim_func([A, B]))
+    xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, 4])
+    sch.bind(xo, "blockIdx.x")
+    sch.bind(xi, "threadIdx.x")
+    f = tvm.build(sch.mod, target=target)
+
     a = tvm.nd.empty((arr_size,), A.dtype, dev).copyfrom(np.random.uniform(size=(arr_size, lanes)))
     c = tvm.nd.empty((arr_size,), B.dtype, dev)
-    fun(a, c)
+    f(a, c)
     tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1)
 
 
-@tvm.testing.parametrize_targets("vulkan")
-@pytest.mark.skip("Flaky, https://github.com/apache/tvm/issues/10779")
-def test_vulkan_stress(target, dev):
-    """
-    Launch a randomized test with multiple kernels per stream, multiple uses of
-    kernels per stream, over multiple threads.
-    """
-
-    n = 1024
-    num_thread = 64
-
-    def run_stress():
-        def worker():
-            A = te.placeholder((n,), name="A", dtype="float32")
-            B = te.placeholder((n,), name="B", dtype="float32")
-            functions = [
-                (
-                    lambda: te.compute((n,), lambda i: 2 * A[i] + 3 * B[i]),
-                    lambda a, b: 2 * a + 3 * b,
-                ),
-                (lambda: te.compute((n,), lambda i: A[i] + B[i]), lambda a, b: a + b),
-                (lambda: te.compute((n,), lambda i: A[i] + 2 * B[i]), lambda a, b: a + 2 * b),
-            ]
-
-            def build_f(f_ref):
-                (C_f, ref) = f_ref
-                C = C_f()
-                s = te.create_schedule(C.op)
-                xo, xi = s[C].split(C.op.axis[0], factor=num_thread)
-                s[C].bind(xo, te.thread_axis("blockIdx.x"))
-                s[C].bind(xi, te.thread_axis("threadIdx.x"))
-                fun = tvm.build(s, [A, B, C], target)
-                return (fun, ref)
-
-            fs = [
-                build_f(random.choice(functions)) for _ in range(np.random.randint(low=1, high=10))
-            ]
-            a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n,)))
-            b = tvm.nd.empty((n,), B.dtype, dev).copyfrom(np.random.uniform(size=(n,)))
-            cs = [tvm.nd.empty((n,), A.dtype, dev) for _ in fs]
-            for (f, _), c in zip(fs, cs):
-                f(a, b, c)
-
-            for (_, ref), c in zip(fs, cs):
-                tvm.testing.assert_allclose(c.numpy(), ref(a.numpy(), b.numpy()))
-
-        ts = [threading.Thread(target=worker) for _ in range(np.random.randint(1, 10))]
-        for t in ts:
-            t.start()
-        for t in ts:
-            t.join()
-
-    run_stress()
-
-
 @tvm.testing.exclude_targets("llvm")
 def test_vulkan_bool_load(target, dev):
-    arr_size = 1024
-
     target = tvm.target.Target(target)
-    if target.kind.name == "vulkan":
-        supports_int8_buffer = target.attrs.get("supports_int8", False) and target.attrs.get(
-            "supports_8bit_buffer", False
-        )
-        if not supports_int8_buffer:
-            pytest.xfail(
-                "Vulkan target does not support int8 buffer access, used to transfer booleans"
-            )
-
-    def do_copy(A, B, n):
-        ib = tvm.tir.ir_builder.create()
-        A = ib.buffer_ptr(A)
-        B = ib.buffer_ptr(B)
-
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-
-        max_threads = 32
-        ib.scope_attr(bx, "thread_extent", tvm.tir.indexdiv(n + max_threads - 1, max_threads))
-        ib.scope_attr(tx, "thread_extent", max_threads)
-        tid = bx * max_threads + tx
-
-        with ib.if_scope(tid < n):
-            B[tid] = cast(A[tid], "int32")
-
-        return ib.get()
-
+    arr_size = 1024
     A = te.placeholder((arr_size,), name="A", dtype="bool")
-    B = te.placeholder((arr_size,), name="B", dtype="int32")
+    B = te.compute(A.shape, lambda i: A[i].astype("int32"), name="B")
 
-    B = te.extern(
-        A.shape,
-        [A],
-        lambda ins, outs: do_copy(ins[0], outs[0], arr_size),
-        name="bool_copy_ir",
-        dtype="int32",
-    )
-    s = te.create_schedule(B.op)
+    sch = tir.Schedule(te.create_prim_func([A, B]))
+    xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, 128])
+    sch.bind(xo, "blockIdx.x")
+    sch.bind(xi, "threadIdx.x")
 
-    with tvm.transform.PassContext(opt_level=3):
-        func = tvm.build(s, [A, B], target)
+    # Build
+    f = tvm.build(sch.mod, target=target)
 
     a_np = np.random.uniform(size=arr_size) > 0.5
     b_np = np.zeros((arr_size,), dtype="int32")
     a = tvm.nd.array(a_np, dev)
     b = tvm.nd.array(b_np, dev)
-    func(a, b)
+    f(a, b)
     ref = a_np.astype(np.int32)
     tvm.testing.assert_allclose(b.numpy(), ref)
 
@@ -270,11 +190,11 @@ def test_vulkan_constant_passing(target, dev, vulkan_parameter_impl, vulkan_para
     A = te.placeholder((n,), name="A", dtype=dtype)
     B = te.compute(A.shape, lambda i: scalar_sum + A[i], name="B")
 
-    s = te.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=64)
-    s[B].bind(xo, te.thread_axis("blockIdx.x"))
-    s[B].bind(xi, te.thread_axis("threadIdx.x"))
-    f_add = tvm.build(s, scalars + [A, B], target)
+    sch = tvm.tir.Schedule(te.create_prim_func(scalars + [A, B]))
+    xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, 64])
+    sch.bind(xo, "blockIdx.x")
+    sch.bind(xi, "threadIdx.x")
+    f_add = tvm.build(sch.mod, target=target)
 
     n = 1024
     scalars = np.array([1 for _ in scalars]).astype(dtype)
@@ -287,6 +207,9 @@ def test_vulkan_constant_passing(target, dev, vulkan_parameter_impl, vulkan_para
 
 def test_vulkan_while_if(target, dev):
     target = tvm.target.Target(target)
+    n = 1
+    dtype = "int32"
+    A = te.placeholder((n,), name="A", dtype=dtype)
 
     def do_compute(A, B, n):
         ib = tvm.tir.ir_builder.create()
@@ -300,9 +223,6 @@ def do_compute(A, B, n):
         iterations[0] = 0
         B[0] = 0
 
-        # WhileNode's condition is re-evaluated every loop.  The
-        # if_then_else block introduces additional labels/blocks that
-        # must be kept separate from the WhileNode's block.
         loop_condition = iterations[0] < tvm.tir.if_then_else(A[0] > 0, 10, 20)
         with ib.while_loop(loop_condition):
             iterations[0] += 1
@@ -310,21 +230,19 @@ def do_compute(A, B, n):
 
         return ib.get()
 
-    n = 1
-    dtype = "int32"
-    A = te.placeholder((n,), name="A", dtype=dtype)
-
     B = te.extern(
         A.shape,
         [A],
         lambda ins, outs: do_compute(ins[0], outs[0], n),
         dtype=dtype,
     )
-    s = te.create_schedule(B.op)
 
-    # Point of failure would be here, at tvm.build.
-    with tvm.transform.PassContext(opt_level=3):
-        func = tvm.build(s, [A, B], target)
+    # Create IRModule
+    mod = tvm.IRModule.from_expr(te.create_prim_func([A, B]))
+    sch = tir.Schedule(mod)
+
+    # Build
+    func = tvm.build(sch.mod, target=target)
 
     a = tvm.nd.array(np.array([5], dtype=A.dtype), dev)
     b = tvm.nd.array(np.zeros(n, dtype=A.dtype), dev)
@@ -339,52 +257,40 @@ def do_compute(A, B, n):
 
 @tvm.testing.exclude_targets("llvm")
 def test_vulkan_local_threadidx(target, dev):
-    # To access the thread index, the vulkan runtime accesses a global
-    # array of thread indices, storing the result in a local variable.
-    # In CUDA, these are the built-in threadIdx.x variables, which are
-    # globally accessible.  In vulkan, these local variables must be
-    # defined inside a function, but are hoisted up to the function
-    # header to mimic the global CUDA semantics.  Before this
-    # hoisting, this test could trigger spvValidate errors for
-    # potentially undeclared variables.
+    target = tvm.target.Target(target)
+    n = 32
+    A = te.placeholder((n,), name="A", dtype="int32")
 
     def do_compute(A, B, n):
         ib = tvm.tir.ir_builder.create()
         A = ib.buffer_ptr(A)
         B = ib.buffer_ptr(B)
 
-        # One single declaration of te.thread_axis.
         tx = te.thread_axis("threadIdx.x")
 
         with ib.for_range(0, 1):
-            # Used inside a for-loop scope, defines local thread_id
-            # variable.
             ib.scope_attr(tx, "thread_extent", 16)
             B[tx + 0] = A[tx + 0]
 
         with ib.for_range(0, 1):
-            # Used in next scope.  If local variable defined at point
-            # of use instead of function header, will fail spvValidate
-            # for access of out-of-scope local variable.
             ib.scope_attr(tx, "thread_extent", 16)
             B[tx + 16] = A[tx + 16]
 
         return ib.get()
 
-    n = te.var("n")
-    A = te.placeholder((n,), name="A", dtype="int32")
-    B = te.placeholder((n,), name="B", dtype="int32")
-
     B = te.extern(
         A.shape,
         [A],
         lambda ins, outs: do_compute(ins[0], outs[0], n),
         dtype="int32",
     )
-    s = te.create_schedule(B.op)
 
-    # Expected failure occurs at build step.
-    func = tvm.build(s, [A, B], target)
+    # Create IRModule
+    mod = tvm.IRModule.from_expr(te.create_prim_func([A, B]))
+    sch = tir.Schedule(mod)
+
+    # Build
+    func = tvm.build(sch.mod, target=target)
 
     n = 32
     a_np = np.arange(n).astype(dtype=A.dtype)
@@ -473,9 +379,8 @@ def do_compute(ins, outs):
             return ib.get()
 
         B = te.extern(A.shape, [A, R], do_compute, dtype="int32")
-        s = te.create_schedule(B.op)
 
-        return tvm.lower(s, [A, R, B])
+        return tvm.IRModule.from_expr(te.create_prim_func([A, R, B]))
 
     def test_ramp_broadcast_index(self, target, dev, mod, ref_data):
         f = tvm.build(mod, target=target)
@@ -488,36 +393,6 @@ def test_ramp_broadcast_index(self, target, dev, mod, ref_data):
         tvm.testing.assert_allclose(b.numpy(), b_np)
 
 
-@tvm.testing.parametrize_targets("vulkan -max_shared_memory_per_block=16384")
-def test_shared_mem_alloc(target, dev):
-    alloc_nbytes = 16384 * 2
-
-    def do_compute(ins, outs):
-        ib = tvm.tir.ir_builder.create()
-        out = ib.buffer_ptr(outs[0])
-
-        ib.scope_attr(te.thread_axis("blockIdx.x"), "thread_extent", 0)
-
-        array = ib.allocate("int32", (alloc_nbytes,), name="array", scope="shared")
-        array[0] = 0
-        out[0] = array[0]
-
-        return ib.get()
-
-    Out = te.extern(
-        shape=(1,),
-        inputs=[],
-        fcompute=do_compute,
-        dtype="int32",
-    )
-    s = te.create_schedule(Out.op)
-
-    # Codegen should raise error when allocating more memory than the
-    # target supports.
-    with pytest.raises(tvm.TVMError):
-        tvm.build(s, [Out], target)
-
-
 def test_negative_operand_divmod(target, dev):
     """Test handling of negative offsets to floormod/floordiv
 
diff --git a/tests/python/codegen/test_target_codegen_x86.py b/tests/python/codegen/test_target_codegen_x86.py
index a276940050b1..f433964f7f5d 100644
--- a/tests/python/codegen/test_target_codegen_x86.py
+++ b/tests/python/codegen/test_target_codegen_x86.py
@@ -38,9 +38,9 @@ def fp16_to_fp32(target, width, match=None, not_match=None):
         n = tvm.runtime.convert(elements)
         A = te.placeholder((n, width), dtype="float16", name="A")
         B = te.compute(A.shape, lambda *i: A(*i).astype("float32"), name="B")
-        s = te.create_schedule(B.op)
-        s[B].vectorize(s[B].op.axis[1])
-        f = tvm.build(s, [A, B], target)
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        sch.vectorize(sch.get_loops("B")[1])
+        f = tvm.build(sch.mod, target=target)
 
         assembly = f.get_source("asm").splitlines()
         if match:
diff --git a/tests/python/contrib/test_cblas.py b/tests/python/contrib/test_cblas.py
index 3c90aefeb67a..b8851e685b13 100644
--- a/tests/python/contrib/test_cblas.py
+++ b/tests/python/contrib/test_cblas.py
@@ -39,7 +39,6 @@ def verify_matmul_add(
     final_result = te.compute(
         matmul_result.shape, lambda i, j: matmul_result[i, j] + bias, name="final_result"
     )
-    s = te.create_schedule(final_result.op)
 
     def get_numpy(a, b, matrix_bias, transa, transb):
         if transa:
@@ -64,7 +63,12 @@ def verify(target="llvm"):
             return
         dev = tvm.cpu(0)
         name = "test_matmul_add"
-        f = tvm.build(s, [input1_data, input2_data, final_result, bias], target, name=name)
+        f = tvm.build(
+            te.create_prim_func([input1_data, input2_data, final_result, bias]).with_attr(
+                "global_symbol", name
+            ),
+            target=target,
+        )
         if target == "c":
             f = compiling(f, name)
         matrix_input1 = tvm.nd.array(np.random.uniform(size=ashape).astype(input1_data.dtype), dev)
@@ -126,7 +130,6 @@ def verify_quantized_matmul_add(matrix_m, matrix_l, matrix_n, transa=False, tran
     final_result = te.compute(
         matmul_result.shape, lambda i, j: matmul_result[i, j] + bias, name="final_result"
     )
-    s = te.create_schedule(final_result.op)
 
     def get_numpy(a, b, matrix_bias, transa, transb):
         if transa:
@@ -143,7 +146,9 @@ def verify(target="llvm"):
             print("skip because extern function is not available")
             return
         dev = tvm.cpu(0)
-        f = tvm.build(s, [input1_data, input2_data, final_result, bias], target)
+        f = tvm.build(
+            te.create_prim_func([input1_data, input2_data, final_result, bias]), target=target
+        )
         matrix_input1 = tvm.nd.array(
             np.random.randint(low=0, high=50, size=ashape).astype(input1_data.dtype), dev
         )
@@ -201,7 +206,6 @@ def verify_batch_matmul(
     final_result = te.compute(
         matmul_result.shape, lambda k, i, j: matmul_result[k, i, j], name="final_result"
     )
-    s = te.create_schedule(final_result.op)
 
     def get_numpy(a, b, transa, transb):
         if transa:
@@ -226,7 +230,13 @@ def verify(target="llvm"):
             return
         dev = tvm.cpu(0)
         name = "test_batch_matmul"
-        f = tvm.build(s, [input1_data, input2_data, final_result], target, name=name)
+        # Name the PrimFunc `name`: the "c" path below passes that same name to compiling().
+        f = tvm.build(
+            te.create_prim_func([input1_data, input2_data, final_result]).with_attr(
+                "global_symbol", name
+            ),
+            target=target,
+        )
         if target == "c":
             f = compiling(f, name)
         matrix_input1 = tvm.nd.array(np.random.uniform(size=ashape).astype(input1_data.dtype), dev)
diff --git a/tests/python/contrib/test_dlpack.py b/tests/python/contrib/test_dlpack.py
index 4e65f79c518e..70277cb0ca0a 100644
--- a/tests/python/contrib/test_dlpack.py
+++ b/tests/python/contrib/test_dlpack.py
@@ -49,10 +49,9 @@ def verify_torch_dlpack():
 
         k = te.reduce_axis((0, n), name="k")
         ZZ = te.compute((n, n), lambda i, j: te.sum(XX[i, k] * YY[k, j], axis=k))
-        s = te.create_schedule(ZZ.op)
         # No need to speficy target_host if it's llvm
         # Otherwise you will need to specify the target and target_host
-        f = tvm.build(s, [XX, YY, ZZ], name="f")
+        f = tvm.build(te.create_prim_func([XX, YY, ZZ]))
 
         f_pytorch = to_pytorch_func(f)
         zz2 = torch.empty(137, 137)
diff --git a/tests/python/contrib/test_gemm_acc16.py b/tests/python/contrib/test_gemm_acc16.py
deleted file mode 100644
index 18e15098a07e..000000000000
--- a/tests/python/contrib/test_gemm_acc16.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition
-import tvm
-from tvm import te
-import numpy as np
-from tvm.topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int16
-
-
-def benchmark_fc_int8_acc16():
-    m = 128
-    n = 128
-    k = 128
-
-    X = te.placeholder((m, k), name="X", dtype="uint8")
-    W = te.placeholder((n, k), name="W", dtype="int8")
-
-    peak = 512 / 16 * 2 * 2 * 2
-    gops_per_mm = 2 * n * m * k
-    print("Peak {} Gops/s \n".format(peak))
-
-    def verify(target="llvm -mcpu=skylake-avx512"):
-        if not tvm.runtime.enabled(target):
-            print("skip because %s is not enabled..." % target)
-            return
-
-        dev = tvm.device(target, 0)
-        X = te.placeholder((m, k), name="X", dtype="uint8")
-        W = te.placeholder((n, k), name="W", dtype="int8")
-        pc = dot_16x1x16_uint8_int8_int16()
-        ak = te.reduce_axis((0, k), name="k")
-
-        packedW = te.placeholder((n // 128, 128 * (k // 2), 2), name="packedW", dtype="int8")
-        t_fc = te.compute(
-            (m, n),
-            lambda i, j: te.sum(
-                X[i, ak].astype("int16")
-                * packedW[j // 128, (ak // 2) * 128 + j % 128, ak % 2].astype("int16"),
-                axis=ak,
-            ),
-            name="F",
-        )
-
-        t_sch = te.create_schedule(t_fc.op)
-        a_x, a_y = t_fc.op.axis
-        (a_k,) = t_fc.op.reduce_axis
-
-        a_yo, a_yi = t_sch[t_fc].split(a_y, factor=128)
-        a_ko, a_ki = t_sch[t_fc].split(a_k, factor=2)
-
-        a_xo, a_xi = t_sch[t_fc].split(a_x, factor=128)
-        a_koo, a_koi = t_sch[t_fc].split(a_ko, factor=32)
-        t_sch[t_fc].reorder(a_yo, a_xo, a_koo, a_xi, a_koi, a_yi, a_ki)
-
-        t_sch[t_fc].tensorize(a_yi, pc)
-        # print(tvm.lower(t_sch, [X, packedW, t_fc], simple_mode=True))
-        t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic")
-        t_evaluator = t_func.time_evaluator(t_func.entry_name, dev, number=10)
-
-        # generate the plain data
-        a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8")
-        b_ = np.random.uniform(1, 10, size=(n, k)).astype("int8")
-
-        packW = np.random.uniform(1, 10, size=(n // 128, 128 * (k // 2), 2)).astype("int8")
-        # This occurs in pre_compute stage
-        for r_idx in range(n // 128):
-            for s_idx in range(128 * (k // 2)):
-                for t_idx in range(2):
-                    packW[r_idx][s_idx][t_idx] = b_[r_idx * 128 + s_idx % 128][
-                        s_idx // 128 * 2 + t_idx
-                    ]
-
-        x = tvm.nd.array(a_, dev)
-        w = tvm.nd.array(packW, dev)
-        y = tvm.nd.array(np.zeros((m, n), dtype="int16"), dev)
-
-        result = t_evaluator(x, w, y)
-        gops_per_sec = gops_per_mm / result.mean / 1e9
-        tvm.testing.assert_allclose(y.numpy(), np.dot(a_, b_.T), rtol=1e-5)
-        print(
-            "Tensorization: running time: {:.3f} ms, {:.2f} Gops/s, effiency: {:.2f}.".format(
-                result.mean * 1000, gops_per_sec, gops_per_sec / peak
-            )
-        )
-        # t_func.export_library("gemm_tensorize.o")
-
-    verify()
-
-
-if __name__ == "__main__":
-    benchmark_fc_int8_acc16()
diff --git a/tests/python/contrib/test_gemm_acc32_vnni.py b/tests/python/contrib/test_gemm_acc32_vnni.py
deleted file mode 100644
index 2e15d38612ce..000000000000
--- a/tests/python/contrib/test_gemm_acc32_vnni.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-import tvm.testing
-from tvm import te
-import numpy as np
-from tvm.topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32
-
-
-def verify_fc_int8_acc32(m=1024, n=1024, k=1024, target="llvm -mcpu=cascadelake"):
-    X = te.placeholder((m, k), name="X", dtype="uint8")
-    # W = te.placeholder((n, k), name="W", dtype="int8")
-
-    if not tvm.testing.device_enabled(target):
-        print("skip because %s is not enabled..." % target)
-        return
-
-    dev = tvm.device(target, 0)
-    # workaround for Target.current()
-    with tvm.target.Target(target) as target:
-        pc = dot_16x1x16_uint8_int8_int32()
-
-    ak = te.reduce_axis((0, k), name="k")
-    packedW = te.placeholder((n // 16, 16 * (k // 4), 4), name="packedW", dtype="int8")
-
-    t_fc = te.compute(
-        (m, n),
-        lambda i, j: te.sum(
-            X[i, ak].astype("int32")
-            * packedW[
-                tvm.tir.indexdiv(j, 16), tvm.tir.indexdiv(ak, 4) * 16 + j % 16, ak % 4
-            ].astype("int32"),
-            axis=ak,
-        ),
-        name="F",
-    )
-    t_sch = te.create_schedule(t_fc.op)
-    a_x, a_y = t_fc.op.axis
-    (a_k,) = t_fc.op.reduce_axis
-
-    a_yo, a_yi = t_sch[t_fc].split(a_y, factor=16)
-    a_xo, a_xi = t_sch[t_fc].split(a_x, factor=32)
-    a_ko, a_ki = t_sch[t_fc].split(a_k, factor=4)
-    a_koo, a_koi = t_sch[t_fc].split(a_ko, factor=4)
-    t_sch[t_fc].reorder(a_yo, a_xo, a_xi, a_koo, a_koi, a_yi, a_ki)
-
-    t_sch[t_fc].unroll(a_koi)
-    t_sch[t_fc].tensorize(a_yi, pc)
-
-    t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic")
-    t_evaluator = t_func.time_evaluator(t_func.entry_name, dev, number=10)
-
-    # generate the plain data
-    a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8")
-    b_ = np.random.uniform(1, 10, size=(n, k)).astype("int8")
-
-    packW = np.random.uniform(1, 10, size=(n // 16, 16 * (k // 4), 4)).astype("int8")
-    # This occurs in pre_compute stage
-    for r_idx in range(n // 16):
-        for s_idx in range(16 * (k // 4)):
-            for t_idx in range(4):
-                packW[r_idx][s_idx][t_idx] = b_[r_idx * 16 + s_idx % 16][(s_idx // 16) * 4 + t_idx]
-
-    x = tvm.nd.array(a_, dev)
-    w = tvm.nd.array(packW, dev)
-    y = tvm.nd.array(np.zeros((m, n), dtype="int32"), dev)
-    result = t_evaluator(x, w, y)
-
-    peak = 280
-    print("Peak {} Gops/s".format(peak))
-    # memory_ops = m * k + n * k + 2 * m * n
-    gops_per_mm = 2 * m * n * k
-
-    gops_per_sec = gops_per_mm / result.mean / 1e9
-    # verify the correctness
-    tvm.testing.assert_allclose(y.numpy(), np.dot(a_, b_.T), rtol=0)
-    print(
-        "Tensorization: running time: {:.3f} ms, {:.2f} Gops/s, effiency: {:.2f}".format(
-            result.mean * 1000, gops_per_sec, gops_per_sec / peak
-        )
-    )
-    # t_func.export_library("tensorize_acc32.o")
-
-
-@tvm.testing.requires_x86_vnni
-def test_fc_int8_acc32_vnni():
-    # For LLVM < 8.0, it shows "'cascadelake' is not a recognized processor for this target
-    # (ignoring processor)" error with the following setting. After LLVM 8.0 is enabled in the
-    # test, we should use cascadelake setting.
-    verify_fc_int8_acc32()
-
-
-@tvm.testing.requires_x86_avx512
-def test_fc_int8_acc32_avx512():
-    verify_fc_int8_acc32(target="llvm -mcpu=skylake-avx512")
-
-
-if __name__ == "__main__":
-    test_fc_int8_acc32_vnni()
-    test_fc_int8_acc32_avx512()
diff --git a/tests/python/contrib/test_hexagon/conv2d/test_conv2d_blocked.py b/tests/python/contrib/test_hexagon/conv2d/test_conv2d_blocked.py
deleted file mode 100644
index 07f6c2613dbc..000000000000
--- a/tests/python/contrib/test_hexagon/conv2d/test_conv2d_blocked.py
+++ /dev/null
@@ -1,207 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Hexagon contrib tests for blocked conv2d """
-
-
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import te, topi
-from tvm.topi import testing
-
-from ..infrastructure import (
-    build_and_run,
-    conv2d_compute,
-    conv2d_verify,
-    get_block_shape,
-    get_packed_filter_shape,
-    get_packed_shape,
-)
-
-
-def conv2d_nhwc8h8w32c(
-    shape_input,
-    pad,
-    stride,
-    dilation,
-    shape_filter,
-    k_split_factor,
-    h_split_factor,
-    dtype,
-    storage_scope="global",
-):
-    """
-    Conv2d wherein the input activation is defined by its
-    logical NHWC layout.  The filter is provided in its physical
-    packed layout (oihw8i32o4i).  The input is padded and then packed
-    into its physical packed layout (nhwc8h8w32c).  The resulting
-    computation is in the same physical packed layout (nhwc8h8w32c).
-    """
-
-    # nhwc layout
-    logical_input = te.placeholder(shape_input, dtype=dtype, name="logical_input")
-
-    # oihw8i32o4i layout
-    filt_packed = te.placeholder(shape_filter, dtype=dtype, name="packed_filter")
-
-    block_h, block_w, block_c = get_block_shape()
-
-    # Calculate padded input
-    _, height, width, _ = shape_input
-    pad_h = (block_h - ((height + pad[1]) % block_h)) % block_h
-    pad_w = (block_w - ((width + pad[3]) % block_w)) % block_w
-    padded_input = topi.nn.pad(
-        logical_input,
-        [0, pad[0], pad[2], 0],
-        [0, pad_h, pad_w, 0],
-        pad_value=0,
-        name="padded_input",
-    )
-
-    # Calculate packed input
-    packed_shape = get_packed_shape(padded_input.shape)
-    packed_input = te.compute(
-        packed_shape,
-        lambda n, ho, wo, co, hi, wi, ci: padded_input[
-            n, ho * block_h + hi, wo * block_w + wi, co * block_c + ci
-        ],
-        name="packed_input",
-    )
-
-    output_shape, compute = conv2d_compute(packed_input, filt_packed, pad, stride, dilation)
-    packed_output = te.compute(output_shape, compute, name="packed_output")
-    s = te.create_schedule(packed_output.op)
-
-    # Ensure the padding and array packing is performed inline
-    s[padded_input].compute_inline()
-    s[packed_input].compute_inline()
-
-    # cache reads and writes
-    cached_input = s.cache_read(packed_input, storage_scope, [packed_output])
-    cached_filt = s.cache_read(filt_packed, storage_scope, [packed_output])
-    cached_output = s.cache_write(packed_output, storage_scope)
-
-    # cache write schedule
-    batch, h_outer, w_outer, k_outer, h_inner, w_inner, k_inner = s[packed_output].op.axis
-    koo, koi = s[packed_output].split(k_outer, factor=k_split_factor)
-    hoo, hoi = s[packed_output].split(h_outer, factor=h_split_factor)
-    s[packed_output].reorder(batch, koo, hoo, koi, hoi, w_outer, h_inner, w_inner, k_inner)
-    s[cached_output].compute_at(s[packed_output], hoo)
-
-    # compute schedule
-    batch, h_outer, w_outer, k_outer, h_inner, w_inner, k_inner = s[cached_output].op.axis
-    _, _, reduce_c = s[cached_output].op.reduce_axis
-    rco, rci = s[cached_output].split(reduce_c, factor=block_c)
-    koo, koi = s[cached_output].split(k_outer, factor=k_split_factor)
-    hoo, hoi = s[cached_output].split(h_outer, factor=h_split_factor)
-    s[cached_output].reorder(
-        batch, koo, hoo, koi, hoi, w_outer, rco, h_inner, w_inner, k_inner, rci
-    )
-    s[cached_input].compute_at(s[cached_output], hoo)
-    s[cached_filt].compute_at(s[cached_output], hoo)
-
-    binds = {}
-    if storage_scope and storage_scope != "global":
-        with tvm.transform.PassContext():
-            input_buffer = tvm.tir.decl_buffer(
-                packed_shape, name="Xb", dtype=dtype, scope=storage_scope
-            )
-            output_buffer = tvm.tir.decl_buffer(
-                output_shape, name="Yb", dtype=dtype, scope=storage_scope
-            )
-            binds = {logical_input: input_buffer, packed_output: output_buffer}
-
-    return (s, [logical_input, filt_packed, packed_output], binds)
-
-
-class BaseConv2d:
-    """Base class for conv2d tests"""
-
-    # input
-    batch = tvm.testing.parameter(1)
-    in_size = tvm.testing.parameter(64)
-    in_channel = tvm.testing.parameter(64)
-    # conv2d
-    pad = tvm.testing.parameter(0)
-    stride = tvm.testing.parameter(1)
-    kernel_size = tvm.testing.parameter(1, 3)
-    out_channel = tvm.testing.parameter(128)
-    # schedule params
-    k_split_factor = tvm.testing.parameter(1, 2)
-    h_split_factor = tvm.testing.parameter(1, 2)
-    dtype = tvm.testing.parameter("float32")
-
-
-class TestConv2dPackedFilter(BaseConv2d):
-    """Conv2d packed filter test class"""
-
-    @tvm.testing.parametrize_targets("llvm")
-    @tvm.testing.skip_if_32bit(reason="Test known to be flaky on i386 machines")
-    def test_conv2d(
-        self,
-        batch,
-        in_size,
-        in_channel,
-        pad,
-        stride,
-        kernel_size,
-        out_channel,
-        k_split_factor,
-        h_split_factor,
-        dtype,
-        target,
-    ):
-        """conv2d test"""
-        # TODO: no support for dilation
-        dilation = 1
-
-        shape_input = [batch, in_size, in_size, in_channel]
-        shape_filter_oihw = [out_channel, in_channel, kernel_size, kernel_size]
-        shape_filter_oihw8i32o4i = get_packed_filter_shape(shape_filter_oihw)
-
-        inputs = [
-            np.random.uniform(0, 255, size=shape_input).astype(dtype),
-            np.random.uniform(0, 255, size=shape_filter_oihw8i32o4i).astype(dtype),
-        ]
-        np_filter = (
-            inputs[1]
-            .transpose(0, 5, 1, 4, 6, 2, 3)
-            .reshape(shape_filter_oihw)
-            .transpose(2, 3, 1, 0)
-        )
-        ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad)
-        output = build_and_run(
-            inputs,
-            conv2d_nhwc8h8w32c,
-            target,
-            target,
-            shape_input=shape_input,
-            pad=(pad, pad, pad, pad),
-            stride=(stride, stride),
-            dilation=(dilation, dilation),
-            shape_filter=shape_filter_oihw8i32o4i,
-            k_split_factor=k_split_factor,
-            h_split_factor=h_split_factor,
-            dtype=dtype,
-        )
-
-        conv2d_verify(output, ref_output, dtype)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/conv2d/test_conv2d_conv2d.py b/tests/python/contrib/test_hexagon/conv2d/test_conv2d_conv2d.py
deleted file mode 100644
index fa770c9be313..000000000000
--- a/tests/python/contrib/test_hexagon/conv2d/test_conv2d_conv2d.py
+++ /dev/null
@@ -1,252 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" back-to-back conv2d Hexagon test for stripe scheduling """
-
-
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import te, topi
-from tvm.topi import testing
-
-from ..infrastructure import (
-    build_and_run,
-    conv2d_compute,
-    conv2d_verify,
-    get_block_shape,
-    get_packed_filter_shape,
-    get_packed_shape,
-)
-
-
-def conv2dconv2d_nhwc8h8w32c(
-    shape_input,
-    pad1,
-    stride1,
-    dilation1,
-    shape_filter1,
-    pad2,
-    stride2,
-    dilation2,
-    shape_filter2,
-    k_split_factor,
-    h_split_factor,
-    dtype,
-    storage_scope="global",
-):
-    """
-    Conv2d -> Conv2d wherein the input activation is defined by its
-    logical NHWC layout.  The filter is provided in its physical
-    packed layout (oihw8i32o4i).  The input is padded and then packed
-    into its physical packed layout (nhwc8h8w32c).  The resulting
-    computation is in the same physical packed layout (nhwc8h8w32c).
-    """
-
-    # nhwc layout
-    logical_input = te.placeholder(shape_input, dtype=dtype, name="logical_input")
-
-    # oihw8i32o4i layout
-    filt_packed1 = te.placeholder(shape_filter1, dtype=dtype, name="packed_filter1")
-    filt_packed2 = te.placeholder(shape_filter2, dtype=dtype, name="packed_filter2")
-
-    block_h, block_w, block_c = get_block_shape()
-
-    # Calculate padded input
-    _, height, width, _ = shape_input
-    pad_h = (block_h - ((height + pad1[1]) % block_h)) % block_h
-    pad_w = (block_w - ((width + pad1[3]) % block_w)) % block_w
-    padded_input = topi.nn.pad(
-        logical_input,
-        [0, pad1[0], pad1[2], 0],
-        [0, pad_h, pad_w, 0],
-        pad_value=0,
-        name="padded_input",
-    )
-
-    # Calculate packed input
-    packed_shape = get_packed_shape(padded_input.shape)
-    packed_input = te.compute(
-        packed_shape,
-        lambda n, ho, wo, co, hi, wi, ci: padded_input[
-            n, ho * block_h + hi, wo * block_w + wi, co * block_c + ci
-        ],
-        name="packed_input",
-    )
-
-    output_shape1, compute1 = conv2d_compute(packed_input, filt_packed1, pad1, stride1, dilation1)
-    temp_output = te.compute(output_shape1, compute1, name="temp_output")
-
-    output_shape2, compute2 = conv2d_compute(temp_output, filt_packed2, pad2, stride2, dilation2)
-    packed_output = te.compute(output_shape2, compute2, name="packed_output")
-    s = te.create_schedule(packed_output.op)
-
-    # Ensure the padding and array packing is performed inline
-    s[padded_input].compute_inline()
-    s[packed_input].compute_inline()
-
-    # cache reads and writes
-    packed_input_cached = s.cache_read(packed_input, storage_scope, [temp_output])
-    filt_packed1_cached = s.cache_read(filt_packed1, storage_scope, [temp_output])
-    filt_packed2_cached = s.cache_read(filt_packed2, storage_scope, [packed_output])
-    packed_output_cached = s.cache_write(packed_output, storage_scope)
-
-    # conv2d #1 schedule
-    n, h_outer, w_outer, k_outer, h_inner, w_inner, k_inner = s[temp_output].op.axis
-    _, _, reduce_channel = s[temp_output].op.reduce_axis
-    rco, rci = s[temp_output].split(reduce_channel, factor=block_c)
-    koo, koi = s[temp_output].split(k_outer, factor=k_split_factor)
-    hoo, hoi = s[temp_output].split(h_outer, factor=h_split_factor)
-    s[temp_output].reorder(n, koo, hoo, koi, hoi, w_outer, rco, h_inner, w_inner, k_inner, rci)
-    s[packed_input_cached].compute_at(s[temp_output], hoo)
-    s[filt_packed1_cached].compute_at(s[temp_output], hoo)
-
-    # cache write schedule
-    n, h_outer, w_outer, k_outer, h_inner, w_inner, k_inner = s[packed_output].op.axis
-    koo, koi = s[packed_output].split(k_outer, factor=k_split_factor)
-    hoo, hoi = s[packed_output].split(h_outer, factor=h_split_factor)
-    s[packed_output].reorder(n, koo, hoo, koi, hoi, w_outer, h_inner, w_inner, k_inner)
-    s[packed_output_cached].compute_at(s[packed_output], hoo)
-
-    # conv2d #2 schedule
-    n, h_outer, w_outer, k_outer, h_inner, w_inner, k_inner = s[packed_output_cached].op.axis
-    _, _, reduce_channel = s[packed_output_cached].op.reduce_axis
-    rco, rci = s[packed_output_cached].split(reduce_channel, factor=block_c)
-    koo, koi = s[packed_output_cached].split(k_outer, factor=k_split_factor)
-    hoo, hoi = s[packed_output_cached].split(h_outer, factor=h_split_factor)
-    s[packed_output_cached].reorder(
-        n, koo, hoo, koi, hoi, w_outer, rco, h_inner, w_inner, k_inner, rci
-    )
-    s[temp_output].compute_at(s[packed_output_cached], hoo)
-    s[filt_packed2_cached].compute_at(s[packed_output_cached], hoo)
-
-    binds = {}
-    if storage_scope and storage_scope != "global":
-        with tvm.transform.PassContext():
-            input_buffer = tvm.tir.decl_buffer(
-                packed_shape, name="Xb", dtype=dtype, scope=storage_scope
-            )
-            output_buffer = tvm.tir.decl_buffer(
-                output_shape2, name="Yb", dtype=dtype, scope=storage_scope
-            )
-            binds = {logical_input: input_buffer, packed_output: output_buffer}
-
-    return (s, [logical_input, filt_packed1, filt_packed2, packed_output], binds)
-
-
-class BaseConv2dConv2d:
-    """Base class for conv2d-conv2d tests"""
-
-    # input
-    batch = tvm.testing.parameter(1)
-    in_size = tvm.testing.parameter(64)
-    in_channel = tvm.testing.parameter(128)
-    # conv2d #1
-    pad1 = tvm.testing.parameter(0)
-    stride1 = tvm.testing.parameter(1)
-    kernel_size1 = tvm.testing.parameter(1, 3)
-    out_channel1 = tvm.testing.parameter(128)
-    # conv2d #2
-    stride2 = tvm.testing.parameter(1)
-    kernel_size2 = tvm.testing.parameter(1, 3)
-    out_channel2 = tvm.testing.parameter(128)
-    # schedule params
-    k_split_factor = tvm.testing.parameter(1, 2)
-    h_split_factor = tvm.testing.parameter(1, 2)
-    dtype = tvm.testing.parameter("float32")
-
-
-class TestConv2dConv2dPackedFilter(BaseConv2dConv2d):
-    """Conv2d-Conv2d packed filter test class"""
-
-    @tvm.testing.parametrize_targets("llvm")
-    @tvm.testing.skip_if_32bit(reason="Test known to be flaky on i386 machines")
-    def test_conv2d(
-        self,
-        batch,
-        in_size,
-        in_channel,
-        pad1,
-        stride1,
-        kernel_size1,
-        out_channel1,
-        stride2,
-        kernel_size2,
-        out_channel2,
-        k_split_factor,
-        h_split_factor,
-        dtype,
-        target,
-    ):
-        """conv2d-conv2d test"""
-        # TODO: no support for padding in conv2d #2
-        pad2 = 0
-
-        # TODO: no support for dilation
-        dilation1 = 1
-        dilation2 = 1
-
-        shape_input = [batch, in_size, in_size, in_channel]
-        shape_filter1_oihw = [out_channel1, in_channel, kernel_size1, kernel_size1]
-        shape_filter1_oihw8i32o4i = get_packed_filter_shape(shape_filter1_oihw)
-
-        shape_filter2_oihw = [out_channel2, out_channel1, kernel_size2, kernel_size2]
-        shape_filter2_oihw8i32o4i = get_packed_filter_shape(shape_filter2_oihw)
-
-        inputs = [
-            np.random.uniform(0, 255, size=shape_input).astype(dtype),
-            np.random.uniform(0, 255, size=shape_filter1_oihw8i32o4i).astype(dtype),
-            np.random.uniform(0, 255, size=shape_filter2_oihw8i32o4i).astype(dtype),
-        ]
-        np_filter1 = (
-            inputs[1]
-            .transpose(0, 5, 1, 4, 6, 2, 3)
-            .reshape(shape_filter1_oihw)
-            .transpose(2, 3, 1, 0)
-        )
-        np_filter2 = (
-            inputs[2]
-            .transpose(0, 5, 1, 4, 6, 2, 3)
-            .reshape(shape_filter2_oihw)
-            .transpose(2, 3, 1, 0)
-        )
-        temp_output = testing.conv2d_nhwc_python(inputs[0], np_filter1, stride1, pad1)
-        ref_output = testing.conv2d_nhwc_python(temp_output, np_filter2, stride2, pad2)
-        output = build_and_run(
-            inputs,
-            conv2dconv2d_nhwc8h8w32c,
-            target,
-            target,
-            shape_input=shape_input,
-            pad1=(pad1, pad1, pad1, pad1),
-            stride1=(stride1, stride1),
-            dilation1=(dilation1, dilation1),
-            shape_filter1=shape_filter1_oihw8i32o4i,
-            pad2=(pad2, pad2, pad1, pad1),
-            stride2=(stride2, stride2),
-            dilation2=(dilation2, dilation2),
-            shape_filter2=shape_filter2_oihw8i32o4i,
-            k_split_factor=k_split_factor,
-            h_split_factor=h_split_factor,
-            dtype=dtype,
-        )
-
-        conv2d_verify(output, ref_output, dtype)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
index d22b2db9c399..99fc6ac074c2 100644
--- a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
+++ b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
@@ -200,12 +200,7 @@ def schedule_args(
         working_scope,
     ):
         """Create and return the schedule and input args after applying layout transform"""
-        if schedule_type == "TE":
-
-            return self._te_schedule_args(
-                input_shape, dtype, input_layout, output_layout, working_layout, working_scope
-            )
-        elif schedule_type == "TIR":
+        if schedule_type == "TIR":
             return self._tir_schedule_args(
                 input_shape, dtype, input_layout, output_layout, working_layout, working_scope
             )
@@ -222,40 +217,6 @@ def _te_tensors(self, input_shape, dtype):
         )
         return input_tensor, output_tensor
 
-    def _te_schedule_args(
-        self,
-        input_shape,
-        dtype,
-        input_layout,
-        output_layout,
-        working_layout,
-        working_scope,
-    ):
-        input_tensor, output_tensor = self._te_tensors(input_shape, dtype)
-
-        schedule = te.create_schedule(output_tensor.op)
-
-        write_cache = schedule.cache_write(output_tensor, working_scope)
-        read_cache = schedule.cache_read(input_tensor, working_scope, [write_cache])
-
-        def apply_transform(tensor, layout):
-            if layout == "nhwc":
-                return None
-            if layout == "nchw-8h8w32c-1d":
-                return schedule[tensor].transform_layout(layout_transform_1d)
-            if layout == "nchw-8h8w32c-2d":
-                return schedule[tensor].transform_layout(layout_transform_2d)
-            raise RuntimeError(f"Unexpected layout '{layout}'")
-
-        apply_transform(input_tensor, input_layout)
-        compute_loopnest = apply_transform(output_tensor, output_layout) or output_tensor.op.axis
-        schedule[write_cache].compute_at(schedule[output_tensor], compute_loopnest[0])
-
-        apply_transform(read_cache, working_layout)
-        apply_transform(write_cache, working_layout)
-
-        return [schedule, [input_tensor, output_tensor]]
-
     def _tir_schedule_args(
         self, input_shape, dtype, input_layout, output_layout, working_layout, working_scope
     ):
diff --git a/tests/python/contrib/test_hexagon/test_launcher.py b/tests/python/contrib/test_hexagon/test_launcher.py
index 95c6c1e19805..c84e7a9d4a4c 100644
--- a/tests/python/contrib/test_hexagon/test_launcher.py
+++ b/tests/python/contrib/test_hexagon/test_launcher.py
@@ -39,11 +39,9 @@ def test_add(hexagon_session: Session):
     compute_c = tvm.te.compute(
         placeholder_a.shape, lambda i: placeholder_a[i] + placeholder_b[0], name="C"
     )
-    sched = tvm.te.create_schedule(compute_c.op)
 
     func = tvm.build(
-        sched,
-        [placeholder_a, placeholder_b, compute_c],
+        te.create_prim_func([placeholder_a, placeholder_b, compute_c]),
         get_hexagon_target("v68"),
         name="add",
     )
@@ -69,11 +67,9 @@ def test_add_vtcm(hexagon_session: Session):
     compute_c = tvm.te.compute(
         placeholder_a.shape, lambda i: placeholder_a[i] + placeholder_b[0], name="C"
     )
-    sched = tvm.te.create_schedule(compute_c.op)
 
     func = tvm.build(
-        sched,
-        [placeholder_a, placeholder_b, compute_c],
+        te.create_prim_func([placeholder_a, placeholder_b, compute_c]),
         get_hexagon_target("v68"),
         name="add",
     )
@@ -117,11 +113,9 @@ def test_matmul(self, hexagon_session, size_m, size_n, size_k):
                 placeholder_x[i, reduce_k1] * placeholder_y[reduce_k1, j], axis=[reduce_k1]
             ),
         )
-        schedule = te.create_schedule(compute_z.op)
 
         func = tvm.build(
-            schedule,
-            [placeholder_x, placeholder_y, compute_z],
+            te.create_prim_func([placeholder_x, placeholder_y, compute_z]),
             get_hexagon_target("v68"),
         )
 
diff --git a/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py b/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py
deleted file mode 100644
index 0cc6dbd8163f..000000000000
--- a/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Contrib tests for blocked conv2d and maxpool2d"""
-
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import te, topi
-from tvm.topi import testing
-
-from .infrastructure import build_and_run, get_block_shape, get_packed_shape
-
-
-# Blocked layout: NHWC8h8w32c :: [N, H//8, W//8, C//32, 8h, 8w, 32c]
-def maxpool2d_logical(
-    shape_nhwc,
-    window_shape,
-    stride,
-    padding,
-    dtype,
-    storage_scope="global",
-):
-    """
-    Maxpool2d TE wherein the input activation is defined by its
-    logical NHWC shape. The packed physical layout for the
-    activation is nhwc8h8w32c.
-    """
-
-    block_h, block_w, block_c = get_block_shape()
-    shape = get_packed_shape(shape_nhwc)
-    logical_output_shape = (
-        shape_nhwc[0],
-        (shape_nhwc[1] - window_shape[0] + padding[0] + padding[1]) // stride[0] + 1,
-        (shape_nhwc[2] - window_shape[1] + padding[2] + padding[3]) // stride[0] + 1,
-        shape_nhwc[3],
-    )
-    output_shape = get_packed_shape(logical_output_shape)
-
-    _, height, width, _ = shape_nhwc
-    placeholder_x = te.placeholder(shape_nhwc, dtype=dtype)
-
-    # Combination of padding required by maxpool operator and padding to evenly divisible
-    # number of blocks. Note that this padding should be inlined in the schedule so
-    # as to avoid input copying.
-    pad_h = (block_h - ((height + padding[1]) % block_h)) % block_h
-    pad_w = (block_w - ((width + padding[3]) % block_w)) % block_w
-    x_pad = topi.nn.pad(
-        placeholder_x, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0
-    )
-
-    # Calculate packed layout
-    x_packed = te.compute(
-        shape,
-        lambda n, ho, wo, co, hi, wi, ci: x_pad[
-            n, ho * block_h + hi, wo * block_w + wi, co * block_c + ci
-        ],
-    )
-
-    reduce_h = te.reduce_axis((0, window_shape[0]), name="rh")
-    reduce_w = te.reduce_axis((0, window_shape[1]), name="rw")
-
-    def compute(batch, h_outer, w_outer, c_outer, h_inner, w_inner, c_inner):
-        # Construct blockized strided maxpool height indices
-        h = h_outer * block_h + h_inner
-        h_contig = h * stride[0] + reduce_h
-        h_block_id = h_contig // block_h
-        h_block_offset = h_contig % block_h
-
-        # Construct blockized strided maxpool width indices
-        w_idx = w_outer * block_w + w_inner
-        w_contig = w_idx * stride[1] + reduce_w
-        w_block_id = w_contig // block_w
-        w_block_offset = w_contig % block_w
-
-        return te.max(
-            x_packed[
-                batch, h_block_id, w_block_id, c_outer, h_block_offset, w_block_offset, c_inner
-            ],
-            axis=[reduce_h, reduce_w],
-        )
-
-    compute_y = te.compute(output_shape, compute)
-    schedule = te.create_schedule(compute_y.op)
-
-    # Ensure the padding and array packing is performed inline
-    schedule[x_pad].compute_inline()
-    schedule[x_packed].compute_inline()
-
-    binds = {}
-    if storage_scope and storage_scope != "global":
-        with tvm.transform.PassContext():
-            x_buffer = tvm.tir.decl_buffer(shape, name="Xb", dtype=dtype, scope=storage_scope)
-            y_buffer = tvm.tir.decl_buffer(
-                output_shape, name="Yb", dtype=dtype, scope=storage_scope
-            )
-            binds = {placeholder_x: x_buffer, compute_y: y_buffer}
-
-    return (schedule, [placeholder_x, compute_y], binds)
-
-
-class BaseMaxPooling:
-    batch = tvm.testing.parameter(1)
-    in_size = tvm.testing.parameter(8, 112)
-    in_channel = tvm.testing.parameter(64)
-    window_size = tvm.testing.parameter(3)
-    stride = tvm.testing.parameter(2)
-    pad = tvm.testing.parameter(1)
-    dtype = tvm.testing.parameter("float32")
-
-
-class TestMaxPooling(BaseMaxPooling):
-    """Test MaxPool class"""
-
-    @tvm.testing.parametrize_targets("llvm")
-    def test_maxpool(self, shape_nhwc, window_size, stride, pad, dtype, target):
-        """Test blocked maxpool"""
-        inputs = [np.random.uniform(0, 255, size=shape_nhwc).astype(dtype)]
-        ref_output = testing.poolnd_python(
-            inputs[0],
-            (window_size, window_size),
-            strides=(stride, stride),
-            dilation=(1, 1),
-            padding_before=(pad, pad),
-            padding_after=(pad, pad),
-            pool_type="max",
-        )
-        output = build_and_run(
-            inputs,
-            maxpool2d_logical,
-            target,
-            target,
-            shape_nhwc,
-            window_shape=(window_size, window_size),
-            stride=(stride, stride),
-            padding=(pad, pad, pad, pad),
-            dtype=dtype,
-        )
-        assert all([output is not None, ref_output is not None])
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hipblas.py b/tests/python/contrib/test_hipblas.py
index 63a7553704bf..e5df51e62942 100644
--- a/tests/python/contrib/test_hipblas.py
+++ b/tests/python/contrib/test_hipblas.py
@@ -29,14 +29,13 @@ def verify_matmul_add(in_dtype, out_dtype, rtol=1e-5):
     A = te.placeholder((n, l), name="A", dtype=in_dtype)
     B = te.placeholder((l, m), name="B", dtype=in_dtype)
     C = hipblas.matmul(A, B, dtype=out_dtype)
-    s = te.create_schedule(C.op)
 
     def verify(target="rocm"):
         if not tvm.get_global_func("tvm.contrib.hipblas.matmul", True):
             print("skip because extern function is not available")
             return
         dev = tvm.rocm(0)
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
         a = tvm.nd.array(np.random.uniform(0, 128, size=(n, l)).astype(A.dtype), dev)
         b = tvm.nd.array(np.random.uniform(0, 128, size=(l, m)).astype(B.dtype), dev)
         c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev)
@@ -56,10 +55,9 @@ def verify_batch_matmul(Ashape, Bshape, Cshape, in_dtype, out_dtype, rtol=1e-5):
     A = te.placeholder(Ashape, name="A", dtype=in_dtype)
     B = te.placeholder(Bshape, name="B", dtype=in_dtype)
     C = hipblas.batch_matmul(A, B, dtype=out_dtype)
-    s = te.create_schedule(C.op)
 
     dev = tvm.rocm(0)
-    f = tvm.build(s, [A, B, C], "rocm")
+    f = tvm.build(te.create_prim_func([A, B, C]), target="rocm")
 
     if "int" in in_dtype:
         a = tvm.nd.array(np.random.uniform(1, 10, size=Ashape).astype(in_dtype), dev)
diff --git a/tests/python/contrib/test_miopen.py b/tests/python/contrib/test_miopen.py
deleted file mode 100644
index 81115b6c0238..000000000000
--- a/tests/python/contrib/test_miopen.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-import tvm.testing
-from tvm import te
-from tvm.contrib import miopen
-import numpy as np
-import pytest
-
-
-requires_miopen = pytest.mark.skipif(
-    tvm.get_global_func("tvm.contrib.miopen.conv2d.setup", True) is None,
-    reason="MIOpen is not enabled",
-)
-
-
-@tvm.testing.requires_rocm
-@requires_miopen
-def test_conv2d():
-    in_channel = 3
-    out_channel = 64
-    filter_h = 3
-    filter_w = 3
-    pad_h = 1
-    pad_w = 1
-    stride_h = 1
-    stride_w = 1
-    dilation_h = 1
-    dilation_w = 1
-
-    xshape = [1, in_channel, 128, 128]
-    wshape = (out_channel, in_channel, filter_h, filter_w)
-
-    X = te.placeholder(xshape, name="X")
-    W = te.placeholder(wshape, name="W")
-    Y = miopen.conv2d_forward(
-        X, W, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, conv_mode=0, data_type=1
-    )
-
-    yshape = [x.value for x in Y.shape]
-    from tvm import topi
-
-    s = te.create_schedule(Y.op)
-
-    def verify():
-        dev = tvm.rocm(0)
-        f = tvm.build(s, [X, W, Y], "rocm --host=llvm", name="conv2d")
-        x = tvm.nd.array(np.random.uniform(-1, 1, xshape).astype(np.float32), dev)
-        w = tvm.nd.array(np.random.uniform(-1, 1, wshape).astype(np.float32), dev)
-        y = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), dev)
-        f(x, w, y)
-
-        Y_ref = topi.nn.conv2d_nchw(
-            X, W, (stride_h, stride_w), (pad_h, pad_w), (dilation_h, dilation_w)
-        )
-        s_ref = te.create_schedule(Y_ref.op)
-        f_ref = tvm.build(s_ref, [X, W, Y_ref], "rocm --host=llvm")
-        y_ref = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), dev)
-        f_ref(x, w, y_ref)
-        print("Max abs diff:", np.max(np.abs(y.numpy() - y_ref.numpy())))
-        tvm.testing.assert_allclose(y.numpy(), y_ref.numpy(), atol=1e-3)
-
-    verify()
-
-
-def verify_softmax(shape, axis, dtype="float32", log_softmax=False):
-    miopen_op = miopen.log_softmax if log_softmax else miopen.softmax
-    testing_op = (
-        tvm.topi.testing.log_softmax_python if log_softmax else tvm.topi.testing.softmax_python
-    )
-
-    A = te.placeholder(shape, dtype=dtype, name="A")
-    B = miopen_op(A, axis)
-    s = te.create_schedule([B.op])
-
-    dev = tvm.rocm(0)
-    a_np = np.random.uniform(size=shape).astype(dtype)
-    b_np = testing_op(a_np)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(b_np, dev)
-    f = tvm.build(s, [A, B], target="rocm --host=llvm", name="softmax")
-    f(a, b)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3)
-
-
-def verify_softmax_4d(shape, dtype="float32", log_softmax=False):
-    miopen_op = miopen.log_softmax if log_softmax else miopen.softmax
-    testing_op = (
-        tvm.topi.testing.log_softmax_python if log_softmax else tvm.topi.testing.softmax_python
-    )
-
-    A = te.placeholder(shape, dtype=dtype, name="A")
-    B = miopen_op(A, axis=1)
-    s = te.create_schedule([B.op])
-
-    dev = tvm.rocm(0)
-    n, c, h, w = shape
-    a_np = np.random.uniform(size=shape).astype(dtype)
-    b_np = testing_op(a_np.transpose(0, 2, 3, 1).reshape(h * w, c))
-    b_np = b_np.reshape(n, h, w, c).transpose(0, 3, 1, 2)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(b_np, dev)
-    f = tvm.build(s, [A, B], target="rocm --host=llvm", name="softmax")
-    f(a, b)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3)
-
-
-@tvm.testing.requires_rocm
-@requires_miopen
-def test_softmax():
-    verify_softmax((32, 10), -1)
-    verify_softmax((3, 4), -1)
-    verify_softmax_4d((1, 16, 256, 256))
-    verify_softmax_4d((1, 16, 256, 256))
-
-    verify_softmax((32, 10), -1, log_softmax=True)
-    verify_softmax((3, 4), -1, log_softmax=True)
-    verify_softmax_4d((1, 16, 256, 256), log_softmax=True)
-
-
-if __name__ == "__main__":
-    test_conv2d()
diff --git a/tests/python/contrib/test_mps.py b/tests/python/contrib/test_mps.py
index 92462e4c4f9e..e876672feaed 100644
--- a/tests/python/contrib/test_mps.py
+++ b/tests/python/contrib/test_mps.py
@@ -29,33 +29,20 @@ def test_matmul():
     A = te.placeholder((n, l), name="A")
     B = te.placeholder((l, m), name="B")
     C = mps.matmul(A, B)
-    D = te.compute(C.shape, lambda *i: C(*i) + 1.0)
-    s = te.create_schedule(D.op)
-    yo, xo = D.op.axis
-    block_y = te.thread_axis("blockIdx.y")
-    block_x = te.thread_axis("blockIdx.x")
-    thread_y = te.thread_axis("threadIdx.y")
-    thread_x = te.thread_axis("threadIdx.x")
-    by, ty = s[D].split(yo, factor=16)
-    bx, tx = s[D].split(xo, factor=16)
-    s[D].bind(by, block_y)
-    s[D].bind(bx, block_x)
-    s[D].bind(ty, thread_y)
-    s[D].bind(tx, thread_x)
 
-    def verify(A, B, D, s, target="metal"):
+    def verify(A, B, C):
         if not tvm.get_global_func("tvm.contrib.mps.matmul", True):
             print("skip because extern function is not available")
             return
         dev = tvm.metal(0)
-        f = tvm.build(s, [A, B, D], "metal")
+        f = tvm.build(te.create_prim_func([A, B, C]), target="metal")
         a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), dev)
         b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), dev)
         c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev)
         f(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), np.dot(a.numpy(), b.numpy()) + 1, rtol=1e-5)
+        tvm.testing.assert_allclose(c.numpy(), np.dot(a.numpy(), b.numpy()), rtol=1e-5)
 
-    verify(A, B, D, s)
+    verify(A, B, C)
 
 
 @tvm.testing.requires_metal
@@ -71,20 +58,17 @@ def test_conv2d():
     A = te.placeholder((n, h, w, ci), name="x")
     B = te.placeholder((co, kh, kw, ci), name="w")
     C = mps.conv2d(A, B, "SAME", 2)
-    s1 = te.create_schedule(C.op)
 
     def verify(A, B, C, target="llvm"):
         if not tvm.get_global_func("tvm.contrib.mps.conv2d", True):
             print("skip because extern function is not available")
             return
         dev = tvm.metal(0)
-        f = tvm.build(s1, [A, B, C], "metal")
+        f = tvm.build(te.create_prim_func([A, B, C]), target="metal")
         a = tvm.nd.array(np.random.uniform(size=(n, h, w, ci)).astype(A.dtype), dev)
         b = tvm.nd.array(np.random.uniform(size=(co, kh, kw, ci)).astype(B.dtype), dev)
         c = tvm.nd.array(np.zeros((n, h // stride, w // stride, co), dtype=C.dtype), dev)
         f(a, b, c)
-        # print(c.numpy())
-        # print(c.shape)
 
-    verify(A, B, C, s1)
+    verify(A, B, C)
 
diff --git a/tests/python/contrib/test_random.py b/tests/python/contrib/test_random.py
index 6ffd417a0a48..be9fed2c6ee8 100644
--- a/tests/python/contrib/test_random.py
+++ b/tests/python/contrib/test_random.py
@@ -30,7 +30,6 @@ def test_randint():
     m = 10240
     n = 10240
     A = random.randint(-127, 128, size=(m, n), dtype="int32")
-    s = te.create_schedule(A.op)
 
     def verify(target="llvm"):
         if not tvm.testing.device_enabled(target):
@@ -40,7 +39,7 @@ def verify(target="llvm"):
             print("skip because extern function is not available")
             return
         dev = tvm.cpu(0)
-        f = tvm.build(s, [A], target)
+        f = tvm.build(te.create_prim_func([A]), target=target)
         a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev)
         f(a)
         na = a.numpy()
@@ -56,7 +55,6 @@ def test_uniform():
     m = 10240
     n = 10240
     A = random.uniform(0, 1, size=(m, n))
-    s = te.create_schedule(A.op)
 
     def verify(target="llvm"):
         if not tvm.testing.device_enabled(target):
@@ -66,7 +64,7 @@ def verify(target="llvm"):
             print("skip because extern function is not available")
             return
         dev = tvm.cpu(0)
-        f = tvm.build(s, [A], target)
+        f = tvm.build(te.create_prim_func([A]), target=target)
         a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev)
         f(a)
         na = a.numpy()
@@ -82,7 +80,6 @@ def test_normal():
     m = 10240
     n = 10240
     A = random.normal(3, 4, size=(m, n))
-    s = te.create_schedule(A.op)
 
     def verify(target="llvm"):
         if not tvm.testing.device_enabled(target):
@@ -92,7 +89,7 @@ def verify(target="llvm"):
             print("skip because extern function is not available")
             return
         dev = tvm.cpu(0)
-        f = tvm.build(s, [A], target)
+        f = tvm.build(te.create_prim_func([A]), target=target)
         a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev)
         f(a)
         na = a.numpy()
diff --git a/tests/python/contrib/test_rocblas.py b/tests/python/contrib/test_rocblas.py
index c5321cd4eaaf..2c1889a0c43b 100644
--- a/tests/python/contrib/test_rocblas.py
+++ b/tests/python/contrib/test_rocblas.py
@@ -33,14 +33,13 @@ def test_matmul():
     A = te.placeholder((n, l), name="A")
     B = te.placeholder((l, m), name="B")
     C = rocblas.matmul(A, B)
-    s = te.create_schedule(C.op)
 
     def verify(target="rocm"):
         if not tvm.get_global_func("tvm.contrib.rocblas.matmul", True):
             print("skip because extern function is not available")
             return
         dev = tvm.rocm(0)
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
         a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), dev)
         b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), dev)
         c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev)
@@ -57,7 +56,6 @@ def verify_batch_matmul(batch, m, k, n, lib, transa=False, transb=False, dtype="
     A = te.placeholder(ashape, name="A", dtype=dtype)
     B = te.placeholder(bshape, name="B", dtype=dtype)
     C = lib.batch_matmul(A, B, transa, transb)
-    s = te.create_schedule(C.op)
 
     def get_numpy(a, b, transa, transb):
         if transa:
@@ -74,7 +72,7 @@ def verify(target="rocm"):
             print("skip because extern function is not available")
             return
         dev = tvm.rocm(0)
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
         a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), dev)
         b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), dev)
         c = tvm.nd.array(np.zeros((batch, m, n), dtype=C.dtype), dev)
diff --git a/tests/python/contrib/test_sort.py b/tests/python/contrib/test_sort.py
index c135450c09e1..0e0aa71caf10 100644
--- a/tests/python/contrib/test_sort.py
+++ b/tests/python/contrib/test_sort.py
@@ -20,7 +20,6 @@
 import tvm
 import tvm.testing
 from tvm import te
-from tvm.topi.cuda import sort_by_key
 
 
 def test_sort():
@@ -53,8 +52,7 @@ def test_sort():
 
     dev = tvm.cpu(0)
     target = "llvm"
-    s = te.create_schedule(out.op)
-    f = tvm.build(s, [data, sort_num, out], target)
+    f = tvm.build(te.create_prim_func([data, sort_num, out]), target=target)
     a = tvm.nd.array(np.array(input_data).astype(data.dtype), dev)
     b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), dev)
     c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), dev)
@@ -82,8 +80,7 @@ def test_sort_np():
 
     dev = tvm.cpu(0)
     target = "llvm"
-    s = te.create_schedule(out.op)
-    f = tvm.build(s, [data, sort_num, out], target)
+    f = tvm.build(te.create_prim_func([data, sort_num, out]), target=target)
 
     np_data = np.random.uniform(size=dshape)
     np_out = np.argsort(np_data, axis=axis)
@@ -95,40 +92,6 @@ def test_sort_np():
     tvm.testing.assert_allclose(c.numpy(), np_out, rtol=1e-5)
 
 
-def test_sort_by_key_gpu():
-    """Tests sort function using gpu"""
-    size = 6
-    keys = te.placeholder((size,), name="keys", dtype="int32")
-    values = te.placeholder((size,), name="values", dtype="int32")
-
-    for target in ["cuda", "nvptx", "opencl", "rocm"]:
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            continue
-
-        with tvm.target.Target(target):
-            keys_out, values_out = sort_by_key(keys, values)
-            dev = tvm.device(target)
-            s = te.create_schedule([keys_out.op, values_out.op])
-            f = tvm.build(s, [keys, values, keys_out, values_out], target)
-
-            keys_np = np.array([1, 4, 2, 8, 2, 7], np.int32)
-            values_np = np.random.randint(0, 10, size=(size,)).astype(np.int32)
-            keys_np_out = np.zeros(keys_np.shape, np.int32)
-            values_np_out = np.zeros(values_np.shape, np.int32)
-            keys_in = tvm.nd.array(keys_np, dev)
-            values_in = tvm.nd.array(values_np, dev)
-            keys_out = tvm.nd.array(keys_np_out, dev)
-            values_out = tvm.nd.array(values_np_out, dev)
-            f(keys_in, values_in, keys_out, values_out)
-
-            ref_keys_out = np.sort(keys_np)
-            ref_values_out = np.array([values_np[i] for i in np.argsort(keys_np)])
-            tvm.testing.assert_allclose(keys_out.numpy(), ref_keys_out, rtol=1e-5)
-            tvm.testing.assert_allclose(values_out.numpy(), ref_values_out, rtol=1e-5)
-
-
 if __name__ == "__main__":
     test_sort()
     test_sort_np()
-    test_sort_by_key_gpu()
diff --git a/tests/python/contrib/test_sparse.py b/tests/python/contrib/test_sparse.py
deleted file mode 100644
index 8ebd02cc170c..000000000000
--- a/tests/python/contrib/test_sparse.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Configure pytest"""
-# pylint: disable=invalid-name
-from collections import namedtuple
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import te
-import tvm.contrib.sparse as tvmsp
-import tvm.runtime.ndarray as _nd
-
-
-def test_static_tensor():
-    """Tests static tensor"""
-    dtype = "float32"
-    target = "llvm"
-    dev = tvm.device(target, 0)
-    m = te.size_var("m")
-    n = te.size_var("n")
-    A = tvmsp.placeholder(shape=(m, n), name="A", dtype=dtype)
-    assert A.stype == "csr"
-    n = 3
-    a = np.maximum(np.random.uniform(size=(n, n)).astype(dtype) - 0.6, 0.0)
-    a = tvmsp.array(a, dev)
-    A.data = te.placeholder(a.data.shape, dtype, name="A_data")
-    Ab = tvm.tir.decl_buffer(a.data.shape, dtype, name="A_data")
-    binds = {A.data: Ab}
-    C = te.compute(A.data.shape, lambda i: A.data[i] * 2.0, tag="cs_scatter")
-    s = te.create_schedule(C.op)
-    f = tvm.build(s, [A.data, C], target, binds=binds)
-    c = tvmsp.array(np.zeros((n, n), dtype), dev)
-    c.data = tvm.nd.empty(a.data.shape, dtype)
-    c.indices = a.indices
-    c.indptr = a.indptr
-    f(a.data, c.data)
-    tvm.testing.assert_allclose(c.numpy(), a.numpy() * 2.0, rtol=1e-5)
-
-
-def test_dynamic_tensor():
-    """Tests dynamic tensor"""
-    dtype = "float32"
-    target = "llvm"
-    dev = tvm.device(target, 0)
-    nr, nc, n = te.size_var("nr"), te.size_var("nc"), te.size_var("n")
-    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name="A", dtype=dtype)
-    assert A.stype == "csr"
-    C = te.compute(A.data.shape, lambda i: A.data[i] * 2.0, tag="cs_scatter")
-    s = te.create_schedule(C.op)
-    _nr, _nc = 3, 5
-    a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype) - 0.6, 0.0)
-    a = tvmsp.array(a, dev)
-    assert a.data.dtype == a.dtype
-    Ab = namedtuple("CSRBuffer", ["data", "indices", "indptr"])
-    Ab.data = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_data")
-    Ab.indices = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_indices")
-    binds = {A.data: Ab.data, A.indices: Ab.indices}
-    f = tvm.build(s, [nr, A.data, C], target, binds=binds)
-    c = tvmsp.array(np.zeros((_nr, _nc), dtype), dev)
-    c.data = tvm.nd.empty(a.data.shape, dtype)
-    c.indices = a.indices
-    c.indptr = a.indptr
-    f(a.data.shape[0], a.data, c.data)
-    tvm.testing.assert_allclose(c.numpy(), a.numpy() * 2.0, rtol=1e-5)
-
-
-def test_sparse_array_tuple():
-    """Tests array when it is sparse"""
-    dtype, itype = "float32", "int32"
-    target = "llvm"
-    dev = tvm.device(target, 0)
-    nr, nc, n = te.size_var("nr"), te.size_var("nc"), te.size_var("n")
-    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name="A", dtype=dtype)
-    assert A.stype == "csr"
-    C = te.compute(A.data.shape, lambda i: A.data[i] * 2.0, tag="cs_scatter")
-    s = te.create_schedule(C.op)
-    _nr, _nc = 3, 5
-    a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype) - 0.6, 0.0)
-    # convert to sparse array tuple
-    source_array = a
-    ridx, cidx = np.nonzero(source_array)
-    data = source_array[ridx, cidx]
-    a_data = _nd.array(data, dev)
-    indices = np.nonzero(source_array)[1].astype(itype)
-    a_indices = _nd.array(indices, dev)
-    indptr = [0] + np.apply_along_axis(np.count_nonzero, axis=1, arr=source_array).tolist()
-    indptr = np.cumsum(np.array(indptr, itype)).astype(itype)
-    a_indptr = _nd.array(indptr, dev)
-    a_init = (a_data, a_indices, a_indptr)
-    # construct tvm sparse array with tuple
-    a = tvmsp.array(a_init, shape=source_array.shape, device=dev)
-    assert a.data.dtype == a.dtype
-    Ab = namedtuple("CSRBuffer", ["data", "indices", "indptr"])
-    Ab.data = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_data")
-    Ab.indices = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_indices")
-    binds = {A.data: Ab.data, A.indices: Ab.indices}
-    f = tvm.build(s, [nr, A.data, C], target, binds=binds)
-    c = tvmsp.array(np.zeros((_nr, _nc), dtype), dev)
-    c.data = tvm.nd.empty(a.data.shape, dtype)
-    c.indices = a.indices
-    c.indptr = a.indptr
-    f(a.data.shape[0], a.data, c.data)
-    tvm.testing.assert_allclose(c.numpy(), a.numpy() * 2.0, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    test_static_tensor()
-    test_dynamic_tensor()
-    test_sparse_array_tuple()
diff --git a/tests/python/relax/test_frontend_from_fx.py b/tests/python/relax/test_frontend_from_fx.py
index 3c932f86c582..446c4149fdde 100644
--- a/tests/python/relax/test_frontend_from_fx.py
+++ b/tests/python/relax/test_frontend_from_fx.py
@@ -3637,7 +3637,6 @@ def main(
 
 
 def test_stack():
-
     input_info = [
         ([1, 3, 10, 10], "float32"),
         ([1, 3, 10, 10], "float32"),
diff --git a/tests/python/runtime/test_runtime_dlpack.py b/tests/python/runtime/test_runtime_dlpack.py
index cf12c89cdd51..60a86f662c6c 100644
--- a/tests/python/runtime/test_runtime_dlpack.py
+++ b/tests/python/runtime/test_runtime_dlpack.py
@@ -35,9 +35,7 @@ def test_from_dlpack_shape_one():
     B = te.placeholder((rows, 16), name="B")
     C = te.compute(A.shape, lambda i, j: A[i, j] + B[i, j], name="C")
 
-    s = te.create_schedule(C.op)
-
-    fadd = tvm.build(s, [A, B, C], tgt)
+    fadd = tvm.build(te.create_prim_func([A, B, C]), target=tgt)
 
     dev = tvm.device(tgt.kind.name, 0)
 
diff --git a/tests/python/runtime/test_runtime_measure.py b/tests/python/runtime/test_runtime_measure.py
index 8955b03241a2..4b39cef18bc5 100644
--- a/tests/python/runtime/test_runtime_measure.py
+++ b/tests/python/runtime/test_runtime_measure.py
@@ -35,8 +35,7 @@ def my_debug(filename):
             fout.write("c")
 
     X = te.compute((), lambda: tvm.tir.call_packed("my_debug", filename))
-    s = te.create_schedule(X.op)
-    func = tvm.build(s, [X])
+    func = tvm.build(te.create_prim_func([X]))
 
     x = tvm.nd.empty((), dtype="int32")
     ftimer = func.time_evaluator(func.entry_name, tvm.cpu(), number=1, repeat=1)
diff --git a/tests/python/runtime/test_runtime_module_export.py b/tests/python/runtime/test_runtime_module_export.py
index a6554f3a4f75..1dff6c42502e 100644
--- a/tests/python/runtime/test_runtime_module_export.py
+++ b/tests/python/runtime/test_runtime_module_export.py
@@ -17,211 +17,10 @@
 
 import tvm
 import tvm.testing
-import pytest
 
 from tvm.contrib import utils
-import os
 
-header_file_dir_path = utils.tempdir()
 
-
-def gen_engine_header():
-    code = r"""
-        #ifndef _ENGINE_H_
-        #define _ENGINE_H_
-        #include <cstdint>
-        #include <string>
-        #include <sstream>
-        #include <vector>
-        class Engine {
-        };
-
-        #endif
-        """
-    header_file = header_file_dir_path.relpath("gcc_engine.h")
-    with open(header_file, "w") as f:
-        f.write(code)
-
-
-def generate_engine_module():
-    code = r"""
-        #include <tvm/runtime/c_runtime_api.h>
-        #include <dlpack/dlpack.h>
-        #include "gcc_engine.h"
-
-        extern "C" void gcc_1_(float* gcc_input4, float* gcc_input5,
-                float* gcc_input6, float* gcc_input7, float* out) {
-            Engine engine;
-        }
-        """
-    import tvm.runtime._ffi_api
-
-    gen_engine_header()
-    csource_module = tvm.runtime._ffi_api.CSourceModuleCreate(code, "cc", [], None)
-    return csource_module
-
-
-@pytest.mark.skip("LEGACY-TEST: test to be replaced by relax")
-@tvm.testing.uses_gpu
-def test_mod_export():
-    def verify_gpu_mod_export(obj_format):
-        for device in ["llvm", "cuda"]:
-            if not tvm.testing.device_enabled(device):
-                print("skip because %s is not enabled..." % device)
-                return
-
-        synthetic_mod, synthetic_params = relay.testing.synthetic.get_workload()
-        synthetic_llvm_mod, synthetic_llvm_params = relay.testing.synthetic.get_workload()
-        with tvm.transform.PassContext(opt_level=3):
-            _, synthetic_gpu_lib, _ = relay.build_module.build(
-                synthetic_mod, "cuda", params=synthetic_params, mod_name="cudalib"
-            )
-            _, synthetic_llvm_cpu_lib, _ = relay.build_module.build(
-                synthetic_llvm_mod, "llvm", params=synthetic_llvm_params, mod_name="llvmlib"
-            )
-
-        temp = utils.tempdir()
-        if obj_format == ".so":
-            file_name = "deploy_lib.so"
-        else:
-            assert obj_format == ".tar"
-            file_name = "deploy_lib.tar"
-        path_lib = temp.relpath(file_name)
-        synthetic_gpu_lib.import_module(synthetic_llvm_cpu_lib)
-        synthetic_gpu_lib.export_library(path_lib)
-        loaded_lib = tvm.runtime.load_module(path_lib)
-        assert loaded_lib.type_key == "library"
-        assert loaded_lib.imported_modules[0].type_key == "cuda"
-        #  dso modules are merged together
-        assert len(loaded_lib.imported_modules) == 1
-
-    def verify_multi_dso_mod_export(obj_format):
-        for device in ["llvm"]:
-            if not tvm.testing.device_enabled(device):
-                print("skip because %s is not enabled..." % device)
-                return
-
-        A = te.placeholder((1024,), name="A")
-        B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-        s = te.create_schedule(B.op)
-        mod0 = tvm.build(s, [A, B], "llvm", name="myadd0")
-        mod1 = tvm.build(s, [A, B], "llvm", name="myadd1")
-
-        temp = utils.tempdir()
-        if obj_format == ".so":
-            file_name = "deploy_lib.so"
-        else:
-            assert obj_format == ".tar"
-            file_name = "deploy_lib.tar"
-        path_lib = temp.relpath(file_name)
-
-        mod0.import_module(mod1)
-        mod0.export_library(path_lib)
-        loaded_lib = tvm.runtime.load_module(path_lib)
-        assert loaded_lib.type_key == "library"
-        # dso modules are merged
-        assert len(loaded_lib.imported_modules) == 0
-
-    def verify_json_import_dso(obj_format):
-        for device in ["llvm"]:
-            if not tvm.testing.device_enabled(device):
-                print("skip because %s is not enabled..." % device)
-                return
-
-        # Get subgraph Json.
-        subgraph_json = (
-            "json_rt_0\n"
-            + "input 0 10 10\n"
-            + "input 1 10 10\n"
-            + "input 2 10 10\n"
-            + "input 3 10 10\n"
-            + "add 4 inputs: 0 1 shape: 10 10\n"
-            + "sub 5 inputs: 4 2 shape: 10 10\n"
-            + "mul 6 inputs: 5 3 shape: 10 10\n"
-            + "json_rt_1\n"
-            + "input 0 10 10\n"
-            + "input 1 10 10\n"
-            + "input 2 10 10\n"
-            + "input 3 10 10\n"
-            + "add 4 inputs: 0 1 shape: 10 10\n"
-            + "sub 5 inputs: 4 2 shape: 10 10\n"
-            + "mul 6 inputs: 5 3 shape: 10 10"
-        )
-
-        temp = utils.tempdir()
-        subgraph_path = temp.relpath("subgraph.examplejson")
-        with open(subgraph_path, "w") as f:
-            f.write(subgraph_json)
-
-        # Get Json and module.
-        A = te.placeholder((1024,), name="A")
-        B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-        s = te.create_schedule(B.op)
-        f = tvm.build(s, [A, B], "llvm", name="myadd")
-        try:
-            ext_lib = tvm.runtime.load_module(subgraph_path, "examplejson")
-        except:
-            print("skip because Loader of examplejson is not presented")
-            return
-        ext_lib.import_module(f)
-        if obj_format == ".so":
-            file_name = "deploy_lib.so"
-        else:
-            assert obj_format == ".tar"
-            file_name = "deploy_lib.tar"
-        path_lib = temp.relpath(file_name)
-        ext_lib.export_library(path_lib)
-        lib = tvm.runtime.load_module(path_lib)
-        assert lib.type_key == "examplejson"
-        assert lib.imported_modules[0].type_key == "library"
-
-    def verify_multi_c_mod_export():
-        from shutil import which
-
-        if which("gcc") is None:
-            print("Skip test because gcc is not available.")
-
-        for device in ["llvm"]:
-            if not tvm.testing.device_enabled(device):
-                print("skip because %s is not enabled..." % device)
-                return
-
-        synthetic_mod, synthetic_params = relay.testing.synthetic.get_workload()
-        with tvm.transform.PassContext(opt_level=3):
-            _, synthetic_cpu_lib, _ = relay.build_module.build(
-                synthetic_mod, "llvm", params=synthetic_params
-            )
-
-        A = te.placeholder((1024,), name="A")
-        B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-        s = te.create_schedule(B.op)
-        f = tvm.build(s, [A, B], "c", name="myadd")
-        engine_module = generate_engine_module()
-
-        temp = utils.tempdir()
-        file_name = "deploy_lib.so"
-        path_lib = temp.relpath(file_name)
-        synthetic_cpu_lib.import_module(f)
-        synthetic_cpu_lib.import_module(engine_module)
-        kwargs = {"options": ["-O2", "-std=c++17", "-I" + header_file_dir_path.relpath("")]}
-        work_dir = temp.relpath("work_dir")
-        os.mkdir(work_dir)
-        synthetic_cpu_lib.export_library(path_lib, fcompile=False, workspace_dir=work_dir, **kwargs)
-        assert os.path.exists(os.path.join(work_dir, "devc.o"))
-        loaded_lib = tvm.runtime.load_module(path_lib)
-        assert loaded_lib.type_key == "library"
-        # dso modules are merged
-        assert len(loaded_lib.imported_modules) == 0
-
-    for obj_format in [".so", ".tar"]:
-        verify_gpu_mod_export(obj_format)
-        verify_multi_dso_mod_export(obj_format)
-        verify_json_import_dso(obj_format)
-
-    verify_multi_c_mod_export()
-
-
-@pytest.mark.skip("LEGACY-TEST: test to be replaced by TensorIR")
 @tvm.testing.requires_llvm
 def test_import_static_library():
     from tvm import te
@@ -229,9 +28,15 @@ def test_import_static_library():
     # Generate two LLVM modules.
     A = te.placeholder((1024,), name="A")
     B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-    s = te.create_schedule(B.op)
-    mod0 = tvm.build(s, [A, B], "llvm", name="myadd0")
-    mod1 = tvm.build(s, [A, B], "llvm", name="myadd1")
+    irmod0 = tvm.IRModule.from_expr(
+        te.create_prim_func([A, B]).with_attr("global_symbol", "myadd0")
+    )
+    irmod1 = tvm.IRModule.from_expr(
+        te.create_prim_func([A, B]).with_attr("global_symbol", "myadd1")
+    )
+
+    mod0 = tvm.build(irmod0, target="llvm")
+    mod1 = tvm.build(irmod1, target="llvm")
 
     assert mod0.implements_function("myadd0")
     assert mod1.implements_function("myadd1")
diff --git a/tests/python/runtime/test_runtime_module_load.py b/tests/python/runtime/test_runtime_module_load.py
index 33bd281b045f..130a274c354b 100644
--- a/tests/python/runtime/test_runtime_module_load.py
+++ b/tests/python/runtime/test_runtime_module_load.py
@@ -101,12 +101,13 @@ def test_device_module_dump():
     n = tvm.runtime.convert(1024)
     A = te.placeholder((n,), name="A")
     B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-    s = te.create_schedule(B.op)
+
+    sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
     # create iter var and assign them tags.
     num_thread = 8
-    bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
-    s[B].bind(bx, te.thread_axis("blockIdx.x"))
-    s[B].bind(tx, te.thread_axis("threadIdx.x"))
+    bx, tx = sch.split(sch.get_loops("B")[0], factors=[None, num_thread])
+    sch.bind(bx, "blockIdx.x")
+    sch.bind(tx, "threadIdx.x")
 
     def check_device(device):
         dev = tvm.device(device, 0)
@@ -114,9 +115,7 @@ def check_device(device):
             print("Skip because %s is not enabled" % device)
             return
         temp = utils.tempdir()
-        name = "myadd_%s" % device
-
-        f = tvm.build(s, [A, B], device, "llvm", name=name)
+        f = tvm.build(sch.mod, target=device)
 
         path_dso = temp.relpath("dev_lib.so")
         # test cross compiler function
@@ -143,8 +142,7 @@ def check_stackvm(device):
             print("Skip because %s is not enabled" % device)
             return
         temp = utils.tempdir()
-        name = "myadd_%s" % device
-        f = tvm.build(s, [A, B], device, "stackvm", name=name)
+        f = tvm.build(sch.mod, target=tvm.target.Target(device, host="stackvm"))
         path_dso = temp.relpath("dev_lib.stackvm")
         f.export_library(path_dso)
         f1 = tvm.runtime.load_module(path_dso)
diff --git a/tests/python/runtime/test_runtime_module_property.py b/tests/python/runtime/test_runtime_module_property.py
index bd71e856d917..97c51ff93996 100644
--- a/tests/python/runtime/test_runtime_module_property.py
+++ b/tests/python/runtime/test_runtime_module_property.py
@@ -33,12 +33,7 @@ def create_csource_module():
 def create_llvm_module():
     A = te.placeholder((1024,), name="A")
     B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-    s = te.create_schedule(B.op)
-    return tvm.build(s, [A, B], "llvm", name="myadd0")
-
-
-def create_aot_module():
-    return tvm.get_global_func("relay.build_module._AOTExecutorCodegen")()
+    return tvm.build(te.create_prim_func([A, B]), target="llvm")
 
 
 def test_property():
@@ -52,11 +47,6 @@ def test_property():
         expected={"is_binary_serializable": False, "is_runnable": True, "is_dso_exportable": True},
     )
 
-    checker(
-        create_aot_module(),
-        expected={"is_binary_serializable": False, "is_runnable": True, "is_dso_exportable": False},
-    )
-
 
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/runtime/test_runtime_rpc.py b/tests/python/runtime/test_runtime_rpc.py
index 31cab2819df1..717cc8fffa05 100644
--- a/tests/python/runtime/test_runtime_rpc.py
+++ b/tests/python/runtime/test_runtime_rpc.py
@@ -73,8 +73,7 @@ def test_bigendian_rpc():
     def verify_rpc(remote, target, shape, dtype):
         A = te.placeholder(shape, dtype=dtype)
         B = te.compute(A.shape, lambda i: A[i] + tvm.tir.const(1, A.dtype))
-        s = te.create_schedule(B.op)
-        f = tvm.build(s, [A, B], target, name="myadd")
+        f = tvm.build(te.create_prim_func([A, B]), target=target)
 
         dev = remote.cpu(0)
         a = tvm.nd.array(np.random.randint(0, 256, size=shape).astype(A.dtype), device=dev)
diff --git a/tests/python/runtime/test_runtime_trace.py b/tests/python/runtime/test_runtime_trace.py
index 08f56b56c8c7..58d1a079e46b 100644
--- a/tests/python/runtime/test_runtime_trace.py
+++ b/tests/python/runtime/test_runtime_trace.py
@@ -23,8 +23,7 @@ def test_trace_default_action():
     n = 2
     x = te.placeholder((n, n, n), name="X", dtype="float32")
     y = te.compute(x.shape, lambda i, j, k: tvm.tir.trace([i, j, k, x[i][j][k]]))
-    s = te.create_schedule(y.op)
-    f = tvm.build(s, [x, y], target="llvm")
+    f = tvm.build(te.create_prim_func([x, y]), target="llvm")
     xnd = tvm.nd.array(np.ones((n, n, n), dtype=x.dtype))
     ynd = tvm.nd.array(np.zeros((n, n, n), dtype=y.dtype))
     f(xnd, ynd)
@@ -44,8 +43,7 @@ def check_assign(dtype):
         z = te.compute(
             x.shape, lambda i, j, k: tvm.tir.trace([y[i][j][k]], "tvm.tir.trace_callback2")
         )
-        s = te.create_schedule(z.op)
-        f = tvm.build(s, [x, y, z], "llvm")
+        f = tvm.build(te.create_prim_func([x, y, z]), "llvm")
 
         xnd = tvm.nd.array(np.ones((n, n, n), dtype=x.dtype))
         ynd = tvm.nd.array(np.zeros((n, n, n), dtype=y.dtype))
@@ -74,8 +72,7 @@ def check_expr_sum(dtype):
             lambda i, j, k: tvm.tir.trace([a[i][j][k]], "tvm.tir.trace_callback3")
             + tvm.tir.trace([b[i][j][k]], "tvm.tir.trace_callback3"),
         )
-        s = te.create_schedule(c.op)
-        f = tvm.build(s, [a, b, c])
+        f = tvm.build(te.create_prim_func([a, b, c]))
         xnd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=a.dtype)))
         ynd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=b.dtype)))
         znd = tvm.nd.array(np.zeros((n, n, n), dtype=c.dtype))
@@ -105,8 +102,7 @@ def check_expr_sum(dtype):
             + tvm.tir.trace([i, j, k, d[i][j][k]], "tvm.tir.trace_silent")
             + tvm.tir.trace([i, j, k, e[i][j][k]], "tvm.tir.trace_silent"),
         )
-        s = te.create_schedule(c.op)
-        f = tvm.build(s, [a, b, d, e, c])
+        f = tvm.build(te.create_prim_func([a, b, d, e, c]))
         a_nd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=a.dtype)))
         b_nd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=b.dtype)))
         d_nd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=d.dtype)))
@@ -135,8 +131,7 @@ def check_expr_sum_custom(dtype):
             lambda i, j: tvm.tir.trace([a[i][j]], "tvm.tir.trace_callback4")
             + tvm.tir.trace([b[i][j]], "tvm.tir.trace_callback4"),
         )
-        s = te.create_schedule(c.op)
-        f = tvm.build(s, [a, b, c])
+        f = tvm.build(te.create_prim_func([a, b, c]))
         npa = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=a.dtype)
         npb = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=a.dtype)
         xnd = tvm.nd.array(npa)
@@ -163,8 +158,7 @@ def check_assign(dtype):
         x = te.placeholder((n,), name="X", dtype=dtype)
         y = te.compute(x.shape, lambda i: tvm.tir.trace([x[i]], "tvm.tir.trace_change_int_first"))
         z = te.compute(x.shape, lambda i: tvm.tir.trace([y[i]], "tvm.tir.trace_change_int_second"))
-        s = te.create_schedule(z.op)
-        f = tvm.build(s, [x, y, z], "llvm")
+        f = tvm.build(te.create_prim_func([x, y, z]))
 
         xnd = tvm.nd.array(np.ones((n,), dtype=x.dtype))
         ynd = tvm.nd.array(np.zeros((n,), dtype=y.dtype))
@@ -195,8 +189,7 @@ def check_assign(dtype):
         z = te.compute(
             x.shape, lambda i: tvm.tir.trace([y[i]], "tvm.tir.trace_change_float_second")
         )
-        s = te.create_schedule(z.op)
-        f = tvm.build(s, [x, y, z], "llvm")
+        f = tvm.build(te.create_prim_func([x, y, z]), target="llvm")
 
         xnd = tvm.nd.array(np.ones((n,), dtype=x.dtype))
         ynd = tvm.nd.array(np.zeros((n,), dtype=y.dtype))
diff --git a/tests/python/target/test_target_target.py b/tests/python/target/test_target_target.py
index cda228939f31..b99834aef35a 100644
--- a/tests/python/target/test_target_target.py
+++ b/tests/python/target/test_target_target.py
@@ -578,7 +578,7 @@ def func():
     func = func.with_attr("Target", target)
     target2 = tvm.ir.load_json(tvm.ir.save_json(target))
     mod = tvm.IRModule({"main": func})
-    lib = tvm.build({target2: mod}, target_host=target)
+    lib = tvm.build(mod, target=target2)
     lib["func"]()
 
 
diff --git a/tests/python/te/test_te_autodiff.py b/tests/python/te/test_te_autodiff.py
deleted file mode 100644
index a5995ff0337f..000000000000
--- a/tests/python/te/test_te_autodiff.py
+++ /dev/null
@@ -1,351 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import numpy as np
-import pytest
-import tvm
-from tvm import te, topi
-from tvm.testing import assert_allclose
-from tvm.topi.utils import get_const_tuple
-
-
-def check_grad(
-    out, inputs, args=[], data_range=(-10, 10), desired_grads=None, assert_no_jacobian=True
-):
-    inputs = inputs if isinstance(inputs, list) else [inputs]
-
-    def check_device(device, host="llvm"):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(host):
-            return
-
-        sout = te.create_schedule(out.op)
-        mout = tvm.build(sout, [out] + inputs + args)
-        out_shape = get_const_tuple(out.shape)
-
-        l, h = data_range
-        input_data = [
-            tvm.nd.array(
-                np.random.uniform(l, h, size=get_const_tuple(input.shape)).astype(input.dtype)
-            )
-            for input in inputs
-        ]
-        arg_vals = [
-            tvm.nd.array(np.random.uniform(l, h, size=get_const_tuple(arg.shape)).astype(arg.dtype))
-            for arg in args
-        ]
-
-        ones = topi.full_like(out, 1.0)
-        # we provide head to sum and reduce the output dimension,
-        # which equals to grad(out.sum(), inputs)
-        grads = te.gradient(out, inputs, head=ones)
-        grad_sched = te.create_schedule([grad.op for grad in grads])
-        mgrad = tvm.build(grad_sched, list(grads) + inputs + args)
-        if assert_no_jacobian:
-            # TODO(yzhliu): it is better to visit the expression and do assertion
-            lowered_ir = str(tvm.lower(grad_sched, list(grads) + inputs + args, simple_mode=True))
-            assert "jacobian" not in lowered_ir, lowered_ir
-
-        grad_data = [tvm.nd.empty(get_const_tuple(i.shape), g.dtype) for i, g in zip(inputs, grads)]
-
-        mgrad(*grad_data, *input_data, *arg_vals)
-        g_res = [g.numpy() for g in grad_data]
-
-        if desired_grads:
-            assert isinstance(desired_grads, list)
-            for actual, desired in zip(g_res, desired_grads):
-                assert_allclose(actual, desired, rtol=0.1, atol=1e-2)
-        else:
-
-            def forward(*in_data):
-                out_data = tvm.nd.empty(out_shape, out.dtype)
-                mout(out_data, *[tvm.nd.array(d) for d in list(in_data)])
-                return out_data.numpy().sum()
-
-            tvm.testing.check_numerical_grads(
-                forward, [d.numpy() for d in input_data + arg_vals], g_res
-            )
-
-    check_device("cpu")
-
-
-def test_basic_operation():
-    np.random.seed(0)
-    shape = (10, 10)
-    x = te.var("x", dtype="float32")
-    k = te.reduce_axis((0, 10), name="k")
-    l = te.reduce_axis((0, 10), name="l")
-    A0 = te.placeholder(shape, name="A0")
-    A1 = te.placeholder(shape, name="A1")
-    zeros = np.zeros(shape)
-
-    B = te.compute(shape, lambda i, j: A0[i, j], name="B")
-    check_grad(B, [A0])
-
-    B = te.compute(shape, lambda i, j: A0[i, j] + A1[i, j], name="B")
-    check_grad(B, [A0, A1])
-
-    B = te.compute(shape, lambda i, j: A0[i, j] + A0[j, i], name="B")
-    check_grad(B, A0)
-
-    B = te.compute(shape, lambda i, j: te.floor(A0[i, j]), name="B")
-    check_grad(B, A0, desired_grads=[zeros])
-
-    B = te.compute(shape, lambda i, j: te.ceil(A0[i, j]), name="B")
-    check_grad(B, A0, desired_grads=[zeros])
-
-    B = te.compute(shape, lambda i, j: te.trunc(A0[i, j]), name="B")
-    check_grad(B, A0, desired_grads=[zeros])
-
-    B = te.compute(shape, lambda i, j: te.round(A0[i, j]), name="B")
-    check_grad(B, A0, desired_grads=[zeros])
-
-    B = te.compute(shape, lambda i, j: A0[i, j] + te.exp(A0[j, i]), name="B")
-    check_grad(B, A0)
-
-    B = te.compute(shape, lambda i, j: te.log(0.1 + te.abs(A0[i, j] + te.exp(A0[j, i]))), name="B")
-    check_grad(B, A0)
-
-    B = te.compute(shape, lambda i, j: te.sigmoid(A0[i, j] * A0[i, j] * A0[j, i]), name="B")
-    check_grad(B, A0)
-
-    B = te.compute(shape, lambda i, j: te.tanh(A0[i, j] * A0[i, j] * A0[j, i]), name="B")
-    check_grad(B, A0)
-
-    B = te.compute(shape, lambda i, j: te.sqrt(A0[i, j] * A0[i, j] * A0[j, i]), name="B")
-    check_grad(B, A0, data_range=(0.1, 10))
-
-    B = te.compute(shape, lambda i, j: te.power(te.abs(A0[i, j]), A0[j, i]), name="B")
-    check_grad(B, A0, data_range=(-4, 4))
-
-    B = te.compute(shape, lambda i, j: A0[i, j] * A0[j, i], name="B")
-    check_grad(B, A0)
-
-    B = te.compute((10,), lambda i: te.sum(A0[i, k] * A0[k, i], axis=k), name="B")
-    check_grad(B, A0)
-
-    B = te.compute(shape, lambda i, j: te.sum(A0[i, k] * A0[k, i] + 5, axis=k), name="B")
-    check_grad(B, A0)
-
-    B = te.compute(shape, lambda i, j: te.max(A0[i, k] * A0[k, j] + 5, axis=k), name="B")
-    check_grad(B, A0)
-
-    B = te.compute(shape, lambda i, j: A0[i, j] * (A1[j, i] + A0[j, i]), name="B")
-    check_grad(B, [A0, A1])
-
-    B = te.compute(
-        shape, lambda i, j: te.sum(A0[k, k] - A0[te.min(j + k, 9), j] * A0[i, k], axis=k), name="B"
-    )
-    check_grad(B, A0)
-
-    def fcombine(x, y):
-        return x * y
-
-    def fidentity(t0):
-        return tvm.tir.const(1, t0)
-
-    prod = te.comm_reducer(fcombine, fidentity, name="prod")
-    B = te.compute((10, 10), lambda i, j: prod(A0[i, k] + A0[k, i], axis=k), name="B")
-    check_grad(B, A0)
-
-    X = te.placeholder((10,), name="X")
-    A = te.compute((10,), lambda i: X[i] + X[9 - i])
-    B = te.compute((10,), lambda i: X[i] * X[9 - i])
-    Y = topi.tensordot(A, B, 1)
-    check_grad(Y, X)
-
-    X = te.placeholder((3, 3), name="X")
-    Y = topi.einsum("ii->i", (X))
-    check_grad(Y, X)
-
-
-def test_topi():
-    X = te.placeholder((1, 2, 4, 4), name="X")
-    W = te.placeholder((5, 2, 3, 3), name="W")
-    W1 = te.placeholder((2, 5, 3, 3), name="W1")
-    W2 = te.placeholder((1,), name="W2")
-
-    R = topi.nn.conv2d(X, W, 1, 1, 1)
-    check_grad(R, [X, W])
-
-    R1 = topi.nn.conv2d(topi.nn.relu(R), W1, 1, 0, 1)
-    check_grad(R1, [X, W, W1])
-
-    R = topi.broadcast_to(W2, (5, 2, 3, 3))
-    check_grad(R, [W2])
-
-    R = topi.nn.conv2d(X, topi.broadcast_to(W2, (5, 2, 3, 3)), 1, 1, 1)
-    check_grad(R, [X, W2])
-
-    R = topi.nn.pool2d(X, [2, 2], [1, 1], [2, 2], [0, 0, 0, 0], "avg")
-    check_grad(R, X)
-
-    R = topi.nn.pool2d(X, [2, 2], [1, 1], [2, 2], [0, 0, 0, 0], "max")
-    check_grad(R, X)
-
-    X = te.placeholder((1, 2, 5, 5), name="X")
-    R = topi.reshape(X, (1, 32))
-    check_grad(R, [X])
-
-    X = te.placeholder((1, 2, 5, 5), name="X")
-    W = te.placeholder((2, 2, 3, 3), name="W")
-
-    S = topi.reshape(X, (1, 50))
-    check_grad(S, [X])
-
-    R = X + topi.nn.conv2d(X + topi.nn.conv2d(X, W, 1, 1, 1), W, 1, 1, 1)
-    check_grad(R, [X, W])
-
-    S = topi.nn.softmax(topi.reshape(R, (1, 50)))
-    check_grad(S, [X, W])
-
-    S = topi.sigmoid(topi.reshape(R, (1, 50)))
-    check_grad(S, [X, W])
-
-    S = topi.tanh(topi.reshape(R, (1, 50)))
-    check_grad(S, [X, W])
-
-    S = topi.nn.log_softmax(topi.reshape(R, (1, 50)))
-    check_grad(S, [X, W])
-    check_grad(S, [W], [X])
-
-    X = te.placeholder((1, 2, 3, 5), name="X")
-    Y = te.placeholder((1, 2, 7, 5), name="Y")
-    S = topi.concatenate((X, Y), 2)
-    check_grad(S, [X, Y])
-
-    X = te.placeholder((1, 2, 6, 5), name="X")
-    (S, R) = topi.split(X, 2, 2)
-    check_grad(S, [X])
-    check_grad(R, [X])
-    R1 = topi.concatenate((S, R), 2)
-    check_grad(R1, [X])
-    R2 = topi.concatenate((R, S), 2)
-    check_grad(R2, [X])
-
-    X = te.placeholder((4, 5), name="X")
-    I = te.placeholder((100,), name="I", dtype="int32")
-    R = topi.take(X, topi.abs(I))
-    check_grad(R, [X], [I])
-
-    W = te.placeholder((5, 5), name="W")
-    exps = topi.exp(topi.nn.dense(X, W))
-    sumexps = topi.sum(exps, axis=-1, keepdims=True)
-    R = exps / sumexps
-    check_grad(R, [X, W], data_range=(-1, 1))
-
-
-def test_stride_dilation():
-    X = te.placeholder((1, 2, 10, 10), name="X")
-    W = te.placeholder((2, 2, 1, 1), name="W")
-
-    Y = topi.nn.conv2d(X, W, 1, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 1, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 1, 0, 3)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 3)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 3)
-    check_grad(Y, [X, W])
-
-    W = te.placeholder((2, 2, 2, 2), name="W")
-
-    Y = topi.nn.conv2d(X, W, 1, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 1, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 1, 0, 3)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 3)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 3)
-    check_grad(Y, [X, W])
-
-    W = te.placeholder((2, 2, 3, 3), name="W")
-
-    Y = topi.nn.conv2d(X, W, 1, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 1, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 1, 0, 3)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 3)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 3)
-    check_grad(Y, [X, W])
-
-    Y = topi.nn.pool2d(X, [1, 1], [1, 1], [1, 1], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-    Y = topi.nn.pool2d(X, [1, 1], [1, 1], [2, 2], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-    Y = topi.nn.pool2d(X, [1, 1], [1, 1], [3, 3], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-    Y = topi.nn.pool2d(X, [2, 2], [1, 1], [1, 1], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-    Y = topi.nn.pool2d(X, [2, 2], [1, 1], [2, 2], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-    Y = topi.nn.pool2d(X, [2, 2], [1, 1], [3, 3], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-    Y = topi.nn.pool2d(X, [3, 3], [1, 1], [1, 1], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-    Y = topi.nn.pool2d(X, [3, 3], [1, 1], [2, 2], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-    Y = topi.nn.pool2d(X, [3, 3], [1, 1], [3, 3], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-
-
-@pytest.mark.xfail
-def test_reduction_init():
-    np.random.seed(0)
-    shape = (10, 10)
-    k = te.reduce_axis((0, 10), name="k")
-    A0 = te.placeholder(shape, name="A0")
-
-    B = te.compute((10,), lambda i: te.sum(A0[i, k] * A0[k, i], axis=k, init=0.0), name="B")
-    check_grad(B, A0)
-
-
-if __name__ == "__main__":
-    test_basic_operation()
-    test_topi()
-    test_stride_dilation()
diff --git a/tests/python/te/test_te_build_lower.py b/tests/python/te/test_te_build_lower.py
deleted file mode 100644
index 50d5119b43a0..000000000000
--- a/tests/python/te/test_te_build_lower.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-
-
-def test_lower_rfactor():
-    n = te.size_var("n")
-    m = te.size_var("m")
-    A = te.placeholder((n, m), name="A")
-    k = te.reduce_axis((0, m), "k")
-    B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
-    s = te.create_schedule(B.op)
-    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
-    BF = s.rfactor(B, ki)
-    xo, xi = s[B].split(s[B].op.axis[0], factor=32)
-    s[B.op].bind(xo, te.thread_axis("blockIdx.x"))
-    s[B.op].bind(xi, te.thread_axis("threadIdx.y"))
-    s[B].bind(s[B].op.reduce_axis[0], te.thread_axis("threadIdx.x"))
-    s[BF].compute_at(s[B], s[B].op.reduce_axis[0])
-    fapi = tvm.lower(s, [A, B])
-
-
-def test_dependent_output_shape():
-    n, m, x = te.size_var("n"), te.size_var("m"), te.size_var("x")
-    A = te.placeholder((n, m))
-    B = te.compute((m, n // x), lambda i, j: A[i, j], name="B")
-    s = te.create_schedule(B.op)
-    mod = tvm.build(s, [A, B, x])
-
-
-def test_split_uneven_unique_likely():
-    a = te.placeholder(
-        (16, 16),
-    )
-    b = te.placeholder(
-        (16, 16),
-    )
-    c = te.compute((16, 16), lambda x, y: a[x, y] + b[x, y])
-
-    x, y = c.op.axis
-    sch = te.create_schedule(c.op)
-    xo, xi = sch[c].split(x, 5)
-    stmt = tvm.lower(sch, [a, b, c])["main"].body
-    assert isinstance(stmt.body.body, tvm.tir.stmt.IfThenElse)
-
-
-if __name__ == "__main__":
-    test_lower_rfactor()
-    test_dependent_output_shape()
-    test_split_uneven_unique_likely()
diff --git a/tests/python/te/test_te_group.py b/tests/python/te/test_te_group.py
deleted file mode 100644
index e57040abc085..000000000000
--- a/tests/python/te/test_te_group.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test group effect"""
-import tvm
-from tvm import te
-
-
-def test_scan_group():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    s_state = te.placeholder((m, n))
-    s_init = te.compute((1, n), lambda _, i: x[0, i])
-
-    s_update1 = te.compute((m, n), lambda t, i: s_state[t - 1, i] + x[t, i])
-    s_update2 = te.compute((m, n), lambda t, i: s_update1[t, i] + 1)
-    s_update3 = te.compute((m, n), lambda t, i: s_update2[t, i] + 1)
-    res = tvm.te.scan(s_init, s_update3, s_state, inputs=x)
-
-    s = te.create_schedule(res.op)
-    assert s[s_update1].group is not None
-    assert s[s_update2].group == s[s_update1].group
-    # Assign within group, is valid
-    s[s_update1].compute_at(s[s_update2], s_update2.op.axis[1])
-    # create a new group, for [s_update2 and s_update1]
-    g2 = s.create_group(outputs=s_update2, inputs=[s_state, x])
-    assert g2.group is not None
-    assert g2.group == s[s_update3].group
-    assert s[s_update2].group == g2
-    assert s[s_update1].group == g2
-    g2.compute_at(s[s_update3], s_update3.op.axis[1])
-    assert g2.attach_stage == s[s_update3]
-    try:
-        # compute outside group error.
-        s[s_update2].compute_at(s[s_init], s_init.op.axis[0])
-        assert False
-    except tvm.error.TVMError:
-        pass
-
-
-def test_compute_group():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    x1 = te.compute(x.shape, lambda *i: x(*i) + 1, name="x1")
-    x2 = te.compute(x.shape, lambda *i: x1(*i) + 2, name="x2")
-    s = te.create_schedule(x2.op)
-    g = s.create_group(outputs=x1, inputs=x, include_inputs=True)
-    assert s[x1].group == g
-    assert s[x].group == g
-    g.compute_at(s[x2], x2.op.axis[1])
-    assert g.attach_stage == s[x2]
-    assert g.num_child_stages == 2
-
-
-def test_nest_group():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    x1 = te.compute(x.shape, lambda *i: x(*i) + 1, name="x1")
-    x2 = te.compute(x.shape, lambda *i: x1(*i) + 2, name="x2")
-    s = te.create_schedule(x2.op)
-    g1 = s.create_group(outputs=x1, inputs=x)
-    g2 = s.create_group(outputs=x1, inputs=x, include_inputs=True)
-    assert set(s.groups) == set([g1, g2])
-    assert s[x].group == g2
-    assert s[x1].group == g1
-    assert g1.group == g2
-    assert g2.num_child_stages == 2
-    assert g1.num_child_stages == 1
-
-
-if __name__ == "__main__":
-    test_nest_group()
-    test_compute_group()
-    test_scan_group()
diff --git a/tests/python/te/test_te_hybrid_script.py b/tests/python/te/test_te_hybrid_script.py
deleted file mode 100644
index 862e80ffb6ce..000000000000
--- a/tests/python/te/test_te_hybrid_script.py
+++ /dev/null
@@ -1,872 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm, inspect, sys, traceback, numpy, pytest, types, os
-
-from tvm import te
-from tvm.contrib import utils
-from tvm.te.hybrid import script
-from tvm.te.hybrid.runtime import HYBRID_GLOBALS
-
-import tvm.testing
-
-
-@pytest.mark.skip
-def run_and_check(func, args, var_dict={}, target="llvm", sch=None, outs=None):
-    def tvm_val_2_py_val(val):
-        val = tvm.tir.stmt_functor.substitute(val, var_dict)
-        val = tvm.arith.Analyzer().simplify(val)
-        assert isinstance(val, (tvm.tir.IntImm,))
-        return val.value
-
-    dev = tvm.device(target, 0)
-    op = None
-
-    if sch is None:
-        outs = func(*tuple(tvm.runtime.convert(i) if isinstance(i, list) else i for i in args))
-        op = outs[0].op if isinstance(outs, list) else outs.op
-        sch = te.create_schedule(op)
-    else:
-        assert outs is not None
-        assert isinstance(outs, list)
-        op = outs[0].op
-
-    emu_args = []
-    nd_args = []
-    for i in args:
-        if isinstance(i, te.tensor.Tensor):
-            shape = [tvm_val_2_py_val(j) for j in i.shape]
-            emu_args.append(numpy.random.randn(*shape).astype(i.dtype))
-            nd_args.append(tvm.nd.array(emu_args[-1], dev))
-        elif isinstance(i, tvm.tir.Var):
-            emu_args.append(tvm_val_2_py_val(i))
-            nd_args.append(emu_args[-1])
-        else:
-            assert isinstance(i, list)
-            emu_args.append(numpy.array(i))
-
-    compile_args = [i for i in args if isinstance(i, (te.tensor.Tensor, tvm.tir.Var))] + (
-        outs if isinstance(outs, list) else [outs]
-    )
-    module = tvm.build(sch, compile_args, target=target)
-    assert module
-
-    out_tensors = []
-    for i in range(op.num_outputs):
-        output = op.output(i)
-        shape = [tvm_val_2_py_val(j) for j in output.shape]
-        nd_args.append(tvm.nd.array(numpy.zeros(shape).astype(output.dtype), dev))
-        out_tensors.append(nd_args[-1])
-
-    ref_data = func(*emu_args)
-    if isinstance(ref_data, numpy.ndarray):
-        ref_data = [ref_data]
-
-    module(*nd_args)
-
-    for nd, np in zip(out_tensors, ref_data):
-        tvm.testing.assert_allclose(nd.numpy(), np, rtol=1e-5, atol=1e-5)
-
-    module_args = [i for i in args if isinstance(i, (te.tensor.Tensor, tvm.tir.Var))]
-    module_outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    h_module = te.hybrid.build(sch, module_args, module_outs)
-
-    return h_module, module_args, module_outs
-
-
-@script
-def outer_product(n, m, a, b):
-    """This is a simple outer product.
-    Actually this function is not required to be documented.
-    I write this docstring to test skipping docstring functionality.
-    """
-    c = output_tensor((n, m), a.dtype)
-    for i in range(n):
-        for j in range(m):
-            assert i < n and j < m, "index out of range!"
-            c[i, j] = a[i] * b[j]
-    return c
-
-
-@tvm.testing.skip_if_wheel_test
-# Test global function
-# Test bridge between frontend and backend
-def test_outer_product():
-    n = te.size_var("n")
-    m = te.size_var("m")
-    a = te.placeholder((n,), name="a")
-    b = te.placeholder((m,), name="b")
-
-    try:
-        c = outer_product(n, m, a, b)
-        ir = c.op.body
-    except IOError as err:
-        assert sys.version_info[0] == 2 and str(err) == "could not get source code"
-        return
-
-    # Check for i in (0, n)
-    assert isinstance(ir, tvm.tir.For)
-    assert ir.loop_var.name == "i"
-    assert ir.min.value == 0
-    assert ir.extent.name == "n"
-    ibody = ir.body
-    assert isinstance(ibody, tvm.tir.For)
-    # Check for j in (0, m)
-    assert ibody.loop_var.name == "j"
-    assert ibody.min.value == 0
-    assert ibody.extent.name == "m"
-    # Check loop body
-    jblock = ibody.body
-    assert isinstance(jblock, tvm.tir.SeqStmt)
-    jbody = jblock[0]
-    assert isinstance(jbody, tvm.tir.AssertStmt)
-    assert isinstance(jbody.message, tvm.tir.StringImm)
-    assert jbody.message.value == "index out of range!"
-    jbody = jblock[1]
-    assert isinstance(jbody, tvm.tir.ProducerStore)
-    assert jbody.producer.op.name == "c"
-    assert len(jbody.indices) == 2
-    assert jbody.indices[0].name == "i"
-    assert jbody.indices[1].name == "j"
-    assert isinstance(jbody.value, tvm.tir.Mul)
-    mul = jbody.value
-    assert isinstance(mul.a, tvm.tir.ProducerLoad)
-    assert mul.a.producer.name == "a"
-    assert mul.b.producer.name == "b"
-
-    func, ins, outs = run_and_check(outer_product, [n, m, a, b], {n: 99, m: 101})
-    temp = utils.tempdir()
-    path = temp.relpath("%s.py" % func.name)
-    func.save(path)
-    func_ = te.hybrid.HybridModule()
-    func_.load(path)
-    run_and_check(func_, ins, {n: 99, m: 101}, outs=outs)
-
-    for key, _ in HYBRID_GLOBALS.items():
-        assert key not in globals().keys()
-        assert key not in outer_product.__globals__.keys()
-
-
-@tvm.testing.skip_if_wheel_test
-# Test local function
-# Test allocation of local variable
-def test_fanout():
-    @script
-    def fanout(n, a):
-        three = 3.0
-        b = output_tensor((a.shape[0] - 3,), a.dtype)
-        for i in range(a.shape[0] - 3):
-            sigma = 0.0
-            for j in range(3):
-                sigma += a[i + j]
-            sigma = sigma / three
-            b[i] = sigma
-        return b
-
-    n = te.size_var("n")
-    a = te.placeholder((n,), "float32", name="a")
-    try:
-        b = fanout(n, a)
-        ir = b.op.body
-    except IOError as err:
-        assert sys.version_info[0] == 2 and str(err) == "could not get source code"
-        return
-
-    # Check for i in (0, n-3)
-    assert isinstance(ir, tvm.tir.For)
-    assert ir.loop_var.name == "i"
-    assert ir.min.value == 0
-    tvm.ir.assert_structural_equal(ir.extent, n - 3)
-    # Check loopbody
-    abody = ir.body
-    assert isinstance(abody, tvm.tir.ProducerRealize)
-    assert abody.bounds[0].min.value == 0
-    assert abody.bounds[0].extent.value == 1
-    assert abody.producer.op.name == "sigma"
-    # Check i loop body
-    rbody = abody.body
-    assert isinstance(rbody[0], tvm.tir.ProducerStore)
-    assert rbody[0].producer.op.name == "sigma"
-    assert len(rbody[0].indices) == 1
-    assert rbody[0].indices[0].value == 0
-    # Check fanout loop
-    jloop = rbody[1]
-    assert jloop.loop_var.name == "j"
-    assert jloop.min.value == 0
-    assert jloop.extent.value == 3
-    jbody = jloop.body
-    assert isinstance(jbody, tvm.tir.ProducerStore)
-    assert len(jbody.indices) == 1
-    assert jbody.indices[0].value == 0
-    assert jbody.producer.op.name == "sigma"
-    assert isinstance(jbody.value, tvm.tir.Add)
-    value = jbody.value
-    assert isinstance(value.a, tvm.tir.ProducerLoad)
-    assert value.a.producer.name == "sigma"
-    assert len(value.a.indices) == 1
-    assert value.a.indices[0].value == 0
-    assert value.b.producer.name == "a"
-    assert len(value.b.indices) == 1
-    tvm.ir.assert_structural_equal(value.b.indices[0], ir.loop_var + jloop.loop_var)
-    divide = rbody[2]
-    assert isinstance(divide, tvm.tir.ProducerStore)
-    assert len(divide.indices) == 1
-    assert divide.indices[0].value == 0
-    value = divide.value
-    assert isinstance(value, tvm.tir.Mul)
-    assert value.a.producer.name == "sigma"
-    assert len(value.a.indices) == 1
-    assert value.a.indices[0].value == 0
-    assert abs(value.b.value - (1 / 3.0)) < 1e-5
-    write = rbody[3]
-    assert isinstance(write, tvm.tir.ProducerStore)
-    assert write.producer.op.name == "b"
-    assert write.value.producer.name == "sigma"
-    assert len(write.value.indices) == 1
-    assert write.value.indices[0].value == 0
-
-    func, ins, outs = run_and_check(fanout, [n, a], {n: 10})
-    run_and_check(func, ins, {n: 10}, outs=outs)
-
-
-def test_looptype():
-    @script
-    def looptype(a, b, c):
-        d = output_tensor((16,), "int32")
-        e = output_tensor((16,), "int32")
-        f = output_tensor((16,), "int32")
-        for i in parallel(16):
-            d[i] = a[i]
-        for j in vectorize(16):
-            e[j] = b[j]
-        for k in unroll(16):
-            f[k] = c[k]
-        return d, e, f
-
-    a = te.placeholder((16,), name="a", dtype="int32")
-    b = te.placeholder((16,), name="b", dtype="int32")
-    c = te.placeholder((16,), name="c", dtype="int32")
-    try:
-        d, e, f = looptype(a, b, c)
-        ir = d.op.body
-    except:
-        return
-    iloop = ir[0]
-    jloop = ir[1]
-    kloop = ir[2]
-    assert iloop.kind == tvm.tir.ForKind.PARALLEL
-    assert jloop.kind == tvm.tir.ForKind.VECTORIZED
-    assert kloop.kind == tvm.tir.ForKind.UNROLLED
-
-    func, ins, outs = run_and_check(looptype, [a, b, c])
-    run_and_check(func, ins, outs=outs)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_if():
-    @script
-    def if_then_else(a):
-        b = output_tensor((10,), "int32")
-        c = output_tensor((10,), "int32")
-        for i in range(10):
-            if i % 2 == 0:
-                c[i] = a[i]
-            else:
-                c[i] = b[i]
-        for i in unroll(10):
-            b[i] = -1 if i % 2 == 0 else 1
-        return b, c
-
-    a = te.placeholder((10,), dtype="int32", name="a")
-
-    func, ins, outs = run_and_check(if_then_else, [a])
-    run_and_check(func, ins, outs=outs)
-
-    @script
-    def if_triple_condition(a):
-        b = output_tensor((10,), "int32")
-        for i in range(10):
-            if 0 <= i < 5:
-                b[i] = a[i]
-            else:
-                b[i] = a[i] + 1
-        return b
-
-    func, ins, outs = run_and_check(if_triple_condition, [a])
-    run_and_check(func, ins, outs=outs)
-
-    @script
-    def if_and(a):
-        b = output_tensor((10,), "int32")
-        for i in range(10):
-            if i >= 0 and i < 5:
-                b[i] = a[i]
-            else:
-                b[i] = a[i] + 1
-        return b
-
-    func, ins, outs = run_and_check(if_and, [a])
-    run_and_check(func, ins, outs=outs)
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_bind():
-    @script
-    def vec_add(a, b):
-        c = output_tensor((1000,), "float32")
-        for tx in bind("threadIdx.x", 1000):
-            c[tx] = a[tx] + b[tx]
-        return c
-
-    a = te.placeholder((1000,), dtype="float32", name="a")
-    b = te.placeholder((1000,), dtype="float32", name="b")
-    func, ins, outs = run_and_check(vec_add, [a, b], target="cuda")
-    run_and_check(func, ins, outs=outs, target="cuda")
-
-    @script
-    def raw(a, b):
-        c = output_tensor((1000,), "float32")
-        for i in range(1000):
-            c[i] = a[i] + b[i]
-        return c
-
-    c = raw(a, b)
-    sch = te.create_schedule(c.op)
-    x = te.thread_axis("threadIdx.x")
-    sch[c].bind(c.op.axis[0], x)
-    func, ins, outs = run_and_check(raw, [a, b], sch=sch, outs=[c], target="cuda")
-    run_and_check(func, ins, outs=outs, target="cuda")
-
-    @te.hybrid.script
-    def foo(a):
-        c = output_tensor((a.shape[0],), a.dtype)
-        total = allocate((1,), a.dtype, "local")
-        len_i = a.shape[0]
-        len_j = a.shape[1]
-        for i in bind("threadIdx.x", len_i):
-            total[0] = 0.0
-            for k in const_range(len_j):
-                total[0] += a[i, k]
-            c[i] = total[0]
-
-        return c
-
-    a = te.placeholder((8, 4), "float32")
-    c = foo(a)
-    s = te.create_schedule(c.op)
-    ir = tvm.lower(s, [a, c])
-
-    func, ins, outs = run_and_check(foo, [a], target="cuda")
-    run_and_check(func, ins, outs=outs, target="cuda")
-
-    @te.hybrid.script
-    def max_threads(a):
-        b = output_tensor(a.shape, a.dtype)
-        n = a.shape[0]
-        m = max_num_threads(True)
-        for i in bind("threadIdx.x", m):
-            for j in bind("blockIdx.x", ceil_div(n, m)):
-                if i * m + j < n:
-                    b[i * m + j] = a[i * m + j] + a[i * m + j]
-        return b
-
-    a = te.placeholder((10000,), "float32")
-    with tvm.target.Target("cuda"):
-        func, ins, outs = run_and_check(max_threads, [a], target="cuda")
-        run_and_check(func, ins, outs=outs, target="cuda")
-
-
-@tvm.testing.skip_if_wheel_test
-def test_math_intrin():
-    @script
-    def intrin_real(a):
-        b = output_tensor((8,), "float32")
-        b[0] = sqrt(a[0])
-        b[1] = log(a[1])
-        b[2] = exp(a[2])
-        b[3] = sigmoid(a[3])
-        b[4] = power(a[4], a[5])
-        b[5] = tanh(a[5])
-        b[6] = min(a[4], a[5])
-        b[7] = max(a[5], a[6])
-        return b
-
-    a8 = te.placeholder((8,), dtype="float32", name="a")
-    b8 = intrin_real(a8)
-    sch = te.create_schedule(b8.op)
-    func = tvm.build(sch, [a8, b8])
-    assert func
-    a = numpy.arange(2, 10).astype("float32")
-    tvm_a = tvm.nd.array(a)
-    tvm_b = tvm.nd.array(numpy.zeros((8,), dtype="float32"))
-    b = intrin_real(a)
-    func(tvm_a, tvm_b)
-    tvm.testing.assert_allclose(b, tvm_b.numpy(), rtol=1e-5)
-
-    @script
-    def intrin_int(a):
-        b = output_tensor((1,), "int32")
-        b[0] = popcount(a[0])
-        return b
-
-    a1 = te.placeholder((1,), dtype="int32")
-    b1 = intrin_int(a1)
-    sch = te.create_schedule(b1.op)
-    func = tvm.build(sch, [a1, b1])
-    assert func
-    a = numpy.array([114514]).astype("int32")
-    tvm_a = tvm.nd.array(a)
-    tvm_b = tvm.nd.array(numpy.array([0]).astype("int32"))
-    b = intrin_int(a)
-    func(tvm_a, tvm_b)
-    assert tvm_b.numpy()[0] == b[0]
-
-
-@tvm.testing.skip_if_wheel_test
-# test non caconical loops
-def test_non_zero():
-    @te.hybrid.script
-    def blur(a):
-        b = output_tensor((30, 30), "float32")
-        for i in range(2, 32):
-            for j in range(2, 32):
-                s = 0.0
-                for di in range(3):
-                    for dj in range(3):
-                        s += a[i - di, j - dj]
-                b[i - 2, j - 2] = s / 9.0
-        return b
-
-    a = te.placeholder((32, 32), "float32", "a")
-    func, ins, outs = run_and_check(blur, [a])
-    run_and_check(func, ins, outs=outs)
-
-    @te.hybrid.script
-    def triangle(a, b):
-        c = output_tensor((10, 10), dtype="float32")
-        for i in range(10):
-            for j in range(i, 10):
-                c[i, j] = a[i] * b[j]
-        return c
-
-    a = te.placeholder((10,), dtype="float32", name="a")
-    b = te.placeholder((10,), dtype="float32", name="b")
-
-    func, ins, outs = run_and_check(triangle, [a, b])
-    run_and_check(func, ins, outs=outs)
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_allocate():
-    @te.hybrid.script
-    def blur2d(a):
-        b = output_tensor((30, 30), "float32")
-        for i in range(30):
-            ha = allocate((3, 30), "float32")
-            for j in range(3):
-                for k in range(30):
-                    ha[j, k] = a[i + j, k] + a[i + j, k + 1] + a[i + j, k + 2]
-            for j in range(30):
-                b[i, j] = (ha[0, j] + ha[1, j] + ha[2, j]) / 9.0
-        return b
-
-    a = te.placeholder((32, 32), "float32", "a")
-    b = blur2d(a)
-    sch = te.create_schedule(b.op)
-    func, ins, outs = run_and_check(blur2d, [a])
-    run_and_check(func, ins, outs=outs)
-
-    @te.hybrid.script
-    def share_vec_add(a, b):
-        c = output_tensor((256,), "float32")
-        shared = allocate((256,), "float32", "shared")
-        for i in bind("threadIdx.x", 256):
-            shared[i] = a[i]
-        local = allocate((256,), "float32", "local")
-        for i in bind("threadIdx.x", 256):
-            local[i] = b[i]
-        for i in bind("threadIdx.x", 256):
-            c[i] = shared[i] + local[i]
-        return c
-
-    a = te.placeholder((256,), dtype="float32", name="a")
-    b = te.placeholder((256,), dtype="float32", name="b")
-    c = share_vec_add(a, b)
-    func, ins, outs = run_and_check(share_vec_add, [a, b], target="cuda")
-    run_and_check(func, ins, outs=outs, target="cuda")
-
-
-@tvm.testing.skip_if_wheel_test
-def test_upstream():
-    @te.hybrid.script
-    def upstream(a):
-        b = output_tensor((20,), "float32")
-        for i in range(20):
-            b[i] = a[i] * i
-        return b
-
-    a = te.placeholder((20,), "float32")
-    b = te.placeholder((20,), "float32")
-    c = te.compute((20,), lambda x: a[x] + b[x])
-    d = upstream(c)
-    sch = te.create_schedule([c.op, d.op])
-    ir = tvm.lower(sch, [a, b, d])
-    func = tvm.build(sch, [a, b, d])
-    assert func
-
-    a = numpy.random.randn(20).astype("float32")
-    b = numpy.random.randn(20).astype("float32")
-    ref = numpy.zeros((20,), "float32")
-    for i in range(20):
-        ref[i] = (a[i] + b[i]) * i
-
-    tvm_a = tvm.nd.array(a)
-    tvm_b = tvm.nd.array(b)
-    tvm_d = tvm.nd.array(numpy.zeros((20,)).astype("float32"))
-
-    func(tvm_a, tvm_b, tvm_d)
-    tvm.testing.assert_allclose(tvm_d.numpy(), ref, 1e-5, 1e-5)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_downstream():
-    @te.hybrid.script
-    def downstream(a):
-        b = output_tensor((20,), "float32")
-        for i in range(20):
-            b[i] = a[i] * i
-        return b
-
-    a = te.placeholder((20,), "float32")
-    b = downstream(a)
-    c = te.compute((20,), lambda x: b[x] + 1.0)
-
-    sch = te.create_schedule(c.op)
-    module = tvm.build(sch, [a, c])
-    assert module
-
-    a = numpy.random.randn(20).astype("float32")
-    ref = numpy.zeros((20,)).astype("float32")
-    for i in range(20):
-        ref[i] = (a[i] * i) + 1.0
-
-    tvm_a = tvm.nd.array(a)
-    tvm_c = tvm.nd.array(numpy.zeros((20,)).astype("float32"))
-    module(tvm_a, tvm_c)
-    tvm.testing.assert_allclose(tvm_c.numpy(), ref, 1e-5, 1e-5)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_const_param():
-    @te.hybrid.script
-    def add_something(a, b):
-        c = output_tensor((11,), "int32")
-        for i in range(11):
-            c[i] = a[i] + b
-        return c
-
-    a = te.placeholder((11,), dtype="int32", name="a")
-    b = tvm.tir.const(11, "int32")
-    c = add_something(a, b)
-    sch = te.create_schedule(c.op)
-    module = tvm.build(sch, [a, c], "llvm")
-    assert module
-
-    np_a = numpy.arange(11).astype("int32")
-    np_b = 11
-    np_c = numpy.zeros((11,)).astype("int32")
-
-    nd_a = tvm.nd.array(np_a)
-    nd_c = tvm.nd.array(numpy.zeros((11,)).astype("int32"))
-    module(nd_a, nd_c)
-    ref = add_something(np_a, 11)
-
-    tvm.testing.assert_allclose(nd_c.numpy(), ref, 1e-5, 1e-5)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_value_index():
-    @te.hybrid.script
-    def kernel_a(a):
-        b = output_tensor((16,), "int32")
-        c = output_tensor((4, 4), "int32")
-        for i in range(16):
-            b[i] = a[i] + 2
-            c[i // 4, i % 4] = a[i] + 1
-        return b, c
-
-    @te.hybrid.script
-    def kernel_b(b, a):
-        c = output_tensor((4, 4), "int32")
-        for i in range(4):
-            for j in range(4):
-                c[i, j] = a[i * 4 + j] * b[i, j]
-        return c
-
-    a = te.placeholder((16,), "int32")
-    b, c = kernel_a(a)
-    d = kernel_b(c, b)
-    sch = te.create_schedule(d.op)
-    module = tvm.build(sch, [a, d])
-    assert module
-
-    np_a = numpy.arange(16).astype("int32")
-    np_b, np_c = kernel_a(np_a)
-    ref = kernel_b(np_c, np_b)
-
-    res = tvm.nd.array(numpy.zeros((4, 4)).astype("int32"))
-    module(tvm.nd.array(np_a), res)
-    tvm.testing.assert_allclose(res.numpy(), ref)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_func_call():
-    @te.hybrid.script
-    def foo(a, b):
-        for i in range(len(a)):
-            a[i] = i + 1.0
-        for i in range(len(a)):
-            b[i] = i + 1.0
-        c = outer_product(10, 10, a, b)
-        d = output_tensor(c.shape, c.dtype)
-        for i in range(10):
-            for j in range(10):
-                d[i, j] = c[i, j] + i * j
-        return d
-
-    a = te.placeholder((10,), name="a")
-    b = te.placeholder((10,), name="b")
-    func, ins, outs = run_and_check(foo, [a, b])
-    run_and_check(func, ins, outs=outs)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_bool():
-    @te.hybrid.script
-    def foo(a):
-        b = output_tensor(a.shape, a.dtype)
-        b[0] = 1.2
-        for i in range(1, a.shape[0] - 1):
-            if a[i] * a[i - 1] < a[i] or a[i] * a[i - 1] < a[i - 1] or i * a[i] == a[i]:
-                b[i] = a[i]
-            else:
-                b[i] = 0.0
-        return b
-
-    a = te.placeholder((10,), name="a")
-    func, ins, outs = run_and_check(foo, [a])
-    run_and_check(func, ins, outs=outs)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_const_range():
-    @te.hybrid.script
-    def foo(a, b):
-        c = output_tensor(a.shape, a.dtype)
-        d = output_tensor(a.shape, "int32")
-
-        for i in const_range(2):
-            for j in const_range(5):
-                c[i, j] = float32(int32(a[i, j]) + b[i, j])
-
-        for i in const_range(len(b)):
-            for j in const_range(len(b[0])):
-                d[i, j] = int32(a[i, j] + b[i, j])
-
-        return c, d
-
-    a = te.placeholder((2, 5), name="a", dtype="float32")
-    b = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]]
-    func, ins, outs = run_and_check(foo, [a, b])
-    run_and_check(func, ins, outs=outs)
-
-    @te.hybrid.script
-    def goo(a, b):
-        c = output_tensor(a.shape, a.dtype)
-        len_b = len(b)
-        for i in const_range(len_b * 2):
-            if i < len_b:
-                c[i] = a[i] + b[i]
-            else:
-                c[i - len_b] = a[i - len_b] + b[i - len_b]
-        return c
-
-    a = te.placeholder((5,), name="a", dtype="int32")
-    b = [1, 2, 3, 4, 5]
-    c = goo(a, tvm.runtime.convert(b))
-    sch = te.create_schedule(c.op)
-    func, ins, outs = run_and_check(goo, [a, b])
-    run_and_check(func, ins, outs=outs)
-
-    @te.hybrid.script
-    def hoo(a, b):
-        c = output_tensor(a.shape, a.dtype)
-        len_b = len(b)
-        for i in range(a.shape[0]):
-            for j in const_range(len(b)):
-                d = a[i] * b[j]
-                d += a[i] + b[j]
-                c[i] = d
-        return c
-
-    a = te.placeholder((5,), name="a", dtype="int32")
-    b = [1, 2, 3, 4, 5]
-    func, ins, outs = run_and_check(hoo, [a, b])
-    run_and_check(func, ins, outs=outs)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_schedule():
-    @script
-    def outer_product(a, b):
-        c = output_tensor((64, 64), a.dtype)
-        for i in range(64):
-            for j in range(64):
-                c[i, j] = a[i] * b[j]
-        return c
-
-    a = te.placeholder((64,), name="a", dtype="float32")
-    b = te.placeholder((64,), name="b", dtype="float32")
-    c = outer_product(a, b)
-
-    # Test perfect loop split
-    # Test loop reorder
-    # Test loop annotation
-    sch = te.create_schedule(c.op)
-    i, j = c.op.axis
-    io, ii = sch[c].split(i, 4)
-    sch[c].parallel(ii)
-    jo, ji = sch[c].split(j, 4)
-    joo, joi = sch[c].split(jo, 4)
-    sch[c].vectorize(ji)
-    sch[c].reorder(ii, io, joo, joi, ji)
-    ir = tvm.lower(sch, [a, b, c])["main"].body
-    assert isinstance(ir, tvm.tir.AttrStmt)
-    ir = ir.body
-    assert isinstance(ir, tvm.tir.For)
-    assert ir.loop_var.name == "i.inner"
-    ir = ir.body
-    assert isinstance(ir, tvm.tir.For)
-    assert ir.loop_var.name == "i.outer"
-    ir = ir.body
-    assert isinstance(ir, tvm.tir.For)
-    assert ir.loop_var.name == "j.outer.outer"
-    ir = ir.body
-    assert isinstance(ir, tvm.tir.For)
-    assert ir.loop_var.name == "j.outer.inner"
-    ir = ir.body
-    func, ins, outs = run_and_check(outer_product, [a, b], sch=sch, outs=[c])
-    run_and_check(func, ins, outs=outs)
-
-    # Test fuse
-    sch = te.create_schedule(c.op)
-    sch[c].fuse(c.op.axis[0], c.op.axis[1])
-    ir = tvm.lower(sch, [a, b, c])["main"].body
-    assert isinstance(ir, tvm.tir.AttrStmt)
-    ir = ir.body
-    assert isinstance(ir, tvm.tir.For)
-    assert ir.loop_var.name == "i.j.fused"
-    func, ins, outs = run_and_check(outer_product, [a, b], sch=sch, outs=[c])
-    run_and_check(func, ins, outs=outs)
-
-    # Test imperfect loop split
-    sch = te.create_schedule(c.op)
-    sch[c].split(c.op.axis[0], 3)
-    ir = tvm.lower(sch, [a, b, c], simple_mode=True)
-    func, ins, outs = run_and_check(outer_product, [a, b], sch=sch, outs=[c])
-    run_and_check(func, ins, outs=outs)
-
-    # Test loop binds
-
-
-@tvm.testing.skip_if_wheel_test
-def test_capture():
-    n = 8
-
-    constant_tuple = (10, n)
-    constant_list = [[1, 2], [3, n]]
-    const_value = 1
-
-    @te.hybrid.script
-    def add_something(a):
-        c = output_tensor((constant_tuple[1],), "int32")
-        for i in range(constant_tuple[1]):
-            c[i] = a[i] + constant_list[1][const_value]
-        return c
-
-    a = te.placeholder((n,), dtype="int32", name="a")
-
-    func, ins, outs = run_and_check(add_something, [a])
-    run_and_check(func, ins, outs=outs)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_array_inputs():
-    @script
-    def sum_array(inputs):
-        out = output_tensor((10,), inputs[0].dtype)
-        n = len(inputs)
-        for i in range(10):
-            for j in const_range(n):
-                out[i] += inputs[j][i]
-        return out
-
-    n = 5
-    inputs = []
-    for i in range(n):
-        inputs.append(te.placeholder((10,), name="t%s" % i, dtype="float32"))
-
-    out = sum_array(tvm.runtime.convert(inputs))
-    assert len(out.op.inputs) == n
-
-    sch = te.create_schedule(out.op)
-    mod = tvm.build(sch, inputs + [out], target="llvm")
-    assert mod
-
-    input_nd = []
-    out_ref = numpy.zeros((10,))
-    for _ in range(n):
-        arr = numpy.random.uniform(size=(10,)).astype("float32")
-        input_nd.append(tvm.nd.array(arr))
-        out_ref += arr
-    out_nd = tvm.nd.array(numpy.zeros((10,), "float32"))
-    mod(*input_nd, out_nd)
-    tvm.testing.assert_allclose(out_nd.numpy(), out_ref)
-
-
-if __name__ == "__main__":
-    test_outer_product()
-    test_fanout()
-    test_looptype()
-    test_if()
-    test_bind()
-    test_math_intrin()
-    test_non_zero()
-    test_allocate()
-    test_upstream()
-    test_downstream()
-    test_const_param()
-    test_value_index()
-    test_func_call()
-    test_bool()
-    test_const_range()
-    test_schedule()
-    test_capture()
-    test_array_inputs()
-    # TODO:
-    # test_inplace()
diff --git a/tests/python/te/test_te_schedule.py b/tests/python/te/test_te_schedule.py
deleted file mode 100644
index d46db2b702c0..000000000000
--- a/tests/python/te/test_te_schedule.py
+++ /dev/null
@@ -1,382 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pickle as pkl
-
-import pytest
-import tvm
-from tvm import te
-from tvm.driver.build_module import schedule_to_module
-
-
-def test_schedule_create():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    l = te.size_var("l")
-    A = te.placeholder((m, l), name="A")
-    B = te.placeholder((n, l), name="B")
-    AA = te.compute((m, l), lambda i, j: A[i, j])
-    T = te.compute((m, n, l), lambda i, j, k: AA(i, k) * B(j, k))
-    s = te.create_schedule(T.op)
-    s[AA].set_scope("shared")
-    xo, xi = s[T].split(T.op.axis[0], factor=10)
-    xi1, xi2 = s[T].split(xi, factor=2)
-    s[AA].compute_at(s[T], xi1)
-    xo, xi = s[AA].split(AA.op.axis[0], factor=10)
-    s[T].reorder(xi2, xi1)
-    assert T.op.axis[1] in s[T].leaf_iter_vars
-
-    # save load json
-    json_str = tvm.ir.save_json(s)
-    s_loaded = tvm.ir.load_json(json_str)
-    assert isinstance(s_loaded, tvm.te.schedule.Schedule)
-    assert str(s_loaded.outputs[0].body) == str(s.outputs[0].body)
-
-    # pickle unpickle
-    dump = pkl.dumps(s)
-    s_loaded = pkl.loads(dump)
-    assert isinstance(s_loaded, tvm.te.schedule.Schedule)
-    assert str(s_loaded.outputs[0].body) == str(s.outputs[0].body)
-
-
-def test_reorder():
-    m = te.size_var("m")
-    A = te.placeholder((m,), name="A")
-    T = te.compute(m, lambda i: A[i + 1])
-
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=10)
-    xi1, xi2 = s[T].split(xi, factor=2)
-    order = (xi2, xi1, xo)
-    assert tuple(s[T].leaf_iter_vars) != order
-    s[T].reorder(*order)
-    assert tuple(s[T].leaf_iter_vars) == order
-    try:
-        # pass duplicate IterVar
-        # must raise an error
-        s[T].reorder(xi2, xi1, xi2)
-        assert False
-    except tvm.error.TVMError:
-        pass
-
-
-def test_split():
-    m = te.size_var("m")
-    A = te.placeholder((m,), name="A")
-    T = te.compute((m,), lambda i: A[i])
-
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=10)
-    assert tuple(s[T].leaf_iter_vars) == (xo, xi)
-
-
-def test_tile():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    A = te.placeholder((m, n), name="A")
-    T = te.compute((m, n), lambda i, j: A[i, j])
-
-    s = te.create_schedule(T.op)
-    xo, yo, xi, yi = s[T].tile(T.op.axis[0], T.op.axis[1], x_factor=10, y_factor=5)
-    assert tuple(s[T].leaf_iter_vars) == (xo, yo, xi, yi)
-
-
-def test_fuse():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    A = te.placeholder((m, n), name="A")
-    T = te.compute((m, n), lambda i, j: A[i, j])
-
-    s = te.create_schedule(T.op)
-    xo, yo, xi, yi = s[T].tile(T.op.axis[0], T.op.axis[1], x_factor=10, y_factor=5)
-    fused = s[T].fuse(xo, yo)
-    assert any(isinstance(x, tvm.te.schedule.Fuse) for x in s[T].relations)
-    assert tuple(s[T].leaf_iter_vars) == (fused, xi, yi)
-
-
-def test_fuse_with_split():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    A = te.placeholder((m, n), name="A")
-    T = te.compute((m, n), lambda i, j: A[i, j])
-
-    s = te.create_schedule(T.op)
-    y = T.op.axis[1]
-    xo, xi = s[T].split(T.op.axis[0], factor=10)
-    fused = s[T].fuse(xi, y)
-    assert any(isinstance(x, tvm.te.schedule.Fuse) for x in s[T].relations)
-    assert tuple(s[T].leaf_iter_vars) == (xo, fused)
-
-
-def test_fuse_with_out_of_order_axis():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    A = te.placeholder((m, n), name="A")
-    T = te.compute((m, n), lambda i, j: A[i, j])
-
-    s = te.create_schedule(T.op)
-    y = T.op.axis[1]
-    xo, xi = s[T].split(T.op.axis[0], factor=10)
-
-    with pytest.raises(RuntimeError):
-        fused = s[T].fuse(xo, y)  # should throw here
-
-
-def test_fuse_with_out_of_order_axis_with_reorder():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    A = te.placeholder((m, n), name="A")
-    T = te.compute((m, n), lambda i, j: A[i, j])
-
-    s = te.create_schedule(T.op)
-    y = T.op.axis[1]
-    xo, xi = s[T].split(T.op.axis[0], factor=10)
-    s[T].reorder(y, xo, xi)
-    fused = s[T].fuse(y, xo)  # should be ok
-
-    s = te.create_schedule(T.op)
-    y = T.op.axis[1]
-    xo, xi = s[T].split(T.op.axis[0], factor=10)
-    s[T].reorder(y, xo, xi)
-
-    with pytest.raises(RuntimeError):
-        fused = s[T].fuse(y, xi)  # should throw here
-
-
-def test_singleton():
-    A = te.placeholder((), name="A")
-    T = te.compute((), lambda: A() + 1)
-    s = te.create_schedule(T.op)
-    fused = s[T].fuse()
-    assert any(isinstance(x, tvm.te.schedule.Singleton) for x in s[T].relations)
-    assert tuple(s[T].leaf_iter_vars) == (fused,)
-    dump = pkl.dumps(s)
-    s_loaded = pkl.loads(dump)
-    assert isinstance(s_loaded, tvm.te.schedule.Schedule)
-
-
-def test_vectorize():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    A = te.placeholder((m, n), name="A")
-    T = te.compute((m, n), lambda i, j: A[i, j])
-
-    s = te.create_schedule(T.op)
-    xo, yo, xi, yi = s[T].tile(T.op.axis[0], T.op.axis[1], x_factor=10, y_factor=5)
-    s[T].vectorize(yi)
-    s[T].unroll(xi)
-    UNROLL = tvm.te.schedule.IterVar.Unrolled
-    VECTORIZE = tvm.te.schedule.IterVar.Vectorized
-    assert s[T].iter_var_attrs[xi].iter_type == UNROLL
-    assert s[T].iter_var_attrs[yi].iter_type == VECTORIZE
-
-
-def test_vectorize_commreduce():
-    V = te.placeholder((128,), name="V")
-    ax = te.reduce_axis((0, 128), name="ax")
-    O = te.compute((1,), lambda _: te.sum(V[ax], axis=[ax]))
-    s = te.create_schedule(O.op)
-    with pytest.raises(RuntimeError):
-        s[O].vectorize(ax)  # should throw here
-
-
-def test_pragma():
-    m = 100
-    A = te.placeholder((m,), name="A")
-    T = te.compute((m,), lambda i: A[i])
-
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=10)
-    s[T].pragma(xo, "pragma1")
-    s[T].pragma(xi, "vectorize")
-    VECTORIZE = tvm.te.schedule.IterVar.Vectorized
-    assert s[T].iter_var_attrs[xo].pragma_keys[0].value == "pragma1"
-    assert s[T].iter_var_attrs[xi].iter_type == VECTORIZE
-
-
-def test_rfactor():
-    n = te.size_var("n")
-    k1 = te.reduce_axis((0, n), name="k1")
-    k2 = te.reduce_axis((0, n), name="k2")
-    A = te.placeholder((n, n, n), name="A")
-    B = te.compute((n,), lambda i: te.sum(A[i, k1, k2], axis=[k1, k2]))
-    # normal schedule
-    s = te.create_schedule(B.op)
-    BF = s.rfactor(B, k1)
-    assert tuple(BF.shape) == (n, n)
-    assert set(BF.op.body[0].axis) == set([k2])
-    assert s[B].op.body[0].axis[0].dom.extent == n
-    assert len(s[B].all_iter_vars) == 2
-    # schedule with split
-    s = te.create_schedule(B.op)
-    ko, ki = s[B].split(k1, factor=4)
-    xo, xi = s[B].split(B.op.axis[0], factor=8)
-    BF = s.rfactor(B, ki)
-    assert BF.shape[0].value == 4
-    assert BF.shape[1] == n
-    assert BF.op.body[0].axis[0] == k2
-    assert BF.op.body[0].axis[1].var == ko.var
-    assert s[B].op.body[0].axis[0].dom.extent.value == 4
-    # schedule with factor_axis
-    s = te.create_schedule(B.op)
-    ko, ki = s[B].split(k1, factor=4)
-    xo, xi = s[B].split(B.op.axis[0], factor=8)
-    BF = s.rfactor(B, ki, 1)
-    assert n == BF.shape[0]
-    assert BF.shape[1].value == 4
-    assert BF.op.body[0].axis[0] == k2
-    assert BF.op.body[0].axis[1].var == ko.var
-    assert s[B].op.body[0].axis[0].dom.extent.value == 4
-
-
-def test_tensor_intrin():
-    n = 16
-    x = te.placeholder((n,), name="x")
-    y = te.placeholder((n,), name="y")
-    z = te.compute(x.shape, lambda i: x[i] + y[i], name="z")
-
-    def intrin_func(ins, outs):
-        assert isinstance(ins[0], tvm.te.schedule.Buffer)
-        assert ins[0].shape[0].value == n
-        return tvm.tir.call_packed("vadd", ins[0].data, outs[0].data, ins[0].shape[0])
-
-    intrin = te.decl_tensor_intrin(z.op, intrin_func, default_buffer_params={"offset_factor": n})
-    assert intrin.op == z.op
-    assert intrin.reduce_init is None
-    assert tuple(intrin.inputs) == tuple(z.op.input_tensors)
-    assert intrin.buffers[0].shape[0].value == n
-    m = 32
-    X = te.placeholder((m,), name="X")
-    Y = te.placeholder((m,), name="Y")
-    Z = te.compute(X.shape, lambda i: X[i] + Y[i], name="Z")
-    s = te.create_schedule(Z.op)
-    xo, xi = s[Z].split(Z.op.axis[0], factor=n)
-    s[Z].tensorize(xi, intrin)
-    stmt = tvm.lower(s, [X, Y, Z])["main"].body
-    assert isinstance(stmt.body, tvm.tir.Evaluate)
-    assert str(stmt.body.value.args[0]) == '"vadd"'
-    assert str(stmt.body.value.args[1]) == "X"
-    assert str(stmt.body.value.args[2]) == "Z"
-    assert s[Z].iter_var_attrs[xi].tensor_intrin == intrin
-    assert s[Z].iter_var_attrs[xi].iter_type == tvm.te.schedule.IterVar.Tensorized
-
-
-def test_tensor_intrin_scalar_params():
-    n = te.size_var("n")
-    x = te.placeholder((n,), name="x")
-    v = te.size_var("v")
-    w = te.size_var("w")
-    z = te.compute((n,), lambda i: x[i] * v + w, name="z")
-
-    def intrin_func(ins, outs, sp):
-        assert isinstance(ins[0], tvm.te.schedule.Buffer)
-        assert ins[0].shape[0] == n
-        assert sp[0] == v
-        assert sp[1] == w
-        return tvm.tir.call_packed("hw_func", ins[0].data, outs[0].data, sp[0], sp[1])
-
-    intrin = te.decl_tensor_intrin(
-        z.op, intrin_func, scalar_params=[v, w], default_buffer_params={"offset_factor": 1}
-    )
-    assert intrin.op == z.op
-    assert intrin.reduce_init is None
-    assert tuple(intrin.inputs) == tuple(z.op.input_tensors)
-    assert intrin.buffers[0].shape[0] == n
-    assert tuple(intrin.scalar_params) == tuple((v, w))
-
-    A = te.placeholder((10, 10), name="A")
-    # Pass scalar inputs to the TensorIntrin, interleaved with tensor inputs
-    C = te.compute((10, 10), lambda i, j: intrin(i * i, A[i, j], i + j), name="C")
-    s = te.create_schedule(C.op)
-    stmt = tvm.lower(s, [A, C])["main"].body
-    assert isinstance(stmt.body.body, tvm.tir.Evaluate)
-    assert len(stmt.body.body.value.args) == 5
-    assert str(stmt.body.body.value.args[3]) == "i * i"
-    assert str(stmt.body.body.value.args[4]) == "i + j"
-
-
-def test_legalize_invalid_attach():
-    A = te.compute((10, 10), lambda i, j: 1.0, name="A")
-    B = te.compute((10, 10), lambda i, j: A[i][j], name="B")
-
-    # Case 1: Split an axis which is the target of a compute_at
-    s = te.create_schedule([B.op])
-    s[A].compute_at(s[B], B.op.axis[1])
-    s[B].split(B.op.axis[1], 2)
-
-    stmt = tvm.lower(s, [A, B], simple_mode=True)["main"].body
-    assert isinstance(stmt.body.body, tvm.tir.stmt.For)
-
-    # Case 2: Fuse an axis which is the target of a compute_at
-    s = te.create_schedule([B.op])
-    s[A].compute_at(s[B], B.op.axis[1])
-    s[B].fuse(B.op.axis[0], B.op.axis[1])
-    stmt = tvm.lower(s, [A, B], simple_mode=True)["main"].body
-    assert isinstance(stmt, tvm.tir.stmt.For)
-
-
-def test_compute_at():
-    def add():
-        shape = (16, 16)
-        A = tvm.te.compute(shape, lambda *i: 1.0, name="A")
-        B = tvm.te.compute(shape, lambda *i: 2.0, name="B")
-        C = tvm.te.compute(shape, lambda *i: A(*i) + B(*i), name="C")
-        return A, B, C
-
-    def invalid_compute_at_self():
-        A, B, C = add()
-        s = tvm.te.create_schedule(C.op)
-        s[C].compute_at(s[C], C.op.axis[0])
-        with pytest.raises(RuntimeError):
-            tvm.lower(s, [A, B], simple_mode=True)
-
-    def invalid_compute_at_loop():
-        A, B, C = add()
-        s = tvm.te.create_schedule(C.op)
-        s[A].compute_at(s[C], C.op.axis[0])
-        s[C].compute_at(s[A], A.op.axis[0])
-        with pytest.raises(RuntimeError):
-            tvm.lower(s, [C], simple_mode=True)
-
-    invalid_compute_at_self()
-    invalid_compute_at_loop()
-
-
-@pytest.mark.parametrize("split_factor", [4, 4 * tvm.tir.vscale()])
-@pytest.mark.parametrize("disable_predication", [True, False])
-def test_split_disable_predicate(split_factor, disable_predication):
-    A = te.placeholder((43,), name="A")
-    B = te.compute(A.shape, lambda i: A[i] + 2, name="C")
-
-    sch = te.create_schedule(B.op)
-    (i,) = sch[B].op.axis
-    _, _ = sch[B].split(i, factor=split_factor, disable_predication=disable_predication)
-
-    mod = schedule_to_module(sch, [A, B], "main")
-
-    predicates = []
-
-    def _find_predicates(stmt):
-        if isinstance(stmt, tvm.tir.stmt.IfThenElse):
-            predicates.append(stmt)
-
-    tvm.tir.stmt_functor.post_order_visit(mod["main"].body, _find_predicates)
-
-    assert bool(len(predicates)) != disable_predication
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/te/test_te_schedule_bound_inference.py b/tests/python/te/test_te_schedule_bound_inference.py
deleted file mode 100644
index c246ee9f4109..000000000000
--- a/tests/python/te/test_te_schedule_bound_inference.py
+++ /dev/null
@@ -1,512 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-import tvm.testing
-from tvm import te
-
-
-def test_bound1():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule([A2.op])
-    xo, xi = s[A2].split(s[A2].op.axis[0], 8)
-    s[A1].compute_at(s[A2], xo)
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[A1.op.axis[0]].extent.value == 8
-
-
-def test_bound2():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-    s = te.create_schedule(A2.op)
-    xo, yo, xi, yi = s[A2].tile(A2.op.axis[0], A2.op.axis[1], 8, 8)
-    # test normalize not affecting schedule
-    _ = s.normalize()
-    s[A1].compute_at(s[A2], yo)
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[A1.op.axis[0]].extent.value == 8
-    assert bounds[A1.op.axis[1]].extent.value == 8
-
-
-def test_bound3():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    s[A1].set_scope("shared")
-    xo, xi = s[A2].split(A2.op.axis[0], 32)
-    xi0, xi1 = s[A2].split(xi, nparts=16)
-    s[A2].bind(xi0, te.thread_axis("threadIdx.x"))
-    yo, yi = s[A2].split(A2.op.axis[1], 16)
-    # test normalize not affecting schedule
-    _ = s.normalize()
-    s[A2].reorder(xo, xi0, yo, xi1, yi)
-    s[A1].compute_at(s[A2], yo)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[A1.op.axis[0]].extent.value == 32
-    assert bounds[A1.op.axis[1]].extent.value == 16
-
-
-def test_bound_split_ext_less_than_factor():
-    m = 8
-    I = te.placeholder((m,), name="I")
-    EF = te.compute((m,), lambda i: I[i] * 2, name="EF")
-    E = te.compute((m,), lambda i: EF[i] * 2, name="E")
-    s = te.create_schedule([E.op])
-    xo, xi = s[E].split(s[E].op.axis[0], factor=32)
-    s[EF].compute_at(s[E], xo)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[xi].extent.value == m
-
-
-def test_bound_split_ext_less_than_naprts():
-    m = 8
-    I = te.placeholder((m,), name="I")
-    EF = te.compute((m,), lambda i: I[i] * 2, name="EF")
-    E = te.compute((m,), lambda i: EF[i] * 2, name="E")
-    s = te.create_schedule([E.op])
-    xo, xi = s[E].split(s[E].op.axis[0], nparts=32)
-    s[EF].compute_at(s[E], xo)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[xo].extent.value == m
-
-
-def test_bound_split_divisible():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((8 * m, l), name="A")
-    B = te.compute((8 * m, l), lambda i, j: A[i, j], name="B")
-    s = te.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], 8)
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[xo].extent == m
-    assert bounds[xi].extent.value == 8
-
-
-def test_bound_tile_divisible():
-    m = te.var("m")
-    l = te.var("l")
-    shape = (8 * m, 32 * l)
-    A = te.placeholder(shape, name="A")
-    B = te.compute(shape, lambda i, j: A[i, j], name="B")
-    s = te.create_schedule(B.op)
-    xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], 8, 32)
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[xo].extent == m
-    assert bounds[xi].extent.value == 8
-    assert bounds[yo].extent == l
-    assert bounds[yi].extent.value == 32
-
-
-def test_bound_fusesplit1():
-    m = te.var("m")
-    l = te.var("l")
-    split1 = te.var("s")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    fused_axes = s[A2].fuse(A2.op.axis[0], A2.op.axis[1])
-    xo, xi = s[A2].split(fused_axes, split1)
-    s[A1].compute_at(s[A2], xo)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    idxdiv = tvm.tir.indexdiv
-    tvm.testing.assert_prim_expr_equal(bounds[A1.op.axis[0]].min, idxdiv(xo * split1, l))
-
-    expected_extent = idxdiv((xo + 1) * split1 - 1, l) - idxdiv(xo * split1, l) + 1
-    for i in range(1, 6):
-        for j in range(1, 6):
-            for k in range(1, 6):
-                vars = tvm.runtime.convert(
-                    {
-                        split1: tvm.tir.const(i, "int32"),
-                        l: tvm.tir.const(j, "int32"),
-                        xo.var: tvm.tir.const(k, "int32"),
-                    }
-                )
-                tvm.testing.assert_prim_expr_equal(
-                    tvm.tir.stmt_functor.substitute(bounds[A1.op.axis[0]].extent, vars),
-                    tvm.tir.stmt_functor.substitute(expected_extent, vars),
-                )
-
-    tvm.testing.assert_prim_expr_equal(bounds[A1.op.axis[1]].extent, l)
-
-
-def test_bound_fusesplit2():
-    m = te.var("m")
-    l = tvm.runtime.convert(6)
-    split = tvm.runtime.convert(3)
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    fused_axes = s[A2].fuse(A2.op.axis[0], A2.op.axis[1])
-    xo, xi = s[A2].split(fused_axes, split)
-    s[A1].compute_at(s[A2], xo)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    vars = tvm.runtime.convert({xo.var: tvm.tir.const(5, "int32")})
-    tvm.testing.assert_prim_expr_equal(
-        tvm.tir.stmt_functor.substitute(bounds[A1.op.axis[0]].min, vars), 2
-    )
-    tvm.testing.assert_prim_expr_equal(
-        tvm.tir.stmt_functor.substitute(bounds[A1.op.axis[1]].min, vars), 3
-    )
-    tvm.testing.assert_prim_expr_equal(
-        tvm.tir.stmt_functor.substitute(bounds[A1.op.axis[0]].extent, vars), 1
-    )
-    tvm.testing.assert_prim_expr_equal(
-        tvm.tir.stmt_functor.substitute(bounds[A1.op.axis[1]].extent, vars), 3
-    )
-
-
-def test_bound_warp():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    s[A1].set_scope("warp")
-    xo, xi = s[A2].split(A2.op.axis[0], 32)
-    xi0, xi1 = s[A2].split(xi, factor=16)
-    tx = te.thread_axis("threadIdx.x")
-    s[A2].bind(xi1, tx)
-    s[A2].bind(xi0, te.thread_axis("threadIdx.y"))
-    y = s[A2].op.axis[1]
-    s[A1].compute_at(s[A2], y)
-    xo, xi = s[A1].split(s[A1].op.axis[0], factor=16)
-    s[A1].bind(xi, tx)
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[A1.op.axis[0]].extent.value == 16
-
-
-def test_bound_scan():
-    m = te.var("m")
-    n = te.var("n")
-    X = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    s_state = te.placeholder((m, n))
-    s_init = te.compute((1, n), lambda _, i: X[0, i])
-    s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i])
-    s_scan = tvm.te.scan(s_init, s_update, s_state)
-
-    assert tuple(s_scan.shape) == (m, n)
-    s = te.create_schedule(s_scan.op)
-    XX = s.cache_read(X, "local", s_update)
-    xo, xi = s[s_update].split(s_update.op.axis[1], factor=4)
-    s[XX].compute_at(s[s_update], xo)
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-    assert bounds[XX.op.axis[1]].extent.value == 4
-
-
-def test_bound_conv1d():
-    n = te.var("n")
-    A = te.compute((n + 2), lambda i: 1, name="A")
-
-    def computeB(ii):
-        i = ii + 1
-        return A[i - 1] + A[i] + A[i + 1]
-
-    B = te.compute(n, computeB, name="B")
-    s = te.create_schedule(B.op)
-    s[A].compute_at(s[B], B.op.axis[0])
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    assert bounds[A.op.axis[0]].extent.value == 3
-
-
-def test_bound_blur():
-    n = tvm.runtime.convert(12)
-    A = te.compute((n, n), lambda i, j: 1, name="A")
-
-    def computeB(ii, jj):
-        # set the correct center
-        i = ii + 1
-        j = jj + 1
-        return A[i][j] + A[i - 1][j] + A[i + 1][j] + A[i][j + 1] + A[i][j - 1]
-
-    B = te.compute((n - 2, n - 2), computeB, name="B")
-    s = te.create_schedule(B.op)
-    s[A].compute_at(s[B], B.op.axis[1])
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    assert bounds[A.op.axis[0]].extent.value == 3
-    assert bounds[A.op.axis[1]].extent.value == 3
-
-
-def test_bound_rfactor():
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    k = te.reduce_axis((0, n))
-    B = te.compute((1,), lambda i: te.sum(A[k], axis=k, where=(i > 1)), name="B")
-    # schedule
-    s = te.create_schedule(B.op)
-    kf, ki = s[B].split(k, nparts=4)
-    BF = s.rfactor(B, kf)
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-
-    assert bounds[BF.op.axis[0]].extent.value == 4
-    assert bounds[BF.op.axis[1]].extent.value == 1
-
-
-def test_bound_group_schedule():
-    m = te.var("m")
-    n = te.var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    x1 = te.compute(x.shape, lambda *i: x(*i) + 1, name="x1")
-    x2 = te.compute(x.shape, lambda *i: x1(*i) + 2, name="x2")
-    s = te.create_schedule(x2.op)
-    g = s.create_group(outputs=x1, inputs=x, include_inputs=True)
-    g.compute_at(s[x2], x2.op.axis[0])
-    assert s[x1].group == g
-    assert s[x].group == g
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    assert bounds[x.op.axis[0]].extent.value == 1
-    assert bounds[x.op.axis[1]].extent == n
-
-
-def test_bound_nest_group():
-    m = te.var("m")
-    n = te.var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    x1 = te.compute(x.shape, lambda *i: x(*i) + 1, name="x1")
-    x2 = te.compute(x.shape, lambda *i: x1(*i) + 2, name="x2")
-    s = te.create_schedule(x2.op)
-    g1 = s.create_group(outputs=x, inputs=x, include_inputs=True)
-    g2 = s.create_group(outputs=x1, inputs=x, include_inputs=True)
-    assert s[x].group == g1
-    assert s[x1].group == g2
-    g2.compute_at(s[x2], x2.op.axis[0])
-    g1.compute_at(s[x1], s[x1].op.axis[1])
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    assert bounds[x.op.axis[0]].extent.value == 1
-    assert bounds[x.op.axis[1]].extent.value == 1
-    assert bounds[x1.op.axis[0]].extent.value == 1
-    assert bounds[x1.op.axis[1]].extent == n
-
-
-def test_bound_nest_thread():
-    m = te.var("m")
-    A = te.placeholder((m), name="A")
-    A1 = te.compute((m,), lambda i: A[i], name="A1")
-    A2 = te.compute((m,), lambda i: A1[i] + 2, name="A2")
-    A3 = te.compute((m,), lambda i: A2[i] + 3, name="A3")
-
-    s = te.create_schedule(A3.op)
-    s[A2].set_scope("shared")
-    s[A1].set_scope("local")
-
-    block_x = te.thread_axis("blockIdx.x")
-    thread_x = te.thread_axis("threadIdx.x")
-    bx, tx = s[A3].split(A3.op.axis[0], factor=32)
-    s[A3].bind(bx, block_x)
-    s[A3].bind(tx, thread_x)
-    s[A2].compute_at(s[A3], tx)
-    _, xi = s[A2].split(A2.op.axis[0], nparts=1)
-    s[A2].bind(xi, thread_x)
-    s[A1].compute_at(s[A3], tx)
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    assert bounds[A1.op.axis[0]].extent.value == 1
-    assert bounds[A2.op.axis[0]].extent.value == 32
-    assert bounds[A3.op.axis[0]].extent == m
-
-
-def test_gemm_bound():
-    nn = 1024
-    n = tvm.runtime.convert(nn)
-    A = te.placeholder((n, n), name="A")
-    B = te.placeholder((n, n), name="B")
-    k = te.reduce_axis((0, n), name="k")
-    C = te.compute((n, n), lambda ii, jj: te.sum(A[ii, k] * B[jj, k], axis=k), name="CC")
-    # schedule
-    s = te.create_schedule(C.op)
-    xtile, ytile = 32, 32
-    scale = 8
-    num_thread = 8
-    block_factor = scale * num_thread
-    block_x = te.thread_axis("blockIdx.x")
-    thread_x = te.thread_axis("threadIdx.x")
-    block_y = te.thread_axis("blockIdx.y")
-    thread_y = te.thread_axis("threadIdx.y")
-
-    CC = s.cache_write(C, "local")
-    AA = s.cache_read(A, "shared", [CC])
-    BB = s.cache_read(B, "shared", [CC])
-    by, yi = s[C].split(C.op.axis[0], factor=block_factor)
-    bx, xi = s[C].split(C.op.axis[1], factor=block_factor)
-    s[C].reorder(by, bx, yi, xi)
-    s[C].bind(by, block_y)
-    s[C].bind(bx, block_x)
-    ty, yi = s[C].split(yi, nparts=num_thread)
-    tx, xi = s[C].split(xi, nparts=num_thread)
-    s[C].reorder(ty, tx, yi, xi)
-    s[C].bind(ty, thread_y)
-    s[C].bind(tx, thread_x)
-    yo, xo = CC.op.axis
-    s[CC].reorder(k, yo, xo)
-
-    s[CC].compute_at(s[C], tx)
-    s[AA].compute_at(s[CC], k)
-    s[BB].compute_at(s[CC], k)
-
-    ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread)
-    tx, xi = s[AA].split(xi, nparts=num_thread)
-    s[AA].bind(ty, thread_y)
-    s[AA].bind(tx, thread_x)
-
-    ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread)
-    tx, xi = s[BB].split(xi, nparts=num_thread)
-    s[BB].bind(ty, thread_y)
-    s[BB].bind(tx, thread_x)
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    assert bounds[BB.op.axis[0]].extent.value == 64
-    assert bounds[AA.op.axis[0]].extent.value == 64
-    assert bounds[CC.op.axis[0]].extent.value == 8
-    assert bounds[CC.op.axis[1]].extent.value == 8
-
-
-def test_bound_tensor_compute_op():
-    def intrin_test():
-        m1 = te.var("m1")
-        n1 = te.var("n1")
-        a = te.placeholder((m1, n1), name="a")
-        c = te.compute((1, n1), lambda i, j: a[0, j] + a[1, j] + a[2, j], name="c")
-
-        Ab = tvm.tir.decl_buffer(a.shape, name="Abuf", offset_factor=1)
-        Cb = tvm.tir.decl_buffer(c.shape, name="Cbuf", offset_factor=1)
-
-        def intrin_func(ins, outs):
-            aa = ins[0]
-            cc = outs[0]
-
-            def _body():
-                ib = tvm.tir.ir_builder.create()
-                ib.emit(
-                    tvm.tir.call_extern("int32", "test", cc.access_ptr("w"), aa.access_ptr("r"))
-                )
-                return ib.get()
-
-            return _body()
-
-        return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, c: Cb})
-
-    test_func = intrin_test()
-    A = te.placeholder((20, 20), name="A")
-    B = te.compute(A.shape, lambda i, j: A[i, j], name="B")
-    C = te.compute((10, 20), lambda i: test_func(B[i:10, 0:20]), name="C")
-    s = te.create_schedule(C.op)
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[B.op.axis[0]].extent.value == 10
-
-
-def test_bound_simplification_failure():
-    # Check that the bounds are not expanded
-    A = te.compute((2,), lambda j: j, "A")
-
-    def _check(B, A=A):
-        s = te.create_schedule(B.op)
-        s = s.normalize()
-        bounds = tvm.te.schedule.InferBound(s)
-        stmt = tvm.lower(s, [B, A], simple_mode=True)
-        if not bounds[A.op.axis[0]].extent.value <= 2:
-            print(stmt)
-            assert bounds[A.op.axis[0]].extent.value <= 2
-
-    tdiv = tvm.tir.truncdiv
-    # These are hard to simplify, moreover we don't simplify them
-    _check(te.compute((10,), lambda i: A[tvm.te.min(3 * i, 4 * i) + tvm.te.min(-3 * i, -2 * i)]))
-    _check(te.compute((10,), lambda i: A[tvm.te.min(3 * i, 4 * i) + tvm.te.max(-3 * i, -4 * i)]))
-    _check(te.compute((10,), lambda i: A[-2 * tdiv(i, 2) - tvm.te.min(i, 0 - i)]))
-    _check(te.compute((10,), lambda i: A[i + (0 - i)]))
-    # This would cause out of bounds, but we nevertheless include it
-    _check(te.compute((10,), lambda i: A[i]))
-
-
-def test_bound_block():
-    def _check(shape, expected, block_size=4):
-        N, C, H, W = shape
-        tail = C % block_size
-        chunks = C // block_size
-        if tail != 0:
-            chunks += 1
-        A = te.placeholder((N, C, H, W), name="A")
-        pad_value = tvm.tir.const(0, A.dtype)
-
-        def _reorder_data_nchw(*indices):
-            condition = []
-            condition.append(indices[1] == chunks - 1)
-            condition.append(indices[4] >= tail)
-            condition = tvm.tir.all(*condition)
-            return tvm.tir.if_then_else(
-                condition,
-                pad_value,
-                A[indices[0], indices[1] * block_size + indices[4], indices[2], indices[3]],
-            )
-
-        repack = te.compute((N, chunks, H, W, block_size), _reorder_data_nchw, name="repack")
-        B = te.compute(
-            (N, C, H, W),
-            lambda n, c, h, w: repack[n, c // block_size, h, w, c % block_size],
-            name="back_repack",
-        )
-        s = te.create_schedule([B.op])
-        bounds = tvm.te.schedule.InferBound(s)
-        # Block for intermediate compute function should be equal to 4 for all cases except than number of channels is less than 4
-        assert bounds[repack.op.axis[4]].extent.value == expected
-
-    _check((1, 4, 6, 6), 4)
-    _check((1, 7, 6, 6), 4)
-    _check((1, 3, 6, 6), 3)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/te/test_te_schedule_bound_inference_tiling.py b/tests/python/te/test_te_schedule_bound_inference_tiling.py
deleted file mode 100644
index 039fe08cd328..000000000000
--- a/tests/python/te/test_te_schedule_bound_inference_tiling.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-
-
-def test_bound_tile_mod():
-    def compute(M_tiles, N_tiles, factor, dtype):
-        # Algo
-        M = M_tiles * factor
-        N = N_tiles * factor
-
-        A = tvm.te.placeholder((N, M), name="A", dtype=dtype)
-        C = tvm.te.compute((N, M), lambda n, m: A[n, m], name="C")
-        s = tvm.te.create_schedule(C.op)
-
-        return s, A, C
-
-    def schedule(s, factor, padding, A, C):
-        C_local = s.cache_write(C, "local")
-
-        n, m = C.op.axis
-        bn, bm, ni, mi = s[C].tile(n, m, factor, factor)
-        nio, nii = s[C].split(ni, 2)
-        n = s[C].fuse(nii, mi)
-        C_shared = s.cache_write(C, "shared")
-        bn, bm, ni, mi = C_shared.op.axis
-        s[C_shared].storage_align(ni, factor * 2, padding)
-
-        n, m = s[C].op.axis
-        bn, bm, ni, mi = s[C].tile(n, m, factor, factor)
-        s[C].set_scope("global")
-        niio, niii = s[C].split(ni, 32)
-        s[C_shared].compute_at(s[C], niio)
-
-        return s
-
-    s, A, C = compute(2, 2, 128, "float16")
-    s = schedule(s, 128, 8, A, C)
-    bounds = tvm.te.schedule.InferBound(s)
-    check = bounds[s.stages[2].op.axis[2]].extent == 16
-    if not check:
-        print(tvm.lower(s, [A, C], simple_mode=True))
-    assert check
-
-
-if __name__ == "__main__":
-    test_bound_tile_mod()
diff --git a/tests/python/te/test_te_schedule_graph.py b/tests/python/te/test_te_schedule_graph.py
deleted file mode 100644
index 05ca9fdbf8a8..000000000000
--- a/tests/python/te/test_te_schedule_graph.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-
-
-def test_scan():
-    m = te.var("m")
-    n = te.var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    s_state = te.placeholder((m, n))
-    s_init = te.compute((1, n), lambda _, i: x[0, i], name="s_init")
-    x_trans = te.compute((m, n), lambda i, j: x[i, j] + 1, name="x_trans")
-    s_up1 = te.compute((m, n), lambda t, i: s_state[t - 1, i] + 1, name="up1")
-    s_update = te.compute((m, n), lambda t, i: s_up1[t, i] + x_trans[t, i], name="update")
-    s_scan = tvm.te.scan(s_init, s_update, s_state)
-
-    def test_getbody():
-        body = tvm.te.schedule.ScanGetBody(s_scan.op)
-        assert set(body) == set([s_scan.op, s_update.op, s_up1.op])
-
-    def test_attach_path():
-        s = te.create_schedule(s_scan.op)
-        s[x_trans].compute_at(s[s_update], s_update.op.axis[0])
-        apath = tvm.te.schedule.CreateAttachPath(s)
-        assert tuple(apath[s_update.op]) == tuple([s_scan.op.scan_axis])
-        assert tuple(apath[x_trans.op]) == tuple([s_update.op.axis[0], s_scan.op.scan_axis])
-
-    def test_fix_pt():
-        body = tvm.te.schedule.ScanGetBody(s_scan.op)
-        fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op)
-        assert fxpt[s_scan.spatial_axis_[0]].value != 0
-
-
-def test_scan_fix_point():
-    m = te.var("m")
-    n = te.var("n")
-    l = te.var("l")
-    x = te.compute((l, m, n), lambda *i: tvm.tir.const(1, "float32"), name="x")
-    s_state = te.placeholder((l, m, n))
-    s_init = te.compute((1, m, n), lambda _, i, j: x[0, i, j], name="s_init")
-
-    def test_scan0():
-        s_update = te.compute(
-            (l, m, n), lambda t, i, j: x[t, j, i] + s_state[t - 1, i, j], name="update"
-        )
-        s_scan = tvm.te.scan(s_init, s_update, s_state)
-        body = tvm.te.schedule.ScanGetBody(s_scan.op)
-        fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op)
-        assert fxpt[s_scan.op.spatial_axis_[0]].value == 1
-        assert fxpt[s_scan.op.spatial_axis_[1]].value == 1
-
-    def test_scan1():
-        s_update = te.compute(
-            (l, m, n), lambda t, i, j: x[t, j, i] + s_state[t - 1, j, i], name="update"
-        )
-        s_scan = tvm.te.scan(s_init, s_update, s_state)
-        body = tvm.te.schedule.ScanGetBody(s_scan.op)
-        fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op)
-        assert fxpt[s_scan.op.spatial_axis_[0]].value == 0
-        assert fxpt[s_scan.op.spatial_axis_[1]].value == 0
-
-    def test_scan3_not_exact_reach():
-        s_h1 = te.compute((l, n, m), lambda t, j, i: s_state[t - 1, i, j], name="h1")
-        s_h2 = te.compute((l, m, n), lambda t, i, j: s_state[t - 1, i, 10] * 2, name="h1")
-        s_update = te.compute(
-            (l, m, n), lambda t, i, j: s_h1[t, j, i] + s_h2[t, i, j], name="update"
-        )
-        s_scan = tvm.te.scan(s_init, s_update, s_state)
-        body = tvm.te.schedule.ScanGetBody(s_scan.op)
-        fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op)
-        assert fxpt[s_scan.op.spatial_axis_[0]].value == 1
-        assert fxpt[s_scan.op.spatial_axis_[1]].value == 0
-
-    def test_scan4_reach_other():
-        s_h1 = te.compute((l, n, m), lambda t, j, i: s_state[t - 1, j, j], name="h1")
-        s_h2 = te.compute((l, m, n), lambda t, i, j: s_state[t - 1, i, j] * 2, name="h1")
-        s_update = te.compute(
-            (l, m, n), lambda t, i, j: s_h1[t, j, i] + s_h2[t, i, j], name="update"
-        )
-        s_scan = tvm.te.scan(s_init, s_update, s_state)
-        fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op)
-        assert fxpt[s_scan.op.spatial_axis_[0]].value == 0
-        assert fxpt[s_scan.op.spatial_axis_[1]].value == 0
-
-    def test_scan5_multi_output():
-        m = te.var("m")
-        n = te.var("n")
-        x1 = te.placeholder((m, n))
-        s1 = te.placeholder((m, n))
-        x2 = te.placeholder((m, n))
-        s2 = te.placeholder((m, n))
-        s1_init = te.compute((1, n), lambda _, i: x1[0, i])
-        s2_init = te.compute((1, n), lambda _, i: x2[0, i])
-        s1_update = te.compute((m, n), lambda t, i: s1[t - 1, i] + x1[t, i])
-        s2_update = te.compute((m, n), lambda t, i: x2[t, i] + s2[t - 1, i])
-        r0, r1 = tvm.te.scan([s1_init, s2_init], [s1_update, s2_update], [s1, s2])
-        body = tvm.te.schedule.ScanGetBody(r0.op)
-        fxpt = tvm.te.schedule.ScanFixPointAnalysis(r0.op)
-        assert fxpt[r1.op.spatial_axis_[0]].value == 1
-
-    test_scan0()
-    test_scan1()
-    test_scan3_not_exact_reach()
-    test_scan4_reach_other()
-    test_scan5_multi_output()
-
-
-def test_create_read_graph():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j])
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3)
-
-    g = tvm.te.schedule.CreateReadGraph([A2.op])
-
-    assert g[A2.op][0] == A1
-    assert g[A1.op][0] == A
-    post_order = tvm.te.schedule.PostDFSOrder([A2.op], g)
-    assert post_order[0] == A.op
-    assert post_order[1] == A1.op
-
-
-if __name__ == "__main__":
-    test_scan()
-    test_create_read_graph()
-    test_scan_fix_point()
diff --git a/tests/python/te/test_te_schedule_lstm.py b/tests/python/te/test_te_schedule_lstm.py
deleted file mode 100644
index abdf81d3795d..000000000000
--- a/tests/python/te/test_te_schedule_lstm.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-
-
-def test_lstm_cell_inline():
-    num_step = 128
-    num_input = 256
-    num_hidden = 1152
-    batch_size = 4
-    # Global transition matrix
-    X = te.placeholder((num_step - 1, batch_size, num_input), name="X")
-    Wi2h = te.placeholder((4, num_hidden, num_input), name="Wi2h")
-    Wh2h = te.placeholder((4, num_hidden, num_hidden), name="Wh2h")
-    # h: output hidden state, c: cell state.
-    s_state_h = te.placeholder((num_step, batch_size, num_hidden))
-    s_state_c = te.placeholder((num_step, batch_size, num_hidden))
-    s_init_c = te.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_c")
-    s_init_h = te.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_h")
-    # LSTM transition
-    k = te.reduce_axis((0, num_input), name="ki2h")
-    s_i2h = te.compute(
-        (num_step, 4, batch_size, num_hidden),
-        lambda t, x, i, j: te.sum(X[t - 1, i, k] * Wi2h[x, j, k], axis=k),
-        name="s_i2h",
-    )
-    k = te.reduce_axis((0, num_hidden), name="ki2h")
-    s_h2h = te.compute(
-        (num_step, 4, batch_size, num_hidden),
-        lambda t, x, i, j: te.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k),
-        name="s_h2h",
-    )
-    # Gate rules
-    gates = te.compute(s_i2h.shape, lambda *i: s_i2h(*i) + s_h2h(*i), name="gates")
-    gshape = (num_step, batch_size, num_hidden)
-    in_gate = te.compute(gshape, lambda t, i, j: te.sigmoid(gates[t, 0, i, j]), name="in_gate")
-    in_transform = te.compute(
-        gshape, lambda t, i, j: te.tanh(gates[t, 1, i, j]), name="in_transform"
-    )
-    forget_gate = te.compute(
-        gshape, lambda t, i, j: te.sigmoid(gates[t, 2, i, j]), name="forget_gate"
-    )
-    out_gate = te.compute(gshape, lambda t, i, j: te.sigmoid(gates[t, 3, i, j]), name="out_gate")
-    next_c = te.compute(
-        gshape,
-        lambda t, i, j: forget_gate[t, i, j] * s_state_c[t - 1, i, j]
-        + in_gate[t, i, j] * in_transform[t, i, j],
-        name="next_c",
-    )
-    next_h = te.compute(
-        gshape, lambda t, i, j: out_gate[t, i, j] * te.tanh(next_c[t, i, j]), name="next_h"
-    )
-    update_c = te.compute(gshape, lambda *i: next_c(*i), name="update_c")
-    update_h = te.compute(gshape, lambda *i: next_h(*i), name="update_h")
-    # schedule
-    scan_h, scan_c = tvm.te.scan(
-        [s_init_h, s_init_c],
-        [update_h, update_c],
-        [s_state_h, s_state_c],
-        inputs=[X],
-        name="lstm_scan",
-    )
-    # schedule
-    s = te.create_schedule(scan_h.op)
-    # Inline gate computations
-    s[gates].compute_inline()
-    s[in_gate].compute_inline()
-    s[in_transform].compute_inline()
-    s[forget_gate].compute_inline()
-    s[out_gate].compute_inline()
-    # verify we can lower correctly
-    tvm.lower(s, [X, Wi2h, Wh2h, scan_h, scan_c])
-
-
-if __name__ == "__main__":
-    test_lstm_cell_inline()
diff --git a/tests/python/te/test_te_schedule_ops.py b/tests/python/te/test_te_schedule_ops.py
deleted file mode 100644
index 1ff0297539ce..000000000000
--- a/tests/python/te/test_te_schedule_ops.py
+++ /dev/null
@@ -1,695 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm.driver.build_module import schedule_to_module
-
-
-def test_const():
-    x = tvm.te.const(1, "int32")
-    assert x.dtype == "int32"
-    assert isinstance(x, tvm.tir.IntImm)
-
-
-def test_schedule0():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    s = te.create_schedule(A1.op)
-
-    mod = schedule_to_module(s, [A, A1])
-    assert isinstance(mod["main"], tvm.tir.PrimFunc)
-
-
-def test_schedule1():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-
-    s = te.create_schedule(A1.op)
-    xo, xi = s[A1].split(A1.op.axis[0], 8)
-    s[A1].pragma(xo, "auto_unroll_max_step", 10)
-
-    mod = schedule_to_module(s, [A, A1])
-    assert isinstance(mod["main"], tvm.tir.PrimFunc)
-
-
-def test_schedule2():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    xo, xi = s[A2].split(A2.op.axis[0], 8)
-    s[A1].compute_at(s[A2], xo)
-
-    mod = schedule_to_module(s, [A, A2])
-    assert isinstance(mod["main"], tvm.tir.PrimFunc)
-
-
-def test_schedule_scan():
-    m = te.var("m")
-    n = te.var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    s_state = te.placeholder((m, n))
-    s_init = te.compute((1, n), lambda _, i: x[0, i])
-    s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + x[t, i])
-    res = tvm.te.scan(s_init, s_update, s_state)
-
-    assert tuple(res.shape) == (m, n)
-    s = te.create_schedule(res.op)
-    s = s.normalize()
-    ir = tvm.lower(s, [s_state], simple_mode=True)
-    bounds = tvm.te.schedule.InferBound(s)
-    assert bounds[res.op.scan_axis].min.value == 1
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_inline_multi_reduce():
-    def argmax_comp(x, y):
-        idx = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])
-        val = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
-        return idx, val
-
-    def argmax_init(idx_typ, val_typ):
-        return tvm.tir.const(-1, idx_typ), tvm.te.min_value(val_typ)
-
-    argmax = te.comm_reducer(argmax_comp, argmax_init, name="argmax")
-    m = te.var("m")
-    n = te.var("n")
-    val = te.placeholder((m, n), name="val", dtype="float32")
-    val1 = te.compute((m, n), lambda i, j: val[i, j] + 1, name="val1")
-    val2 = te.compute((m, n), lambda i, j: te.exp(val1[i, j]), name="val2")
-    k = te.reduce_axis((0, n), "k")
-    T_idx, T_val = te.compute((m,), lambda i: argmax((k.var, val2[i, k]), axis=k), name="T")
-    s = te.create_schedule(T_idx.op)
-    s[val1].compute_inline()
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_auto_inline():
-    def elemwise():
-        m = te.var("m")
-        n = te.var("n")
-        A = te.placeholder((m, n), name="A")
-        B = te.placeholder((m, n), name="B")
-        C = te.placeholder((m, n), name="C")
-        T1 = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name="T1")
-        T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name="T2")
-
-        return te.create_schedule(T2.op), T1
-
-    def broadcast():
-        m = te.var("m")
-        n = te.var("n")
-        A = te.placeholder((1,), name="A")
-        B = te.placeholder((m, n), name="B")
-        C = te.placeholder((m, n), name="C")
-        T1 = te.compute((m, n), lambda i, j: A(0) * B(i, j), name="T1", tag="broadcast")
-        T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name="T2")
-
-        return te.create_schedule(T2.op), T1
-
-    def injective():
-        m = te.var("m")
-        n = te.var("n")
-        A = te.placeholder((m,), name="A")
-        B = te.placeholder((m, n), name="B")
-        C = te.placeholder((m, n), name="C")
-        T1 = te.compute((m, n), lambda i, j: A(i) * B(i, j), name="T1")
-        T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name="T2")
-
-        return te.create_schedule(T2.op), T1
-
-    def check_auto_inline(schedule_func, auto_inline_func):
-        s, T1 = schedule_func()
-        # before auto inline the attach type is AttachType.kGroupRoot
-        assert s[T1].attach_type == 1
-        auto_inline_func(s)
-        # after auto inline the attach type is AttachType.kInline
-        assert s[T1].attach_type == 2
-        s = s.normalize()
-        bounds = tvm.te.schedule.InferBound(s)
-        stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    check_auto_inline(elemwise, tvm.te.schedule.AutoInlineElemWise)
-    check_auto_inline(broadcast, tvm.te.schedule.AutoInlineBroadcast)
-    check_auto_inline(injective, tvm.te.schedule.AutoInlineInjective)
-
-
-def test_schedule_const_bound():
-    n = 128
-    A = te.placeholder((n,), name="A")
-    A1 = te.compute((n,), lambda i: A[i] + 1, name="A1")
-    s = te.create_schedule(A1.op)
-    xo, xi = s[A1].split(A1.op.axis[0], 8)
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_inline_mixed():
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    A1 = te.compute(A.shape, lambda *i: A(*i) + 1, name="A1")
-    A2 = te.compute(A.shape, lambda *i: A1(*i) + 2, name="A2")
-    C = te.compute((n,), lambda i: A2[i] + A1[i], name="C")
-
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=8)
-    s[A1].compute_at(s[C], xo)
-    s[A2].compute_inline()
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    def check(x):
-        if isinstance(x, tvm.tir.Call):
-            assert x.func != A2
-
-    tvm.tir.stmt_functor.post_order_visit(s[C].op.body[0], check)
-
-
-def test_scan_inline1():
-    m = te.var("m")
-    n = te.var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    s_state1 = te.placeholder((m, n))
-    s_state2 = te.placeholder((m, n))
-    s_init1 = te.compute((1, n), lambda _, i: x[0, i])
-    s_init2 = te.compute((1, n), lambda _, i: x[0, i])
-    s_x1 = te.compute((m, n), lambda t, i: s_state1[t - 1, i] + x[t, i], name="x1")
-    s_x2 = te.compute((m, n), lambda t, i: s_state2[t - 1, i] + 1, name="x2")
-    s_update1 = te.compute((m, n), lambda t, i: s_x1[t, i], "u1")
-    s_update2 = te.compute((m, n), lambda t, i: s_x2[t, i], "u2")
-    res1, res2 = tvm.te.scan([s_init1, s_init2], [s_update1, s_update2], [s_state1, s_state2])
-    s = te.create_schedule(res1.op)
-    s[s_x1].compute_inline()
-    stmt = tvm.lower(s, [x, res1, res2])
-
-
-def test_scan_inline2():
-    m = te.var("m")
-    n = te.var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    s_state1 = te.placeholder((m, n))
-    s_state2 = te.placeholder((m, n))
-    s_init1 = te.compute((1, n), lambda _, i: x[0, i])
-    s_init2 = te.compute((1, n), lambda _, i: x[0, i])
-    s_xx = te.compute((m, n), lambda t, i: s_state1[t - 1, i] + x[t, i], name="xx")
-    s_x1 = te.compute((m, n), lambda t, i: s_xx[t, i] + 1, name="x1")
-    s_x2 = te.compute((m, n), lambda t, i: s_xx[t, i] + s_state2[t - 1, 2], name="x2")
-    s_update1 = te.compute((m, n), lambda t, i: s_x1[t, i], "u1")
-    s_update2 = te.compute((m, n), lambda t, i: s_x2[t, i], "u2")
-    res1, res2 = tvm.te.scan([s_init1, s_init2], [s_update1, s_update2], [s_state1, s_state2])
-    s = te.create_schedule(res1.op)
-    s[s_xx].compute_inline()
-    s[s_x1].compute_inline()
-    s[s_x2].compute_inline()
-    stmt = tvm.lower(s, [x, res1, res2])
-
-
-def test_schedule_cache():
-    m = te.var("m")
-    n = te.var("n")
-    A = te.placeholder((m, n), name="A")
-    B = te.placeholder((m, n), name="B")
-    C = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name="C")
-
-    s = te.create_schedule(C.op)
-    AA = s.cache_read(A, "shared", readers=[C])
-    CC = s.cache_write(C, "shared")
-    s[AA].compute_at(s[CC], CC.op.axis[0])
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_schedule_middle_cache():
-    m = te.var("m")
-    n = te.var("n")
-    A = te.placeholder((m, n), name="A")
-    B = te.placeholder((m, n), name="B")
-
-    C = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name="C")
-    D = te.compute((m, n), lambda i, j: C(i, j), name="D")
-
-    s = te.create_schedule(D.op)
-    AA = s.cache_read(A, "local", readers=[C])
-    BB = s.cache_read(B, "local", readers=[C])
-    CC = s.cache_read(C, "local", readers=[D])
-    DD = s.cache_write(D, "local")
-    # s[AA].compute_at(s[CC], CC.op.axis[0])
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_schedule_cache_relayout1():
-    m = te.var("m")
-    n = te.var("n")
-    A = te.placeholder((m, n), name="A")
-    B = te.placeholder((m, n), name="B")
-    C = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name="C")
-
-    s = te.create_schedule(C.op)
-    s[C].reorder(C.op.axis[1], C.op.axis[0])
-    CC = s.cache_write(C, "global")
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_schedule_cache_relayout2():
-    m = te.var("m")
-    n = te.var("n")
-    A = te.placeholder((m * 4, n), name="A")
-    B = te.placeholder((m * 4, n), name="B")
-    C = te.compute(A.shape, lambda i, j: A(i, j) * B(i, j), name="C")
-    s = te.create_schedule(C.op)
-    x, y = C.op.axis
-    xo, xi = s[C].split(x, factor=4)
-    s[C].reorder(xo, y, xi)
-    CC = s.cache_write(C, "global")
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_schedule_cache_relayout3():
-    m = te.var("m")
-    n = te.var("n")
-    A = te.placeholder((m * 4, n), name="A")
-    B = te.placeholder((m * 4, n), name="B")
-    k = te.reduce_axis((0, n), "k")
-    C = te.compute((A.shape[0],), lambda i: te.sum(A(i, k) * B(i, k), axis=k), name="C")
-    s = te.create_schedule(C.op)
-    x = C.op.axis[0]
-    xo, xi = s[C].split(x, factor=4)
-    CC = s.cache_write(C, "global")
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_schedule_cache_relayout4():
-    def _compute(*indice):
-        return A(*indice) + 1, B(*indice) / 2
-
-    m = te.var("m")
-    n = te.var("n")
-    A = te.placeholder((m * 4, n), name="A")
-    B = te.placeholder((m * 4, n), name="B")
-    C1, C2 = te.compute(A.shape, _compute, name="C")
-    s = te.create_schedule([C1.op, C2.op])
-    C1_cache, C2_cache = s.cache_write([C1, C2], "local")
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def intrin_gemv(m, n):
-    w = te.placeholder((m, n), name="w")
-    x = te.placeholder((n,), name="x")
-    k = te.reduce_axis((0, n), name="k")
-    z = te.compute((m,), lambda i: te.sum(w[i, k] * x[k], axis=k), name="z")
-    Wb = tvm.tir.decl_buffer(
-        w.shape, w.dtype, name="W", offset_factor=16, strides=[te.var("ldw"), 1]
-    )
-
-    def intrin_func(ins, outs):
-        ww, xx = ins
-        zz = outs[0]
-        ww_ptr = ww.access_ptr("r")
-        xx_ptr = xx.access_ptr("r")
-        zz_ptr = zz.access_ptr("w")
-        body = tvm.tir.call_packed("gemm", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
-        reset = tvm.tir.call_packed("fill_zero", zz_ptr, n)
-        update = tvm.tir.call_packed("gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
-        return body, reset, update
-
-    buffer_params = {"data_alignment": 16, "offset_factor": 16}
-    return te.decl_tensor_intrin(
-        z.op, intrin_func, binds={w: Wb}, default_buffer_params=buffer_params
-    )
-
-
-def test_schedule_tensor_compute1():
-    # basic: split, reorder, tile
-    M, N, L = 2048, 1024, 512
-    factor, rfactor = 16, 16
-    A = te.placeholder((N // factor, L // rfactor, factor, rfactor), name="A")
-    B = te.placeholder((M, L // rfactor, rfactor), name="B")
-    k = te.reduce_axis((0, L // rfactor), name="k")
-
-    gemv = intrin_gemv(factor, rfactor)
-    C = te.compute(
-        (N, M // factor, factor),
-        lambda i, j: gemv(A[i, k, 0:factor, 0:factor], B[j, k, 0:rfactor], reduce_axis=k),
-        name="C",
-    )
-
-    s = te.create_schedule(C.op)
-    ai, aj, ax = s[C].op.axis
-    aio, aii = s[C].split(ai, 16)
-    s[C].reorder(aio, aj, aii)
-    aioo, ajo, aioi, aji = s[C].tile(aio, aj, 16, 4)
-
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def intrin_vadd(n, cache_read=False, cache_write=False):
-    scope_ubuf = "local"
-    dtype = "float32"
-    x = te.placeholder((n,), dtype=dtype, name="vx")
-    y = te.placeholder((n,), dtype=dtype, name="vy")
-    z = te.compute(x.shape, lambda i: x[i] + y[i], name="z")
-    s = te.create_schedule(z.op)
-
-    def create_buffer(t):
-        return tvm.tir.decl_buffer(
-            t.shape, t.dtype, name="W" + t.name, scope=scope_ubuf, offset_factor=16
-        )
-
-    binds = {}
-    if cache_read:
-        binds[x] = create_buffer(x)
-        binds[y] = create_buffer(y)
-    if cache_write:
-        binds[z] = create_buffer(z)
-
-    def intrin_func(ins, outs):
-        ib = tvm.tir.ir_builder.create()
-        ib.emit(
-            tvm.tir.call_extern(
-                outs[0].dtype,
-                "vadd",
-                ins[0].access_ptr("r"),
-                ins[1].access_ptr("r"),
-                outs[0].access_ptr("wr"),
-            )
-        )
-        return ib.get()
-
-    return te.decl_tensor_intrin(
-        z.op, intrin_func, binds=binds, default_buffer_params={"offset_factor": 16}
-    )
-
-
-def test_schedule_tensor_compute2():
-    # cache_read, cache_write
-    M = 1024
-    factor = 16
-    dtype = "float32"
-    scope_ubuf = "local"
-
-    A = te.placeholder((M // factor, factor), name="A", dtype=dtype)
-    B = te.placeholder((M // factor, factor), name="B", dtype=dtype)
-
-    vadd = intrin_vadd(factor, True, True)
-    C = te.compute((M // factor, factor), lambda i: vadd(A[i, 0:factor], B[i, 0:factor]), name="C")
-
-    s = te.create_schedule(C.op)
-    AL = s.cache_read(A, scope_ubuf, C)
-    BL = s.cache_read(B, scope_ubuf, C)
-    CL = s.cache_write(C, scope_ubuf)
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_schedule_tensor_compute3():
-    # compute_at
-    M = 1024
-    factor = 16
-    dtype = "float32"
-    A = te.placeholder((M // factor, factor), name="A", dtype=dtype)
-    B = te.placeholder((M // factor, factor), name="B", dtype=dtype)
-    Bi = te.compute((M // factor, factor), lambda i, j: B[i, j] + 5, name="Bi")
-
-    vadd = intrin_vadd(factor)
-    C = te.compute((M // factor, factor), lambda i: vadd(A[i, 0:factor], Bi[i, 0:factor]), name="C")
-    s = te.create_schedule(C.op)
-    s[Bi].compute_at(s[C], C.op.axis[0])
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_loop_dep_reduce():
-    X = te.placeholder(shape=(10,), name="x")
-
-    def f(n):
-        rv = te.reduce_axis((0, n))
-        return te.sum(X[rv], axis=rv)
-
-    Y = te.compute(X.shape, f, name="y")
-    s = te.create_schedule([Y.op])
-    f = tvm.build(s, [X, Y])
-
-
-def test_loop_dep_reduce_cache_write():
-    X = te.placeholder(shape=(10,), name="x")
-
-    def f(n):
-        rv = te.reduce_axis((0, n))
-        init = lambda dtype: tvm.tir.Select(n > 1, tvm.tir.const(0, dtype), n.astype(dtype))
-        sum = te.comm_reducer(lambda x, y: tvm.te.max(x + y, n.astype("float32")), init, name="sum")
-        return sum(X[rv], axis=rv)
-
-    Y = te.compute(X.shape, f, name="y")
-    s = te.create_schedule([Y.op])
-    s.cache_write(Y, "local")
-    f = tvm.build(s, [X, Y])
-
-
-def test_reduction_and_dummy_fuse_split():
-    n = 10
-    X = te.placeholder(shape=(n,), dtype="int32", name="X")
-    k = te.reduce_axis((0, n))
-    Y = te.compute((), lambda: te.sum(X[k], k), name="Y")
-    s = te.create_schedule([Y.op])
-    ax = s[Y.op].fuse(*Y.op.axis)
-    axo, axi = s[Y.op].split(ax, nparts=20)
-    f = tvm.build(s, [Y, X])
-
-    args = [tvm.nd.empty((), "int32")] + [tvm.nd.array(np.ones((n,), dtype="int32"))]
-    f(*args)
-    assert args[0].numpy() == n
-
-    n = 10
-    X = te.placeholder(shape=(n,), dtype="int32", name="X")
-    k = te.reduce_axis((0, n))
-    Y = te.compute((n,), lambda i: te.sum(X[k], k), name="Y")
-    s = te.create_schedule([Y.op])
-    ax = s[Y.op].fuse(*(list(Y.op.axis) + list(Y.op.reduce_axis)))
-    f = tvm.build(s, [Y, X])
-
-    args = [tvm.nd.array(np.ones((n,), dtype="int32"))] + [
-        tvm.nd.array(np.ones((n,), dtype="int32"))
-    ]
-    f(*args)
-    assert np.all(args[0].numpy() == n)
-
-
-def test_schedule_compute_inline():
-    shape = [10, 1024]
-    A = te.placeholder(shape, name="A")
-    B = te.placeholder(shape, name="B")
-    C = te.compute(shape, lambda *index: A(*index) + B(*index), name="C")
-
-    def _compute(*index):
-        return C(*index), C(*index) * B(*index)
-
-    F, E = te.compute(shape, _compute, name="F")
-
-    s = te.create_schedule([F.op, E.op])
-    AL = s.cache_read(A, "local", [C])
-    BL = s.cache_read(B, "local", [C, E])
-    CL = s.cache_write(C, "local")
-    FL, EL = s.cache_write([F, E], "local")
-    s[C].compute_inline()
-
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_local_stage_predicate():
-    m = 1
-    n = 3
-    p = 2
-    A = tvm.te.placeholder((m, n, p), name="A")
-    B = tvm.te.compute((m, n, p), lambda bi, bj, bk: A[bi, bj, bk], name="B")
-    C = tvm.te.compute((m, n, p), lambda ci, cj, ck: B[ci, cj, ck], name="C")
-    by = tvm.te.thread_axis("blockIdx.y")
-    tx = tvm.te.thread_axis("threadIdx.x")
-    vx = tvm.te.thread_axis("vthread")
-
-    def schedule(thread_tag, mem_scope):
-        s = tvm.te.create_schedule(C.op)
-        s[B].compute_at(s[C], s[C].op.axis[0])
-        s[B].set_scope(mem_scope)
-        bno, bni = s[B].split(s[B].op.axis[1], n)
-        bx = tvm.te.thread_axis("blockIdx.x")
-        s[C].bind(s[C].op.axis[0], bx)
-        s[C].bind(s[C].op.axis[1], thread_tag)
-        s[B].bind(bni, thread_tag)
-        return s
-
-    def collect_visit(stmt, f):
-        ret = []
-        tvm.tir.stmt_functor.post_order_visit(stmt, lambda x: ret.append(f(x)))
-        return ret
-
-    # local vs. threadIdx
-    s = schedule(tx, "local")
-    lowered_body = tvm.lower(s, [A, C])["main"].body
-    assert not any(collect_visit(lowered_body, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-    # local vs. vthread
-    s = schedule(vx, "local")
-    lowered_body = tvm.lower(s, [A, C])["main"].body
-    assert not any(collect_visit(lowered_body, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-    # shared vs. blockIdx
-    s = schedule(by, "shared")
-    lowered_body = tvm.lower(s, [A, C])["main"].body
-    assert not any(collect_visit(lowered_body, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-
-
-def test_local_stage_predicate2():
-    A = tvm.te.placeholder((128,), name="A")
-    B = tvm.te.compute((128,), lambda bi: A[bi] + 1, name="B")
-    C = tvm.te.compute((128,), lambda ci: B[ci] + 2, name="C")
-    s = tvm.te.create_schedule(C.op)
-    AA = s.cache_read(A, "local", [B])
-    s[B].set_scope("shared")
-    block_x = tvm.te.thread_axis("blockIdx.x")
-    thread_x = tvm.te.thread_axis((0, 32), "threadIdx.x")
-    oc, ic = s[C].split(s[C].op.axis[0], factor=64)
-    ooc, ioc = s[C].split(oc, factor=2)
-    oic, iic = s[C].split(ic, factor=32)
-    s[C].bind(ooc, block_x)
-    s[C].bind(iic, thread_x)
-    s[B].compute_at(s[C], ioc)
-    ob, ib = s[B].split(s[B].op.axis[0], factor=32)
-    s[B].bind(ib, thread_x)
-    s[AA].compute_root()
-    s[AA].compute_at(s[C], ooc)
-    oaa, iaa = s[AA].split(s[AA].op.axis[0], factor=32)
-    s[AA].bind(iaa, thread_x)
-    lowered_body = tvm.lower(s, [A, C])["main"].body
-
-    def collect_visit(stmt, f):
-        ret = []
-        tvm.tir.stmt_functor.post_order_visit(stmt, lambda x: ret.append(f(x)))
-        return ret
-
-    def visit_stmt(op):
-        if isinstance(op, tvm.tir.Allocate):
-            return op.extents[0].value == 97
-        return False
-
-    assert not any(collect_visit(lowered_body, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-    assert any(collect_visit(lowered_body, visit_stmt))
-
-
-def test_schedule_record_gemm():
-    with tvm.transform.PassContext(config={"te.keep_schedule_record": True}):
-        M, K, N = 1024, 1024, 1024
-        k = te.reduce_axis((0, K), "k")
-        A = te.placeholder((M, K), name="A")
-        B = te.placeholder((K, N), name="B")
-        C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C")
-        s = te.create_schedule(C.op)
-        # currently there are no other applied primitives
-        # size of schedule record is expected to be 1 (vanilla schedule)
-        assert len(s.schedule_record) == 1
-        # apply sequential optimizatoin primitives
-        block_size, factor = 32, 8
-        # tile -> split + split + reorder
-        mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], block_size, block_size)
-        ko, ki = s[C].split(k, factor=factor)
-        s[C].reorder(mo, ko, no, mi, ki, ni)
-        s[C].vectorize(ni)
-        s[C].parallel(mo)
-        assert len(s.schedule_record) == 8
-        # compare primitive names
-        expected_names = [
-            "vanilla",
-            "split",
-            "split",
-            "reorder",
-            "split",
-            "reorder",
-            "vectorize",
-            "parallel",
-        ]
-        for i in range(len(s.schedule_record)):
-            assert s.primitive_record[i] == expected_names[i]
-
-
-def test_schedule_record_misc():
-    s = te.create_schedule([])
-    # size of schedule record is expected to be 0 (no storing behavior)
-    assert len(s.schedule_record) == 0
-
-    with tvm.transform.PassContext(config={"te.keep_schedule_record": True}):
-        s = te.create_schedule([])
-        # size of schedule record is expected to be 1 (vanilla schedule)
-        assert len(s.schedule_record) == 1
-
-        stg = te.compute((), lambda *args: 0, name="empty_op")
-        s = te.create_schedule(stg.op)
-        # size of schedule record is expected to be 1 (vanilla schedule)
-        assert len(s.schedule_record) == 1
-
-
-if __name__ == "__main__":
-    test_loop_dep_reduce()
-    test_loop_dep_reduce_cache_write()
-    test_schedule_middle_cache()
-    test_inline_multi_reduce()
-    test_schedule_cache_relayout4()
-    test_schedule_cache_relayout3()
-    test_schedule_cache_relayout2()
-    test_schedule_cache_relayout1()
-    test_schedule_const_bound()
-    test_scan_inline1()
-    test_scan_inline2()
-    test_inline_mixed()
-    test_auto_inline()
-    test_schedule_scan()
-    test_schedule0()
-    test_schedule1()
-    test_schedule2()
-    test_schedule_cache()
-    test_schedule_tensor_compute1()
-    test_schedule_tensor_compute2()
-    test_schedule_tensor_compute3()
-    test_reduction_and_dummy_fuse_split()
-    test_schedule_compute_inline()
-    test_local_stage_predicate()
-    test_local_stage_predicate2()
-    test_schedule_record_gemm()
-    test_schedule_record_misc()
diff --git a/tests/python/te/test_te_schedule_postproc_rewrite_for_tensor_core.py b/tests/python/te/test_te_schedule_postproc_rewrite_for_tensor_core.py
deleted file mode 100644
index 83584ad56400..000000000000
--- a/tests/python/te/test_te_schedule_postproc_rewrite_for_tensor_core.py
+++ /dev/null
@@ -1,231 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-from tvm import topi
-import numpy as np
-import tvm.testing
-
-
-def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96):
-    A = te.placeholder((n, l), name="A", dtype="float16")
-    B = te.placeholder((l, m), name="B", dtype="float16")
-    k = te.reduce_axis((0, l), name="k")
-    C = te.compute(
-        (n, m), lambda i, j: te.sum(A[i, k].astype("float32") * B[k, j].astype("float32"), axis=k)
-    )
-    s = te.create_schedule(C.op)
-    y, x = s[C].op.axis
-    k = s[C].op.reduce_axis[0]
-
-    AA = s.cache_read(A, "shared", [C])
-    AL = s.cache_read(AA, "local", [C])
-    BB = s.cache_read(B, "shared", [C])
-    BL = s.cache_read(BB, "local", [C])
-    CL = s.cache_write(C, "local")
-
-    bx = 4
-    by = 32
-    step_k = 8
-    v = 4
-    TX = 8
-    TY = 1
-    tile_x = bx * TX
-    tile_y = by * TY
-    WX = min(warp_tile_m, tile_x)
-    tile_k = 16
-    vthread = 1
-
-    yo, ty = s[C].split(y, tile_y * vthread)
-    vy, ty = s[C].split(ty, tile_y)
-    ty, yi = s[C].split(ty, TY)
-
-    xo, xi = s[C].split(x, tile_x)
-    tz, xi = s[C].split(xi, WX)
-    tx, xi = s[C].split(xi, TX)
-    ko, ki = s[CL].split(k, step_k * tile_k)
-    kl, ki = s[CL].split(ki, tile_k)
-
-    s[C].reorder(yo, xo, tz, ty, tx, yi, xi)
-    s[C].bind(yo, te.thread_axis("blockIdx.y"))
-    s[C].bind(xo, te.thread_axis("blockIdx.x"))
-    s[C].bind(ty, te.thread_axis("threadIdx.y"))
-    s[C].bind(tz, te.thread_axis("threadIdx.z"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-    s[C].bind(vy, te.thread_axis((0, vthread), "vthread", name="vy"))
-    s[CL].compute_at(s[C], tx)
-    yo, xo = CL.op.axis
-    s[CL].reorder(ko, kl, ki, yo, xo)
-
-    s[AA].compute_at(s[CL], ko)
-    xo, xi = s[AA].split(s[AA].op.axis[1], factor=bx * v)
-    tz, tx = s[AA].split(xi, factor=(WX // TX) * v)
-    tx, vec = s[AA].split(tx, factor=v)
-    fused = s[AA].fuse(s[AA].op.axis[0], xo)
-    _, ty = s[AA].split(fused, factor=by)
-    s[AA].bind(ty, te.thread_axis("threadIdx.y"))
-    s[AA].bind(tz, te.thread_axis("threadIdx.z"))
-    s[AA].bind(tx, te.thread_axis("threadIdx.x"))
-    s[AA].vectorize(vec)
-
-    s[BB].compute_at(s[CL], ko)
-    xo, xi = s[BB].split(s[BB].op.axis[1], factor=bx * v)
-    tz, tx = s[BB].split(xi, factor=(WX // TX) * v)
-    tx, vec = s[BB].split(tx, factor=v)
-    fused = s[BB].fuse(s[BB].op.axis[0], xo)
-    _, ty = s[BB].split(fused, factor=by)
-    s[BB].bind(ty, te.thread_axis("threadIdx.y"))
-    s[BB].bind(tz, te.thread_axis("threadIdx.z"))
-    s[BB].bind(tx, te.thread_axis("threadIdx.x"))
-    s[BB].vectorize(vec)
-
-    s[AL].compute_at(s[CL], kl)
-    s[BL].compute_at(s[CL], kl)
-
-    s[CL].pragma(ko, "tensor_core")
-
-    func = tvm.build(s, [A, B, C], "cuda")
-
-    dev = tvm.cuda(0)
-    a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
-    b_np = np.random.uniform(size=(l, m)).astype(B.dtype)
-    c_np = np.zeros((n, m), dtype=np.float32)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(b_np, dev)
-    c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev)
-    func(a, b, c)
-    evaluator = func.time_evaluator(func.entry_name, dev, number=3)
-    print("gemm m=%d n=%d k=%d: %f ms" % (m, n, l, evaluator(a, b, c).mean * 1e3))
-
-    c_np = np.dot(a_np, b_np)
-    np.testing.assert_allclose(c_np, c.numpy(), rtol=1e-3)
-
-
-def tensor_core_batch_matmul(warp_tile_m=16, m=64, n=32, l=96, batch=2):
-    A = te.placeholder((batch, n, l), name="A", dtype="float16")
-    B = te.placeholder((batch, l, m), name="B", dtype="float16")
-    k = te.reduce_axis((0, l), name="k")
-    C = te.compute(
-        (batch, n, m), lambda b, i, j: te.sum((A[b, i, k] * B[b, k, j]).astype("float32"), axis=k)
-    )
-    s = te.create_schedule(C.op)
-    z, y, x = s[C].op.axis
-    k = s[C].op.reduce_axis[0]
-
-    AA = s.cache_read(A, "shared", [C])
-    AL = s.cache_read(AA, "local", [C])
-    BB = s.cache_read(B, "shared", [C])
-    BL = s.cache_read(BB, "local", [C])
-    CL = s.cache_write(C, "local")
-
-    bx = 2
-    by = 32
-    step_k = 8
-    v = 4
-    TX = 8
-    TY = 1
-    tile_x = bx * TX
-    tile_y = by * TY
-    WX = min(warp_tile_m, tile_x)
-    tile_k = 16
-    vthread = 1
-
-    yo, ty = s[C].split(y, tile_y * vthread)
-    vy, ty = s[C].split(ty, tile_y)
-    ty, yi = s[C].split(ty, TY)
-
-    xo, xi = s[C].split(x, tile_x)
-    tz, xi = s[C].split(xi, WX)
-    tx, xi = s[C].split(xi, TX)
-    ko, ki = s[CL].split(k, step_k * tile_k)
-    kl, ki = s[CL].split(ki, tile_k)
-
-    s[C].reorder(z, yo, xo, tz, ty, tx, yi, xi)
-    s[C].bind(z, te.thread_axis("blockIdx.z"))
-    s[C].bind(yo, te.thread_axis("blockIdx.y"))
-    s[C].bind(xo, te.thread_axis("blockIdx.x"))
-    s[C].bind(ty, te.thread_axis("threadIdx.y"))
-    s[C].bind(tz, te.thread_axis("threadIdx.z"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-    s[C].bind(vy, te.thread_axis((0, vthread), "vthread", name="vy"))
-    s[CL].compute_at(s[C], tx)
-    zo, yo, xo = CL.op.axis
-    s[CL].reorder(ko, kl, ki, zo, yo, xo)
-
-    s[AA].compute_at(s[CL], ko)
-    xo, xi = s[AA].split(s[AA].op.axis[2], factor=bx * v)
-    tz, tx = s[AA].split(xi, factor=(WX // TX) * v)
-    tx, vec = s[AA].split(tx, factor=v)
-    fused = s[AA].fuse(s[AA].op.axis[1], xo)
-    _, ty = s[AA].split(fused, factor=by)
-    s[AA].bind(ty, te.thread_axis("threadIdx.y"))
-    s[AA].bind(tz, te.thread_axis("threadIdx.z"))
-    s[AA].bind(tx, te.thread_axis("threadIdx.x"))
-    s[AA].vectorize(vec)
-
-    s[BB].compute_at(s[CL], ko)
-    xo, xi = s[BB].split(s[BB].op.axis[2], factor=bx * v)
-    tz, tx = s[BB].split(xi, factor=(WX // TX) * v)
-    tx, vec = s[BB].split(tx, factor=v)
-    fused = s[BB].fuse(s[BB].op.axis[1], xo)
-    _, ty = s[BB].split(fused, factor=by)
-    s[BB].bind(ty, te.thread_axis("threadIdx.y"))
-    s[BB].bind(tz, te.thread_axis("threadIdx.z"))
-    s[BB].bind(tx, te.thread_axis("threadIdx.x"))
-    s[BB].vectorize(vec)
-
-    s[AL].compute_at(s[CL], kl)
-    s[BL].compute_at(s[CL], kl)
-
-    s[CL].pragma(ko, "tensor_core")
-
-    func = tvm.build(s, [A, B, C], "cuda")
-
-    dev = tvm.cuda(0)
-    a_np = np.random.uniform(size=(batch, n, l)).astype(A.dtype)
-    b_np = np.random.uniform(size=(batch, l, m)).astype(B.dtype)
-    c_np = np.zeros((batch, n, m), dtype=np.float32)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(b_np, dev)
-    c = tvm.nd.array(np.zeros((batch, n, m), dtype=C.dtype), dev)
-    func(a, b, c)
-    evaluator = func.time_evaluator(func.entry_name, dev, number=3)
-    print(
-        "batch gemm m=%d n=%d k=%d batch=%d: %f ms"
-        % (m, n, l, batch, evaluator(a, b, c).mean * 1e3)
-    )
-
-    for bs in range(batch):
-        c_np[bs, :, :] = np.dot(a_np[bs, :, :], b_np[bs, :, :])
-    np.testing.assert_allclose(c_np, c.numpy(), rtol=1e-3)
-
-
-@tvm.testing.requires_tensorcore
-def test_tensor_core_matmul():
-    tensor_core_matmul(16)  # test with warp_tile 16x16x16
-    tensor_core_matmul(8)  # test with warp_tile 8x32x16
-    tensor_core_matmul(32)  # test with warp_tile 32x8x16
-
-
-@tvm.testing.requires_tensorcore
-def test_tensor_core_batch_matmul():
-    tensor_core_batch_matmul()
-
-
-if __name__ == "__main__":
-    test_tensor_core_matmul()
-    test_tensor_core_batch_matmul()
diff --git a/tests/python/te/test_te_schedule_tensor_core.py b/tests/python/te/test_te_schedule_tensor_core.py
deleted file mode 100644
index d86b05ad83f1..000000000000
--- a/tests/python/te/test_te_schedule_tensor_core.py
+++ /dev/null
@@ -1,461 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-import numpy as np
-from tvm.topi.testing import conv2d_nhwc_python
-import tvm.testing
-
-VERIFY = True
-
-
-def intrin_wmma_load_matrix(shape, scope):
-    n, m, l = shape
-    if scope == "wmma.matrix_a":
-        row, col = n, l
-    elif scope == "wmma.matrix_b":
-        row, col = l, m
-    A = te.placeholder((row, col), name="A", dtype="float16")
-    BA = tvm.tir.decl_buffer(
-        A.shape, A.dtype, scope="shared", data_alignment=32, offset_factor=row * col
-    )
-    C = te.compute((row, col), lambda i, j: A[i, j], name="C")
-    BC = tvm.tir.decl_buffer(
-        C.shape, C.dtype, scope=scope, data_alignment=32, offset_factor=row * col
-    )
-
-    def intrin_func(ins, outs):
-        ib = tvm.tir.ir_builder.create()
-
-        BA = ins[0]
-        BC = outs[0]
-        ib.emit(
-            tvm.tir.call_intrin(
-                "handle",
-                "tir.tvm_load_matrix_sync",
-                BC.data,
-                n,
-                m,
-                l,
-                BC.elem_offset // (row * col),
-                BA.access_ptr("r"),
-                col,
-                "row_major",
-            )
-        )
-        return ib.get()
-
-    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})
-
-
-def intrin_wmma_gemm(shape):
-    n, m, l = shape
-    A = te.placeholder((n, l), name="A", dtype="float16")
-    B = te.placeholder((l, m), name="B", dtype="float16")
-    k = te.reduce_axis((0, l), name="k")
-    C = te.compute(
-        (n, m),
-        lambda ii, jj: te.sum(A[ii, k].astype("float") * B[k, jj].astype("float"), axis=k),
-        name="C",
-    )
-    BA = tvm.tir.decl_buffer(
-        A.shape, A.dtype, name="BA", scope="wmma.matrix_a", data_alignment=32, offset_factor=n * l
-    )
-    BB = tvm.tir.decl_buffer(
-        B.shape, B.dtype, name="BB", scope="wmma.matrix_b", data_alignment=32, offset_factor=l * m
-    )
-    BC = tvm.tir.decl_buffer(
-        C.shape,
-        C.dtype,
-        name="BC",
-        scope="wmma.accumulator",
-        data_alignment=32,
-        offset_factor=n * m,
-    )
-
-    def intrin_func(ins, outs):
-        BA, BB = ins
-        (BC,) = outs
-
-        def init():
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_intrin(
-                    "handle",
-                    "tir.tvm_fill_fragment",
-                    BC.data,
-                    n,
-                    m,
-                    l,
-                    BC.elem_offset // (n * m),
-                    0.0,
-                )
-            )
-            return ib.get()
-
-        def update():
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_intrin(
-                    "handle",
-                    "tir.tvm_mma_sync",
-                    BC.data,
-                    BC.elem_offset // (n * m),
-                    BA.data,
-                    BA.elem_offset // (n * l),
-                    BB.data,
-                    BB.elem_offset // (l * m),
-                    BC.data,
-                    BC.elem_offset // (n * m),
-                )
-            )
-            return ib.get()
-
-        return update(), init(), update()
-
-    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC})
-
-
-def intrin_wmma_store_matrix(shape):
-    n, m, l = shape
-    A = te.placeholder((n, m), name="A", dtype="float32")
-    BA = tvm.tir.decl_buffer(
-        A.shape, A.dtype, scope="wmma.accumulator", data_alignment=32, offset_factor=n * m
-    )
-    C = te.compute((n, m), lambda i, j: A[i, j], name="C")
-    BC = tvm.tir.decl_buffer(
-        C.shape, C.dtype, scope="global", data_alignment=32, offset_factor=n * m
-    )
-
-    def intrin_func(ins, outs):
-        ib = tvm.tir.ir_builder.create()
-
-        BA = ins[0]
-        BC = outs[0]
-        ib.emit(
-            tvm.tir.call_intrin(
-                "handle",
-                "tir.tvm_store_matrix_sync",
-                BA.data,
-                n,
-                m,
-                l,
-                BA.elem_offset // (n * m),
-                BC.access_ptr("w"),
-                m,
-                "row_major",
-            )
-        )
-        return ib.get()
-
-    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})
-
-
-@tvm.testing.requires_tensorcore
-def test_tensor_core_batch_matmal():
-    batch_size = 4
-    n = 512
-    m, l = n, n
-    assert n % 32 == 0
-    assert m % 8 == 0
-    assert l % 16 == 0
-    nn, mm, ll = n // 32, m // 8, l // 16
-    A = te.placeholder((batch_size, nn, ll, 32, 16), name="A", dtype="float16")
-    B = te.placeholder((batch_size, ll, mm, 16, 8), name="B", dtype="float16")
-    k1 = te.reduce_axis((0, ll), name="k1")
-    k2 = te.reduce_axis((0, 16), name="k2")
-    C = te.compute(
-        (batch_size, nn, mm, 32, 8),
-        lambda b, i, j, ii, jj: te.sum(
-            A[b, i, k1, ii, k2].astype("float") * B[b, k1, j, k2, jj].astype("float"), axis=[k1, k2]
-        ),
-        name="Fragment_C",
-    )
-    s = te.create_schedule(C.op)
-
-    warp_size = 32
-    kernel_size = 16
-    block_row_warps = 2
-    block_col_warps = 4
-    warp_row_tiles = 4
-    warp_col_tiles = 2
-    chunk = 4
-
-    block_x = te.thread_axis("blockIdx.x")
-    block_y = te.thread_axis("blockIdx.y")
-    block_z = te.thread_axis("blockIdx.z")
-    thread_x = te.thread_axis("threadIdx.x")
-    thread_y = te.thread_axis("threadIdx.y")
-    thread_z = te.thread_axis("threadIdx.z")
-
-    AS = s.cache_read(A, "shared", [C])
-    BS = s.cache_read(B, "shared", [C])
-    AF = s.cache_read(AS, "wmma.matrix_a", [C])
-    BF = s.cache_read(BS, "wmma.matrix_b", [C])
-    CF = s.cache_write(C, "wmma.accumulator")
-
-    b, i, j, kernel_i, kernel_j = s[C].op.axis
-    i, ii = s[C].split(i, factor=warp_row_tiles)
-    block_i, i = s[C].split(i, factor=block_row_warps)
-    j, jj = s[C].split(j, factor=warp_col_tiles)
-    block_j, j = s[C].split(j, factor=block_col_warps)
-    s[C].reorder(block_i, block_j, i, j, ii, jj, kernel_i, kernel_j)
-    s[C].bind(b, block_z)
-    s[C].bind(block_i, block_x)
-    s[C].bind(block_j, block_y)
-    s[C].bind(i, thread_y)
-    s[C].bind(j, thread_z)
-
-    s[CF].compute_at(s[C], j)
-    b, warp_i, warp_j, _i, _j = s[CF].op.axis
-    k, _k = CF.op.reduce_axis
-    ko, ki = s[CF].split(k, factor=chunk)
-    s[CF].reorder(ko, ki, warp_i, warp_j, _i, _j, _k)
-
-    s[AF].compute_at(s[CF], ki)
-    s[BF].compute_at(s[CF], ki)
-
-    s[AS].compute_at(s[CF], ko)
-    b, xo, yo, xi, yi = AS.op.axis
-    tx, xo = s[AS].split(xo, nparts=block_row_warps)
-    ty, yo = s[AS].split(yo, nparts=block_col_warps)
-    t = s[AS].fuse(xi, yi)
-    to, ti = s[AS].split(t, nparts=warp_size)
-    s[AS].bind(tx, thread_y)
-    s[AS].bind(ty, thread_z)
-    s[AS].bind(to, thread_x)
-
-    s[BS].compute_at(s[CF], ko)
-    b, xo, yo, xi, yi = BS.op.axis
-    tx, xo = s[BS].split(xo, nparts=block_row_warps)
-    ty, yo = s[BS].split(yo, nparts=block_col_warps)
-    t = s[BS].fuse(xi, yi)
-    to, ti = s[BS].split(t, nparts=warp_size)
-    s[BS].bind(tx, thread_y)
-    s[BS].bind(ty, thread_z)
-    s[BS].bind(to, thread_x)
-
-    s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((32, 8, 16), "wmma.matrix_a"))
-    s[BF].tensorize(BF.op.axis[-2], intrin_wmma_load_matrix((32, 8, 16), "wmma.matrix_b"))
-    s[C].tensorize(kernel_i, intrin_wmma_store_matrix((32, 8, 16)))
-    s[CF].tensorize(_i, intrin_wmma_gemm((32, 8, 16)))
-
-    func = tvm.build(s, [A, B, C], "cuda")
-
-    dev = tvm.cuda(0)
-    a_np = np.random.uniform(size=(batch_size, nn, ll, 32, 16)).astype(A.dtype)
-    b_np = np.random.uniform(size=(batch_size, ll, mm, 16, 8)).astype(B.dtype)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(b_np, dev)
-    c = tvm.nd.array(np.zeros((batch_size, nn, mm, 32, 8), dtype=C.dtype), dev)
-    func(a, b, c)
-    evaluator = func.time_evaluator(func.entry_name, dev, number=3)
-    print("gemm with tensor core: %f ms" % (evaluator(a, b, c).mean * 1e3))
-
-    if VERIFY:
-        func(a, b, c)
-        a_np = a_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
-        b_np = b_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
-        c_np = c.numpy().transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
-        np.testing.assert_allclose(
-            c_np, np.matmul(a_np.astype(C.dtype), b_np.astype(C.dtype)), rtol=1e-4, atol=1e-4
-        )
-
-
-@tvm.testing.requires_tensorcore
-def test_tensor_core_batch_conv():
-    # The sizes of inputs and filters
-    batch_size = 32
-    height = 14
-    width = 14
-    in_channels = 32
-    out_channels = 64
-    kernel_h = 3
-    kernel_w = 3
-    pad_h = 1
-    pad_w = 1
-    stride_h = 1
-    stride_w = 1
-    block_size = 16
-
-    block_row_warps = 2
-    block_col_warps = 4
-    warp_row_tiles = 4
-    warp_col_tiles = 2
-    warp_size = 32
-    chunk = 2
-
-    # Input feature map: (N, H, W, IC, n, ic)
-    data_shape = (
-        batch_size // block_size,
-        height,
-        width,
-        in_channels // block_size,
-        block_size,
-        block_size,
-    )
-    # Kernel: (H, W, IC, OC, ic, oc)
-    kernel_shape = (
-        kernel_h,
-        kernel_w,
-        in_channels // block_size,
-        out_channels // block_size,
-        block_size,
-        block_size,
-    )
-
-    # Output feature map: (N, H, W, OC, n, oc)
-    output_shape = (
-        batch_size // block_size,
-        height,
-        width,
-        out_channels // block_size,
-        block_size,
-        block_size,
-    )
-
-    assert batch_size % block_size == 0
-    assert in_channels % block_size == 0
-    assert out_channels % block_size == 0
-
-    kh = te.reduce_axis((0, kernel_h), name="kh")
-    kw = te.reduce_axis((0, kernel_w), name="kw")
-    ic = te.reduce_axis((0, in_channels // block_size), name="ic")
-    ii = te.reduce_axis((0, block_size), name="ii")
-
-    # Algorithm
-    A = te.placeholder(data_shape, name="A", dtype="float16")
-    W = te.placeholder(kernel_shape, name="W", dtype="float16")
-    Apad = te.compute(
-        (
-            batch_size // block_size,
-            height + 2 * pad_h,
-            width + 2 * pad_w,
-            in_channels // block_size,
-            block_size,
-            block_size,
-        ),
-        lambda n, h, w, i, nn, ii: tvm.tir.if_then_else(
-            tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w < width),
-            A[n, h - pad_h, w - pad_w, i, nn, ii],
-            tvm.tir.const(0.0, "float16"),
-        ),
-        name="Apad",
-    )
-    Conv = te.compute(
-        output_shape,
-        lambda n, h, w, o, nn, oo: te.sum(
-            Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32")
-            * W[kh, kw, ic, o, ii, oo].astype("float32"),
-            axis=[ic, kh, kw, ii],
-        ),
-        name="Conv",
-    )
-
-    s = te.create_schedule(Conv.op)
-    s[Apad].compute_inline()
-
-    AS = s.cache_read(Apad, "shared", [Conv])
-    WS = s.cache_read(W, "shared", [Conv])
-    AF = s.cache_read(AS, "wmma.matrix_a", [Conv])
-    WF = s.cache_read(WS, "wmma.matrix_b", [Conv])
-    ConvF = s.cache_write(Conv, "wmma.accumulator")
-
-    block_x = te.thread_axis("blockIdx.x")
-    block_y = te.thread_axis("blockIdx.y")
-    block_z = te.thread_axis("blockIdx.z")
-    thread_x = te.thread_axis("threadIdx.x")
-    thread_y = te.thread_axis("threadIdx.y")
-    thread_z = te.thread_axis("threadIdx.z")
-
-    nc, hc, wc, oc, nnc, ooc = Conv.op.axis
-    block_k = s[Conv].fuse(hc, wc)
-    s[Conv].bind(block_k, block_z)
-    nc, nci = s[Conv].split(nc, factor=warp_row_tiles)
-    block_i, nc = s[Conv].split(nc, factor=block_row_warps)
-    oc, oci = s[Conv].split(oc, factor=warp_col_tiles)
-    block_j, oc = s[Conv].split(oc, factor=block_col_warps)
-    s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc)
-    s[Conv].bind(block_i, block_x)
-    s[Conv].bind(block_j, block_y)
-    s[Conv].bind(nc, thread_y)
-    s[Conv].bind(oc, thread_z)
-
-    s[ConvF].compute_at(s[Conv], oc)
-    n, h, w, o, nnf, oof = ConvF.op.axis
-    ko, ki = s[ConvF].split(ic, factor=chunk)
-    s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)
-
-    s[AF].compute_at(s[ConvF], kw)
-    s[WF].compute_at(s[ConvF], kw)
-
-    s[WS].compute_at(s[ConvF], kh)
-    s[AS].compute_at(s[ConvF], kh)
-
-    n, h, w, i, nn, ii = AS.op.axis
-    tx, xo = s[AS].split(n, nparts=block_row_warps)
-    ty, yo = s[AS].split(xo, nparts=block_col_warps)
-    t = s[AS].fuse(nn, ii)
-    to, ti = s[AS].split(t, factor=warp_size)
-    s[AS].bind(tx, thread_y)
-    s[AS].bind(ty, thread_z)
-    s[AS].bind(ti, thread_x)
-
-    kh, kw, ic, o, ii, oo = WS.op.axis
-    tx, xo = s[WS].split(o, nparts=block_row_warps)
-    ty, yo = s[WS].split(xo, nparts=block_col_warps)
-    t = s[WS].fuse(ii, oo)
-    to, ti = s[WS].split(t, nparts=warp_size)
-    s[WS].bind(tx, thread_y)
-    s[WS].bind(ty, thread_z)
-    s[WS].bind(to, thread_x)
-    s[WS].vectorize(ti)
-
-    s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_a"))
-    s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_b"))
-    s[Conv].tensorize(nnc, intrin_wmma_store_matrix((16, 16, 16)))
-    s[ConvF].tensorize(nnf, intrin_wmma_gemm((16, 16, 16)))
-
-    func = tvm.build(s, [A, W, Conv], "cuda")
-
-    dev = tvm.cuda(0)
-    a_np = np.random.uniform(size=data_shape).astype(A.dtype)
-    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
-    a = tvm.nd.array(a_np, dev)
-    w = tvm.nd.array(w_np, dev)
-    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), dev)
-    evaluator = func.time_evaluator(func.entry_name, dev, number=3)
-    print("conv2d with tensor core: %f ms" % (evaluator(a, w, c).mean * 1e3))
-
-    if VERIFY:
-        func(a, w, c)
-        a_np = a_np.transpose(0, 4, 1, 2, 3, 5).reshape(batch_size, height, width, in_channels)
-        w_np = w_np.transpose(0, 1, 2, 4, 3, 5).reshape(
-            kernel_h, kernel_w, in_channels, out_channels
-        )
-        c_np = (
-            c.numpy().transpose((0, 4, 1, 2, 3, 5)).reshape(batch_size, height, width, out_channels)
-        )
-        c_std = conv2d_nhwc_python(
-            a_np.astype(Conv.dtype), w_np.astype(Conv.dtype), (stride_h, stride_w), (pad_h, pad_w)
-        ).astype(Conv.dtype)
-        np.testing.assert_allclose(c_np, c_std, rtol=1e-4, atol=1e-4)
-
-
-if __name__ == "__main__":
-    test_tensor_core_batch_matmal()
-    test_tensor_core_batch_conv()
diff --git a/tests/python/te/test_te_schedule_tensorize.py b/tests/python/te/test_te_schedule_tensorize.py
deleted file mode 100644
index 419d3edb5c3d..000000000000
--- a/tests/python/te/test_te_schedule_tensorize.py
+++ /dev/null
@@ -1,392 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-from tvm.script import tir as T
-
-
-def intrin_vadd(xo, m, n):
-    x = te.placeholder((n,), name="vx")
-    y = te.placeholder((n,), name="vy")
-    if m % n == 0:
-        body = lambda i: x[i] + y[i]
-    else:
-        body = lambda i: tvm.tir.Select(
-            xo * n + i < m, x[i] + y[i], tvm.tir.const(0, dtype=x.dtype)
-        )
-    z = te.compute(x.shape, body, name="z")
-
-    def intrin_func(ins, outs):
-        xx, yy = ins
-        zz = outs[0]
-        # special handle needed to tackle tail loop part when m % n != 0
-        # here is tvm.min(n, m - xo * n)
-        return tvm.tir.call_packed("vadd", xx, yy, zz)
-
-    buffer_params = {"offset_factor": 16}
-    return te.decl_tensor_intrin(z.op, intrin_func, default_buffer_params=buffer_params)
-
-
-def intrin_gemv(m, n):
-    w = te.placeholder((m, n), name="w")
-    x = te.placeholder((n,), name="x")
-    k = te.reduce_axis((0, n), name="k")
-    z = te.compute((m,), lambda i: te.sum(w[i, k] * x[k], axis=k), name="z")
-    Wb = tvm.tir.decl_buffer(
-        w.shape, w.dtype, name="W", offset_factor=16, strides=[te.var("ldw"), 1]
-    )
-
-    def intrin_func(ins, outs):
-        ww, xx = ins
-        zz = outs[0]
-        ww_ptr = ww.access_ptr("r")
-        xx_ptr = xx.access_ptr("r")
-        zz_ptr = zz.access_ptr("w")
-        body = tvm.tir.call_packed("gemv", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
-        reset = tvm.tir.call_packed("fill_zero", zz_ptr, n)
-        update = tvm.tir.call_packed("gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
-        return body, reset, update
-
-    buffer_params = {"offset_factor": 16, "data_alignment": 16}
-    return te.decl_tensor_intrin(
-        z.op, intrin_func, binds={w: Wb}, default_buffer_params=buffer_params
-    )
-
-
-def intrin_gemv_no_reset(m, n):
-    w = te.placeholder((m, n), name="w")
-    x = te.placeholder((n,), name="x")
-    k = te.reduce_axis((0, n), name="k")
-    z = te.compute((m,), lambda i: te.sum(w[i, k] * x[k], axis=k), name="z")
-    Wb = tvm.tir.decl_buffer(
-        w.shape, w.dtype, name="W", offset_factor=16, strides=[te.var("ldw"), 1]
-    )
-
-    def intrin_func(ins, outs):
-        ww, xx = ins
-        zz = outs[0]
-        ww_ptr = ww.access_ptr("r")
-        xx_ptr = xx.access_ptr("r")
-        zz_ptr = zz.access_ptr("w")
-        body = tvm.tir.call_packed("gemv", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
-        update = tvm.tir.call_packed("gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
-        return body, None, update
-
-    buffer_params = {"offset_factor": 16, "data_alignment": 16}
-    return te.decl_tensor_intrin(
-        z.op, intrin_func, binds={w: Wb}, default_buffer_params=buffer_params
-    )
-
-
-def test_tensorize_vadd():
-    def add(m):
-        x = te.placeholder((m,), name="x")
-        y = te.placeholder((m,), name="y")
-        z = te.compute(x.shape, lambda i: x[i] + y[i], name="z")
-        return x, y, z
-
-    def check(m, factor):
-        x, y, z = add(m)
-        factor = T.int32(factor)
-        s = te.create_schedule(z.op)
-        xo, xi = s[z].split(z.op.axis[0], factor=factor)
-        vadd = intrin_vadd(xo, m, factor)
-        s[z].tensorize(xi, vadd)
-        s = s.normalize()
-        dom_map = tvm.te.schedule.InferBound(s)
-        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
-        out_dom, in_dom = finfer(s[z], dom_map)
-        tvm.ir.assert_structural_equal(out_dom[z.op.axis[0]].extent, factor)
-        tvm.ir.assert_structural_equal(out_dom[z.op.axis[0]].min, xo * factor)
-        tvm.ir.assert_structural_equal(in_dom.items()[0][1][0].extent, factor)
-        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
-        body = fmatch(s[z], out_dom, in_dom, vadd)
-        ana = tvm.arith.Analyzer()
-        tvm.ir.assert_structural_equal(ana.simplify(body[0]), ana.simplify(vadd.op.body[0]))
-        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
-        tvm.lower(s, [x, y, z])
-
-    def check_cache_write(m, factor):
-        x, y, z = add(m)
-        s = te.create_schedule(z.op)
-        _, _ = s[z].split(z.op.axis[0], factor=factor)
-
-        z_global = s.cache_write(z, "global")
-        xo, xi = z_global.op.axis
-
-        vadd = intrin_vadd(xo, m, factor)
-        s[z_global].tensorize(xi, vadd)
-        s = s.normalize()
-        dom_map = tvm.te.schedule.InferBound(s)
-        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
-        out_dom, in_dom = finfer(s[z_global], dom_map)
-        # outer loop var will be rebased, so min value is the new loop var and extent is 1
-        tvm.ir.assert_structural_equal(out_dom[xo].extent, T.int32(1))
-        assert isinstance(out_dom[xo].min, tvm.tir.Var)
-        assert xo.var.name == out_dom[xo].min.name
-
-        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
-        body = fmatch(s[z_global], out_dom, in_dom, vadd)[0]
-        ana = tvm.arith.Analyzer()
-        vars = tvm.runtime.convert({xo.var: out_dom[xo].min})
-        vadd_body = tvm.tir.stmt_functor.substitute(vadd.op.body[0], vars)
-        tvm.ir.assert_structural_equal(ana.simplify(body), ana.simplify(vadd_body))
-        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
-        tvm.lower(s, [x, y, z])
-
-    def check_compute_reuse():
-        x, y, z = add(32)
-
-        def _intrin_vadd():
-            def _intrin_func(ins, outs):
-                return tvm.tir.call_packed("vadd", ins[0], ins[1], outs[0])
-
-            return tvm.te.decl_tensor_intrin(z.op, _intrin_func)
-
-        s = tvm.te.create_schedule(z.op)
-        s[z].tensorize(z.op.axis[0], _intrin_vadd())
-        tvm.lower(s, [x, y, z])
-
-    check(128, 16)
-    check_cache_write(129, 16)
-    check_compute_reuse()
-
-
-def test_tensorize_matmul():
-    n = 1024
-    m = n
-    l = n
-    A = te.placeholder((n, l), name="A")
-    B = te.placeholder((m, l), name="B")
-    k = te.reduce_axis((0, l), name="k")
-    C = te.compute((n, m), lambda i, j: te.sum(B[j, k] * A[i, k], axis=k), name="C")
-
-    def check(factor):
-        s = te.create_schedule(C.op)
-        x, y = C.op.axis
-        yo, yi = s[C].split(y, factor=factor)
-        gemv = intrin_gemv(factor, l)
-        s[C].tensorize(yi, gemv)
-        s = s.normalize()
-        dom_map = tvm.te.schedule.InferBound(s)
-        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
-        out_dom, in_dom = finfer(s[C], dom_map)
-        tvm.ir.assert_structural_equal(out_dom[x].extent, T.int32(1))
-        tvm.ir.assert_structural_equal(out_dom[y].extent, factor)
-        tvm.ir.assert_structural_equal(out_dom[y].min, yo * factor)
-        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
-        body = fmatch(s[C], out_dom, in_dom, gemv)
-        ana = tvm.arith.Analyzer()
-
-        tvm.ir.assert_structural_equal(ana.simplify(body[0]), ana.simplify(gemv.op.body[0]))
-        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
-        tvm.lower(s, [A, B, C])
-
-    def check_rfactor(factor, rfactor):
-        s = te.create_schedule(C.op)
-        x, y = C.op.axis
-        rk = C.op.reduce_axis[0]
-        yo, yi = s[C].split(y, factor=factor)
-        ro, ri = s[C].split(rk, factor=rfactor)
-        s[C].reorder(yo, ro, yi, ri)
-        gemv = intrin_gemv(factor, rfactor)
-        s[C].tensorize(yi, gemv)
-        s = s.normalize()
-        dom_map = tvm.te.schedule.InferBound(s)
-        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
-        out_dom, in_dom = finfer(s[C], dom_map)
-        tvm.ir.assert_structural_equal(out_dom[x].extent, T.int32(1))
-        tvm.ir.assert_structural_equal(out_dom[y].extent, factor)
-        tvm.ir.assert_structural_equal(out_dom[y].min, yo * factor)
-        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
-        body = fmatch(s[C], out_dom, in_dom, gemv)
-        ana = tvm.arith.Analyzer()
-        tvm.ir.assert_structural_equal(ana.simplify(body[0]), ana.simplify(gemv.op.body[0]))
-        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
-        tvm.lower(s, [A, B, C])
-
-    def check_rfactor_no_reset(factor, rfactor):
-        s = te.create_schedule(C.op)
-        x, y = C.op.axis
-        rk = C.op.reduce_axis[0]
-        yo, yi = s[C].split(y, factor=factor)
-        ro, ri = s[C].split(rk, factor=rfactor)
-        s[C].reorder(yo, ro, yi, ri)
-        gemv = intrin_gemv_no_reset(factor, rfactor)
-        s[C].tensorize(yi, gemv)
-        s = s.normalize()
-        dom_map = tvm.te.schedule.InferBound(s)
-        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
-        out_dom, in_dom = finfer(s[C], dom_map)
-        tvm.ir.assert_structural_equal(out_dom[x].extent, T.int32(1))
-        tvm.ir.assert_structural_equal(out_dom[y].extent, factor)
-        tvm.ir.assert_structural_equal(out_dom[y].min, yo * factor)
-        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
-        body = fmatch(s[C], out_dom, in_dom, gemv)
-        ana = tvm.arith.Analyzer()
-        tvm.ir.assert_structural_equal(ana.simplify(body[0]), ana.simplify(gemv.op.body[0]))
-        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
-        tvm.lower(s, [A, B, C])
-
-    def check_rfactor_no_reset_multi_reduction(factor, rfactor):
-        s = te.create_schedule(C.op)
-        x, y = C.op.axis
-        rk = C.op.reduce_axis[0]
-        yo, yi = s[C].split(y, factor=factor)
-        ro, ri = s[C].split(rk, factor=rfactor)
-        roo, roi = s[C].split(ro, factor=2)
-        s[C].reorder(yo, roo, roi, yi, ri)
-        gemv = intrin_gemv_no_reset(factor, rfactor)
-        s[C].tensorize(yi, gemv)
-        s = s.normalize()
-        dom_map = tvm.te.schedule.InferBound(s)
-        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
-        out_dom, in_dom = finfer(s[C], dom_map)
-        tvm.ir.assert_structural_equal(out_dom[x].extent, T.int32(1))
-        tvm.ir.assert_structural_equal(out_dom[y].extent, factor)
-        tvm.ir.assert_structural_equal(out_dom[y].min, yo * factor)
-        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
-        body = fmatch(s[C], out_dom, in_dom, gemv)
-        ana = tvm.arith.Analyzer()
-        tvm.ir.assert_structural_equal(ana.simplify(body[0]), ana.simplify(gemv.op.body[0]))
-        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
-        tvm.lower(s, [A, B, C])
-
-    check(T.int32(16))
-    check_rfactor(T.int32(16), T.int32(16))
-    check_rfactor_no_reset(T.int32(16), T.int32(16))
-    check_rfactor_no_reset_multi_reduction(T.int32(16), T.int32(16))
-
-
-# This tests whether algorithm and intrinsics expressions are simplified
-# as much as possible first and then checked for equality. See Issue #696
-def test_tensorize_op():
-    idxd = tvm.tir.indexdiv
-    idxm = tvm.tir.indexmod
-
-    def op_intrin():
-        bh = 9
-        bw = 9
-        x = te.placeholder((5, 5), name="A")
-        y = te.compute((bh, bw), lambda i, j: x[idxd(j, 3) + idxm(i, 3), idxm(j, 3) + idxd(i, 3)])
-
-        def intrin_func(ins, outs):
-            (xx,) = ins
-            zz = outs[0]
-            return tvm.tir.call_packed("op", xx, zz)
-
-        return te.decl_tensor_intrin(y.op, intrin_func, default_buffer_params={"offset_factor": 2})
-
-    A = te.placeholder((5, 5), name="A")
-    B = te.compute((9, 9), lambda i, j: A[idxd(j, 3) + idxm(i, 3), idxm(j, 3) + idxd(i, 3)])
-    bt = op_intrin()
-    s = te.create_schedule(B.op)
-
-    x, y = B.op.axis
-    s[B].tensorize(x, bt)
-    s = s.normalize()
-    tvm.lower(s, [A, B])
-
-
-# This test asserts that tensorize does not have any effect on
-# TensorComputeOp operations
-def test_tensorize_tensor_compute_op():
-    # an intrinsic called "multivadd" whose definition (pattern)
-    # is a loop of another intrinsic called "vadd"
-    def intrin_multivadd(n):
-        n_a = te.var("n_a")
-        Ab = tvm.tir.decl_buffer((n,), "float32", strides=[n_a])
-
-        n_b = te.var("n_b")
-        Bb = tvm.tir.decl_buffer((n,), "float32", strides=[n_b])
-
-        n_c = te.var("n_c")
-        Cb = tvm.tir.decl_buffer((n,), "float32", strides=[n_c])
-
-        z = te.compute(
-            (n,),
-            lambda i: tvm.tir.call_extern(
-                "float32",
-                "vadd",
-                Ab.access_ptr("w", offset=n_a * i),
-                Bb.access_ptr("r", offset=n_b * i),
-                Cb.access_ptr("r", offset=n_c * i),
-            ),
-        )
-
-        # replace the pattern with the multivadd call. I need to figure out
-        # how to pass it the right parameters.
-        def intrin_func(ins, outs):
-            return tvm.tir.call_packed("multivadd")
-
-        return te.decl_tensor_intrin(z.op, intrin_func, name="multivadd")
-
-    def intrin_vadd(n):
-        dtype = "float32"
-        x = te.placeholder((n,), dtype=dtype, name="vx")
-        y = te.placeholder((n,), dtype=dtype, name="vy")
-        z = te.compute(x.shape, lambda i: x[i] + y[i], name="z")
-        s = te.create_schedule(z.op)
-
-        def create_buffer(t):
-            return tvm.tir.decl_buffer(t.shape, t.dtype, name="W" + t.name, offset_factor=16)
-
-        def intrin_func(ins, outs):
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_extern(
-                    "float32",
-                    "vadd",
-                    ins[0].access_ptr("r"),
-                    ins[1].access_ptr("r"),
-                    outs[0].access_ptr("wr"),
-                )
-            )
-            return ib.get()
-
-        return te.decl_tensor_intrin(
-            z.op, intrin_func, binds={x: create_buffer(x), y: create_buffer(y), z: create_buffer(z)}
-        )
-
-    # cache_read, cache_write
-    M = 1024
-    factor = 16
-    dtype = "float32"
-
-    A = te.placeholder((M // factor, factor), name="A", dtype=dtype)
-    B = te.placeholder((M // factor, factor), name="B", dtype=dtype)
-
-    vadd = intrin_vadd(factor)
-    C = te.compute((M // factor, factor), lambda i: vadd(A[i, 0:factor], B[i, 0:factor]), name="C")
-
-    s = te.create_schedule(C.op)
-    multivadd = intrin_multivadd(64)
-    s[C].tensorize(C.op.axis[0], multivadd)
-    s = s.normalize()
-    dom_map = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
-    # The loop that we tried to tensorize still exists in the code
-    # That means tensorize didn't work as expected
-    assert isinstance(stmt.body, tvm.tir.For)
-    assert stmt.body.loop_var.name == C.op.axis[0].var.name
-
-
-if __name__ == "__main__":
-    test_tensorize_vadd()
-    test_tensorize_matmul()
-    test_tensorize_op()
-    test_tensorize_tensor_compute_op()
diff --git a/tests/python/te/test_te_tensor.py b/tests/python/te/test_te_tensor.py
index 6958888e9bb6..31d6b1f4eb3a 100644
--- a/tests/python/te/test_te_tensor.py
+++ b/tests/python/te/test_te_tensor.py
@@ -128,91 +128,6 @@ def fidentity(t0, t1):
     T0, T1 = te.compute((m,), lambda i: mysum((idx[i, k], val[i, k]), axis=k, where=cond), name="T")
 
 
-def test_tensor_compute1():
-    m = 1024
-    factor = 16
-    dtype = "float32"
-
-    def intrin_vadd(n):
-        x = te.placeholder((n,))
-        y = te.placeholder((n,))
-        z = te.compute(x.shape, lambda i: x[i] + y[i])
-
-        def intrin_func(ins, outs):
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_extern(
-                    outs[0].dtype,
-                    "vadd",
-                    ins[0].access_ptr("r"),
-                    ins[1].access_ptr("r"),
-                    outs[0].access_ptr("wr"),
-                )
-            )
-            return ib.get()
-
-        return te.decl_tensor_intrin(z.op, intrin_func, default_buffer_params={"offset_factor": n})
-
-    vadd = intrin_vadd(factor)
-
-    A = te.placeholder((m // factor, factor), name="A", dtype=dtype)
-    B = te.placeholder((m // factor, factor), name="B", dtype=dtype)
-    C = te.compute((m // factor, factor), lambda i: vadd(A[i, 0:factor], B[i, 0:factor]))
-
-    s = te.create_schedule(C.op)
-    # check lowering with the CSE pass disabled as otherwise it would do some commoning
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-        stmt = tvm.lower(s, [A, B, C])["main"].body
-    assert isinstance(stmt.body, tvm.tir.Evaluate)
-
-
-def test_tensor_compute2():
-    M = 2048
-    N = 1024
-    L = 1024
-    factor = 16
-    factor1 = 32
-    factor2 = 32
-    dtype = "float32"
-
-    def intrin_gemm(m, n, l):
-        k = te.reduce_axis((0, l))
-        x = te.placeholder((m, l))
-        y = te.placeholder((n, l))
-        # in theory, no relation
-        z = te.compute((m, n), lambda i, j: te.sum(x[i][k] * y[j][k], axis=k))
-
-        def intrin_func(ins, outs):
-            x_ptr = ins[0].access_ptr("r")
-            y_ptr = ins[1].access_ptr("r")
-            z_ptr = outs[0].access_ptr("w")
-            body = tvm.tir.call_packed("gemv", x_ptr, y_ptr, z_ptr, m, n, l)
-            reset = tvm.tir.call_packed("fill_zero", z_ptr, m, n)
-            update = tvm.tir.call_packed("gemv_add", x_ptr, y_ptr, z_ptr, m, n, l)
-            return body, reset, update
-
-        return te.decl_tensor_intrin(z.op, intrin_func, default_buffer_params={"offset_factor": n})
-
-    vgemm = intrin_gemm(factor1, factor2, factor)
-
-    A = te.placeholder((M // factor1, L // factor, factor1, factor), name="A", dtype=dtype)
-    B = te.placeholder((N // factor2, L // factor, factor2, factor), name="B", dtype=dtype)
-    k = te.reduce_axis((0, L // factor), name="k")
-    C = te.compute(
-        (M // factor1, N // factor2, factor1, factor2),
-        lambda i, j: vgemm(
-            A[i, k, 0:factor1, 0:factor], B[j, k, 0:factor2, 0:factor], reduce_axis=k
-        ),
-    )
-
-    s = te.create_schedule(C.op)
-    # check lowering with the CSE pass disabled as otherwise it would do some commoning
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-        stmt = tvm.lower(s, [A, B, C])["main"].body
-    assert isinstance(stmt.body.body[0], tvm.tir.Evaluate)
-    assert isinstance(stmt.body.body[1].body, tvm.tir.Evaluate)
-
-
 def test_tensor_scan():
     m = te.size_var("m")
     n = te.size_var("n")
@@ -251,7 +166,7 @@ def test_extern():
     A = te.placeholder((m,), name="A")
 
     def extern_func(ins, outs):
-        assert isinstance(ins[0], tvm.te.schedule.Buffer)
+        assert isinstance(ins[0], tvm.tir.Buffer)
         return tvm.tir.call_packed("myadd", ins[0].data, outs[0].data, m)
 
     B = te.extern((m,), [A], extern_func)
@@ -264,7 +179,7 @@ def test_extern_multi_out():
     B = te.compute((m,), lambda i: A[i] * 10)
 
     def extern_func(ins, outs):
-        assert isinstance(ins[0], tvm.te.schedule.Buffer)
+        assert isinstance(ins[0], tvm.tir.Buffer)
         return tvm.tir.call_packed("myadd", ins[0].data, outs[0].data, outs[1].data, m)
 
     res = te.extern([A.shape, A.shape], [A, B], extern_func)
@@ -278,13 +193,7 @@ def test_tuple_inputs():
     A0 = te.placeholder((m, n), name="A0")
     A1 = te.placeholder((m, n), name="A1")
     T0, T1 = te.compute((m, n), lambda i, j: (A0[i, j] * 2, A1[i, j] * 3), name="T")
-    s = te.create_schedule(T0.op)
-
-    for i in range(len(T0.shape)):
-        assert T0.shape[i] == T1.shape[i]
-    assert T0.op == T1.op
-    assert T0.value_index == 0
-    assert T1.value_index == 1
+    te.create_prim_func([A0, A1, T0])  # smoke test: only checks that PrimFunc creation does not raise
 
 
 def test_tuple_with_different_deps():
@@ -295,25 +204,7 @@ def test_tuple_with_different_deps():
     B0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] * 2, A1[i, j] * 3), name="B")
     C = te.compute((m, n), lambda i, j: B0[i, j] + 4, name="C")
 
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=10)
-    s[B0.op].compute_at(s[C], xo)
-    sch = s.normalize()
-    bounds = tvm.te.schedule.InferBound(sch)
-    stmt = tvm.te.schedule.ScheduleOps(sch, bounds)
-
-    def get_B1_realize(x):
-        if (
-            isinstance(x, tvm.tir.ProducerRealize)
-            and x.producer.op == B1.op
-            and x.producer.value_index == 1
-        ):
-            ret.append(x)
-
-    ret = []
-    tvm.tir.stmt_functor.post_order_visit(stmt, get_B1_realize)
-
-    assert stmt.producer == C and len(ret) == 1
+    te.create_prim_func([A0, A1, C])
 
 
 def test_tensor_inputs():
@@ -322,91 +213,6 @@ def test_tensor_inputs():
     assert tuple(y.op.input_tensors) == (x,)
 
 
-def test_tensor_pool():
-    def intrin_pool():
-        A = te.placeholder((64, 16, 16), name="A")
-        kh = te.reduce_axis((0, 3), name="kh")
-        kw = te.reduce_axis((0, 3), name="kw")
-        P = te.compute(
-            (64, 14, 14),
-            lambda c, oh, ow: tvm.te.max(A[c, oh + kh, ow + kw], axis=[kh, kw]),
-            name="p",
-        )
-
-        def intrin_func(ins, outs):
-            dinp = ins[0]
-            dout = outs[0]
-            return tvm.tir.call_packed("op", dinp, dout)
-
-        return te.decl_tensor_intrin(P.op, intrin_func, default_buffer_params={"offset_factor": 1})
-
-    A = te.placeholder((1, 64, 16, 16), name="A")
-    P = pool2d(
-        data=A, kernel=(3, 3), stride=(1, 1), dilation=(1, 1), padding=(0, 0, 0, 0), pool_type="max"
-    )
-    s = te.create_schedule(P.op)
-    _, oh, _, _ = P.op.axis
-    intrin = intrin_pool()
-    s[P].tensorize(oh, intrin)
-    tvm.lower(s, [A, P])
-
-
-def test_tensor_scalar_mixed():
-    # test te with tensor and scalar
-    a = np.array(np.random.uniform(size=(10,)), "float32")
-    b = np.array(np.random.uniform(size=(1))[0], "float32")
-    c = np.array(np.random.uniform(size=(10,)), "float32")
-
-    @tvm.register_func("tvm.test_tensor_scalar_scale")
-    def my_scale(tensor, scalar, out):
-        out_np = tensor.numpy() * scalar.numpy()
-        tvm.nd.array(out_np).copyto(out)
-
-    A = te.placeholder(a.shape, name="A")
-    B = te.placeholder(b.shape, name="B")
-    C = te.extern(
-        a.shape,
-        [A, B],
-        lambda ins, outs: tvm.tir.call_packed(
-            "tvm.test_tensor_scalar_scale", ins[0], ins[1], outs[0]
-        ),
-        name="C",
-    )
-    s = te.create_schedule(C.op)
-    f = tvm.build(s, [A, B, C], "llvm")
-
-    ta = tvm.nd.array(a)
-    tb = tvm.nd.array(b)
-    tc = tvm.nd.array(c)
-    f(ta, tb, tc)
-    tvm.testing.assert_allclose(a * b, tc.numpy())
-
-
-def test_tensor_scalar():
-    # test te with scalar shape
-    a = np.array(np.random.uniform(size=(1))[0], "float32")
-    b = np.array(0.0, "float32")
-
-    @tvm.register_func("tvm.test_tensor_scalar_copy")
-    def mycopy(x, y):
-        x.copyto(y)
-
-    A = te.placeholder(a.shape, name="A")
-    B = te.extern(
-        a.shape,
-        [A],
-        lambda ins, outs: tvm.tir.call_packed("tvm.test_tensor_scalar_copy", ins[0], outs[0]),
-        name="B",
-    )
-    s = te.create_schedule(B.op)
-    f = tvm.build(s, [A, B], "llvm")
-
-    ta = tvm.nd.array(a)
-    tb = tvm.nd.array(b)
-    f(ta, tb)
-    tvm.testing.assert_allclose(ta.numpy(), tb.numpy())
-
-
 if __name__ == "__main__":
     test_tensor()
     test_rank_zero()
@@ -426,6 +232,3 @@ def mycopy(x, y):
     test_tuple_inputs()
     test_tuple_with_different_deps()
     test_tensor_inputs()
-    test_tensor_pool()
-    test_tensor_scalar_mixed()
-    test_tensor_scalar()
diff --git a/tests/python/te/test_te_transform_layout.py b/tests/python/te/test_te_transform_layout.py
deleted file mode 100644
index 375fe4a24d57..000000000000
--- a/tests/python/te/test_te_transform_layout.py
+++ /dev/null
@@ -1,592 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import functools
-import sys
-import pytest
-
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import te
-from tvm.tir.stmt_functor import post_order_visit
-from tvm.driver.build_module import schedule_to_module
-
-dtype = tvm.testing.parameter("int32")
-
-
-def flatten_all_indices(preflatten_shape):
-    def mapping(*indices):
-        output = 0
-        for index, size in zip(indices, preflatten_shape):
-            output = output * size + index
-        return [output]
-
-    return mapping
-
-
-def unpack_flattened_indices(preflatten_shape):
-    def mapping(i):
-        output = []
-        for dim in reversed(preflatten_shape):
-            output.append(i % dim)
-            i //= dim
-        return output[::-1]
-
-    return mapping
-
-
-def traverse(s, op, callback):
-    visited = set()
-
-    def _traverse(op):
-        if op in visited:
-            return
-        visited.add(op)
-        for tensor in op.input_tensors:
-            _traverse(tensor.op)
-        callback(op)
-
-    _traverse(op)
-
-
-class TestCompareAgainstExplicitReshape:
-    A_definition_style = tvm.testing.parameter(
-        "explicit_reshape",
-        "transform_layout",
-    )
-    B_definition_style = tvm.testing.parameter(
-        "explicit_reshape",
-        "transform_layout",
-    )
-
-    reordered_shape = tvm.testing.parameter((2, 3, 4))
-
-    @tvm.testing.fixture
-    def n_items(self, reordered_shape):
-        return functools.reduce(lambda x, y: x * y, reordered_shape, 1)
-
-    @tvm.testing.fixture
-    def fphysical_layout(self, reordered_shape):
-        return unpack_flattened_indices(reordered_shape)
-
-    @tvm.testing.fixture
-    def fcompute(self, A_definition_style, B_definition_style, reordered_shape, n_items, dtype):
-        assert A_definition_style in ["explicit_reshape", "transform_layout"]
-        assert B_definition_style in ["explicit_reshape", "transform_layout"]
-
-        def func():
-            if A_definition_style == "explicit_reshape":
-                A_input = te.placeholder(shape=reordered_shape, name="A_input", dtype=dtype)
-                A = te.compute(
-                    shape=(n_items,),
-                    fcompute=lambda i: A_input[
-                        i // (reordered_shape[1] * reordered_shape[2]),
-                        (i // reordered_shape[2]) % reordered_shape[1],
-                        i % reordered_shape[2],
-                    ],
-                    name="A",
-                )
-
-            elif A_definition_style == "transform_layout":
-                A = te.placeholder(shape=(n_items,), name="A", dtype=dtype)
-                A_input = A
-
-            B = te.compute(shape=A.shape, fcompute=lambda i: A[i], name="B")
-
-            if B_definition_style == "explicit_reshape":
-                B_output = te.compute(
-                    shape=reordered_shape,
-                    fcompute=lambda i, j, k: B[
-                        i * reordered_shape[1] * reordered_shape[2] + j * reordered_shape[2] + k
-                    ],
-                    name="B_output",
-                )
-            elif B_definition_style == "transform_layout":
-                B_output = B
-
-            return A_input, B_output
-
-        return func
-
-    @tvm.testing.fixture
-    def fschedule(self, A_definition_style, B_definition_style, fphysical_layout):
-        def func(outs):
-            outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-            s = te.create_schedule([x.op for x in outs])
-
-            def callback(op):
-                if (op.name == "A" and A_definition_style == "transform_layout") or (
-                    op.name == "B" and B_definition_style == "transform_layout"
-                ):
-                    s[op].transform_layout(fphysical_layout)
-
-            traverse(s, outs[0].op, callback)
-            return s
-
-        return func
-
-    @tvm.testing.parametrize_targets("llvm")
-    def test_external_reshape(
-        self, target, dev, fcompute, fschedule, n_items, reordered_shape, dtype
-    ):
-        A, B = fcompute()
-        s = fschedule(B)
-
-        func = tvm.build(s, [A, B], target=target, name="copy_reshape")
-
-        a_np = np.arange(n_items).reshape(reordered_shape).astype(dtype)
-        b_np = np.arange(n_items).reshape(reordered_shape).astype(dtype)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.empty(b_np.shape, dtype=dtype, device=dev)
-
-        func(a, b)
-
-        tvm.testing.assert_allclose(b.numpy(), b_np)
-
-    @tvm.testing.parametrize_targets("llvm")
-    def test_internal_reshape(self, target, dev, n_items, reordered_shape, dtype, fphysical_layout):
-        # The reshaping of the buffer gets flattened away in
-        # StorageFlatten.  Therefore, testing the behavior by running only
-        # ApplyLayoutTransforms.
-        logical_shape = (n_items,)
-        A = te.placeholder(logical_shape, name="A", dtype=dtype)
-        B = te.compute(shape=logical_shape, fcompute=lambda i: A[i], name="B")
-        C = te.compute(shape=logical_shape, fcompute=lambda i: B[i], name="C")
-
-        s = te.create_schedule(C.op)
-        s[B].transform_layout(fphysical_layout)
-
-        mod = schedule_to_module(s, [A, C])
-        body = mod["main"].body
-
-        def walk_buffer_interactions(stmt, callback):
-            buffer_classes = [
-                tvm.tir.BufferLoad,
-                tvm.tir.BufferStore,
-                tvm.tir.BufferRealize,
-            ]
-
-            def inner(node):
-                if (type(node) in buffer_classes) and node.buffer.name == "B":
-                    callback(node)
-
-            post_order_visit(stmt, inner)
-
-        # All references to the buffer are the same object
-        def check_references():
-            buffer_object = None
-
-            def inner(node):
-                nonlocal buffer_object
-                if buffer_object is None:
-                    buffer_object = node.buffer
-                else:
-                    assert node.buffer.same_as(buffer_object)
-
-            return inner
-
-        # The buffer has the expected shape.
-        def check_shape(expected_shape):
-            def inner(node):
-                assert tuple(node.buffer.shape) == expected_shape
-
-            return inner
-
-        # Before the transform, the buffer should be in the logical shape.
-        walk_buffer_interactions(body, check_references())
-        walk_buffer_interactions(body, check_shape(logical_shape))
-
-        mod = tvm.tir.transform.ApplyLayoutTransforms()(mod)
-        body = mod["main"].body
-
-        # After the transform, the buffer should be in the physical shape.
-        walk_buffer_interactions(body, check_references())
-        walk_buffer_interactions(body, check_shape(reordered_shape))
-
-
-class Test2DPhysicalLayout:
-    transform_A = tvm.testing.parameter(
-        "1d_A",
-        "2d_A",
-        "2d_rev_A",
-        "3d_A",
-    )
-    transform_B = tvm.testing.parameter(
-        "1d_B",
-        "2d_B",
-        "2d_rev_B",
-        "3d_B",
-    )
-
-    @staticmethod
-    def extract_logical_indices(stmt):
-        output = {}
-
-        # Since the for loops can be reordered by the layout
-        # transformation, identify the loop corresponding to each
-        # pre-transformation axis based on the iteration extent.
-        def callback(node):
-            if isinstance(node, tvm.tir.For):
-                output[node.loop_var] = node.extent.value
-
-        post_order_visit(stmt, callback)
-        return sorted(output, key=output.get)
-
-    def get_transform(self, name):
-        name = name[:-2]
-        if name == "1d":
-            return None
-        elif name == "2d":
-            return lambda i, j, k: [i, j, te.AXIS_SEPARATOR, k]
-        elif name == "2d_rev":
-            return lambda i, j, k: [k, j, te.AXIS_SEPARATOR, i]
-        elif name == "3d":
-            return lambda i, j, k: [i, te.AXIS_SEPARATOR, j, te.AXIS_SEPARATOR, k]
-        else:
-            raise ValueError(f"Unknown transformation: {name}")
-
-    def transform_indices(self, name, logical_shape, logical_index_vars):
-        name = name[:-2]
-
-        i, j, k = logical_index_vars
-
-        if name == "1d":
-            return [i * (logical_shape[1] * logical_shape[2]) + j * logical_shape[2] + k]
-        elif name == "2d":
-            return [i * logical_shape[1] + j, k]
-        elif name == "2d_rev":
-            return [k * logical_shape[1] + j, i]
-        elif name == "3d":
-            return [i, j, k]
-        else:
-            raise ValueError(f"Unknown transformation: {name}")
-
-    def test_2d_physical(self, dtype, transform_A, transform_B):
-        logical_shape = (2, 3, 4)
-        A = te.placeholder(shape=logical_shape, dtype=dtype, name="A")
-        B = te.compute(shape=A.shape, fcompute=lambda i, j, k: A[i, j, k], name="B")
-
-        s = te.create_schedule(B.op)
-
-        func = self.get_transform(transform_A)
-        if func:
-            s[A].transform_layout(func)
-
-        func = self.get_transform(transform_B)
-        if func:
-            s[B].transform_layout(func)
-
-        # If the two buffers are accessed with the same indices, CSE
-        # will replace them with a Let binding.  Since this makes it
-        # harder to test what the transformed indices are, disabling
-        # the CSE pass for this test.
-        with tvm.transform.PassContext(disabled_pass=["tir.CommonSubexprElimTIR"]):
-            mod = tvm.lower(s, [A, B])
-
-        logical_index_vars = self.extract_logical_indices(mod["main"].body)
-        expected_indices_A = self.transform_indices(transform_A, logical_shape, logical_index_vars)
-        expected_indices_B = self.transform_indices(transform_B, logical_shape, logical_index_vars)
-
-        def callback(node):
-            if type(node) in [tvm.tir.BufferLoad, tvm.tir.BufferStore]:
-                name = node.buffer.name
-                if name == "A":
-                    expected_indices = expected_indices_A
-                elif name == "B":
-                    expected_indices = expected_indices_B
-                else:
-                    raise RuntimeError(f"Unexpected buffer: {name}")
-
-                tvm.ir.assert_structural_equal(expected_indices, node.indices)
-
-        post_order_visit(mod["main"].body, callback)
-
-
-class TestTransformedSchedules:
-    logical_shape = tvm.testing.parameter((4, 6, 40))
-
-    transform_names = [
-        None,
-        "reverse",
-        "flatten_all",
-        "factor_last_by_4",
-    ]
-
-    transform_A = tvm.testing.parameter(by_dict={f"A_{t}": t for t in transform_names})
-    transform_B = tvm.testing.parameter(
-        by_dict={f"B_{t}": t for t in transform_names if t is not None}
-    )
-
-    after_transform = tvm.testing.parameter(None)
-
-    def make_transform(self, logical_shape, transform_name):
-        if transform_name is None:
-            return lambda *indices: indices
-        elif transform_name == "reverse":
-            return lambda *indices: indices[::-1]
-        elif transform_name == "flatten_all":
-            return flatten_all_indices(logical_shape)
-        elif transform_name == "factor_last_by_4":
-            return lambda *indices, n: [*indices, n // 4, n % 4]
-        else:
-            raise NotImplementedError(f"Unknown transformation {transform_name}")
-
-    def make_transformed_shape(self, logical_shape, transform_name):
-        if transform_name is None:
-            return logical_shape
-        elif transform_name == "reverse":
-            return logical_shape[::-1]
-        elif transform_name == "flatten_all":
-            num_elements = functools.reduce(lambda x, y: x * y, logical_shape, 1)
-            return [num_elements]
-        elif transform_name == "factor_last_by_4":
-            *indices, n = logical_shape
-            return [*indices, n // 4, 4]
-        else:
-            raise NotImplementedError(f"Unknown transformation {transform_name}")
-
-    @tvm.testing.fixture
-    def expected_loop_order(self, logical_shape, transform_B, after_transform):
-        shape = self.make_transformed_shape(logical_shape, transform_B)
-
-        if after_transform == "reorder":
-            shape = shape[::-1]
-
-        elif after_transform == "split":
-            shape = [
-                *shape[:-1],
-                2,
-                shape[-1] // 2,
-            ]
-
-        elif after_transform == "fuse":
-            fused_size = shape[0] if transform_B == "flatten_all" else shape[0] * shape[1]
-            shape = [fused_size, *shape[2:]]
-
-        return shape
-
-    @tvm.testing.fixture
-    def schedule(self, logical_shape, dtype, transform_A, transform_B, after_transform):
-        A = te.placeholder(shape=logical_shape, dtype=dtype, name="A")
-        B = te.compute(shape=A.shape, fcompute=lambda i, j, k: A[i, j, k], name="B")
-
-        s = te.create_schedule(B.op)
-
-        if transform_A:
-            s[A].transform_layout(self.make_transform(logical_shape, transform_A))
-
-        iter_vars = s[B].transform_layout(self.make_transform(logical_shape, transform_B))
-        iter_vars = list(iter_vars)
-
-        if after_transform == "reorder":
-            s[B].reorder(*iter_vars[::-1])
-
-        elif after_transform == "split":
-            s[B].split(iter_vars[-1], nparts=2)
-
-        elif after_transform == "fuse":
-            to_fuse = iter_vars[:2]
-            s[B].fuse(*iter_vars[:2])
-
-        return {
-            "schedule": s,
-            "tensors": [A, B],
-            "iter_vars": iter_vars,
-        }
-
-    def compare_tir_loop_order(self, stmt, expected_loop_order):
-        def collect_loops(node):
-            output = []
-
-            def callback(node):
-                if isinstance(node, tvm.tir.For):
-                    output.append(node)
-
-            post_order_visit(node, callback)
-            return output[::-1]
-
-        loops = collect_loops(stmt)
-        loop_order = [loop.extent for loop in loops]
-
-        np.testing.assert_array_equal(loop_order, expected_loop_order)
-
-    def test_tir_loop_order(self, schedule, expected_loop_order):
-        func = tvm.lower(schedule["schedule"], schedule["tensors"])["main"]
-        self.compare_tir_loop_order(func.body, expected_loop_order)
-
-    def test_te_loop_order(self, schedule, expected_loop_order):
-        s = schedule["schedule"]
-        A, B = schedule["tensors"]
-        iter_vars = schedule["iter_vars"]
-
-        # No reduction axis, so all leaf_iter_vars are over the data
-        # array, and should have the new iteration variables.
-        extents = [int(iter_var.dom.extent) for iter_var in s[B].leaf_iter_vars]
-        np.testing.assert_array_equal(extents, expected_loop_order)
-
-        # layout_transform should return the new iteration variables.
-        extents = [int(iter_var.dom.extent) for iter_var in iter_vars]
-        np.testing.assert_array_equal(extents, expected_loop_order)
-
-    @pytest.mark.parametrize("after_transform", ["reorder", "split", "fuse"])
-    def test_use_transformed_axes(
-        self, schedule, expected_loop_order, transform_A, transform_B, after_transform
-    ):
-        s = schedule["schedule"]
-        A, B = schedule["tensors"]
-
-        func = tvm.lower(s, [A, B])["main"]
-        self.compare_tir_loop_order(func.body, expected_loop_order)
-
-
-class TestTransformCache:
-    A_size = tvm.testing.parameter(16)
-
-    transform_A = tvm.testing.parameter(by_dict={"transformA": True, "": False})
-    transform_B = tvm.testing.parameter(by_dict={"transformB": True, "": False})
-    cache_A = tvm.testing.parameter(by_dict={"cacheA": True, "": False})
-    cache_B = tvm.testing.parameter(by_dict={"cacheB": True, "": False})
-
-    @tvm.testing.fixture
-    def schedule_args(self, target, A_size, transform_A, transform_B, cache_A, cache_B, dtype):
-        A = te.placeholder(shape=[A_size], dtype=dtype, name="A")
-        B = te.compute(A.shape, lambda i: A[i], name="B")
-        s = te.create_schedule(B.op)
-
-        requires_thread_bind = "gpu" in tvm.target.Target(target).keys
-        thread_x = te.thread_axis("threadIdx.x")
-        thread_y = te.thread_axis("threadIdx.y")
-        thread_z = te.thread_axis("threadIdx.z")
-
-        if cache_A:
-            AA = s.cache_read(A, "shared", [B])
-            if requires_thread_bind:
-                s[AA].bind(AA.op.axis[0], thread_x)
-
-        if cache_B:
-            BB = s.cache_write(B, "shared")
-            if requires_thread_bind:
-                s[BB].bind(BB.op.axis[0], thread_y)
-
-        if transform_A:
-            A_axis = s[A].transform_layout(lambda i: [i // 4, i % 4])
-
-        if transform_B:
-            B_axis = s[B].transform_layout(lambda i: [i // 4, i % 4])
-        else:
-            B_axis = B.op.axis
-
-        if requires_thread_bind:
-            s[B].bind(B_axis[0], thread_z)
-
-        return [s, [A, B]]
-
-    @tvm.testing.fixture
-    def ref_data(self, A_size, dtype, transform_A, transform_B):
-        a_np = (100 * np.random.uniform(size=A_size)).astype(dtype)
-        b_np = a_np
-
-        if transform_A:
-            a_np = a_np.reshape((-1, 4))
-
-        if transform_B:
-            b_np = b_np.reshape((-1, 4))
-
-        return a_np, b_np
-
-    def test_lower(self, schedule_args):
-        tvm.lower(*schedule_args)
-
-    def test_execute(self, target, dev, schedule_args, ref_data, dtype):
-        func = tvm.build(*schedule_args, target=target)
-
-        a_np, b_np = ref_data
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.empty(b_np.shape, dtype=dtype, device=dev)
-
-        func(a, b)
-
-        if "int" in dtype:
-            np.testing.assert_equal(b.numpy(), b_np)
-        else:
-            tvm.testing.assert_allclose(b.numpy(), b_np)
-
-
-def test_transform_with_reduction():
-    # To trigger this failure mode, the computation must use a
-    # reduction axis,
-    A = te.placeholder([16, 32, 64], dtype="float32", name="A")
-    k = te.reduce_axis((0, A.shape[-1]), name="k")
-    B = te.compute(A.shape[:-1], lambda i, j: te.sum(A[i, j, k], axis=[k]))
-    s = te.create_schedule(B.op)
-
-    # And the output of the computation must have a layout
-    # transformation applied.
-    s[B].transform_layout(lambda i, j: [j, i])
-
-    # When present, the failure occurred during tvm.lower, during the
-    # call to `tvm::te::PassDownBitMaskOr`.
-    tvm.lower(s, [A, B])
-
-
-shape, transform = tvm.testing.parameters(
-    ([1, 8], lambda n, i: [i, n]),
-    ([1, 1, 8], lambda i, j, k: [j, te.AXIS_SEPARATOR, i, k]),
-    ([1, 1, 8], lambda i, j, k: [i, te.AXIS_SEPARATOR, j, k]),
-)
-
-
-def test_size_one_buffer(shape, transform):
-    # This test is to catch a failure mode that occurred if a
-    # transformation were applied to a te.compute buffer, and one of
-    # the dimensions of the buffer was 1.  Prior to bugfix,
-    # arith::DetectIterMap would fold the variable as a constant,
-    # causing an error when attempting to solve for the variable using
-    # arith::InverseAffineIterMap.
-
-    dtype = "int8"
-    A = te.placeholder(shape, dtype, name="A")
-    B = te.compute(
-        shape=A.shape,
-        fcompute=lambda *indices: A[indices].astype(dtype),
-        name="B",
-    )
-    s = te.create_schedule(B.op)
-
-    # If layout transformation is on the output buffer, and any
-    # dimension of the output buffer is 1, failure occurs in
-    # CheckFusePattern.
-    s[B].transform_layout(transform)
-
-
-def test_non_divisible_transform_raises_error():
-    A = te.placeholder([1, 3, 8, 8])
-    B = te.compute(A.shape, lambda *indices: A[indices])
-    s = te.create_schedule(B.op)
-
-    transform = lambda n, c, h, w: [n, c // 4, h, w, c % 4]
-    # Error occurs here, because the transformation would introduce
-    # padding.  Padded transforms are supported in TIR-based
-    # schedules.
-    with pytest.raises(tvm.TVMError):
-        s[B].transform_layout(transform)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/tir-analysis/test_tir_analysis_usedef.py b/tests/python/tir-analysis/test_tir_analysis_usedef.py
deleted file mode 100644
index 940355e1415c..000000000000
--- a/tests/python/tir-analysis/test_tir_analysis_usedef.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-import tvm
-from tvm import te
-
-
-@pytest.mark.xfail
-def test_loop_dependent_allocate():
-    N = te.size_var("N")
-    A = te.placeholder((2 * N,), "float32", "A")
-    C = te.compute((N,), lambda i: A[2 * i] + A[i + 1], name="C")
-    s = te.create_schedule(C.op)
-    AA = s.cache_read(A, "local", [C])
-    s[AA].compute_at(s[C], s[C].op.axis[0])
-    # this line should fail due to IRUseDefAnalysis sees an allocate statement
-    # referencing undefined variable
-    tvm.lower(s, [A, C])
-
-
-if __name__ == "__main__":
-    test_loop_dependent_allocate()
diff --git a/tests/python/tir-analysis/test_tir_analysis_verify_gpu_code.py b/tests/python/tir-analysis/test_tir_analysis_verify_gpu_code.py
deleted file mode 100644
index 45a8a8138bd5..000000000000
--- a/tests/python/tir-analysis/test_tir_analysis_verify_gpu_code.py
+++ /dev/null
@@ -1,434 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test gpu code verifier"""
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.testing
-
-
-def get_verify_pass(valid, **kwargs):
-    def _fverify(f, *_):
-        valid[0] = tvm.tir.analysis.verify_gpu_code(f, kwargs)
-        return f
-
-    return tvm.tir.transform.prim_func_pass(_fverify, opt_level=0)
-
-
-@tvm.testing.requires_gpu
-def test_shared_memory():
-    def check_shared_memory(storage_scope, dtype):
-        N = 1024
-        M = 128
-
-        tvm_type = tvm.runtime.DataType(dtype)
-        type_size = tvm_type.bits // 8 * tvm_type.lanes
-
-        A = te.placeholder((N,), name="A", dtype=dtype)
-        B = te.compute((N,), lambda i: A[i], name="B")
-
-        s = te.create_schedule([B.op])
-        AA = s.cache_read(A, storage_scope, [B])
-        o, i = s[B].split(s[B].op.axis[0], M)
-        s[AA].compute_at(s[B], o)
-        s[B].bind(o, te.thread_axis("blockIdx.x"))
-        s[B].bind(i, te.thread_axis("threadIdx.x"))
-
-        # shared memory usage: M * sizeof(dtype) Bytes
-        # thread usage: M
-
-        for target in ["opencl", "cuda"]:
-            if not tvm.testing.device_enabled(target):
-                continue
-            valid = [None]
-            with tvm.transform.PassContext(
-                config={
-                    "tir.add_lower_pass": [
-                        (
-                            2,
-                            get_verify_pass(
-                                valid,
-                                max_shared_memory_per_block=type_size * M - 1,
-                                max_threads_per_block=M,
-                            ),
-                        )
-                    ]
-                }
-            ):
-                tvm.build(s, [A, B], target)
-            assert not valid[0]
-
-            with tvm.transform.PassContext(
-                config={
-                    "tir.add_lower_pass": [
-                        (
-                            2,
-                            get_verify_pass(
-                                valid,
-                                max_shared_memory_per_block=type_size * M,
-                                max_threads_per_block=M,
-                            ),
-                        )
-                    ]
-                }
-            ):
-                tvm.build(s, [A, B], target)
-            assert valid[0]
-
-    check_shared_memory("shared", "float32")
-    check_shared_memory("shared", "int8x4")
-    check_shared_memory("shared.dyn", "float32")
-
-
-@tvm.testing.requires_gpu
-def test_local_memory():
-    N = 1024
-    M = 128
-
-    A = te.placeholder((N,), name="A", dtype="float32")
-    B = te.compute((N,), lambda i: A[i], name="B")
-
-    s = te.create_schedule([B.op])
-    AA = s.cache_read(A, "local", [B])
-    o, i = s[B].split(s[B].op.axis[0], M)
-    s[AA].compute_at(s[B], o)
-    s[B].bind(o, te.thread_axis("blockIdx.x"))
-
-    # local memory usage: M * 4B
-    # thread usage: M
-
-    for target in ["opencl", "cuda"]:
-        if not tvm.testing.device_enabled(target):
-            continue
-
-        valid = [None]
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [
-                    (
-                        2,
-                        get_verify_pass(
-                            valid, max_local_memory_per_block=4 * M - 1, max_threads_per_block=1
-                        ),
-                    )
-                ]
-            }
-        ):
-            tvm.build(s, [A, B], target)
-        assert not valid[0]
-
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [
-                    (
-                        2,
-                        get_verify_pass(
-                            valid, max_local_memory_per_block=4 * M, max_threads_per_block=1
-                        ),
-                    )
-                ]
-            }
-        ):
-            tvm.build(s, [A, B], target)
-        assert valid[0]
-
-
-@tvm.testing.requires_gpu
-def test_num_thread():
-    N = 1024
-    M = 128
-
-    A = te.placeholder((N,), name="A", dtype="float32")
-    B = te.compute((N,), lambda i: A[i], name="B")
-
-    s = te.create_schedule([B.op])
-    o, i = s[B].split(s[B].op.axis[0], M)
-
-    s[B].bind(o, te.thread_axis("threadIdx.x"))
-    s[B].bind(i, te.thread_axis("threadIdx.y"))
-
-    # shared memory usage: 0
-    # thread usage: N
-
-    for target in ["opencl", "cuda"]:
-        if not tvm.testing.device_enabled(target):
-            continue
-
-        valid = [None]
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [
-                    (
-                        2,
-                        get_verify_pass(
-                            valid, max_shared_memory_per_block=0, max_threads_per_block=N - 1
-                        ),
-                    )
-                ]
-            }
-        ):
-            tvm.build(s, [A, B], target)
-        assert not valid[0]
-
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [
-                    (
-                        2,
-                        get_verify_pass(
-                            valid, max_shared_memory_per_block=0, max_threads_per_block=N
-                        ),
-                    )
-                ]
-            }
-        ):
-            tvm.build(s, [A, B], target)
-        assert valid[0]
-
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [
-                    (
-                        2,
-                        get_verify_pass(
-                            valid,
-                            max_shared_memory_per_block=0,
-                            max_threads_per_block=N,
-                            max_thread_y=M - 1,
-                        ),
-                    )
-                ]
-            }
-        ):
-            tvm.build(s, [A, B], target)
-        assert not valid[0]
-
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [
-                    (
-                        2,
-                        get_verify_pass(
-                            valid,
-                            max_shared_memory_per_block=0,
-                            max_threads_per_block=N,
-                            max_thread_y=M,
-                        ),
-                    )
-                ]
-            }
-        ):
-            tvm.build(s, [A, B], target)
-        assert valid[0]
-
-
-@tvm.testing.requires_gpu
-def test_multiple_kernels():
-    N = 1024
-
-    A = te.placeholder((N, N), name="A")
-    B = te.compute((N, N), lambda i, j: A[i, j])
-    C = te.compute((N, N), lambda i, j: B[i, j])
-
-    s = te.create_schedule([C.op])
-
-    s[C].bind(s[C].op.axis[1], te.thread_axis("threadIdx.x"))
-    s[B].bind(s[B].op.axis[1], te.thread_axis("threadIdx.x"))
-
-    # shared memory usage: 0
-    # thread usage: N
-
-    for target in ["opencl", "cuda"]:
-        if not tvm.testing.device_enabled(target):
-            continue
-
-        valid = [None]
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [
-                    (
-                        2,
-                        get_verify_pass(
-                            valid, max_shared_memory_per_block=0, max_threads_per_block=N - 1
-                        ),
-                    )
-                ]
-            }
-        ):
-            tvm.build(s, [A, C], target)
-        assert not valid[0]
-
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [
-                    (
-                        2,
-                        get_verify_pass(
-                            valid, max_shared_memory_per_block=0, max_threads_per_block=N
-                        ),
-                    )
-                ]
-            }
-        ):
-            tvm.build(s, [A, C], target)
-        assert valid[0]
-
-
-@tvm.testing.requires_gpu
-def test_wrong_bind():
-    N = 1024
-
-    A = te.placeholder((N, N - 1), name="A")
-    B = te.compute((N, N - 1), lambda i, j: A[i, j])
-
-    s = te.create_schedule([B.op])
-
-    # bind a thread axis to two loop axes with different lengths
-    s[B].bind(s[B].op.axis[0], te.thread_axis("threadIdx.x"))
-    s[B].bind(s[B].op.axis[1], te.thread_axis("threadIdx.x"))
-
-    for target in ["opencl", "cuda"]:
-        if not tvm.testing.device_enabled(target):
-            continue
-
-        valid = [None]
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [(2, get_verify_pass(valid, max_threads_per_block=N * N))]
-            }
-        ):
-            tvm.build(s, [A, B], target)
-        assert not valid[0]
-
-
-@tvm.testing.requires_gpu
-def test_vectorize():
-    N = 1024
-
-    A = te.placeholder((N, N), name="A")
-    B = te.compute((N, N), lambda i, j: A[i, j])
-
-    s = te.create_schedule([B.op])
-
-    i, j = s[B].op.axis
-
-    s[B].bind(i, te.thread_axis("blockIdx.x"))
-    jo, ji = s[B].split(j, factor=64)
-    s[B].bind(jo, te.thread_axis("threadIdx.x"))
-    s[B].vectorize(ji)
-
-    for target in ["opencl", "cuda"]:
-        if not tvm.testing.device_enabled(target):
-            continue
-
-        valid = [None]
-        with tvm.transform.PassContext(
-            config={"tir.add_lower_pass": [(2, get_verify_pass(valid, max_vector_bytes=16))]}
-        ):
-            tvm.lower(s, [A, B])
-        assert not valid[0]
-
-
-@tvm.testing.requires_gpu
-def test_vectorize_half():
-    N = 1024
-
-    A = te.placeholder((N, N), name="A", dtype="float16")
-    B = te.compute((N, N), lambda i, j: A[i, j])
-
-    s = te.create_schedule([B.op])
-
-    i, j = s[B].op.axis
-
-    s[B].bind(i, te.thread_axis("blockIdx.x"))
-    jo, ji = s[B].split(j, factor=8)
-    s[B].bind(jo, te.thread_axis("threadIdx.x"))
-    s[B].vectorize(ji)
-
-    for target in ["opencl", "cuda"]:
-        if not tvm.testing.device_enabled(target):
-            continue
-
-        valid = [None]
-        with tvm.transform.PassContext(
-            config={"tir.add_lower_pass": [(2, get_verify_pass(valid, max_vector_bytes=16))]}
-        ):
-            tvm.lower(s, [A, B])
-        assert valid[0]
-
-
-@tvm.testing.requires_gpu
-def test_vectorize_strided():
-    N = 1024
-
-    A = te.placeholder((N, N), name="A", dtype="float16")
-    B = te.compute((N, N), lambda i, j: A[j, i])
-
-    s = te.create_schedule([B.op])
-
-    i, j = s[B].op.axis
-
-    s[B].bind(i, te.thread_axis("blockIdx.x"))
-    jo, ji = s[B].split(j, factor=8)
-    s[B].vectorize(ji)
-
-    for target in ["opencl", "cuda"]:
-        if not tvm.testing.device_enabled(target):
-            continue
-
-        valid = [None]
-        with tvm.transform.PassContext(
-            config={"tir.add_lower_pass": [(2, get_verify_pass(valid, max_vector_bytes=16))]}
-        ):
-            tvm.lower(s, [A, B])
-        assert not valid[0]
-
-
-@tvm.testing.requires_gpu
-def test_vthread():
-    N = 1024
-
-    A = te.placeholder((N, 16), name="A")
-    B = te.compute((N, 16), lambda i, j: A[i, j])
-
-    s = te.create_schedule([B.op])
-
-    s[B].bind(s[B].op.axis[0], te.thread_axis("blockIdx.x"))
-    s[B].bind(s[B].op.axis[1], te.thread_axis("vthread"))
-
-    for target in ["opencl", "cuda"]:
-        if not tvm.testing.device_enabled(target):
-            continue
-
-        valid = [None]
-
-        for phase in [1, 2]:
-            with tvm.transform.PassContext(
-                config={"tir.add_lower_pass": [(phase, get_verify_pass(valid, max_vthread=16))]}
-            ):
-                tvm.build(s, [A, B], target)
-            assert valid[0]
-
-            with tvm.transform.PassContext(
-                config={"tir.add_lower_pass": [(phase, get_verify_pass(valid, max_vthread=15))]}
-            ):
-                tvm.build(s, [A, B], target)
-            assert not valid[0]
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/tir-analysis/test_tir_analysis_verify_memory.py b/tests/python/tir-analysis/test_tir_analysis_verify_memory.py
deleted file mode 100644
index 4c89ff1185f7..000000000000
--- a/tests/python/tir-analysis/test_tir_analysis_verify_memory.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-import pytest
-from tvm import te
-import tvm.testing
-
-# The following DLDeviceType/TVMDeviceExtType values
-# are originally defined in dlpack.h and c_runtime_api.h.
-gpu_devices = ["cuda", "opencl", "metal", "vulkan"]
-other_devices = ["llvm", "ext_dev"]
-
-
-# All computations are bound.
-# So VerifyMemory pass is expected to succeed.
-#
-@tvm.testing.uses_gpu
-def test_verify_memory_all_bind():
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.compute(A.shape, lambda i: A[i] + 1.0, name="B")
-
-    # B is bound to threads.
-    s = te.create_schedule(B.op)
-    bx, tx = s[B].split(B.op.axis[0], factor=64)
-    s[B].bind(bx, te.thread_axis("blockIdx.x"))
-    s[B].bind(tx, te.thread_axis("threadIdx.x"))
-
-    mod = tvm.lower(s, [A, B])
-
-    for dev_type in gpu_devices + other_devices:
-        if tvm.testing.device_enabled(dev_type):
-            binded_mod = tvm.tir.transform.Apply(
-                lambda f: f.with_attr("target", tvm.target.Target(dev_type))
-            )(mod)
-            tvm.tir.transform.VerifyMemory()(binded_mod)
-
-
-# Computations are not bound.
-# So VerifyMemory pass fails when device type is GPU.
-#
-@tvm.testing.uses_gpu
-def test_verify_memory_not_bind():
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.compute(A.shape, lambda i: A[i] + 1.0, name="B")
-
-    # B is not bound to threads.
-    s = te.create_schedule(B.op)
-
-    mod = tvm.lower(s, [A, B])
-
-    for dev_type in gpu_devices:
-        if tvm.testing.device_enabled(dev_type):
-            binded_mod = tvm.tir.transform.Apply(
-                lambda f: f.with_attr("target", tvm.target.Target(dev_type))
-            )(mod)
-            with pytest.raises(RuntimeError):
-                tvm.tir.transform.VerifyMemory()(binded_mod)
-
-    for dev_type in other_devices:
-        if tvm.testing.device_enabled(dev_type):
-            binded_mod = tvm.tir.transform.Apply(
-                lambda f: f.with_attr("target", tvm.target.Target(dev_type))
-            )(mod)
-            tvm.tir.transform.VerifyMemory()(binded_mod)
-
-
-# Computations are partially bound.
-# So VerifyMemory pass fails when device type is GPU.
-#
-@tvm.testing.uses_gpu
-def test_verify_memory_partially_bind():
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.compute(A.shape, lambda i: A[i] + 1.0, name="B")
-    C = te.compute(B.shape, lambda i: B[i] + 2.0, name="C")
-    D = te.compute(C.shape, lambda i: C[i] + 2.0, name="D")
-
-    # C is bound to threads, but B and D are not.
-    s = te.create_schedule([B.op, C.op, D.op])
-    bx, tx = s[C].split(C.op.axis[0], factor=64)
-    s[C].bind(bx, te.thread_axis("blockIdx.x"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-
-    mod = tvm.lower(s, [A, B, C, D])
-
-    for dev_type in gpu_devices:
-        if tvm.testing.device_enabled(dev_type):
-            binded_mod = tvm.tir.transform.Apply(
-                lambda f: f.with_attr("target", tvm.target.Target(dev_type))
-            )(mod)
-            with pytest.raises(RuntimeError):
-                tvm.tir.transform.VerifyMemory()(binded_mod)
-
-    for dev_type in other_devices:
-        if tvm.testing.device_enabled(dev_type):
-            binded_mod = tvm.tir.transform.Apply(
-                lambda f: f.with_attr("target", tvm.target.Target(dev_type))
-            )(mod)
-            tvm.tir.transform.VerifyMemory()(binded_mod)
-
-
-if __name__ == "__main__":
-    test_verify_memory_all_bind()
-    test_verify_memory_not_bind()
-    test_verify_memory_partially_bind()
diff --git a/tests/python/tir-base/test_lower_build.py b/tests/python/tir-base/test_lower_build.py
index 0e610cc1659b..edb3ed351e5d 100644
--- a/tests/python/tir-base/test_lower_build.py
+++ b/tests/python/tir-base/test_lower_build.py
@@ -18,7 +18,6 @@
 import numpy as np
 
 import tvm
-from tvm import te
 from tvm.ir.module import IRModule
 from tvm.script import tir as T
 import tvm.testing
@@ -94,22 +93,6 @@ def main(
                 )
 
 
-def test_lower_build_te_schedule():
-    m, n, k = 128, 128, 128
-    axis_k = te.reduce_axis((0, k), "k")
-    A = te.placeholder((m, k), name="A")
-    B = te.placeholder((k, n), name="B")
-    C = te.compute((m, n), lambda x, y: te.sum(A[x, axis_k] * B[y, axis_k], axis=axis_k), name="C")
-    s = te.create_schedule(C.op)
-    # check lowering with the CSE pass disabled as otherwise it would do some commoning
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-        ir_mod = tvm.lower(s, [A, B, C])
-    tvm.ir.assert_structural_equal(ir_mod, LoweredModule)
-    # check building
-    mod = tvm.build(s, [A, B, C], target="llvm")
-    _check_module_with_numpy(mod)
-
-
 def test_lower_build_tir_func():
     # check lowering with the CSE pass disabled as otherwise it would do some commoning
     with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
diff --git a/tests/python/tir-base/test_tir_buffer.py b/tests/python/tir-base/test_tir_buffer.py
index d706e65d8186..791de769955e 100644
--- a/tests/python/tir-base/test_tir_buffer.py
+++ b/tests/python/tir-base/test_tir_buffer.py
@@ -178,85 +178,6 @@ def assert_simplified_equal(index_simplified, index_direct):
     assert_simplified_equal(index_simplified2, index_direct)
 
 
-@tvm.testing.requires_llvm
-def test_buffer_broadcast():
-    m0, m1, m2 = te.size_var("m0"), te.size_var("m1"), te.size_var("m2")
-    n0, n1, n2 = te.size_var("n0"), te.size_var("n1"), te.size_var("n2")
-    o0, o1, o2 = te.size_var("o0"), te.size_var("o1"), te.size_var("o2")
-
-    A = te.placeholder((m0, m1, m2), name="A")
-    B = te.placeholder((n0, n1, n2), name="B")
-
-    C = te.compute((o0, o1, o2), lambda i, j, k: A[i, j, k] + B[i, j, k], name="C")
-
-    Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast")
-    Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast")
-    s = te.create_schedule(C.op)
-
-    def check():
-        fadd = tvm.build(s, [A, B, C], target="llvm", name="bcast_add", binds={A: Ab, B: Bb})
-        dev = tvm.cpu(0)
-        a = tvm.nd.array(np.random.uniform(size=(2, 4, 3)).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=(2, 1, 1)).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros((2, 4, 3), dtype=C.dtype), dev)
-        fadd(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
-    check()
-
-
-@tvm.testing.requires_llvm
-def test_buffer_broadcast_expr():
-    n0, m0, x = te.size_var("n0"), te.size_var("m0"), te.size_var("x")
-    n1, m1 = te.size_var("n1"), te.size_var("m1")
-    o0, o1 = te.size_var("o0"), te.size_var("o1")
-
-    A = te.placeholder((m0, n0), name="A")
-    B = te.placeholder((m1, n1), name="B")
-    C = te.compute((o0, o1 // x), lambda i, j: A[i, j] + B[i, j], name="C")
-
-    Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast")
-    Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast")
-    Cc = tvm.tir.decl_buffer(C.shape, C.dtype, name="Cc", buffer_type="auto_broadcast")
-    s = te.create_schedule(C.op)
-
-    def check_stride():
-        fadd = tvm.build(
-            s, [A, B, C, o1, x], target="llvm", name="bcast_add", binds={A: Ab, B: Bb, C: Cc}
-        )
-        dev = tvm.cpu(0)
-        a = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev)
-        fadd(a, b, c, 4, 1)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
-    def check_no_stride():
-        fadd = tvm.build(
-            s, [A, B, C, o1, x], target="llvm", name="bcast_add", binds={A: Ab, B: Bb, C: Cc}
-        )
-        dev = tvm.cpu(0)
-        a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev)
-        fadd(a, b, c, 4, 1)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
-    def check_auto_bind():
-        # Let build bind buffers
-        fadd = tvm.build(s, [A, B, C, o1, x], target="llvm", name="bcast_add")
-        dev = tvm.cpu(0)
-        a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev)
-        fadd(a, b, c, 4, 1)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
-    check_stride()
-    check_no_stride()
-    check_auto_bind()
-
-
 def test_buffer_flatten():
     """A buffer should flatten to a 1-d shape"""
     buf = tvm.tir.decl_buffer([16, 32])
diff --git a/tests/python/tir-base/test_tir_intrin.py b/tests/python/tir-base/test_tir_intrin.py
index 1ee709191c41..8ab18bc84855 100644
--- a/tests/python/tir-base/test_tir_intrin.py
+++ b/tests/python/tir-base/test_tir_intrin.py
@@ -31,13 +31,19 @@ def test_nearbyint():
     )
     A = te.placeholder((m,), name="A")
     A_rounded = te.compute((m,), lambda *i: tvm.tir.nearbyint(A(*i)), name="A")
-    s = te.create_schedule(A_rounded.op)
-    f = tvm.build(s, [A, A_rounded], "llvm")
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, A_rounded])
+    sch = tir.Schedule(mod)
+
+    # Build from scheduled TIR
+    func = tvm.build(sch.mod, target="llvm")
+
     dev = tvm.cpu(0)
     n = 10
     a = tvm.nd.array(np.random.uniform(high=100, size=n).astype(A.dtype), dev)
     a_rounded = tvm.nd.array(np.random.uniform(size=n).astype(A_rounded.dtype), dev)
-    f(a, a_rounded)
+    func(a, a_rounded)
     # Note that numpys rint rounds to nearest integer with
     # ties to halfway is broken by rounding to even.
     # So that 1.5 and 2.5 will round 2.
@@ -79,13 +85,19 @@ def run_test(tvm_intrin, np_func):
         )
         A = te.placeholder((m,), name="A")
         B = te.compute((m,), lambda *i: tvm_intrin(A(*i)), name="B")
-        s = te.create_schedule(B.op)
-        f = tvm.build(s, [A, B], "llvm")
+
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, B])
+        sch = tir.Schedule(mod)
+
+        # Build from scheduled TIR
+        func = tvm.build(sch.mod, target="llvm")
+
         dev = tvm.cpu(0)
         n = 10
         a = tvm.nd.array(np.random.uniform(0.1, 0.5, size=n).astype(A.dtype), dev)
         b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        f(a, b)
+        func(a, b)
         tvm.testing.assert_allclose(b.numpy(), np_func(a.numpy()), atol=1e-5, rtol=1e-5)
 
     for func in test_funcs:
@@ -107,14 +119,20 @@ def run_test(tvm_intrin, np_func):
         A = te.placeholder((m,), name="A")
         B = te.placeholder((m,), name="B")
         C = te.compute((m,), lambda *i: tvm_intrin(A(*i), B(*i)), name="C")
-        s = te.create_schedule(C.op)
-        f = tvm.build(s, [A, B, C], "llvm")
+
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, B, C])
+        sch = tir.Schedule(mod)
+
+        # Build from scheduled TIR
+        func = tvm.build(sch.mod, target="llvm")
+
         dev = tvm.cpu(0)
         n = 10
         a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev)
         b = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(B.dtype), dev)
         c = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        f(a, b, c)
+        func(a, b, c)
         tvm.testing.assert_allclose(c.numpy(), np_func(a.numpy(), b.numpy()), atol=1e-5, rtol=1e-5)
 
     for func in test_funcs:
@@ -128,14 +146,20 @@ def test_ldexp():
     A = te.placeholder((m,), name="A")
     B = te.placeholder((m,), name="B", dtype="int32")
     C = te.compute((m,), lambda *i: tvm.tir.ldexp(A(*i), B(*i)), name="C")
-    s = te.create_schedule(C.op)
-    f = tvm.build(s, [A, B, C], "llvm")
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, B, C])
+    sch = tir.Schedule(mod)
+
+    # Build from scheduled TIR
+    func = tvm.build(sch.mod, target="llvm")
+
     dev = tvm.cpu(0)
     n = 10
     a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev)
     b = tvm.nd.array(np.random.randint(0, 5, size=n).astype(B.dtype), dev)
     c = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-    f(a, b, c)
+    func(a, b, c)
     tvm.testing.assert_allclose(c.numpy(), np.ldexp(a.numpy(), b.numpy()), atol=1e-5, rtol=1e-5)
 
 
@@ -162,17 +186,23 @@ def clz_np(x, dtype):
     m = te.var("m")
     A = te.placeholder((m,), name="A", dtype=dtype)
     B = te.compute((m,), lambda *i: tvm.tir.clz(A(*i)), name="B")
-    s = te.create_schedule(B.op)
 
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, B])
+    sch = tir.Schedule(mod)
+
+    # Apply scheduling primitives if target is Vulkan
     if target.kind.name == "vulkan":
-        bx, tx = s[B].split(B.op.axis[0], factor=64)
+        block = sch.get_block("B")
+        loop = sch.get_loops(block)[0]
+        bx, tx = sch.split(loop, factors=[None, 64])
+        sch.bind(bx, "blockIdx.x")
+        sch.bind(tx, "threadIdx.x")
 
-        s[B].bind(bx, te.thread_axis("blockIdx.x"))
-        s[B].bind(tx, te.thread_axis("threadIdx.x"))
+    # Build from scheduled TIR
+    func = tvm.build(sch.mod, target=target)
 
-    f = tvm.build(s, [A, B], target)
     n = 10
-
     highs = [10, 100, 1000, 10000, 100000, 1000000]
 
     if dtype == "int64":
@@ -182,7 +212,7 @@ def clz_np(x, dtype):
         a_np = np.random.randint(1, high=high, size=(n,), dtype=dtype)
         a = tvm.nd.array(a_np, dev)
         b = tvm.nd.array(np.zeros((n,)).astype("int32"), dev)
-        f(a, b)
+        func(a, b)
         ref = clz_np(a_np, dtype)
         np.testing.assert_equal(b.numpy(), ref)
 
diff --git a/tests/python/tir-base/test_tir_ir_builder.py b/tests/python/tir-base/test_tir_ir_builder.py
deleted file mode 100644
index 8a39337575a7..000000000000
--- a/tests/python/tir-base/test_tir_ir_builder.py
+++ /dev/null
@@ -1,565 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-import numpy as np
-import tvm.testing
-from tvm.topi.math import cast
-
-
-def test_for():
-    ib = tvm.tir.ir_builder.create()
-    n = te.size_var("n")
-    A = ib.allocate("float32", n, name="A", scope="global")
-    with ib.for_range(0, n, name="i") as i:
-        A[i] = A[i] + 1
-        with ib.for_range(0, 10, name="j") as j:
-            A[j] = A[j] + 2
-
-    body = ib.get()
-    assert isinstance(body, tvm.tir.Allocate)
-    body = body.body
-    assert isinstance(body, tvm.tir.For)
-    body = body.body
-    assert isinstance(body, tvm.tir.SeqStmt)
-    assert isinstance(body[1], tvm.tir.For)
-
-
-def test_if():
-    ib = tvm.tir.ir_builder.create()
-    n = te.size_var("n")
-    A = ib.pointer("float32", name="A")
-    tmod = tvm.tir.truncmod
-    with ib.for_range(0, n, name="i") as i:
-        with ib.if_scope(tmod(i, 2) == 0):
-            A[i] = A[i] + 1
-        with ib.else_scope():
-            A[0] = A[i] + 2
-
-    body = ib.get()
-    assert A == A
-    assert isinstance(body, tvm.tir.For)
-    body = body.body
-    assert isinstance(body, tvm.tir.IfThenElse)
-    assert isinstance(body.condition, tvm.tir.EQ)
-    assert isinstance(body.then_case.indices[0], tvm.tir.Var)
-    assert list(body.else_case.indices) == [0]
-
-
-def test_prefetch():
-    A = tvm.tir.decl_buffer((10, 20), name="A")
-    ib = tvm.tir.ir_builder.create()
-    n = te.size_var("n")
-
-    with ib.for_range(0, n, name="i") as i:
-        ib.emit(
-            tvm.tir.Prefetch(
-                A, [tvm.ir.Range.from_min_extent(i + 1, 2), tvm.ir.Range.from_min_extent(0, 20)]
-            )
-        )
-    body = ib.get()
-    assert body.body.bounds[0].extent.value == 2
-
-
-def test_cpu():
-    n = 1024
-    dtype = "float32"
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-
-    def test_device_ir(A, B, C):
-        n = A.shape[0]
-        max_threads = 8
-        ib = tvm.tir.ir_builder.create()
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-        Cptr = ib.buffer_ptr(C)
-        with ib.for_range(0, n, name="i") as i:
-            Cptr[i] = Aptr[i] + Bptr[i]
-        body = ib.get()
-        return body
-
-    C = te.extern(
-        A.shape,
-        [A, B],
-        lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
-        name="vector_add",
-        dtype=dtype,
-    )
-    s = te.create_schedule(C.op)
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            return
-        # build and invoke the kernel.
-        fadd = tvm.build(s, [A, B, C], target)
-        dev = tvm.device(target, 0)
-        # launch the kernel.
-        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
-        fadd(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
-    check_target("llvm")
-
-
-@tvm.testing.requires_gpu
-def test_gpu():
-    n = te.size_var("n")
-    dtype = "float32"
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    idxd = tvm.tir.indexdiv
-
-    def test_device_ir(A, B, C):
-        n = A.shape[0]
-        max_threads = 32
-        ib = tvm.tir.ir_builder.create()
-        bx = te.thread_axis("blockIdx.x")
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(bx, "thread_extent", idxd(n + max_threads - 1, max_threads))
-        ib.scope_attr(tx, "thread_extent", max_threads)
-        idx = bx.var * max_threads + tx.var
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-        Cptr = ib.buffer_ptr(C)
-        with ib.if_scope(ib.likely(idx < n)):
-            Cptr[idx] = Aptr[idx] + Bptr[idx]
-        body = ib.get()
-        return body
-
-    C = te.extern(
-        A.shape,
-        [A, B],
-        lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
-        name="vector_add",
-        dtype=dtype,
-    )
-    s = te.create_schedule(C.op)
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    def check_target(target):
-        n = 1024
-        if not tvm.testing.device_enabled(target):
-            return
-        # build and invoke the kernel.
-        fadd = tvm.build(s, [A, B, C], target)
-        dev = tvm.device(target, 0)
-        # launch the kernel.
-        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
-        fadd(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
-    check_target("opencl")
-    check_target("cuda")
-
-
-def test_while_vectorize():
-    """Test while loop + vectorized inner loop"""
-
-    n = 64
-    num_iter = 10
-
-    def test_ir(A, B, C):
-        ib = tvm.tir.ir_builder.create()
-        n = C.shape[0]
-        A = ib.buffer_ptr(A)
-        B = ib.buffer_ptr(B)
-        C = ib.buffer_ptr(C)
-        i = ib.allocate("int32", (1,), name="i", scope="local")
-        i[0] = 0
-
-        with ib.for_range(0, n) as j:
-            C[j] = 0.0
-
-        with ib.while_loop(i[0] < num_iter):
-            with ib.for_range(0, n, kind="vectorize") as j:
-                C[j] += A[j] + B[j]
-            i[0] += 1
-
-        return ib.get()
-
-    def check_target(target, ir):
-        dtype = "float32"
-        A = te.placeholder((n,), name="A", dtype=dtype)
-        B = te.placeholder((n,), name="B", dtype=dtype)
-
-        C = te.extern(
-            (n,),
-            [A, B],
-            lambda ins, outs: ir(ins[0], ins[1], outs[0]),
-            name="while_vectorize",
-            dtype=dtype,
-        )
-        s = te.create_schedule(C.op)
-
-        with tvm.transform.PassContext(opt_level=3):
-            func = tvm.build(s, [A, B, C], target)
-
-        dev = tvm.device(target, 0)
-        a_np = np.random.uniform(size=n).astype(A.dtype)
-        b_np = np.random.uniform(size=n).astype(B.dtype)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
-        func(a, b, c)
-        ref = num_iter * (a_np + b_np)
-        tvm.testing.assert_allclose(c.numpy(), ref, rtol=1e-5, atol=1e-5)
-
-    check_target("llvm", test_ir)
-
-
-def test_while_collatz():
-    """Test while loop + if"""
-
-    def collatz_ref(n):
-        a = n
-        i = 0
-        while a > 1:
-            if a % 2 == 1:
-                a = 3 * a + 1
-            else:
-                a = a >> 1
-            i += 1
-        return i
-
-    def collatz(ib, n, C):
-        i = ib.allocate("int32", (1,), name="i", scope="local")
-        a = ib.allocate("int32", (1,), name="a", scope="local")
-        i[0] = 0
-        a[0] = n
-        with ib.while_loop(a[0] > 1):
-            with ib.if_scope(tvm.tir.floormod(a[0], 2) == 1):
-                a[0] = 3 * a[0] + 1
-            with ib.else_scope():
-                a[0] = a[0] >> 1
-            i[0] += 1
-
-        C[n] = i[0]
-
-    def collatz_ir_cpu(C):
-        ib = tvm.tir.ir_builder.create()
-        n = C.shape[0]
-        C = ib.buffer_ptr(C)
-
-        with ib.for_range(0, n, name="i", kind="parallel") as i:
-            collatz(ib, i, C)
-
-        body = ib.get()
-
-        return body
-
-    n = 30
-
-    def check_target(target, ir):
-        C = te.extern(
-            (n,),
-            [],
-            lambda ins, outs: ir(outs[0]),
-            name="collatz",
-            dtype="int32",
-        )
-        s = te.create_schedule(C.op)
-
-        with tvm.transform.PassContext(opt_level=3):
-            func = tvm.build(s, [C], target)
-
-        dev = tvm.device(target, 0)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
-        func(c)
-        ref = np.array([collatz_ref(i) for i in range(n)])
-        tvm.testing.assert_allclose(c.numpy(), ref)
-
-    check_target("llvm", collatz_ir_cpu)
-
-
-def test_while_mandel():
-    n = 160
-    shape = (n * 2, n)
-    t = 300
-
-    def mandel_ref():
-        def complex_sqr(z):
-            return np.array([z[0] ** 2 - z[1] ** 2, z[1] * z[0] * 2])
-
-        pixels = np.zeros(shape)
-
-        for i in range(pixels.shape[0]):
-            for j in range(pixels.shape[1]):
-                c = np.array([-0.8, np.cos(t) * 0.2])
-                z = np.array([i / n - 1, j / n - 0.5]) * 2
-                iterations = 0
-
-                while np.linalg.norm(z) < 20 and iterations < 50:
-                    z = complex_sqr(z) + c
-                    iterations += 1
-
-                pixels[i, j] = 1 - iterations * 0.02
-
-        return pixels
-
-    def mandel(ib, i, j, pixels):
-        z = ib.allocate("float32", (2,), name="z", scope="local")
-        tmp = ib.allocate("float32", (1,), name="tmp", scope="local")
-        iterations = ib.allocate("int32", (1,), name="iterations", scope="local")
-
-        z[0] = (i / float(n) - 1) * 2
-        z[1] = (j / float(n) - 0.5) * 2
-        iterations[0] = 0
-        c = [-0.8, float(np.cos(t)) * 0.2]
-
-        def norm(z):
-            return tvm.tir.sqrt(z[0] * z[0] + z[1] * z[1])
-
-        with ib.while_loop(tvm.tir.all(norm(z) < 20, iterations[0] < 50)):
-            tmp[0] = z[0]
-            z[0] = z[0] * z[0] - z[1] * z[1] + c[0]
-            z[1] = z[1] * tmp[0] * 2 + c[1]
-            iterations[0] += 1
-
-        pixels[i, j] = 1 - iterations[0] * 0.02
-
-    def mandel_ir_cpu(C):
-        ib = tvm.tir.ir_builder.create()
-        ny = C.shape[0]
-        nx = C.shape[1]
-        C = ib.buffer_ptr(C)
-
-        with ib.for_range(0, ny, name="i", kind="parallel") as i:
-            with ib.for_range(0, nx, name="j") as j:
-                mandel(ib, i, j, C)
-
-        body = ib.get()
-
-        return body
-
-    def mandel_ir_gpu(C):
-        ib = tvm.tir.ir_builder.create()
-        ny = C.shape[0]
-        nx = C.shape[1]
-        C = ib.buffer_ptr(C)
-
-        bx = te.thread_axis("blockIdx.x")
-        tx = te.thread_axis("threadIdx.x")
-        by = te.thread_axis("blockIdx.y")
-        ty = te.thread_axis("threadIdx.y")
-
-        max_threads = 16
-        ib.scope_attr(bx, "thread_extent", tvm.tir.indexdiv(nx + max_threads - 1, max_threads))
-        ib.scope_attr(tx, "thread_extent", max_threads)
-        ib.scope_attr(by, "thread_extent", tvm.tir.indexdiv(ny + max_threads - 1, max_threads))
-        ib.scope_attr(ty, "thread_extent", max_threads)
-
-        tidx = bx * max_threads + tx
-        tidy = by * max_threads + ty
-
-        with ib.if_scope(tvm.tir.all(tidx < nx, tidy < ny)):
-            mandel(ib, tidy, tidx, C)
-
-        body = ib.get()
-
-        return body
-
-    ref = mandel_ref()
-
-    def check_target(target, ir):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        C = te.extern(
-            shape,
-            [],
-            lambda ins, outs: ir(outs[0]),
-            name="mandel_ir",
-            dtype="float32",
-        )
-        s = te.create_schedule(C.op)
-
-        with tvm.transform.PassContext(opt_level=3):
-            func = tvm.build(s, [C], target)
-
-        dev = tvm.device(target, 0)
-        c = tvm.nd.array(np.zeros(shape, dtype=C.dtype), dev)
-        func(c)
-        tvm.testing.assert_allclose(c.numpy(), ref, rtol=1e-5, atol=1e-5)
-
-    check_target("llvm", mandel_ir_cpu)
-    check_target("npvtx", mandel_ir_gpu)
-    check_target("cuda", mandel_ir_gpu)
-    check_target("vulkan", mandel_ir_gpu)
-
-
-def test_while_binary_search():
-    def binary_search(ib, n, i, Aptr, Bptr, Cptr):
-        lo = ib.allocate("int32", (1,), name="lo", scope="local")
-        hi = ib.allocate("int32", (1,), name="hi", scope="local")
-
-        lo[0] = 0
-        hi[0] = n
-        v = Bptr[i]
-
-        with ib.while_loop(lo[0] < hi[0]):
-            mid = lo[0] + (hi[0] - lo[0] >> 1)
-            with ib.if_scope(Aptr[mid] < v):
-                lo[0] = mid + 1
-            with ib.else_scope():
-                hi[0] = mid
-
-        Cptr[i] = lo[0]
-
-    def searchsorted_ir_cpu(A, B, C, n):
-        ib = tvm.tir.ir_builder.create()
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-        Cptr = ib.buffer_ptr(C)
-
-        with ib.for_range(0, n, name="i", kind="parallel") as i:
-            binary_search(ib, n, i, Aptr, Bptr, Cptr)
-
-        body = ib.get()
-
-        return body
-
-    def searchsorted_ir_gpu(A, B, C, n):
-        ib = tvm.tir.ir_builder.create()
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-        Cptr = ib.buffer_ptr(C)
-
-        bx = te.thread_axis("blockIdx.x")
-        tx = te.thread_axis("threadIdx.x")
-        max_threads = 32
-        ib.scope_attr(bx, "thread_extent", tvm.tir.indexdiv(n + max_threads - 1, max_threads))
-        ib.scope_attr(tx, "thread_extent", max_threads)
-        tid = bx * max_threads + tx
-
-        with ib.if_scope(tid < n):
-            binary_search(ib, n, tid, Aptr, Bptr, Cptr)
-
-        body = ib.get()
-
-        return body
-
-    n = 1024
-    dtype = "float32"
-    A = te.placeholder((n,), name="A", dtype=dtype)
-    B = te.placeholder((n,), name="B", dtype=dtype)
-
-    def check_target(target, ir):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        C = te.extern(
-            A.shape,
-            [A, B],
-            lambda ins, outs: ir(ins[0], ins[1], outs[0], n),
-            name="searchsorted_ir",
-            dtype="int32",
-        )
-        s = te.create_schedule(C.op)
-
-        with tvm.transform.PassContext(opt_level=3):
-            func = tvm.build(s, [A, B, C], target)
-
-        dev = tvm.device(target, 0)
-        a_np = np.random.uniform(size=n).astype(A.dtype)
-        b_np = np.random.uniform(size=n).astype(B.dtype)
-        a_np = np.sort(a_np)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
-        func(a, b, c)
-        ref = np.searchsorted(a_np, b_np)
-        tvm.testing.assert_allclose(c.numpy(), ref)
-
-    check_target("llvm", searchsorted_ir_cpu)
-    check_target("cuda", searchsorted_ir_gpu)
-    check_target("nvptx", searchsorted_ir_gpu)
-    check_target("vulkan", searchsorted_ir_gpu)
-
-
-@tvm.testing.requires_gpu
-def test_dyn_shared():
-    n = te.size_var("n")
-    dtype = "float32"
-    A = te.placeholder((n,), name="A")
-
-    def test_device_ir(A, B):
-        n = A.shape[0]
-        ib = tvm.tir.ir_builder.create()
-
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(tx, "thread_extent", n)
-
-        temp = ib.allocate(dtype, (n,), scope="shared.dyn")  # n is symbolic size
-
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-
-        temp[tx] = Aptr[tx]
-        depth = tvm.tir.log2(cast(n, "float32"))
-
-        with ib.for_range(0, cast(tvm.tir.ceil(depth), n.dtype)) as i:
-            ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"])))
-            d = n >> (i + 1)
-            with ib.if_scope(tx < d):
-                temp[tx] += temp[tx + d]
-
-        Bptr[0] = temp[0]
-        return ib.get()
-
-    B = te.extern(
-        (1,),
-        [A],
-        lambda ins, outs: test_device_ir(ins[0], outs[0]),
-        name="reduce",
-        dtype=dtype,
-    )
-    s = te.create_schedule(B.op)
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        freduce = tvm.build(s, [A, B], target)
-        dev = tvm.device(target, 0)
-
-        for n in [512, 1024]:
-            a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-            b = tvm.nd.array(np.zeros(1, dtype=B.dtype), dev)
-            freduce(a, b)
-            tvm.testing.assert_allclose(b.numpy()[0], np.sum(a.numpy()), 1e-4, 1e-4)
-
-    for target in ["cuda", "nvptx"]:
-        check_target(target)
-
-
-if __name__ == "__main__":
-    test_prefetch()
-    test_if()
-    test_for()
-    test_cpu()
-    test_gpu()
-    test_while_vectorize()
-    test_while_collatz()
-    test_while_mandel()
-    test_while_binary_search()
-    test_dyn_shared()
diff --git a/tests/python/tir-transform/test_tir_transform_compact_buffer_region.py b/tests/python/tir-transform/test_tir_transform_compact_buffer_region.py
index cb7151f875e3..006ebf6a1a0d 100644
--- a/tests/python/tir-transform/test_tir_transform_compact_buffer_region.py
+++ b/tests/python/tir-transform/test_tir_transform_compact_buffer_region.py
@@ -569,7 +569,6 @@ def expected(a: T.handle) -> None:
 
 
 class TestAnnotatedOpaqueAccess(BaseCompactTest):
-
     is_lower_order_free = False
 
     @T.prim_func
@@ -1154,7 +1153,6 @@ def expected(
 
 
 class TestNonStrictCompactionForPaddedMatmul(BaseCompactTest):
-
     is_strict_mode = False
 
     @T.prim_func
@@ -1231,7 +1229,6 @@ def expected(
 
 
 class TestNotCompactAliasBuffer(BaseCompactTest):
-
     # it is not testcase on block form
     is_lower_order_free = False
 
@@ -1251,7 +1248,6 @@ def before():
 
 
 class TestNotCompactBufferWithDifferentDtype(BaseCompactTest):
-
     # it is not testcase on block form
     is_lower_order_free = False
 
@@ -1268,7 +1264,6 @@ def before():
 
 
 class TestNonBoolCondition(BaseCompactTest):
-
     # it is not testcase on block form
     is_lower_order_free = False
 
@@ -1289,15 +1284,6 @@ def expected():
                 A[i - 1] = A[i - 1] + 1
 
 
-def test_lower_te():
-    x = te.placeholder((1,))
-    y = te.compute((1,), lambda i: x[i] + 2)
-    s = te.create_schedule(y.op)
-    orig_mod = tvm.driver.build_module.schedule_to_module(s, [x, y])
-    mod = tvm.tir.transform.CompactBufferAllocation()(orig_mod)
-    tvm.ir.assert_structural_equal(mod, orig_mod)  # CompactBufferAllocation should do nothing on TE
-
-
 class TestCompactSymbolicBound0:
     """Test symbolic bound that get compacted to constant"""
 
diff --git a/tests/python/tir-transform/test_tir_transform_convert_blocks_to_opaque.py b/tests/python/tir-transform/test_tir_transform_convert_blocks_to_opaque.py
index f920a46ba57e..63a57eeffe29 100644
--- a/tests/python/tir-transform/test_tir_transform_convert_blocks_to_opaque.py
+++ b/tests/python/tir-transform/test_tir_transform_convert_blocks_to_opaque.py
@@ -74,15 +74,6 @@ def test_elementwise():
     _check(elementwise_func, substituted_elementwise_func)
 
 
-def test_lower_te():
-    x = te.placeholder((1,))
-    y = te.compute((1,), lambda i: x[i] + 2)
-    s = te.create_schedule(y.op)
-    orig_mod = tvm.driver.build_module.schedule_to_module(s, [x, y])
-    mod = tvm.tir.transform.ConvertBlocksToOpaque()(orig_mod)
-    tvm.ir.assert_structural_equal(mod, orig_mod)  # ConvertBlocksToOpaque should do nothing on TE
-
-
 class TestErrorIfPredicateUsesBlockVariables(tvm.testing.CompareBeforeAfter):
     transform = tvm.tir.transform.ConvertBlocksToOpaque()
     check_well_formed = False
diff --git a/tests/python/tir-transform/test_tir_transform_flatten_buffer.py b/tests/python/tir-transform/test_tir_transform_flatten_buffer.py
index 20f91b639497..b215398622cc 100644
--- a/tests/python/tir-transform/test_tir_transform_flatten_buffer.py
+++ b/tests/python/tir-transform/test_tir_transform_flatten_buffer.py
@@ -259,19 +259,6 @@ def expected(input_A: T.Buffer(10, "bool"), input_B: T.Buffer(10, "bool")) -> No
             B[i0] = T.cast(T.cast(A[i0], "bool"), "int8")
 
 
-class TestLowerTE(BaseCompare):
-    """FlattenBuffer should do nothing on TE-based functions"""
-
-    def before(self):
-        x = te.placeholder((1,))
-        y = te.compute((1,), lambda i: x[i] + 2)
-        s = te.create_schedule(y.op)
-        mod = tvm.driver.build_module.schedule_to_module(s, [x, y])
-        return mod["main"]
-
-    expected = before
-
-
 class TestFlattenInsideBlock(BaseCompare):
     """Flattening access inside a block flattens the accessed region."""
 
diff --git a/tests/python/tir-transform/test_tir_transform_hoist_if.py b/tests/python/tir-transform/test_tir_transform_hoist_if.py
index 04f3f9771c64..6695913a3c2c 100644
--- a/tests/python/tir-transform/test_tir_transform_hoist_if.py
+++ b/tests/python/tir-transform/test_tir_transform_hoist_if.py
@@ -515,34 +515,6 @@ def test_no_hoisting_7():
     tvm.ir.assert_structural_equal(new_stmt, stmt)
 
 
-def test_hoisting_block_scope_1():
-    n = te.size_var("n")
-    m = te.size_var("m")
-    A = te.placeholder((n, m), name="A")
-    k = te.reduce_axis((0, m), "k")
-    B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
-    s = te.create_schedule(B.op)
-    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
-    BF = s.rfactor(B, ki)
-    xo, xi = s[B].split(s[B].op.axis[0], factor=32)
-    s[B.op].bind(xo, te.thread_axis("blockIdx.x"))
-    s[B.op].bind(xi, te.thread_axis("threadIdx.y"))
-    s[B].bind(s[B].op.reduce_axis[0], te.thread_axis("threadIdx.x"))
-    s[BF].compute_at(s[B], s[B].op.reduce_axis[0])
-    mod = tvm.driver.build_module.schedule_to_module(s, [A, B], "main", None)
-    mod = tvm.tir.transform.Simplify()(mod)
-    mod = tvm.tir.transform.RemoveNoOp()(mod)
-    stmt = mod["main"].body
-    new_stmt = tvm.tir.transform.HoistIfThenElse()(mod)["main"].body
-    tvm.ir.assert_structural_equal(new_stmt, stmt)
-
-    with tvm.transform.PassContext(
-        config={"tir.HoistIfThenElse": {"support_block_scope_hosting": True}}
-    ):
-        new_stmt = tvm.tir.transform.HoistIfThenElse()(mod)["main"].body
-    assert not tvm.ir.structural_equal(new_stmt, stmt)
-
-
 def test_hoisting_block_scope_2():
     ib = tvm.tir.ir_builder.create()
     dshape = (32, 64)
@@ -617,37 +589,6 @@ def test_hoisting_block_scope_3():
     assert not tvm.ir.structural_equal(new_stmt, stmt)
 
 
-def test_hoisting_block_scope_4():
-    nn = 1024
-    n = tvm.runtime.convert(nn)
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    AA = te.compute((n,), lambda *i: A(*i), name="A")
-    BB = te.compute((n,), lambda *i: B(*i), name="B")
-    T = te.compute(A.shape, lambda *i: AA(*i) + BB(*i), name="T")
-    C = te.compute(A.shape, lambda *i: T(*i), name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=4)
-    xo1, xo2 = s[C].split(xo, factor=13)
-    s[C].parallel(xo2)
-    s[C].pragma(xo1, "parallel_launch_point")
-    s[C].pragma(xo2, "parallel_stride_pattern")
-    s[C].pragma(xo2, "parallel_barrier_when_finish")
-    s[C].vectorize(xi)
-    mod = tvm.driver.build_module.schedule_to_module(s, [A, B, C], "main", None)
-    mod = tvm.tir.transform.Simplify()(mod)
-
-    stmt = mod["main"].body
-    new_stmt = tvm.tir.transform.HoistIfThenElse()(mod)["main"].body
-    tvm.ir.assert_structural_equal(new_stmt, stmt)
-
-    with tvm.transform.PassContext(
-        config={"tir.HoistIfThenElse": {"support_block_scope_hosting": True}}
-    ):
-        new_stmt = tvm.tir.transform.HoistIfThenElse()(mod)["main"].body
-    assert not tvm.ir.structural_equal(new_stmt, stmt)
-
-
 def test_hoisting_block_scope_5():
     ib = tvm.tir.ir_builder.create()
     data = ib.pointer("float32", name="data", scope="global")
diff --git a/tests/python/tir-transform/test_tir_transform_inject_copy_intrin.py b/tests/python/tir-transform/test_tir_transform_inject_copy_intrin.py
deleted file mode 100644
index aa0448c3c682..000000000000
--- a/tests/python/tir-transform/test_tir_transform_inject_copy_intrin.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-import tvm.testing
-from tvm import te
-from tvm.driver.build_module import schedule_to_module
-
-
-def test_copy2d():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    B = te.compute((m, l), lambda i, j: A[i, j], name="B")
-    s = te.create_schedule(B.op)
-    s[B].pragma(B.op.axis[0], "memcpy")
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-    func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, B], stmt, None)
-    mod = tvm.IRModule.from_expr(func)
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-    def cb(src, dst, pad_before, pad_after, pad_value):
-        assert dst.strides[0] == l
-        assert dst.strides[1].value == 1
-        assert src.strides[0] == l
-        assert tuple(src.shape) == (m, l)
-        return tvm.tir.Evaluate(0)
-
-    stmt = tvm.tir.transform.InjectCopyIntrin("memcpy", cb)(mod)["main"].body
-
-
-def test_copy_pad():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    B = te.compute(
-        (m + 2, l),
-        lambda i, j: tvm.tir.if_then_else(tvm.tir.all(i >= 1, i < m + 1), A[i - 1, j], 1.0),
-        name="B",
-    )
-    s = te.create_schedule(B.op)
-    s[B].pragma(B.op.axis[0], "memcpy")
-    mod = schedule_to_module(s, [A, B])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-    def cb(src, dst, pad_before, pad_after, pad_value):
-        tvm.testing.assert_prim_expr_equal(src.elem_offset, 0)
-        assert pad_before[0].value == 1
-        assert pad_before[1].value == 0
-        assert pad_after[0].value == 1
-        assert pad_after[1].value == 0
-        assert pad_value.value == 1.0
-        return tvm.tir.Evaluate(0)
-
-    stmt = tvm.tir.transform.InjectCopyIntrin("memcpy", cb)(mod)["main"].body
-
-
-def test_single_point_test():
-    A = te.placeholder((1,), name="A")
-    B = te.compute((1,), lambda i: A[i], name="B")
-    s = te.create_schedule(B.op)
-    s[B].pragma(B.op.axis[0], "memcpy")
-    mod = schedule_to_module(s, [A, B])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-    def cb(src, dst, pad_before, pad_after, pad_value):
-        tvm.testing.assert_prim_expr_equal(src.elem_offset, 0)
-        tvm.testing.assert_prim_expr_equal(dst.elem_offset, 0)
-        tvm.testing.assert_prim_expr_equal(src.strides[0], 1)
-        tvm.testing.assert_prim_expr_equal(dst.strides[0], 1)
-        return tvm.tir.Evaluate(0)
-
-    stmt = tvm.tir.transform.InjectCopyIntrin("memcpy", cb)(mod)["main"].body
-
-
-def test_copy_pad_split():
-    m = 4 * 3
-    A = te.placeholder((m,), name="A")
-    Apad = te.compute(
-        (m + 2,), lambda i: tvm.tir.if_then_else(tvm.tir.all(i >= 1, i <= m), A[i - 1], 0.0), "Apad"
-    )
-    B = te.compute((m,), lambda i: Apad[i] + Apad[i + 1] + Apad[i + 2])
-    s = te.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=4)
-    s[Apad].compute_at(s[B], xo)
-    s[Apad].pragma(s[Apad].op.axis[0], "memcpy")
-
-    mod = schedule_to_module(s, [A, B])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod._move())
-    mod = tvm.tir.transform.Simplify()(mod._move())
-
-    def cb(src, dst, pad_before, pad_after, pad_value):
-        assert dst.elem_offset.value == 0
-        tvm.testing.assert_prim_expr_equal(src.elem_offset, tvm.te.max(xo * 4, 1) - 1)
-
-        rpad_before = tvm.te.max(1 - xo * 4, 0)
-        rpad_after = tvm.te.max(xo * 4 - 7, 0)
-        tvm.testing.assert_prim_expr_equal(pad_before[0], rpad_before)
-        tvm.testing.assert_prim_expr_equal(pad_after[0], rpad_after)
-        tvm.testing.assert_prim_expr_equal(src.shape[0], 6 - rpad_before - rpad_after)
-        return tvm.tir.Evaluate(0)
-
-    stmt = tvm.tir.transform.InjectCopyIntrin("memcpy", cb)(mod)["main"].body
-
-
-if __name__ == "__main__":
-    test_copy2d()
-    test_copy_pad()
-    test_copy_pad_split()
-    test_single_point_test()
diff --git a/tests/python/tir-transform/test_tir_transform_inject_rolling_buffer.py b/tests/python/tir-transform/test_tir_transform_inject_rolling_buffer.py
index c1c8141f70a7..3d8f85bf79dd 100644
--- a/tests/python/tir-transform/test_tir_transform_inject_rolling_buffer.py
+++ b/tests/python/tir-transform/test_tir_transform_inject_rolling_buffer.py
@@ -19,186 +19,9 @@
 import tvm
 import tvm.script
 from tvm import te, topi
-from tvm.driver.build_module import get_binds
 from tvm.script import tir as T
 
 
-def _tile_nd(s, tensor, tile):
-    outer_indices = []
-    inner_indices = []
-    for i, size in enumerate(tile):
-        outer, inner = s[tensor].split(tensor.op.axis[i], size)
-        outer_indices.append(outer)
-        inner_indices.append(inner)
-
-    s[tensor].reorder(*outer_indices, *inner_indices)
-    return outer_indices, inner_indices
-
-
-@tvm.tir.transform.prim_func_pass(opt_level=0)
-def remove_rolling_buffer_attr(func, mod, ctx):
-    def unwrap(node):
-        if isinstance(node, tvm.tir.AttrStmt) and node.attr_key == "rolling_buffer_scope":
-            return node.body
-        else:
-            return node
-
-    return func.with_body(
-        tvm.tir.stmt_functor.ir_transform(
-            func.body, None, postorder=unwrap, only_enable=["tir.AttrStmt"]
-        )
-    )
-
-
-@tvm.tir.transform.prim_func_pass(opt_level=0)
-def verify_no_rolling_buffer_attr(func, mod, ctx):
-    def verify(node):
-        if isinstance(node, tvm.tir.AttrStmt):
-            assert node.attr_key != "rolling_buffer_scope", "Failed to lower rolling buffers"
-
-    tvm.tir.stmt_functor.post_order_visit(func.body, verify)
-
-    return func
-
-
-def _verify_schedule(sch, inputs, output):
-    user_pass_lists = [
-        [(0, remove_rolling_buffer_attr), (0, verify_no_rolling_buffer_attr)],
-        [(0, tvm.tir.transform.InjectRollingBuffer()), (0, verify_no_rolling_buffer_attr)],
-    ]
-    built_funcs = []
-    for user_pass_list in user_pass_lists:
-        with tvm.transform.PassContext(config={"tir.add_lower_pass": user_pass_list}):
-            built_funcs.append(tvm.build(sch, inputs + [output]))
-
-    outputs = []
-    ctx = tvm.cpu(0)
-    input_data = []
-    for tensor in inputs:
-        shape = [i.value for i in tensor.shape]
-        input_data.append(
-            tvm.nd.array(np.random.randint(low=-100, high=100, size=shape).astype("int8"), ctx)
-        )
-    shape = [i.value for i in output.shape]
-    out = tvm.nd.array(np.zeros(shape, dtype="int8"), ctx)
-    for func in built_funcs:
-        func(*input_data, out)
-        outputs.append(out.numpy())
-
-    np.testing.assert_equal(outputs[0], outputs[1])
-
-
-@pytest.mark.parametrize("tile_shape", [(1, 4, 8, 16), (1, 8, 7, 11), (1, 8, 3, 8), (1, 7, 5, 3)])
-def test_tile_shapes(tile_shape):
-    A = te.placeholder((1, 12, 14, 16), name="A", dtype="int8")
-    pool_a = topi.nn.pool2d(A, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    pool_b = topi.nn.pool2d(pool_a, (3, 5), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-
-    sch = tvm.te.create_schedule([pool_b.op])
-    oi, ii = _tile_nd(sch, pool_b, tile_shape)
-    sch[pool_a].compute_at(sch[pool_b], oi[-1])
-    sch[pool_a].rolling_buffer()
-
-    _verify_schedule(sch, [A], pool_b)
-
-
-def test_implied_split():
-    A = te.placeholder((1, 12, 12, 16), name="A", dtype="int8")
-    pool_a = topi.nn.pool2d(A, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    pool_b = topi.nn.pool2d(pool_a, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-
-    sch = tvm.te.create_schedule([pool_b.op])
-    n, h, w, c = pool_b.op.axis
-    oi, ii = sch[pool_b].split(w, 4)
-    sch[pool_a].compute_at(sch[pool_b], oi)
-    sch[pool_a].rolling_buffer()
-
-    _verify_schedule(sch, [A], pool_b)
-
-
-@pytest.mark.parametrize("kernel_shape", [(1, 1), (3, 3)])
-def test_upscale(kernel_shape):
-    output_shape = (1, 24, 24, 16)
-    input_shape = (
-        output_shape[0],
-        output_shape[1] // 2 + 2 * (kernel_shape[0] - 1),
-        output_shape[2] // 2 + 2 * (kernel_shape[1] - 1),
-        output_shape[3],
-    )
-    A = te.placeholder(input_shape, name="A", dtype="int8")
-    pool_a = topi.nn.pool2d(A, kernel_shape, (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    pool_b = topi.nn.pool2d(
-        pool_a, kernel_shape, (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC"
-    )
-    upscale = te.compute((1, 24, 24, 16), lambda nn, hh, ww, cc: pool_b[nn, hh // 2, ww // 2, cc])
-
-    sch = tvm.te.create_schedule([upscale.op])
-    oi, ii = _tile_nd(sch, upscale, (1, 5, 5, 16))
-    sch[pool_b].compute_at(sch[upscale], oi[-1])
-    sch[pool_b].rolling_buffer()
-    sch[pool_a].compute_at(sch[upscale], oi[-1])
-    sch[pool_a].rolling_buffer()
-
-    _verify_schedule(sch, [A], upscale)
-
-
-@pytest.mark.parametrize("tile_shape", [(1, 4, 8, 16), (1, 8, 7, 11), (1, 8, 3, 8), (1, 7, 5, 3)])
-def test_3_tiled_poolings(tile_shape):
-    A = te.placeholder((1, 14, 14, 16), name="A", dtype="int8")
-    pool_a = topi.nn.pool2d(A, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    pool_b = topi.nn.pool2d(pool_a, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    pool_c = topi.nn.pool2d(pool_b, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-
-    sch = tvm.te.create_schedule([pool_c.op])
-    oi, ii = _tile_nd(sch, pool_c, tile_shape)
-    sch[pool_b].compute_at(sch[pool_c], oi[-1])
-    sch[pool_b].rolling_buffer()
-    sch[pool_a].compute_at(sch[pool_c], oi[-1])
-    sch[pool_a].rolling_buffer()
-
-    _verify_schedule(sch, [A], pool_c)
-
-
-@pytest.mark.parametrize("tile_shape", [(1, 4, 8, 16), (1, 8, 7, 11), (1, 8, 3, 8), (1, 7, 5, 3)])
-def test_tiled_added_poolings(tile_shape):
-    A = te.placeholder((1, 12, 12, 16), name="A", dtype="int8")
-    B = te.placeholder((1, 14, 14, 16), name="A", dtype="int8")
-    pool_a = topi.nn.pool2d(A, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    pool_b = topi.nn.pool2d(B, (5, 5), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    add = topi.add(pool_a, pool_b)
-    pool_c = topi.nn.pool2d(add, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-
-    sch = tvm.te.create_schedule([pool_c.op])
-    oi, ii = _tile_nd(sch, pool_c, tile_shape)
-    sch[add].compute_at(sch[pool_c], oi[-1])
-    sch[add].rolling_buffer()
-    sch[pool_b].compute_at(sch[pool_c], oi[-1])
-    sch[pool_b].rolling_buffer()
-    sch[pool_a].compute_at(sch[pool_c], oi[-1])
-    sch[pool_a].rolling_buffer()
-
-    _verify_schedule(sch, [A, B], pool_c)
-
-
-@pytest.mark.parametrize("make_rolling", [(0, 0), (1, 0), (0, 1), (1, 1)])
-def test_mixed_buffers(make_rolling):
-    A = te.placeholder((1, 14, 14, 16), name="A", dtype="int8")
-    pool_a = topi.nn.pool2d(A, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    pool_b = topi.nn.pool2d(pool_a, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    pool_c = topi.nn.pool2d(pool_b, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-
-    sch = tvm.te.create_schedule([pool_c.op])
-    oi, ii = _tile_nd(sch, pool_c, (1, 4, 8, 16))
-    sch[pool_b].compute_at(sch[pool_c], oi[-1])
-    if make_rolling[0]:
-        sch[pool_b].rolling_buffer()
-    sch[pool_a].compute_at(sch[pool_c], oi[-1])
-    if make_rolling[1]:
-        sch[pool_a].rolling_buffer()
-
-    _verify_schedule(sch, [A], pool_c)
-
-
 @tvm.script.ir_module
 class PreRollingBuffer:
     @T.prim_func
diff --git a/tests/python/tir-transform/test_tir_transform_instrument_bound_checkers.py b/tests/python/tir-transform/test_tir_transform_instrument_bound_checkers.py
deleted file mode 100644
index 3078572bb508..000000000000
--- a/tests/python/tir-transform/test_tir_transform_instrument_bound_checkers.py
+++ /dev/null
@@ -1,608 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-import tvm.testing
-from tvm import te, tir
-
-import pytest
-import numpy as np
-
-
-def collect_visit(stmt, f):
-    ret = []
-    tvm.tir.stmt_functor.post_order_visit(stmt, lambda x: ret.append(f(x)))
-    return ret
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_llvm(index_a, index_b):
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda i: A[i + index_a] + B[i + index_b], name="C")
-    s = te.create_schedule(C.op)
-    tgt = "llvm"
-    tgt_host = "llvm"
-    stmt = tvm.lower(s, [A, B, C], simple_mode=True)
-    print(stmt)
-    tgt = tvm.target.Target(tgt, tgt_host)
-    fadd = tvm.build(s, [A, B, C], target=tgt, name="myadd")
-    dev = tvm.device(tgt.kind.name, 0)
-    a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), dev)
-    c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), dev)
-    fadd(a, b, c)
-
-
-@tvm.testing.requires_llvm
-def test_in_bounds_llvm():
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
-    s = te.create_schedule(C.op)
-    tgt = "llvm"
-    tgt_host = "llvm"
-    stmt = tvm.lower(s, [A, B, C], simple_mode=True)
-    tgt = tvm.target.Target(tgt, tgt_host)
-    fadd = tvm.build(s, [A, B, C], target=tgt, name="myadd")
-    dev = tvm.device(tgt.kind.name, 0)
-    a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), dev)
-    c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), dev)
-    fadd(a, b, c)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_vectorize_llvm(nn, index_a, index_b):
-    n = tvm.runtime.convert(nn)
-    a = te.placeholder((n), name="a")
-    b = te.placeholder((n), name="b")
-    c = te.compute((n,), lambda i: a[i + index_a] + b[i + index_b], name="c")
-    s = te.create_schedule(c.op)
-    xo, xi = s[c].split(c.op.axis[0], factor=8)
-    s[c].parallel(xo)
-    s[c].vectorize(xi)
-    tgt = "llvm"
-    tgt_host = "llvm"
-    stmt = tvm.lower(s, [a, b, c], simple_mode=True)
-    tgt = tvm.target.Target(tgt, tgt_host)
-    f = tvm.build(s, [a, b, c], target=tgt, name="myaddvec")
-    dev = tvm.cpu(0)
-    n = nn
-    a = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), dev)
-    c = tvm.nd.array(np.zeros(n, dtype=c.dtype), dev)
-    f(a, b, c)
-
-
-@tvm.testing.requires_llvm
-def test_in_bounds_vectorize_llvm():
-    n = 512
-    lanes = 2
-    A = te.placeholder((n,), name="A", dtype="float32x%d" % lanes)
-    B = te.compute((n,), lambda i: A[i], name="B")
-    C = te.compute((n,), lambda i: B[i] + tvm.tir.const(1, A.dtype), name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], nparts=2)
-    _, xi = s[C].split(xi, factor=2)
-    s[C].parallel(xo)
-    s[C].vectorize(xi)
-    s[B].compute_at(s[C], xo)
-    xo, xi = s[B].split(B.op.axis[0], factor=2)
-    s[B].vectorize(xi)
-    # build and invoke the kernel.
-    lowered_func = tvm.lower(s, [A, C], "llvm", simple_mode=False)
-    f = tvm.build(s, [A, C], "llvm")
-    dev = tvm.cpu(0)
-    # launch the kernel.
-    a = tvm.nd.empty((n,), A.dtype).copyfrom(
-        np.random.uniform(size=[n] + ([] if lanes == 1 else [lanes]))
-    )
-    c = tvm.nd.empty((n,), C.dtype, dev)
-    f(a, c)
-    tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1)
-
-
-@tvm.testing.requires_llvm
-def test_in_bounds_loop_partition_basic_llvm():
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-
-    T = te.compute((n,), lambda i: A[i] + B[i])
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=4)
-    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [A, B, T], "llvm")
-    a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), dev)
-    t = tvm.nd.empty((32,), T.dtype, dev)
-    f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_loop_partition_basic_llvm(index_a, index_b):
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-
-    T = te.compute((n,), lambda i: A[i + index_a] + B[i + index_b])
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=4)
-    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [A, B, T], "llvm")
-    a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), dev)
-    t = tvm.nd.empty((32,), T.dtype, dev)
-    f(a, b, t)
-
-
-def test_in_bounds_const_loop_partition_ir():
-    def check_attr_stmt(x):
-        if (
-            isinstance(x, tvm.tir.AttrStmt)
-            and x.attr_key == "buffer_bound"
-            and tvm.ir.structural_equal(x.value.args, [n])
-        ):
-            return True
-        return False
-
-    def check_branch_stmt(x):
-        if isinstance(x, tvm.tir.IfThenElse):
-            return True
-        return False
-
-    def assert_bound_instrumentation(stmt, f, nums):
-        count = 0
-        for i in collect_visit(stmt, f):
-            if i is True:
-                count = count + 1
-        assert count == nums
-
-    def collect_branch_stmt(x):
-        if isinstance(x, tvm.tir.IfThenElse):
-            branch_collector.append(x)
-
-    n = tir.const(21)
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-
-    T = te.compute((n,), lambda i: A[i] + B[i])
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=4)
-
-    with tvm.transform.PassContext(
-        config={
-            "tir.instrument_bound_checkers": True,
-            "tir.LoopPartition": {"partition_const_loop": True},
-        }
-    ):
-        mod = tvm.driver.lower(s, [A, B, T], name="main")
-
-    stmt = mod["main"].body
-    # after instrumentation
-    assert_bound_instrumentation(stmt, check_attr_stmt, 2 * 3)
-    assert_bound_instrumentation(stmt, check_branch_stmt, 2)
-
-    branch_collector = list()
-    collect_visit(stmt, collect_branch_stmt)
-    assert len(branch_collector) == 2
-
-
-@tvm.testing.requires_llvm
-def test_in_bounds_const_loop_partition_llvm():
-    with tvm.transform.PassContext(
-        config={
-            "tir.instrument_bound_checkers": True,
-            "tir.LoopPartition": {"partition_const_loop": True},
-        }
-    ):
-        n = 21
-        A = te.placeholder((n,), name="A")
-        B = te.placeholder((n,), name="B")
-
-        T = te.compute((n,), lambda i: A[i] + B[i])
-        s = te.create_schedule(T.op)
-        xo, xi = s[T].split(T.op.axis[0], factor=4)
-        lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-        dev = tvm.cpu(0)
-
-        f = tvm.build(s, [A, B, T], "llvm")
-        a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), dev)
-        t = tvm.nd.empty((n,), T.dtype, dev)
-        f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_const_loop_partition_llvm(index_a, index_b):
-    with tvm.transform.PassContext(
-        config={
-            "tir.instrument_bound_checkers": True,
-            "tir.LoopPartition": {"partition_const_loop": True},
-        }
-    ):
-        n = 21
-        A = te.placeholder((n,), name="A")
-        B = te.placeholder((n,), name="B")
-
-        T = te.compute((n,), lambda i: A[i + index_a] + B[i + index_b])
-        s = te.create_schedule(T.op)
-        xo, xi = s[T].split(T.op.axis[0], factor=4)
-        lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-        dev = tvm.cpu(0)
-
-        f = tvm.build(s, [A, B, T], "llvm")
-        a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), dev)
-        t = tvm.nd.empty((n,), T.dtype, dev)
-        f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-def test_in_bounds_conv_llvm(loop_tiling=False):
-    HSTR = WSTR = 1
-    in_channel = 128
-    kernel_height = kernel_width = 3
-    out_channel = 64
-    batch_size = 1
-    in_height = in_width = 64
-    out_height = out_width = in_height - kernel_height + 1
-    data = te.placeholder((batch_size, in_channel, in_height, in_width), name="data")
-    kernel = te.placeholder((kernel_height, kernel_width, in_channel, out_channel), name="kernel")
-    ic = te.reduce_axis((0, in_channel), name="ic")
-    kh = te.reduce_axis((0, kernel_height), name="kh")
-    kw = te.reduce_axis((0, kernel_width), name="kw")
-    conv = te.compute(
-        (batch_size, out_channel, out_height, out_width),
-        lambda n, oc, oh, ow: te.sum(
-            data[n, ic, oh * HSTR + kh, ow * WSTR + kw] * kernel[kh, kw, ic, oc], axis=[ic, kh, kw]
-        ),
-        name="conv2d",
-    )
-    s = te.create_schedule(conv.op)
-
-    n, oc, oh, ow = conv.op.axis
-    if loop_tiling:
-        oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
-    lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True)
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [data, kernel, conv], "llvm")
-    data_input = tvm.nd.array(
-        np.random.uniform(size=(batch_size, in_channel, in_height, in_width)).astype("float32"), dev
-    )
-    kernel_input = tvm.nd.array(
-        np.random.uniform(size=(kernel_height, kernel_width, in_channel, out_channel)).astype(
-            "float32"
-        ),
-        dev,
-    )
-    conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width), "float32", dev)
-    f(data_input, kernel_input, conv_out)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_conv_llvm(data_offsets, kernel_offsets, loop_tiling=False):
-    HSTR = WSTR = 1
-    in_channel = 128
-    kernel_height = kernel_width = 3
-    out_channel = 64
-    batch_size = 1
-    in_height = in_width = 64
-    out_height = out_width = in_height - kernel_height + 1
-    data = te.placeholder((batch_size, in_channel, in_height, in_width), name="data")
-    kernel = te.placeholder((kernel_height, kernel_width, in_channel, out_channel), name="kernel")
-    ic = te.reduce_axis((0, in_channel), name="ic")
-    kh = te.reduce_axis((0, kernel_height), name="kh")
-    kw = te.reduce_axis((0, kernel_width), name="kw")
-    conv = te.compute(
-        (batch_size, out_channel, out_height, out_width),
-        lambda n, oc, oh, ow: te.sum(
-            data[
-                n + data_offsets[0],
-                ic + data_offsets[1],
-                oh * HSTR + kh + data_offsets[2],
-                ow * WSTR + kw + data_offsets[3],
-            ]
-            * kernel[
-                kh + kernel_offsets[0],
-                kw + kernel_offsets[1],
-                ic + kernel_offsets[2],
-                oc + kernel_offsets[3],
-            ],
-            axis=[ic, kh, kw],
-        ),
-        name="conv2d",
-    )
-    s = te.create_schedule(conv.op)
-
-    n, oc, oh, ow = conv.op.axis
-    if loop_tiling:
-        oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
-    lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True)
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [data, kernel, conv], "llvm")
-    data_input = tvm.nd.array(
-        np.random.uniform(size=(batch_size, in_channel, in_height, in_width)).astype("float32"), dev
-    )
-    kernel_input = tvm.nd.array(
-        np.random.uniform(size=(kernel_height, kernel_width, in_channel, out_channel)).astype(
-            "float32"
-        ),
-        dev,
-    )
-    conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width), "float32", dev)
-    f(data_input, kernel_input, conv_out)
-
-
-@tvm.testing.requires_llvm
-def test_in_bounds_tensors_with_same_shapes1D_llvm():
-    n = te.size_var("n")
-    k = te.size_var("k")
-    m = te.size_var("m")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((k,), name="B")
-
-    T = te.compute((m,), lambda i: A[i] * B[i])
-    s = te.create_schedule(T.op)
-    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [A, B, T], "llvm")
-    a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), dev)
-    t = tvm.nd.empty((32,), T.dtype, dev)
-    f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_tensors_with_diff_shapes1D_llvm(a_shape, b_shape, c_shape):
-    n = te.size_var("n")
-    k = te.size_var("k")
-    m = te.size_var("m")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((k,), name="B")
-
-    T = te.compute((m,), lambda i: A[i] * B[i])
-    s = te.create_schedule(T.op)
-    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [A, B, T], "llvm")
-    a = tvm.nd.array(np.random.uniform(size=(a_shape,)).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=(b_shape,)).astype(B.dtype), dev)
-    t = tvm.nd.empty((c_shape,), T.dtype, dev)
-    f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-def test_in_bounds_tensors_with_same_shapes2D_llvm():
-    n = te.size_var("n")
-    k = te.size_var("k")
-    m = te.size_var("m")
-    A = te.placeholder((n, n), name="A")
-    B = te.placeholder((k, k), name="B")
-
-    T = te.compute((m, m), lambda i, j: A[i][j] * B[i][j])
-    s = te.create_schedule(T.op)
-    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [A, B, T], "llvm")
-    a = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(B.dtype), dev)
-    t = tvm.nd.empty((32, 32), T.dtype, dev)
-    f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_tensors_with_diff_shapes2D_llvm(a_shape, b_shape, c_shape):
-    n = te.size_var("n")
-    k = te.size_var("k")
-    m = te.size_var("m")
-    A = te.placeholder((n, n), name="A")
-    B = te.placeholder((k, k), name="B")
-
-    T = te.compute((m, m), lambda i, j: A[i][j] * B[i][j])
-    s = te.create_schedule(T.op)
-    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [A, B, T], "llvm")
-    a = tvm.nd.array(np.random.uniform(size=(a_shape[0], a_shape[1])).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=(b_shape[0], b_shape[1])).astype(B.dtype), dev)
-    t = tvm.nd.empty((c_shape[0], c_shape[1]), T.dtype, dev)
-    f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-def test_in_bounds_tensors_with_same_shapes3D_llvm():
-    n = te.size_var("n")
-    k = te.size_var("k")
-    m = te.size_var("m")
-    A = te.placeholder((n, n, n), name="A")
-    B = te.placeholder((k, k, k), name="B")
-
-    T = te.compute((m, m, m), lambda i, j, p: A[i][j][p] * B[i][j][p])
-    s = te.create_schedule(T.op)
-    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [A, B, T], "llvm")
-    a = tvm.nd.array(np.random.uniform(size=(32, 32, 32)).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=(32, 32, 32)).astype(B.dtype), dev)
-    t = tvm.nd.empty((32, 32, 32), T.dtype, dev)
-    f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_tensors_with_diff_shapes3D_llvm(a_shape, b_shape, c_shape):
-    n = te.size_var("n")
-    k = te.size_var("k")
-    m = te.size_var("m")
-    A = te.placeholder((n, n, n), name="A")
-    B = te.placeholder((k, k, k), name="B")
-
-    T = te.compute((m, m, m), lambda i, j, p: A[i][j][p] * B[i][j][p])
-    s = te.create_schedule(T.op)
-    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [A, B, T], "llvm")
-    a = tvm.nd.array(
-        np.random.uniform(size=(a_shape[0], a_shape[1], c_shape[2])).astype(A.dtype), dev
-    )
-    b = tvm.nd.array(
-        np.random.uniform(size=(b_shape[0], b_shape[1], b_shape[2])).astype(B.dtype), dev
-    )
-    t = tvm.nd.empty((c_shape[0], c_shape[1], c_shape[2]), T.dtype, dev)
-    f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_tensors_with_zero_shape_op_with_not_zero_shape_llvm():
-    n = 64
-    A = te.placeholder((n,), name="A")
-    scale = te.placeholder((), name="scale")
-    k = te.reduce_axis((0, n), name="k")
-    C = te.compute((), lambda: te.sum(A[k + k + k] * scale, axis=k), name="C")
-    D = te.compute((), lambda: C + 1)
-    s = te.create_schedule(D.op)
-    stmt = tvm.lower(s, [A, scale, D], simple_mode=True)
-
-    # build and invoke the kernel.
-    f = tvm.build(s, [A, scale, D], "llvm")
-    dev = tvm.cpu(0)
-    # launch the kernel.
-    a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev)
-    sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), dev)
-    d = tvm.nd.empty((), D.dtype, dev)
-    f(a, sc, d)
-    d_np = np.sum(a.numpy()) * sc.numpy() + 1
-    tvm.testing.assert_allclose(d.numpy(), d_np)
-
-
-if __name__ == "__main__":
-    with tvm.transform.PassContext(
-        config={
-            "tir.instrument_bound_checkers": True,
-        }
-    ):
-        # zero scale
-        test_out_of_bounds_tensors_with_zero_shape_op_with_not_zero_shape_llvm()
-        # in bound
-        test_in_bounds_llvm()
-        # upper bound
-        test_out_of_bounds_llvm(1, 0)
-        test_out_of_bounds_llvm(0, 1)
-        test_out_of_bounds_llvm(1, 1)
-        test_out_of_bounds_llvm(10000, 0)
-        test_out_of_bounds_llvm(0, 10000)
-        test_out_of_bounds_llvm(10000, 10000)
-        # lower bound
-        test_out_of_bounds_llvm(-1, 0)
-        test_out_of_bounds_llvm(0, -1)
-        test_out_of_bounds_llvm(-1, -1)
-        test_out_of_bounds_llvm(-10000, 0)
-        test_out_of_bounds_llvm(0, -10000)
-        test_out_of_bounds_llvm(-10000, -10000)
-        # vectorize in bound
-        test_in_bounds_vectorize_llvm()
-        # vectorization upper bound
-        test_out_of_bounds_vectorize_llvm(1024, 1000, 0)
-        test_out_of_bounds_vectorize_llvm(1024, 0, 10000)
-        # vectorization lower bound
-        test_out_of_bounds_vectorize_llvm(1024, -1000, 0)
-        test_out_of_bounds_vectorize_llvm(1024, 0, -10000)
-        test_in_bounds_const_loop_partition_llvm()
-        test_out_of_bounds_const_loop_partition_llvm(1, 0)
-        test_out_of_bounds_const_loop_partition_llvm(0, 1)
-        test_out_of_bounds_const_loop_partition_llvm(-1, 0)
-        test_out_of_bounds_const_loop_partition_llvm(0, -1)
-        test_in_bounds_loop_partition_basic_llvm()
-        test_out_of_bounds_loop_partition_basic_llvm(32, 0)
-        test_out_of_bounds_loop_partition_basic_llvm(0, 32)
-        test_out_of_bounds_loop_partition_basic_llvm(-32, 0)
-        test_out_of_bounds_loop_partition_basic_llvm(0, -32)
-        # conv
-        test_in_bounds_conv_llvm()
-        test_out_of_bounds_conv_llvm([1, 0, 0, 0], [0, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 1, 0, 0], [0, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 1, 0], [0, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 1], [0, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([-1, 0, 0, 0], [0, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, -1, 0, 0], [0, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, -1, 0], [0, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, -1], [0, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [1, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 1, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 1, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, 1])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [-1, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, -1, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, -1, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, -1])
-        # loop tiling
-        test_in_bounds_conv_llvm(True)
-        test_out_of_bounds_conv_llvm([1, 0, 0, 0], [0, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 1, 0, 0], [0, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 1, 0], [0, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 1], [0, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([-1, 0, 0, 0], [0, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, -1, 0, 0], [0, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, -1, 0], [0, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, -1], [0, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [1, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 1, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 1, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, 1], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [-1, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, -1, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, -1, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, -1], True)
-        # tensors with diff shapes basic operation such as mul
-        test_out_of_bounds_tensors_with_diff_shapes1D_llvm(32, 64, 64)
-        test_out_of_bounds_tensors_with_diff_shapes1D_llvm(64, 32, 64)
-        test_out_of_bounds_tensors_with_diff_shapes2D_llvm([64, 64], [32, 32], [64, 64])
-        test_out_of_bounds_tensors_with_diff_shapes2D_llvm([32, 32], [64, 64], [64, 64])
-        test_out_of_bounds_tensors_with_diff_shapes3D_llvm([64, 64, 64], [32, 32, 32], [64, 64, 64])
-        test_out_of_bounds_tensors_with_diff_shapes3D_llvm([32, 32, 32], [64, 64, 64], [64, 64, 64])
-        # check tensors with the same shapes
-        test_in_bounds_tensors_with_same_shapes1D_llvm()
-        test_in_bounds_tensors_with_same_shapes2D_llvm()
-        test_in_bounds_tensors_with_same_shapes3D_llvm()
-        # ir tests
-        test_in_bounds_const_loop_partition_ir()
diff --git a/tests/python/tir-transform/test_tir_transform_loop_partition.py b/tests/python/tir-transform/test_tir_transform_loop_partition.py
index 5f24d1666fe6..bec4129ffcbf 100644
--- a/tests/python/tir-transform/test_tir_transform_loop_partition.py
+++ b/tests/python/tir-transform/test_tir_transform_loop_partition.py
@@ -29,74 +29,6 @@ def collect_visit(stmt, f):
     return ret
 
 
-def test_basic():
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-
-    T = te.compute((n,), lambda i: A[i] + B[i])
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=4)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([n], stmt).with_attr("global_symbol", "main"))
-    mod = tvm.tir.transform.LoopPartition()(mod)
-    stmt = tvm.tir.transform.Simplify()(mod)["main"]
-
-    assert not any(collect_visit(stmt.body.body[0], lambda x: isinstance(x, tvm.tir.IfThenElse)))
-    assert any(collect_visit(stmt.body.body[1], lambda x: isinstance(x, tvm.tir.IfThenElse)))
-
-
-def test_const_loop():
-    n = 21
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-
-    T = te.compute((n,), lambda i: A[i] + B[i])
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=4)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main"))
-    with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
-        mod = tvm.tir.transform.LoopPartition()(mod)
-        stmt = tvm.tir.transform.Simplify()(mod)["main"].body
-
-    assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-
-
-def test_no_unroll_loop():
-    n = 21
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-
-    T = te.compute((n,), lambda i: A[i] + B[i])
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=4)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main"))
-    with tvm.transform.PassContext(
-        config={
-            "tir.LoopPartition": {
-                "partition_const_loop": True,
-                "no_unroll_loop_with_extent_one": True,
-            }
-        }
-    ):
-        mod = tvm.tir.transform.LoopPartition()(mod)
-        mod = tvm.tir.transform.Simplify()(mod)
-        stmt = tvm.tir.transform.RemoveNoOp()(mod)["main"].body
-
-    assert sum(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.For))) == 4
-
-
 def test_multi_loop():
     ib = tvm.tir.ir_builder.create()
     m = te.size_var("m")
@@ -141,52 +73,6 @@ def test_multi_if():
     assert not any(collect_visit(stmt.body[0], lambda x: isinstance(x, tvm.tir.IfThenElse)))
 
 
-def test_thread_axis():
-    m = te.size_var("m")
-    l = te.size_var("l")
-    A = te.placeholder((m, l), name="A")
-    B = te.compute((m, l), lambda i, j: A[i, j] + 3, name="B")
-    s = te.create_schedule(B.op)
-
-    s[B].set_scope("shared")
-    num_thread = 16
-    xo, xi = s[B].split(B.op.axis[0], 32)
-    xi0, xi1 = s[B].split(xi, nparts=num_thread)
-    s[B].bind(xi0, te.thread_axis("threadIdx.x"))
-
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main"))
-    mod = tvm.tir.transform.LoopPartition()(mod)
-    stmt = tvm.tir.transform.Simplify()(mod)["main"]
-
-    assert not any(collect_visit(stmt.body.body[0], lambda x: isinstance(x, tvm.tir.IfThenElse)))
-
-
-def test_vectorize():
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    bias = te.size_var("bias", dtype="float32")
-    scale = te.size_var("scale", dtype="float32")
-    C = te.compute(A.shape, lambda *i: A(*i) + B(*i) * scale + bias, name="C")
-    # schedule
-    s = te.create_schedule(C.op)
-    # create iter var and assign them tags.
-    num_thread = 32
-    bx, x = s[C].split(C.op.axis[0], factor=num_thread * 4)
-    tx, x = s[C].split(x, nparts=num_thread)
-    _, x = s[C].split(x, factor=4)
-    s[C].bind(bx, te.thread_axis("blockIdx.x"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-    s[C].vectorize(x)
-    stmt = tvm.lower(s, [A, B], name="main")["main"]
-    body = stmt.body.body.body.body
-    assert x.var.name not in str(body.condition)
-    assert any(collect_visit(body.then_case, lambda x: isinstance(x, tvm.tir.Ramp)))
-
-
 def test_condition():
     ib = tvm.tir.ir_builder.create()
     m = te.size_var("m")
@@ -219,24 +105,6 @@ def test_condition_EQ():
     assert not any(collect_visit(stmt[0], lambda x: isinstance(x, tvm.tir.Select)))
 
 
-def test_thread_axis2():
-    n = tvm.runtime.convert(4096)
-    m = te.size_var("m")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
-    s = te.create_schedule(C.op)
-    num_thread = 32
-    bx, x = s[C].split(C.op.axis[0], factor=32)
-    tx, x = s[C].split(x, nparts=num_thread)
-    _, x = s[C].split(x, factor=m)
-    s[C].bind(bx, te.thread_axis("blockIdx.x"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-    stmt = tvm.lower(s, [A, B], name="main")["main"]
-    for_body = stmt.body.body.body.body[0]
-    assert "threadIdx" not in str(for_body.extent)
-
-
 def test_everything_during_deduction():
     m = te.size_var("m")
     n = te.size_var("n")
@@ -255,55 +123,6 @@ def test_everything_during_deduction():
     assert isinstance(stmt.body.body, tvm.tir.IfThenElse)
 
 
-def test_single_likely():
-    n = 60
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-
-    T = te.compute((n,), lambda i: A[i] + B[i])
-    s = te.create_schedule(T.op)
-    x = T.op.axis[0]
-    xo, xi = s[T].split(x, factor=16)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main"))
-
-    with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
-        mod = tvm.tir.transform.LoopPartition()(mod)
-        stmt = tvm.tir.transform.Simplify()(mod)["main"].body
-
-    assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-
-
-def test_multi_likely():
-    n = 94
-    m = 62
-    A = te.placeholder((n, m), name="A")
-    B = te.placeholder((n, m), name="B")
-
-    T = te.compute((n, m), lambda i, j: A[i, j] + B[i, j])
-    s = te.create_schedule(T.op)
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-    x, y = T.op.axis
-    xo, xi = s[T].split(x, factor=16)
-    yo, yi = s[T].split(y, factor=16)
-    s[T].reorder(xo, yo, xi, yi)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main"))
-
-    with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
-        mod = tvm.tir.transform.LoopPartition()(mod)
-        stmt = tvm.tir.transform.Simplify()(mod)["main"].body
-
-    assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-
-
 def test_oneD_pool():
     m = te.size_var("m")
     ib = tvm.tir.ir_builder.create()
@@ -415,135 +234,6 @@ def test_cce_loop_3():
     assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))
 
 
-def test_conv_tiling():
-    HSTR = WSTR = 1
-    in_channel = 128
-    kernel_height = kernel_width = 3
-    out_channel = 64
-    batch_size = 1
-    in_height = in_width = 64
-    out_height = out_width = in_height - kernel_height + 1
-    data = te.placeholder((batch_size, in_channel, in_height, in_width), name="data")
-    kernel = te.placeholder((kernel_height, kernel_width, in_channel, out_channel), name="kernel")
-    ic = te.reduce_axis((0, in_channel), name="ic")
-    kh = te.reduce_axis((0, kernel_height), name="kh")
-    kw = te.reduce_axis((0, kernel_width), name="kw")
-    conv = te.compute(
-        (batch_size, out_channel, out_height, out_width),
-        lambda n, oc, oh, ow: te.sum(
-            data[n, ic, oh * HSTR + kh, ow * WSTR + kw] * kernel[kh, kw, ic, oc], axis=[ic, kh, kw]
-        ),
-        name="conv2d",
-    )
-    s = te.create_schedule(conv.op)
-
-    n, oc, oh, ow = conv.op.axis
-    oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main"))
-    with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
-        mod = tvm.tir.transform.LoopPartition()(mod)
-        stmt = tvm.tir.transform.Simplify()(mod)["main"].body
-
-    assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-
-
-def test_multilevel_splitting_with_indivisble_factors():
-    from tvm import topi
-
-    A = te.placeholder((130,), dtype="float32")
-    B = topi.nn.relu(A)
-    s = te.create_schedule(B.op)
-    (y,) = s[B].op.axis
-    (yo, yi) = s[B].split(y, factor=8)
-    (yoo, yoi) = s[B].split(yo, factor=16)
-    s[B].reorder(yoo, yoi, yi)
-    s[B].unroll(yi)
-
-    ## But this does the right thing.
-    with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
-        lowered_body = tvm.lower(s, [A, B], name="x")["x"].body
-
-        def visit_stmt(op):
-            return isinstance(op, tvm.tir.Max)
-
-        num_max = collect_visit(lowered_body, visit_stmt)
-        assert num_max.count(True) == 10
-
-
-def test_double_splitting_with_indivisible_factors():
-    m = 48
-    dtype = "float32"
-    A = te.placeholder((m,), name="A", dtype=dtype)
-    C = te.compute((m,), lambda i: A[i], name="C")
-    D = te.compute((m,), lambda i: C[i], name="D")
-
-    s = te.create_schedule(D.op)
-    co, ci = s[C].split(C.op.axis[0], factor=10)
-    do, di = s[D].split(D.op.axis[0], 32)
-    s[C].compute_at(s[D], do)
-
-    target = "llvm"
-    with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
-        f = tvm.lower(s, [A, C, D], name="fadd1", simple_mode=False)
-        func = tvm.build(f, target=target)
-
-    top_produce = f["fadd1"].body
-    assert not any(collect_visit(top_produce, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-
-    # check functional correctness of generated code
-    dev = tvm.device(target, 0)
-    a = tvm.nd.array(
-        numpy.ones(
-            m,
-        ).astype(dtype),
-        dev,
-    )
-    c = tvm.nd.array(
-        numpy.zeros(
-            m,
-        ).astype(dtype),
-        dev,
-    )
-    d = tvm.nd.array(
-        numpy.zeros(
-            m,
-        ).astype(dtype),
-        dev,
-    )
-    func(a, c, d)
-    tvm.testing.assert_allclose(c.numpy(), a.numpy(), rtol=1e-5)
-    tvm.testing.assert_allclose(d.numpy(), a.numpy(), rtol=1e-5)
-
-
-def test_simple_rfactor():
-    K = 16 * 4 + 4
-    k = te.reduce_axis((0, K), "k")
-
-    A = te.placeholder((1, K), name="A")
-
-    B = te.compute((1,), lambda b: te.sum(A[b, k], axis=k), name="B")
-
-    s = te.create_schedule(B.op)
-    ko, _ = s[B].split(s[B].op.reduce_axis[0], 16)
-    BF = s.rfactor(B, ko, 0)
-
-    s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt1 = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    mod1 = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt1).with_attr("global_symbol", "main"))
-    stmt1 = tvm.tir.transform.Simplify()(mod1)["main"].body
-
-    with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
-        mod2 = tvm.tir.transform.LoopPartition()(mod1)
-        stmt2 = tvm.tir.transform.Simplify()(mod2)["main"].body
-
-    # make sure loop partition actually did something
-    assert not tvm.ir.structural_equal(stmt1.body, stmt2.body)
-
-
 @T.prim_func
 def partitioned_concat(
     A: T.Buffer((16,), "float32"), B: T.Buffer((16,), "float32"), C: T.Buffer((32,), "float32")
@@ -555,21 +245,6 @@ def partitioned_concat(
         C[i + 16] = B[i + 16]
 
 
-def test_explicit_partition_hint():
-    A = te.placeholder((16,), name="A")
-    B = te.placeholder((16,), name="B")
-    C = te.compute((32,), lambda i: te.if_then_else(i < 16, A[i], B[i]), name="C")
-    s = te.create_schedule(C.op)
-    s.normalize()
-    s[C].pragma(s[C].op.axis[0], "loop_partition_hint", True)
-    mod = tvm.driver.build_module.schedule_to_module(s, [A, B, C], "main", None)
-    with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
-        mod = tvm.tir.transform.StorageFlatten(64)(mod)
-        mod = tvm.tir.transform.LoopPartition()(mod)
-        mod = tvm.tir.transform.Simplify()(mod)
-    tvm.ir.assert_structural_equal(mod["main"], partitioned_concat)
-
-
 def partition_from_scheduled_tir(prim_func, pass_cfg, do_flatten=True):
     with tvm.transform.PassContext(config=pass_cfg):
         mod = IRModule.from_expr(prim_func.with_attr("global_symbol", "main"))
diff --git a/tests/python/tir-transform/test_tir_transform_lower_cross_thread_reduction.py b/tests/python/tir-transform/test_tir_transform_lower_cross_thread_reduction.py
index 35b4d55ea51d..63700853b36a 100644
--- a/tests/python/tir-transform/test_tir_transform_lower_cross_thread_reduction.py
+++ b/tests/python/tir-transform/test_tir_transform_lower_cross_thread_reduction.py
@@ -1897,21 +1897,6 @@ def test_no_thread_broadcast_rewrite():
     _check(no_thread_broadcast, lowered_no_thread_broadcast)
 
 
-def test_lower_te():
-    a = te.placeholder((32, 2, 2))
-    k1 = te.reduce_axis((0, 2), "k1")
-    k2 = te.reduce_axis((0, 2), "k2")
-    b = te.compute((32,), lambda i: te.sum(a[i, k1, k2], axis=[k1, k2]))
-    s = te.create_schedule(b.op)
-    s[b].bind(k1, te.thread_axis("threadIdx.x"))
-    s[b].bind(k2, te.thread_axis("threadIdx.y"))
-    orig_mod = tvm.driver.build_module.schedule_to_module(s, [a, b])
-    mod = tvm.tir.transform.LowerCrossThreadReduction()(orig_mod)
-    tvm.ir.assert_structural_equal(
-        mod, orig_mod
-    )  # LowerCrossThreadReduction should do nothing on TE
-
-
 def test_layer_norm_tuple_sum():
     _check(layer_norm_tuple_sum, lowered_layer_norm_tuple_sum)
 
diff --git a/tests/python/tir-transform/test_tir_transform_lower_init_block.py b/tests/python/tir-transform/test_tir_transform_lower_init_block.py
index 3ada747f6915..d05b8bc71f46 100644
--- a/tests/python/tir-transform/test_tir_transform_lower_init_block.py
+++ b/tests/python/tir-transform/test_tir_transform_lower_init_block.py
@@ -105,15 +105,6 @@ def test_lower_match_buffer():
     tvm.ir.assert_structural_equal(mod, BranchWithMatchBuffer, True)
 
 
-def test_lower_te():
-    x = te.placeholder((1,))
-    y = te.compute((1,), lambda i: x[i] + 2)
-    s = te.create_schedule(y.op)
-    orig_mod = tvm.driver.build_module.schedule_to_module(s, [x, y])
-    mod = tvm.tir.transform.LowerInitBlock()(orig_mod)
-    tvm.ir.assert_structural_equal(mod, orig_mod)  # LowerInitBlock should do nothing on TE
-
-
 if __name__ == "__main__":
     test_lower_reduction()
     test_lower_match_buffer()
diff --git a/tests/python/tir-transform/test_tir_transform_lower_intrin.py b/tests/python/tir-transform/test_tir_transform_lower_intrin.py
index 0764daac461a..3eb642fb51b3 100644
--- a/tests/python/tir-transform/test_tir_transform_lower_intrin.py
+++ b/tests/python/tir-transform/test_tir_transform_lower_intrin.py
@@ -47,9 +47,7 @@ def make_binds(i):
         return x
 
     C = te.compute((n,), make_binds)
-    s = te.create_schedule([C.op])
-
-    f = tvm.build(s, [A, B, C], "llvm")
+    f = tvm.build(te.create_prim_func([A, B, C]), "llvm")
     a = tvm.nd.array(np.array([x for x, y in data], dtype=expr.dtype))
     b = tvm.nd.array(np.array([y for x, y in data], dtype=expr.dtype))
     c = tvm.nd.array(np.zeros(len(data), dtype=expr.dtype))
diff --git a/tests/python/tir-transform/test_tir_transform_lower_opaque_block.py b/tests/python/tir-transform/test_tir_transform_lower_opaque_block.py
index ae44d2127595..dbaafb617aad 100644
--- a/tests/python/tir-transform/test_tir_transform_lower_opaque_block.py
+++ b/tests/python/tir-transform/test_tir_transform_lower_opaque_block.py
@@ -349,15 +349,6 @@ def test_symbolic_strided_buffer():
     _check(compacted_symbolic_strided_buffer_func, transformed_symbolic_strided_buffer_func)
 
 
-def test_lower_te():
-    x = te.placeholder((1,))
-    y = te.compute((1,), lambda i: x[i] + 2)
-    s = te.create_schedule(y.op)
-    orig_mod = tvm.driver.build_module.schedule_to_module(s, [x, y])
-    mod = tvm.tir.transform.LowerOpaqueBlock()(orig_mod)
-    tvm.ir.assert_structural_equal(mod, orig_mod)  # LowerOpaqueBlock should do nothing on TE
-
-
 def test_annotated_loops():
     mod = tvm.IRModule.from_expr(annotated_loops.with_attr("global_symbol", "main"))
     mod = tvm.tir.transform.LowerOpaqueBlock()(mod)
diff --git a/tests/python/tir-transform/test_tir_transform_lower_warp_memory.py b/tests/python/tir-transform/test_tir_transform_lower_warp_memory.py
deleted file mode 100644
index 99ccc5556585..000000000000
--- a/tests/python/tir-transform/test_tir_transform_lower_warp_memory.py
+++ /dev/null
@@ -1,356 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import pytest
-import tvm
-import tvm.testing
-from tvm import te, tir
-from tvm.contrib.nvcc import have_fp16
-
-
-def _run_passes(mod):
-    cuda_target = tvm.target.Target("cuda", host="llvm")
-    assert cuda_target.thread_warp_size == 32
-    mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", cuda_target))(mod)
-    mod = tvm.tir.transform.AnnotateDeviceRegions()(mod)
-    mod = tvm.tir.transform.SplitHostDevice()(mod)
-    mod = tvm.tir.transform.LowerWarpMemory()(mod)
-    return mod
-
-
-@tvm.testing.requires_cuda
-def test_lower_warp_memory_local_scope():
-    m = 128
-    A = te.placeholder((m,), name="A")
-    B = te.compute((m,), lambda i: A[i] + 3, name="B")
-
-    s = te.create_schedule(B.op)
-    AA = s.cache_read(A, "warp", [B])
-    xo, xi = s[B].split(B.op.axis[0], 64)
-    xi0, xi1 = s[B].split(xi, factor=32)
-    tx = te.thread_axis("threadIdx.x")
-    s[B].bind(xi1, tx)
-    s[B].bind(xo, te.thread_axis("blockIdx.x"))
-    s[AA].compute_at(s[B], xo)
-    xo, xi = s[AA].split(s[AA].op.axis[0], 32)
-    s[AA].bind(xi, tx)
-
-    # lowering with the CSE pass disabled as otherwise it would do some commoning
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-        mod = tvm.lower(s, [A, B], name="f")
-
-    mod = _run_passes(mod)
-    fdevice = mod["f_kernel"]
-
-    allocate = fdevice
-    while not isinstance(allocate, tir.Allocate):
-        allocate = allocate.body
-
-    assert allocate.buffer_var.type_annotation.storage_scope == "local"
-    assert allocate.extents[0].value == 2
-
-
-@tvm.testing.requires_cuda
-def test_lower_warp_memory_correct_indices():
-    n = 32
-    A = te.placeholder((2, n, n), name="A", dtype="float32")
-    C = te.compute((2, n, n), lambda x, i, j: A(x, i, (j + 1) % n), name="C")
-
-    s = te.create_schedule(C.op)
-    bk_x = te.thread_axis("blockIdx.x")
-    th_y = te.thread_axis("threadIdx.y")
-    th_x = te.thread_axis("threadIdx.x")
-    B = s.cache_read(A, "warp", [C])
-    cx, ci, cj = C.op.axis
-    bx, bi, bj = B.op.axis
-    s[C].bind(cj, th_x)
-    s[C].bind(cx, bk_x)
-    s[B].compute_at(s[C], cx)
-    s[B].bind(bi, th_y)
-    s[B].bind(bj, th_x)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    ir = tvm.te.schedule.ScheduleOps(s, bounds)
-    inner_func = ir.body.body.body
-    store_A_warp = inner_func.seq[0].body.body
-    indices = list(store_A_warp.indices)
-
-    # A.warp is actually many buffers, one for each warp, although they are all called A.warp
-    # 1. If we are accessing from different threads within a same warp (different
-    #    threadIdx.x), we need to distinguish between each elements using threadIdx.x,
-    #    so threadIdx.x is one if the indices.
-    # 2. If we are accessing from different warps (different threadIdx.y), we are actually
-    #    assessing different buffers, so there is no need to distinguish from elements,
-    #    and therefore threadIdx.y is NOT a index.
-    idx_names = map(lambda x: x.name, filter(lambda x: type(x) is tvm.tir.expr.Var, indices))
-    assert "threadIdx.x" in idx_names
-    assert "threadIdx.y" not in idx_names
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_lower_warp_memory_cuda_end_to_end():
-    def check_cuda(dtype):
-        if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
-            print("Skip because gpu does not have fp16 support")
-            return
-
-        m = 128
-        A = te.placeholder((m,), name="A", dtype=dtype)
-        B = te.compute((m,), lambda i: A[i // 32 * 32 + (i + 1) % 32], name="B")
-
-        cuda_target = tvm.target.Target("cuda", host="llvm")
-        assert cuda_target.thread_warp_size == 32
-        with cuda_target:
-            s = te.create_schedule(B.op)
-            AA = s.cache_read(A, "warp", [B])
-            xo, xi = s[B].split(B.op.axis[0], 64)
-            xi0, xi1 = s[B].split(xi, factor=32)
-            tx = te.thread_axis("threadIdx.x")
-            s[B].bind(xi1, tx)
-            s[B].bind(xo, te.thread_axis("blockIdx.x"))
-            s[AA].compute_at(s[B], xo)
-            xo, xi = s[AA].split(s[AA].op.axis[0], 32)
-            s[AA].bind(xi, tx)
-
-            dev = tvm.cuda(0)
-            # building with the CSE pass disabled as otherwise it would do some commoning
-            with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-                func = tvm.build(s, [A, B], "cuda")
-            A_np = np.array(list(range(m)), dtype=dtype)
-            B_np = np.array(
-                list(range(1, 32))
-                + [0]
-                + list(range(33, 64))
-                + [32]
-                + list(range(65, 96))
-                + [64]
-                + list(range(97, 128))
-                + [96],
-                dtype=dtype,
-            )
-            A_nd = tvm.nd.array(A_np, dev)
-            B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), dev)
-            func(A_nd, B_nd)
-            tvm.testing.assert_allclose(B_nd.numpy(), B_np, rtol=1e-3)
-
-    check_cuda("float32")
-    check_cuda("float16")
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_lower_warp_memory_cuda_half_a_warp():
-    def check_cuda(dtype):
-        if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
-            print("Skip because gpu does not have fp16 support")
-            return
-
-        n, m = 16, 16
-        A = te.placeholder(
-            (
-                n,
-                m,
-            ),
-            name="A",
-            dtype=dtype,
-        )
-        B = te.compute(
-            (
-                n,
-                m,
-            ),
-            lambda j, i: A[j, (i + 1) % m],
-            name="B",
-        )
-
-        cuda_target = tvm.target.Target("cuda", host="llvm")
-        assert cuda_target.thread_warp_size == 2 * m
-        with cuda_target:
-            s = te.create_schedule(B.op)
-            tx = te.thread_axis("threadIdx.x")
-            ty = te.thread_axis("threadIdx.y")
-            bx = te.thread_axis("blockIdx.x")
-
-            AA = s.cache_read(A, "warp", [B])
-            y, x = B.op.axis
-            z, y = s[B].split(y, nparts=2)
-            s[B].bind(x, tx)
-            s[B].bind(y, ty)
-            s[B].bind(z, bx)
-            s[AA].compute_at(s[B], y)
-            _, x = AA.op.axis
-            s[AA].bind(x, tx)
-
-            dev = tvm.cuda(0)
-            # building with the CSE pass disabled as otherwise it would do some commoning
-            with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-                func = tvm.build(s, [A, B], "cuda")
-            A_np = np.array([list(range(i, m + i)) for i in range(n)], dtype=dtype)
-            B_np = np.array([list(range(1 + i, m + i)) + [i] for i in range(n)], dtype=dtype)
-            A_nd = tvm.nd.array(A_np, dev)
-            B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), dev)
-            func(A_nd, B_nd)
-            tvm.testing.assert_allclose(B_nd.numpy(), B_np, rtol=1e-3)
-
-    check_cuda("float32")
-    check_cuda("float16")
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_lower_warp_memory_cuda_2_buffers():
-    def check_cuda(dtype):
-        if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
-            print("Skip because gpu does not have fp16 support")
-            return
-
-        m = 32
-        A = te.placeholder((m,), name="A", dtype=dtype)
-        B = te.placeholder((m,), name="B", dtype=dtype)
-        C = te.compute((m,), lambda i: A[(i + 1) % m] + B[(i + 1) % m], name="C")
-
-        cuda_target = tvm.target.Target("cuda", host="llvm")
-        assert m <= cuda_target.thread_warp_size
-        with cuda_target:
-            s = te.create_schedule(C.op)
-            tx = te.thread_axis("threadIdx.x")
-            bx = te.thread_axis("blockIdx.x")
-
-            AA = s.cache_read(A, "warp", [C])
-            BB = s.cache_read(B, "warp", [C])
-            xo, xi = s[C].split(C.op.axis[0], nparts=1)
-            s[C].bind(xi, tx)
-            s[C].bind(xo, bx)
-            s[AA].compute_at(s[C], xo)
-            s[BB].compute_at(s[C], xo)
-            xo, xi = s[AA].split(s[AA].op.axis[0], nparts=1)
-            s[AA].bind(xo, bx)
-            s[AA].bind(xi, tx)
-            xo, xi = s[BB].split(s[BB].op.axis[0], nparts=1)
-            s[BB].bind(xo, bx)
-            s[BB].bind(xi, tx)
-
-            dev = tvm.cuda(0)
-            # building with the CSE pass disabled as otherwise it would do some commoning
-            with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-                func = tvm.build(s, [A, B, C], "cuda")
-            AB_np = np.array(list(range(m)), dtype=dtype)
-            C_np = np.array(list(range(1, m)) + [0], dtype=dtype) * 2
-            A_nd = tvm.nd.array(AB_np, dev)
-            B_nd = tvm.nd.array(AB_np, dev)
-            C_nd = tvm.nd.array(np.zeros(C_np.shape, dtype=C_np.dtype), dev)
-            func(A_nd, B_nd, C_nd)
-            tvm.testing.assert_allclose(C_nd.numpy(), C_np, rtol=1e-3)
-
-    check_cuda("float32")
-    check_cuda("float16")
-
-
-@tvm.testing.requires_gpu
-def test_lower_warp_memory_roundup():
-    def check(device, m):
-        A = te.placeholder((m,), name="A")
-        B = te.compute((m,), lambda i: A[i] + 1, name="B")
-
-        with tvm.target.Target(device):
-            s = te.create_schedule(B.op)
-            xo, xi = s[B].split(B.op.axis[0], factor=32)
-            tx = te.thread_axis("threadIdx.x")
-            s[B].bind(xo, te.thread_axis("blockIdx.x"))
-            s[B].bind(xi, tx)
-
-            AA = s.cache_read(A, "warp", [B])
-            _, yi = s[AA].split(s[AA].op.axis[0], factor=32)
-            s[AA].bind(yi, tx)
-            s[AA].compute_at(s[B], xo)
-
-            dev = tvm.device(device, 0)
-            # building with the CSE pass disabled as otherwise it would do some commoning
-            with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-                func = tvm.build(s, [A, B], device)
-            A_np = np.random.uniform(size=(m,)).astype(A.dtype)
-            B_np = np.zeros(shape=(m,)).astype(B.dtype)
-            A_nd = tvm.nd.array(A_np, dev)
-            B_nd = tvm.nd.array(B_np, dev)
-            func(A_nd, B_nd)
-            B_np = A_np + 1
-            tvm.testing.assert_allclose(B_nd.numpy(), B_np)
-
-    for device in ["cuda", "rocm"]:
-        if not tvm.testing.device_enabled(device):
-            print("skip because", device, "is not enabled..")
-            continue
-        check(device, m=31)
-        check(device, m=32)
-        check(device, m=33)
-        check(device, m=63)
-        check(device, m=64)
-        check(device, m=65)
-
-
-@tvm.testing.requires_cuda
-def test_lower_warp_memory_same_thread():
-    m = n = 128
-    A = te.placeholder((m, n), name="A")
-    k = te.reduce_axis((0, n), name="k")
-    B = te.compute((m,), lambda i: te.sum(A[i, k], axis=[k]))
-
-    s = te.create_schedule(B.op)
-    BB = s.cache_write(B, "warp")
-    tx = te.thread_axis("threadIdx.x")
-    xo, xi = s[B].split(B.op.axis[0], factor=32)
-    s[B].bind(xi, tx)
-    s[B].bind(xo, te.thread_axis("blockIdx.x"))
-    s[BB].compute_at(s[B], xo)
-    xo, xi = s[BB].split(s[BB].op.axis[0], factor=32)
-    s[BB].bind(xi, tx)
-
-    # lowering with the CSE pass disabled as otherwise it would do some commoning
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-        mod = tvm.lower(s, [A, B], name="f")
-
-    mod = _run_passes(mod)
-    fdevice = mod["f_kernel"]
-    assert "tvm_warp_shuffle" not in fdevice.script()
-
-
-@tvm.testing.requires_cuda
-def test_lower_warp_memory_divide_by_factor():
-    ib = tvm.tir.ir_builder.IRBuilder()
-    bx = te.thread_axis("blockIdx.x")
-    tx = te.thread_axis("threadIdx.x")
-
-    with ib.new_scope():
-        ib.scope_attr(bx, "thread_extent", 32)
-        ib.scope_attr(tx, "thread_extent", 32)
-        t = ib.allocate("float32", 16, name="t", scope="warp")
-        n = ib.allocate("float32", 16, name="n", scope="local")
-        n[0] = t[0]
-
-    stmt = ib.get()
-    func = tvm.tir.PrimFunc([], stmt)
-    func = func.with_attr("from_legacy_te_schedule", True)
-    # lowering with the CSE pass disabled as otherwise it would do some commoning
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-        mod = tvm.lower(func, name="f")
-    with pytest.raises(tvm.error.TVMError, match="Divide by zero") as cm:
-        _run_passes(mod)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/tir-transform/test_tir_transform_make_packed_api.py b/tests/python/tir-transform/test_tir_transform_make_packed_api.py
index f783ab2fcef1..8605d5185d90 100644
--- a/tests/python/tir-transform/test_tir_transform_make_packed_api.py
+++ b/tests/python/tir-transform/test_tir_transform_make_packed_api.py
@@ -21,32 +21,6 @@
 import tvm.testing
 from tvm import te, tir
 from tvm.script import tir as T, ir as I
-from tvm.driver.build_module import schedule_to_module
-
-
-def test_makeapi():
-    """Not yet working, mock design"""
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
-    s = te.create_schedule(C.op)
-
-    mod = schedule_to_module(s, [n, A, B, C])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-    mod = tvm.tir.transform.Apply(
-        lambda f: f.with_attr(
-            {
-                "target": tvm.target.Target("llvm", host="llvm"),
-                "global_symbol": "main",
-            }
-        )
-    )(mod)
-
-    before = mod
-    after = tvm.tir.transform.MakePackedAPI()(before)
-    f = after["main"]
-    assert len(f.params) == 6
 
 
 def _find_assignment(stmt, var_name):
diff --git a/tests/python/tir-transform/test_tir_transform_merge_dynamic_shared_memory_allocations.py b/tests/python/tir-transform/test_tir_transform_merge_dynamic_shared_memory_allocations.py
index 9bb0aaf6e8e8..ee78dab2cbfe 100644
--- a/tests/python/tir-transform/test_tir_transform_merge_dynamic_shared_memory_allocations.py
+++ b/tests/python/tir-transform/test_tir_transform_merge_dynamic_shared_memory_allocations.py
@@ -19,314 +19,10 @@
 import tvm
 import tvm.testing
 from tvm import te
-from tvm.driver.build_module import schedule_to_module
 from tvm.topi.math import cast
 from tvm.script import tir as T
 
 
-def run_passes(sch, args):
-    mod = schedule_to_module(sch, args)
-    return tvm.transform.Sequential(
-        [
-            tvm.tir.transform.StorageFlatten(64),
-            tvm.tir.transform.Simplify(),
-            tvm.tir.transform.VectorizeLoop(),
-            tvm.tir.transform.StorageRewrite(),
-            tvm.tir.transform.MergeSharedMemoryAllocations(),
-        ]
-    )(mod)
-
-
-def verify_single_allocation(stmt, alloc_size=None):
-    num_alloc = [0]
-    alloc_extents = []
-
-    def verify(n):
-        if (
-            isinstance(n, tvm.tir.Allocate)
-            and n.buffer_var.type_annotation.storage_scope == "shared.dyn"
-        ):
-            num_alloc[0] += 1
-            alloc_extents.append(n.extents[0])
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-    assert num_alloc[0] == 1
-
-    if alloc_size:
-        assert alloc_extents[0] == alloc_size
-
-
-@tvm.testing.requires_gpu
-def test_matmul_dyn_shared():
-    n = 1024
-    block = 16
-    A = te.placeholder((n, n), name="A", dtype="float16")
-    B = te.placeholder((n, n), name="B", dtype="float16")
-
-    def syncthread():
-        return tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"]))
-
-    def test_matmul_ir(A, B, C):
-        ib = tvm.tir.ir_builder.create()
-
-        tx = te.thread_axis("threadIdx.x")
-        ty = te.thread_axis("threadIdx.y")
-        bx = te.thread_axis("blockIdx.x")
-        by = te.thread_axis("blockIdx.y")
-        ib.scope_attr(tx, "thread_extent", block)
-        ib.scope_attr(ty, "thread_extent", block)
-        ib.scope_attr(bx, "thread_extent", n // block)
-        ib.scope_attr(by, "thread_extent", n // block)
-
-        A_sh = ib.allocate(A.dtype, (block, block), scope="shared.dyn", name="A_sh")  # fp16
-        B_sh = ib.allocate(B.dtype, (block, block), scope="shared.dyn", name="B_sh")  # fp16
-        # Create a dynamic shared memory for the accumulation.
-        # This is for testing merging dynamic shared memory alloctions with different data type.
-        # In practice, there is no need to allocate a shared memory for C.
-        C_local = ib.allocate(C.dtype, (1,), scope="local", name="C_local")
-        C_sh = ib.allocate(C.dtype, (block, block), scope="shared.dyn", name="C_sh")  # fp32
-
-        A_ptr = ib.buffer_ptr(A)
-        B_ptr = ib.buffer_ptr(B)
-        C_ptr = ib.buffer_ptr(C)
-
-        C_local[0] = 0.0
-
-        with ib.for_range(0, n // block, name="i") as i:
-            A_sh[ty, tx] = A_ptr[by * block + ty, i * block + tx]
-            B_sh[ty, tx] = B_ptr[i * block + ty, bx * block + tx]
-            ib.emit(syncthread())
-
-            with ib.for_range(0, block, name="k") as k:
-                C_local[0] += cast(A_sh[ty, k] * B_sh[k, tx], "float32")
-            ib.emit(syncthread())
-
-        C_sh[ty, tx] = C_local[0]
-        C_ptr[by * block + ty, bx * block + tx] = C_sh[ty, tx]
-
-        return ib.get()
-
-    C = te.extern(
-        A.shape,
-        [A, B],
-        lambda ins, outs: test_matmul_ir(ins[0], ins[1], outs[0]),
-        name="matmul",
-        dtype="float32",
-    )
-    s = te.create_schedule(C.op)
-    mod = run_passes(s, [A, B, C])
-    # C can be allocated at the start of A, so we only need to allocate 2 block * block memory with dtype = float16
-    expected_alloc_size = block * block * 4
-    verify_single_allocation(mod["main"].body, expected_alloc_size)
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        fmatmul = tvm.build(s, [A, B, C], target)
-        dev = tvm.device(target, 0)
-
-        size = (n, n)
-        a_np = np.random.uniform(size=size).astype(A.dtype)
-        b_np = np.random.uniform(size=size).astype(B.dtype)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(size, dtype=C.dtype), dev)
-        fmatmul(a, b, c)
-        np_ref = np.dot(a_np.astype("float32"), b_np.astype("float32"))
-        tvm.testing.assert_allclose(c.numpy(), np_ref, 1e-4, 1e-4)
-
-    for target in ["cuda", "nvptx"]:
-        check_target(target)
-
-
-@tvm.testing.requires_gpu
-def test_dyn_shared_vectorized_store():
-    """Test vectorized store into dynamic shared memory"""
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A", dtype="float16")
-    B = te.placeholder((n,), name="B", dtype="float32")
-
-    def test_device_ir(A, B, C):
-        n = A.shape[0]
-        ib = tvm.tir.ir_builder.create()
-
-        values_per_thread = 4
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(tx, "thread_extent", tvm.tir.indexdiv(n, values_per_thread))
-
-        A_sh = ib.allocate(A.dtype, (n,), scope="shared.dyn")  # fp16
-        B_sh = ib.allocate(B.dtype, (n,), scope="shared.dyn")  # fp32
-
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-        Cptr = ib.buffer_ptr(C)
-
-        with ib.for_range(0, values_per_thread, kind="vectorize") as i:
-            A_sh[tx * values_per_thread + i] = Aptr[tx * values_per_thread + i]
-            B_sh[tx * values_per_thread + i] = Bptr[tx * values_per_thread + i]
-
-        with ib.for_range(0, values_per_thread) as i:
-            Cptr[tx * values_per_thread + i] = (
-                cast(A_sh[tx * values_per_thread + i], "float32") + B_sh[tx * values_per_thread + i]
-            )
-
-        return ib.get()
-
-    C = te.extern(
-        (n,),
-        [A, B],
-        lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
-        name="vadd",
-        dtype="float32",
-    )
-    s = te.create_schedule(C.op)
-
-    mod = run_passes(s, [A, B, C])
-    verify_single_allocation(mod["main"].body)
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        fadd = tvm.build(s, [A, B, C], target)
-        dev = tvm.device(target, 0)
-
-        for n in [512, 1024]:
-            a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-            b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-            c = tvm.nd.array(np.zeros((n,), dtype=C.dtype), dev)
-            fadd(a, b, c)
-            tvm.testing.assert_allclose(
-                c.numpy(), a.numpy().astype("float32") + b.numpy(), 1e-4, 1e-4
-            )
-
-    for target in ["cuda", "nvptx"]:
-        check_target(target)
-
-
-@tvm.testing.requires_gpu
-def test_dyn_shared_reuse_and_merge():
-    n = 64
-    A = te.placeholder((n,), name="A", dtype="float32")
-    B = te.placeholder((n,), name="B", dtype="float32")
-    C = te.placeholder((te.size_var("n_dyn"),), name="C", dtype="float32")
-
-    def test_device_ir(A, B, C, D):
-        ib = tvm.tir.ir_builder.create()
-
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(tx, "thread_extent", n)
-
-        A_sh = ib.allocate(A.dtype, (n,), scope="shared.dyn", name="A_sh")
-        B_sh = ib.allocate(B.dtype, (n,), scope="shared.dyn", name="B_sh")
-        C_sh = ib.allocate(C.dtype, (C.shape[0],), scope="shared.dyn", name="C_sh")
-
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-        Cptr = ib.buffer_ptr(C)
-        Dptr = ib.buffer_ptr(D)
-
-        A_sh[tx] = Aptr[tx]
-        Dptr[tx] = A_sh[tx]
-
-        B_sh[tx] = Bptr[tx]
-        Dptr[tx] += B_sh[tx]
-
-        C_sh[tx] = Cptr[tx]  # C cannot reuse other buffers since it size is dynamic
-        Dptr[tx] += C_sh[tx]
-
-        return ib.get()
-
-    D = te.extern(
-        (n,),
-        [A, B, C],
-        lambda ins, outs: test_device_ir(ins[0], ins[1], ins[2], outs[0]),
-        name="vadd",
-        dtype="float32",
-    )
-    s = te.create_schedule(D.op)
-
-    mod = run_passes(s, [A, B, C, D])
-    # merged allocation
-    # allocate(buf_dyn_shmem: Pointer(shared.dyn uint8), uint8, [((n_dyn*4) + 256)]);
-    verify_single_allocation(mod["main"].body)
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        fadd = tvm.build(s, [A, B, C, D], target)
-        dev = tvm.device(target, 0)
-
-        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-        c = tvm.nd.array(np.random.uniform(size=n).astype(C.dtype), dev)
-        d = tvm.nd.array(np.zeros((n,), dtype=D.dtype), dev)
-        fadd(a, b, c, d)
-        tvm.testing.assert_allclose(d.numpy(), a.numpy() + b.numpy() + c.numpy(), 1e-4, 1e-4)
-
-    for target in ["cuda", "nvptx"]:
-        check_target(target)
-
-
-def test_dyn_shared_more_dtype():
-    """Test vectorized store into dynamic shared memory"""
-    n = 512
-    A = te.placeholder((n,), name="A", dtype="int8")
-    B = te.placeholder((n,), name="B", dtype="int16")
-
-    def test_device_ir(A, B, C):
-        n = A.shape[0]
-        ib = tvm.tir.ir_builder.create()
-
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(tx, "thread_extent", n)
-
-        A_sh = ib.allocate(A.dtype, (n,), scope="shared.dyn")  # i8
-        B_sh = ib.allocate(B.dtype, (n,), scope="shared.dyn")  # i16
-        C_sh = ib.allocate(C.dtype, (n,), scope="shared.dyn")  # i32
-
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-        Cptr = ib.buffer_ptr(C)
-
-        A_sh[tx] = Aptr[tx]
-        B_sh[tx] = Bptr[tx]
-
-        C_sh[tx] = cast(A_sh[tx], "int32") + cast(B_sh[tx], "int32")
-        Cptr[tx] = C_sh[tx]
-        return ib.get()
-
-    C = te.extern(
-        (n,),
-        [A, B],
-        lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
-        name="vadd",
-        dtype="int32",
-    )
-    s = te.create_schedule(C.op)
-
-    mod = run_passes(s, [A, B, C])
-    verify_single_allocation(mod["main"].body, n * 4)
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        fadd = tvm.build(s, [A, B, C], target)
-        dev = tvm.device(target, 0)
-
-        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros((n,), dtype=C.dtype), dev)
-        fadd(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy().astype("float32") + b.numpy(), 1e-4, 1e-4)
-
-    for target in ["cuda", "nvptx"]:
-        check_target(target)
-
-
 class TestMatmul(tvm.testing.CompareBeforeAfter):
     """Shared allocations should be merged, preserving DeclBuffer if present
 
diff --git a/tests/python/tir-transform/test_tir_transform_merge_static_shared_memory_allocations.py b/tests/python/tir-transform/test_tir_transform_merge_static_shared_memory_allocations.py
deleted file mode 100644
index be32514a720c..000000000000
--- a/tests/python/tir-transform/test_tir_transform_merge_static_shared_memory_allocations.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import te
-from tvm.driver.build_module import schedule_to_module
-from tvm.topi.math import cast
-from tvm.script import tir as T
-
-
-def run_passes(sch, args):
-    mod = schedule_to_module(sch, args)
-    with tvm.transform.PassContext(config={"tir.merge_static_smem": True}):
-        return tvm.transform.Sequential(
-            [
-                tvm.tir.transform.StorageFlatten(64),
-                tvm.tir.transform.Simplify(),
-                tvm.tir.transform.VectorizeLoop(),
-                tvm.tir.transform.StorageRewrite(),
-                tvm.tir.transform.MergeSharedMemoryAllocations(),
-            ]
-        )(mod)
-
-
-def verify_single_allocation(stmt, alloc_size=None):
-    num_alloc = [0]
-    alloc_extents = []
-
-    def verify(n):
-        if (
-            isinstance(n, tvm.tir.Allocate)
-            and n.buffer_var.type_annotation.storage_scope == "shared"
-        ):
-            num_alloc[0] += 1
-            alloc_extents.append(n.extents[0])
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-    assert num_alloc[0] == 1
-
-    if alloc_size:
-        assert alloc_extents[0] == alloc_size
-
-
-@tvm.testing.requires_gpu
-def test_matmul_shared():
-    n = 1024
-    block = 16
-    A = te.placeholder((n, n), name="A", dtype="float16")
-    B = te.placeholder((n, n), name="B", dtype="float16")
-
-    def syncthread():
-        return tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"]))
-
-    def test_matmul_ir(A, B, C):
-        ib = tvm.tir.ir_builder.create()
-
-        tx = te.thread_axis("threadIdx.x")
-        ty = te.thread_axis("threadIdx.y")
-        bx = te.thread_axis("blockIdx.x")
-        by = te.thread_axis("blockIdx.y")
-        ib.scope_attr(tx, "thread_extent", block)
-        ib.scope_attr(ty, "thread_extent", block)
-        ib.scope_attr(bx, "thread_extent", n // block)
-        ib.scope_attr(by, "thread_extent", n // block)
-
-        A_sh = ib.allocate(A.dtype, (block, block), scope="shared", name="A_sh")  # fp16
-        B_sh = ib.allocate(B.dtype, (block, block), scope="shared", name="B_sh")  # fp16
-        # Create a shared memory for the accumulation.
-        # This is for testing merging shared memory alloctions with different data type.
-        # In practice, there is no need to allocate a shared memory for C.
-        C_local = ib.allocate(C.dtype, (1,), scope="local", name="C_local")
-        C_sh = ib.allocate(C.dtype, (block, block), scope="shared", name="C_sh")  # fp32
-
-        A_ptr = ib.buffer_ptr(A)
-        B_ptr = ib.buffer_ptr(B)
-        C_ptr = ib.buffer_ptr(C)
-
-        C_local[0] = 0.0
-
-        with ib.for_range(0, n // block, name="i") as i:
-            A_sh[ty, tx] = A_ptr[by * block + ty, i * block + tx]
-            B_sh[ty, tx] = B_ptr[i * block + ty, bx * block + tx]
-            ib.emit(syncthread())
-
-            with ib.for_range(0, block, name="k") as k:
-                C_local[0] += cast(A_sh[ty, k] * B_sh[k, tx], "float32")
-            ib.emit(syncthread())
-
-        C_sh[ty, tx] = C_local[0]
-        C_ptr[by * block + ty, bx * block + tx] = C_sh[ty, tx]
-
-        return ib.get()
-
-    C = te.extern(
-        A.shape,
-        [A, B],
-        lambda ins, outs: test_matmul_ir(ins[0], ins[1], outs[0]),
-        name="matmul",
-        dtype="float32",
-    )
-    s = te.create_schedule(C.op)
-    mod = run_passes(s, [A, B, C])
-    # C can be allocated at the start of A, so we only need to allocate 2 block * block memory with dtype = float16
-    expected_alloc_size = block * block * 4
-    verify_single_allocation(mod["main"].body, expected_alloc_size)
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        fmatmul = tvm.build(s, [A, B, C], target)
-        dev = tvm.device(target, 0)
-
-        size = (n, n)
-        a_np = np.random.uniform(size=size).astype(A.dtype)
-        b_np = np.random.uniform(size=size).astype(B.dtype)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(size, dtype=C.dtype), dev)
-        fmatmul(a, b, c)
-        np_ref = np.dot(a_np.astype("float32"), b_np.astype("float32"))
-        tvm.testing.assert_allclose(c.numpy(), np_ref, 1e-4, 1e-4)
-
-    for target in ["cuda"]:
-        check_target(target)
-
-
-@tvm.testing.requires_gpu
-def test_shared_more_dtype():
-    """Test vectorized store into shared memory"""
-    n = 512
-    A = te.placeholder((n,), name="A", dtype="int8")
-    B = te.placeholder((n,), name="B", dtype="int16")
-
-    def test_device_ir(A, B, C):
-        n = A.shape[0]
-        ib = tvm.tir.ir_builder.create()
-
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(tx, "thread_extent", n)
-
-        A_sh = ib.allocate(A.dtype, (n,), scope="shared")  # i8
-        B_sh = ib.allocate(B.dtype, (n,), scope="shared")  # i16
-        C_sh = ib.allocate(C.dtype, (n,), scope="shared")  # i32
-
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-        Cptr = ib.buffer_ptr(C)
-
-        A_sh[tx] = Aptr[tx]
-        B_sh[tx] = Bptr[tx]
-
-        C_sh[tx] = cast(A_sh[tx], "int32") + cast(B_sh[tx], "int32")
-        Cptr[tx] = C_sh[tx]
-        return ib.get()
-
-    C = te.extern(
-        (n,),
-        [A, B],
-        lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
-        name="vadd",
-        dtype="int32",
-    )
-    s = te.create_schedule(C.op)
-
-    mod = run_passes(s, [A, B, C])
-    verify_single_allocation(mod["main"].body, n * 4)
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        fadd = tvm.build(s, [A, B, C], target)
-        dev = tvm.device(target, 0)
-
-        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros((n,), dtype=C.dtype), dev)
-        fadd(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy().astype("float32") + b.numpy(), 1e-4, 1e-4)
-
-    for target in ["cuda"]:
-        check_target(target)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/tir-transform/test_tir_transform_narrow_datatype.py b/tests/python/tir-transform/test_tir_transform_narrow_datatype.py
index 5ebdbe986082..93c680c846c5 100644
--- a/tests/python/tir-transform/test_tir_transform_narrow_datatype.py
+++ b/tests/python/tir-transform/test_tir_transform_narrow_datatype.py
@@ -16,7 +16,6 @@
 # under the License.
 import tvm
 from tvm import te
-from tvm.driver.build_module import schedule_to_module
 from tvm.script import tir as T
 from tvm.tir import const
 import tvm.testing
@@ -163,27 +162,6 @@ def check(m, lanes, target_bits, target_dtype):
     check(const(2**16, dtype="int32"), 2, target_bits=16, target_dtype="int32")
 
 
-def test_reduce():
-    def check(m, target_bits, target_dtype):
-        A = te.placeholder((m,), name="A", dtype="float32")
-        k = te.reduce_axis((0, m), "k")
-        B = te.compute((), lambda *idx: te.sum(A[k], axis=k), name="B")
-        s = te.create_schedule(B.op)
-        stmt = lower_sch(s, [A, B], target_bits)
-        assert stmt[1].loop_var.dtype == target_dtype
-
-    # i32 -> i32
-    check(const(64, dtype="int32"), 32, "int32")
-    # i64 -> i32
-    check(const(64, dtype="int64"), 32, "int32")
-    # i32 -> i16
-    check(const(64, dtype="int32"), 16, "int16")
-    check(const(2**16, dtype="int32"), 16, "int32")
-    # symbolic
-    check(te.var("n", dtype="int32"), 32, "int32")
-    check(te.var("n", dtype="int64"), 32, "int64")
-
-
 def test_slice():
     def check(m, n, target_bits, target_dtype):
         # The index may overflow in B, while not in A
@@ -208,25 +186,6 @@ def check(m, n, target_bits, target_dtype):
     )
 
 
-def test_ramp_dtype_consistency():
-    """
-    for (i :int64, (int64)0, (int64)4) {
-        A[ramp(i*(int64)2, (int64)1, 2)] = cast(int64, 2 ** 31 - 1) * i;
-    }
-    The infer result:
-        base:   int64 -> int64 (since i is involved in another int64 expr)
-        stride: int64 -> int32
-
-    Thus ramp should still use int64 for both stride and base after rewrite.
-    """
-    n = tvm.tir.IntImm("int64", 4)
-    m = tvm.tir.IntImm("int64", 2)
-    A = te.compute((n, m), lambda i, j: tvm.tir.Cast("int64", 2**31 - 1) * i, name="A")
-    s = te.create_schedule(A.op)
-    s[A].vectorize(A.op.axis[1])
-    lower_sch(s, [A], 32, extra_passes=[tvm.tir.transform.VectorizeLoop()])
-
-
 def test_condition():
     @T.prim_func
     def before(A: T.Buffer((128,), "float32"), B: T.Buffer((130,), "float32")):
diff --git a/tests/python/tir-transform/test_tir_transform_plan_update_buffer_allocation_location.py b/tests/python/tir-transform/test_tir_transform_plan_update_buffer_allocation_location.py
index 1a1e780a7272..8500f114610c 100644
--- a/tests/python/tir-transform/test_tir_transform_plan_update_buffer_allocation_location.py
+++ b/tests/python/tir-transform/test_tir_transform_plan_update_buffer_allocation_location.py
@@ -236,17 +236,6 @@ def test_opaque_access():
     _check(opaque_access, transformed_opaque_access)
 
 
-def test_lower_te():
-    x = te.placeholder((1,))
-    y = te.compute((1,), lambda i: x[i] + 2)
-    s = te.create_schedule(y.op)
-    orig_mod = tvm.driver.build_module.schedule_to_module(s, [x, y])
-    mod = tvm.tir.transform.PlanAndUpdateBufferAllocationLocation()(orig_mod)
-    tvm.ir.assert_structural_equal(
-        mod, orig_mod
-    )  # PlanAndUpdateBufferAllocationLocation should do nothing on TE
-
-
 def test_loop_carried_dependency():
     """The buffer allocation should be above opaque iter var's loop scopes
     such that buffer accesses with loop carried dependencies are covered,
diff --git a/tests/python/tir-transform/test_tir_transform_simplify.py b/tests/python/tir-transform/test_tir_transform_simplify.py
index 0b2d5f16d833..bbd69d01cbb4 100644
--- a/tests/python/tir-transform/test_tir_transform_simplify.py
+++ b/tests/python/tir-transform/test_tir_transform_simplify.py
@@ -73,69 +73,6 @@ def test_if_likely():
     assert not isinstance(body.body.body.then_case, tvm.tir.IfThenElse)
 
 
-def test_basic_likely_elimination():
-    n = te.size_var("n")
-    X = te.placeholder(shape=(n,), name="x")
-    W = te.placeholder(shape=(n + 1,), dtype="int32", name="w")
-
-    def f(i):
-        start = W[i]
-        extent = W[i + 1] - W[i]
-        rv = te.reduce_axis((0, extent))
-        return te.sum(X[rv + start], axis=rv)
-
-    Y = te.compute(X.shape, f, name="y")
-    s = te.create_schedule([Y.op])
-    stmt = tvm.lower(s, [X, W, Y], simple_mode=True)
-    assert "if" not in str(stmt)
-
-
-def test_complex_likely_elimination():
-    def cumsum(X):
-        """
-        Y[i] = sum(X[:i])
-        """
-        (m,) = X.shape
-        s_state = te.placeholder((m + 1,), dtype="int32", name="state")
-        s_init = te.compute((1,), lambda _: tvm.tir.const(0, "int32"))
-        s_update = te.compute((m + 1,), lambda l: s_state[l - 1] + X[l - 1])
-        return tvm.te.scan(s_init, s_update, s_state, inputs=[X], name="cumsum")
-
-    def sparse_lengths_sum(data, indices, lengths):
-        oshape = list(data.shape)
-        oshape[0] = lengths.shape[0]
-        length_offsets = cumsum(lengths)
-
-        def sls(n, d):
-            gg = te.reduce_axis((0, lengths[n]))
-            indices_idx = length_offsets[n] + gg
-            data_idx = indices[indices_idx]
-            data_val = data[data_idx, d]
-            return te.sum(data_val, axis=gg)
-
-        return te.compute(oshape, sls)
-
-    m, n, d, i, l = (
-        te.size_var("m"),
-        te.size_var("n"),
-        te.size_var("d"),
-        te.size_var("i"),
-        te.size_var("l"),
-    )
-    data_ph = te.placeholder((m, d * 32), name="data")
-    indices_ph = te.placeholder((i,), name="indices", dtype="int32")
-    lengths_ph = te.placeholder((n,), name="lengths", dtype="int32")
-    Y = sparse_lengths_sum(data_ph, indices_ph, lengths_ph)
-    s = te.create_schedule([Y.op])
-    (n, d) = s[Y].op.axis
-    (do, di) = s[Y].split(d, factor=32)
-    (gg,) = s[Y].op.reduce_axis
-    s[Y].reorder(n, do, gg, di)
-    s[Y].vectorize(di)
-    stmt = tvm.lower(s, [data_ph, indices_ph, lengths_ph, Y], simple_mode=True)
-    assert "if" not in str(stmt)
-
-
 class BaseBeforeAfter(tvm.testing.CompareBeforeAfter):
     transitively_prove_inequalities = False
     convert_boolean_to_and_of_ors = False
@@ -668,7 +605,6 @@ def expected(self, test_case):
         priors = analyzer.canonical_simplify(priors)
 
         if provable:
-
             # well formed checker complains of undefined variables in condition
             @T.prim_func(check_well_formed=False)
             def func(A: T.Buffer(1, "bool")):
diff --git a/tests/python/tir-transform/test_tir_transform_split_host_device.py b/tests/python/tir-transform/test_tir_transform_split_host_device.py
index 2d0d8a68d83e..a7ea6d8cdd46 100644
--- a/tests/python/tir-transform/test_tir_transform_split_host_device.py
+++ b/tests/python/tir-transform/test_tir_transform_split_host_device.py
@@ -21,45 +21,6 @@
 from tvm.script import tir as T
 
 
-@tvm.testing.requires_cuda
-def test_split_host_device_func_attr():
-    m = te.size_var("m")
-    l = te.size_var("l")
-    A = te.placeholder((m, l), name="A")
-
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    xo, xi = s[A2].split(A2.op.axis[0], factor=8)
-    s[A2].bind(xo, te.thread_axis("blockIdx.x"))
-    s[A1].compute_at(s[A2], xo)
-    s[A1].set_scope("shared")
-
-    mod = tvm.lower(s, [A, A2])
-
-    cuda_target = tvm.target.Target("cuda", host="llvm")
-    mod = tvm.tir.transform.Apply(
-        lambda f: f.with_attr({"global_symbol": "test", "target": cuda_target})
-    )(mod)
-
-    mod = tvm.ir.transform.Sequential(
-        [
-            tvm.tir.transform.AnnotateDeviceRegions(),
-            tvm.tir.transform.SplitHostDevice(),
-            tvm.tir.transform.MakePackedAPI(),
-            tvm.tir.transform.LowerDeviceKernelLaunch(),
-        ]
-    )(mod)
-
-    fdevice = mod["test_kernel"]
-
-    assert fdevice.attrs["global_symbol"] == "test_kernel"
-    assert fdevice.attrs["calling_conv"].value == 2
-    assert str(fdevice.attrs["target"]) == str(tvm.target.Target("cuda"))
-    assert fdevice.attrs["tir.is_global_func"].value
-
-
 def test_ssa_across_entire_module():
     """The host and device functions should not share TIR vars
 
diff --git a/tests/python/tir-transform/test_tir_transform_storage_flatten.py b/tests/python/tir-transform/test_tir_transform_storage_flatten.py
index 4a81ab93c763..2c97cc53af67 100644
--- a/tests/python/tir-transform/test_tir_transform_storage_flatten.py
+++ b/tests/python/tir-transform/test_tir_transform_storage_flatten.py
@@ -17,72 +17,9 @@
 import tvm
 import tvm.testing
 from tvm import te
-from tvm.driver.build_module import schedule_to_module
 from tvm.script import tir as T
 
 
-def test_flatten2():
-    m = te.size_var("m")
-    l = te.size_var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    xo, xi = s[A2].split(A2.op.axis[0], 8)
-    s[A1].compute_at(s[A2], xo)
-    Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="A")
-    A2b = tvm.tir.decl_buffer(A2.shape, A2.dtype, name="A2")
-
-    mod = schedule_to_module(s, [Ab, A2b], binds={A: Ab, A2: A2b})
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-
-def test_flatten_prefetch():
-    A = te.placeholder((25, 100, 4), name="A")
-    _A = tvm.tir.decl_buffer(A.shape, A.dtype, name="A")
-    i = te.size_var("i")
-    j = te.size_var("j")
-    region = [tvm.ir.Range.from_min_extent(i[0], i[1]) for i in [(i, 2), (j, 8), (0, 4)]]
-    stmt = tvm.tir.Prefetch(_A, region)
-
-    func = tvm.te.schedule.SchedulePostProcToPrimFunc([_A], stmt, {A: _A})
-
-    mod = tvm.IRModule.from_expr(func)
-    mod = tvm.transform.Sequential(
-        [tvm.tir.transform.StorageFlatten(64), tvm.tir.transform.Simplify()]
-    )(mod)
-    stmt = mod["main"].body
-    assert stmt.extent.value == 2
-    assert isinstance(stmt.body, tvm.tir.For)
-    assert stmt.body.extent.value == 2
-
-    def assert_flat_loads(stmt):
-        if isinstance(stmt, tvm.tir.BufferLoad):
-            assert len(stmt.indices) == 1, "All prefetch indices should be flattened"
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, assert_flat_loads)
-
-
-def test_flatten_storage_align():
-    m = 8
-    l = 16
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    s[A1].storage_align(A1.op.axis[0], 2, 1)
-
-    mod = schedule_to_module(s, [A, A2])
-    mod = tvm.transform.Sequential(
-        [tvm.tir.transform.StorageFlatten(64), tvm.tir.transform.Simplify()]
-    )(mod)
-
-    stmt = mod["main"].body
-    assert stmt.extents[0].value == 17 * 8
-
-
 def test_flatten_double_buffer():
     @tvm.script.ir_module
     class ModFromScript:
diff --git a/tests/python/tir-transform/test_tir_transform_storage_rewrite.py b/tests/python/tir-transform/test_tir_transform_storage_rewrite.py
index 68149e7d64bb..ab91c6c7b330 100644
--- a/tests/python/tir-transform/test_tir_transform_storage_rewrite.py
+++ b/tests/python/tir-transform/test_tir_transform_storage_rewrite.py
@@ -21,39 +21,9 @@
 import tvm
 import tvm.testing
 from tvm import te
-from tvm.driver.build_module import schedule_to_module
 from tvm.script import tir as T
 
 
-def test_storage_share():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    num_stage = 5
-    B = A
-    for t in range(num_stage):
-        B = te.compute((m, l), lambda i, j: B[i, j] + (t + 1), name="A%d" % t)
-
-    s = te.create_schedule(B.op)
-    mod = schedule_to_module(s, [A, B])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-    mod = tvm.tir.transform.Simplify()(mod)
-    mod = tvm.tir.transform.StorageRewrite()(mod)
-    stmt = mod["main"].body
-
-    # verify only have one allocations.
-    # verify inplace folding works
-    num_alloc = [0]
-
-    def verify(n):
-        if isinstance(n, tvm.tir.Allocate):
-            num_alloc[0] += 1
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-    assert num_alloc[0] == 1
-
-
 def register_mem(scope_tb, max_bits):
     # Register mem
     @tvm.register_func("tvm.info.mem.%s" % scope_tb)
@@ -163,103 +133,6 @@ def verify(n):
     dtype_test(dtype_list, length)
 
 
-def test_inplace_rule():
-    m = 10
-    A = te.placeholder((m,), name="A")
-    A0 = te.compute((m,), lambda i: A[i], name="A0")
-    A1 = te.compute((m,), lambda i: A[i] + 1, name="A1")
-    AA = te.compute((m,), lambda i: A0[i] + A1[i] + A1[0], name="AA")
-    B = te.compute((m,), lambda i: AA[i] + 1, name="B")
-    s = te.create_schedule(B.op)
-    mod = schedule_to_module(s, [A, B])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-    mod = tvm.tir.transform.Simplify()(mod)
-    mod = tvm.tir.transform.StorageRewrite()(mod)
-    stmt = mod["main"].body
-
-    # verify only have one allocations.
-    # verify inplace folding works
-    num_alloc = [0]
-
-    def verify(n):
-        if isinstance(n, tvm.tir.Allocate):
-            num_alloc[0] += 1
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-    assert num_alloc[0] == 2
-
-
-def test_storage_combine():
-    n = 8
-    A = te.placeholder((4,), name="A")
-    num_stage = 5
-    B = A
-    stages = []
-    for t in range(num_stage):
-        B = te.compute((n,), lambda i: B[i] + B[0] + (t + 1), name="A%d" % t)
-        stages.append(B)
-
-    s = te.create_schedule(B.op)
-    for S in stages[:-1]:
-        s[S].set_scope("global:tag")
-
-    mod = schedule_to_module(s, [A, B])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-    mod = tvm.tir.transform.Simplify()(mod)
-    mod = tvm.tir.transform.StorageRewrite()(mod)
-    stmt = mod["main"].body
-
-    num_alloc = [0]
-
-    def verify(n):
-        if isinstance(n, tvm.tir.Allocate):
-            num_alloc[0] += 1
-            assert n.extents[0].value == 16
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-    assert num_alloc[0] == 1
-
-
-def test_storage_combine_with_vectorization():
-    n = 1024
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute((n,), lambda i: A[i] + B[i], name="C")
-    s = te.create_schedule(C.op)
-    AA = s.cache_read(A, "global:tag", readers=[C])
-    BB = s.cache_read(B, "global:tag", readers=[C])
-    CC = s.cache_write(C, "global:tag")
-    s[CC].vectorize(s[CC].op.axis[0])
-    mod = schedule_to_module(s, [A, B, C])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-    mod = tvm.tir.transform.VectorizeLoop()(mod)
-    mod = tvm.tir.transform.StorageRewrite()(mod)
-    mod = tvm.tir.transform.Simplify()(mod)
-    stmt = mod["main"].body
-    num_alloc = [0]
-
-    def verify(v):
-        # find add op
-        if (
-            isinstance(v, tvm.tir.Add)
-            and isinstance(v.a, tvm.tir.BufferLoad)
-            and isinstance(v.b, tvm.tir.BufferLoad)
-        ):
-            lhs_ramp = v.a.indices[0]
-            rhs_ramp = v.b.indices[0]
-            # these two ramp load should not overlap
-            assert lhs_ramp.lanes == n
-            assert rhs_ramp.lanes == n
-            assert lhs_ramp.base >= rhs_ramp.base + n or rhs_ramp.base >= lhs_ramp.base + n
-        elif isinstance(v, tvm.tir.Allocate):
-            num_alloc[0] += 1
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-    assert num_alloc[0] == 1
-
-
 def test_address_of():
     # In this test, the storage rewrite pass is allowed to
     # combine buffers B and D, but not C
@@ -313,40 +186,6 @@ def verify(n):
     assert total_alloc[0] == 16
 
 
-def test_storage_share_gpu():
-    m = te.var("m")
-    A = [te.placeholder((m), name="A")]
-    num_stage = 5
-    for t in range(num_stage):
-        A.append(te.compute((m,), lambda i: A[-1][i] + (t + 1), name="A%d_s" % t))
-        A.append(te.compute((m,), lambda i: A[-1][i], name="A%d" % t))
-    s = te.create_schedule(A[-1].op)
-    for t in range(num_stage):
-        x = A[2 * t + 2].op.axis[0]
-        bx, tx = s[A[2 * t + 2]].split(x, factor=32)
-        s[A[2 * t + 2]].bind(bx, te.thread_axis("blockIdx.x"))
-        s[A[2 * t + 2]].bind(tx, te.thread_axis("threadIdx.x"))
-        s[A[2 * t + 1]].compute_at(s[A[2 * t + 2]], tx)
-        s[A[2 * t + 1]].set_scope("shared")
-
-    mod = schedule_to_module(s, [A[0], A[-1]])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-    mod = tvm.tir.transform.Simplify()(mod)
-    mod = tvm.tir.transform.StorageRewrite()(mod)
-    stmt = mod["main"].body
-
-    alloc_stats = {"global": 0, "shared": 0}
-
-    def verify(n):
-        if isinstance(n, tvm.tir.Allocate):
-            scope = n.buffer_var.type_annotation.storage_scope
-            alloc_stats[scope] += 1
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-    assert alloc_stats["global"] == 2
-    assert alloc_stats["shared"] == num_stage
-
-
 def test_parallel_alloc():
     ib = tvm.tir.ir_builder.create()
     n = te.var("n")
@@ -443,125 +282,6 @@ def get_mod(kind="serial"):
     assert isinstance(body.body.body, tvm.tir.Allocate)  # A
 
 
-def test_inplace_rule2(scope_tb="local_TB2", max_bits=1024 * 1024 * 1024):
-    # Test Buffer
-    register_mem(scope_tb, max_bits)
-    m = 10
-    A = te.placeholder((m,), name="A")
-    C = te.placeholder((m,), name="C")
-    D = te.placeholder((m,), name="D")
-    A0 = te.compute((m,), lambda i: A[i] + C[i], name="A0")
-    A1 = te.compute((m,), lambda i: D[i] * D[i], name="A1")
-    A2 = te.compute((m,), lambda i: A0[i] + A1[i], name="A2")
-    B = te.compute((m,), lambda i: A2[i], name="B")
-    s = te.create_schedule(B.op)
-    A0L = s.cache_read(A0, scope_tb, [A2])
-    A1L = s.cache_read(A1, scope_tb, [A2])
-    A2L = s.cache_read(A2, scope_tb, [B])
-    mod = schedule_to_module(s, [A, B, C, D])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-    mod = tvm.tir.transform.Simplify()(mod)
-    mod = tvm.tir.transform.StorageRewrite()(mod)
-    stmt = mod["main"].body
-
-    # verify only have one allocations.
-    # verify inplace folding works
-    num_alloc = [0]
-
-    def verify(n):
-        if isinstance(n, tvm.tir.Allocate):
-            num_alloc[0] += 1
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-    assert num_alloc[0] == 2
-
-
-def test_exceed_mem():
-    max_bits = 639
-    # The critical max_num_bits is between 639 and 640
-    loc = -1
-    try:
-        test_inplace_rule2("local_TEM", max_bits)
-    except Exception as e:
-        estr = str(e)
-        loc = estr.find("Allocation exceed bound of memory")
-        assert loc != -1
-
-
-def test_inplace_rule3():
-    # Test Buffer
-    scope_tb = "local_TB3"
-    max_bits = 1024 * 1024 * 1024
-
-    register_mem(scope_tb, max_bits)
-    m = 10
-    B0 = te.placeholder((m,), name="B0")
-    B1 = te.placeholder((m,), name="B1")
-    B2 = te.placeholder((m,), name="B2")
-    B3 = te.placeholder((m,), name="B3")
-    B4 = te.placeholder((m,), name="B4")
-    B5 = te.placeholder((m,), name="B5")
-
-    B6 = te.compute((m,), lambda i: B1[i] * B5[i], name="B6")
-    B7 = te.compute((m,), lambda i: B2[i] * B4[i], name="B7")
-    B8 = te.compute((m,), lambda i: B6[i] - B7[i], name="B8")
-
-    B9 = te.compute((m,), lambda i: B2[i] * B3[i], name="B9")
-    B10 = te.compute((m,), lambda i: B0[i] * B5[i], name="B10")
-    B11 = te.compute((m,), lambda i: B9[i] - B10[i], name="B11")
-
-    B12 = te.compute((m,), lambda i: B0[i] * B4[i], name="B12")
-    B13 = te.compute((m,), lambda i: B1[i] * B3[i], name="B13")
-    B14 = te.compute((m,), lambda i: B12[i] - B13[i], name="B14")
-
-    B = te.compute((m,), lambda i: B8[i] * B11[i] + B14[i], name="B")
-    s = te.create_schedule(B.op)
-
-    B1L = s.cache_read(B1, scope_tb, [B6, B13])
-    B5L = s.cache_read(B5, scope_tb, [B6, B10])
-    B2L = s.cache_read(B2, scope_tb, [B7, B9])
-    B4L = s.cache_read(B4, scope_tb, [B7, B12])
-    B3L = s.cache_read(B3, scope_tb, [B9, B13])
-    B0L = s.cache_read(B0, scope_tb, [B10, B12])
-
-    B8L = s.cache_write(B8, scope_tb)
-    B11L = s.cache_write(B11, scope_tb)
-    B14L = s.cache_write(B14, scope_tb)
-    B6L = s.cache_write(B6, scope_tb)
-    B7L = s.cache_write(B7, scope_tb)
-    B9L = s.cache_write(B9, scope_tb)
-    B10L = s.cache_write(B10, scope_tb)
-    B12L = s.cache_write(B12, scope_tb)
-    B13L = s.cache_write(B13, scope_tb)
-
-    s[B12].compute_inline()
-    s[B13].compute_inline()
-    s[B8].compute_inline()
-    s[B11].compute_inline()
-    s[B14].compute_inline()
-    s[B6].compute_inline()
-    s[B7].compute_inline()
-    s[B9].compute_inline()
-    s[B10].compute_inline()
-
-    s = s.normalize()
-    mod = schedule_to_module(s, [B0, B1, B2, B3, B4, B5, B])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-    mod = tvm.tir.transform.Simplify()(mod)
-    mod = tvm.tir.transform.StorageRewrite()(mod)
-    stmt = mod["main"].body
-
-    # verify only have one allocations.
-    # verify inplace folding works
-    def verify(n):
-        if isinstance(n, tvm.tir.Allocate):
-            assert n.extents[0].value == 70
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-
-
 def test_alloc_seq_type():
     ib = tvm.tir.ir_builder.create()
     n = te.var("n")
@@ -665,46 +385,6 @@ def verify(n):
     assert num_alloc[0] == 1
 
 
-def test_replace_dataflow():
-    shape = (255,)
-    A = te.placeholder(shape, name="A")
-    B = te.compute(shape, lambda i: A[i] + A[i], name="B")
-    C = te.compute(shape, lambda i: A[i] + B[i], name="C")
-    D = te.compute(shape, lambda i: A[i] + C[i], name="D")
-    E = te.compute(shape, lambda i: A[i] + D[i], name="E")
-
-    s = te.create_schedule(E.op)
-    s.cache_read(A, "local", [B, C, D, E])
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-
-
-def test_large_input():
-    @te.hybrid.script
-    def compute(a, b):
-        n = 16384
-        c = output_tensor((n, n), "int32")
-        for i in range(n):
-            for j in range(n):
-                c[i, j] = a[i, j] - b[i, j]
-        return c
-
-    n = 16384
-    shape = (n, n)
-    a = te.placeholder(shape, name="a", dtype="int32")
-    b = te.placeholder(shape, name="b", dtype="int32")
-    c = te.compute(shape, lambda i, j: compute(a, b)[i, j])
-    c = te.compute(shape, lambda i, j: 1 + c[i, j])
-    s = te.create_schedule(c.op)
-    stmt = tvm.lower(s, [a, b, c])["main"].body
-
-    def verify(n):
-        if isinstance(n, tvm.tir.Allocate):
-            assert n.extents[0].value == 268435456
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-
-
 def test_access_in_let_value():
     @T.prim_func
     def func(A: T.Buffer((8,), "float32")):
diff --git a/tests/python/tir-transform/test_tir_transform_thread_sync.py b/tests/python/tir-transform/test_tir_transform_thread_sync.py
index 5c43d8d96aa1..4ca33424c1d5 100644
--- a/tests/python/tir-transform/test_tir_transform_thread_sync.py
+++ b/tests/python/tir-transform/test_tir_transform_thread_sync.py
@@ -35,67 +35,6 @@ def run_passes(func: tvm.tir.PrimFunc):
     return tvm.tir.transform.ThreadSync("shared")(mod)
 
 
-@tvm.testing.requires_cuda
-def test_thread_storage_sync():
-    m = te.size_var("m")
-    l = te.size_var("l")
-    A = te.placeholder((m, l), name="A")
-
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    xo, xi = s[A2].split(A2.op.axis[0], factor=8)
-    s[A2].bind(xo, te.thread_axis("blockIdx.x"))
-    s[A1].compute_at(s[A2], xo)
-    s[A1].set_scope("shared")
-
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, A2], stmt, None)
-    mod = run_passes(func)
-    f = mod["test_kernel"]
-    body_list = tvm.tir.stmt_list(f.body.body.body.body.body.body)
-    assert body_list[1].value.op.same_as(tvm.ir.Op.get("tir.tvm_storage_sync"))
-
-
-@tvm.testing.requires_cuda
-def test_sync_else_branch():
-    def ir(A, B):
-        ib = tvm.tir.ir_builder.create()
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(tx, "thread_extent", 1)
-
-        local = ib.allocate(A.dtype, (8,), name="buf_local", scope="local")
-        shared = ib.allocate(A.dtype, (8,), name="buf_shared", scope="shared")
-
-        with ib.for_range(0, 8) as i:
-            with ib.if_scope(Aptr[i] < 0):
-                local[i] = Aptr[i]
-            with ib.else_scope():
-                shared[i] = Aptr[i]
-
-        with ib.for_range(0, 8) as i:
-            with ib.if_scope(Aptr[i] < 0):
-                Bptr[i] = local[i]
-            with ib.else_scope():
-                Bptr[i] = shared[i]
-
-        return ib.get()
-
-    A = tvm.tir.decl_buffer((8,), "float32")
-    B = tvm.tir.decl_buffer((8,), "float32")
-    stmt = ir(A, B)
-    func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, B], stmt, None)
-    mod = run_passes(func)
-    assert "T.tvm_storage_sync" in str(mod)
-
-
 @tvm.testing.requires_cuda
 def test_sync_read_thread_id_independent_location():
     @T.prim_func
diff --git a/tests/python/tir-transform/test_tir_transform_unify_thread_binding.py b/tests/python/tir-transform/test_tir_transform_unify_thread_binding.py
index 9ee86433128d..a419dc3f9976 100644
--- a/tests/python/tir-transform/test_tir_transform_unify_thread_binding.py
+++ b/tests/python/tir-transform/test_tir_transform_unify_thread_binding.py
@@ -313,16 +313,5 @@ def unified_inner_binding_with_annotation(
     _check(inner_binding_with_annotation, unified_inner_binding_with_annotation)
 
 
-def test_lower_te():
-    a = te.placeholder((32, 2, 2))
-    b = te.compute((32, 2, 2), lambda i, j, k: a[i, j, k] * 2.0)
-    s = te.create_schedule(b.op)
-    s[b].bind(b.op.axis[1], te.thread_axis("threadIdx.x"))
-    s[b].bind(b.op.axis[2], te.thread_axis("threadIdx.x"))
-    orig_mod = tvm.driver.build_module.schedule_to_module(s, [a, b])
-    mod = tvm.tir.transform.UnifyThreadBinding()(orig_mod)
-    tvm.ir.assert_structural_equal(mod, orig_mod)  # UnifyThreadBinding should do nothing on TE
-
-
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/tir-transform/test_tir_transform_unroll_loop.py b/tests/python/tir-transform/test_tir_transform_unroll_loop.py
index a05a085eeb64..37dc64a9e79c 100644
--- a/tests/python/tir-transform/test_tir_transform_unroll_loop.py
+++ b/tests/python/tir-transform/test_tir_transform_unroll_loop.py
@@ -94,23 +94,6 @@ def test_unroll_fake_loop():
         assert isinstance(ret[0], tvm.tir.BufferStore)
 
 
-def test_unroll_single_count_loops():
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.compute((n,), lambda *i: A(*i), name="B")
-    s = te.create_schedule(B.op)
-    s = s.normalize()
-    dom_map = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
-    # all parameters to UnrolLoops are default values except for
-    # auto_unroll_max_extent which has been set to 1 (default:0)
-    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt))
-
-    with tvm.transform.PassContext(config={"tir.UnrollLoop": {"auto_max_step": 1}}):
-        ret = tvm.tir.transform.UnrollLoop()(mod)["main"].body
-        assert ret == stmt
-
-
 def test_unroll_allocations():
     @tvm.script.ir_module
     class before:
@@ -179,5 +162,4 @@ def main(B: T.Buffer((64,), "float32")):
     test_unroll_local_access()
     test_unroll_loop()
     test_unroll_fake_loop()
-    test_unroll_single_count_loops()
     test_unroll_allocations()
diff --git a/tests/python/tir-transform/test_tir_transform_vectorize.py b/tests/python/tir-transform/test_tir_transform_vectorize.py
index 9659d896aed8..c5569c829ad5 100644
--- a/tests/python/tir-transform/test_tir_transform_vectorize.py
+++ b/tests/python/tir-transform/test_tir_transform_vectorize.py
@@ -197,16 +197,6 @@ def main(a: T.handle, n: T.int32, x: T.int32):
         tvm.ir.assert_structural_equal(mod, After)
 
 
-def test_vectorize_with_if_cond_int64():
-    m = te.size_var("m", dtype="int64")
-    A = te.placeholder((m,), name="A", dtype="float32")
-    B = te.compute((m,), lambda i: te.if_then_else(i < 2, A[i], A[i] * 2), name="B")
-    s = te.create_schedule(B.op)
-    x, y = s[B].split(B.op.axis[0], factor=4)
-    s[B].vectorize(y)
-    f = tvm.build(s, [A, B], "llvm")
-
-
 @pytest.mark.parametrize("extent, target", [(4, simple_target), (T.vscale() * 4, sve_target)])
 def test_vectorize_let(extent, target):
     @I.ir_module
@@ -371,10 +361,9 @@ def test_ir(A, B, C):
         name="while_vectorize",
         dtype=dtype,
     )
-    s = te.create_schedule(C.op)
 
     try:
-        tvm.lower(s, [A, B, C], "llvm")
+        tvm.build(te.create_prim_func([A, B, C]), target="llvm")
         assert False
     except tvm.error.TVMError as e:
         error_msg = str(e).split("\n")[-1]
@@ -382,14 +371,6 @@ def test_ir(A, B, C):
         assert expected in error_msg
 
 
-def test_vectorize_dtype_mismatch():
-    n = tvm.tir.IntImm("int64", 4)
-    A = te.compute((n,), lambda i: tvm.tir.IntImm("int64", 2**31 - 1) + i, name="A")
-    s = te.create_schedule(A.op)
-    s[A].vectorize(A.op.axis[0])
-    tvm.lower(s, [A], "llvm", simple_mode=True)
-
-
 @pytest.mark.parametrize(
     "extent, vec_str, target",
     [(16, "float32x16", simple_target), (T.vscale() * 8, "float32xvscalex8", sve_target)],
@@ -815,7 +796,7 @@ def main(A: T.Buffer((25,), "float32"), B: T.Buffer((25,), "float32")):
     with tvm.target.Target(target):
         mod = tvm.tir.transform.VectorizeLoop()(Before)
         tvm.ir.assert_structural_equal(mod, After)
-        mod = tvm.build(mod, target)
+        mod = tvm.build(mod, target=target)
 
 
 @pytest.mark.parametrize(
@@ -843,7 +824,7 @@ def main(A: T.Buffer((25,), "int32"), B: T.Buffer((25,), "float32")):
     with pytest.raises(Exception) as e_info:
         with tvm.target.Target(target):
             mod = tvm.tir.transform.VectorizeLoop()(Before)
-            ex = tvm.build(mod, target)
+            ex = tvm.build(mod, target=target)
     tvm.ir.assert_structural_equal(mod, After)
     assert "Intrinsic does not support vectors" in e_info.value.args[0]
 
diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py
index 0bd97e4ee048..10d63129121f 100755
--- a/tests/scripts/ci.py
+++ b/tests/scripts/ci.py
@@ -683,7 +683,6 @@ def add_subparser(
                 "run full Python tests",
                 [
                     "./tests/scripts/task_python_unittest.sh",
-                    "./tests/scripts/task_python_arm_compute_library.sh",
                 ],
             ),
         },
diff --git a/tests/scripts/task_java_unittest.sh b/tests/scripts/task_java_unittest.sh
index a35b023ad0df..2eabac31cc28 100755
--- a/tests/scripts/task_java_unittest.sh
+++ b/tests/scripts/task_java_unittest.sh
@@ -35,8 +35,8 @@ cleanup()
 }
 trap cleanup 0
 
-python3 "$SCRIPT_DIR"/test_add_cpu.py "$TEMP_DIR"
-python3 "$SCRIPT_DIR"/test_add_gpu.py "$TEMP_DIR"
+# python3 "$SCRIPT_DIR"/test_add_cpu.py "$TEMP_DIR"
+# python3 "$SCRIPT_DIR"/test_add_gpu.py "$TEMP_DIR"
 
 # Skip the Java RPC Unittests, see https://github.com/apache/tvm/issues/13168
 # # start rpc proxy server
diff --git a/tests/scripts/task_web_wasm.sh b/tests/scripts/task_web_wasm.sh
index 91bbbac52300..8a08c1ecb58d 100755
--- a/tests/scripts/task_web_wasm.sh
+++ b/tests/scripts/task_web_wasm.sh
@@ -25,9 +25,8 @@ cd web
 make clean
 npm install
 npm run lint
-# TODO(@tqchen, @siyuan): re-enable the following tests
-# npm run prepwasm
-# npm run bundle
-# npm run test
-# npm run typedoc
+npm run prepwasm
+npm run bundle
+npm run test
+npm run typedoc
 cd ..
diff --git a/web/tests/python/webgpu_rpc_test.py b/web/tests/python/webgpu_rpc_test.py
index 6a87c1bbe556..e831afd9d3f8 100644
--- a/web/tests/python/webgpu_rpc_test.py
+++ b/web/tests/python/webgpu_rpc_test.py
@@ -24,7 +24,6 @@
 from tvm import te
 from tvm import rpc
 from tvm.contrib import utils, tvmjs
-from tvm.relay.backend import Runtime
 import numpy as np
 
 proxy_host = "127.0.0.1"
@@ -48,7 +47,7 @@ def test_rpc():
     sch.bind(i0, "blockIdx.x")
     sch.bind(i1, "threadIdx.x")
 
-    fadd = tvm.build(sch.mod, target=target, runtime=runtime)
+    fadd = tvm.build(sch.mod.with_attr("system_lib_prefix", ""), target=target)
     temp = utils.tempdir()
 
     wasm_path = temp.relpath("addone_gpu.wasm")
diff --git a/web/tests/python/websock_rpc_test.py b/web/tests/python/websock_rpc_test.py
deleted file mode 100644
index f7011cef4723..000000000000
--- a/web/tests/python/websock_rpc_test.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Simple testcode to test Javascript RPC
-
-To use it, start a rpc proxy with "python -m tvm.exec.rpc_proxy".
-Connect javascript end to the websocket port and connect to the RPC.
-"""
-
-import tvm
-from tvm import te
-from tvm import rpc
-from tvm.contrib import utils, tvmjs
-from tvm.relay.backend import Runtime
-import numpy as np
-
-proxy_host = "127.0.0.1"
-proxy_port = 9090
-
-
-def test_rpc():
-    if not tvm.runtime.enabled("rpc"):
-        return
-    # generate the wasm library
-    runtime = Runtime("cpp", {"system-lib": True})
-    target = "llvm -mtriple=wasm32-unknown-unknown-wasm"
-    if not tvm.runtime.enabled(target):
-        raise RuntimeError("Target %s is not enbaled" % target)
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-    s = te.create_schedule(B.op)
-
-    fadd = tvm.build(s, [A, B], target, runtime=runtime, name="addone")
-    temp = utils.tempdir()
-
-    wasm_path = temp.relpath("addone.wasm")
-    fadd.export_library(wasm_path, fcompile=tvmjs.create_tvmjs_wasm)
-
-    wasm_binary = open(wasm_path, "rb").read()
-
-    remote = rpc.connect(
-        proxy_host,
-        proxy_port,
-        key="wasm",
-        session_constructor_args=["rpc.WasmSession", wasm_binary],
-    )
-
-    def check(remote):
-        # basic function checks.
-        faddone = remote.get_function("testing.asyncAddOne")
-        fecho = remote.get_function("testing.echo")
-        assert faddone(100) == 101
-        assert fecho(1, 2, 3) == 1
-        assert fecho(1, 2, 3) == 1
-        assert fecho(100, 2, 3) == 100
-        assert fecho("xyz") == "xyz"
-        assert bytes(fecho(bytearray(b"123"))) == b"123"
-        # run the generated library.
-        f1 = remote.system_lib()
-        dev = remote.cpu(0)
-        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev)
-        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
-        # invoke the function
-        addone = f1.get_function("addone")
-        addone(a, b)
-
-        # time evaluator
-        time_f = f1.time_evaluator("addone", dev, number=100, repeat=10)
-        time_f(a, b)
-        cost = time_f(a, b).mean
-        print("%g secs/op" % cost)
-        np.testing.assert_equal(b.numpy(), a.numpy() + 1)
-
-    check(remote)
-
-
-test_rpc()