[MetaSchedule][Test] Migrate check_trace to check_sketch (#12764)
* Migrate AutoBind

* Migrate RandomComputeLocation

* Migrate CrossThreadReduction

* Migrate ParallelVectorizeUnroll
junrushao authored Sep 13, 2022
1 parent a23b71c commit ef784d6
Showing 5 changed files with 718 additions and 353 deletions.
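
In outline, the migration swaps string comparison of stringified trace instructions (check_trace) against structural comparison of the generated modules plus their sampling decisions (check_sketches). Below is a condensed sketch of the new harness; the target string and AutoBind parameters are copied from the tests in this commit, while element_wise and elementwise_0 refer to the workload and expected module defined in the test diff further down:

    # Condensed sketch of the migrated pattern; element_wise / elementwise_0
    # are the workload and expected TVMScript module from the test diff below.
    from tvm import meta_schedule as ms
    from tvm.meta_schedule.testing.space_generation import check_sketches
    from tvm.target import Target

    actual = ms.TuneContext(
        mod=element_wise,
        target=Target("nvidia/geforce-rtx-3080", host="llvm"),
        space_generator=ms.space_generator.PostOrderApply(),
        sch_rules=[
            ms.schedule_rule.AutoBind(
                max_threadblocks=256,
                thread_extents=[32, 64, 128, 256, 512, 1024],
            )
        ],
        task_name="test",
    ).generate_design_space()
    check_sketches(
        element_wise,
        sketches=actual,
        expected_mods=[elementwise_0],
        # Decision 5 selects candidates[5] == 1024 as the threadIdx.x extent.
        expected_decisions=[[("SampleCategorical", 5)]],
    )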
48 changes: 1 addition & 47 deletions python/tvm/meta_schedule/testing/schedule_rule.py
@@ -18,28 +18,15 @@
from typing import List, Union

from tvm.meta_schedule.schedule_rule import (
AutoBind,
AutoInline,
CrossThreadReduction,
MultiLevelTiling,
ParallelizeVectorizeUnroll,
RandomComputeLocation,
MultiLevelTilingTensorCore,
ReuseType,
ScheduleRule,
)
from tvm.meta_schedule.schedule_rule.multi_level_tiling import (
MultiLevelTilingTensorCore,
)
from tvm.target import Target


def auto_bind(target: Target) -> ScheduleRule:
"""Default schedule rules for auto bind"""
if target.kind.name == "cuda":
return AutoBind(max_threadblocks=256, thread_extents=[32, 64, 128, 256, 512, 1024])
raise NotImplementedError(f"{target.kind.name} is not supported")


def auto_inline(target: Target) -> ScheduleRule:
"""Default schedule rules for auto inline"""
if target.kind.name == "llvm":
@@ -65,13 +52,6 @@ def auto_inline(target: Target) -> ScheduleRule:
raise NotImplementedError(f"{target.kind.name} is not supported")


def cross_thread_reduction(target: Target) -> ScheduleRule:
"""Default schedule rules for with cross-thread reduction"""
if target.kind.name == "cuda":
return CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512])
raise NotImplementedError(f"{target.kind.name} is not supported")


def multi_level_tiling(target: Target) -> ScheduleRule:
"""Default schedule rules for with multi-level tiling and reuse"""
if target.kind.name == "llvm":
@@ -154,29 +134,3 @@ def multi_level_tiling_tensor_core(
use_software_pipeline=use_software_pipeline,
)
raise NotImplementedError(f"{target.kind.name} is not supported")


def random_compute_location(target: Target) -> ScheduleRule:
"""Default schedule rules for with random-compute-location"""
if target.kind.name == "llvm":
return RandomComputeLocation()
raise NotImplementedError(f"{target.kind.name} is not supported")


def parallel_vectorize_unroll(target: Target) -> ScheduleRule:
"""Default schedule rules for with parallel-vectorize-unroll"""
if target.kind.name == "llvm":
return ParallelizeVectorizeUnroll(
max_jobs_per_core=16,
max_vectorize_extent=32,
unroll_max_steps=[0, 16, 64, 512],
unroll_explicit=True,
)
if target.kind.name == "cuda":
return ParallelizeVectorizeUnroll(
max_jobs_per_core=-1,
max_vectorize_extent=-1,
unroll_max_steps=[0, 16, 64, 512, 1024],
unroll_explicit=True,
)
raise NotImplementedError(f"{target.kind.name} is not supported")
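
The four deleted helpers were thin wrappers over rule constructors, so callers now instantiate the rules inline. A minimal sketch of the inline equivalents, reusing the exact arguments the helpers hard-coded above (target conditions noted in comments):

    from tvm import meta_schedule as ms

    # Inline equivalents of the deleted helpers; arguments copied verbatim.
    auto_bind = ms.schedule_rule.AutoBind(  # cuda targets
        max_threadblocks=256,
        thread_extents=[32, 64, 128, 256, 512, 1024],
    )
    cross_thread_reduction = ms.schedule_rule.CrossThreadReduction(  # cuda targets
        thread_extents=[4, 8, 16, 32, 64, 128, 256, 512],
    )
    random_compute_location = ms.schedule_rule.RandomComputeLocation()  # llvm targets
    parallel_vectorize_unroll = ms.schedule_rule.ParallelizeVectorizeUnroll(  # llvm variant
        max_jobs_per_core=16,
        max_vectorize_extent=32,
        unroll_max_steps=[0, 16, 64, 512],
        unroll_explicit=True,
    )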
175 changes: 105 additions & 70 deletions tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py
@@ -15,10 +15,8 @@
# specific language governing permissions and limitations
# under the License.
# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply
from tvm.meta_schedule.testing.schedule_rule import auto_bind
from tvm.meta_schedule.testing.space_generation import check_trace
from tvm.meta_schedule.tune_context import TuneContext
from tvm import meta_schedule as ms
from tvm.meta_schedule.testing.space_generation import check_sketches
from tvm.script import tir as T
from tvm.target import Target

@@ -60,83 +58,120 @@ def zero_dim_add(
C[()] = A[()] + B[()]


def _create_context(mod, target, rule) -> TuneContext:
ctx = TuneContext(
mod=mod,
target=target,
space_generator=PostOrderApply(),
sch_rules=[rule],
task_name="test",
)
return ctx


def test_cuda_element_wise():
expected = [
[
'b0 = sch.get_block(name="C", func_name="main")',
"l1, l2 = sch.get_loops(block=b0)",
"l3 = sch.fuse(l1, l2, preserve_unit_iters=True)",
"v4 = sch.sample_categorical(candidates=[32, 64, 128, 256, 512, 1024], probs=[0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666])",
"l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)",
'sch.bind(loop=l5, thread_axis="blockIdx.x")',
'sch.bind(loop=l6, thread_axis="threadIdx.x")',
]
@T.prim_func
def elementwise_0(
A: T.Buffer[(512, 512), "float32"],
B: T.Buffer[(512, 512), "float32"],
) -> None:
# body
# with T.block("root")
for i_j_fused_0 in T.thread_binding(256, thread="blockIdx.x"):
for i_j_fused_1 in T.thread_binding(1024, thread="threadIdx.x"):
with T.block("C"):
vi = T.axis.spatial(512, (i_j_fused_0 * 1024 + i_j_fused_1) // 512)
vj = T.axis.spatial(512, (i_j_fused_0 * 1024 + i_j_fused_1) % 512)
T.reads(A[vi, vj])
T.writes(B[vi, vj])
B[vi, vj] = A[vi, vj] + T.float32(1)

decision_0 = [
("SampleCategorical", 5),
]
target = Target("nvidia/geforce-rtx-3080", host="llvm")
ctx = _create_context(
element_wise,
target=target,
rule=auto_bind(target=target),
mod = element_wise
actual = ms.TuneContext(
mod=mod,
target=Target("nvidia/geforce-rtx-3080", host="llvm"),
space_generator=ms.space_generator.PostOrderApply(),
sch_rules=[
ms.schedule_rule.AutoBind(
max_threadblocks=256,
thread_extents=[32, 64, 128, 256, 512, 1024],
)
],
task_name="test",
).generate_design_space()
check_sketches(
mod,
sketches=actual,
expected_mods=[elementwise_0],
expected_decisions=[decision_0],
)
spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
assert len(spaces) == 1
check_trace(spaces, expected)


def test_cuda_reduction_loop_only():
expected = [
[
'b0 = sch.get_block(name="C", func_name="main")',
"l1, = sch.get_loops(block=b0)",
"l2 = sch.add_unit_loop(block_or_loop=l1)",
"l3 = sch.fuse(l2, preserve_unit_iters=True)",
"l4, l5 = sch.split(loop=l3, factors=[None, 1], preserve_unit_iters=True)",
'sch.bind(loop=l4, thread_axis="blockIdx.x")',
'sch.bind(loop=l5, thread_axis="threadIdx.x")',
]
]
target = Target("nvidia/geforce-rtx-3080", host="llvm")
ctx = _create_context(
reduction_loop_only,
target=target,
rule=auto_bind(target=target),
@T.prim_func
def reduction_loop_only_0(
A: T.Buffer[2, "float32"],
B: T.Buffer[2, "float32"],
C: T.Buffer[(), "float32"],
) -> None:
for u_fused_0 in T.thread_binding(1, thread="blockIdx.x"):
for u_fused_1 in T.thread_binding(1, thread="threadIdx.x"):
for i0 in T.serial(2):
with T.block("C"):
k0 = T.axis.reduce(2, i0)
T.reads(A[k0], B[k0])
T.writes(C[()])
with T.init():
C[()] = T.float32(1)
C[()] = T.min(C[()], A[k0] / B[k0])

mod = reduction_loop_only
actual = ms.TuneContext(
mod=mod,
target=Target("nvidia/geforce-rtx-3080", host="llvm"),
space_generator=ms.space_generator.PostOrderApply(),
sch_rules=[
ms.schedule_rule.AutoBind(
max_threadblocks=256,
thread_extents=[32, 64, 128, 256, 512, 1024],
)
],
task_name="test",
).generate_design_space()
check_sketches(
mod,
sketches=actual,
expected_mods=[reduction_loop_only_0],
expected_decisions=[[]],
)
spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
assert len(spaces) == 1
check_trace(spaces, expected)


def test_cuda_zero_dim_add():
expected = [
[
'b0 = sch.get_block(name="C", func_name="main")',
"l1 = sch.add_unit_loop(block_or_loop=b0)",
"l2 = sch.fuse(l1, preserve_unit_iters=True)",
"l3, l4 = sch.split(loop=l2, factors=[None, 1], preserve_unit_iters=True)",
'sch.bind(loop=l3, thread_axis="blockIdx.x")',
'sch.bind(loop=l4, thread_axis="threadIdx.x")',
]
]
target = Target("nvidia/geforce-rtx-3080", host="llvm")
ctx = _create_context(
zero_dim_add,
target=target,
rule=auto_bind(target=target),
@T.prim_func
def zero_dim_add_0(
A: T.Buffer[(), "float32"],
B: T.Buffer[(), "float32"],
C: T.Buffer[(), "float32"],
) -> None:
for u_fused_0 in T.thread_binding(1, thread="blockIdx.x"):
for u_fused_1 in T.thread_binding(1, thread="threadIdx.x"):
with T.block("C"):
vi = T.axis.spatial(1, 0)
T.reads(A[()], B[()])
T.writes(C[()])
C[()] = A[()] + B[()]

mod = zero_dim_add
actual = ms.TuneContext(
mod=mod,
target=Target("nvidia/geforce-rtx-3080", host="llvm"),
space_generator=ms.space_generator.PostOrderApply(),
sch_rules=[
ms.schedule_rule.AutoBind(
max_threadblocks=256,
thread_extents=[32, 64, 128, 256, 512, 1024],
)
],
task_name="test",
).generate_design_space()
check_sketches(
mod,
sketches=actual,
expected_mods=[zero_dim_add_0],
expected_decisions=[[]],
)
spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
assert len(spaces) == 1
check_trace(spaces, expected)


if __name__ == "__main__":
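
For readers unfamiliar with the new helper: check_sketches (imported from tvm.meta_schedule.testing.space_generation) receives the workload, the generated sketches, and per-sketch expectations. A rough, hypothetical outline of such a checker follows; the name check_sketches_outline and its body are illustrative assumptions, not the library's implementation:

    from tvm.ir import structural_equal

    def check_sketches_outline(mod, sketches, expected_mods, expected_decisions):
        # Hypothetical outline only; the real check_sketches in
        # tvm.meta_schedule.testing.space_generation may differ in detail.
        assert len(sketches) == len(expected_mods) == len(expected_decisions)
        for sketch, expected, decisions in zip(sketches, expected_mods, expected_decisions):
            # Each sketch is a tir.Schedule whose module should match the
            # expected TVMScript function structurally, not as a string.
            assert structural_equal(sketch.mod["main"], expected)
            # Sampling instructions in the trace must carry the expected
            # decisions, e.g. [("SampleCategorical", 5)].
            sampled = [
                (inst.kind.name, int(decision))
                for inst, decision in sketch.trace.decisions.items()
            ]
            assert sampled == list(decisions)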