[MetaSchedule][Test] Migrate check_trace to check_sketch (#12764)
* Migrate AutoBind

* Migrate RandomComputeLocation

* Migrate CrossThreadReduction

* Migrate ParallelVectorizeUnroll
junrushao authored Sep 13, 2022
1 parent a23b71c commit ef784d6
Showing 5 changed files with 718 additions and 353 deletions.
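
In outline, the migration swaps string comparison of stringified trace instructions (check_trace) against structural comparison of the generated modules plus their sampling decisions (check_sketches). Below is a condensed sketch of the new harness; the target string and AutoBind parameters are copied from the tests in this commit, while element_wise and elementwise_0 refer to the workload and expected module defined in the test diff further down:

    # Condensed sketch of the migrated pattern; element_wise / elementwise_0
    # are the workload and expected TVMScript module from the test diff below.
    from tvm import meta_schedule as ms
    from tvm.meta_schedule.testing.space_generation import check_sketches
    from tvm.target import Target

    actual = ms.TuneContext(
        mod=element_wise,
        target=Target("nvidia/geforce-rtx-3080", host="llvm"),
        space_generator=ms.space_generator.PostOrderApply(),
        sch_rules=[
            ms.schedule_rule.AutoBind(
                max_threadblocks=256,
                thread_extents=[32, 64, 128, 256, 512, 1024],
            )
        ],
        task_name="test",
    ).generate_design_space()
    check_sketches(
        element_wise,
        sketches=actual,
        expected_mods=[elementwise_0],
        # Decision 5 selects candidates[5] == 1024 as the threadIdx.x extent.
        expected_decisions=[[("SampleCategorical", 5)]],
    )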
48 changes: 1 addition & 47 deletions python/tvm/meta_schedule/testing/schedule_rule.py
@@ -18,28 +18,15 @@
from typing import List, Union

from tvm.meta_schedule.schedule_rule import (
AutoBind,
AutoInline,
CrossThreadReduction,
MultiLevelTiling,
ParallelizeVectorizeUnroll,
RandomComputeLocation,
MultiLevelTilingTensorCore,
ReuseType,
ScheduleRule,
)
from tvm.meta_schedule.schedule_rule.multi_level_tiling import (
MultiLevelTilingTensorCore,
)
from tvm.target import Target


def auto_bind(target: Target) -> ScheduleRule:
"""Default schedule rules for auto bind"""
if target.kind.name == "cuda":
return AutoBind(max_threadblocks=256, thread_extents=[32, 64, 128, 256, 512, 1024])
raise NotImplementedError(f"{target.kind.name} is not supported")


def auto_inline(target: Target) -> ScheduleRule:
"""Default schedule rules for auto inline"""
if target.kind.name == "llvm":
@@ -65,13 +52,6 @@ def auto_inline(target: Target) -> ScheduleRule:
raise NotImplementedError(f"{target.kind.name} is not supported")


def cross_thread_reduction(target: Target) -> ScheduleRule:
"""Default schedule rules for with cross-thread reduction"""
if target.kind.name == "cuda":
return CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512])
raise NotImplementedError(f"{target.kind.name} is not supported")


def multi_level_tiling(target: Target) -> ScheduleRule:
"""Default schedule rules for with multi-level tiling and reuse"""
if target.kind.name == "llvm":
@@ -154,29 +134,3 @@ def multi_level_tiling_tensor_core(
use_software_pipeline=use_software_pipeline,
)
raise NotImplementedError(f"{target.kind.name} is not supported")


def random_compute_location(target: Target) -> ScheduleRule:
"""Default schedule rules for with random-compute-location"""
if target.kind.name == "llvm":
return RandomComputeLocation()
raise NotImplementedError(f"{target.kind.name} is not supported")


def parallel_vectorize_unroll(target: Target) -> ScheduleRule:
"""Default schedule rules for with parallel-vectorize-unroll"""
if target.kind.name == "llvm":
return ParallelizeVectorizeUnroll(
max_jobs_per_core=16,
max_vectorize_extent=32,
unroll_max_steps=[0, 16, 64, 512],
unroll_explicit=True,
)
if target.kind.name == "cuda":
return ParallelizeVectorizeUnroll(
max_jobs_per_core=-1,
max_vectorize_extent=-1,
unroll_max_steps=[0, 16, 64, 512, 1024],
unroll_explicit=True,
)
raise NotImplementedError(f"{target.kind.name} is not supported")
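
The four deleted helpers were thin wrappers over rule constructors, so callers now instantiate the rules inline. A minimal sketch of the inline equivalents, reusing the exact arguments the helpers hard-coded above (target conditions noted in comments):

    from tvm import meta_schedule as ms

    # Inline equivalents of the deleted helpers; arguments copied verbatim.
    auto_bind = ms.schedule_rule.AutoBind(  # cuda targets
        max_threadblocks=256,
        thread_extents=[32, 64, 128, 256, 512, 1024],
    )
    cross_thread_reduction = ms.schedule_rule.CrossThreadReduction(  # cuda targets
        thread_extents=[4, 8, 16, 32, 64, 128, 256, 512],
    )
    random_compute_location = ms.schedule_rule.RandomComputeLocation()  # llvm targets
    parallel_vectorize_unroll = ms.schedule_rule.ParallelizeVectorizeUnroll(  # llvm variant
        max_jobs_per_core=16,
        max_vectorize_extent=32,
        unroll_max_steps=[0, 16, 64, 512],
        unroll_explicit=True,
    )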
175 changes: 105 additions & 70 deletions tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py
@@ -15,10 +15,8 @@
# specific language governing permissions and limitations
# under the License.
# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply
from tvm.meta_schedule.testing.schedule_rule import auto_bind
from tvm.meta_schedule.testing.space_generation import check_trace
from tvm.meta_schedule.tune_context import TuneContext
from tvm import meta_schedule as ms
from tvm.meta_schedule.testing.space_generation import check_sketches
from tvm.script import tir as T
from tvm.target import Target

@@ -60,83 +58,120 @@ def zero_dim_add(
C[()] = A[()] + B[()]


def _create_context(mod, target, rule) -> TuneContext:
ctx = TuneContext(
mod=mod,
target=target,
space_generator=PostOrderApply(),
sch_rules=[rule],
task_name="test",
)
return ctx


def test_cuda_element_wise():
expected = [
[
'b0 = sch.get_block(name="C", func_name="main")',
"l1, l2 = sch.get_loops(block=b0)",
"l3 = sch.fuse(l1, l2, preserve_unit_iters=True)",
"v4 = sch.sample_categorical(candidates=[32, 64, 128, 256, 512, 1024], probs=[0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666])",
"l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)",
'sch.bind(loop=l5, thread_axis="blockIdx.x")',
'sch.bind(loop=l6, thread_axis="threadIdx.x")',
]
@T.prim_func
def elementwise_0(
A: T.Buffer[(512, 512), "float32"],
B: T.Buffer[(512, 512), "float32"],
) -> None:
# body
# with T.block("root")
for i_j_fused_0 in T.thread_binding(256, thread="blockIdx.x"):
for i_j_fused_1 in T.thread_binding(1024, thread="threadIdx.x"):
with T.block("C"):
vi = T.axis.spatial(512, (i_j_fused_0 * 1024 + i_j_fused_1) // 512)
vj = T.axis.spatial(512, (i_j_fused_0 * 1024 + i_j_fused_1) % 512)
T.reads(A[vi, vj])
T.writes(B[vi, vj])
B[vi, vj] = A[vi, vj] + T.float32(1)

decision_0 = [
("SampleCategorical", 5),
]
target = Target("nvidia/geforce-rtx-3080", host="llvm")
ctx = _create_context(
element_wise,
target=target,
rule=auto_bind(target=target),
mod = element_wise
actual = ms.TuneContext(
mod=mod,
target=Target("nvidia/geforce-rtx-3080", host="llvm"),
space_generator=ms.space_generator.PostOrderApply(),
sch_rules=[
ms.schedule_rule.AutoBind(
max_threadblocks=256,
thread_extents=[32, 64, 128, 256, 512, 1024],
)
],
task_name="test",
).generate_design_space()
check_sketches(
mod,
sketches=actual,
expected_mods=[elementwise_0],
expected_decisions=[decision_0],
)
spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
assert len(spaces) == 1
check_trace(spaces, expected)


def test_cuda_reduction_loop_only():
expected = [
[
'b0 = sch.get_block(name="C", func_name="main")',
"l1, = sch.get_loops(block=b0)",
"l2 = sch.add_unit_loop(block_or_loop=l1)",
"l3 = sch.fuse(l2, preserve_unit_iters=True)",
"l4, l5 = sch.split(loop=l3, factors=[None, 1], preserve_unit_iters=True)",
'sch.bind(loop=l4, thread_axis="blockIdx.x")',
'sch.bind(loop=l5, thread_axis="threadIdx.x")',
]
]
target = Target("nvidia/geforce-rtx-3080", host="llvm")
ctx = _create_context(
reduction_loop_only,
target=target,
rule=auto_bind(target=target),
@T.prim_func
def reduction_loop_only_0(
A: T.Buffer[2, "float32"],
B: T.Buffer[2, "float32"],
C: T.Buffer[(), "float32"],
) -> None:
for u_fused_0 in T.thread_binding(1, thread="blockIdx.x"):
for u_fused_1 in T.thread_binding(1, thread="threadIdx.x"):
for i0 in T.serial(2):
with T.block("C"):
k0 = T.axis.reduce(2, i0)
T.reads(A[k0], B[k0])
T.writes(C[()])
with T.init():
C[()] = T.float32(1)
C[()] = T.min(C[()], A[k0] / B[k0])

mod = reduction_loop_only
actual = ms.TuneContext(
mod=mod,
target=Target("nvidia/geforce-rtx-3080", host="llvm"),
space_generator=ms.space_generator.PostOrderApply(),
sch_rules=[
ms.schedule_rule.AutoBind(
max_threadblocks=256,
thread_extents=[32, 64, 128, 256, 512, 1024],
)
],
task_name="test",
).generate_design_space()
check_sketches(
mod,
sketches=actual,
expected_mods=[reduction_loop_only_0],
expected_decisions=[[]],
)
spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
assert len(spaces) == 1
check_trace(spaces, expected)


def test_cuda_zero_dim_add():
expected = [
[
'b0 = sch.get_block(name="C", func_name="main")',
"l1 = sch.add_unit_loop(block_or_loop=b0)",
"l2 = sch.fuse(l1, preserve_unit_iters=True)",
"l3, l4 = sch.split(loop=l2, factors=[None, 1], preserve_unit_iters=True)",
'sch.bind(loop=l3, thread_axis="blockIdx.x")',
'sch.bind(loop=l4, thread_axis="threadIdx.x")',
]
]
target = Target("nvidia/geforce-rtx-3080", host="llvm")
ctx = _create_context(
zero_dim_add,
target=target,
rule=auto_bind(target=target),
@T.prim_func
def zero_dim_add_0(
A: T.Buffer[(), "float32"],
B: T.Buffer[(), "float32"],
C: T.Buffer[(), "float32"],
) -> None:
for u_fused_0 in T.thread_binding(1, thread="blockIdx.x"):
for u_fused_1 in T.thread_binding(1, thread="threadIdx.x"):
with T.block("C"):
vi = T.axis.spatial(1, 0)
T.reads(A[()], B[()])
T.writes(C[()])
C[()] = A[()] + B[()]

mod = zero_dim_add
actual = ms.TuneContext(
mod=mod,
target=Target("nvidia/geforce-rtx-3080", host="llvm"),
space_generator=ms.space_generator.PostOrderApply(),
sch_rules=[
ms.schedule_rule.AutoBind(
max_threadblocks=256,
thread_extents=[32, 64, 128, 256, 512, 1024],
)
],
task_name="test",
).generate_design_space()
check_sketches(
mod,
sketches=actual,
expected_mods=[zero_dim_add_0],
expected_decisions=[[]],
)
spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
assert len(spaces) == 1
check_trace(spaces, expected)


if __name__ == "__main__":
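
For readers unfamiliar with the new helper: check_sketches (imported from tvm.meta_schedule.testing.space_generation) receives the workload, the generated sketches, and per-sketch expectations. A rough, hypothetical outline of such a checker follows; the name check_sketches_outline and its body are illustrative assumptions, not the library's implementation:

    from tvm.ir import structural_equal

    def check_sketches_outline(mod, sketches, expected_mods, expected_decisions):
        # Hypothetical outline only; the real check_sketches in
        # tvm.meta_schedule.testing.space_generation may differ in detail.
        assert len(sketches) == len(expected_mods) == len(expected_decisions)
        for sketch, expected, decisions in zip(sketches, expected_mods, expected_decisions):
            # Each sketch is a tir.Schedule whose module should match the
            # expected TVMScript function structurally, not as a string.
            assert structural_equal(sketch.mod["main"], expected)
            # Sampling instructions in the trace must carry the expected
            # decisions, e.g. [("SampleCategorical", 5)].
            sampled = [
                (inst.kind.name, int(decision))
                for inst, decision in sketch.trace.decisions.items()
            ]
            assert sampled == list(decisions)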