[MetaSchedule][Test] Add unittests for DEP #12071
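Adds design-space unit tests for the DEP workload (depthwise conv2d in NHWC layout), covering the MetaSchedule space generators for both CPU and CUDA targets.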

Merged · 1 commit · Jul 12, 2022
tests/python/unittest/test_meta_schedule_space_cpu.py (161 additions, 0 deletions)

@@ -741,8 +741,169 @@ def cap_2(inputs: T.Buffer[(1, 16, 16, 4, 4, 32), "float32"], weight: T.Buffer[(
)


def test_cpu_dep():
    # fmt: off
    @T.prim_func
    def dep_0(placeholder: T.Buffer[(1, 112, 112, 32), "float32"], placeholder_1: T.Buffer[(1, 3, 3, 32), "float32"], depth_conv2d_nhwc: T.Buffer[(1, 112, 112, 32), "float32"]) -> None:
        # function attr dict
        T.func_attr({"global_symbol": "main", "tir.noalias": True})
        # body
        with T.block("root"):
            T.reads()
            T.writes()
            T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":64, "meta_schedule.vectorize":64})
            PadInput = T.alloc_buffer([1, 114, 114, 32], dtype="float32")
            depth_conv2d_nhwc_global = T.alloc_buffer([1, 112, 112, 32], dtype="float32")
            for i0, i1, i2, i3 in T.grid(1, 114, 114, 32):
                with T.block("PadInput"):
                    i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
                    T.reads(placeholder[i0_1, i1_1 - 1, i2_1 - 1, i3_1])
                    T.writes(PadInput[i0_1, i1_1, i2_1, i3_1])
                    PadInput[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 113 and 1 <= i2_1 and i2_1 < 113, placeholder[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float32(0), dtype="float32")
            for i0_0, i1_0, i2_0, i3_0, i0_1_1, i1_1_1, i2_1_1, i3_1_1 in T.grid(1, 1, 1, 1, 1, 4, 4, 8):
                for i4_0, i5_0, i0_2, i1_2, i2_2, i3_2, i4_1, i5_1, i0_3, i1_3, i2_3, i3_3 in T.grid(1, 1, 1, 2, 7, 2, 3, 3, 1, 14, 4, 2):
                    with T.block("depth_conv2d_nhwc"):
                        n = T.axis.spatial(1, i0_0 + i0_1_1 + i0_2 + i0_3)
                        h = T.axis.spatial(112, i1_0 * 112 + i1_1_1 * 28 + i1_2 * 14 + i1_3)
                        w = T.axis.spatial(112, i2_0 * 112 + i2_1_1 * 28 + i2_2 * 4 + i2_3)
                        c = T.axis.spatial(32, i3_0 * 32 + i3_1_1 * 4 + i3_2 * 2 + i3_3)
                        rh = T.axis.reduce(3, i4_0 * 3 + i4_1)
                        rw = T.axis.reduce(3, i5_0 * 3 + i5_1)
                        T.reads(PadInput[n, h + rh, w + rw, c], placeholder_1[0, rh, rw, c])
                        T.writes(depth_conv2d_nhwc_global[n, h, w, c])
                        T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"})
                        with T.init():
                            depth_conv2d_nhwc_global[n, h, w, c] = T.float32(0)
                        depth_conv2d_nhwc_global[n, h, w, c] = depth_conv2d_nhwc_global[n, h, w, c] + PadInput[n, h + rh, w + rw, c] * placeholder_1[0, rh, rw, c]
                for ax0, ax1, ax2, ax3 in T.grid(1, 28, 28, 4):
                    with T.block("depth_conv2d_nhwc_global"):
                        v0 = T.axis.spatial(1, ax0)
                        v1 = T.axis.spatial(112, i1_1_1 * 28 + ax1)
                        v2 = T.axis.spatial(112, i2_1_1 * 28 + ax2)
                        v3 = T.axis.spatial(32, i3_1_1 * 4 + ax3)
                        T.reads(depth_conv2d_nhwc_global[v0, v1, v2, v3])
                        T.writes(depth_conv2d_nhwc[v0, v1, v2, v3])
                        depth_conv2d_nhwc[v0, v1, v2, v3] = depth_conv2d_nhwc_global[v0, v1, v2, v3]
    @T.prim_func
    def dep_1(placeholder: T.Buffer[(1, 112, 112, 32), "float32"], placeholder_1: T.Buffer[(1, 3, 3, 32), "float32"], depth_conv2d_nhwc: T.Buffer[(1, 112, 112, 32), "float32"]) -> None:
        # function attr dict
        T.func_attr({"global_symbol": "main", "tir.noalias": True})
        # body
        with T.block("root"):
            T.reads()
            T.writes()
            T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":16, "meta_schedule.vectorize":64})
            PadInput = T.alloc_buffer([1, 114, 114, 32], dtype="float32")
            depth_conv2d_nhwc_global = T.alloc_buffer([1, 112, 112, 32], dtype="float32")
            for i0, i1, i2, i3 in T.grid(1, 114, 114, 32):
                with T.block("PadInput"):
                    i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
                    T.reads(placeholder[i0_1, i1_1 - 1, i2_1 - 1, i3_1])
                    T.writes(PadInput[i0_1, i1_1, i2_1, i3_1])
                    PadInput[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 113 and 1 <= i2_1 and i2_1 < 113, placeholder[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float32(0), dtype="float32")
            for i0_0, i1_0, i2_0, i3_0 in T.grid(1, 1, 1, 1):
                for i0_1_1, i1_1_1, i2_1_1, i3_1_1, i4_0, i5_0, i0_2, i1_2, i2_2, i3_2, i4_1, i5_1, i0_3, i1_3, i2_3, i3_3 in T.grid(1, 4, 4, 8, 1, 1, 1, 2, 7, 2, 3, 3, 1, 14, 4, 2):
                    with T.block("depth_conv2d_nhwc"):
                        n = T.axis.spatial(1, i0_0 + i0_1_1 + i0_2 + i0_3)
                        h = T.axis.spatial(112, i1_0 * 112 + i1_1_1 * 28 + i1_2 * 14 + i1_3)
                        w = T.axis.spatial(112, i2_0 * 112 + i2_1_1 * 28 + i2_2 * 4 + i2_3)
                        c = T.axis.spatial(32, i3_0 * 32 + i3_1_1 * 4 + i3_2 * 2 + i3_3)
                        rh = T.axis.reduce(3, i4_0 * 3 + i4_1)
                        rw = T.axis.reduce(3, i5_0 * 3 + i5_1)
                        T.reads(PadInput[n, h + rh, w + rw, c], placeholder_1[0, rh, rw, c])
                        T.writes(depth_conv2d_nhwc_global[n, h, w, c])
                        T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"})
                        with T.init():
                            depth_conv2d_nhwc_global[n, h, w, c] = T.float32(0)
                        depth_conv2d_nhwc_global[n, h, w, c] = depth_conv2d_nhwc_global[n, h, w, c] + PadInput[n, h + rh, w + rw, c] * placeholder_1[0, rh, rw, c]
                for ax0, ax1, ax2, ax3 in T.grid(1, 112, 112, 32):
                    with T.block("depth_conv2d_nhwc_global"):
                        v0, v1, v2, v3 = T.axis.remap("SSSS", [ax0, ax1, ax2, ax3])
                        T.reads(depth_conv2d_nhwc_global[v0, v1, v2, v3])
                        T.writes(depth_conv2d_nhwc[v0, v1, v2, v3])
                        depth_conv2d_nhwc[v0, v1, v2, v3] = depth_conv2d_nhwc_global[v0, v1, v2, v3]
    @T.prim_func
    def dep_2(placeholder: T.Buffer[(1, 112, 112, 32), "float32"], placeholder_1: T.Buffer[(1, 3, 3, 32), "float32"], depth_conv2d_nhwc: T.Buffer[(1, 112, 112, 32), "float32"]) -> None:
        # function attr dict
        T.func_attr({"global_symbol": "main", "tir.noalias": True})
        # body
        with T.block("root"):
            T.reads()
            T.writes()
            T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":0, "meta_schedule.vectorize":64})
            PadInput = T.alloc_buffer([1, 114, 114, 32], dtype="float32")
            for i0_0, i1_0, i2_0, i3_0, i0_1, i1_1 in T.grid(1, 1, 1, 1, 1, 4):
                for ax0, ax1, ax2, ax3 in T.grid(1, 30, 114, 32):
                    with T.block("PadInput"):
                        i0 = T.axis.spatial(1, ax0)
                        i1 = T.axis.spatial(114, i1_1 * 28 + ax1)
                        i2, i3 = T.axis.remap("SS", [ax2, ax3])
                        T.reads(placeholder[i0, i1 - 1, i2 - 1, i3])
                        T.writes(PadInput[i0, i1, i2, i3])
                        PadInput[i0, i1, i2, i3] = T.if_then_else(1 <= i1 and i1 < 113 and 1 <= i2 and i2 < 113, placeholder[i0, i1 - 1, i2 - 1, i3], T.float32(0), dtype="float32")
                for i2_1, i3_1, i4_0, i5_0, i0_2, i1_2, i2_2, i3_2, i4_1, i5_1, i0_3, i1_3, i2_3, i3_3 in T.grid(4, 8, 1, 1, 1, 2, 7, 2, 3, 3, 1, 14, 4, 2):
                    with T.block("depth_conv2d_nhwc"):
                        n = T.axis.spatial(1, i0_0 + i0_1 + i0_2 + i0_3)
                        h = T.axis.spatial(112, i1_0 * 112 + i1_1 * 28 + i1_2 * 14 + i1_3)
                        w = T.axis.spatial(112, i2_0 * 112 + i2_1 * 28 + i2_2 * 4 + i2_3)
                        c = T.axis.spatial(32, i3_0 * 32 + i3_1 * 4 + i3_2 * 2 + i3_3)
                        rh = T.axis.reduce(3, i4_0 * 3 + i4_1)
                        rw = T.axis.reduce(3, i5_0 * 3 + i5_1)
                        T.reads(PadInput[n, h + rh, w + rw, c], placeholder_1[0, rh, rw, c])
                        T.writes(depth_conv2d_nhwc[n, h, w, c])
                        T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"})
                        with T.init():
                            depth_conv2d_nhwc[n, h, w, c] = T.float32(0)
                        depth_conv2d_nhwc[n, h, w, c] = depth_conv2d_nhwc[n, h, w, c] + PadInput[n, h + rh, w + rw, c] * placeholder_1[0, rh, rw, c]
    # fmt: on
    decision_0 = [
        ("SamplePerfectTile", [1, 1, 1, 1]),
        ("SamplePerfectTile", [1, 4, 2, 14]),
        ("SamplePerfectTile", [1, 4, 7, 4]),
        ("SamplePerfectTile", [1, 8, 2, 2]),
        ("SamplePerfectTile", [1, 3]),
        ("SamplePerfectTile", [1, 3]),
        ("SampleCategorical", 2),
        ("SampleComputeLocation", -1),
    ]
    decision_1 = [
        ("SamplePerfectTile", [1, 1, 1, 1]),
        ("SamplePerfectTile", [1, 4, 2, 14]),
        ("SamplePerfectTile", [1, 4, 7, 4]),
        ("SamplePerfectTile", [1, 8, 2, 2]),
        ("SamplePerfectTile", [1, 3]),
        ("SamplePerfectTile", [1, 3]),
        ("SampleCategorical", 1),
        ("SampleComputeLocation", -1),
    ]
    decision_2 = [
        ("SamplePerfectTile", [1, 1, 1, 1]),
        ("SamplePerfectTile", [1, 4, 2, 14]),
        ("SamplePerfectTile", [1, 4, 7, 4]),
        ("SamplePerfectTile", [1, 8, 2, 2]),
        ("SamplePerfectTile", [1, 3]),
        ("SamplePerfectTile", [1, 3]),
        ("SampleCategorical", 0),
        ("SampleComputeLocation", 5),
    ]
    mod = create_te_workload("DEP", 0)
    actual = ms.TuneContext(
        mod=mod,
        target=_target(),
        space_generator=ms.space_generator.PostOrderApply(),
        sch_rules="default",
    ).generate_design_space()
    check_sketches(
        mod,
        sketches=actual,
        expected_mods=[dep_0, dep_1, dep_2],
        expected_decisions=[decision_0, decision_1, decision_2],
    )


if __name__ == "__main__":
    test_cpu_c1d()
    test_cpu_c2d()
    test_cpu_c3d()
    test_cpu_cap()
    test_cpu_dep()
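For reference, the `SamplePerfectTile` decisions line up with the loop extents of the DEP workload: each factor list is an exact factorization of one loop axis (four levels for each spatial axis under the CPU "SSRSRS" tiling structure, two for each reduction axis). The `SampleCategorical` values select the `meta_schedule.unroll_explicit` setting (2, 1, 0 correspond to 64, 16, 0 in the sketches above), and `SampleComputeLocation` places the `PadInput` producer (-1 keeps it at the root, as in `dep_0` and `dep_1`; 5 computes it under the sixth consumer loop, as in `dep_2`). A minimal sketch of the perfect-tile invariant, illustrative only and not part of the test:

import math

# Loop extents of DEP: N=1, H=112, W=112, C=32, rH=3, rW=3.
extents = [1, 112, 112, 32, 3, 3]

# Tile factors recorded in decision_0 above, one list per loop, outer to inner.
tiles = [
    [1, 1, 1, 1],   # n
    [1, 4, 2, 14],  # h: 4 * 2 * 14 == 112
    [1, 4, 7, 4],   # w: 4 * 7 * 4 == 112
    [1, 8, 2, 2],   # c: 8 * 2 * 2 == 32
    [1, 3],         # rh
    [1, 3],         # rw
]

# A perfect tile split must multiply back to the original loop extent.
for extent, factors in zip(extents, tiles):
    assert math.prod(factors) == extent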
tests/python/unittest/test_meta_schedule_space_cuda.py (89 additions, 0 deletions)

@@ -396,8 +396,97 @@ def cap_0(inputs: T.Buffer[(1, 16, 16, 4, 4, 32), "float32"], weight: T.Buffer[(
)


def test_cuda_dep():
    # fmt: off
    @T.prim_func
    def dep_0(placeholder: T.Buffer[(1, 112, 112, 32), "float32"], placeholder_1: T.Buffer[(1, 3, 3, 32), "float32"], depth_conv2d_nhwc: T.Buffer[(1, 112, 112, 32), "float32"]) -> None:
        # function attr dict
        T.func_attr({"global_symbol": "main", "tir.noalias": True})
        # body
        with T.block("root"):
            T.reads()
            T.writes()
            T.block_attr({"meta_schedule.unroll_explicit":16})
            depth_conv2d_nhwc_local = T.alloc_buffer([1, 112, 112, 32], dtype="float32", scope="local")
            PadInput_shared = T.alloc_buffer([1, 114, 114, 32], dtype="float32", scope="shared")
            placeholder_shared = T.alloc_buffer([1, 3, 3, 32], dtype="float32", scope="shared")
            for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(1, thread="blockIdx.x"):
                for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(8, thread="vthread.x"):
                    for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(14, thread="threadIdx.x"):
                        for i4_0, i5_0 in T.grid(1, 1):
                            for ax0_ax1_ax2_ax3_fused in T.serial(415872):
                                with T.block("PadInput_shared"):
                                    v0 = T.axis.spatial(1, 0)
                                    v1 = T.axis.spatial(114, ax0_ax1_ax2_ax3_fused // 3648)
                                    v2 = T.axis.spatial(114, ax0_ax1_ax2_ax3_fused % 3648 // 32)
                                    v3 = T.axis.spatial(32, ax0_ax1_ax2_ax3_fused % 32)
                                    T.reads(placeholder[v0, v1 - 1, v2 - 1, v3])
                                    T.writes(PadInput_shared[v0, v1, v2, v3])
                                    T.block_attr({"meta_schedule.cooperative_fetch":3})
                                    PadInput_shared[v0, v1, v2, v3] = T.if_then_else(1 <= v1 and v1 < 113 and 1 <= v2 and v2 < 113, placeholder[v0, v1 - 1, v2 - 1, v3], T.float32(0), dtype="float32")
                            for ax0_ax1_ax2_ax3_fused in T.serial(288):
                                with T.block("placeholder_shared"):
                                    v0 = T.axis.spatial(1, 0)
                                    v1 = T.axis.spatial(3, ax0_ax1_ax2_ax3_fused // 96)
                                    v2 = T.axis.spatial(3, ax0_ax1_ax2_ax3_fused % 96 // 32)
                                    v3 = T.axis.spatial(32, ax0_ax1_ax2_ax3_fused % 32)
                                    T.reads(placeholder_1[v0, v1, v2, v3])
                                    T.writes(placeholder_shared[v0, v1, v2, v3])
                                    T.block_attr({"meta_schedule.cooperative_fetch":3})
                                    placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3]
                            for i4_1, i5_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i0_4, i1_4, i2_4, i3_4 in T.grid(3, 1, 1, 4, 16, 8, 1, 3, 1, 7, 1, 1):
                                with T.block("depth_conv2d_nhwc"):
                                    n = T.axis.spatial(1, i0_4 + i0_3 + 0 + 0 + 0)
                                    h = T.axis.spatial(112, ((0 * 4 + i0_1_i1_1_i2_1_i3_1_fused % 8 // 2 + 0) * 4 + i1_3) * 7 + i1_4)
                                    w = T.axis.spatial(112, ((0 + 0) * 7 + i0_2_i1_2_i2_2_i3_2_fused % 14 // 2) * 16 + i2_3 + i2_4)
                                    c = T.axis.spatial(32, ((0 * 2 + i0_1_i1_1_i2_1_i3_1_fused % 2) * 2 + i0_2_i1_2_i2_2_i3_2_fused % 2) * 8 + i3_3 + i3_4)
                                    rh = T.axis.reduce(3, i4_0 * 3 + i4_1 + i4_2)
                                    rw = T.axis.reduce(3, (i5_0 + i5_1) * 3 + i5_2)
                                    T.reads(PadInput_shared[n, h + rh, w + rw, c], placeholder_shared[0, rh, rw, c])
                                    T.writes(depth_conv2d_nhwc_local[n, h, w, c])
                                    T.block_attr({"meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"})
                                    with T.init():
                                        depth_conv2d_nhwc_local[n, h, w, c] = T.float32(0)
                                    depth_conv2d_nhwc_local[n, h, w, c] = depth_conv2d_nhwc_local[n, h, w, c] + PadInput_shared[n, h + rh, w + rw, c] * placeholder_shared[0, rh, rw, c]
                        for ax0, ax1, ax2, ax3 in T.grid(1, 28, 16, 8):
                            with T.block("depth_conv2d_nhwc_local"):
                                v0 = T.axis.spatial(1, ax0)
                                v1 = T.axis.spatial(112, i0_1_i1_1_i2_1_i3_1_fused // 2 * 28 + ax1)
                                v2 = T.axis.spatial(112, i0_2_i1_2_i2_2_i3_2_fused // 2 * 16 + ax2)
                                v3 = T.axis.spatial(32, i0_1_i1_1_i2_1_i3_1_fused % 2 * 16 + i0_2_i1_2_i2_2_i3_2_fused % 2 * 8 + ax3)
                                T.reads(depth_conv2d_nhwc_local[v0, v1, v2, v3])
                                T.writes(depth_conv2d_nhwc[v0, v1, v2, v3])
                                depth_conv2d_nhwc[v0, v1, v2, v3] = depth_conv2d_nhwc_local[v0, v1, v2, v3]
    # fmt: on
    decision_0 = [
        ("SamplePerfectTile", [1, 1, 1, 1, 1]),
        ("SamplePerfectTile", [1, 4, 1, 4, 7]),
        ("SamplePerfectTile", [1, 1, 7, 16, 1]),
        ("SamplePerfectTile", [1, 2, 2, 8, 1]),
        ("SamplePerfectTile", [1, 3, 1]),
        ("SamplePerfectTile", [1, 1, 3]),
        ("SampleCategorical", 2),
        ("SampleCategorical", 2),
        ("SampleCategorical", 1),
    ]
    mod = create_te_workload("DEP", 0)
    actual = ms.TuneContext(
        mod=mod,
        target=_target(),
        space_generator=ms.space_generator.PostOrderApply(),
        sch_rules="default",
    ).generate_design_space()
    check_sketches(
        mod,
        sketches=actual,
        expected_mods=[dep_0],
        expected_decisions=[decision_0],
    )


if __name__ == "__main__":
    test_cuda_c1d()
    test_cuda_c2d()
    test_cuda_c3d()
    test_cuda_cap()
    test_cuda_dep()
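Both tests share the same driver: build the DEP workload from the TE workload registry, generate the design space with the default schedule rules for the target, and compare the resulting sketches and their sampled decisions against the expected modules. A minimal standalone sketch of that flow, assuming a TVM build from around this PR; the explicit `llvm --num-cores=16` target here is only a stand-in for the `_target()` helper the test files define:

from tvm import meta_schedule as ms
from tvm.meta_schedule.testing.te_workload import create_te_workload
from tvm.target import Target

# Build the DEP (depthwise conv2d NHWC) workload, size index 0.
mod = create_te_workload("DEP", 0)

# Generate every design-space sketch with the default CPU schedule rules.
spaces = ms.TuneContext(
    mod=mod,
    target=Target("llvm --num-cores=16"),
    space_generator=ms.space_generator.PostOrderApply(),
    sch_rules="default",
).generate_design_space()

# Each sketch is a tir.Schedule whose trace records the sampled decisions,
# e.g. the SamplePerfectTile / SampleCategorical entries listed above.
for i, sch in enumerate(spaces):
    print(f"sketch {i}:")
    print(sch.trace)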